diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000000..a9e4ee246d --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: [dstackai] diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml index ec29747fb6..e9ee5c6415 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -39,4 +39,4 @@ body: attributes: label: Additional information description: | - Any links, references or screenshots to have more context about the issue. \ No newline at end of file + Any links, references or screenshots to have more context about the issue. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml index ecd6f89cdc..37457a1d50 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yaml +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -28,4 +28,4 @@ body: - 'Yes' - 'No' validations: - required: true \ No newline at end of file + required: true diff --git a/.github/workflows/build-artifacts.yml b/.github/workflows/build-artifacts.yml new file mode 100644 index 0000000000..d642a81117 --- /dev/null +++ b/.github/workflows/build-artifacts.yml @@ -0,0 +1,254 @@ +name: Build Artifacts + +on: + workflow_call: + inputs: + version: + type: string + required: true + staging: + type: boolean + required: true + go-integration-tests: + type: boolean + required: true + +jobs: + code-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + - run: uv tool install pre-commit + - run: pre-commit run -a --show-diff-on-failure + + frontend-build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: frontend + steps: + - uses: actions/checkout@v4 + - name: Restore cached build + id: cache-build + uses: actions/cache@v4 + with: + path: frontend/build + key: frontend-build-${{ 
hashFiles('frontend/**') }} + restore-keys: | + frontend-build- + - name: Set up Node + if: steps.cache-build.outputs.cache-hit != 'true' + uses: actions/setup-node@v4 + with: + node-version: 18 + - name: Install packages + if: steps.cache-build.outputs.cache-hit != 'true' + run: npm ci + - name: Build dist + if: steps.cache-build.outputs.cache-hit != 'true' + run: npm run build + - name: Upload dist + uses: actions/upload-artifact@v4 + with: + name: frontend-build + path: frontend/build + retention-days: 1 + + python-test: + needs: [code-lint, frontend-build] + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-latest, ubuntu-latest, windows-latest] + python-version: ["3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: astral-sh/setup-uv@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: uv sync --all-extras + - name: Run pyright + uses: jakebailey/pyright-action@v3 + with: + pylance-version: latest-release + - name: Download frontend build + uses: actions/download-artifact@v4 + with: + name: frontend-build + path: src/dstack/_internal/server/statics + - name: Run pytest on POSIX + if: matrix.os != 'windows-latest' + # Skip Postgres tests on macos since macos runner doesn't have Docker. 
+ run: | + RUNPOSTGRES="" + if [ "${{ matrix.os }}" != "macos-latest" ]; then + RUNPOSTGRES="--runpostgres" + fi + uv run pytest -n auto src/tests --runui $RUNPOSTGRES + - name: Run pytest on Windows + if: matrix.os == 'windows-latest' + run: | + uv run pytest -n auto src/tests --runui --runpostgres + + runner-test: + defaults: + run: + working-directory: runner + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: runner/go.mod + cache-dependency-path: runner/go.sum + - name: Check if go.mod and go.sum are up-to-date + run: go mod tidy -diff + - name: Run golangci-lint + uses: golangci/golangci-lint-action@v9 + with: + version: v2.6.2 # Should match .pre-commit-config.yaml + args: --timeout=20m + working-directory: runner + - name: Test + # Only run slow tests if requested by workflow call inputs. + run: | + SHORT="-short" + if [[ "${{ inputs.go-integration-tests }}" == "true" ]]; then + SHORT="" + fi + go version + go fmt $(go list ./... | grep -v /vendor/) + go vet $(go list ./... | grep -v /vendor/) + go test $SHORT -race $(go list ./... 
| grep -v /vendor/) + + runner-compile: + needs: [runner-test] + defaults: + run: + working-directory: runner + env: + REPO_NAME: github.com/dstackai/dstack + strategy: + matrix: + include: + - { runs-on: "ubuntu-24.04", goos: "linux", goarch: "amd64" } + - { runs-on: "ubuntu-24.04-arm", goos: "linux", goarch: "arm64" } + runs-on: ${{ matrix.runs-on }} + steps: + - uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: runner/go.mod + cache-dependency-path: runner/go.sum + - name: build + env: + GOOS: ${{ matrix.goos }} + GOARCH: ${{ matrix.goarch }} + run: | + CGO_ENABLED=0 go build -ldflags "-X 'main.Version=${{ inputs.version }}' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner + CGO_ENABLED=1 go build -ldflags "-X 'main.Version=${{ inputs.version }}'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim + - uses: actions/upload-artifact@v4 + with: + name: dstack-runner-${{ matrix.goos }}-${{ matrix.goarch }} + path: | + runner/dstack-runner-${{ matrix.goos }}-${{ matrix.goarch }} + runner/dstack-shim-${{ matrix.goos }}-${{ matrix.goarch }} + retention-days: 1 + + gateway-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + - name: Build package + working-directory: gateway + run: | + echo "__version__ = \"${{ inputs.version }}\"" > src/dstack/gateway/version.py + # TODO: depend on a specific dstackai/dstack commit for staging builds? 
+ if [[ "${{ inputs.staging }}" == "false" ]]; then + sed \ + -i.old \ + "s|@ https://github.com/dstackai/dstack/archive/refs/heads/master.tar.gz|== ${{ inputs.version }}|" \ + pyproject.toml + diff pyproject.toml pyproject.toml.old > /dev/null && echo "Could not set version" && exit 1 + fi + uv build + - uses: actions/upload-artifact@v4 + with: + name: dstack-gateway + path: gateway/dist/dstack_gateway-${{ inputs.version }}-py3-none-any.whl + retention-days: 1 + + python-build: + needs: [code-lint, frontend-build] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + - name: Download frontend build + uses: actions/download-artifact@v4 + with: + name: frontend-build + path: src/dstack/_internal/server/statics + - name: Build dstack Python package + # TODO: set __version__ to inputs.version regardless of inputs.staging, + # so that staging builds are also tied to a specific runner and gateway version. + # May require changing how dstack handles __version__. 
+ run: | + if [[ "${{ inputs.staging }}" == "true" ]]; then + VERSION=0.0.0 + IS_RELEASE=False + else + VERSION=${{ inputs.version }} + IS_RELEASE=True + fi + DOCKER_BASE_IMAGE=$(cat src/dstack/version.py | grep "docker_base_image = ") + DOCKER_BASE_IMAGE_UBUNTU_VERSION=$(cat src/dstack/version.py | grep "docker_base_image_ubuntu_version = ") + VM_BASE_IMAGE=$(cat src/dstack/version.py | grep "vm_base_image = ") + echo "__version__ = \"$VERSION\"" > src/dstack/version.py + echo "__is_release__ = $IS_RELEASE" >> src/dstack/version.py + echo $DOCKER_BASE_IMAGE >> src/dstack/version.py + echo $DOCKER_BASE_IMAGE_UBUNTU_VERSION >> src/dstack/version.py + echo $VM_BASE_IMAGE >> src/dstack/version.py + cp README.md src + uv build + - uses: actions/upload-artifact@v4 + with: + name: python-build + path: dist + retention-days: 1 + + generate-json-schema: + needs: [code-lint] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + - name: Install dstack + run: uv sync + - name: Generate json schema + run: | + mkdir /tmp/json-schemas + uv run python -c "from dstack._internal.core.models.configurations import DstackConfiguration; print(DstackConfiguration.schema_json())" > /tmp/json-schemas/configuration.json + uv run python -c "from dstack._internal.core.models.profiles import ProfilesConfig; print(ProfilesConfig.schema_json())" > /tmp/json-schemas/profiles.json + - uses: actions/upload-artifact@v4 + with: + name: json-schemas + path: /tmp/json-schemas + retention-days: 1 diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..6ca99d764a --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,26 @@ +name: Build Docs + +on: + workflow_call: + +jobs: + build-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + - name: Install dstack + run: | + uv sync 
--extra server + - name: Build + run: | + sudo apt-get update && sudo apt-get install -y libcairo2-dev libfreetype6-dev libffi-dev libjpeg-dev libpng-dev libz-dev + uv run mkdocs build -s + - uses: actions/upload-artifact@v4 + with: + name: site + path: site + retention-days: 1 + include-hidden-files: true diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da6af2d686..77e50916c9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,217 +1,69 @@ -name: Build +name: Test Build on: push: branches: - - 'master' + - "master" paths-ignore: - - 'docs/**' - - 'mkdocs.yml' + - "mkdocs/**" + - "mkdocs.yml" pull_request: branches: - - 'master' + - "master" workflow_dispatch: inputs: - intergation-tests: + go-integration-tests: type: boolean required: true default: false + description: Go integration tests env: BUILD_INCREMENT: 150 - PIP_DISABLE_PIP_VERSION_CHECK: on - PIP_DEFAULT_TIMEOUT: 10 - PIP_PROGRESS_BAR: off jobs: - python-lint: + compute-version: runs-on: ubuntu-latest + outputs: + version: ${{ steps.set-version.outputs.version }} steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - run: python -m pip install pre-commit - - run: pre-commit run -a --show-diff-on-failure - - python-test: - needs: [ python-lint ] - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ macos-latest, ubuntu-latest] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install -U '.[all]' -r requirements_dev.txt - - name: Run pytest - run: pytest src/tests - - update-get-dstack: - if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - needs: [ python-test ] - runs-on: ubuntu-latest - env: - AWS_ACCESS_KEY_ID: ${{ 
secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - steps: - - name: Install AWS - run: pip install awscli - - run: | - VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) - echo $VERSION | aws s3 cp - s3://get-dstack/stgn-cli/latest-version --acl public-read - - runner-test: - defaults: - run: - working-directory: runner - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - name: Run golangci-lint - uses: golangci/golangci-lint-action@v6 - with: - version: v1.58 - args: --timeout=20m - working-directory: runner - - name: Test - # Do not run slow integration tests automatically. - # Slow tests can be run manually via workflow_dispatch when required. - run: | - SHORT="-short" - if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then - if [[ "${{ github.event.inputs.intergation-tests }}" == "true" ]]; then - SHORT="" - fi - fi - go version - go fmt $(go list ./... | grep -v /vendor/) - go vet $(go list ./... | grep -v /vendor/) - go test $SHORT -race $(go list ./... 
| grep -v /vendor/) - - runner-compile: - needs: [runner-test] - defaults: - run: - working-directory: runner - env: - REPO_NAME: github.com/dstackai/dstack - strategy: - matrix: - include: - - { goos: "linux", goarch: "amd64", runson: "ubuntu-latest" } - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - name: build - env: - GOOS: ${{ matrix.goos }} - GOARCH: ${{ matrix.goarch }} - ACTIONSOS: ${{ matrix.runson }} - CGO_ENABLED: 0 + - id: set-version run: | - VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) - go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner - go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim - echo $VERSION - - uses: actions/upload-artifact@v3 - with: - name: dstack-runner - path: | - runner/dstack-runner-${{ matrix.goos }}-${{ matrix.goarch }} - runner/dstack-shim-${{ matrix.goos }}-${{ matrix.goarch }} - retention-days: 1 + echo "version=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }}))" >> $GITHUB_OUTPUT - runner-upload: + build-artifacts: + needs: [compute-version] + uses: ./.github/workflows/build-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + staging: true + # TODO: run integration tests on every 'push' event + # https://github.com/dstackai/dstack/issues/3005 + go-integration-tests: ${{ github.event_name == 'workflow_dispatch' && inputs.go-integration-tests }} + + upload-pre-pypi-artifacts: + needs: [compute-version, build-artifacts] + # Skip for PRs from forks, where AWS S3 credentials are not available if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - needs: [runner-compile] - runs-on: ubuntu-latest - steps: - - name: Install AWS - run: pip install 
awscli - - name: Download Runner - uses: actions/download-artifact@v3 - with: - name: dstack-runner - path: runner - - name: Upload to S3 - working-directory: runner - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | - VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) - aws s3 cp . "s3://dstack-runner-downloads-stgn/$VERSION/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read - aws s3 cp . "s3://dstack-runner-downloads-stgn/latest/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read - - generate-json-schema: + uses: ./.github/workflows/upload-pre-pypi-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + staging: true + secrets: inherit + + upload-post-pypi-artifacts: + needs: [compute-version, build-artifacts] + # Skip for PRs from forks, where AWS S3 credentials are not available if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - needs: [ python-test ] - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - name: Install AWS - run: pip install awscli - - name: Install dstack - run: pip install . 
- - name: Generate json schema - run: | - python -c "from dstack._internal.core.models.configurations import DstackConfiguration; print(DstackConfiguration.schema_json(indent=2))" > configuration.json - python -c "from dstack._internal.core.models.profiles import ProfilesConfig; print(ProfilesConfig.schema_json(indent=2))" > profiles.json - - name: Upload json schema to S3 - run: | - VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) - aws s3 cp configuration.json "s3://dstack-runner-downloads-stgn/$VERSION/schemas/configuration.json" --acl public-read - aws s3 cp configuration.json "s3://dstack-runner-downloads-stgn/latest/schemas/configuration.json" --acl public-read - aws s3 cp profiles.json "s3://dstack-runner-downloads-stgn/$VERSION/schemas/profiles.json" --acl public-read - aws s3 cp profiles.json "s3://dstack-runner-downloads-stgn/latest/schemas/profiles.json" --acl public-read + uses: ./.github/workflows/upload-post-pypi-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + is-latest-version: true + staging: true + secrets: inherit - gateway-build: + build-docs: + # Skip for PRs from forks, where mkdocs-material-insiders is not available if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - runs-on: ubuntu-latest - defaults: - run: - working-directory: gateway - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - name: Install AWS - run: pip install awscli - - name: Install dependencies - run: pip install wheel build - - name: Compute version - run: echo VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) > $GITHUB_ENV - - name: Build package - run: | - echo "__version__ = \"${{ env.VERSION }}\"" > src/dstack/gateway/version.py - python -m build . 
- - name: Upload to S3 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | - WHEEL=dstack_gateway-${{ env.VERSION }}-py3-none-any.whl - aws s3 cp dist/$WHEEL "s3://dstack-gateway-downloads/stgn/$WHEEL" - echo "${{ env.VERSION }}" | aws s3 cp - "s3://dstack-gateway-downloads/stgn/latest-version" + uses: ./.github/workflows/build-docs.yml + secrets: inherit diff --git a/.github/workflows/close-inactive-issues.yml b/.github/workflows/close-inactive-issues.yml index 9cf2654032..1fcbd24292 100644 --- a/.github/workflows/close-inactive-issues.yml +++ b/.github/workflows/close-inactive-issues.yml @@ -1,24 +1,26 @@ name: Close inactive issues on: + workflow_dispatch: schedule: - cron: "30 1 * * *" jobs: close-issues: - if: github.repository == 'dstackai/dstack' runs-on: ubuntu-latest permissions: issues: write pull-requests: write steps: - - uses: actions/stale@v5 + - uses: actions/stale@v9 with: days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale. Please reopen the issue if it is still relevant." - close-issue-reason: not_planned - days-before-pr-stale: -1 - days-before-pr-close: -1 - repo-token: ${{ secrets.GH_TOKEN }} + days-before-pr-stale: 14 + days-before-pr-close: 7 + stale-pr-label: "stale" + stale-pr-message: "This PR is stale because it has been open for 14 days with no activity." + close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale. Please reopen the PR if it is still relevant." 
+ exempt-issue-labels: no-stale,major diff --git a/.github/workflows/docker-amd-smi.yml b/.github/workflows/docker-amd-smi.yml new file mode 100644 index 0000000000..df800fc3c0 --- /dev/null +++ b/.github/workflows/docker-amd-smi.yml @@ -0,0 +1,58 @@ +name: Build AMD SMI Docker image + +on: + workflow_dispatch: + inputs: + image_name: + description: "Docker image name" + required: true + default: "dstackai/amd-smi" + rocm_version: + description: "ROCm version" + required: true + default: "6.4" + dstack_revision: + description: "Docker image revision" + required: true + default: 0 + tag_latest: + description: "Update 'latest'" + type: boolean + default: false + +jobs: + build-amd-smi: + defaults: + run: + working-directory: docker/amd-smi + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and upload to DockerHub + run: | + IMAGE_NAME=${{ inputs.image_name }} + docker buildx build . 
\ + --load \ + --provenance=false \ + --platform linux/amd64 \ + --build-arg IMAGE_NAME=${IMAGE_NAME} \ + --build-arg UBUNTU_VERSION=noble \ + --build-arg ROCM_VERSION=${{ inputs.rocm_version }} \ + --build-arg DSTACK_REVISION=${{ inputs.dstack_revision }} \ + --build-arg BUILD_DATE=$(date --utc --iso-8601=seconds)Z \ + --tag ${IMAGE_NAME}:latest + VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${IMAGE_NAME}) + docker tag ${IMAGE_NAME}:latest ${IMAGE_NAME}:${VERSION} + docker push ${IMAGE_NAME}:${VERSION} + - name: Tag and push latest + if: ${{ inputs.tag_latest }} + run: | + docker push ${{ inputs.image_name }}:latest diff --git a/.github/workflows/docker-dind.yml b/.github/workflows/docker-dind.yml new file mode 100644 index 0000000000..ff03e9dcc5 --- /dev/null +++ b/.github/workflows/docker-dind.yml @@ -0,0 +1,46 @@ +name: Build DinD image + +on: + workflow_dispatch: + inputs: + image_name: + description: "Docker image name" + required: true + default: "dstackai/dind" + dstack_revision: + description: "Docker image revision" + required: true + default: 0 + +jobs: + build-dind: + defaults: + run: + working-directory: docker/dind + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and upload to DockerHub + run: | + IMAGE_NAME=${{ inputs.image_name }} + BUILD_DATE=$(date --utc --iso-8601=seconds)Z + docker buildx build . 
\ + --load \ + --provenance=false \ + --platform linux/amd64 \ + --build-arg IMAGE_NAME=${IMAGE_NAME} \ + --build-arg DSTACK_REVISION=${{ inputs.dstack_revision }} \ + --build-arg BUILD_DATE=${BUILD_DATE} \ + --tag ${IMAGE_NAME}:latest + VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${IMAGE_NAME}) + docker tag ${IMAGE_NAME}:latest ${IMAGE_NAME}:${VERSION} + docker push ${IMAGE_NAME}:${VERSION} + docker push ${IMAGE_NAME}:latest diff --git a/.github/workflows/docker-tt-smi.yml b/.github/workflows/docker-tt-smi.yml new file mode 100644 index 0000000000..444266ccdb --- /dev/null +++ b/.github/workflows/docker-tt-smi.yml @@ -0,0 +1,54 @@ +name: Build TT SMI Docker image + +on: + workflow_dispatch: + inputs: + image_name: + description: "Docker image name" + required: true + default: "dstackai/tt-smi" + tt_smi_version: + description: "TT SMI version" + required: true + default: "3.0.25" + tag_latest: + description: "Update 'latest'" + type: boolean + required: true + default: false + +jobs: + build-tt-smi: + defaults: + run: + working-directory: docker/tt-smi + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and upload to DockerHub + run: | + IMAGE_NAME=${{ inputs.image_name }} + docker buildx build . 
\ + --load \ + --provenance=false \ + --platform linux/amd64 \ + --build-arg IMAGE_NAME=${IMAGE_NAME} \ + --build-arg TT_SMI_VERSION=${{ inputs.tt_smi_version }} \ + --build-arg BUILD_DATE=$(date --utc --iso-8601=seconds)Z \ + --tag ${IMAGE_NAME}:${{ inputs.tt_smi_version }} + VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${IMAGE_NAME}:${{ inputs.tt_smi_version }}) + docker tag ${IMAGE_NAME}:${{ inputs.tt_smi_version }} ${IMAGE_NAME}:${VERSION} + docker push ${IMAGE_NAME}:${VERSION} + - name: Tag and push latest + if: ${{ inputs.tag_latest }} + run: | + docker tag ${{ inputs.image_name }}:${{ inputs.tt_smi_version }} ${{ inputs.image_name }}:latest + docker push ${{ inputs.image_name }}:latest diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 34d154134b..c17bf2cd95 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,281 +1,55 @@ -name: Build Docker & cloud images +name: Build Docker images +run-name: Build Docker images ${{ inputs.image_version }}${{ inputs.staging && ' (staging)' || '' }} on: workflow_dispatch: inputs: image_version: - description: "Docker image version" + description: "Image version" required: true staging: - description: "Staging build" + description: "Build staging images" type: boolean default: false - build_docker: - description: "Build docker images" - type: boolean - default: true - build_aws: - description: "Build AWS images" - type: boolean - default: true - build_azure: - description: "Build Azure images" - type: boolean - default: true - build_gcp: - description: "Build GCP images" - type: boolean - default: true - build_oci: - description: "Build OCI images" - type: boolean - default: true - build_nebius: - description: "Build Nebius images" - type: boolean - default: true env: - PACKER_VERSION: "1.9.2" - BUILD_PREFIX: ${{ inputs.staging && format('stgn-{0}-', github.run_number) || '' }} # staging ? 
prefix : '' + BUILD_DOCKER_REPO: ${{ inputs.staging && 'base-stgn' || 'base' }} jobs: build-docker: - if: inputs.build_docker defaults: run: working-directory: docker - runs-on: ubuntu-latest - strategy: - matrix: - python: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - name: Build and upload to DockerHub - run: | - docker buildx build --platform linux/amd64 --build-arg PYTHON=${{ matrix.python }} --push --provenance=false --tag dstackai/base:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1 -f base/Dockerfile . - - build-aws-images: - needs: build-docker - if: always() && inputs.build_aws && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') - defaults: - run: - working-directory: scripts/packer - runs-on: ubuntu-latest - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + runs-on: dstack-ubuntu-latest-32-cores strategy: matrix: - variant: [ "", "-cuda" ] + flavor: ["base", "devel", "devel-efa"] + ubuntu_version: ["24"] steps: - - uses: actions/checkout@v3 - - name: Download packer - run: | - wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - chmod +x packer - - name: Run packer - run: | - ./packer build -var-file=versions.json $PROD_VARS -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX aws-image${{ matrix.variant }}.json - env: - PROD_VARS: ${{ !inputs.staging && '-var-file=aws-vars-prod.json' || '' }} # production ? 
var-file : '' - - build-azure-images: - needs: build-docker - if: always() && inputs.build_azure && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') - defaults: - run: - working-directory: scripts/packer - runs-on: ubuntu-latest - env: - AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} - AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} - AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} - AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - VERSION: ${{ github.run_number }} - strategy: - matrix: - variant: [ "", "-cuda", "-grid" ] - steps: - - uses: actions/checkout@v3 - - uses: Azure/login@v1 - name: Log in to az + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 with: - creds: '{"clientId":"${{ secrets.AZURE_CLIENT_ID }}","clientSecret":"${{ secrets.AZURE_CLIENT_SECRET }}","subscriptionId":"${{ secrets.AZURE_SUBSCRIPTION_ID }}","tenantId":"${{ secrets.AZURE_TENANT_ID }}"}' - - name: Download packer - run: | - wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - chmod +x packer - - name: Run packer - run: | - ./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX azure-image${{ matrix.variant }}.json - - name: Publish azure image - if: ${{ !inputs.staging }} - run: | - IMAGE_DEFINITION=${BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} - IMAGE_NAME=${BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} - ../publish_azure_image.sh $IMAGE_DEFINITION $IMAGE_NAME - - build-gcp-images: - needs: build-docker - if: always() && inputs.build_gcp && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') - defaults: - run: 
- working-directory: scripts/packer - runs-on: ubuntu-latest - strategy: - matrix: - variant: [ "", "-cuda" ] - permissions: - contents: 'read' - id-token: 'write' - steps: - - uses: actions/checkout@v3 - - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - workload_identity_provider: 'projects/531508670106/locations/global/workloadIdentityPools/github-identity-pool/providers/github-id-provider' - service_account: 'github-actions@dstack.iam.gserviceaccount.com' - create_credentials_file: true - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Download packer - run: | - wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - chmod +x packer - - name: Run packer - run: | - ./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX gcp-image${{ matrix.variant }}.json - - name: Publish images - run: | - IMAGE_VERSION=${IMAGE_VERSION//./-} - gcloud compute images add-iam-policy-binding ${BUILD_PREFIX}dstack${{ matrix.variant }}-$IMAGE_VERSION --member='allAuthenticatedUsers' --role='roles/compute.imageUser' - env: - IMAGE_VERSION: ${{ inputs.image_version }} - - build-oci-images: - needs: build-docker - if: always() && inputs.build_oci && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') - runs-on: ubuntu-latest - env: - OCI_COMPARTMENT: ocid1.compartment.oc1..aaaaaaaaxu2uq64unfa2imwkp37icxqv6f7gwp2mczdt2mukuqbkauwqmbtq - OCI_SUBNET: ocid1.subnet.oc1.eu-frankfurt-1.aaaaaaaaewxkaqsmbi2tig5sfw4eexzo3mkb4zrpm4gwvfhqdddnxicxe4fa - OCI_AVAILABILITY_DOMAIN: kZql:EU-FRANKFURT-1-AD-3 - OCI_REGION: eu-frankfurt-1 - strategy: - matrix: - variant: [ "", "-cuda" ] - steps: - - uses: actions/checkout@v3 - - name: Setup OCI config - run: | - mkdir ~/.oci - echo 
"$OCI_KEY_CONTENT" > ~/.oci/key.pem - echo [DEFAULT] > ~/.oci/config - echo region=$OCI_REGION >> ~/.oci/config - echo tenancy=$OCI_TENANCY >> ~/.oci/config - echo user=$OCI_USER >> ~/.oci/config - echo fingerprint=$OCI_FINGERPRINT >> ~/.oci/config - echo key_file=~/.oci/key.pem >> ~/.oci/config - env: - OCI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - - name: Install packer - working-directory: scripts/packer - run: | - wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - chmod +x packer - ./packer plugins install github.com/hashicorp/oracle - - name: Run packer - working-directory: scripts/packer - run: | - ./packer build \ - -var-file=versions.json \ - -var image_version=${{ inputs.image_version }} \ - -var build_prefix=$BUILD_PREFIX \ - -var oci_compartment_ocid=$OCI_COMPARTMENT \ - -var oci_subnet_ocid=$OCI_SUBNET \ - -var oci_availability_domain=$OCI_AVAILABILITY_DOMAIN \ - oci-image${{ matrix.variant }}.json - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - name: Install dependencies for publishing - run: | - pip install .[oci] - - name: Copy image to target regions - if: ${{ !inputs.staging }} - run: | - python scripts/oci_image_tools.py copy \ - --image ${BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} \ - --from $OCI_REGION \ - --compartment $OCI_COMPARTMENT - - name: Publish image in OCI Marketplace - if: ${{ !inputs.staging }} - run: | - python scripts/oci_image_tools.py publish \ - --image ${BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} \ - --compartment $OCI_COMPARTMENT \ - --version ${{ inputs.image_version }} \ - --description "Image for running workloads with dstack - 
https://dstack.ai/" \ - --os "Ubuntu 22.04" \ - --contact-name dstack \ - --contact-email hello@dstack.ai - - build-nebius-images: - needs: build-docker - if: always() && inputs.build_nebius && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') - defaults: - run: - working-directory: scripts/packer - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Get Nebius CLI - run: | - echo "CLI_VERSION=$CLI_VERSION" - curl -sSL https://storage.ai.nebius.cloud/ncp/install.sh | bash - echo "$HOME/nebius-cloud/bin" >> $GITHUB_PATH - env: - CLI_VERSION: 0.113.0+Nebius-AI - - name: Write Nebius credentials - uses: jsdaniell/create-json@v1.2.2 - with: - name: "service_account.json" - json: ${{ secrets.NEBIUS_SERVICE_ACCOUNT }} - dir: "scripts/packer/" - - name: Setup Nebius profile - run: | - ncp config profile create packer - ncp config set endpoint api.ai.nebius.cloud:443 - ncp config set service-account-key service_account.json - rm service_account.json - - name: Download packer - run: | - wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip - chmod +x packer - ./packer init . - - name: Run packer (HCL2) - run: | - export PKR_VAR_nebius_token=$(ncp iam create-token) - ./packer build -only yandex.nebius,yandex.nebius-cuda -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX . 
- env: - PKR_VAR_nebius_folder_id: ${{ secrets.NEBIUS_FOLDER_ID }} - PKR_VAR_nebius_subnet_id: ${{ secrets.NEBIUS_SUBNET_ID }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Free up some space + run: | + df -h / + du -hs /usr/share/dotnet + rm -rf /usr/share/dotnet + df -h / + - name: Build and upload to DockerHub + run: | + docker buildx build \ + --platform linux/amd64 \ + --target ${{ matrix.flavor }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ + --build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \ + --provenance=false \ + --push \ + -f base/Dockerfile . diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 5ec3b892f9..3e0d5f3a75 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,37 +1,26 @@ -name: Deploy Docs +name: Build & Deploy Docs on: workflow_dispatch: - inputs: - release_tag: - description: "dstack version" jobs: - docs-deploy: + build-docs: + uses: ./.github/workflows/build-docs.yml + secrets: inherit + + deploy-docs: + needs: [build-docs] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/download-artifact@v4 with: - python-version: 3.11 - - name: Install dstack - run: | - if [ -n "${{ inputs.release_tag }}" ]; then - pip install "dstack==${{ inputs.release_tag }}" - else - pip install -e . 
- fi - - name: Build - run: | - pip install pillow cairosvg - sudo apt-get install -y libcairo2-dev libfreetype6-dev libffi-dev libjpeg-dev libpng-dev libz-dev - pip install mkdocs-material "mkdocs-material[imaging]" mkdocs-material-extensions mkdocs-redirects mkdocs-gen-files "mkdocstrings[python]" mkdocs-render-swagger-plugin --upgrade - pip install git+https://${{ secrets.GH_TOKEN }}@github.com/squidfunk/mkdocs-material-insiders.git - mkdocs build + name: site + path: site - name: Deploy - uses: JamesIves/github-pages-deploy-action@v4 + uses: JamesIves/github-pages-deploy-action@v4.6.4 with: repository-name: dstackai/dstackai.github.io branch: gh-pages token: ${{ secrets.GH_TOKEN }} - folder: site \ No newline at end of file + folder: site diff --git a/.github/workflows/gcp-a3mega-image.yml b/.github/workflows/gcp-a3mega-image.yml new file mode 100644 index 0000000000..2957671a1b --- /dev/null +++ b/.github/workflows/gcp-a3mega-image.yml @@ -0,0 +1,38 @@ +name: Build GCP A3 Mega VM image + +on: + - workflow_dispatch + +env: + PACKER_VERSION: "1.9.2" + IMAGE_VERSION: ${{ github.run_number }} +jobs: + build-gcp-images: + defaults: + run: + working-directory: scripts/packer + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + - uses: actions/checkout@v4 + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: 'projects/531508670106/locations/global/workloadIdentityPools/github-identity-pool/providers/github-id-provider' + service_account: 'github-actions@dstack.iam.gserviceaccount.com' + create_credentials_file: true + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + - name: Download packer + run: | + wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + chmod +x packer + - name: Run packer + run: | + 
./packer build -var image_version=${{ env.IMAGE_VERSION }} gcp-a3mega-image.json + - name: Publish image + run: | + gcloud compute images add-iam-policy-binding dstack-a3mega-${{ env.IMAGE_VERSION }} --member='allAuthenticatedUsers' --role='roles/compute.imageUser' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f70ecc1766..0618b00367 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,195 +8,109 @@ on: - "[0-9]+.[0-9]+.[0-9]+" - "[0-9]+.[0-9]+.[0-9]+.post[0-9]+" -env: - BUILD_INCREMENT: 150 - PIP_DISABLE_PIP_VERSION_CHECK: on - PIP_DEFAULT_TIMEOUT: 10 - PIP_PROGRESS_BAR: off - jobs: - python-lint: + compute-version: runs-on: ubuntu-latest + outputs: + version: ${{ steps.set-version.outputs.version }} + latest: ${{ steps.set-latest.outputs.latest }} steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - name: Set up Python + uses: astral-sh/setup-uv@v5 with: python-version: 3.11 - - run: python -m pip install pre-commit - - run: pre-commit run -a --show-diff-on-failure - - python-test: - needs: [ python-lint ] - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ macos-latest, ubuntu-latest] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install -U '.[all]' -r requirements_dev.txt - - name: Run pytest - run: pytest src/tests - - runner-test: - defaults: - run: - working-directory: runner - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Go - uses: actions/setup-go@v3 - with: - go-version: 1.21.1 - - name: golangci-lint - uses: golangci/golangci-lint-action@v3 - with: - version: v1.51.2 - args: --issues-exit-code=0 --timeout=20m - working-directory: runner - - name: Test + # settings to prevent warnings when running uv without 
a repo checkout + enable-cache: false + ignore-empty-workdir: true + - id: set-version run: | - go version - go fmt $(go list ./... | grep -v /vendor/) - go vet $(go list ./... | grep -v /vendor/) - go test -race $(go list ./... | grep -v /vendor/) - - runner-compile: - needs: [runner-test] - defaults: - run: - working-directory: runner - env: - REPO_NAME: github.com/dstackai/dstack - strategy: - matrix: - include: - - { goos: "linux", goarch: "amd64", runson: "ubuntu-latest" } - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - name: build - env: - GOOS: ${{ matrix.goos }} - GOARCH: ${{ matrix.goarch }} - ACTIONSOS: ${{ matrix.runson }} - CGO_ENABLED: 0 + echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + - id: set-latest run: | - VERSION=${GITHUB_REF#refs/tags/} - go build -ldflags "-X '$REPO_NAME/runner/cmd/runner/version.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner - go build -ldflags "-X '$REPO_NAME/runner/cmd/shim/version.Version=$VERSION' -extldflags '-static'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim - - uses: actions/upload-artifact@v3 - with: - name: dstack-runner - path: | - runner/dstack-runner-${{ matrix.goos }}-${{ matrix.goarch }} - runner/dstack-shim-${{ matrix.goos }}-${{ matrix.goarch }} + uv pip install packaging + VERSION=${{ steps.set-version.outputs.version }} + LATEST=$(python -c "from packaging import version as pkg_version; print('' if pkg_version.parse('$VERSION').is_prerelease else '1', end='')") + echo "latest=$LATEST" >> "$GITHUB_OUTPUT" - runner-upload: - needs: [gateway-build, runner-compile] - runs-on: ubuntu-latest - steps: - - name: Install AWS - run: pip install awscli - - name: Download Runner - uses: actions/download-artifact@v3 - with: - name: dstack-runner - path: runner - - name: Upload to S3 - working-directory: runner - env: - AWS_ACCESS_KEY_ID: ${{ 
secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | - VERSION=${GITHUB_REF#refs/tags/} - aws s3 cp . "s3://dstack-runner-downloads/$VERSION/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read - aws s3 cp . "s3://dstack-runner-downloads/latest/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read + build-artifacts: + needs: [compute-version] + uses: ./.github/workflows/build-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + staging: false + go-integration-tests: false + + upload-pre-pypi-artifacts: + needs: [compute-version, build-artifacts] + uses: ./.github/workflows/upload-pre-pypi-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + staging: false + secrets: inherit pypi-upload: - needs: [ python-test, runner-upload ] + needs: [compute-version, upload-pre-pypi-artifacts] runs-on: ubuntu-latest - outputs: - LATEST: ${{ steps.set_latest.outputs.LATEST }} - name: Set latest variable steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.9 - uses: actions/setup-python@v4 + - name: Set up uv + uses: astral-sh/setup-uv@v5 with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install wheel twine packaging - - name: Set output - id: set_latest - run: | - VERSION=${GITHUB_REF#refs/tags/} - LATEST=$(python -c "from packaging import version as pkg_version; print('' if pkg_version.parse('$VERSION').is_prerelease else '1', end='')") - echo "LATEST=$LATEST" >> "$GITHUB_OUTPUT" - - name: Upload pip package + python-version: 3.11 + # settings to prevent warnings when running uv without a repo checkout + enable-cache: false + ignore-empty-workdir: true + - name: Download Python package + uses: actions/download-artifact@v4 + with: + name: python-build + path: dist + - name: Upload Python package to PyPI run: | - VERSION=${GITHUB_REF#refs/tags/} - BASE_IMAGE=$(cat 
src/dstack/version.py | grep base_image) - echo "__version__ = \"$VERSION\"" > src/dstack/version.py - echo "__is_release__ = True" >> src/dstack/version.py - echo $BASE_IMAGE >> src/dstack/version.py - cp README.md src - python setup.py sdist bdist_wheel -v - python -m twine upload --repository pypi --username ${{ secrets.PYPI_USERNAME }} --password ${{ secrets.PYPI_PASSWORD }} dist/* + uv publish --username ${{ secrets.PYPI_USERNAME }} --password ${{ secrets.PYPI_PASSWORD }} - update-get-dstack-tag: - needs: [ pypi-upload ] - runs-on: ubuntu-latest - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - steps: - - name: Install AWS - run: pip install awscli - - run: | - VERSION=${GITHUB_REF#refs/tags/} - echo $VERSION | aws s3 cp - s3://get-dstack/cli/latest-version --acl public-read + upload-post-pypi-artifacts: + needs: [compute-version, pypi-upload] + uses: ./.github/workflows/upload-post-pypi-artifacts.yml + with: + version: ${{ needs.compute-version.outputs.version }} + is-latest-version: ${{ needs.compute-version.outputs.latest == '1' }} + staging: false + secrets: inherit server-docker-upload: - needs: [ pypi-upload ] + needs: [compute-version, pypi-upload] defaults: run: working-directory: docker/server runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Build and upload to DockerHub run: | - VERSION=${GITHUB_REF#refs/tags/} + VERSION=${{ needs.compute-version.outputs.version }} docker buildx build --platform linux/arm64/v8 --build-arg 
VERSION=$VERSION --push --provenance=false --tag dstackai/dstack:$VERSION-arm64 -f release/Dockerfile . docker buildx build --platform linux/amd64 --build-arg VERSION=$VERSION --push --provenance=false --tag dstackai/dstack:$VERSION-amd64 -f release/Dockerfile . + docker buildx build --platform linux/arm64/v8 --build-arg BASE_IMAGE=dstackai/dstack:$VERSION-arm64 --push --provenance=false --tag dstackai/dstack:nebius-$VERSION-arm64 -f Dockerfile.nebius . + docker buildx build --platform linux/amd64 --build-arg BASE_IMAGE=dstackai/dstack:$VERSION-amd64 --push --provenance=false --tag dstackai/dstack:nebius-$VERSION-amd64 -f Dockerfile.nebius . docker manifest create dstackai/dstack:$VERSION --amend dstackai/dstack:$VERSION-arm64 --amend dstackai/dstack:$VERSION-amd64 docker manifest push dstackai/dstack:$VERSION - if [ -n "${{ needs.pypi-upload.outputs.LATEST }}" ]; then + docker manifest create dstackai/dstack:nebius-$VERSION --amend dstackai/dstack:nebius-$VERSION-arm64 --amend dstackai/dstack:nebius-$VERSION-amd64 + docker manifest push dstackai/dstack:nebius-$VERSION + if [ -n "${{ needs.compute-version.outputs.latest }}" ]; then docker manifest create dstackai/dstack:latest --amend dstackai/dstack:$VERSION-arm64 --amend dstackai/dstack:$VERSION-amd64 docker manifest push dstackai/dstack:latest + docker manifest create dstackai/dstack:nebius-latest --amend dstackai/dstack:nebius-$VERSION-arm64 --amend dstackai/dstack:nebius-$VERSION-amd64 + docker manifest push dstackai/dstack:nebius-latest fi - name: Docker Hub Description uses: peter-evans/dockerhub-description@v4 @@ -205,62 +119,3 @@ jobs: password: ${{ secrets.DOCKERHUB_TOKEN }} repository: dstackai/dstack readme-filepath: ./docker/server/README.md - - generate-json-schema: - needs: [ pypi-upload ] - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: 
actions/setup-python@v4 - with: - python-version: 3.11 - - name: Install AWS - run: pip install awscli - - name: Install dstack - run: pip install . - - name: Generate json schema - run: | - python -c "from dstack._internal.core.models.configurations import DstackConfiguration; print(DstackConfiguration.schema_json(indent=2))" > configuration.json - python -c "from dstack._internal.core.models.profiles import ProfilesConfig; print(ProfilesConfig.schema_json(indent=2))" > profiles.json - - name: Upload json schema to S3 - run: | - VERSION=${GITHUB_REF#refs/tags/} - aws s3 cp configuration.json "s3://dstack-runner-downloads/$VERSION/schemas/configuration.json" --acl public-read - aws s3 cp profiles.json "s3://dstack-runner-downloads/$VERSION/schemas/profiles.json" --acl public-read - if [ -n "${{ needs.pypi-upload.outputs.LATEST }}" ]; then - aws s3 cp configuration.json "s3://dstack-runner-downloads/latest/schemas/configuration.json" --acl public-read - aws s3 cp profiles.json "s3://dstack-runner-downloads/latest/schemas/profiles.json" --acl public-read - fi - - gateway-build: - runs-on: ubuntu-latest - defaults: - run: - working-directory: gateway - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - name: Install AWS - run: pip install awscli - - name: Install dependencies - run: pip install wheel build - - name: Store version - run: echo VERSION=${GITHUB_REF#refs/tags/} > $GITHUB_ENV - - name: Build package - run: | - echo "__version__ = \"${{ env.VERSION }}\"" > src/dstack/gateway/version.py - python -m build . 
- - name: Upload to S3 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | - WHEEL=dstack_gateway-${{ env.VERSION }}-py3-none-any.whl - aws s3 cp dist/$WHEEL "s3://dstack-gateway-downloads/release/$WHEEL" - echo "${{ env.VERSION }}" | aws s3 cp - "s3://dstack-gateway-downloads/release/latest-version" diff --git a/.github/workflows/upload-post-pypi-artifacts.yml b/.github/workflows/upload-post-pypi-artifacts.yml new file mode 100644 index 0000000000..4d9899deda --- /dev/null +++ b/.github/workflows/upload-post-pypi-artifacts.yml @@ -0,0 +1,55 @@ +name: Upload Post-PyPI Artifacts + +on: + workflow_call: + inputs: + version: + type: string + required: true + is-latest-version: + type: boolean + required: true + staging: + type: boolean + required: true + +jobs: + upload-post-pypi-artifacts: + runs-on: ubuntu-latest + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + steps: + - uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + # settings to prevent warnings when running uv without a repo checkout + enable-cache: false + ignore-empty-workdir: true + - name: Install AWS + run: uv tool install awscli + - name: Download JSON schemas + uses: actions/download-artifact@v4 + with: + name: json-schemas + path: json-schemas + - name: Upload JSON schemas to S3 + working-directory: json-schemas + run: | + BUCKET=dstack-runner-downloads + if [ "${{ inputs.staging }}" = "true" ]; then + BUCKET=dstack-runner-downloads-stgn + fi + aws s3 cp configuration.json "s3://$BUCKET/${{ inputs.version }}/schemas/configuration.json" --acl public-read + aws s3 cp profiles.json "s3://$BUCKET/${{ inputs.version }}/schemas/profiles.json" --acl public-read + if [ "${{ inputs.is-latest-version }}" = "true" ]; then + aws s3 cp configuration.json "s3://$BUCKET/latest/schemas/configuration.json" --acl public-read + aws s3 cp profiles.json 
"s3://$BUCKET/latest/schemas/profiles.json" --acl public-read + fi + - name: Set latest version in S3 + run: | + CHANNEL=cli + if [ "${{ inputs.staging }}" = "true" ]; then + CHANNEL=stgn-cli + fi + echo ${{ inputs.version }} | aws s3 cp - s3://get-dstack/$CHANNEL/latest-version --acl public-read diff --git a/.github/workflows/upload-pre-pypi-artifacts.yml b/.github/workflows/upload-pre-pypi-artifacts.yml new file mode 100644 index 0000000000..542f445e2e --- /dev/null +++ b/.github/workflows/upload-pre-pypi-artifacts.yml @@ -0,0 +1,57 @@ +name: Upload Pre-PyPI Artifacts + +on: + workflow_call: + inputs: + version: + type: string + required: true + staging: + type: boolean + required: true + +jobs: + upload-pre-pypi-artifacts: + runs-on: ubuntu-latest + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + steps: + - uses: astral-sh/setup-uv@v5 + with: + python-version: 3.11 + # settings to prevent warnings when running uv without a repo checkout + enable-cache: false + ignore-empty-workdir: true + - name: Install AWS + run: uv tool install awscli + - name: Download dstack-gateway + uses: actions/download-artifact@v4 + with: + name: dstack-gateway + path: gateway + - name: Upload dstack-gateway to S3 + working-directory: gateway + run: | + CHANNEL=release + if [ "${{ inputs.staging }}" = "true" ]; then + CHANNEL=stgn + fi + WHEEL=dstack_gateway-${{ inputs.version }}-py3-none-any.whl + aws s3 cp $WHEEL "s3://dstack-gateway-downloads/$CHANNEL/$WHEEL" + echo "${{ inputs.version }}" | aws s3 cp - "s3://dstack-gateway-downloads/$CHANNEL/latest-version" + - name: Download dstack-runner + uses: actions/download-artifact@v4 + with: + pattern: dstack-runner-* + merge-multiple: true + path: runner + - name: Upload dstack-runner to S3 + working-directory: runner + run: | + BUCKET=dstack-runner-downloads + if [ "${{ inputs.staging }}" = "true" ]; then + BUCKET=dstack-runner-downloads-stgn + fi + aws s3 cp . 
"s3://$BUCKET/${{ inputs.version }}/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read + aws s3 cp . "s3://$BUCKET/latest/binaries/" --recursive --exclude "*" --include "dstack-*" --acl public-read diff --git a/.github/workflows/vm-images.yml b/.github/workflows/vm-images.yml new file mode 100644 index 0000000000..6251fa2027 --- /dev/null +++ b/.github/workflows/vm-images.yml @@ -0,0 +1,203 @@ +name: Build VM images +run-name: Build VM images ${{ inputs.image_version }}${{ inputs.staging && ' (staging)' || '' }} + +on: + workflow_dispatch: + inputs: + image_version: + description: "Image version" + required: true + staging: + description: "Build staging images" + type: boolean + default: false + build_aws: + description: "Build AWS images" + type: boolean + default: true + build_azure: + description: "Build Azure images" + type: boolean + default: true + build_gcp: + description: "Build GCP images" + type: boolean + default: true + build_oci: + description: "Build OCI images" + type: boolean + default: true + +env: + PACKER_VERSION: "1.9.2" + VM_IMAGE_BUILD_PREFIX: ${{ inputs.staging && format('stgn-{0}-', github.run_number) || '' }} # staging ? 
prefix : '' + +jobs: + build-aws-images: + if: inputs.build_aws + defaults: + run: + working-directory: scripts/packer + runs-on: ubuntu-latest + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + strategy: + matrix: + variant: ["", "-cuda"] + steps: + - uses: actions/checkout@v4 + - name: Download packer + run: | + wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + chmod +x packer + - name: Run packer + run: | + ./packer build -var-file=versions.json $PROD_VARS -var image_version=${{ inputs.image_version }} -var build_prefix=$VM_IMAGE_BUILD_PREFIX aws-image${{ matrix.variant }}.json + env: + PROD_VARS: ${{ !inputs.staging && '-var-file=aws-vars-prod.json' || '' }} # production ? var-file : '' + + build-azure-images: + if: inputs.build_azure + defaults: + run: + working-directory: scripts/packer + runs-on: ubuntu-latest + env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + VERSION: ${{ github.run_number }} + strategy: + matrix: + variant: ["", "-cuda", "-grid"] + steps: + - uses: actions/checkout@v4 + - uses: Azure/login@v2 + name: Log in to az + with: + creds: '{"clientId":"${{ secrets.AZURE_CLIENT_ID }}","clientSecret":"${{ secrets.AZURE_CLIENT_SECRET }}","subscriptionId":"${{ secrets.AZURE_SUBSCRIPTION_ID }}","tenantId":"${{ secrets.AZURE_TENANT_ID }}"}' + - name: Download packer + run: | + wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + chmod +x packer + - name: Run packer + run: | + ./packer build 
-var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$VM_IMAGE_BUILD_PREFIX azure-image${{ matrix.variant }}.json + - name: Publish azure image + if: ${{ !inputs.staging }} + run: | + IMAGE_DEFINITION=${VM_IMAGE_BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} + IMAGE_NAME=${VM_IMAGE_BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} + ../publish_azure_image.sh $IMAGE_DEFINITION $IMAGE_NAME + + build-gcp-images: + if: inputs.build_gcp + defaults: + run: + working-directory: scripts/packer + runs-on: ubuntu-latest + strategy: + matrix: + variant: ["", "-cuda"] + permissions: + contents: "read" + id-token: "write" + steps: + - uses: actions/checkout@v4 + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: "projects/531508670106/locations/global/workloadIdentityPools/github-identity-pool/providers/github-id-provider" + service_account: "github-actions@dstack.iam.gserviceaccount.com" + create_credentials_file: true + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + - name: Download packer + run: | + wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + chmod +x packer + - name: Run packer + run: | + ./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$VM_IMAGE_BUILD_PREFIX gcp-image${{ matrix.variant }}.json + - name: Publish images + run: | + IMAGE_VERSION=${IMAGE_VERSION//./-} + gcloud compute images add-iam-policy-binding ${VM_IMAGE_BUILD_PREFIX}dstack${{ matrix.variant }}-$IMAGE_VERSION --member='allAuthenticatedUsers' --role='roles/compute.imageUser' + env: + IMAGE_VERSION: ${{ inputs.image_version }} + + build-oci-images: + if: inputs.build_oci + runs-on: ubuntu-latest + env: + OCI_COMPARTMENT: 
ocid1.compartment.oc1..aaaaaaaaxu2uq64unfa2imwkp37icxqv6f7gwp2mczdt2mukuqbkauwqmbtq + OCI_SUBNET: ocid1.subnet.oc1.eu-frankfurt-1.aaaaaaaaewxkaqsmbi2tig5sfw4eexzo3mkb4zrpm4gwvfhqdddnxicxe4fa + OCI_AVAILABILITY_DOMAIN: kZql:EU-FRANKFURT-1-AD-3 + OCI_REGION: eu-frankfurt-1 + strategy: + matrix: + variant: ["", "-cuda"] + steps: + - uses: actions/checkout@v4 + - name: Setup OCI config + run: | + mkdir ~/.oci + echo "$OCI_KEY_CONTENT" > ~/.oci/key.pem + echo [DEFAULT] > ~/.oci/config + echo region=$OCI_REGION >> ~/.oci/config + echo tenancy=$OCI_TENANCY >> ~/.oci/config + echo user=$OCI_USER >> ~/.oci/config + echo fingerprint=$OCI_FINGERPRINT >> ~/.oci/config + echo key_file=~/.oci/key.pem >> ~/.oci/config + env: + OCI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} + OCI_USER: ${{ secrets.OCI_CLI_USER }} + OCI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} + OCI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} + - name: Install packer + working-directory: scripts/packer + run: | + wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip + chmod +x packer + ./packer plugins install github.com/hashicorp/oracle + - name: Run packer + working-directory: scripts/packer + run: | + ./packer build \ + -var-file=versions.json \ + -var image_version=${{ inputs.image_version }} \ + -var build_prefix=$VM_IMAGE_BUILD_PREFIX \ + -var oci_compartment_ocid=$OCI_COMPARTMENT \ + -var oci_subnet_ocid=$OCI_SUBNET \ + -var oci_availability_domain=$OCI_AVAILABILITY_DOMAIN \ + oci-image${{ matrix.variant }}.json + - uses: astral-sh/setup-uv@v5 + with: + python-version: "3.12" + - name: Install dependencies for publishing + run: | + uv sync --extra oci + - name: Copy image to target regions + if: ${{ !inputs.staging }} + run: | + uv run scripts/oci_image_tools.py copy \ + --image ${VM_IMAGE_BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version 
}} \ + --from $OCI_REGION \ + --compartment $OCI_COMPARTMENT + - name: Publish image in OCI Marketplace + if: ${{ !inputs.staging }} + run: | + uv run scripts/oci_image_tools.py publish \ + --image ${VM_IMAGE_BUILD_PREFIX}dstack${{ matrix.variant }}-${{ inputs.image_version }} \ + --compartment $OCI_COMPARTMENT \ + --version ${{ inputs.image_version }} \ + --description "Image for running workloads with dstack - https://dstack.ai/" \ + --os "Ubuntu 22.04" \ + --contact-name dstack \ + --contact-email hello@dstack.ai diff --git a/.gitignore b/.gitignore index b723429d13..46bc383130 100644 --- a/.gitignore +++ b/.gitignore @@ -3,17 +3,30 @@ *.egg-info dist/ -/venv/ +build/ +venv/ +/site/ /.cache/ .pytest_cache/ .coverage .idea/ +.fleet +.vscode +.aider* +.local/ +.DS_Store +.env +.envrc +uv.lock /runner/cmd/shim/shim /runner/cmd/runner/runner -build/ -.DS_Store -.fleet -.env \ No newline at end of file +/src/dstack/_internal/server/statics + +profiling_results.html + +mkdocs/docs/reference/http/openapi.json +mkdocs/docs/reference/api/rest/openapi.json +mkdocs/docs/reference/plugins/rest/rest_plugin_openapi.json diff --git a/.justfile b/.justfile new file mode 100644 index 0000000000..efa8c87f61 --- /dev/null +++ b/.justfile @@ -0,0 +1,21 @@ +# Root justfile +# +# This justfile serves as the main entry point to recipes from different components. +# +# Run `just` to see all available commands. 
+# +# Components: +# * runner/.justfile – Building and uploading dstack runner and shim +# * frontend/.justfile – Building and running the frontend +# * mkdocs/.justfile – Building and previewing the docs site + +default: + @just --list + +set allow-duplicate-recipes + +import "runner/.justfile" + +import "frontend/.justfile" + +import "mkdocs/.justfile" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f49b863ff..c780267f2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,14 +1,33 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.12.7 # Should match pyproject.toml hooks: - id: ruff name: ruff common args: ['--fix'] - id: ruff-format - repo: https://github.com/golangci/golangci-lint - rev: v1.58.1 + rev: v2.6.2 # Should match .github/workflows/build-artifacts.yml hooks: - id: golangci-lint-full - entry: bash -c 'cd runner && golangci-lint run -D depguard --presets import,module,unused "$@"' + alias: runner-fix + language_version: 1.25.0 # Should match runner/go.mod + entry: bash -c 'cd runner && golangci-lint run --fix' stages: [manual] + - id: golangci-lint-full + alias: runner-lint + language_version: 1.25.0 # Should match runner/go.mod + entry: bash -c 'cd runner && golangci-lint run' + stages: [manual] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: end-of-file-fixer + - repo: local + hooks: + - id: frontend-pre-commit + name: frontend-pre-commit + entry: bash -c "cd frontend && npm install && npm run pre-commit" + language: system + pass_filenames: false + files: ^frontend/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..0bbee7d51c --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,37 @@ +# Repository Guidelines + +## Project Structure & Module Organization +- Core Python package lives in `src/dstack`; internal modules (including server) sit 
under `_internal`, API surfaces under `api`, and plugin integrations under `plugins`. +- Tests reside in `src/tests` and mirror package paths; add new suites alongside the code they cover. +- Frontend lives in `frontend` (React/webpack) and is built into `src/dstack/_internal/server/statics`. +- Docs sources are in `mkdocs/docs/` with extra contributor notes in `contributing/*.md`. + +## Build, Test, and Development Commands +- Install deps (editable package with extras): `uv sync --all-extras` (uses `.venv` in repo). +- Run CLI/server from source: `uv run dstack ...` (e.g., `uv run dstack server --port 8000`). +- Lint/format: `uv run ruff check .` and `uv run ruff format .`. +- Type check: `uv run pyright -p .`. +- Test suite: `uv run pytest`. +- Frontend: from `frontend/` run `npm install`, `npm run build`, then copy `frontend/build` into `src/dstack/_internal/server/statics/`; for dev, `npm run start` with API on port 8000. + +## Coding Style & Naming Conventions +- Python targets 3.10+ with 4-space indentation and max line length of 99 (see `pyproject.toml`; `E501` is ignored but keep lines readable). +- Imports are sorted via Ruff’s isort settings (`dstack` treated as first-party). +- Keep primary/public functions before local helper functions in a module section. +- Roughly keep function definitions in the order they are referenced within a file so call flow stays easy to follow. +- Prefer early returns over nested `if`/`else` blocks when they make the control flow simpler. +- Keep private classes, exceptions, and similar implementation-specific types close to the private functions that use them unless they are shared more broadly in the module. +- Prefer pydantic-style models in `core/models`. +- Document attributes when the note adds behavior, compatibility, or semantic context that is not obvious from the name and type. Use attribute docstrings without a leading newline. +- Tests use `test_*.py` modules and `test_*` functions; fixtures live near usage.
+ +## Testing Guidelines +- Default to `uv run pytest`. Use markers from `tests/conftest.py` like `--runpostgres` if you need to include specific tests. +- Group tests for the same unit (function/class) using `Test*` classes that mirror the unit's name. +- Keep tests hermetic (network disabled except localhost per `pytest.ini`); stub cloud calls with mocks. + +## Commit & Pull Request Guidelines +- Commit messages follow the existing style: short, imperative summaries (e.g., “Fix exclude_not_available ignored”); include rationale in the body if needed. +- For PRs, describe behavior changes and link related issues. +- Include screenshots or terminal output when touching UX/CLI messages or frontend flows. +- Always disclose AI Assistance in PRs. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f7a694a409..e373738b1b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -3,7 +3,7 @@ ## Our Pledge In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and +contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal @@ -11,7 +11,7 @@ appearance, race, religion, or sexual identity and orientation. 
## Our Standards -Examples of behavior that contributes to creating a positive environment +Examples of behavior that contribute to creating a positive environment include: * Using welcoming and inclusive language @@ -34,12 +34,12 @@ Examples of unacceptable behavior by participants include: ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in +behavior, and they are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or +that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. @@ -58,7 +58,7 @@ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at hello@dstack.ai. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. +obligated to maintain confidentiality regarding the reporter of an incident. Further details of specific enforcement policies may be posted separately. 
Project maintainers who do not follow or enforce the Code of Conduct in good @@ -73,4 +73,4 @@ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.ht [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq \ No newline at end of file +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 24705478e5..1782025bb6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,6 +2,19 @@ We appreciate your interest in contributing to `dstack`! This document will help you get up to speed with `dstack` codebase and guide you through the contribution process. +## AI Assistance Notice + +If you are using any kind of AI assistance while contributing to `dstack`, +**this must be disclosed in the pull request**, along with the extent to +which AI assistance was used. +As an exception, tab-completions and trivial PRs don't need to be disclosed. + +An example disclosure: + +> This PR was written primarily by Claude Code. + +Failure to disclose this makes it difficult to determine how much scrutiny to apply to the contribution. Please be respectful to maintainers and disclose AI assistance. + ## Set up your development environment Follow [contributing/DEVELOPMENT.md](contributing/DEVELOPMENT.md). @@ -22,21 +35,40 @@ If you make a non-trivial change to `dstack`, we recommend you learn about `dsta * Bug fixes that address a clearly defined bug. Include steps to reproduce in the linked issue or the PR. * New features. Before submitting a feature PR, create an issue with a proposal to discuss it with the core team and other interested parties. * Minor fixes such as typos. -* [Examples](examples/README.md). +* [Examples](examples). 
### Before pushing your changes We use [`ruff`](https://docs.astral.sh/ruff/) to format Python code and to sort Python imports. Before committing your changes, run: -1. `ruff check --fix` -2. `ruff format` +1. `uv run ruff check --fix` +2. `uv run ruff format` > There are also helper pre-commits installed for [`ruff`](https://docs.astral.sh/ruff/integrations/#pre-commit) that make commits fail if the code is not formatted or the imports are not sorted. They also change the code as required so that you can review the changes and commit again. +### Run tests + +It's recommended to run tests locally before running them in CI. +To run Python tests, first ensure you've installed dev dependencies as described in [contributing/DEVELOPMENT.md](contributing/DEVELOPMENT.md). +Then you can do: + +```shell +uv run pytest src/tests +``` + +(Optionally) By default, tests run against SQLite. +Use the `--runpostgres` flag to run the tests against Postgres as well: + +```shell +uv run pytest src/tests --runpostgres +``` + ## Add a new backend If you'd like to integrate a new cloud provider to `dstack`, follow [contributing/BACKENDS.md](contributing/BACKENDS.md). -## Get help +## What's next + +You can find more subject-focused guides in the [contributing](contributing/) directory. If you have any questions, you can always get help in our [Discord](https://discord.gg/u8SmfwPpMd) community. 
diff --git a/README.md b/README.md index d1d658c32c..b1e616f1b9 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,55 @@ -
+

- - dstack + + dstack

[![Last commit](https://img.shields.io/github/last-commit/dstackai/dstack?style=flat-square)](https://github.com/dstackai/dstack/commits/) [![PyPI - License](https://img.shields.io/pypi/l/dstack?style=flat-square&color=blue)](https://github.com/dstackai/dstack/blob/master/LICENSE.md) -[![Discord](https://dcbadge.vercel.app/api/server/u8SmfwPpMd?style=flat-square)](https://discord.gg/CBgdrGnZjy) +[![Discord](https://img.shields.io/discord/1106906313969123368?style=flat-square)](https://discord.gg/u8SmfwPpMd)
-`dstack` is an open-source container orchestration engine designed for running AI workloads across any cloud or data -center. It simplifies dev environments, running tasks on clusters, and deployment. +`dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters. -The supported cloud providers include AWS, GCP, Azure, OCI, Lambda, TensorDock, Vast.ai, RunPod, and CUDO. -You can also use `dstack` to run workloads on on-prem clusters. +It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks. -`dstack` natively supports NVIDIA GPU, and Google Cloud TPU accelerator chips. - -## Latest news ✨ - -- [2024/05] [dstack 0.18.4: Google Cloud TPU, and more](https://github.com/dstackai/dstack/releases/tag/0.18.4) (Release) -- [2024/05] [dstack 0.18.3: OCI, and more](https://github.com/dstackai/dstack/releases/tag/0.18.3) (Release) -- [2024/05] [dstack 0.18.2: On-prem clusters, private subnets, and more](https://github.com/dstackai/dstack/releases/tag/0.18.2) (Release) -- [2024/04] [dstack 0.18.0: RunPod, multi-node tasks, and more](https://github.com/dstackai/dstack/releases/tag/0.18.0) (Release) -- [2024/03] [dstack 0.17.0: Auto-scaling, and other improvements](https://github.com/dstackai/dstack/releases/tag/0.17.0) (Release) - -## Installation +#### Accelerators -Before using `dstack` through CLI or API, set up a `dstack` server. +`dstack` supports `NVIDIA`, `AMD`, `Google TPU`, and `Tenstorrent` accelerators out of the box. 
-### Install the server - -The easiest way to install the server, is via `pip`: - -```shell -pip install "dstack[all]" -U -``` +## Latest news ✨ +- [2026/04] [dstack 0.20.17: PD disaggregation, Kubernetes volumes](https://github.com/dstackai/dstack/releases/tag/0.20.17) +- [2026/04] [dstack 0.20.16: Performance, SSH proxy](https://github.com/dstackai/dstack/releases/tag/0.20.16) +- [2026/03] [dstack 0.20.13: Exports, Templates](https://github.com/dstackai/dstack/releases/tag/0.20.13) +- [2026/02] [dstack 0.20.12: Crusoe](https://github.com/dstackai/dstack/releases/tag/0.20.12) +- [2026/02] [dstack 0.20.8: Skills](https://github.com/dstackai/dstack/releases/tag/0.20.8) +- [2025/12] [dstack 0.20.0: Fleet-first UX, Events, and more](https://github.com/dstackai/dstack/releases/tag/0.20.0) -### Configure backends +## How does it work? -If you have default AWS, GCP, Azure, or OCI credentials on your machine, the `dstack` server will pick them up automatically. + + + + -Otherwise, you need to manually specify the cloud credentials in `~/.dstack/server/config.yml`. +### Launch the server -See the [server/config.yml reference](https://dstack.ai/docs/reference/server/config.yml.md#examples) -for details on how to configure backends for all supported cloud providers. +> Before using `dstack` through CLI or API, set up a `dstack` server. If you already have a running `dstack` server, you only need to [install the CLI](#install-the-cli). -### Start the server +To orchestrate compute across GPU clouds or Kubernetes clusters, you need to [configure backends](https://dstack.ai/docs/concepts/backends). -To start the server, use the `dstack server` command: +> When using `dstack` with on-prem servers, backend configuration isn’t required. 
Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets) once the server is up. -
+The server can be installed on Linux, macOS, and Windows (via WSL 2). It requires Git and +OpenSSH. ```shell +$ uv tool install "dstack[all]" -U $ dstack server Applying ~/.dstack/server/config.yml... @@ -66,76 +58,76 @@ The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" The server is running at http://127.0.0.1:3000/ ``` -
- -> **Note** -> It's also possible to run the server via [Docker](https://hub.docker.com/r/dstackai/dstack). +> For more details on server configuration options, see the +[Server deployment](https://dstack.ai/docs/guides/server-deployment) guide. -### CLI & API +### Install the CLI -Once the server is up, you can use either `dstack`'s CLI or API to run workloads. -Below is a live demo of how it works with the CLI. +
If the CLI is not installed with the server -### Dev environments +Once the server is up, you can access it via the `dstack` CLI. -You specify the required environment and resources, then run it. `dstack` provisions the dev -environment in the cloud and enables access via your desktop IDE. +The CLI can be installed on Linux, macOS, and Windows. It requires Git and OpenSSH. - - -### Tasks - -Tasks allow for convenient scheduling of any kind of batch jobs, such as training, fine-tuning, -or data processing, as well as running web applications. +```shell +$ uv tool install dstack -U +``` -Specify the environment and resources, then run it. `dstack` executes the task in the -cloud, enabling port forwarding to your local machine for convenient access. +To point the CLI to the `dstack` server, configure it +with the server address, user token, and project name: - +```shell +$ dstack project add \ + --name main \ + --url http://127.0.0.1:3000 \ + --token bbae0f28-d3dd-4820-bf61-8f4bb40815da -### Services +Configuration is updated at ~/.dstack/config.yml +``` -Services make it very easy to deploy any kind of model or web application as public endpoints. +
-Use any serving frameworks and specify required resources. `dstack` deploys it in the configured -backend, handles authorization, and provides an OpenAI-compatible interface if needed. +### Install agent skills - +Install [`dstack` skills](https://skills.sh/dstackai/dstack/dstack) to help AI agents use the CLI and edit configuration files. -### Pools +```shell +$ npx skills add dstackai/dstack +``` -Pools simplify managing the lifecycle of cloud instances and enable their efficient reuse across runs. +AI agents like Claude, Codex, and Cursor can now create and manage fleets and submit workloads on your behalf. -You can have instances provisioned in the cloud automatically, or add them manually, configuring the required resources, -idle duration, etc. +### Define configurations - +`dstack` supports the following configurations: + +* [Fleets](https://dstack.ai/docs/concepts/fleets) — for managing cloud and on-prem clusters +* [Dev environments](https://dstack.ai/docs/concepts/dev-environments) — for interactive development using a desktop IDE +* [Tasks](https://dstack.ai/docs/concepts/tasks) — for scheduling jobs (incl. distributed jobs) or running web apps +* [Services](https://dstack.ai/docs/concepts/services) — for deployment of models and web apps (with auto-scaling and authorization) +* [Volumes](https://dstack.ai/docs/concepts/volumes) — for managing persisted volumes -## Examples +Configuration can be defined as YAML files within your repo. 
-Here are some featured examples: +### Apply configurations -- [Llama 3](examples/llms/llama3) -- [Alignment Handbook](examples/fine-tuning/alignment-handbook) -- [vLLM](examples/deployment/vllm) -- [Axolotl](examples/fine-tuning/axolotl) -- [TGI](examples/deployment/tgi) -- [Ollama](examples/deployment/ollama) -- [LoRaX](examples/deployment/lorax) +Apply the configuration via the `dstack apply` CLI command, a programmatic API, or through [AI agent skills](#install-agent-skills). -Browse [examples](examples) for more examples. +`dstack` automatically manages provisioning, job queuing, auto-scaling, networking, volumes, run failures, +out-of-capacity errors, port-forwarding, and more — across clouds and on-prem clusters. -## More information +## Useful links -For additional information and examples, see the following links: +For additional information, see the following links: -- [Docs](https://dstack.ai/docs) -- [Discord](https://discord.gg/u8SmfwPpMd) +* [Docs](https://dstack.ai/docs) +* [Examples](https://dstack.ai/examples) +* [Discord](https://discord.gg/u8SmfwPpMd) ## Contributing -We welcome contributions to `dstack`! -To learn more about getting involved in the project, please refer to [CONTRIBUTING.md](CONTRIBUTING.md). +You're very welcome to contribute to `dstack`. +Learn more about how to contribute to the project at [CONTRIBUTING.md](CONTRIBUTING.md). 
## License diff --git a/contributing/ARCHITECTURE.md b/contributing/ARCHITECTURE.md index 4ede39296f..ca14f36245 100644 --- a/contributing/ARCHITECTURE.md +++ b/contributing/ARCHITECTURE.md @@ -11,66 +11,43 @@ The `dstack` platform consists of six major components: * Shim * Gateway (optional) -The server provides an HTTP API for submitting runs and managing all of the `dstack` functionality including users, -projects, backends, repos, secrets, and gateways. 
+The server provides an HTTP API for submitting runs and managing all of the `dstack` functionality including users, projects, backends, repos, secrets, and gateways. -The Python API consists of the low-level and high-level Python API. The low-level Python API is a Python wrapper around -the server's HTTP API. It's available as `dstack.api.server`. The high-level API provides a more convenient interface to -work with `dstack` programatically. It's available as `dstack.api`. The `dstack` CLI is implemented on top of the -high-level API. +The Python API consists of the low-level and high-level Python API. The low-level Python API is a Python wrapper around the server's HTTP API. It's available as `dstack.api.server`. The high-level API provides a more convenient interface to work with `dstack` programmatically. It's available as `dstack.api`. The `dstack` CLI is implemented on top of the high-level API. -When the server provisions a cloud instance for a run, it launches a Docker image with the runner inside the image. The -runner provides an HTTP API that the server uses for submitting the run, uploading the code, fetching logs and so on. +When the server provisions a cloud instance for a run, it launches a Docker image with the runner inside the image. The runner provides an HTTP API that the server uses for submitting the run, uploading the code, fetching logs and so on. -The shim may be or may not be present depending on which type of cloud is used. If it's a GPU cloud that provides an API -for running Docker images, then no shim is required. If it's a traditional cloud that provisions VMs, then the shim is -started on the VM launch. It pulls and runs the Docker image, controls its execution, and implements any cloud-specific -functionality such as terminating the instance. +The shim may or may not be present depending on which type of cloud is used. 
If it's a GPU cloud that provides an API for running Docker images, then no shim is required. 
If it's a traditional cloud that provisions VMs, then the shim is started on the VM launch. It pulls and runs the Docker image, controls its execution, and implements any cloud-specific functionality such as terminating the instance. -The gateway makes jobs available via a public URL. It works like a reverse proxy that forwards requests to the job -instance via an SSH tunnel. +The gateway makes jobs available via a public URL. It works like a reverse proxy that forwards requests to the job instance via an SSH tunnel. -## Implementation of `dstack run` +## Implementation of `dstack apply` -When a user invokes `dstack run`, the CLI first sends the run configuration and other profile parameters to the server -to get the run plan. The server iterates over configured backends to get all instance offers matching the requirements -and their availability. If the user is willing to proceed with the offers suggested, the CLI uploads the code from the -user's machine to the server and submits the run configuration. +When a user applies a run configuration with `dstack apply`, the CLI sends the run configuration and other profile parameters to the server to get the run plan. The server iterates over configured backends to get all instance offers matching the requirements +and their availability. If the user is willing to proceed with the offers suggested, the CLI uploads the code from the user's machine to the server and submits the run configuration. -Note: If a git repository is used, `dstack` only uploads the code diff. The runner then pulls the repository and applies -the diff to get the copy of the user's files. The `dstack init` command uploads git credentials to the server so that -the runner can access private repositories. +Note: If a git repository is used, `dstack` only uploads the code diff. The runner then pulls the repository and applies the diff to get the copy of the user's files. 
The `dstack init` command uploads git credentials to the server so that the runner can access private repositories. -The submitted runs are stored in the server database. For each run, the server also creates one or more jobs. (Multiple -jobs allow for distributed runs.) And for each job, it creates an initial job submission. If one submission fails, the -server may create new submissions. +The submitted runs are stored in the server database. For each run, the server also creates one or more jobs. (Multiple jobs allow for distributed runs and multi-replica services.) For each job, it creates a job submission. If a job submission fails, the server may create new submissions. -A background worker fetches a job submission and iterates over configured backends to provision an instance. It tries -best offers first until the provisioning succeeds. The instance is instructed to run the shim on the launch. In case -of "Docker-only" clouds, the docker image is run directly. +A background worker fetches a job submission and iterates over configured backends to provision an instance. It tries the best offers first until the provisioning succeeds. The instance is instructed to run the shim on launch. In case of "Docker-only" clouds, the docker image is run directly. -A successfully provisioned job enters the provisioning state. Another background worker processes such jobs. It waits -for the runner to become available and submits the job. +A successfully provisioned job enters the provisioning state. Another background worker processes such jobs. It waits for the runner to become available and submits the job. -Note: The runner HTTP API is not exposed publicly. In order to use it, the server established an SSH connection to the -instance. The runner HTTP API becomes available via port-forwarding. +Note: The runner HTTP API is not exposed publicly. In order to use it, the server establishes an SSH connection to the instance. 
The runner HTTP API becomes available via port-forwarding. -After the job is submitted, the job enters the running state. A background worker pings the runner periodically for the -job status and logs updates. +After the job is submitted, the job enters the running state. A background worker pings the runner periodically for the job status and logs updates. -When all job's commands are executed, the runner marks job as done, the container exists, and the shim terminates the -instance. The job may also be interrupted by `dstack stop` that asks the runner shutdown gracefully. The `--abort` flag -tells the server to force instance shutdown without notifying the runner, which may be useful if the runner becomes -unavailable. +When all of the job's commands are executed, the runner marks the job as done, the container exits, and the shim terminates the instance. The job may also be interrupted by `dstack stop`, which asks the runner to shut down gracefully. The `--abort` flag tells the server to force instance shutdown without notifying the runner or waiting for it to stop gracefully. ## Project structure -The server is a FastAPI app backend by sqlite. The runner and shim are written in Go. +The server is a FastAPI app backed by SQLite or Postgres. The runner and shim are written in Go. * `docker/` – Dockefiles for `dstack` images * `docs/` – source files for mkdocs generated documentation * `runner/` – source code for the runner and the shim -* `scripts/` – dev and CI/CD scripts +* `scripts/` – dev/CI/CD scripts and packer files for building `dstack` cloud VM images. 
* `src/` – source code for the `dstack` Python package that includes the server, the CLI and the Python API * `dstack/` * `_internal/` – modules hidden from the users of the `dstack` Python API diff --git a/contributing/AUTOSCALING.md b/contributing/AUTOSCALING.md index 87e7a5c328..eb5f6e1978 100644 --- a/contributing/AUTOSCALING.md +++ b/contributing/AUTOSCALING.md @@ -1,17 +1,22 @@ -dstack features auto-scaling for services published via the gateway. The general flow is: +# Autoscaling + +`dstack` features auto-scaling for services published via the gateway. The general flow is: - STEP 1: `dstack-gateway` parses nginx `access.log` to collect per-second statistics about requests to the service and request times. -- STEP 2: `dstack-gateway` aggregates statistics over a 1-minute window. -- STEP 3: The dstack server pulls all service statistics in the `process_gateways` background task. -- STEP 4: The `process_runs` background task passes statistics and current replicas to the autoscaler. -- STEP 5: The autoscaler (configured via the `dstack.yml` file) returns the replica change as an int. -- STEP 6: `process_runs` calls `scale_run_replicas` to add or remove replicas. -- STEP 7: `scale_run_replicas` terminates or starts replicas. - - `SUBMITTED` and `PROVISIONING` replicas get terminated before `RUNNING`. - - Replicas are terminated by descending `replica_num` and launched by ascending `replica_num`. +- STEP 2: `dstack-gateway` aggregates statistics over several predefined windows. +- STEP 3: The server keeps gateway connections alive in the scheduled `process_gateways_connections` task and continuously collects stats from active gateways. This is separate from `GatewayPipeline`, which handles gateway provisioning and deletion. +- STEP 4: When `RunPipeline` processes a service run, it loads the latest collected gateway stats for that service. +- STEP 5: The autoscaler (configured via `dstack.yml`) computes the desired replica count for each replica group. 
+- STEP 6: `RunPipeline` applies that desired state. + - For scale-up, it creates new `SUBMITTED` jobs. `JobSubmittedPipeline` then assigns existing capacity or provisions new capacity for them. + - For scale-down, it marks the least-important active replicas as `TERMINATING` with `SCALED_DOWN`. `JobTerminatingPipeline` unregisters and cleans them up. +- STEP 7: If the service is in rolling deployment, `RunPipeline` handles that in the same active-run processing path. + - It allows only a limited surge of replacement replicas. + - It delays teardown of old replicas until replacement capacity is available. + - It also cleans up replicas that belong to replica groups removed from the configuration. ## RPSAutoscaler -`RPSAutoscaler` implements simple target tracking scaling. The target value represents requests per second per replica (in a 1-minute window). +`RPSAutoscaler` implements simple target tracking scaling. The target value represents requests per second per replica (in a configurable window). -`scale_up_delay` tells how much time has to pass since the last upscale or downscale event before the next upscaling. `scale_down_delay` tells how much time has to pass since the last upscale or downscale event before the next downscaling. \ No newline at end of file +`scale_up_delay` tells how much time has to pass since the last upscale or downscale event before the next upscaling. `scale_down_delay` tells how much time has to pass since the last upscale or downscale event before the next downscaling. diff --git a/contributing/BACKENDS.md b/contributing/BACKENDS.md index 37d8aae711..0c7ac6a151 100644 --- a/contributing/BACKENDS.md +++ b/contributing/BACKENDS.md @@ -4,12 +4,12 @@ The guide below explains the steps required to extend `dstack` with support for ## Overview of the process -1. Add the cloud provider to [gpuhunt](https://https://github.com/dstackai/gpuhunt) -2. 
Integrate the cloud provider into [dstack](https://https://github.com/dstackai/dstack) +1. Add the cloud provider to [gpuhunt](https://github.com/dstackai/gpuhunt) +2. Integrate the cloud provider into [dstack](https://github.com/dstackai/dstack) ## 1. Add a cloud provider to dstackai/gpuhunt -The [gpuhunt](https://https://github.com/dstackai/gpuhunt) project is a utility that `dstack` uses to collect information +The [gpuhunt](https://github.com/dstackai/gpuhunt) project is a utility that `dstack` uses to collect information about cloud providers, their supported machine configurations, pricing, etc. This information is later used by `dstack` for provisioning machines. @@ -23,169 +23,149 @@ To add a new cloud provider to `gpuhunt`, follow these steps: git clone https://github.com/dstackai/gpuhunt.git ``` -### 1.2. Create the provider class +### 1.2. Decide if you will implement an offline or an online provider -Create the provider class file under `src/gpuhunt/providers`. +- **Offline providers** offer static machine configurations that are not frequently updated. + `gpuhunt` collects offline providers' instance offers on an hourly basis. + Examples: `aws`, `gcp`, `azure`, etc. +- **Online providers** offer dynamic machine configurations that are available at the very moment + when you fetch configurations (e.g., GPU marketplaces). + `gpuhunt` collects online providers' instance offers each time a `dstack` user provisions a new instance. + Examples: `tensordock`, `vastai`, etc. -Ensure your class... +### 1.3. Create the provider class -- Extends the `AbstractProvider` base class. -- Has the `NAME` property, that will be used as the unique identifier for your provider. -- Implements the `get` method, that is responsible for fetching the available machine configurations from the cloud provider. 
+Create the provider class file under `src/gpuhunt/providers`. -[//]: # (TODO: Elaborate better on how to use `query_filter` and `balance_resources`) +Make sure your class extends the [`AbstractProvider`](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/__init__.py) +base class. See its docstrings for descriptions of the methods that your class should implement. -Refer to examples: [datacrunch.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/datacrunch.py), -[aws.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/aws.py), -[gcp.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/gcp.py), -[azure.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/azure.py), -[lambdalabs.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/lambdalabs.py), -[tensordock.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/tensordock.py), -[vastai.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/vastai.py). - -### 1.3. Register the provider with the catalog - -Update the `src/gpuhunt/_internal/catalog.py` file by adding the provider name -to either `OFFLINE_PROVIDERS` or `ONLINE_PROVIDERS` depending on the type of the provider. - -How do I decide which type my provider is? - -- `OFFLINE_PROVIDERS` - Use this type if your provider offers static machine configurations that may be collected and - published on a daily basis. Examples: `aws`, `gcp`, `azure`, etc. These providers offer many machine configurations, - but they are not updated frequently. 
-- `ONLINE_PROVIDERS` - Use this type if your provider offers dynamic machine configurations that are available at the very moment when you fetch configurations (e.g., GPU marketplaces). - Examples: `tensordock`, `vast`, etc. - -### 1.4. Add data quality tests - -If the provider is registered via `OFFLINE_PROVIDERS`, you can add data quality tests -under `src/integrity_tests/`. - -Refer to examples: [test_datacrunch.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_datacrunch.py), +Refer to examples: +- Offline providers: + [verda.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/verda.py), + [aws.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/aws.py), + [azure.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/azure.py), + [lambdalabs.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/lambdalabs.py). +- Online providers: + [vultr.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/vultr.py), + [tensordock.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/tensordock.py), + [vastai.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/vastai.py). + +### 1.4. Register the provider with the catalog + +Add your provider in the following places: +- Either `OFFLINE_PROVIDERS` or `ONLINE_PROVIDERS` in `src/gpuhunt/_internal/catalog.py`. +- The `python -m gpuhunt` command in `src/gpuhunt/__main__.py`. +- (offline providers) The CI workflow in `.github/workflows/catalogs.yml`. +- (online providers) The default catalog in `src/gpuhunt/_internal/default.py`. + +### 1.5. Add data quality tests + +For offline providers, you can add data quality tests under `src/integrity_tests/`. 
+Data quality tests are run after collecting offline catalogs to ensure their integrity. + +Refer to examples: [test_verda.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_verda.py), [test_gcp.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_gcp.py). -> Anything unclear? Ask questions on the [Discord server](https://discord.gg/u8SmfwPpMd). +### 1.6. Submit a pull request -Once the cloud provider is added, submit a pull request. +Once the cloud provider is added, submit a pull request. +> Anything unclear? Ask questions on the [Discord server](https://discord.gg/u8SmfwPpMd). ## 2. Integrate the cloud provider to dstackai/dstack -Once the provider is added to `gpuhunt`, we can proceed with implementing +Once the provider is added to `gpuhunt`, we can proceed with implementing the corresponding backend with `dstack`. Follow the steps below. -#### 2.1 Clone the repo - -```bash -git clone https://github.com/dstackai/dstack.git -``` - -#### 2.2. Set up the development environment - -Follow [DEVELOPMENT.md](DEVELOPMENT.md)`. - -#### 2.3. Add dependencies to setup.py - -Add any dependencies required by your cloud provider to `setup.py`. Create a separate section with the provider's name for -these dependencies, and ensure that you update the `all` section to include them as well. +### 2.1. Determine if you will implement a VM-based or a container-based backend -#### 2.4. Implement the provider backend +See the Appendix at the end of this document and make sure the provider meets the outlined requirements. -##### 2.4.1. Define the backend type +### 2.2. Set up the development environment -Add a new enumeration member for your provider to `BackendType` (`src/dstack/_internal/core/models/backends/base.py`). -Use the name of the provider. +Follow [DEVELOPMENT.md](DEVELOPMENT.md). -##### 2.4.2. 
Create the provider directory +### 2.3. Add dependencies to setup.py -Create a new directory under `src/dstack/_internal/core/backends` with the name of the backend type. +Add any dependencies required by your cloud provider to `setup.py`. Create a separate section with the provider's name for these dependencies, and ensure that you update the `all` section to include them as well. -##### 2.4.3. Create the backend class +### 2.4. Add a new backend type -Under the backend directory you've created, create the `__init__.py` file and define the -backend class there (should extend `dstack._internal.core.backends.base.Backend`). +Add a new enumeration member for your provider to `BackendType` ([`src/dstack/_internal/core/models/backends/base.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/base.py)). -Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/__init__.py), -[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/__init__.py), -[gcp.py](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/__init__.py), -[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/__init__.py), etc. +### 2.5. Create backend files and classes -##### 2.4.4. Create the backend compute class +`dstack` provides a helper script to generate all the necessary files and classes for a new backend. +To add a new backend named `ExampleXYZ`, you should run: -Under the backend directory you've created, create the `compute.py` file and define the -backend compute class there (should extend `dstack._internal.core.backends.base.compute.Compute`). 
+```shell +python scripts/add_backend.py -n ExampleXYZ +``` -You'll have to implement `get_offers`, `create_instance`, `run_job` and `terminate_instance`. +It will create an `examplexyz` backend directory under `src/dstack/_internal/core/backends` with the following files: -The `create_instance` method is required for the pool feature. If you implement the `create_instance` method, you should add the provider name to `BACKENDS_WITH_CREATE_INSTANCE_SUPPORT`. (`src/dstack/_internal/server/services/runs.py`). +* `backend.py` with the `Backend` class implementation. You typically don't need to modify it. +* `compute.py` with the `Compute` class implementation. This is the core of the backend that you need to implement. +* `configurator.py` with the `Configurator` class implementation. It deals with validating and storing backend config. You need to adjust it with custom backend config validation. +* `models.py` with all the backend config models used by `Backend`, `Compute`, `Configurator` and other parts of `dstack`. -Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/compute.py), -[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/compute.py), -[gcp.py](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/compute.py), -[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/compute.py), etc. +### 2.6. Adjust and register the backend config models -##### 2.4.5. Create the backend config model class +Go to `models.py`. It'll contain two config models required for all backends: -Under the `src/dstack/_internal/core/models/backends` directory, create the file with the name of the backend, and define the -backend config model classes there. 
+* `*BackendConfig` that contains all backend parameters available for user configuration except for creds. +* `*BackendConfigWithCreds` that contains all backend parameters available for user configuration and also creds. -[//]: # (TODO: Mention what config model classes are and how they work) +Adjust generated config models by adding additional config parameters. +Typically, you only need to modify the `*BackendConfig` model since other models extend it. -[//]: # (TODO: Mention what config values class is and how it works) +Then add these models to `AnyBackendConfig*` unions in [`src/dstack/_internal/core/backends/models.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/models.py). -Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/datacrunch.py), -[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/aws.py), -[gcp.py](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/gcp.py), -[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/azure.py), etc. +The script also generates `*BackendStoredConfig` that extends `*BackendConfig` to be able to store extra parameters in the DB. By the same logic, it generates `*Config` that extends `*BackendStoredConfig` with creds and uses it as the main `Backend` and `Compute` config instead of using `*BackendConfigWithCreds` directly. -##### 2.4.6. 
Create the backend config class +Refer to examples: +[verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/models.py), +[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/models.py), +[gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/models.py), +[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/models.py), etc. -Under the backend directory you've created, create the `config.py` file and define the -backend config class there (should extend `dstack._internal.core.backends.base.config.BackendConfig` -and the backend configuration model class defined above). +### 2.7. Implement the backend compute class -[//]: # (TODO: Mention what config class is and how it works) +Go to `compute.py` and implement `Compute` methods. +Optionally, extend and implement `ComputeWith*` classes to support additional features such as fleets, volumes, gateways, placement groups, etc. For example, extend `ComputeWithCreateInstanceSupport` to support fleets. Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/config.py), -[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/config.py), -[gcp.py](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/config.py), -[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/config.py), etc. - -##### 2.4.7. 
Import config model classes - -Ensure the config model classes are imported -into [`src/dstack/_internal/core/models/backends/__init__.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/models/backends/__init__.py). +[verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/compute.py), +[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/compute.py), +[gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/compute.py), +[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/compute.py), etc. -[//]: # (TODO: The backend configuration is overly complex and needs simplification: https://github.com/dstackai/dstack/issues/888) +### 2.8. Implement and register the configurator class -##### 2.4.8. Create the configurator class +Go to `configurator.py` and implement custom `Configurator` logic. At minimum, you should implement creds validation. +You may also need to validate other config parameters if there are any. -Create the file with the backend name under `src/dstack/_internal/server/services/backends/configurators`(https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/configurators) -and define the backend configurator class (must extend `dstack._internal.server.services.backends.configurators.base.Configurator`). 
+Refer to examples: [verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/configurator.py), +[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/configurator.py), +[gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/configurator.py), +[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/configurator.py), etc. -Refer to examples: [datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/configurators/datacrunch.py), -[aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/configurators/aws.py), -[gcp.py](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/configurators/gcp.py), -[azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/configurators/azure.py), etc. +Register configurator by appending it to `_CONFIGURATOR_CLASSES` in [`src/dstack/_internal/core/backends/configurators.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/configurators.py). -##### 2.4.9. Create the server config class +### 2.9. (Optional) Override provisioning timeout -In [`src/dstack/_internal/server/services/config.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/config.py), -define the corresponding server config class (that represents the `~/.dstack/server/config.yml` file), -and add it to `AnyBackendConfig` (in the same file). 
+If instances in the backend take more than 10 minutes to start, override the default provisioning timeout in +[`src/dstack/_internal/server/background/tasks/common.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/background/tasks/common.py). -##### 2.4.10. Add safe imports +### 2.10. Document the backend -In [`src/dstack/_internal/server/services/backends/__init__.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/__init__.py), -add the `try`/`except` block that imports the backend configurator and appends it to `_CONFIGURATOR_CLASSES`. +Add the backend to the [Concepts->Backends](https://github.com/dstackai/dstack/blob/master/mkdocs/docs/concepts/backends.md +) page and the [server/config.yml](https://github.com/dstackai/dstack/blob/master/mkdocs/docs/reference/server/config.yml.md) reference. ## 3. Appendix -#### 3.1. Backend compute type +### 3.1. Backend compute type `dstack` supports two types of backend compute: @@ -194,49 +174,47 @@ add the `try`/`except` block that imports the backend configurator and appends i #### 3.1.1. VM-based backend compute type -It's when the cloud provider allows provisioning Virtual machines (VMs). -This is the most flexible backend compute type. +Used if the cloud provider allows provisioning virtual machines (VMs). +When `dstack` provisions a VM, it launches the `dstack-shim` agent inside the VM. +The agent controls the VM and starts Docker containers for users' jobs. -[//]: # (TODO: Elaborate why it's the most flexible) +Since `dstack` controls the entire VM, VM-based backends can support more features, +such as blocks, instance volumes, privileged containers, and reusable instances. 
-To support it, `dstack` expects the following from the cloud provider: +Note that all VM-based backend `Compute`s should subclass the `ComputeWithPrivilegedSupport` +and `ComputeWithInstanceVolumesSupport` mixins, as the `dstack-shim` agent +provides these functionalities out of the box. -- An API for creating and terminating VMs -- Ubuntu 22.04 LTS -- NVIDIA CUDA driver 535 -- Docker with NVIDIA runtime -- OpenSSH server -- Cloud-init script (preferred) -- An external IP and public port for SSH - -When `dstack` provisions a VM, it launches there `dstack-shim`. +To support a VM-based backend, `dstack` expects the following: -[//]: # (TODO: Elaborate on what dstack-shim is and how it works) +- An API for creating and terminating VMs +- An external IP and a public port for SSH +- Cloud-init (preferred) +- VM images with Ubuntu, OpenSSH, GPU drivers, and Docker with NVIDIA runtime -The examples of VM-based backends include: `aws`, `azure`, `gcp`, `lambda`, `datacrunch`, `tensordock`, etc. +For some VM-based backends, the `dstack` team also maintains +[custom VM images](../scripts/packer/README.md) with the required dependencies +and `dstack`-specific optimizations. -[//]: # (TODO: Elaborate on packer scripts) +Examples of VM-based backends include: `aws`, `azure`, `gcp`, `lambda`, `verda`, etc. #### 3.1.2. Container-based backend compute type -It's when the cloud provider allows provisioning only containers. -This is the most limited backend compute type. +Used if the cloud provider only allows provisioning containers. +When `dstack` provisions a container, it launches the `dstack-runner` agent inside the container. +The agent accepts and runs users' jobs. -[//]: # (TODO: Elaborate on why it's the most limited) +Since `dstack` doesn't control the underlying machine, container-based backends don't support some +`dstack` features, such as blocks, instance volumes, privileged containers, and reusable instances. 
-To support it, `dstack` expects the following from the cloud provider: +To support a container-based backend, `dstack` expects the following: - An API for creating and terminating containers -- Docker with NVIDIA runtime +- Containers properly configured to access GPUs - An external IP and a public port for SSH +- A way to specify the Docker image +- A way to specify credentials for pulling images from private Docker registries - A way to override the container entrypoint (at least ~2KB) +- A way to override the container user to root (as in `docker run --user root ...`) -The examples of container-based backends include: `kubernetes`, `vastai`, etc. - -Note: There are two types of compute in dstack: - -When `dstack` provisions a VM, it launches there `dstack-runner`. - -[//]: # (TODO: Elaborate on what dstack-runner is and how it works) - -[//]: # (TODO: Update this guide to incorporate the pool feature) \ No newline at end of file +Examples of container-based backends include: `kubernetes`, `vastai`, `runpod`. diff --git a/contributing/DEVELOPMENT.md b/contributing/DEVELOPMENT.md index 128c27f03e..b9570c2c60 100644 --- a/contributing/DEVELOPMENT.md +++ b/contributing/DEVELOPMENT.md @@ -1,30 +1,54 @@ # Development setup -## Set up the development environment - -### 1. Clone the repo: - ``` - git clone https://github.com/dstackai/dstack - cd dstack - ``` - -### 2. (Recommended) Create a virtual environment: - ``` - python3 -m venv venv - source venv/bin/activate - ``` - -### 3. Install `dstack` in editable mode: - ``` - pip install -e '.[all]' - ``` - -### 4. Install dev dependencies: - ``` - pip install -r requirements_dev.txt - ``` - -### 5. (Recommended) Install pre-commits: - ``` - pre-commit install - ``` \ No newline at end of file +## 1. Clone the repo: + +```shell +git clone https://github.com/dstackai/dstack +cd dstack +``` + +## 2. 
Install uv: + +https://docs.astral.sh/uv/getting-started/installation + +```shell +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +## 3. Install `dstack` with all extras and dev dependencies: + +```shell +uv sync --all-extras +``` + +`dstack` will be installed into the project's `.venv` in editable mode and can be run with `uv run dstack`. + +Alternatively, if you want to manage virtual environments by yourself, you can install `dstack` into the activated virtual environment with `uv sync --all-extras --active`. + +## 4. (Recommended) Install pre-commit hooks: + +Code formatting and linting can be done automatically on each commit with `pre-commit` hooks: + +```shell +uv run pre-commit install +``` + +## 5. (Recommended) Use pyright: + +The CI runs `pyright` for type checking `dstack` Python code. +So we recommend you configure your IDE to use `pyright`/`pylance` with `standard` type checking mode. + +You can also install `pyright` and run it from the CLI: + +```shell +uv tool install pyright +pyright -p . +``` + +## 6. Frontend + +See [FRONTEND.md](FRONTEND.md) for the details on how to build and develop the frontend. + +## 7. Documentation + +See [DOCS.md](DOCS.md) for the details on how to preview or build the documentation. diff --git a/contributing/DOCS.md b/contributing/DOCS.md new file mode 100644 index 0000000000..663e5c4c77 --- /dev/null +++ b/contributing/DOCS.md @@ -0,0 +1,181 @@ +# Documentation setup + +## 1. Clone the repo: + +```shell +git clone https://github.com/dstackai/dstack +cd dstack +``` + +## 2. Install uv: + +https://docs.astral.sh/uv/getting-started/installation + +```shell +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +## 3. Install `dstack` with all extras and dev dependencies: + +> [!WARNING] +> Building documentation requires `python_version >= 3.11`. 
+ +```shell +uv sync --all-extras +``` + +`dstack` will be installed into the project's `.venv` in editable mode. + +## 4. (Recommended) Install pre-commit hooks: + +Code formatting and linting can be done automatically on each commit with `pre-commit` hooks: + +```shell +uv run pre-commit install +``` + +## 5. Preview documentation + +To preview the documentation, run the following command: + +```shell +uv run mkdocs serve --livereload -s +``` + +The `--livereload` flag is required to work around live-reload bugs in recent `mkdocs` versions. + +If you want to build static files, you can use the following command: + +```shell +uv run mkdocs build -s +``` + +## Documentation build system + +The documentation uses a custom build system with MkDocs hooks to generate various files dynamically. + +### Disable flags + +Use these in `.envrc` to disable expensive docs regeneration, especially during `mkdocs serve` auto-reload. Set any of them to disable the corresponding artifact. + +```shell +export DSTACK_DOCS_DISABLE_LLM_TXT=1 +export DSTACK_DOCS_DISABLE_CLI_REFERENCE=1 +export DSTACK_DOCS_DISABLE_YAML_SCHEMAS=1 +export DSTACK_DOCS_DISABLE_OPENAPI_REFERENCE=1 +export DSTACK_DOCS_DISABLE_REST_PLUGIN_SPEC_REFERENCE=1 +``` + +### Build hooks + +The build process is customized via hooks in `scripts/docs/hooks.py`: + +#### 1. Schema reference expansion + +Files in `docs/reference/**/*.md` can use `#SCHEMA#` placeholders that are expanded with generated schema documentation during the build. + +#### 2. 
llms.txt generation + +Two files are generated for LLM consumption: + +- **llms.txt**: Structured overview of documentation with titles and descriptions + - Generated from mkdocs nav structure + - Includes sections: Getting started, Concepts, Guides, Examples + - Excludes: Reference section + - Configuration: `scripts/docs/gen_llms_files.py` (INCLUDE_SECTIONS, EXCLUDE_SECTIONS) + +- **llms-full.txt**: Full concatenation of all pages from llms.txt + - Contains complete markdown content of all included pages + +The generation logic is in `scripts/docs/gen_llms_files.py` and uses: +- `site_name`, `site_description`, `site_url` from `mkdocs.yml` +- Page titles from mkdocs nav structure +- Page descriptions from markdown frontmatter + +**Adding descriptions**: To add descriptions to pages, add YAML frontmatter: + +```yaml +--- +title: Page Title +description: Short description of what this page covers +--- +``` + +For examples, add frontmatter to the page files (e.g., `mkdocs/docs/examples/training/trl.md`). + +#### 3. Skills discovery + +The build creates `.well-known/skills/` directory structure for skills discovery: +- Reads `skills/dstack/SKILL.md` +- Parses name and description from frontmatter +- Generates `.well-known/skills/index.json` +- Copies SKILL.md to both `.well-known/skills/dstack/` and site root + +#### 4. HTTP API reference + +The HTTP API reference is generated from the FastAPI OpenAPI schema: + +- `scripts/docs/gen_openapi_reference.py` writes `mkdocs/docs/reference/http/openapi.json`, + keeps the per-tag Markdown pages in sync, and updates the generated tag list in the HTTP API + index page. +- Tag pages use `!!swagger openapi.json tag=""!!`. Keep tag names exactly as they appear + in the OpenAPI schema. +- `scripts/docs/hooks.py` expands the `!!swagger` directive into the Swagger UI container and + the hidden operation headings that MkDocs uses for the page table of contents. 
+- `mkdocs/assets/javascripts/swagger.js` loads the shared `openapi.json`, filters it by tag on + the client, and adapts Swagger UI markup to the docs layout. +- `mkdocs/assets/stylesheets/swagger.css` contains Swagger-specific styling and should stay + scoped under `.dstack-swagger-ui`. + +Keep hook logic limited to build-time Markdown/page structure, generated assets, and data +attributes needed by the client. Small presentation changes belong in `swagger.css`; small +behavior changes belong in `swagger.js`. + +If the HTTP API reference needs deeper structural customization, such as replacing major Swagger +UI panels, request/response rendering, model rendering, or "try it out" behavior, prefer moving +toward a dedicated local bundle or custom Swagger UI layout instead of adding more DOM patching. +That bundle can still use the single generated `openapi.json` and filter by tag on the client, so +we should not reintroduce per-tag OpenAPI files unless there is a concrete reason. + +### File structure + +``` +mkdocs/ # docs_dir for the mkdocs site +├── index.md # Homepage +├── docs/ # /docs/ URL section +│ ├── index.md # Getting started +│ ├── installation.md +│ ├── quickstart.md +│ ├── concepts/ # Concept pages +│ ├── guides/ # How-to guides +│ ├── reference/ # API reference (schema expansion) +│ └── examples/ # Example pages (inline source code) +│ └── training/ +│ └── trl.md # Page content with frontmatter +├── blog/ # Blog posts +├── overrides/ # Theme customization +├── layouts/ # Social card layouts +└── assets/ # Stylesheets, images, fonts + +scripts/docs/ +├── hooks.py # MkDocs build hooks +├── gen_llms_files.py # llms.txt generation +├── gen_schema_reference.py # Schema expansion +└── gen_cli_reference.py # CLI reference generation + +skills/ +└── dstack/ + └── SKILL.md # Skills discovery content +``` + +### Testing changes + +When modifying the build system: + +1. Test local build: `uv run mkdocs build -s` +2. 
Check generated files in `site/`: + - `site/llms.txt` + - `site/llms-full.txt` + - `site/.well-known/skills/index.json` +3. Verify example pages render correctly +4. Check that descriptions appear in llms.txt diff --git a/contributing/FRONTEND.md b/contributing/FRONTEND.md new file mode 100644 index 0000000000..19b623b1d4 --- /dev/null +++ b/contributing/FRONTEND.md @@ -0,0 +1,56 @@ +# Frontend development setup + +To work with the frontend, switch the current folder to [frontend](../frontend) + +```shell +cd frontend +``` + +## Building the frontend + +### 1. Install NPM dependencies + +Use `npm` to install dependencies: + +```shell +npm install +``` + +### 2. Build the frontend + +For building the frontend, run: + +```shell +npm run build +``` + +### 3. Copy the compiled frontend files + +Copy the contents of the `frontend/build` directory to the backend directory (`src/dstack/_internal/server/statics`): + +```shell +cp -r build/ ../src/dstack/_internal/server/statics/ +``` + +### 4. Run the dstack server + +Now, if you've installed `dstack` in editable mode, you can simply run `dstack server` +and the frontend will be working. + +```shell +dstack server +``` + +## Developing the frontend + +For frontend development, run a `webpack` dev server: + +```shell +npm run start +``` + +The `webpack` dev server expects the API to be running on `http://127.0.0.1:8000`. So ensure to run the API on port `8000`: + +```shell +dstack server --port 8000 +``` diff --git a/contributing/GATEWAY.md b/contributing/GATEWAY.md deleted file mode 100644 index 29d9ea033b..0000000000 --- a/contributing/GATEWAY.md +++ /dev/null @@ -1,72 +0,0 @@ -A dstack gateway is a dedicated instance responsible for publishing user applications to the outer internet via the HTTP protocol. One dstack gateway can serve many services, domains, or projects. - -## Gateway creation - -Gateways are managed by the dstack server. 
A gateway is associated with a project and some backend in the project. Users must attach a wildcard domain to the gateway, i.e., all direct subdomains should resolve to the gateway IP address. Since the IP address is unknown during provisioning, dstack doesn't check DNS records. - -Provisioning happens as follows: -1. Launch a non-GPU instance (usually the smallest) with all ports exposed. -2. Install Nginx, Certbot, and patch configs. -3. Create blue-green virtual environments. -4. Install the latest `dstack-gateway` from the S3 bucket. -5. Run the systemd service `dstack.gateway.service`. - -## Gateway update - -The `dstack-gateway` has a "blue-green deployment"-like configuration: there are two virtual environments to be swapped on update. The systemd service uses the newly installed package after a restart. - -The update process looks like this: -1. Install the new package to the not-used venv. -2. Update scripts and systemd service config. -3. Swap the active venv name in the file `version`. -4. Restart the systemd service. - -The `dstack-gateway` server dumps its internal state to the file `~/dstack/state.json` on termination. It tries to load the state from the same file on start. That allows updating the gateway with published services with minimal downtime. - -## Connection between server and gateway - -The dstack server keeps a bidirectional tunnel with each GatewayCompute for the whole uptime of the server. - -- The tunnel from the server to the gateway is used to manage the gateway: register and unregister services and replicas. -- The tunnel from the gateway to the server is used to authenticate requests to the gateway based on dstack's tokens. - -Authorization responses are cached for 60 seconds. If the server is not responding, the request is denied. - -## Nginx - -`dstack-gateway` configures an Nginx reverse proxy. Each service or entrypoint configuration is stored as `/etc/nginx/sites-enabled/{port}-{server_name}.conf`. 
If the Nginx reload fails, `dstack-gateway` rolls back the changes. - -`dstack-gateway` enforces HTTPS (except for local traffic). On each service registration a TLS certificate is issued by Let's Encrypt or other configured CA via Certbot. - -If there are no replicas, the service configuration always returns 503; otherwise, the upstream with replicas is used. The upstream handles load balancing for us. `dstack-gateway` uses Unix sockets for SSH tunnels to avoid port conflicts between services. - -Service authorization is handled with the `localhost:8000/auth` endpoint if needed. `dstack-gateway` may request services without authorization and HTTPS, for example, from the OpenAI interface. - -Entrypoint configurations forward requests back to `dstack-gateway`, to a specific module (e.g., OpenAI). Authorization is handled by those modules. - -## Gateway registry - -The core component of `dstack-gateway` is the services store. It is responsible for: - -- Registering a service — assigning a domain, creating an Nginx config. -- Registering a replica — starting an SSH tunnel, updating Nginx upstream. -- Unregistering a replica — stopping an SSH tunnel, updating Nginx upstream. -- Unregistering a service — releasing a domain, removing an Nginx config. -- Registering an entrypoint — assigning a domain, creating an Nginx config. - -To decouple the store from other modules, there is a subscription mechanism. Subscribers will be notified on register service and unregister service. - -## OpenAI interface - -The OpenAI interface subscribes to `Store` events and emulates the real OpenAI API for chat completion models. It can list running models in the project and redirect requests to the right service. - -## Stats Collector - -The Stats collector parses nginx `/var/log/nginx/dstack.access.log` to collect basic metrics: - -1. Requests per second -2. Average request processing time - -By default, it stores 5 minutes with 1-second resolution frames for each domain. 
It aggregates these frames in windows of the size 30 seconds, 1 minute, and 5 minutes, before sending to the server. - -To increase performance, `StatsCollector` keeps position in file and read only new records. It can detect log rotation and reopen the log file. \ No newline at end of file diff --git a/contributing/GPUHUNT.md b/contributing/GPUHUNT.md index 498584b4ef..33e3150fa1 100644 --- a/contributing/GPUHUNT.md +++ b/contributing/GPUHUNT.md @@ -1,3 +1,5 @@ +# gpuhunt + [`dstackai/gpuhunt`](https://github.com/dstackai/gpuhunt) is a library developed and used for dstack. It implements the unified interface for fetching offers and prices from different cloud providers. An offer is a possible configuration. It consists of: @@ -63,7 +65,7 @@ Some providers offer extreme flexibility in possible configurations, but not all - Filters out if: outdated family, not supported family - Queries configuration details to fill CPU, RAM, and GPU information -### DataCrunch +### Verda - Just queries all offers via API @@ -83,13 +85,23 @@ Some providers offer extreme flexibility in possible configurations, but not all - Parses Oracle's [Cost Estimator](https://www.oracle.com/cloud/costestimator.html) datasets - Duplicates each offer in all regions, since prices are the same everywhere and availability is mostly the same -### Nebius - -- Uses hardcoded CPU and GPU platforms configurations -- Applies SKUs pricing to the platforms - ### GitHub Actions: collect catalog The offline catalog is built in GitHub Actions every night. Every offline provider produces a CSV file with offers. Later, those files get compressed into a zip archive and uploaded to the public S3 bucket. -To ensure data quality, there is a catalog integrity testing step. It uses some simple heuristics to avoid empty catalog files, zero prices, or missing regions. \ No newline at end of file +To ensure data quality, there is a catalog integrity testing step. 
It uses some simple heuristics to avoid empty catalog files, zero prices, or missing regions. + +### Backward compatibility + +The same `gpuhunt` version can be used by different `dstack` versions. +Additionally, offline catalogs are produced by the latest `gpuhunt` version, but used by all `dstack` versions. + +These mechanisms are used to preserve backward compatibility: + +- **`gpuhunt` version**: The interfaces in the `gpuhunt` package preserve backward compatibility + within a minor version (`X` in `0.X.Y`). +- **Offer flags**: If an offer breaks older `dstack` versions, it is marked with a flag in `RawCatalogItem.flags` + and the flag is added to the list of supported flags in `dstack`. + Older `dstack` versions that don't support this flag will not see the respective offers. +- **Offline catalog versions**: If a breaking change in the structure or content of an offline catalog is unavoidable, + a new version of the catalog can be introduced. Catalog versions are published at `s3://dstack-gpu-pricing/v{N}`. diff --git a/contributing/LOCKING.md b/contributing/LOCKING.md new file mode 100644 index 0000000000..c89ad526a4 --- /dev/null +++ b/contributing/LOCKING.md @@ -0,0 +1,141 @@ +# Locking + +The `dstack` server supports SQLite and Postgres databases with two implementations of resource locking to handle concurrent access: + +* In-memory locking for SQLite. +* DB-level locking for Postgres. + +## SQLite locking + +SQLite is missing efficient mechanisms to handle concurrent writes (e.g. select for update), so `dstack` implements in-memory resource-level locking. In-memory locking works correctly under the assumption that there is only one server instance (process), which is a `dstack` limitation when using SQLite. + +The in-memory locking is implemented via locksets. Locksets are Python sets that store IDs of locked resources. 
Concurrent access to locksets is guarded with asyncio locks: + +```python +lock, lockset = get_lockset("my_table") +async with lock: + # select resource that is not in lockset + lockset.add(resource.id) +try: + process_resource(resource) +finally: + lockset.remove(resource.id) +``` + +Locksets are an optimization. One can think of them as per-resource-id locks that allow independent locking of different resources. + +## Postgres locking + +Postgres resource locking is implemented via standard SELECT FOR UPDATE. +SQLAlchemy provides `.with_for_update()` that has no effect if SELECT FOR UPDATE is not supported as in SQLite. + +There are a few places that rely on advisory locks, such as when generating unique resource names. + +## Working with locks + +Concurrency is hard. Concurrency with locking is especially hard. Below you'll find common patterns and gotchas when working with locks to make it a bit more manageable. + +**A task should acquire locks on resources it modifies** + +This is common sense. An alternative could be the inverse: job processing cannot run in parallel with run processing, so job processing takes run lock. This indirection complicates things and is discouraged. In this example, run processing should take job lock instead. + +**Start new transaction after acquiring a lock to see other transactions' changes in SQLite.** + +```python +select resource ids by names +lock resource ids +await session.commit() +# The next statement will start new transaction +select ... +``` + +> SQLite exhibits Snapshot Isolation. When a read transaction starts, that reader continues to see an unchanging "snapshot" of the database file as it existed at the moment in time when the read transaction started. Any write transactions that commit while the read transaction is active are still invisible to the read transaction, because the reader is seeing a snapshot of database file from a prior moment in time.
Source: https://www.sqlite.org/isolation.html + +Thus, if a new transaction is not started, you won't see changes that concurrent transactions made before you acquired the lock. + +This is not relevant for Postgres since it doesn't rely on in-memory locking (and it also runs on Read Committed isolation level by default). + +**Release in-memory locks only after committing changes** + +```python +# Don't do this! +lock resources +unlock resources +do smth else +await session.commit() +``` + +```python +# Do this! +lock resources +await session.commit() +unlock resources +``` + +If a transaction releases a lock before committing changes, the changes may not be visible to another transaction that acquired the lock and relies upon seeing all committed changes. + +**Using `joinedload` when selecting `.with_for_update()`** + +Using `joinedload` and `.with_for_update()` triggers an error in case of no related rows because `joinedload` produces LEFT OUTER JOIN and SELECT FOR UPDATE cannot be applied to the nullable side of an OUTER JOIN. Here are the options: + +* Use `.with_for_update(of=MainModel)`. +* Select with `selectinload`. +* First select with `.with_for_update()` without loading related attributes and then re-select with `joinedload` without `.with_for_update()`. +* Use regular `.join()` to lock related resources, but you may get 0 rows if there is no related row to join. + +**Always use `.with_for_update(key_share=True)` unless you plan to delete rows or update a primary key column** + +If you `SELECT FOR UPDATE` from a table that is referenced in a child table via a foreign key, it can lead to deadlocks if the child table is updated because Postgres will issue a `FOR KEY SHARE` lock on the parent table rows to ensure valid foreign keys. For this reason, you should always do `SELECT FOR NO KEY UPDATE` (`.with_for_update(key_share=True)`) if primary key columns are not modified.
`SELECT FOR NO KEY UPDATE` is not blocked by a `FOR KEY SHARE` lock, so no deadlock. + +**Lock unique names** + +The following pattern can be used to lock a unique name of some resource type: + +```python +lock_namespace = f"fleet_names_{project.name}" +if get_db().dialect_name == "sqlite": + # Start new transaction to see committed changes after lock + await session.commit() +elif get_db().dialect_name == "postgresql": + await session.execute( + select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace))) + ) + +lock, _ = get_locker(get_db().dialect_name).get_lockset(lock_namespace) +async with lock: + # ... select taken names, use a unique name + await session.commit() +``` + +Note that: + +* This pattern works assuming that Postgres is using default isolation level Read Committed. By the time a transaction acquires the advisory lock, all other transactions that can take the name have committed, so their changes can be seen and a unique name is taken. +* SQLite needs a commit before selecting taken names due to Snapshot Isolation as noted above. + +**Use `AsyncExitStack`** + +In-memory locking typically requires taking lock for long (until commit). +Using lock context managers for in-memory locking is often hard because the lock is tied to a block: + +```python +if something: + # Can't do this because the lock will be released before commit. How to lock? + async with get_locker(get_db().dialect_name).lock_ctx(...): + # ... +# ... +await session.commit() +``` + +Use [`contextlib.AsyncExitStack`](https://docs.python.org/3/library/contextlib.html#contextlib.AsyncExitStack): + +```python +async with AsyncExitStack() as exit_stack: + if something: + # The lock will be released only on stack exit, so it's ok. + await exit_stack.enter_async_context( + get_locker(get_db().dialect_name).lock_ctx(...) + ) + # ... + # ... 
+ await session.commit() +``` diff --git a/contributing/MIGRATIONS.md b/contributing/MIGRATIONS.md new file mode 100644 index 0000000000..f494d829d5 --- /dev/null +++ b/contributing/MIGRATIONS.md @@ -0,0 +1,75 @@ +# Database migrations + +`dstack` uses Alembic to manage database migrations. If you modify any SQLAlchemy +[models](../src/dstack/_internal/server/models.py) or related data structures, +generate a new migration with Alembic: + +```shell +cd src/dstack/_internal/server/ +alembic revision -m "" --autogenerate +``` + +Then adjust the generated migration if needed. + +## Deployment-compatible migrations + +The `dstack` server claims to support multi-replica setups with zero-downtime deployments. +This means DB migrations should not make changes that break old replicas. +Incompatible changes should be introduced in multiple stages (releases), following +the [expand and contract pattern](https://www.prisma.io/dataguide/types/relational/expand-and-contract-pattern). + +**Note**: If it's impossible to make the migration compatible with older versions, the PR should say so explicitly, so that the change is planned and released with the migration notice. + +Below are some common changes and how to make them. + +### Removing a column + +1. First release: + * Stop reading the column. In SQLAlchemy this can be done by setting `deferred=True` on a model field. +2. Second release: + * Drop the column. + +### Changing a column + +These steps apply to **renaming a column** or **changing the type of a column** + +1. First release: + * Introduce a new column with the new type. + * Write to both the new and the old column. +2. Second release: + * Migrate data from the old column to the new column. + * Start reading the new column. + * Stop reading the old column. + * Stop writing to the old column. +3. Third release: + * Drop the old column. 
+ +### Altering multiple tables + +Altering a table requires Postgres to [take an ACCESS EXCLUSIVE lock](https://www.postgresql.org/docs/current/sql-altertable.html). (This applies not only to statements that rewrite the tables but also to statements that modify table metadata.) Altering multiple tables can cause deadlocks due to conflict with read operations since the `dstack` server does not define an order for read operations. Altering multiple tables should be done in separate transactions/migrations. + +### Adding indexes + +Use `CREATE INDEX CONCURRENTLY` to avoid taking an exclusive lock on the table for a long time. +For migrations that create multiple indexes, failures can leave the schema in a partial state +(some indexes already created, some missing). On Postgres, concurrent index creation can also fail +midway and leave an invalid index object with the same name. Retrying the migration then fails +with "already exists". + +For retry-safe migrations, pre-drop indexes with `if_exists=True` before creating them again: + +```python +with op.get_context().autocommit_block(): + op.drop_index( + "ix_table_col", + table_name="table", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_table_col", + "table", + ["col"], + postgresql_concurrently=True, + ) +``` diff --git a/contributing/PIPELINES.md b/contributing/PIPELINES.md new file mode 100644 index 0000000000..7e01c2465e --- /dev/null +++ b/contributing/PIPELINES.md @@ -0,0 +1,127 @@ +# Pipelines + +This document describes how the `dstack` server implements background processing via so-called "pipelines". + +*Historical context: `dstack` used to do all background processing via scheduled tasks. A scheduled task would process a specific resource type like volumes or runs by keeping DB transaction open for the entire processing duration and keeping the resource lock with SELECT FOR UPDATE (or in-memory lock on SQLite).
This approach didn't scale well because the number of DB connections was a huge bottleneck. Pipelines replaced scheduled tasks: they do all the heavy processing outside of DB transactions and write locks to DB columns.* + +## Overview + +* Resources are continuously processed in the background by pipelines. A pipeline consists of a fetcher, workers, and a heartbeater. +* A fetcher selects rows to be processed from the DB, marks them as locked in the DB, and puts them into an in-memory queue. +* Workers consume rows from the in-memory queue, process the rows, and unlock them. +* The locking (unlocking) is done by setting (unsetting) `lock_expires_at`, `lock_token`, `lock_owner`. +* If the replica/pipeline dies, the rows stay locked in the DB. Another replica picks up the rows after `lock_expires_at`. +* `lock_token` prevents a stale replica/pipeline from updating the rows already picked up by the new replica. +* `lock_owner` stores the pipeline that's locked the row so that only that pipeline can recover if it's stale. +* A heartbeater tracks all rows in the pipeline (in the queue or in processing), and updates the lock expiration. This allows setting small `lock_expires_at` and picking up stale rows quickly. +* A fetcher performs the fetch when the queue size goes under a configured lower limit. It has exponential retry delays between empty fetches, thus reducing load on the DB. +* There is a fetch hint mechanism that services can use to notify the pipelines within the replica – in that case the fetcher stops sleeping and fetches immediately. +* Each pipeline locks one main resource but may lock related resources as well. It's not necessary to heartbeat related resources if the pipeline ensures no one else can re-lock them. This is typically done via setting and respecting `lock_owner`. + +Related notes: + +* All write APIs must respect DB-level locks.
The endpoints can either try to acquire the lock with a timeout and error or provide an async API by storing the request in the DB. + +## Implementation checklist + +Brief checklist for implementing a new pipeline: + +1. Fetcher locks only rows that are ready for processing: +`status/time` filters, `lock_expires_at` is empty or expired, and `lock_owner` is empty or equal to the pipeline name. Keep the fetch order stable with `last_processed_at`. +2. Fetcher takes row locks with `skip_locked` and updates `lock_expires_at`, `lock_token`, `lock_owner` before enqueueing items. +3. Worker keeps heavy work outside DB sessions. DB sessions should be short and used only for refetch/locking and final apply. +4. Apply stage updates rows using update maps/update rows, not by relying on mutating detached ORM models. +5. Main apply update is guarded by `id + lock_token`. If the update affects `0` rows, the item is stale and processing results must not be applied. +6. Successful apply updates `last_processed_at` and unlocks resources that were locked by this item. +7. If related lock is unavailable, reset main lock for retry: keep `lock_owner`, clear `lock_token` and `lock_expires_at`, and set `last_processed_at` to now. +8. Register the pipeline in `PipelineManager` and hint fetch from services after commit via `pipeline_hinter.hint_fetch(Model.__name__)`. +9. Add minimum tests: fetch eligibility/order, successful unlock path, stale lock token path, and related lock contention retry path. + +## Typical worker structure + +Most workers are easiest to reason about when `process()` is split into three phases: + +1. Load/refetch: open a short DB session, refetch the locked main row by `id + lock_token`, lock any required related rows, and gather any extra data needed for processing. +2. Process: do the heavy work outside DB sessions and build result objects or update maps instead of mutating detached ORM models. +3. 
Apply: open a short DB session, guard the main update by `id + lock_token`, resolve time placeholders, apply related updates, emit events, and unlock rows. + +A dedicated context object is often useful for the load step when the worker needs multiple loaded models, related lock metadata, or derived values that should be passed cleanly into processing and apply. For very small pipelines, a direct load -> process -> apply flow may still be clearer. + +Workers can share one context type and one apply function across all states even if the processing logic differs by state: + +```python +async def process(item): + context = await _load_process_context(item) + if context is None: + return + result = await _process_item(context) + await _apply_process_result(item, context, result) +``` + +Sometimes state-specific helpers are still the cleanest option, but they can still share a common apply phase if all states write results in the same general shape: + +```python +async def process(item): + if item.status == Status.PENDING: + context = await _load_pending_context(item) + elif item.status == Status.RUNNING: + context = await _load_running_context(item) + else: + return + if context is None: + return + result = await _process_item(context) + await _apply_process_result(item, context, result) +``` + +If different states have materially different write-side behavior, different apply paths are fine as well. This commonly happens when one state does a normal guarded update while another does delete-or-cleanup work with different related updates: + +```python +async def process(item): + if item.to_be_deleted: + await _process_to_be_deleted_item(item) + elif item.status == Status.SUBMITTED: + await _process_submitted_item(item) +``` + +It's ok not to force all pipelines into one exact shape. + +## Implementation patterns + +**Guarded apply by lock token** + +When writing processing results, update the main row with a filter by both `id` and `lock_token`. 
This guarantees that only the worker that still owns the lock can apply its results. If the update affects no rows, treat the item as stale and skip applying other changes (status changes, related updates, events). A stale item means another worker or replica already continued processing. + +**Locking related resources before refetch** + +If you first refetch a main resource and only afterwards lock the related resources, you need to ensure the worker doesn't get a stale view on related resources or works properly even in this case. It's often more robust to first lock related resources and then refetch the main resource with related resources already locked. + +**Locking many related resources** + +A pipeline may need to lock a potentially big set of related resources, e.g. a fleet pipeline locking all of a fleet's instances. For this, do one SELECT FOR UPDATE of non-locked instances and one SELECT to see how many instances there are, and check if you managed to lock all of them. If you fail to lock all of them, release the main lock and try processing on another fetch iteration. You may keep `lock_owner` on the main resource or set `lock_owner` on locked related resources and make other pipelines respect that to guarantee the eventual locking of all related resources and avoid lock starvation. + +**Locking a shared related resource** + +Multiple main resources may need to lock the same related resource, e.g. multiple jobs may need to change the shared instance. In this case it's not sufficient to set `lock_owner` on the related resource to the pipeline name because workers processing different main resources can still race with each other. To avoid heartbeating the related resource, you may include main resource id in `lock_owner`, e.g. set `lock_owner = f"{Pipeline.__name__}:{item.id}"`.
+ +**Reset-and-retry when related lock is unavailable** + +If a worker cannot lock a required related resource, it should release only the main lock state needed for fast retry: unset `lock_token` and `lock_expires_at`, keep `lock_owner`, and set `last_processed_at` to now. This avoids long waiting and lets the same pipeline retry quickly on the next fetch iteration while other pipelines can still respect ownership intent. + +**Dealing with side effects** + +If processing has side effects and the apply phase fails due to a lock mismatch, there are several options: a) revert side effects; b) make processing idempotent, i.e. the next processing iteration detects side effects and does not perform duplicate actions; c) log side effects as errors and warn the user about possible issues such as orphaned instances – as a temporary solution. + +**Bulk apply with one consistent current time** + +When apply needs to update multiple rows (main + related resources), build update maps/update rows first and resolve current-time placeholders once in the apply transaction using `NOW_PLACEHOLDER` + `resolve_now_placeholders()`. This keeps timestamps consistent across all rows and avoids subtle ordering bugs when the same processing pass writes several `*_at` fields. + +## Performance analysis + +* Pipeline throughput = workers_num / worker_processing_time. So quick tasks easily give high-throughput pipelines, e.g. 1s task with 20 workers is 1200 tasks/min. +A slow 30s task gives only 40 tasks/min with the same number of workers. We can increase the number of workers but the peak memory usage will grow proportionally. +In general, workers should be optimized to be as quick as possible to improve throughput. +* Processing latency (wait) is close to 0 due to fetch hints if the pipeline is not saturated. In general, latency = queue_size / throughput. +* In-memory queue maxsize provides a cap on memory usage and recovery time after crashes (number of locked items to retry).
+* Fetcher's DB load is proportional to the number of pipelines and is expected to be negligible. Workers can put a considerable read/write DB load as it's proportional to the number of workers. This can be optimized by batching workers' writes. Workers do processing outside of transactions so DB connections won't be a bottleneck. +* There is a risk of lock starvation if a worker needs to lock all related resources. This is to be mitigated by 1) related pipelines checking `lock_owner` and skipping locking to let the parent pipeline acquire all the locks eventually and 2) doing the related resource locking only on paths that require it. diff --git a/contributing/PROXY.md b/contributing/PROXY.md new file mode 100644 index 0000000000..6c738e999d --- /dev/null +++ b/contributing/PROXY.md @@ -0,0 +1,207 @@ +# `dstack-proxy` + +`dstack-proxy` is a set of `dstack` components responsible for exposing [services](https://dstack.ai/docs/concepts/services/). + +- By default, services are published at `dstack` server URL subpaths. The component that handles traffic to such services is called **in-server proxy**. It runs as part of the `dstack` server. It is implemented in `dstack._internal.server.services.proxy`. +- Users can optionally deploy a **gateway** to handle traffic to their services. The gateway app runs on a dedicated instance. Although it requires additional configuration, it provides higher performance and supports more features than the in-server proxy. It is implemented in `dstack._internal.proxy.gateway`. +- The in-server proxy and the gateway share some business logic, such as the OpenAI-compatible API and the connection pool implementation. The common details are implemented in `dstack._internal.proxy.lib`. + +## Proxy functions and modules + +### Reverse proxy + +`dstack-proxy` acts as a reverse proxy and load balancer for services. + +The in-server proxy uses a custom reverse proxy implementation based on FastAPI and httpx.
It routes requests based on the `/proxy/services//` path. It performs load balancing by selecting a random service replica for each forwarded request. + +The gateway uses Nginx. It automatically maintains Nginx configs for each service in `/etc/nginx/sites-enabled/*`. Each service is published at a subdomain. Nginx performs load balancing using the round-robin method. Nginx forwards requests directly to service replicas, so traffic does not go through the gateway app. + +### HTTPS + +The in-server proxy is part of the `dstack` server, so services are only available over HTTPS if the `dstack` server is deployed with HTTPS. + +The gateway can enforce HTTPS in the Nginx config. It uses Certbot to obtain TLS certificates from Let's Encrypt or another configured CA when the service is registered, unless the service configuration specifies `https: false`. + +### Auth + +Unless the service configuration specifies `auth: false`, `dstack-proxy` checks the authorization of incoming requests based on `dstack` user tokens from the `Authorization` header. + +The in-server proxy validates the tokens by querying the correct token from the database. + +On gateways, Nginx makes a headers-only subrequest to the gateway app to check authorization for each incoming request. The gateway app then makes a request to the `dstack` server to validate the token. Responses from the `dstack` server are cached for 60 seconds. If the `dstack` server does not respond, the incoming request is denied. + +### OpenAI-compatible API + +The OpenAI interface emulates the real OpenAI API for chat completion models. It can list running models in the project, convert between OpenAI and TGI request formats, and forward requests to the correct service. + +The in-server proxy forwards requests directly to service replicas. 
+ +```mermaid +sequenceDiagram + User->>dstack server: http(s):///proxy/models// + dstack server->>Service replica: http+unix:///path/to/ssh/socket +``` + +The gateway uses Nginx to forward requests so they are included in access logs and service stats. + +```mermaid +sequenceDiagram + User->>Nginx: http(s)://gateway./ + Nginx->>Gateway app: http://localhost:8000/api/models// + Gateway app->>Nginx: http://./ + Nginx->>Service replica: http+unix:///path/to/ssh/socket +``` + +### Stats collector + +`dstack-proxy` collects service usage stats that are then used by the `dstack` server for autoscaling. Stats collection is only supported on gateways and is implemented by reading `/var/log/nginx/dstack.access.log` + +### Service connection pool + +`dstack-proxy` connects to service replicas via SSH and forwards their service port to a local Unix socket, which is then used by the reverse proxy. All SSH connections are added to a pool and reused between requests. + +The in-server proxy opens an SSH connection when it needs to forward the first request to it, so there may be a delay when forwarding the first request. + +The gateway opens SSH connections when each replica is registered. + +### Communication with the `dstack` server + +The in-server proxy is part of the `dstack` server, so no network communication takes place between them. + +Gateway-to-server communication happens over an SSH connection that is established by the `dstack` server when it creates a new gateway or starts. The SSH connection includes bidirectional port forwarding, enabling the gateway and the `dstack` server to call each other's APIs: + +- The server calls the gateway to register services, fetch stats, etc. +- The gateway calls the server to validate user tokens. + +### Storage + +`dstack-proxy` has a set of stored models and a common storage repo interface. + +The in-server proxy repo implementation fetches the models from the database. 
It doesn't write to the database, as all the details about services and replicas are written by the `dstack` server. + +The gateway maintains its own in-memory storage repo. A copy of the repo is also stored in `~/dstack/state-v2.json`. The copy is updated on every write operation and is used for data recovery after restarts. The repo is populated when new services and replicas are registered. + +## Dependency injection + +When a module has to be implemented differently for the in-server proxy and the gateway, `dstack-proxy` uses interfaces with multiple implementations. For example, there are common interfaces for the storage repo and for auth checks. The in-server proxy and the gateway provide their own implementations for both interfaces. + +```mermaid +classDiagram + class BaseProxyRepo { + <> + } + BaseProxyRepo <|-- ServerProxyRepo + BaseProxyRepo <|-- GatewayProxyRepo + + class BaseProxyAuthProvider { + <> + } + BaseProxyAuthProvider <|-- ServerProxyAuthProvider + BaseProxyAuthProvider <|-- GatewayProxyAuthProvider +``` + +`dstack-proxy` then uses dependency injection to select the relevant implementation. Both the in-server proxy and the gateway provide an "injector" class that is used to obtain concrete interface implementations. + +```mermaid +classDiagram + class ProxyDependencyInjector { + <> + +get_repo() BaseProxyRepo + +get_auth_provider() BaseProxyAuthProvider + } + class ServerProxyDependencyInjector { + +get_repo() ServerProxyRepo + +get_auth_provider() ServerProxyAuthProvider + } + class GatewayDependencyInjector { + +get_repo() GatewayProxyRepo + +get_auth_provider() GatewayProxyAuthProvider + } + ProxyDependencyInjector <|-- ServerProxyDependencyInjector + ProxyDependencyInjector <|-- GatewayDependencyInjector +``` + +An instance of the relevant injector class is stored in the FastAPI global app state and can be accessed from FastAPI path operations. 
There are helper functions that can be used as FastAPI path operation dependencies to obtain the injector or a module implementation: `get_injector`, `get_proxy_repo`, `get_proxy_auth_provider`, etc. + +There are also similar gateway-specific helper functions to obtain gateway-specific module implementations: `get_gateway_proxy_repo` (guaranteed to return the gateway repo, which has more methods than the base repo interface), `get_nginx`, `get_stats_collector`, etc. + +## Gateway operations + +Gateway instances are managed by the `dstack` server. A gateway is associated with a project and some backend in the project. Gateways can be shared between projects using Exports. + +### Creation + +Users can create a gateway using the `dstack apply` command. The gateway YAML configuration must specify the gateway's wildcard domain - all direct subdomains should resolve to the gateway IP address. Since the IP address is unknown during provisioning, `dstack` doesn't check DNS records. + +Provisioning happens as follows: +1. Launch a non-GPU instance (usually the smallest) with all ports exposed. +2. Install Nginx, Certbot, and patch configs. +3. Create blue-green virtual environments. +4. Install the latest `dstack-gateway` package from the S3 bucket. `dstack-gateway` is a thin package that depends on the `dstack` package, which contains the actual gateway implementation. +5. Run the systemd service `dstack.gateway.service`. + +### Update + +The gateway has a "blue-green deployment"-like configuration: there are two virtual environments to be swapped on update. The systemd service uses the newly installed package after a restart. + +The update process looks like this: +1. Install the new package to the unused venv. +2. Update scripts and systemd service config. +3. Swap the active venv name in `~/dstack/version`. +4. Restart the systemd service. + +## Gateway development + +The gateway app needs to interact with Nginx and certbot, so running it locally can be challenging. 
One way to test your code is to upload your development branch to an existing gateway and run the gateway app from source. + +1. Run `dstack server` with `DSTACK_SKIP_GATEWAY_UPDATE=1` environment variable. This will prevent `dstack` from updating and starting the standard gateway version on each server restart. + +1. Provision a gateway through `dstack`: + + ```shell + dstack apply -f my-gateway.dstack.yml + ``` + +1. Save the gateway key to a file: + + ```shell + sqlite3 ~/.dstack/server/data/sqlite.db "SELECT ssh_private_key FROM gateway_computes WHERE deleted = 0 AND ip_address = ''" > /tmp/gateway.key + chmod 600 /tmp/gateway.key + ``` + +1. Deliver your code to the gateway. For example, clone it from a remote repo: + + ```shell + ssh -i /tmp/gateway.key ubuntu@gateway.example "git clone https://github.com/dstackai/dstack.git ~/dstack-repo" + ``` + + Or push it from your machine: + + ```shell + ssh -i /tmp/gateway.key ubuntu@gateway.example "git init ~/dstack-repo" + git remote add gateway ubuntu@gateway.example:~/dstack-repo + GIT_SSH_COMMAND='ssh -i /tmp/gateway.key' git push gateway branch_name + ``` + +1. Connect to the gateway: + + ```shell + ssh -i /tmp/gateway.key ubuntu@gateway.example + ``` + +1. Prepare an environment with your development branch on the gateway: + + ```shell + cd ~/dstack-repo + git checkout branch_name + curl -LsSf https://astral.sh/uv/install.sh | sh + source ~/.local/bin/env + uv sync --extra gateway + ``` + +1. Stop the gateway service and start your development version from source: + + ```shell + sudo systemctl stop dstack.gateway.service + uv run uvicorn dstack._internal.proxy.gateway.main:app + ``` diff --git a/contributing/RELEASE.md b/contributing/RELEASE.md new file mode 100644 index 0000000000..f0798082fc --- /dev/null +++ b/contributing/RELEASE.md @@ -0,0 +1,25 @@ +# Release + +This is a `dstack` release guide and checklist for core maintainers. + +## Checklist + +1. 
Test `master`: + 1. Compare changes to the previous release, e.g. [`https://github.com/dstackai/dstack/compare/0.19.39...master`](https://github.com/dstackai/dstack/compare/0.19.39...master). + 2. Test that `master` CLI works with the previous server release. PRs that add new model fields can potentially break client backward compatibility. + 3. Test that `master` server works with the previous CLI release. + 4. Pay special attention to releases with DB migrations. See [MIGRATIONS.md](MIGRATIONS.md). + * Ensure migrations work with rolling deployments and do not lock multiple tables. + * Test applying migrations while old replicas do active processing. + * Test migrations can be retried if they fail. For example, concurrent index may fail and stay in invalid state. +2. Create a tag, e.g. `git tag 0.19.40`. +3. Push the tag to trigger the Release `workflow`, i.e. `git push --tags`. +4. Generate GitHub release notes from the tag. Highlight major features, deprecations, breaking changes. +5. Install the release build and test once again, e.g. `uv pip install 'dstack[all]==0.19.40' --refresh`. +6. Release `dstack` Sky and `dstack` Enterprise. +7. Publish the release notes and make announcements. + +## Troubleshooting + +* If a release workflow fails due to a release workflow mistake and the release build is not published, commit a fix and update the tag reference: `git tag -d 0.19.40 && git tag 0.19.40 && git push --tags -f`. +* If a critical bug is found after the release is published, make a new release. In an extreme case, the broken release can be yanked. diff --git a/contributing/RUNNER-AND-SHIM.md b/contributing/RUNNER-AND-SHIM.md index e962528b42..747a0745d2 100644 --- a/contributing/RUNNER-AND-SHIM.md +++ b/contributing/RUNNER-AND-SHIM.md @@ -1,26 +1,42 @@ -dstack runs the user's configuration as a Docker container. 
The user can specify their own image name or use the preconfigured dstack image (with Python and CUDA). +# runner and shim -`dstack-runner` is a component responsible for setting environment variables and secrets, executing user commands, reporting logs and job status, and terminating the job on signal from the dstack server. `dstack-runner` is cloud-agnostic and runs as an entrypoint of a Docker container. +`dstack` runs the user's configuration as a Docker container. The user can specify their own image name or use the preconfigured dstack image (with Python and CUDA). -If the cloud provider has VM capabilities, dstack runs `dstack-shim` on the host to emulate a container-only environment. `dstack-shim` is responsible for pulling Docker images (public or private), configuring Docker containers (mounts, GPU forwarding, entrypoint, etc.), running Docker containers, and terminating the container on signal from the dstack server. +`dstack-runner` is a component responsible for setting environment variables and secrets, executing user commands, reporting logs and job status, and terminating the job on signal from the `dstack` server. `dstack-runner` is cloud-agnostic and runs as an entrypoint of a Docker container. + +If the cloud provider has VM capabilities, `dstack` runs `dstack-shim` on the host to emulate a container-only environment. `dstack-shim` is responsible for pulling Docker images (public or private), configuring Docker containers (mounts, GPU forwarding, entrypoint, etc.), running Docker containers, and terminating the container on signal from the `dstack` server. ## dstack-shim -`dstack-shim` works in cycles, allowing to run a different container once the job is finished. +`dstack-shim` works with _tasks_. Essentially, a task is a `dstack-shim`-specific part of `dstack`'s job, namely a Docker container with its associated data. `dstack-shim` is able to process multiple tasks in parallel. 
+ +A task is identified by a unique ID assigned by the `dstack` server. A task has a state: a status, allocated resources, data on disk (container, even if stopped, runner logs), etc. `dstack-shim` keeps a task in memory and its data on disk until the `dstack` server requests removal. The `dstack` server should periodically remove old tasks to clean up storage. Currently, the server removes a task right after a termination request, but this is subject to change. + +A lifecycle of a task is as follows: + +- Wait for a task submission from the `dstack` server (image ref, registry credentials if needed, user, resource constraints, network mode, etc.) +- Allocate GPU resources, find and mount volumes, pull the image +- Run the container and + - either wait for the container to exit + - or wait for the termination request from the `dstack` server +- Deallocate GPU resources, unmount volumes + +A container is started in either `host` or `bridge` network mode depending on the instance and the job: -- STEP 1: Wait for Docker image info (+ registry auth credentials if needed) from the dstack server -- STEP 2: Pull the Docker image -- STEP 3: Run the container - - Wait for container exit - - Or wait for the interruption signal from the dstack server -- STEP 4: Go to STEP 1 +- If the instance is shared (split into GPU blocks), network mode is set to `bridge` to avoid port conflicts +- …unless there are multiple jobs in multinode mode — in that case, the instance is never shared (the job takes the whole instance), and network mode is set to `host` + + **NOTE**: `host` networking mode would allow jobs to use any port at any moment for internal communication. For example, during distributed PyTorch training. +- If the instance is not shared by multiple jobs (i.e. the GPU blocks feature is not used), network mode is `host` to avoid unnecessary overhead + +In `bridge` mode, container ports are mapped to ephemeral host ports. `dstack-shim` stores port mapping as a part of the task's state. 
Currently, the default `bridge` network is used for all containers, but this could be changed in the future to improve container isolation. + +All communication between the `dstack` server and `dstack-shim` happens via HTTP API through an SSH tunnel. `dstack-shim` doesn't collect logs. Usually, it is run from a `cloud-init` user-data script. -All communication between the dstack server and `dstack-shim` happens via REST API through an SSH tunnel. `dstack-shim` doesn't collect logs. Usually, it is run from a `cloud-init` user-data script. The entrypoint for the container: - Installs `openssh-server` - Adds project and user public keys to `~/.ssh/authorized_keys` -- Downloads `dstack-runner` from the public S3 bucket -- Starts `dstack-runner` +- Starts `sshd` and `dstack-runner` ## dstack-runner @@ -30,18 +46,16 @@ The entrypoint for the container: - STEP 2: Wait for the code (tarball or diff) - STEP 3: Prepare the repo (clone git repo and apply the diff, or extract the archive) - STEP 4: Run the commands from the job spec - - Wait for the commands to exit - - Serve logs to the dstack server via HTTP - - Serve real-time logs to the CLI via WebSocket - - Wait for the signal to terminate the commands + - Wait for the commands to exit + - Serve logs to the `dstack` server via HTTP + - Serve real-time logs to the CLI via WebSocket + - Wait for the signal to terminate the commands - STEP 5: Wait until all logs are read by the server and the CLI. Or exit after a timeout -All communication between the dstack server and `dstack-runner` happens via REST API through an SSH tunnel. `dstack-runner` collects the job logs and its own logs. Only the job logs are served via WebSocket. +All communication between the `dstack` server and `dstack-runner` happens via HTTP API through an SSH tunnel. `dstack-runner` collects the job logs and its own logs. Only the job logs are served via WebSocket. ## SSH tunnels -dstack expects a running SSH server right next to the `dstack-runner`. 
It provides a secure channel for communication with the runner API and forwarding any ports without listening for `0.0.0.0`. The `dstack-gateway` also uses this SSH server for forwarding requests from public endpoints. - -`dstack-shim` must also be running next to the SSH server. The dstack server connects to this SSH server for interacting with both `dstack-shim` and `dstack-runner` since we use `host` networking mode for the Docker container. The CLI uses this SSH server as a jump host because the user wants to connect to the container. +`dstack` expects a running SSH server right next to the `dstack-runner`. It provides a secure channel for communication with the runner API and forwarding any ports without listening for `0.0.0.0`. The `dstack-gateway` also uses this SSH server for forwarding requests from public endpoints. -> `host` networking mode would allow jobs to use any port at any moment for internal communication. For example, during distributed PyTorch training. \ No newline at end of file +`dstack-shim` must also be running next to the SSH server. The `dstack` server connects to this SSH server for interacting with both `dstack-shim` and `dstack-runner`. The CLI uses this SSH server as a jump host because the user wants to connect to the container. diff --git a/contributing/RUNS-AND-JOBS.md b/contributing/RUNS-AND-JOBS.md index 010ed2bd4a..2f00e751be 100644 --- a/contributing/RUNS-AND-JOBS.md +++ b/contributing/RUNS-AND-JOBS.md @@ -1,89 +1,87 @@ +# Runs and jobs + ## Introduction -Run is the primary unit of workload in dstack. Users can: -1. Submit a run using `dstack run` or the API. +Run is the primary unit of workload in `dstack`. Users can: + +1. Submit a run using `dstack apply` or the API. 2. Stop a run using `dstack stop` or the API. -Runs are created from configurations. There are three basic types of configurations: +Runs are created from run configurations. There are three types of run configurations: + 1. 
`dev-environment` — runs a VS Code server. 2. `task` — runs the user's bash script until completion. -3. `service` — runs the user's bash script and exposes a port through the gateway, making it accessible from the internet via the HTTP protocol. - -A run can spawn one or multiple jobs, depending on the configuration. There could be multiple nodes in a cluster (for distributed training), multiple replicas (for load balancing), or both. During the execution, a job runs on an instance, and the instance can run only one job at any given moment. +3. `service` — runs the user's bash script and exposes a port through [dstack-proxy](PROXY.md). -If a job fails and the configuration allows retrying, the server will spawn a new job submission for the job. +A run can spawn one or multiple jobs, depending on the configuration. A task that specifies multiple `nodes` spawns a job for every node (a multi-node task). A service that specifies multiple `replicas` spawns a job for every replica. A job submission is always assigned to one particular instance. If a job fails and the configuration allows retrying, the server creates a new job submission for the job. ## Run's Lifecycle -- STEP 1: The user submits the run. `services.runs.submit_run` creates jobs with status `SUBMITTED`. Now the run has status `SUBMITTED`. -- STEP 2: The server periodically pulls unfinished runs and processes them in `background.tasks.process_runs`. - - If any job is `RUNNING`, the run becomes `RUNNING`. - - If any job is `PROVISIONING` or `PULLING`, the run becomes `PROVISIONING`. - - If any job fails and cannot be retried, the run becomes `TERMINATING`, and after processing, `FAILED`. - - If all jobs are `DONE`, the run becomes `TERMINATING`, and after processing, `DONE`. - - If any job fails, can be retried, and there is any other active job, the failed job will be resubmitted in-place. The run status is defined by the rules above. - - If all jobs fail and can be resubmitted, the run becomes `PENDING`. 
-- STEP 3: If the run is `TERMINATING`, the server terminates all jobs by setting their status to `TERMINATING` and assigning the proper `JobTerminationReason`. -- STEP 4: Once all jobs are finished, the run becomes `TERMINATED`, `DONE`, or `FAILED` based on `RunTerminationReason`. -- STEP 0: If the run is `PENDING`, `background.tasks.process_runs` will resubmit jobs. The run becomes `SUBMITTED` again. - -> No one must assign the finished status to the run, except `services.runs.process_terminating_run`. To terminate the run, assign `TERMINATING` status and `RunTerminationReason`. +- STEP 1: The user submits the run. `services.runs.submit_run` creates jobs with status `SUBMITTED`. The run starts in `SUBMITTED`. +- STEP 2: `RunPipeline` continuously processes unfinished runs. + - For active runs, it derives the run status from the latest job states in priority order: + 1. If any non-retryable failure is present, the run becomes `TERMINATING` with the relevant `RunTerminationReason`. + 2. If `stop_criteria == MASTER_DONE` and the master job is done, the run becomes `TERMINATING` with `ALL_JOBS_DONE`. + 3. Otherwise, if any job is `RUNNING`, the run becomes `RUNNING`. + 4. Otherwise, if any job is `PROVISIONING` or `PULLING`, the run becomes `PROVISIONING`. + 5. Otherwise, if jobs are still waiting for placement or provisioning, the run stays `SUBMITTED`. + 6. Otherwise, if all contributing jobs are `DONE`, the run becomes `TERMINATING` with `ALL_JOBS_DONE`. + 7. Otherwise, if no active replicas remain and the run should be retried, the run becomes `PENDING`. + - Retryable replica failures are handled before the final transition is applied: + - If a replica fails with a retryable reason while other replicas are still active, `RunPipeline` creates a new `SUBMITTED` submission for that replica and terminates the old jobs in that replica. + - If all remaining work is retryable, the run ends up in `PENDING`. 
+- STEP 3: If the run is `PENDING`, `RunPipeline` processes it in the pending phase. + - For retrying runs, it waits for an exponential backoff before resubmitting. + - For scheduled runs, it waits until `next_triggered_at`. + - For scaled-to-zero services, it can keep the run in `PENDING` until autoscaling wants replicas again. + - Once the run is ready to continue, `RunPipeline` creates new `SUBMITTED` jobs and moves the run back to `SUBMITTED`. +- STEP 4: If the run is `TERMINATING`, `RunPipeline` marks active jobs as `TERMINATING` and assigns the corresponding `JobTerminationReason`. +- STEP 5: Once all jobs are finished, the terminating phase of `RunPipeline` either: + - assigns the final run status (`TERMINATED`, `DONE`, or `FAILED`), or + - for scheduled runs that were not stopped or aborted by the user, returns the run to `PENDING` and computes a new `next_triggered_at`. ### Services -Services' lifecycle has some modifications: -- During STEP 1, the service is registered on the gateway. If the gateway is not accessible or the domain name is taken, the run submission fails. -- During STEP 2, downscaled jobs are ignored. -- During STEP 4, the service is unregistered on the gateway. -- During STEP 0, the service can stay in `PENDING` status if it was downscaled to zero (WIP). + +Services' run lifecycle has some modifications: + +- During STEP 1, the service itself is registered on the gateway or the in-server proxy. If the gateway is not accessible or the domain name is taken, submission fails. +- During STEP 2, active run processing also computes desired replica counts from gateway stats and handles scale-up, scale-down, rolling deployment, and cleanup of removed replica groups. +- During STEP 2, jobs already marked `SCALED_DOWN` do not contribute to the run status. +- During STEP 3, a service can stay in `PENDING` when autoscaling currently wants zero replicas. +- During STEP 5, the terminating phase of `RunPipeline` unregisters the service from the gateway. 
### When can the job be retried? -It's a complicated question and will be elaborated later with multi-node and replica implementation. -For now, dstack retries only if: -- The configuration has enabled the retry policy. -- The job failed because of `NO_CAPACITY`, and the instance was a spot. +`dstack` retries the run only if: + +- The configuration enables `retry`. +- The job termination reason is covered by `retry.on_events`. +- The `retry.duration` is not exceeded. ## Job's Lifecycle - STEP 1: A newly submitted job has status `SUBMITTED`. It is not assigned to any instance yet. -- STEP 2: `background.tasks.process_submitted_jobs` tries to assign an existing instance or provision a new one. - - On success, the job becomes `PROVISIONING`. - - On failure, the job becomes `TERMINATING`, and after processing, `FAILED` because of `NO_CAPACITY`. -- STEP 3: `background.tasks.process_running_jobs` periodically pulls unfinished jobs and processes them. - - While `dstack-shim`/`dstack-runner` is not responding, the job stays `PROVISIONING`. - - Once `dstack-shim` (for VM-featured backends) becomes available, the server submits the docker image name, and the job becomes `PULLING`. - - Once `dstack-runner` inside a docker container becomes available, the server submits the code and the job spec, and the job becomes `RUNNING`. - - If `dstack-shim` or `dstack-runner` don't respond for a long time or fail to respond after successful connection and multiple retries, the job becomes `TERMINATING`, and after processing, `FAILED`. -- STEP 4: `background.tasks.process_running_jobs` processes `RUNNING` jobs, pulling job logs, runner logs, and job status. - - If the pulled status is `DONE`, the job becomes `TERMINATING`, and after processing, `DONE`. - - Otherwise, the job becomes `TERMINATING`, and after processing, `FAILED`. -- STEP 5: `background.tasks.process_terminating_jobs` processes `TERMINATING` jobs. - - If the job has `remove_at` in the future, nothing happens. 
- - Once `remove_at` is in the past, the server stops the container via `dstack-shim` and releases the instance. The job becomes `TERMINATED`, `DONE`, `FAILED`, or `ABORTED` based on `JobTerminationReason`. - -> No one must assign the finished status to the job, except `services.jobs.process_terminating_job`. To terminate the job, assign `TERMINATING` status and `JobTerminationReason`. +- STEP 2: `JobSubmittedPipeline` assigns the job in two phases: + - Assignment: claim an existing instance or reserve a *placeholder* `InstanceModel`. Placeholders are `PENDING` instances that reserve an `instance_num` and a `nodes.max` slot. `InstancePipeline` ignores them. + - Provisioning: reuse the existing instance, or cloud-provision and promote the placeholder to `PROVISIONING`. + - On success, the job becomes `PROVISIONING`. + - On failure, the job becomes `TERMINATING`. `JobTerminatingPipeline` later assigns the final failed status. +- STEP 3: `JobRunningPipeline` processes `PROVISIONING`, `PULLING`, and `RUNNING` jobs. + - While `dstack-shim` / `dstack-runner` is not responding, the job stays `PROVISIONING`. + - Once `dstack-shim` (for VM-featured backends) becomes available, the pipeline submits the image and the job becomes `PULLING`. + - Once `dstack-runner` inside the container becomes available, the pipeline uploads the code and job spec, and the job becomes `RUNNING`. + - While the job is `RUNNING`, the pipeline keeps collecting logs and runner status. + - If startup, runner communication, or replica registration fails, the job becomes `TERMINATING`. +- STEP 4: Once the job is actually ready, `JobRunningPipeline` initializes probes. +- STEP 5: `JobTerminatingPipeline` processes `TERMINATING` jobs. + - If the job has `remove_at` in the future, it waits. This gives the job time for a graceful stop. + - Once `remove_at` is in the past, it stops the container, detaches volumes, unregisters service replicas if needed, and releases the instance assignment. 
+ - If some volumes are not detached yet, the job stays `TERMINATING` and is retried. + - When cleanup is complete, the job becomes `TERMINATED`, `DONE`, `FAILED`, or `ABORTED` based on `JobTerminationReason`. ### Services' Jobs -Services' jobs lifecycle has some modifications: -- During STEP 3, once the job becomes `RUNNING`, it is registered on the gateway as a replica. If the gateway is not accessible, the job fails. -- During STEP 5, the job is unregistered on the gateway (WIP). - -## Stop a Run -To stop a run, `services.runs.stop_runs` assigns `TERMINATING` status to the run and executes one iteration of the processing without waiting for the background task. -## Concurrency -Since SQLite lacks per-row locking, we use an in-memory locking mechanism to avoid race conditions. - -Every lock consists of a lock primitive (`asyncio.Lock`) and a set of locked IDs (`Set[uuid.UUID]`). It follows the rules below: -- Only the `asyncio.Lock` holder can add an ID to the set. -- The processing task must remove the corresponding ID from the set; acquiring `asyncio.Lock` is not required. - -Runs and jobs are processed by concurrent background tasks. There are locks for runs and jobs: -- `PROCESSING_RUNS_(LOCK|IDS)` -- `SUBMITTED_PROCESSING_JOBS_(LOCK|IDS)` -- `RUNNING_PROCESSING_JOBS_(LOCK|IDS)` -- `TERMINATING_PROCESSING_JOBS_(LOCK|IDS)` +Services' jobs lifecycle has some modifications: -Run processing takes priority over job processing; that's why: -- Once `run.id` is in `PROCESSING_RUNS_IDS`, any job processing task should not take any job with `job.run_id` in `PROCESSING_RUNS_IDS`. -- Any run processing task should wait until job processing tasks release all run's job IDs from `*_PROCESSING_JOBS_IDS` sets. \ No newline at end of file +- During STEP 3, once the primary job of a replica is `RUNNING` and ready to receive traffic, `JobRunningPipeline` registers that replica on the gateway. 
If the gateway is not accessible, the job fails with a gateway-related termination reason. +- During STEP 5, `JobTerminatingPipeline` unregisters the replica from receiving requests before the job is fully cleaned up. diff --git a/docker/amd-smi/Dockerfile b/docker/amd-smi/Dockerfile new file mode 100644 index 0000000000..3736f06095 --- /dev/null +++ b/docker/amd-smi/Dockerfile @@ -0,0 +1,30 @@ +ARG UBUNTU_VERSION + +FROM ubuntu:${UBUNTU_VERSION} + +ARG IMAGE_NAME +ARG UBUNTU_VERSION +ARG ROCM_VERSION +ARG DSTACK_REVISION +ARG BUILD_DATE + +ENV PATH="/opt/rocm/bin:${PATH}" + +RUN \ + export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && \ + apt-get install -y --no-install-recommends wget ca-certificates && \ + base_url="https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/${UBUNTU_VERSION}/" && \ + deb_name=$(wget -qO- "${base_url}" | grep -Po '(?<=href=")amdgpu-install_[^"]+') && \ + wget -O amdgpu-install.deb "${base_url}${deb_name}" && \ + apt-get install -y --no-install-recommends ./amdgpu-install.deb && \ + rm ./amdgpu-install.deb && \ + apt-get update && \ + apt-get install -y amd-smi-lib + +ENTRYPOINT ["/opt/rocm/bin/amd-smi"] +CMD ["--help"] + +LABEL org.opencontainers.image.title="${IMAGE_NAME}" +LABEL org.opencontainers.image.version="${ROCM_VERSION}-${DSTACK_REVISION}" +LABEL org.opencontainers.image.created="${BUILD_DATE}" diff --git a/docker/amd-smi/README.md b/docker/amd-smi/README.md new file mode 100644 index 0000000000..577036d9c4 --- /dev/null +++ b/docker/amd-smi/README.md @@ -0,0 +1,9 @@ +# dstack AMD SMI + +An Ubuntu-based image with [AMD SMI](https://rocm.docs.amd.com/projects/amdsmi/en/latest/) preinstalled. Suitable for AMD GPU detection. 
+ +## Usage + +```shell +docker run --rm --device /dev/kfd --device /dev/dri dstackai/amd-smi static +``` diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 9e110458bf..6c01f2e94d 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,30 +1,187 @@ -FROM nvidia/cuda:12.1.0-base-ubuntu20.04 +ARG UBUNTU_VERSION -ARG PYTHON -ENV PYTHON=$PYTHON +# ============================================================================ +# common: shared base for all flavors. Select a flavor with `--target <flavor>` +# (base / devel / devel-efa). +# ============================================================================ + +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS common + +# ARGs before FROM must be redeclared to be used after FROM +ARG UBUNTU_VERSION + +ARG _UV_HOME="/opt/uv" + +ENV UV_INSTALL_DIR="${_UV_HOME}/bin" +ENV UV_MANAGED_PYTHON=1 ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV PIP_ROOT_USER_ACTION=ignore - -RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update --fix-missing && \ - apt upgrade -y && \ - export DEBIAN_FRONTEND=noninteractive && \ - ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ - dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget && \ - sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \ - mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_* - -RUN wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \ - bash Miniforge3.sh -b -p "/opt/conda" && \ - rm Miniforge3.sh && \ - ln -s 
/opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - chmod +x /opt/conda/etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - /opt/conda/condabin/conda update conda --all -y && \ - /opt/conda/condabin/conda create --name workflow python=${PYTHON} -y && \ - /opt/conda/condabin/conda config --prepend channels "nvidia/label/cuda-12.1.0" && \ - /opt/conda/condabin/conda config --set always_yes true && \ - /opt/conda/condabin/conda clean --all && \ - echo "conda activate workflow" >> ~/.bashrc \ No newline at end of file + +ENV PATH="${UV_INSTALL_DIR}:${PATH}" + +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + # nvidia/cuda ships nvidia's apt signing key in legacy format (/etc/apt/trusted.gpg). + # This leads to warnings, so we install cuda-keyring. + && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && rm cuda-keyring_1.1-1_all.deb \ + && rm -f /etc/apt/sources.list.d/cuda.list \ + && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ + && mkdir /run/sshd \ + && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && chmod 600 ~/.ssh/authorized_keys \ + && rm /etc/ssh/ssh_host_* \ + # User: UID/GID 1001 because Ubuntu 24.04 ships a default 'ubuntu' user at 1000. 
+ && apt-get install -y sudo \ + && groupadd -g 1001 dstack \ + && useradd -u 1001 -g 1001 -G sudo -s /bin/bash -m dstack \ + && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ + # Default working dir + && mkdir -p /dstack/run \ + && chmod a+rwx /dstack/run \ + # Cleanup + && rm -rf /var/lib/apt/lists/* + +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ + && uv python install --preview --default + +# ============================================================================ +# builder: builds NCCL and nccl-tests from source for the base/devel flavors. +# ============================================================================ + +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS builder + +ENV NCCL_HOME=/opt/nccl +ENV CUDA_HOME=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . 
'{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + git \ + curl \ + python3 \ + build-essential + +ARG NCCL_VERSION=2.26.2-1 + +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +# ============================================================================ +# base: common + NCCL (from builder) + OpenMPI. +# ============================================================================ + +FROM common AS base + +ENV NCCL_HOME=/opt/nccl + +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} +COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build + +RUN apt-get update \ + && apt-get install -y --no-install-recommends openmpi-bin \ + && rm -rf /var/lib/apt/lists/* \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig + +WORKDIR /dstack/run + +# ============================================================================ +# devel: base + CUDA development libraries and NVCC. +# ============================================================================ + +FROM base AS devel + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /dstack/run + +# ============================================================================ +# devel-efa: common + CUDA dev libraries + AWS EFA + EFA-aware NCCL. 
+# ============================================================================ + +FROM common AS devel-efa + +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + && rm -rf /var/lib/apt/lists/* + +ARG EFA_VERSION=1.48.0 + +RUN cd /tmp \ + && apt-get update \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* + +ARG NCCL_VERSION=2.27.7-1 + +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && rm -rf /tmp/nccl + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +WORKDIR /dstack/run diff --git a/docker/base/README.md b/docker/base/README.md index 3bbbafa732..5246cbc40b 100644 --- a/docker/base/README.md +++ b/docker/base/README.md @@ -1 +1,19 @@ -Image for `dstack` runner instances. +# dstack base images + +Base images for `dstack` runner instances. A single multi-stage `Dockerfile` +produces all flavors; select one with `docker build --target <flavor>`: + +- **base** — CUDA 12.8, Python (uv-managed), NCCL 2.26.2-1 + NCCL Tests, Open MPI. 
+- **devel** — `base` plus the CUDA development libraries and NVCC. +- **devel-efa** — the `common` stage plus CUDA dev libraries, AWS EFA Installer 1.48.0 + (Libfabric + Open MPI + AWS OFI NCCL 1.19.0), and an EFA-aware NCCL 2.27.7-1 + build + NCCL Tests. + +Build args: `UBUNTU_VERSION` (e.g. `24`). + +Example: + +```bash +docker build --target devel-efa --build-arg UBUNTU_VERSION=24 \ + -t dstackai/base:local-devel-efa-ubuntu24.04 -f base/Dockerfile . +``` diff --git a/docker/dind/Dockerfile b/docker/dind/Dockerfile new file mode 100644 index 0000000000..d41042d187 --- /dev/null +++ b/docker/dind/Dockerfile @@ -0,0 +1,24 @@ +# Keep BASE_VERSION (base image tag) and BASE_SHA256 in sync +ARG BASE_VERSION=24.04-20240827003913 +ARG BASE_SHA256=c9812aad4dbb79d800e5aecdfe41b66ffe47b73f0c070b5143c5acdcd6643ba1 + +FROM ghcr.io/ehfd/nvidia-dind@sha256:${BASE_SHA256} + +ARG IMAGE_NAME +ARG BASE_VERSION +ARG DSTACK_REVISION +ARG BUILD_DATE + +COPY start-dockerd /usr/local/bin/ + +RUN \ + chmod 755 /usr/local/bin/start-dockerd && \ + rm /usr/local/bin/entrypoint.sh && \ + sed -i -e '/nodaemon/d' -e '/program:entrypoint/,/^[[:space:]]*$/d' /etc/supervisord.conf + +ENTRYPOINT [] +CMD ["/usr/local/bin/start-dockerd", "-v", "-l"] + +LABEL org.opencontainers.image.title="${IMAGE_NAME}" +LABEL org.opencontainers.image.version="${BASE_VERSION}-${DSTACK_REVISION}" +LABEL org.opencontainers.image.created="${BUILD_DATE}" diff --git a/docker/dind/README.md b/docker/dind/README.md new file mode 100644 index 0000000000..7e0f378a20 --- /dev/null +++ b/docker/dind/README.md @@ -0,0 +1,34 @@ +# dstack DinD + +An [NVIDIA Docker in Docker](https://github.com/ehfd/nvidia-dind) image tailored for use with `dstack` + +## Usage + +```yaml +type: service + +name: dind + +image: dstackai/dind +privileged: true + +port: 3000 +auth: false + +commands: + # start docker daemon + - start-dockerd + # list stored images + - docker image ls + # run docker with nvidia gpu example (nvidia-smi) 
+ - docker run --rm --gpus all debian nvidia-smi + # run docker compose example (gitea+postgres) + - git clone --depth 1 https://github.com/docker/awesome-compose.git + - cd awesome-compose/gitea-postgres + - docker compose up + +# preserve docker data root between runs (including volumes and image store) +volumes: + - name: dind-volume + path: /var/lib/docker +``` diff --git a/docker/dind/start-dockerd b/docker/dind/start-dockerd new file mode 100755 index 0000000000..8a4abb3c0b --- /dev/null +++ b/docker/dind/start-dockerd @@ -0,0 +1,201 @@ +#!/usr/bin/env bash +set -euo pipefail + +# constants + +SUPERVISORD_LOG='/tmp/supervisord.log' +DOCKERD_LOG='/tmp/dockerd.log' +DEFAULT_TIMEOUT=60 + +ERR_INSUFFICIENT_PRIVILEGES=101 +ERR_DOCKERD_FAILED=102 +ERR_INVALID_ARGS=103 +ERR_SUPERVISORD_FAILED=104 + +# options + +verbose=false +quiet=false +logs=false +timeout=${DEFAULT_TIMEOUT} + +# functions + +is_true() { + [[ ${1} = true ]] +} + +is_verbose() { + is_true ${verbose} +} + +is_quiet() { + is_true ${quiet} +} + +say() { + echo "${@}" >&2 +} + +say_verbose() { + if is_verbose; then + say "${@}" + fi +} + +check_privileged_mode_or_die() { + mkdir /mnt/_tmp + if ! mount -t tmpfs none /mnt/_tmp 2> /dev/null; then + say 'docker privileged mode required' + rm -r /mnt/_tmp + exit ${ERR_INSUFFICIENT_PRIVILEGES} + fi + umount /mnt/_tmp + rm -r /mnt/_tmp +} + +start_restart_dockerd() { + if supervisorctl pid > /dev/null; then + say_verbose "restarting dockerd" + supervisorctl stop dockerd > /dev/null + if [[ -f ${DOCKERD_LOG} ]]; then + rm ${DOCKERD_LOG} + fi + supervisorctl start dockerd > /dev/null + echo 'restarted' + else + local ctl_status=${?} + # LSBInitExitStatuses.NOT_RUNNING = 7 + if [[ ${ctl_status} -eq 7 ]]; then + say_verbose "starting dockerd" + supervisord -c /etc/supervisord.conf + echo 'started' + else + say "supervisorctl exited with status ${ctl_status}" + if ! 
is_quiet && [[ -f ${SUPERVISORD_LOG} ]]; then + cat ${SUPERVISORD_LOG} >&2 + fi + exit ${ERR_SUPERVISORD_FAILED} + fi + fi +} + +move_processes_to_separate_cgroup() { + # Move processes to a separate cgroup to prevent the root cgroup from becoming + # threaded -- "Once you have a threaded controller you can not create cgroups + # below it that reference non-threaded controllers like the memory controller". + # "A domain cgroup is turned into a threaded domain when [...] threaded controllers + # are enabled in the “cgroup.subtree_control” file while there are processes + # in the cgroup." + # Fixes "cannot enter cgroupv2 "/sys/fs/cgroup/docker" with domain controllers -- + # it is in threaded mode" when starting containers with resource constraints, + # see https://github.com/dstackai/dstack/issues/1854 + # Based on https://github.com/moby/moby/blob/65cfcc2/hack/dind#L59 and + # https://github.com/earthly/earthly/blob/08b0d1f/buildkitd/dockerd-wrapper.sh#L63 + if [[ -f /sys/fs/cgroup/cgroup.controllers ]]; then + local group=/sys/fs/cgroup/dind + mkdir -p ${group} + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > ${group}/cgroup.procs || true + fi +} + +wait_dockerd_started() { + local counter=1 + while true; do + if grep -qs 'API listen on' ${DOCKERD_LOG}; then + return 0 + fi + if [[ ${counter} -gt ${timeout} ]]; then + break + fi + say_verbose "waiting for dockerd to start (${counter}/${timeout})" + ((counter++)) + sleep 1 + done + return 1 +} + +stop_dockerd_and_die() { + supervisorctl stop dockerd > /dev/null + say 'failed to start dockerd' + if ! 
is_quiet; then + cat ${DOCKERD_LOG} >&2 + fi + exit ${ERR_DOCKERD_FAILED} +} + +usage() { + echo 'usage: start-dockerd [-v|-q] [-l] [-t SECONDS]' + echo ' -v, --verbose get more output, mutually exclusive with -q' + echo ' -q, --quiet get less output, mutually exclusive with -v' + echo ' -l, --logs follow dockerd log output' + echo ' -t, --timeout SECONDS wait for dockerd to start the specified amount' + echo ' of seconds before failing with error, ' + echo " ${DEFAULT_TIMEOUT} seconds by default" +} + +# main + +check_privileged_mode_or_die + +while [[ ${#} -gt 0 ]]; do + option=${1} + shift + case ${option} in + --verbose|-v) + verbose=true + ;; + --quiet|-q) + quiet=true + ;; + --logs|-l) + logs=true + ;; + --timeout|-t) + if [[ ${#} -eq 0 ]]; then + say "${option}: value expected" + exit ${ERR_INVALID_ARGS} + fi + timeout=${1} + shift + # single brackets are intentional, compare to: + # set -u; [[ "foo" -gt 0 ]] + # bash: foo: unbound variable + if ! [ "${timeout}" -gt 0 ] 2> /dev/null; then + say "${option}: invalid value" + exit ${ERR_INVALID_ARGS} + fi + ;; + --help|-h) + usage + exit 0 + ;; + *) + say "${option}: invalid option" + usage + exit ${ERR_INVALID_ARGS} + ;; + esac +done + +if is_verbose && is_quiet; then + say '--verbose and --quiet are mutually exclusive' + exit ${ERR_INVALID_ARGS} +fi + +event=$(start_restart_dockerd) +if ! wait_dockerd_started; then + stop_dockerd_and_die +fi + +if [[ ${event} = 'started' ]]; then + move_processes_to_separate_cgroup +fi + +if ! 
is_quiet; then + say "dockerd ${event}" +fi + +if is_true ${logs}; then + tail -f ${DOCKERD_LOG} +fi diff --git a/docker/server/Dockerfile.nebius b/docker/server/Dockerfile.nebius new file mode 100644 index 0000000000..d449d8833b --- /dev/null +++ b/docker/server/Dockerfile.nebius @@ -0,0 +1,5 @@ +ARG BASE_IMAGE + +FROM ${BASE_IMAGE} + +RUN curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash diff --git a/docker/server/README.md b/docker/server/README.md index 1e3f278b81..2c91a4e152 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -1,24 +1,12 @@ -`dstack` is an open-source engine that automates infrastructure provisioning on any cloud — for development, training, and deployment of AI models. +`dstack` is a streamlined alternative to Kubernetes, specifically designed for AI. It simplifies container orchestration +for AI workloads both in the cloud and on-prem, speeding up the development, training, and deployment of AI models. -## Configure backends - -To let `dstack` run workloads in your cloud account(s), you need to configure cloud credentials -in `~/.dstack/server/config.yml` under the `backends` property of the respective project. - -Example: +`dstack` supports `NVIDIA GPU`, `AMD GPU`, and `Google Cloud TPU` out of the box. -```yaml -projects: -- name: main - backends: - - type: aws - creds: - type: access_key - access_key: AIZKISCVKUKO5AAKLAEH - secret_key: QSbmpqJIUBn1V5U3pyM9S6lwwiu8/fOJ2dgfwFdW -``` +## Configure backends -For further backend configuration details, refer to [Installation](https://dstack.ai/docs/installation/). +To use `dstack` with your own cloud accounts, create the `~/.dstack/server/config.yml` file and +[configure backends](https://dstack.ai/docs/reference/server/config.yml). 
## Start the server @@ -31,74 +19,58 @@ The dstack server is running at http://0.0.0.0:3000 The admin user token is 'bbae0f28-d3dd-4820-bf61-8f4bb40815da' ``` -## Set up the CLI +For more details on server configuration options, see the +[server deployment](https://dstack.ai/docs/guides/server-deployment) guide. -The client is configured via `~/.dstack/config.yml` with the server address, user token, and -the project name. +### Run with PostgreSQL and the SSH proxy -To configure this, use the `dstack config` command: +In production, the `dstack` server is usually run with +[PostgreSQL](https://dstack.ai/docs/guides/server-deployment#postgresql) instead of the default +SQLite, and with the [SSH proxy](https://dstack.ai/docs/guides/server-deployment#ssh-proxy). The +[`docker-compose.yml`](https://github.com/dstackai/dstack/blob/master/docker/server/docker-compose.yml) +runs that combination locally, so you can try or test a production-like server on your own machine. +A full production deployment would also configure external +[logs storage](https://dstack.ai/docs/guides/server-deployment#logs-storage) and +[file storage](https://dstack.ai/docs/guides/server-deployment#file-storage). ```shell -dstack config --project main --server http://0.0.0.0:3000 --token bbae0f28-d3dd-4820-bf61-8f4bb40815da +docker compose -f docker/server/docker-compose.yml up ``` -This will update `~/.dstack/config.yml` allowing the CLI and API to connect to the server by default. - -## Environment variables - -Here's the list of environment variables which you can override: - -- `DSTACK_SERVER_DIR` – (Optional) The path to the directory where the `dstack` server stores the state. Defaults to `/root/.dstack/server`. 
-- `DSTACK_DATABASE_URL` – (Optional) The database URL to use instead of default SQLite. Currently `dstack` supports Postgres. Example: `postgresql+asyncpg://myuser:mypassword@localhost:5432/mydatabase`. -- `DSTACK_SERVER_ADMIN_TOKEN` – (Optional) The default token of the `admin` user. By default, it's generated randomly - at the first startup. - -## Persist state - -By default, `dstack` stores its state in `~/.dstack/server/data` using SQLite. -To use a database, set the `DSTACK_DATABASE_URL` environment variable (see below). - -### Replicate SQLite state via Litestream +This starts PostgreSQL, the `dstack` server at `http://localhost:3000`, and the SSH proxy at +`localhost:30022`. The admin token is printed to the logs (`docker compose logs server`). -If not using `DSTACK_DATABASE_URL`, you can still replicate the state to cloud object storage using Litestream. To do -this, you need to set the following environment variables. +To access the server from the CLI, add it as a project with `dstack project add`, using the +admin token from the logs (see [Set up the CLI](#set-up-the-cli) below). -- `LITESTREAM_REPLICA_URL` - The url of the cloud object storage. - Examples: `s3:///`, `gcs:///`, `abs://@/`, etc. - -#### AWS S3 - -To persist state into an AWS S3 bucket, provide the following environment variables: - -- `AWS_ACCESS_KEY_ID` - The AWS access key ID -- `AWS_SECRET_ACCESS_KEY` - The AWS secret access key - -#### GCP Storage - -To persist state into an AWS S3 bucket, provide one of the following environment variables: - -- `GOOGLE_APPLICATION_CREDENTIALS` - The path to the GCP service account key JSON file -- `GOOGLE_APPLICATION_CREDENTIALS_JSON` - The GCP service account key JSON - -#### Azure Blob Storage - -To persist state into an Azure blog storage, provide the following environment variable. 
+## Set up the CLI -- `LITESTREAM_AZURE_ACCOUNT_KEY` - The Azure storage account key +To point the CLI to the `dstack` server, configure it +with the server address, user token, and project name: -More [details](https://litestream.io/guides/) on options for configuring replication. +```shell +$ pip install dstack +$ dstack project add --name main \ + --url http://127.0.0.1:3000 \ + --token bbae0f28-d3dd-4820-bf61-8f4bb40815da + +Configuration is updated at ~/.dstack/config.yml +``` -_**️Note:** The use of Litestream requires that only one instance of the `dstack` server is running at a time._ +## Create SSH fleets + +If you want the `dstack` server to run containers on your on-prem servers, +use [fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets). ## More information For additional information and examples, see the following links: -* [Docs](https://dstack.ai/docs/) -* [Examples](https://github.com/dstackai/dstack/tree/master/examples) +* [Docs](https://dstack.ai/docs) +* [Examples](https://dstack.ai/examples) * [Changelog](https://github.com/dstackai/dstack/releases) * [Discord](https://discord.gg/u8SmfwPpMd) ## License -[Mozilla Public License 2.0](https://github.com/dstackai/dstack/blob/master/LICENSE.md) \ No newline at end of file +[Mozilla Public License 2.0](https://github.com/dstackai/dstack/blob/master/LICENSE.md) diff --git a/docker/server/docker-compose.yml b/docker/server/docker-compose.yml new file mode 100644 index 0000000000..529271059c --- /dev/null +++ b/docker/server/docker-compose.yml @@ -0,0 +1,73 @@ +name: dstack + +services: + postgres: + image: postgres:16 + restart: unless-stopped + environment: + POSTGRES_USER: 
${DSTACK_POSTGRES_USER:-dstack} + POSTGRES_PASSWORD: ${DSTACK_POSTGRES_PASSWORD:-dstack} + POSTGRES_DB: ${DSTACK_POSTGRES_DB:-dstack} + volumes: + - postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 5s + timeout: 5s + retries: 10 + + server: + image: dstackai/dstack:latest # multi-arch — no platform override needed + restart: unless-stopped + depends_on: + postgres: + condition: service_healthy + environment: + DSTACK_DATABASE_URL: postgresql+asyncpg://${DSTACK_POSTGRES_USER:-dstack}:${DSTACK_POSTGRES_PASSWORD:-dstack}@postgres:5432/${DSTACK_POSTGRES_DB:-dstack} + DSTACK_SERVER_LOG_FORMAT: rich # human-readable console logs (image defaults to json) + # Shared secret between the server and the SSH proxy. The default works locally; + # override with a real secret for non-local deployments. + DSTACK_SSHPROXY_API_TOKEN: ${DSTACK_SSHPROXY_API_TOKEN:-dstack-sshproxy-token} + # Address clients use to reach the SSH proxy. `localhost:30022` is correct for a + # client on the same host; use a publicly reachable address for multi-host setups. + DSTACK_SERVER_SSHPROXY_ADDRESS: localhost:30022 + volumes: + - server-data:/root/.dstack/server # config.yml + run logs (Postgres holds the rest) + ports: + - "3000:3000" + + # One-shot init: generates the SSH proxy's host key into a shared volume so no manual + # ssh-keygen is needed. The proxy refuses to start without a host key. 
+ sshproxy-keygen: + image: alpine:3 + command: + - sh + - -c + - | + apk add --no-cache openssh-keygen >/dev/null + [ -f /keys/host_key ] || ssh-keygen -t ed25519 -f /keys/host_key -N "" -C dstack-sshproxy + volumes: + - sshproxy-keys:/keys + + sshproxy: + image: dstackai/sshproxy:latest + platform: linux/amd64 # image is amd64-only; emulated on Apple Silicon + restart: unless-stopped + depends_on: + sshproxy-keygen: + condition: service_completed_successfully + server: + condition: service_started + environment: + DSTACK_SSHPROXY_API_URL: http://server:3000 + DSTACK_SSHPROXY_API_TOKEN: ${DSTACK_SSHPROXY_API_TOKEN:-dstack-sshproxy-token} + command: ["--host-key", "/keys/host_key"] + volumes: + - sshproxy-keys:/keys + ports: + - "30022:30022" + +volumes: + postgres-data: + server-data: + sshproxy-keys: diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index c326937916..8cd67fd260 100644 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -7,9 +7,15 @@ if [ -n "${GOOGLE_APPLICATION_CREDENTIALS_JSON}" ]; then echo "${GOOGLE_APPLICATION_CREDENTIALS_JSON}" > "${GOOGLE_APPLICATION_CREDENTIALS_DIR}/application_default_credentials.json" fi +DB_PATH="${HOME}/.dstack/server/data/sqlite.db" +mkdir -p "$(dirname "$DB_PATH")" if [[ -z "${LITESTREAM_REPLICA_URL}" ]]; then - dstack server --host 0.0.0.0 + exec dstack server --host 0.0.0.0 else - litestream restore -if-replica-exists -o ${HOME}/.dstack/server/data/sqlite.db ${LITESTREAM_REPLICA_URL} - litestream replicate -exec "dstack server --host 0.0.0.0" ${HOME}/.dstack/server/data/sqlite.db ${LITESTREAM_REPLICA_URL} + if [[ ! 
-f "$DB_PATH" ]]; then + echo "Starting db restore" + litestream restore -if-replica-exists -o "$DB_PATH" "$LITESTREAM_REPLICA_URL" + echo "Finished db restore" + fi + exec litestream replicate -exec "dstack server --host 0.0.0.0" "$DB_PATH" "$LITESTREAM_REPLICA_URL" fi diff --git a/docker/server/release/Dockerfile b/docker/server/release/Dockerfile index fff4b7a57e..a88439d61e 100644 --- a/docker/server/release/Dockerfile +++ b/docker/server/release/Dockerfile @@ -9,19 +9,24 @@ WORKDIR /dstack-server RUN apt-get update && apt-get install -y \ curl \ - software-properties-common \ git \ + sqlite3 \ + tini \ && rm -rf /var/lib/apt/lists/* -RUN if [ $(uname -m) = "aarch64" ]; then ARCH="arm64"; else ARCH="amd64"; fi && \ - curl https://github.com/benbjohnson/litestream/releases/download/v0.3.9/litestream-v0.3.9-linux-$ARCH.deb -O -L && \ - dpkg -i litestream-v0.3.9-linux-$ARCH.deb +RUN if [ $(uname -m) = "aarch64" ]; then ARCH="arm64"; else ARCH="x86_64"; fi && \ + curl https://github.com/benbjohnson/litestream/releases/download/v0.5.9/litestream-0.5.9-linux-$ARCH.deb -O -L && \ + dpkg -i litestream-0.5.9-linux-$ARCH.deb -RUN pip install "dstack[all]==$VERSION" --progress-bar off +ADD https://astral.sh/uv/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.local/bin/:$PATH" + +RUN uv tool install "dstack[all]==$VERSION" COPY entrypoint.sh entrypoint.sh RUN chmod 777 entrypoint.sh EXPOSE 3000 -ENTRYPOINT ["./entrypoint.sh"] +ENTRYPOINT ["/usr/bin/tini", "--", "./entrypoint.sh"] diff --git a/docker/server/stgn/Dockerfile b/docker/server/stgn/Dockerfile index e9e410aa98..b2c4f96370 100644 --- a/docker/server/stgn/Dockerfile +++ b/docker/server/stgn/Dockerfile @@ -7,21 +7,26 @@ WORKDIR /dstack-server RUN apt-get update && apt-get install -y \ curl \ - software-properties-common \ git \ + sqlite3 \ + tini \ && rm -rf /var/lib/apt/lists/* -RUN if [ 
$(uname -m) = "aarch64" ]; then ARCH="arm64"; else ARCH="amd64"; fi && \ - curl https://github.com/benbjohnson/litestream/releases/download/v0.3.9/litestream-v0.3.9-linux-$ARCH.deb -O -L && \ - dpkg -i litestream-v0.3.9-linux-$ARCH.deb +RUN if [ $(uname -m) = "aarch64" ]; then ARCH="arm64"; else ARCH="x86_64"; fi && \ + curl https://github.com/benbjohnson/litestream/releases/download/v0.5.9/litestream-0.5.9-linux-$ARCH.deb -O -L && \ + dpkg -i litestream-0.5.9-linux-$ARCH.deb +ADD https://astral.sh/uv/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.local/bin/:$PATH" + +COPY pyproject.toml uv.lock README.md ./ COPY src src -COPY setup.py README.md ./ -RUN pip install '.[all]' +RUN uv sync --extra all COPY docker/server/entrypoint.sh entrypoint.sh RUN chmod 777 entrypoint.sh EXPOSE 3000 -ENTRYPOINT ["./entrypoint.sh"] +ENTRYPOINT ["/usr/bin/tini", "--", "./entrypoint.sh"] diff --git a/docker/tt-smi/Dockerfile b/docker/tt-smi/Dockerfile new file mode 100644 index 0000000000..da190db313 --- /dev/null +++ b/docker/tt-smi/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:22.04 + +ARG IMAGE_NAME +ARG TT_SMI_VERSION +ARG BUILD_DATE + +ENV PATH="/root/.cargo/bin:${PATH}" + +RUN \ + apt-get update && \ + apt-get install -y curl git python3 python3-pip && \ + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ + pip3 install --no-cache-dir --upgrade pip setuptools wheel tomli && \ + pip3 install --no-cache-dir --no-build-isolation git+https://github.com/tenstorrent/tt-smi@v${TT_SMI_VERSION} && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["/usr/local/bin/tt-smi"] +CMD ["--help"] + +LABEL org.opencontainers.image.title="${IMAGE_NAME}" +LABEL org.opencontainers.image.version="${TT_SMI_VERSION}" +LABEL org.opencontainers.image.created="${BUILD_DATE}" diff --git 
a/docker/tt-smi/README.md b/docker/tt-smi/README.md new file mode 100644 index 0000000000..a9ef70f80c --- /dev/null +++ b/docker/tt-smi/README.md @@ -0,0 +1,9 @@ +# dstack TT SMI + +An Ubuntu-based image with [TT SMI](https://github.com/tenstorrent/tt-smi/) preinstalled. Suitable for Tenstorrent GPU detection. + +## Usage + +```shell +docker run --device /dev/tenstorrent/ dstackai/tt-smi -s +``` diff --git a/docs/CNAME b/docs/CNAME deleted file mode 100644 index c909514661..0000000000 --- a/docs/CNAME +++ /dev/null @@ -1 +0,0 @@ -dstack.ai \ No newline at end of file diff --git a/docs/assets/images/kubernetes-logo.svg b/docs/assets/images/kubernetes-logo.svg deleted file mode 100644 index 9a61c190a0..0000000000 --- a/docs/assets/images/kubernetes-logo.svg +++ /dev/null @@ -1,91 +0,0 @@ - - - - - Kubernetes logo with no border - - - - - - image/svg+xml - - Kubernetes logo with no border - "kubectl" is pronounced "kyoob kuttel" - - - - - - - - - - diff --git a/docs/assets/images/oci-logo.svg b/docs/assets/images/oci-logo.svg deleted file mode 100644 index 561271d2fa..0000000000 --- a/docs/assets/images/oci-logo.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/docs/assets/images/plus.svg b/docs/assets/images/plus.svg deleted file mode 100644 index 7a617a528b..0000000000 --- a/docs/assets/images/plus.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/assets/images/runpod-logo.svg b/docs/assets/images/runpod-logo.svg deleted file mode 100644 index e2af977734..0000000000 --- a/docs/assets/images/runpod-logo.svg +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/docs/assets/javascripts/extra.js b/docs/assets/javascripts/extra.js deleted file mode 100644 index a1401b2d80..0000000000 --- a/docs/assets/javascripts/extra.js +++ /dev/null @@ -1,158 +0,0 @@ -controller = (function () { - let setupEventListeners = function () { - 
document.addEventListener('keydown', function (event) { - if (event.keyCode === 75 && (event.metaKey || event.ctrlKey)) { - document.querySelector('.md-search__input').focus() - } - }); - }; - return { - init: function () { - setupEventListeners(); - } - }; -})(); - -controller.init(); - -function setupTermynal() { - document.querySelectorAll(".use-termynal").forEach(node => { - node.style.display = "block"; - new Termynal(node, { - lineDelay: 500 - }); - }); - const progressLiteralStart = "---> 100%"; - const promptLiteralStart = "$ "; - const customPromptLiteralStart = "# "; - const termynalActivateClass = "termy"; - let termynals = []; - - function createTermynals() { - document - .querySelectorAll(`.${termynalActivateClass} .highlight`) - .forEach(node => { - const text = node.textContent; - const lines = text.split(/(? { - if (line) { - isBlankSpace = false; - } - }); - dataValue = {}; - if (isBlankSpace) { - dataValue["delay"] = 0; - } - if (buffer[buffer.length - 1] === "") { - // A last single
won't have effect - // so put an additional one - buffer.push(""); - } - const bufferValue = buffer.join("
"); - dataValue["value"] = bufferValue; - useLines.push(dataValue); - buffer = []; - } - } - for (let line of lines) { - if (line === progressLiteralStart) { - saveBuffer(); - useLines.push({ - type: "progress" - }); - } else if (line.startsWith(promptLiteralStart)) { - saveBuffer(); - const value = line.replace(promptLiteralStart, "").trimEnd(); - useLines.push({ - type: "input", - value: value - }); - } else if (line.startsWith("// ")) { - saveBuffer(); - const value = "💬 " + line.replace("// ", "").trimEnd(); - useLines.push({ - value: value, - class: "termynal-comment", - delay: 0 - }); - } else if (line.startsWith(customPromptLiteralStart)) { - saveBuffer(); - const promptStart = line.indexOf(promptLiteralStart); - if (promptStart === -1) { - console.error("Custom prompt found but no end delimiter", line) - } - const prompt = line.slice(0, promptStart).replace(customPromptLiteralStart, "") - let value = line.slice(promptStart + promptLiteralStart.length); - useLines.push({ - type: "input", - value: value, - prompt: prompt - }); - } else { - buffer.push(line); - } - } - saveBuffer(); - const div = document.createElement("div"); - node.replaceWith(div); - const termynal = new Termynal(div, { - lineData: useLines, - noInit: true, - lineDelay: 500, - typeDelay: 20 - }); - termynals.push(termynal); - }); - } - - function loadVisibleTermynals() { - termynals = termynals.filter(termynal => { - if (termynal.container.getBoundingClientRect().top - innerHeight <= 0) { - termynal.init(); - return false; - } - return true; - }); - } - window.addEventListener("scroll", loadVisibleTermynals); - createTermynals(); - loadVisibleTermynals(); -} - -function setupCustomCodeTitles() { - document.querySelectorAll("div[editor-title]").forEach(div => { - let code = div.getElementsByTagName('code')[0]; - // code.setAttribute("editor-title", div.getAttribute("editor-title")) - let editorTitle = document.createElement("span") - editorTitle.className = "editor-title" - 
editorTitle.innerHTML = div.getAttribute("editor-title") - code.appendChild(editorTitle) - }); -} - -window.addEventListener("DOMContentLoaded", function() { - let tabs = document.querySelector(".md-tabs") - let header = document.querySelector(".md-header") - let search = document.querySelector(".md-search") - search.parentNode.insertBefore(tabs, search) - header.classList.add("ready") - setupTermynal() - setupCustomCodeTitles() -}); - -(function () { - document.querySelectorAll('.tx-faq__item').forEach(function (faqItem) { - faqItem.querySelector('.tx-faq__item-title').addEventListener('click', function () { - if (faqItem.classList.contains('_open')) { - faqItem.classList.remove('_open') - } else { - faqItem.classList.add('_open') - } - }); - }) -})() diff --git a/docs/assets/javascripts/termynal.js b/docs/assets/javascripts/termynal.js deleted file mode 100644 index cbcb47be45..0000000000 --- a/docs/assets/javascripts/termynal.js +++ /dev/null @@ -1,266 +0,0 @@ -/** - * termynal.js - * A lightweight, modern and extensible animated terminal window, using - * async/await. - * - * @author Ines Montani - * @version 0.0.1 - * @license MIT - */ - -'use strict'; - -/** Generate a terminal widget. */ -class Termynal { - /** - * Construct the widget's settings. - * @param {(string|Node)=} container - Query selector or container element. - * @param {Object=} options - Custom settings. - * @param {string} options.prefix - Prefix to use for data attributes. - * @param {number} options.startDelay - Delay before animation, in ms. - * @param {number} options.typeDelay - Delay between each typed character, in ms. - * @param {number} options.lineDelay - Delay between each line, in ms. - * @param {number} options.progressLength - Number of characters displayed as progress bar. - * @param {string} options.progressChar – Character to use for progress bar, defaults to █. - * @param {number} options.progressPercent - Max percent of progress. 
- * @param {string} options.cursor – Character to use for cursor, defaults to ▋. - * @param {Object[]} lineData - Dynamically loaded line data objects. - * @param {boolean} options.noInit - Don't initialise the animation. - */ - constructor(container = '#termynal', options = {}) { - this.container = (typeof container === 'string') ? document.querySelector(container) : container; - this.pfx = `data-${options.prefix || 'ty'}`; - this.originalStartDelay = this.startDelay = options.startDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-startDelay`)) || 300; - this.originalTypeDelay = this.typeDelay = options.typeDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-typeDelay`)) || 60; - this.originalLineDelay = this.lineDelay = options.lineDelay - || parseFloat(this.container.getAttribute(`${this.pfx}-lineDelay`)) || 1500; - this.progressLength = options.progressLength - || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40; - this.progressChar = options.progressChar - || this.container.getAttribute(`${this.pfx}-progressChar`) || '█'; - this.progressPercent = options.progressPercent - || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100; - this.cursor = options.cursor - || this.container.getAttribute(`${this.pfx}-cursor`) || '▋'; - this.lineData = this.lineDataToElements(options.lineData || []); - this.loadLines() - if (!options.noInit) this.init() - } - - loadLines() { - // Load all the lines and create the container so that the size is fixed - // Otherwise it would be changing and the user viewport would be constantly - // moving as she/he scrolls - const finish = this.generateFinish() - finish.style.visibility = 'hidden' - this.container.appendChild(finish) - // Appends dynamically loaded lines to existing line elements. 
- this.lines = [...this.container.querySelectorAll(`[${this.pfx}]`)].concat(this.lineData); - for (let line of this.lines) { - line.style.visibility = 'hidden' - this.container.appendChild(line) - } - const restart = this.generateRestart() - restart.style.visibility = 'hidden' - this.container.appendChild(restart) - this.container.setAttribute('data-termynal', ''); - } - - /** - * Initialise the widget, get lines, clear container and start animation. - */ - init() { - /** - * Calculates width and height of Termynal container. - * If container is empty and lines are dynamically loaded, defaults to browser `auto` or CSS. - */ - const containerStyle = getComputedStyle(this.container); - this.container.style.width = containerStyle.width !== '0px' ? - containerStyle.width : undefined; - this.container.style.minHeight = containerStyle.height !== '0px' ? - containerStyle.height : undefined; - - this.container.setAttribute('data-termynal', ''); - this.container.innerHTML = ''; - for (let line of this.lines) { - line.style.visibility = 'visible' - } - this.start(); - } - - /** - * Start the animation and rener the lines depending on their data attributes. 
- */ - async start() { - this.addFinish() - await this._wait(this.startDelay); - - for (let line of this.lines) { - const type = line.getAttribute(this.pfx); - const delay = line.getAttribute(`${this.pfx}-delay`) || this.lineDelay; - - if (type == 'input') { - line.setAttribute(`${this.pfx}-cursor`, this.cursor); - await this.type(line); - await this._wait(delay); - } - - else if (type == 'progress') { - await this.progress(line); - await this._wait(delay); - } - - else { - this.container.appendChild(line); - await this._wait(delay); - } - - line.removeAttribute(`${this.pfx}-cursor`); - } - this.addRestart() - this.finishElement.style.visibility = 'hidden' - this.lineDelay = this.originalLineDelay - this.typeDelay = this.originalTypeDelay - this.startDelay = this.originalStartDelay - } - - generateRestart() { - const restart = document.createElement('a') - restart.onclick = (e) => { - e.preventDefault() - this.container.innerHTML = '' - this.init() - } - restart.href = '#' - restart.setAttribute('data-terminal-control', '') - restart.innerHTML = "restart ↻" - return restart - } - - generateFinish() { - const finish = document.createElement('a') - finish.onclick = (e) => { - e.preventDefault() - this.lineDelay = 0 - this.typeDelay = 0 - this.startDelay = 0 - } - finish.href = '#' - finish.setAttribute('data-terminal-control', '') - finish.innerHTML = "fast →" - this.finishElement = finish - return finish - } - - addRestart() { - const restart = this.generateRestart() - this.container.appendChild(restart) - } - - addFinish() { - const finish = this.generateFinish() - this.container.appendChild(finish) - } - - /** - * Animate a typed line. - * @param {Node} line - The line element to render. 
- */ - async type(line) { - const chars = [...line.textContent]; - line.textContent = ''; - this.container.appendChild(line); - - for (let char of chars) { - const delay = line.getAttribute(`${this.pfx}-typeDelay`) || this.typeDelay; - await this._wait(delay); - line.textContent += char; - } - } - - /** - * Animate a progress bar. - * @param {Node} line - The line element to render. - */ - async progress(line) { - const progressLength = line.getAttribute(`${this.pfx}-progressLength`) - || this.progressLength; - const progressChar = line.getAttribute(`${this.pfx}-progressChar`) - || this.progressChar; - const chars = progressChar.repeat(progressLength); - const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) - || this.progressPercent; - const typeDelay = line.getAttribute(`${this.pfx}-typeDelay`) - || this.typeDelay; - line.textContent = ''; - this.container.appendChild(line); - - for (let i = 1; i < chars.length + 1; i++) { - await this._wait(typeDelay); - const percent = Math.round(i / chars.length * 100); - line.textContent = `${chars.slice(0, i)} ${percent}%`; - if (percent>progressPercent) { - break; - } - } - } - - /** - * Helper function for animation delays, called with `await`. - * @param {number} time - Timeout, in ms. - */ - _wait(time) { - return new Promise(resolve => setTimeout(resolve, time)); - } - - /** - * Converts line data objects into line elements. - * - * @param {Object[]} lineData - Dynamically loaded lines. - * @param {Object} line - Line data object. - * @returns {Element[]} - Array of line elements. - */ - lineDataToElements(lineData) { - return lineData.map(line => { - let div = document.createElement('div'); - div.innerHTML = `${line.value || ''}`; - - return div.firstElementChild; - }); - } - - /** - * Helper function for generating attributes string. - * - * @param {Object} line - Line data object. - * @returns {string} - String of attributes. 
- */ - _attributes(line) { - let attrs = ''; - for (let prop in line) { - // Custom add class - if (prop === 'class') { - attrs += ` class=${line[prop]} ` - continue - } - if (prop === 'type') { - attrs += `${this.pfx}="${line[prop]}" ` - } else if (prop !== 'value') { - attrs += `${this.pfx}-${prop}="${line[prop]}" ` - } - } - - return attrs; - } -} - -/** -* HTML API: If current script has container(s) specified, initialise Termynal. -*/ -if (document.currentScript.hasAttribute('data-termynal-container')) { - const containers = document.currentScript.getAttribute('data-termynal-container'); - containers.split('|') - .forEach(container => new Termynal(container)) -} \ No newline at end of file diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css deleted file mode 100644 index f3eb1b3df8..0000000000 --- a/docs/assets/stylesheets/extra.css +++ /dev/null @@ -1,1493 +0,0 @@ -@media screen and (min-width: 76.1875em) { - .md-header { - backdrop-filter: blur(5px); - background-color: rgba(255, 255, 255, 0.6); - } - - [dir=ltr] .md-header__source { - margin-left: 0; - width: 10rem; - } - - .md-source__facts { - font-size: 0.6rem; - } - - .md-source__fact:before { - height: 0.75rem; - } - - .md-source { - font-size: 0.75rem; - } -} -@media screen and (max-width: 76.1875em) { - .md-header { - background-color: rgb(255, 255, 255); - } -} - -.md-copyright { - width: inherit; -} - -.md-copyright__highlight { - font-size: 0.75rem; - margin-top: -8px; -} - -.md-copyright__highlight a { - font-weight: 700; -} - -.md-footer { - /*border-top: 1px solid #E4E4E7;*/ -} -.md-typeset a.md-footer__link { - margin: 0.5rem 0 0.4rem; - border: 1px solid black; - color: black; - border-radius: 5px; - flex: 1; -} - -.md-footer__link:focus, .md-footer__link:hover { - opacity: 1; -} - -.md-typeset a.md-footer__link:hover .md-ellipsis { - color: var(--md-typeset-a-color); - opacity: 1; -} - -.md-footer__link--prev .md-footer__title { - display: block; -} - 
-.md-footer__direction { - opacity: 1; - font-weight: 800; -} - -.md-footer__title { - margin-top: 0.2rem; - margin-bottom: 0.5rem; - padding: 0; -} - -.md-header[data-md-state=shadow] { - box-shadow: none; -} -.md-sidebar__scrollwrap { - margin-top: -25px; - overflow-y: hidden; - - -webkit-mask-image: linear-gradient(rgba(0,0,0,1) 85%, rgba(0,0,0,0)); - mask-image: linear-gradient(rgba(0,0,0,1) 85%, rgba(0,0,0,0)); - - padding-bottom: 40px; -} - -.md-sidebar__scrollwrap:hover { - overflow-y: scroll; -} - -*::-webkit-scrollbar-thumb { - background: linear-gradient(0deg, white 0%, white 4%, #e5e5e9 5%, #e5e5e9 93%, white 94%, white 100%); - border-radius: 2px; - padding-top: 200px; -} - -.md-sidebar__scrollwrap::-webkit-scrollbar-thumb:hover { - background-color: var(--md-default-fg-color--lighter) -} - -.md-typeset :is(.admonition,details):is(.info,.tip,.warning,.c) { - background-color: rgba(0,0,0,0.005); - border-color: rgba(0,0,0,0.6); -} - -.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title) { - color: var(--md-default-fg-color); -} - -.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title,summary):before, -.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title,summary):after { - background-color: var(--md-default-fg-color); -} - -[dir=ltr] .md-typeset :is(.admonition,details) { - border-style: solid; - /*border-width: 1px;*/ - border-width: 0; - border-radius: 6px; - box-shadow: none; - padding: .6rem .8rem; - background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); -} - -.md-typeset iframe { - border-radius: 6px; -} - -[dir=ltr] .md-typeset :is(.admonition,details) blockquote { - margin-left: 32px; -} - -[dir=ltr] .md-typeset :is(.admonition,details):not(blockquote) > :is(.highlight,.termy,.md-typeset__table,p,h4,h3,.tabbed-set):not(.admonition-title) { - padding-left: 32px; -} - -.admonition-title, details > summary { - border: none !important; - background-color: transparent !important; 
-} - -.md-typeset .admonition, .md-typeset details { - font-size: 1em; -} - -[dir=ltr] .md-typeset :is(.admonition-title,summary) { - font-size: 18px !important; - letter-spacing: -0.5px; - /*font-weight: 800;*/ - font-weight: 700; - /*padding-left: 18px;*/ - padding-bottom: 0; - border: none; - border-radius: 0; -} - -.md-typeset .admonition.info:focus-within, .md-typeset details.info:focus-within { - box-shadow: none; -} - -.md-typeset :is(.info,.tip, .warning)>:is(.admonition-title) { - margin-bottom: -12px; -} - -.md-typeset :is(.info,.tip, .warning)>:is(summary) { - margin-bottom: 12px; - font-size: 1.1em !important; -} - -.md-typeset :is(.info,.tip, .warning)>:is(.admonition-title,summary):before { - /*display: none;*/ -} - -.md-header__title { - margin-left: 1px !important; - font-weight: 700; - font-size: 20px; - padding-top: 2px; -} - -@media screen and (min-width: 76.1875em) { - .md-header__topic:first-child { - font-size: 30px; - /*font-family: Poppins, metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif;*/ - /*font-weight: 500;*/ - } - - .md-header__title { - flex-grow: unset; - } - - .md-header__topic { - position: relative; - } - - .md-header__title--active .md-header__topic { - opacity: 1; - transition: inherit; - transform: inherit; - pointer-events: auto; - } - - .md-header__title--active .md-header__topic+.md-header__topic { - opacity: 0; - } - - .md-header__topic+.md-header__topic { - display: none; - } -} - -@media screen and (max-width: 76.1875em) { - .md-nav--primary .md-nav__title .md-logo { - padding: 0.1rem 0.4rem; - } - .md-nav__title .md-nav__button.md-logo :-webkit-any(img,svg) { - max-width: 100px; - } - .md-nav--primary .md-nav__title { - display: block; - font-weight: 700; - /*display: none;*/ - height: 10px; - } - - .md-nav__source { - background-color: transparent; - } -} - -.md-nav--secondary:not(:has(ul)) { - display: none; -} - -.md-nav--secondary { - 
border: 1px solid black; - padding: 20px 5px; - margin-top: 20px; - border-radius: 12px; -} - -.md-nav--secondary .md-nav__title { - background: transparent; - box-shadow: none; - padding: 2px 15px 3px; - font-size: 17px; - font-weight: 700; - color: rgba(0, 0, 0, 0.87); - position: relative; -} - -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; -} -/* latin */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} -/* devanagari */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: 
url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; -} -/* latin */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} -/* devanagari */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 700; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 700; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; -} -/* latin */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 700; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} - -/* latin */ -@font-face { - font-family: 'Fira Mono'; - font-style: normal; - font-weight: 400; - src: local('Fira Mono 
Regular'), local('FiraMono-Regular'), url(https://fonts.gstatic.com/s/firamono/v6/N0bX2SlFPv1weGeLZDtgJv7Ss9XZYQ.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} - -/* latin */ -@font-face { - font-family: 'Fira Mono'; - font-style: normal; - font-weight: 500; - src: local('Fira Mono Medium'), local('FiraMono-Medium'), url(https://fonts.gstatic.com/s/firamono/v8/N0bS2SlFPv1weGeLZDto1d3HnvfUS5NBBA.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} - -/* latin */ -@font-face { - font-family: 'Fira Mono'; - font-style: normal; - font-weight: 700; - src: local('Fira Mono Bold'), local('FiraMono-Bold'), url(https://fonts.gstatic.com/s/firamono/v6/N0bS2SlFPv1weGeLZDtondvHnvfUS5NBBA.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} - -/*label.md-nav__link {*/ -/* font-weight: 500;*/ -/*}*/ - -.md-nav__link { - margin-top: .6em; -} - -.md-nav__link:hover { - color: var(--md-default-fg-color); -} - -.md-nav__item, .md-nav__link .md-typeset { - /*color: rgba(0,0,0,0.87);*/ - font-weight: 500; - font-size: 0.75rem; -} - -.md-nav__item .md-nav__link:hover:not(.md-nav__link--active) { - color: black; -} - -.md-typeset pre > code, .md-typeset code { - /*-webkit-font-smoothing: auto;*/ -} - -.md-sidebar.md-sidebar--secondary .md-typeset code { - color: rgba(0,0,0,0.87); -} - -.md-typeset :not(pre) :is(h1, h2, h3, h4) > code { - color: inherit; - background: inherit; - padding: 0; -} - -h4.doc-heading { - font-size: inherit; -} - -.md-typeset :not(pre, h1, h2, 
h3, h4) > code { - background-color: rgba(163, 68, 215, 0.05); - /*border: 1px solid #dce0e6;*/ - border-radius: 2px; - font-weight: 600; - color: var(--md-primary-fg-color); - text-align: center; - padding: 4px; - height: 16px; - margin: 0 4px; -} - -.md-typeset :is(h1, h2, h3, h4) > code { - background-color: inherit; - color: inherit; - /*padding: 0; - margin: 0;*/ -} -.md-typeset :is(h1, h2, h3, h4) > a > code { - font-size: inherit; - color: inherit; -} - - -.md-typeset :is(table) :not(pre, h1, h2, h3, h4) > code { - font-size: .85em; -} - -.md-typeset :not(pre, h1, h2, h3, h4) > code { - font-size: 0.65rem; -} - -.md-typeset :not(pre, h1, h2, h3, h4) > a code { - color: #ce00ff; -} - -.md-typeset pre > code { - background-color: rgb(21, 22, 29); - padding: 45px 30px 35px 40px; - border-radius: 4px; - font-size: 15px; - - /*border-radius: 6px;*/ - /*border-top: 1px solid #dce0e6;*/ - /*background-color: rgba(0,0,0,.87);*/ - /*padding: 15px 20px;*/ - /*font-size: .85em;*/ -} - -.md-typeset div[editor-title] pre > code:before { - content: ''; - position: absolute; - top: 15px; - left: 15px; - display: inline-block; - width: 12px; - height: 12px; - border-radius: 50%; - /* A little hack to display the window buttons in one pseudo element. 
*/ - background: #d9515d; - /*-webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ - /* box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ - -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; - box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; -} - -.md-typeset div[editor-title] pre > code { - padding: 65px 25px 35px 45px; -} - -@media screen and (min-width: 60em) { - [data-md-color-primary=white] .md-search__form { - background-color: transparent; - border-radius: 18px; - border: 1px solid rgba(0,0,0,0.87); - } - - [data-md-color-primary=white] .md-search__form:hover { - background-color: transparent; - border: 1px solid black; - } -} - -.md-search__input { - font-size: .7rem; -} - -.md-code__button:after { - width: 1em; - height: 1em; -} - -.md-code__nav { - top: 1em; - right: .25em; - background: none; -} - -.md-code__nav .md-code__button { - color: #a2a2a2; -} - -.md-code__nav:hover .md-code__button { - color: #eee; -} - -pre:hover .md-code__nav, -code .md-code__nav:hover { - background: transparent; -} - -code .md-code__nav:hover .md-code__button { - color: var(--md-accent-fg-color); -} - -.md-clipboard { - top: 0.65em; -} - -.md-clipboard:after { - color: var(--md-default-fg-color--lighter); -} - -.md-clipboard:hover:after { - color: var(--md-default-fg-color--light) -} - -.md-annotation:not([hidden]) { - line-height: 1.225; -} - -.md-annotation__index { - margin-left: 0; - margin-right: 0; -} - -.md-annotation__index:after { - background-color: rgba(0, 0, 0, 0.87); - transform: scale(0.9); -} - -@media screen and (max-width: 44.9375em) { - .md-typeset pre > code { - border: none; - } -} - -.footer__inner { - background-color: var(--md-footer-bg-color--dark); - margin-top: 0.7rem; -} - -.md-footer__inner:not([hidden]) { - gap: 20px !important; -} - -.md-footer__link .md-footer__link--next { - font-weight: 500; -} - -.md-typeset .admonition.note { - border-color: var(--md-primary-fg-color); -} - -.md-typeset { - line-height: 1.3rem; - font-size: 0.8rem; 
-} - -.md-typeset h1 { - margin: 0 0 0.75em; - font-size: 33px; -} - -.md-typeset h2 { - margin: 1.4em 0 0.64em; - padding-top: 0.2em; - font-size: 25px; -} - -.md-typeset h1, .md-typeset h2 { - font-weight: 400; - /*letter-spacing: 0;*/ -} - -.md-typeset h1, .md-typeset h2, .md-typeset h3, .md-typeset h4, .md-typeset h5 { - font-weight: 800; - letter-spacing: -1px; - color: rgb(0, 0, 0); - text-transform: none; -} - -.md-typeset h4 { - font-size: 20px; -} - -.md-typeset h5 { - font-size: 16px; -} - -.md-typeset h3 { - font-size: 23px; - margin-block-end: 0; - padding-bottom: 0.7em; - border-bottom: 1px solid rgba(243, 244, 246, 1); -} - -[data-md-color-scheme=slate][data-md-color-primary=black], [data-md-color-primary=white] { - --md-primary-fg-color: #0048ff; - --md-accent-fg-color: #ce00ff; - --md-typeset-a-color: #ce00ff; - --md-code-hl-function-color: #e3b4fb; - --md-code-hl-keyword-color: #e37cff; - --md-code-hl-string-color: #b4f9c6; - --md-code-fg-color: #eee; - /*--md-code-fg-color: rgba(0, 0, 0, 0.95);*/ - --md-mermaid-label-fg-color: rgba(0, 0, 0, 0.95); - --md-mermaid-edge-color: rgba(0, 0, 0, 0.95); - --md-code-hl-comment-color: #757585; - --md-code-hl-number-color: #d9548c; - --md-code-hl-operator-color: #5b6369; - --md-code-hl-punctuation-color: #5b6369; - --md-code-bg-color: #f7f7fb; - --md-code-hl-constant-color: var(--md-code-fg-color); - /*--md-primary-bg-color: white;*/ - --md-default-fg-color--light: rgba(0,0,0,.6); - --md-default-fg-color--lighter: rgb(159, 172, 190); - --md-default-fg-color--lightest: #f6f9fc; - --md-footer-fg-color--light: var(--md-default-fg-color); - --md-code-hl-color--light: rgba(197, 173, 255, 0.12); -} - -#__mermaid_0 .note { - stroke: var(--md-primary-fg-color) !important; -} - -.md-typeset .highlight :where(.l) { - /*color: #eee !important;*/ - /*color: var(--md-code-fg-color) !important;*/ -} - -.highlight .sd { - color: var(--md-code-hl-string-color); -} - -.highlight .na, .highlight .nv, .highlight .vc, .highlight 
.vg, .highlight .vi { - color: #c6c052; -} - -.md-typeset .highlight .hll { - box-shadow: none; - /*box-shadow: 3px 0px 0px 0.1px var(--md-primary-fg-color) inset;*/ - margin: 0 -1.8em; - padding: 0 1.8em; -} - -body { - --md-text-font-family: metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; - --md-code-font-family: Fira Mono, ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; -} - -.md-content { - /*border-left: 1px solid #E4E4E7;*/ -} - -.md-header__button.md-logo :where(img,svg) { - height: 1.6rem !important; -} - -.md-header__button.md-logo { - margin: 0 0 0 0.5rem; - padding: 0; -} - -.md-header__button.md-logo:hover { - opacity: 1; -} - -.md-sidebar__inner { - margin: 0 0; -} - -@media screen and (min-width: 76.1875em) { - .md-sidebar--primary .md-nav__link, - .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link[for=__toc] { - display: none; - } - - .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link { - display: flex; - } - - .md-sidebar__inner > .md-nav--primary > .md-nav__list:not(.md-post__meta) > .md-nav__item > .md-nav > .md-nav__list > .md-nav__item:not(.md-nav__item--section) { - display: none; - } - - .md-nav__item--section > .md-nav__link { - text-transform: uppercase; - display: inline-block; - font-size: 0.75rem; - font-weight: 800; - line-height: 1.4rem; - letter-spacing: -0.5px; - } - - .md-nav__item--section>.md-nav__link[for] { - color: rgba(0,0,0,0.87); - } - - .md-search__form > * { - z-index: 101; - } - - .md-search__form::before { - content: "⌘"; - color: white; - font-weight: 600; - position: absolute; - padding: 5px; - margin: 4.5px; - font-size: .65rem; - background-color: rgba(0,0,0,.87); - border-radius: 5px; - right: 40px; - z-index: 1; - width: 26px; - text-align: center; - } - - .md-search__form::after { - content: "K"; - color: white; - font-weight: 600; - position: 
absolute; - padding: 5px; - margin: 4.5px; - font-size: .65rem; - background-color: rgba(0,0,0,.87); - border-radius: 5px; - right: 10px; - top: 0; - /*z-index: 1;*/ - width: 26px; - text-align: center; - } - - .md-nav--lifted > .md-nav__list > .md-nav__item > [for] { - display: none; - } -} - -.md-sidebar--primary .md-nav__link { - padding: 3px 15px 4px; - margin-top: 5px; -} - -.md-sidebar--secondary .md-nav__link { - padding-left: 15px; - margin-left: -8px; - border-left: 2px solid transparent; - padding-top: 1px; - padding-bottom: 1px; -} - -.md-sidebar--secondary .md-nav__item { - padding-left: 0.6rem; -} - -.md-sidebar--secondary .md-nav__item .md-nav__link--passed { - color: inherit; -} - -.md-sidebar--secondary .md-nav__item .md-nav__link--active .md-typeset { - font-weight: 700; -} - -.md-sidebar--secondary .md-nav__item .md-nav__link--active { - border-left: 2px solid var(--md-typeset-a-color); - color: inherit; -} - -.md-nav__item .md-nav__link--active, .md-nav__item .md-nav__link--active:hover { - font-weight: 700; - color: inherit; -} - -.md-path__link { - color: var(--md-default-fg-color); - font-size: 0.7rem; -} - -@media screen and (min-width: 76.1875em) { - .md-nav__item--section>.md-nav>.md-nav__list>li.md-nav__item:not(.md-nav__item--nested):not(.md-nav__item--active) { - padding-left: 0.4rem; - /*margin-left: -4px;*/ - /*color: inherit;*/ - } - .md-sidebar--primary a.md-nav__link--active { - overflow: inherit; - } - .md-nav__item--section>.md-nav>.md-nav__list>.md-nav__item { - padding-left: 0.4rem; - } - /*MKDocs Insiders fix*/ - /*.md-sidebar--primary a.md-nav__link--active::before {*/ - /* content: "•";*/ - /* min-width: 1.1rem;*/ - /* font-size: 2rem;*/ - /* height: 0;*/ - /* display: flex;*/ - /* background-color: #000000;*/ - /* !*Add absolute positioning*!*/ - /* position: relative;*/ - /* top: -1.48rem;*/ - /* margin-left: -24px;*/ - /*}*/ - .md-sidebar--primary .md-nav__link--active { - font-weight: 700; - justify-content: start; - } -} 
- -.md-footer { - background-color: var(--md-default-bg-color); - color: inherit; - font-weight: 500; -} - -.md-footer-meta { - background-color: var(--md-default-bg-color); -} - -.md-typeset a { - letter-spacing: -0.5px; -} - -html .md-footer-meta.md-typeset a:is(:focus,:hover) { - color: var(--md-primary-fg-color) !important; -} - -@media screen and (max-width: 76.1875em) { - .md-sidebar--primary .md-nav__link { - padding: inherit; - margin: 5px 5px; - } - - .md-sidebar--primary .md-nav__item .md-nav__link--active { - color: inherit; - background-color: inherit; - border-radius: inherit; - } - - .md-nav--primary .md-nav__title[for=__drawer] { - background-color: inherit; - } -} - -@media screen and (min-width: 76.25em) { - .md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary { - margin-bottom: 0; - } -} - -.md-typeset :where(ol, ul) { - list-style: none !important; -} - -[dir=ltr] .md-typeset ol li, [dir=ltr] .md-typeset ul li { - margin-left: 0; - padding-left: 1.25rem; - position: relative; -} - -.md-typeset :where(ul) > li:before { - background-color: rgba(0,0,0,87); - border-radius: 50%; - content: ""; - height: 0.48em; - width: 0.48em; - left: 0.25em; - position: absolute; - top: 0.6875em; -} - -.md-typeset :where(ol) > li:before { - /*color: #6b7280;*/ - content: counter(list-item,var(--list-counter-style,decimal)) "."; - font-weight: 400; - left: 0; - position: absolute; -} - -.md-typeset :where(ol) { - margin-left: 0; -} - -.md-typeset :where(ul,ol) li :where(ul,ol) { - margin-top: 1.25em; - margin-bottom: 1.25em; -} - -@media screen and (min-width: 76.25em) { - [dir=ltr] .md-sidebar--primary:not([hidden]) ~ .md-content > .md-content__inner { - margin-left: 1.2rem; - } - - .md-content__inner:before { - display: none; - } -} - -.md-typeset .md-content__button { - color: var(--md-default-fg-color--light); -} - -/*.md-typeset p > img {*/ -/* border: 1px solid #E4E4E7;*/ -/*}*/ - -.md-typeset figure p img { - border: none; - display: 
inline-block; -} - -.md-typeset .grid.fit { - grid-template-columns: repeat(auto-fit,minmax(15rem,1fr)); -} - -.md-typeset .tabbed-labels>label { - padding: 18px 18px 16px !important; - font-weight: 800 !important; - font-size: 16.5px !important; - line-height: 1.2 !important; - -webkit-font-smoothing: auto !important; - z-index: 1000 !important; - color: rgba(0,0,0,.63) -} - -.md-typeset .tabbed-labels--linked>label>a { - /*MKDocs Insiders fix*/ - padding: initial; -} - -.md-typeset .tabbed-labels--linked>label>a code { - /*MKDocs Insiders fix*/ - /*background: initial;*/ - font-weight: 600; - color: var(--md-primary-fg-color); -} - -.md-typeset .highlight :is(.nd,.ni,.nl,.nt), -.md-typeset .highlight :is(.k,.kd,.kn,.kp,.kr,.kt), -.md-typeset .highlight :is(.nc,.ne,.nf,.nn) { - font-weight: 100; -} - -.md-typeset .tabbed-labels>label > code { - background-color: transparent; - letter-spacing: -0.25px; -} - -.md-typeset .tabbed-set { - border-radius: 0; -} - -.md-typeset .tabbed-block>.highlight:first-child>pre>code, .md-typeset .tabbed-block>pre:first-child>code, -.md-typeset .tabbed-block>.termy { - margin-top: 17px; -} - -@media screen and (min-width: 76.1875em) { - .md-typeset .tabbed-block > .highlight:first-child > pre > code, .md-typeset .tabbed-block > pre:first-child > code { - border-radius: 6px; - } -} - -.md-typeset .tabbed-set>input:checked~.tabbed-labels code, -.md-typeset .tabbed-set>input:first-child:checked~.tabbed-labels>:first-child { - /*MKDocs Insiders fix*/ - /*font-weight: 500;*/ - /*color: var(--md-code-fg-color);*/ -} - -.js .md-typeset .tabbed-labels:before { - height: 100%; - background: none; - z-index: 1; - padding: 5px; - border-radius: 6px; - border: 1px solid black; -} - -.md-typeset .tabbed-labels { - box-shadow: none !important; -} - -.md-typeset .grid { - grid-gap: 1.2rem; - display: grid; - grid-template-columns: repeat(auto-fill,minmax(15rem,1fr)); - margin: 1.6em 0; -} - -.md-typeset .grid.cards>:-webkit-any(ul,ol) { - display: 
contents; -} - -.md-typeset .grid.cards>:-webkit-any(ul,ol)>li strong, .md-typeset .grid>.card strong { - display: block; - color: var(--md-default-fg-color); - margin-bottom: 6px; - font-weight: 800; -} - -.md-typeset .grid.cards>:-webkit-any(ul,ol)>li a:hover strong, -.md-typeset .grid>.card a:hover strong { - color: var(--md-typeset-a-color); -} - -.md-typeset .grid.cards > :-webkit-any(ul,ol) > li a, -.md-typeset .grid.cards > :-webkit-any(ul,ol) > li span, -.md-typeset .grid > .card a { - color: var(--md-default-fg-color); - text-decoration: none; - display: block; - margin: 0; - padding: 1rem 1.4rem; - border-radius: 6px; - border: rgba(0,0,0,0.6) 0.5px solid; -} - -.md-typeset .grid.cards>ol>li:focus-within,.md-typeset .grid.cards>ol>li:hover,.md-typeset .grid.cards>ul>li:focus-within,.md-typeset .grid.cards>ul>li:hover,.md-typeset .grid>.card:focus-within,.md-typeset .grid>.card:hover { - box-shadow: none; -} - -.md-typeset .grid.cards>:-webkit-any(ul,ol)>li, .md-typeset .grid>.card { - font-size: 95%; - line-height: 1.6; - padding: 0; -} - -.md-typeset .grid.cards>:-webkit-any(ul,ol)>li:before { - display: none; -} - -.md-header--shadow { - box-shadow: none; -} - -.md-tabs { - background-color: transparent; - flex-grow: 1; - width: initial; -} - -@media screen and (max-width: 76.25em) { - .md-tabs { - display: none; - } -} -@media screen and (min-width: 76.1875em) { - .md-tabs { - padding-left: 2.5rem; - display: none; - } - - .ready .md-tabs { - display: block; - } - - [data-md-color-primary=white] .md-tabs { - border-bottom: none; - } - - .md-tabs[hidden] .md-tabs__link { - opacity: inherit; - transform: inherit; - transition: inherit; - } - - .md-tabs[hidden] { - pointer-events: inherit; - } - - /*.md-nav__title { - display: none; - }*/ - - [dir=ltr] .md-tabs__list { - display: flex; - } - - .md-tabs__item { - display: flex; - height: 2.9rem; - padding-left: 0; - padding-right: 20px; - } - - .md-tabs__item:nth-child(1) { - display: none; - } - - 
.md-tabs__item:nth-child(6) { - margin-left: auto; - } - - .md-tabs__item:nth-child(7) { - padding-right: 1.2rem; - } - - .md-tabs__item:nth-child(n+6):nth-child(-n+7) .md-tabs__link { - visibility: hidden; - width: 35px; - display: inline-block; - margin-top: 18px; - } - - .md-tabs__item:nth-child(n+6) .md-tabs__link:before { - width: 38px; - height: 38px; - margin-top: 4px; - visibility: visible; - } - - .md-tabs__item:nth-child(4) .md-tabs__link:after, .md-tabs__item:nth-child(8) .md-tabs__link:after { - content: url('data:image/svg+xml,'); - line-height: 14px; - margin-left: 4px; - position: relative; - top: 0; - margin-right: -7px; - } - - .twemoji.external { - position: relative; - top: 2.5px; - height: 18.5px; - margin-left: -3px; - } - - .tx-footer__section-link.external:after { - content: url('data:image/svg+xml,'); - line-height: 14px; - margin-left: 5px; - position: relative; - top: 1.5px; - margin-right: -7px; - } - - .md-tabs__item:nth-child(6) .md-tabs__link:before { - position: relative; - content: ''; - width: 43px; - height: 37px; - display: inline-block; - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); - margin-top: 0; - } - - .md-tabs__item:nth-child(7) .md-tabs__link:before { - position: relative; - content: ''; - width: 43px; - height: 37px; - display: inline-block; - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - background: -webkit-linear-gradient(45deg, black, black); - margin-top: 0; - } - - .md-tabs__link { - display: flex; - align-items: center; - } - - .md-tabs__link { - font-size: 0.95rem; - font-weight: 600; - color: black; - letter-spacing: -0.5px; - margin-top: 7px; - line-height: 24px; - } - - .md-tabs__link.md-tabs__link--active { - color: 
var(--md-accent-fg-color); - } - - .md-nav[aria-label="Community"] .md-nav__item:nth-child(1) { - display: flex; - } -} - -.md-source__repository--active { - font-weight: 700; -} - -.md-source__icon { - height: 2.6rem; -} - -.md-source { - font-size: 0.7rem; -} - -.md-source:hover { - opacity: 1; -} - -[dir=ltr] .md-source__icon+.md-source__repository { - margin-left: -2.6rem !important; -} - -.md-source__icon.md-icon svg { - height: 1.38rem; - width: 1.38rem; - fill: none; - margin-left: -0.1rem; - margin-top: 0.61rem; -} - -.md-source__facts { - color: black; -} - -.md-social__link svg { - max-height: 1.2rem; -} - -@media screen and (min-width: 76.25em) { - .md-search .md-search__inner { - padding-top: 0.58rem; - margin-right: 0.8rem; - } - - [data-md-toggle=search]:checked ~ .md-header .md-search__inner, .md-search__scrollwrap { - width: 30rem; - } - - [data-md-toggle=search]:checked~.md-header .md-search__form .md-search__input+.md-search__icon { - color: black; - } - - [data-md-toggle=search]:checked~.md-header .md-search__form { - border-radius: 5px 5px 0 0; - border: none; - border-bottom: 0.5px solid black; - } - - .md-search__scrollwrap { - border-radius: 0 0 5px 5px; - } - - [dir=ltr] .md-search__options { - right: 3.8rem; - } - - .md-search__options .md-icon svg { - width: 1rem; - height: 1rem; - } - - .md-search__options > * { - color: var(--md-default-fg-color--light); - } -} - -.md-search-result mark { - font-weight: 600; -} - -.md-search__input::placeholder { - color: inherit; -} - -.md-top { - font-weight: 500; - box-shadow: 0 0 0.2rem rgb(0 0 0 / 5%), 0 0.2rem 0.9rem rgb(0 0 0 / 10%); - color: var(--md-default-fg-color); - border-radius: 20px; -} - -.md-top:hover { - background-color: white; - color: var(--md-default-fg-color); -} - -@media screen and (max-width: 44.9375em) { - .md-typeset .tabbed-set { - margin: 0 -.8rem - } - - [dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels { - margin-left: 0.8rem; - padding-left: 0; - } -} - -.md-banner 
{ - background: black; - color: var(--md-default-bg-color); - margin-bottom: 1px; - font-weight: 500; -} - -.md-typeset.md-banner__inner { - margin: 0.3rem auto; - text-align: center; - font-size: 0.77rem; -} - -.md-typeset.md-banner__inner a { - color: var(--md-default-bg-color); - border-bottom: 2px dotted; - font-weight: 700; -} - -.md-typeset.md-banner__inner .md-banner__button { - color: var(--md-default-fg-color--lighter); -} - -.md-typeset.md-banner__inner .md-banner__button:hover { - color: var(--md-accent-fg-color); -} - -.md-typeset .footnote-backref { - vertical-align: inherit; -} - -.md-go-to-action.secondary.discord:before { - position: relative; - top: 5px; - content: url('data:image/svg+xml,'); - padding-right: 10px; -} - -.md-go-to-action.primary.discord:before { - position: relative; - top: 5px; - content: url('data:image/svg+xml,'); - padding-right: 10px; -} - -.md-go-to-action.github:before { - position: relative; - top: 5px; - content: url('data:image/svg+xml,'); - padding-right: 10px; -} - -[data-md-color-primary=white] .md-button { - border: 1.5px solid rgba(0,0,0,0.87); - line-height: 35px; - color: rgba(0,0,0,0.87); - margin-right: 5px; - background: transparent; - font-weight: 500 !important; - padding: 0.4em 1.5em; - font-size: 17px; - border-radius: 4px; -} - -/*[data-md-color-primary=white] .md-button:hover{ - background: transparent; - border: 1.5px solid rgba(0,0,0,1); - color: black !important; -}*/ - -.md-post .md-nav__title { - font-weight: 500; -} - -[data-md-color-primary=white] .md-button--primary { - color: white; - background: rgba(0,0,0,0.87); -} - -[data-md-color-primary=white] .md-button--github:before { - position: relative; - top: 5px; - content: url('data:image/svg+xml,'); - padding-right: 10px; -} - -/*[data-md-color-primary=white] .md-button:hover { - background: inherit; - color: inherit; - border-color: inherit; -}*/ - -/* -[data-md-color-primary=white] .md-button--primary:hover { - background: rgba(0,0,0,1); - color: 
white !important; -} -*/ - -[dir=ltr] .md-typeset blockquote { - /*border: 1px solid black;*/ - border: none; - color: var(--md-default-fg-color); - padding: 8px 25px; - border-radius: 6px; - background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); -} - -a.md-go-to-action.secondary { - color: rgba(0,0,0,0.87); - background: white; -} - -.md-post__content h2 a { - color: rgba(0,0,0,0.87); -} - -div[editor-title] code .editor-title { - position: absolute; - color: #a2a2a2; - top: 10px; - left: 0; - width: 100%; - text-align: center; -} - -.md-status:after, .md-status:hover:after{ - background: rgba(0,0,0,0.7); -} - -.md-ellipsis, .md-ellipsis .md-typeset { - white-space: normal; - font-size: 98%; -} - -.md-header__topic .md-ellipsis { - white-space: nowrap; -} - -/*.md-blog-sidebar .md-nav__link--active { - display: none; -}*/ - -[data-ty].no-newline, .no-newline { - display: inline-block; -} - -[data-ty].newline, .newline { - display: block; -} - -img.border { - border: 0.25px rgba(0,0,0,0.2) solid; - border-radius: 7px; -} - -.md-typeset .reference-item:hover > .headerlink { - display: none; -} - -.md-typeset .reference-item { - display: list-item; - margin-left: .625em; - font-weight: inherit; - color: inherit; - font-size: inherit; - letter-spacing: inherit; -} - -.md-typeset .reference-item a code { - color: var(--md-typeset-a-color); - font-size: .85em; -} - -.md-typeset .reference-item code { - background-color: rgba(163, 68, 215, 0.05); - border-radius: 2px; - font-weight: 600; - color: var(--md-primary-fg-color); - text-align: center; - padding: 4px; - height: 16px; - margin: 0 4px; -} \ No newline at end of file diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css deleted file mode 100644 index ba8b6f4ce7..0000000000 --- a/docs/assets/stylesheets/landing.css +++ /dev/null @@ -1,910 +0,0 @@ -.tx-landing { - margin: 0 .8rem; - color: var(--md-primary-bg-color) -} - 
-.tx-landing__hero_text { - display: flex; - flex-direction: column; - align-items: center; - text-align: center; -} - -.tx-landing__hero_text h1 { - margin-bottom: .75rem; - font-weight: 800; - font-size: 2.5em; - letter-spacing: -3px; -} - -.tx-landing__hero_text h1 strong { - font-weight: 800; -} - -/*.tx-landing__hero_text h1*/ .gradient { - background: linear-gradient(90deg, #4631C8 -1.29%, #CD4AE2 88.05%, #FFD43C 111.26%); - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; - padding-right: 4px; - margin-right: -4px; -} - -/*.tx-landing__hero_text p { - font-size: 0.95rem; -}*/ - -.tx-landing__hero_buttons { - display: flex; - align-items: flex-start; - justify-content: center; - gap: 24px; -} - -.tx-container .tx-landing__hero_buttons .md-button { - margin-bottom: 0.2rem; - margin-right: 0; -} - -.tx-landing__hero_button_container { - text-align: center; -} - -.tx-landing__hero_button_placeholder { - color: #202128; - font-size: 0.67rem; - line-height: 24px; - text-align: center; - padding: 18px 5px 13px; -} - -@media screen and (max-width: 76.1875em) { - .tx-landing h1 { - font-size: 1.4rem; - } - - .tx-landing__hero_text { - max-width: 30rem; - margin-left: auto; - margin-right: auto; - } - - .tx-landing__hero_image { - margin-top: 1.5rem; - max-width: 26rem; - } - - .tx-landing__hero_code { - margin-top: 2.5rem; - max-width: 750px; - margin-left: auto; - margin-right: auto; - } - - .tx-landing__hero_buttons { - flex-direction: column; - align-items: center; - gap: 0; - } -} - -@media screen and (min-width: 76.1875em) { - .tx-container { - padding-bottom: 3vw; - } - - .tx-landing__hero { - margin-bottom: 2.5rem; - font-size: 1.1em; - line-height: 1.5; - } - - .tx-landing__hero_text { - margin-top: 5.1rem; - } - - .tx-landing__hero_text h1 { - font-size: 2.8rem; - max-width: 36rem; - line-height: 1.2; - } - - .tx-landing__hero_text p { - max-width: 30rem; - } - - .tx-landing__hero_image { - order: 1; - width: 30rem; - margin-top: 
0.5rem; - margin-left: 3rem; - } - - .tx-landing__hero_code { - /*width: 100vw;*/ - position: relative; - left: 50%; - transform: translateX(-50%); - margin-top: 2.5rem; - padding-top: 4.5rem; - padding-bottom: 4.5rem; - /*border-top-left-radius: 2.5rem;*/ - /*border-top-right-radius: 2.5rem;*/ - border-radius: 2.5rem; - background-image: url("/https/github.com/assets/images/hero_code_background.png"); - background-size: cover; - background-position: center; - background-repeat: no-repeat; - } - - .tx-landing__hero_code > [data-termynal] { - max-width: 750px; - margin-left: auto; - margin-right: auto; - } -} - -.md-header__buttons .md-button.discord:before, -.md-typeset .md-button.discord:before { - position: relative; - top: 5px; - content: ''; - width: 26px; - height: 26px; - display: inline-block; - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - margin-right: 5px; - background: -webkit-linear-gradient(45deg, rgba(0, 0, 0, 0.87), rgba(0, 0, 0, 0.87)); -} - -.md-header__buttons .md-button.github:before, -.md-typeset .md-button.github:before { - position: relative; - top: 5px; - content: ''; - width: 26px; - height: 26px; - display: inline-block; - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - margin-right: 5px; - background: black; -} - -.md-header__buttons { - margin-left: auto; -} - -.md-header__buttons .md-button, -.md-typeset .md-button { - margin-top: 1.2rem; - margin-bottom: 1.5rem; - font-size: 22px; - font-weight: 400 !important; - text-align: center; - border-radius: 3px; - border-color: transparent; - margin-right: 5px; -} - -.md-header__buttons .md-button { - font-size: 0.95rem; -} - -.md-typeset .md-button { - min-width: 225px; - padding: 0.55em 2em; -} - -.md-typeset .md-button.small { - min-width: 150px; - padding: 
0.5em 1.9em; - font-size: 19px; -} - -.md-typeset .md-button { - vertical-align: middle; -} - -.md-typeset .md-button-secondary:hover, .md-typeset .md-button--primary:hover { - transform: translateY(-2px); - transition: opacity .2s ease,transform .2s ease; -} - -.md-typeset .md-button .icon { - display: inline-block; - position: relative; - width: 15px; - height: 15px; - margin-left: 7px; - transition: opacity .2s ease,transform .2s ease; -} - -.md-typeset .md-button-secondary .icon, .md-typeset .md-button--primary .icon { - color: #a91ffe; -} -.md-typeset .md-button-secondary:hover .icon, .md-typeset .md-button--primary:hover .icon { - color: #a91ffe; - transform: translateX(3px) -} - - -[data-md-color-primary=white] .md-header__buttons .md-button--primary, [data-md-color-primary=white].md-header__buttons .md-button--primary:hover, -[data-md-color-primary=white] .md-typeset .md-button--primary, [data-md-color-primary=white] .md-typeset .md-button--primary:hover { - background: rgba(0, 0, 0, 0.87); - border: 1.5px solid rgba(0, 0, 0, 0.87); - border-radius: 8px; - font-weight: 400 !important; -} - -.md-header__buttons .md-button--primary, -.md-header__buttons .md-button-secondary { - font-weight: 500 !important; - white-space: nowrap; - padding: 0.45rem 1rem; -} - -.md-button-secondary.external:after { - content: url('data:image/svg+xml,'); - line-height: 14px; - margin-left: 5px; - position: relative; - top: 3px; - margin-right: -7px; -} - -.md-button--primary.sky.external:after { - content: url('data:image/svg+xml,'); - line-height: 14px; - margin-left: 5px; - position: relative; - top: 2px; - margin-right: -7px; -} - -.md-header__buttons .md-button-secondary, -.md-typeset .md-button-secondary, -.md-header__buttons .md-button-secondary:hover, -.md-typeset .md-button-secondary:hover { - background: transparent; - color: black; - border: 1.5px solid rgba(0, 0, 0, 0.87); - border-radius: 8px; -} - -.md-header__buttons { - padding-top: 9px; -} - -.md-header__buttons 
.md-button-secondary { - -} - -.tx-landing__highlights { - margin-bottom: 5vw; - font-size: 17px; - line-height: 1.5; -} - -.tx-landing__highlights_text h2 { - font-size: 2.2em; - max-width: 500px; - margin-top: 1.5em; - margin-bottom: 1.8em; - letter-spacing: -1.5px; - line-height: 1.3; -} - -.tx-landing__highlights_cta { - margin-top: 3vw; -} - -.tx-landing__highlights_cta a { - display: inline-block; - font-size: 19px; - margin-top: 30px; - border: 1px solid; - padding: 10px 30px; -} - -.tx-landing__highlights_text h2 .gradient { - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; -} - -.tx-landing__highlights_grid .feature-cell { - padding: 30px 40px; - border-radius: 12px; - border-color: black; - border-width: 1px; - border-style: solid; - display: flex; - flex-direction: column; -} - -@media screen and (min-width: 76.1875em) { - .tx-landing__highlights_grid { - grid-gap: 20px !important; - border: none; - - grid-template-columns: repeat(4, 1fr) !important; - } - - .tx-landing__highlights_grid .feature-cell { - } -} - -.tx-landing__highlights_grid .feature-cell:hover { - background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.015), rgba(0, 42, 255, 0.015), rgba(225, 101, 254, 0.025)); -} - -.tx-landing__highlights_grid .feature-cell strong { - font-weight: 500; -} - -.tx-landing__highlights_grid .feature-cell .feature-tags { - gap: 2px; - margin: 0 -5px; - margin-top: auto; - display: none; -} - -.tx-landing__highlights_grid .feature-cell .feature-tags .feature-tag { - display: flex; - align-items: center; - gap: 8px; - padding: 8px 20px; - font-size: 0.85em; - font-weight: 400; - line-height: 1.44; - color: black; - margin-top: 20px; - margin-right: 5px; - border-radius: 30px; - border-width: 0.5px; - border-style: solid; - white-space: nowrap; -} - -.tx-landing__highlights_grid > a, .tx-landing__highlights_grid > a:hover { - text-decoration: none; - color: inherit; -} 
- -.tx-landing__integrations_text { - color: #202128; - font-size: 0.65rem; - line-height: 24px; - text-align: center; - padding: 21px 5px 3px; -} - -.tx-landing__integrations_logos { - display: flex; - align-items: center; - justify-content: center; - gap: 14px; - padding: 8px 5px 3px; -} - -.tx-landing__integrations .logo-xlarge { - width: 41px; - margin-top: 2px; -} - -.tx-landing__integrations .logo-large { - width: 30px; -} - -.tx-landing__integrations .logo-medium { - width: 26px; -} - -.tx-landing__highlights_grid { - grid-gap: 2rem; - display: grid; - grid-template-columns: repeat(auto-fill, minmax(15rem, 1fr)); - margin-top: 40px; - margin-bottom: 30px; -} - -.tx-landing__highlights_grid .feature-icon svg { - padding: 12px; - background: rgba(125, 4, 233, 0.02); - color: #002aff; - width: 50px; - height: 50px; - border-radius: 25px; - text-align: center; - display: inline-flex; - vertical-align: text-top; - fill: currentColor; - margin-bottom: 15px; -} - -.tx-landing__highlights_grid h3 { - font-size: 1.125em; - font-weight: 700; - border-bottom: none; - padding-bottom: 0.2em; - margin-top: 0; - line-height: 32px; -} - -.tx-landing__highlights_grid h3:after { - content: url('data:image/svg+xml,'); - margin-left: 2px; - position: relative; - top: 3px; - margin-right: -7px; -} - -.tx-landing__highlights_grid p { - font-size: 16px; - margin-top: 5px; - margin-bottom: 5px; - color: rgba(0, 0, 0, 0.87); -} - -.tx-landing__features { - margin-bottom: 5vw; -} - -.tx-landing__features_text h2 { - font-size: 1.7em; - max-width: 500px; - color: rgba(0, 0, 0, 0.87); - margin-bottom: 1.5em; -} - -.tx-landing__features_grid { - grid-gap: 1.2rem; - display: grid; - grid-template-columns: repeat(auto-fill, minmax(15rem, 1fr)); -} - -.tx-landing__bottom_cta { - margin-top: 9vw; - /*margin-bottom: 5vw;*/ - font-size: 18px; - line-height: 1.5; -} - -.tx-landing__major_feature { - font-size: 1.025em; - margin-top: 5em; -} - -.tx-landing__bottom_cta_text { - font-size: 
0.95em; - max-width: 500px; -} - -.tx-landing__major_feature h3 { - padding-bottom: 0; - border-bottom: none; - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; - font-size: 1.5em; -} - -.tx-landing__major_feature img.border { - border: 0.25px rgba(0,0,0,0.2) solid; - border-radius: 7px; -} - -.tx-landing__major_feature h2 { - font-size: 1.8em; - max-width: 500px; - margin-top: 0.75em; - margin-bottom: 0.75em; - background: black; - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; - letter-spacing: -1.5px; -} - -.tx-landing__major_feature { - margin-bottom: 7vw; -} - -@media screen and (min-width: 76.1875em) { - .tx-landing__major_feature .section { - display: flex; - } -} - -.tx-landing__major_feature .block { - max-width: 800px; - width: 100%; -} - -.tx-landing__major_feature .block.margin { - margin-right: 50px; -} - -.tx-landing__major_feature .block.large { - width: 800px; - max-width: 100%; - flex: 0 0 auto; -} - -/*.tx-landing__bottom_cta*/ a[data-terminal-control] { - color: #e37cff; -} - -/*.tx-landing__bottom_cta*/ [data-ty="input"]:before, [data-ty-prompt]:before { - color: #e37cff; -} - -/*.tx-landing__bottom_cta*/ [data-termynal] { - font-size: 16px; -} - -.tx-landing__bottom_cta .termy { - max-width: 750px; - margin-bottom: 2rem; -} - -.tx-footer { - padding-top: 2.5rem; -} - -.md-footer__inner { - padding: 0; -} - -.tx-footer .md-main__inner { - border-width: 0; - /*border-top-width: 0.6px;*/ - border-image: linear-gradient(45deg, #0048ff, #ce00ff) 10; - border-style: solid; -} - -.tx-footer__section { - display: flex; - flex-direction: column; - gap: 0.5rem; - margin: 0 0.6rem; -} - -.tx-footer__section-title { - text-transform: uppercase; - font-size: 0.8rem; - font-weight: 700; - color: black; - letter-spacing: -0.5px; - line-height: 24px; - margin-top: 0.6rem; - margin-bottom: 0.1rem; -} - -.tx-footer__logo:hover { - opacity: .7; -} - 
-.tx-footer__copyright { - margin-top: 0.6rem; - font-size: 18px; - line-height: 26px; - color: black; - font-weight: 500; -} - -.tx-footer__section-link { - font-size: 0.75rem; - line-height: 26px; - color: #151414; - transition: opacity .2s ease; -} - -.tx-footer__section-link:hover { - opacity: .7; -} - -@media screen and (max-width: 76.1875em) { - .tx-footer .md-main__inner { - flex-direction: column; - gap: 1.5rem; - margin-left: .8rem; - margin-right: .8rem; - } - - .tx-footer__section { - margin-bottom: 1.5rem; - } -} - -@media screen and (min-width: 76.1875em) { - .tx-footer { - padding-bottom: 3.5rem; - } - - .tx-footer__right-side { - margin-left: auto; - display: flex; - gap: 4.5rem; - } -} - -.tx-landing__plans { - margin-bottom: 2.75rem; -} - -.tx-landing__plans_text { - -} - -.tx-landing__plans_text h2 { - margin-bottom: 2rem; - font-weight: 800; - font-size: 2rem; - text-align: center; -} - -.tx-landing__plans_cards { - display: grid; - flex-wrap: wrap; - gap: 2rem; - grid-template-columns: repeat(auto-fill, minmax(15rem, 1fr)); -} - -.md-header__buttons .md-button--primary.sky, .md-header__buttons .md-button--primary.sky:hover, -.md-typeset .md-button--primary.sky, .md-typeset .md-button--primary.sky:hover { - background: -webkit-linear-gradient(45deg, #002aff, #002aff, #e165fe); - border-radius: 8px; - border: 1px solid transparent; -} - -.plans_card.open_source { -} - -.plans_card.dstack_sky { - background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.2), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.07)); -} - -.plans_card.enterprise { -} - -.plans_card { - display: flex; - flex-direction: column; - border-color: black; - border-radius: 0; - border-width: 1px; - border-style: solid; - padding: 38px 46px; -} - -@media screen and (min-width: 76.1875em) { - .plans_card.open_source { - padding: 38px 46px 38px 46px; - border-right: none; - border-top-left-radius: 12px; - border-bottom-left-radius: 12px; - /*margin-right: 5px;*/ - } - - 
.plans_card.dstack_sky { - /*border-left: none;*/ - border-top-right-radius: 12px; - border-bottom-right-radius: 12px; - /*margin-left: 5px;*/ - } - - .plans_card.enterprise { - margin: 0 -5px; - } - - .tx-landing__plans_cards { - gap: 0; - } -} - -.plans_card__title { - margin-bottom: 0.7rem; - font-size: 1.5em; - font-weight: 800; - line-height: 1.33; -} - -.plans_card__subtitle-2 { - color: rgba(0, 0, 0, 0.6); - margin-bottom: 0.75rem; - font-size: 0.9em; -} - -.plans_card__subtitle { - margin-bottom: 1rem; - font-size: 0.9em; - line-height: 1.44; - color: black; -} - -.plans_card__panel { - font-size: 0.9em; - line-height: 32px; - margin-bottom: 1.5em; -} - -.plans_card__items .item::before { - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - background: -webkit-linear-gradient(45deg, rgba(0, 0, 0, 0.87), rgba(0, 0, 0, 0.87)); - content: ''; - display: inline-block; - width: 24px; - height: 24px; - margin-right: 20px; - margin-left: 5px; - position: relative; - top: 5px; -} - -.plans_card__items .item.crown::before { - -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; - mask: url('data:image/svg+xml,') no-repeat 50% 50%; - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); -} - -.plans_card__buttons_subtitle { - margin-top: 18px; - margin-left: 5px; - color: #202128; - font-size: 0.65rem; - line-height: 22px; -} - -.plans_card__services { - display: flex; - flex-wrap: wrap; - gap: 10px; - margin-bottom: 1.3rem; -} - -.plans_card__service-item { - display: flex; - align-items: center; - gap: 8px; - padding: 7px 16px; - border-radius: 30px; - border: 0.5px solid black; - font-size: .85em; - line-height: 1.44; - color: #2A292D; -} - -.plans_card__service-item img { - max-width: none; - height: initial; -} - -.plans_card__service-name { - flex-shrink: 0; - white-space: nowrap; -} - -.plans_card__items { - margin-bottom: 1.25rem; -} 
- -.plans_card__buttons { - margin-top: auto; -} - -.plans_card__item { - font-size: 1.0em; - line-height: 2.0; - color: black; -} - -.plans_card__item .strong { - color: #2A292D; - font-weight: 700; -} - -.highlighted { - font-weight: 500; - color: var(--md-typeset-a-color); -} - -.plans_card__link { - margin-top: 1.0rem; - margin-bottom: 1.9rem; - font-size: 1.0em; - line-height: 1.66; -} - -.plans_card__buttons .md-button { - margin: 0; -} - -@media screen and (max-width: 76.1875em) { - -} - -@media screen and (min-width: 76.1875em) { - .tx-landing__plans_text h2 { - font-size: 2.5rem; - } -} - -.typed-cursor { - background: linear-gradient(90deg, #4631C8 -1.29%, #CD4AE2 88.05%, #FFD43C 111.26%); - -webkit-background-clip: text; - -webkit-text-fill-color: transparent; - padding: 0 7px; -} - -.tx-landing__quotes_grid { - grid-gap: 1.2rem; - display: grid; - grid-template-columns: repeat(auto-fill, minmax(15rem, 2fr)); - margin-bottom: 2.4em; -} - -.heart::after { - position: relative; - content: ''; - width: 45px; - height: 40px; - display: inline-block; - -webkit-mask: url('data:image/svg+xml, ') no-repeat 50% 50%; - mask: url('data:image/svg+xml, ') no-repeat 50% 50%; - -webkit-mask-size: cover; - mask-size: cover; - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); - top: 4px; - margin: 0 3px; -} - -.tx-landing__quotes_grid .photo { - height: 80px; - float: left; - margin: 0 15px 15px 0; -} - -.tx-landing__quotes_grid .photo img { - width: auto; - height: 100%; - aspect-ratio: 1; - object-fit: cover; - border-radius: 50px; -} - -.tx-landing__quotes_grid h3 { - border-bottom: none; - font-size: 1em; - font-weight: 700; - margin: 0; - padding: 0; - margin-top: 13px; -} - -.tx-landing__quotes_grid h4 { - border-bottom: none; - font-size: 0.95em; - font-weight: 500; - margin: 0; - padding: 0; - color: var(--md-primary-bg-color--light);) -} - -.tx-landing__quotes_grid p { - clear: both; - font-size: 0.9em; -} - -.tx-landing__quotes_grid .cell { - 
padding: 26px 28px; - border-radius: 12px; - border: 1px solid black; -} \ No newline at end of file diff --git a/docs/assets/stylesheets/termynal.css b/docs/assets/stylesheets/termynal.css deleted file mode 100644 index 2c06d7f183..0000000000 --- a/docs/assets/stylesheets/termynal.css +++ /dev/null @@ -1,123 +0,0 @@ -/** - * termynal.js - * - * @author Ines Montani - * @version 0.0.1 - * @license MIT - */ - -:root { - --color-bg: rgb(21, 22, 29); - --color-text: #eee; - --color-text-subtle: #a2a2a2; -} - -[data-termynal] span { - white-space: pre; -} - -.small > [data-termynal] { - font-size: 14px; - line-height: 1.4; -} - -[data-termynal] { - overflow-x: scroll; - /*white-space: pre;*/ - /*width: 750px;*/ - max-width: 100%; - background: var(--color-bg); - color: var(--color-text); - /* font-size: 18px; */ - font-size: 14px; - /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ - font-family: var(--md-code-font-family) !important; - border-radius: 4px; - padding: 45px 25px 25px; - /*padding: 75px 45px 35px;*/ - position: relative; - -webkit-box-sizing: border-box; - box-sizing: border-box; -} - -[data-termynal]:before { - content: ''; - position: absolute; - top: 15px; - left: 15px; - display: inline-block; - width: 12px; - height: 12px; - border-radius: 50%; - /* A little hack to display the window buttons in one pseudo element. 
*/ - background: #d9515d; - /*-webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ - /* box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ - -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; - box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; -} - -[data-termynal]:after { - content: ''; - position: absolute; - color: var(--color-text-subtle); - top: 7px; - left: 0; - width: 100%; - text-align: center; -} - -a[data-terminal-control] { - text-align: right; - display: block; - color: #aebbff; -} - -[data-ty] { - display: block; - line-height: 2; -} - -[data-ty]:before { - /* Set up defaults and ensure empty lines are displayed. */ - content: ''; - display: inline-block; - vertical-align: middle; -} - -[data-ty="input"]:before, -[data-ty-prompt]:before { - margin-right: 0.75em; - color: var(--color-text-subtle); -} - -[data-ty="input"]:before { - content: '$'; -} - -[data-ty][data-ty-prompt]:before { - content: attr(data-ty-prompt); -} - -[data-ty-cursor]:after { - content: attr(data-ty-cursor); - font-family: monospace; - margin-left: 0.5em; - -webkit-animation: blink 1s infinite; - animation: blink 1s infinite; -} - - -/* Cursor animation */ - -@-webkit-keyframes blink { - 50% { - opacity: 0; - } -} - -@keyframes blink { - 50% { - opacity: 0; - } -} \ No newline at end of file diff --git a/docs/blog/archive/say-goodbye-to-managed-notebooks.md b/docs/blog/archive/say-goodbye-to-managed-notebooks.md deleted file mode 100644 index 55e00e1c04..0000000000 --- a/docs/blog/archive/say-goodbye-to-managed-notebooks.md +++ /dev/null @@ -1,101 +0,0 @@ ---- -date: 2023-06-29 -description: Discover how cloud dev environments can benefit ML engineers and why why they are gaining popularity over managed notebooks . -slug: say-goodbye-to-managed-notebooks ---- - -# Say goodbye to managed notebooks - -Data science and ML tools have made significant advancements in recent years. 
This blog post aims to examine the -advantages of cloud dev environments (CDE) for ML engineers and compare them with web-based managed notebooks. - -[//]: # (TODO: Should be technical and controversial) - - - -## Notebooks are here to stay - -Jupyter notebooks are instrumental for interactive work with data. They provide numerous advantages such as high -interactivity, visualization support, remote accessibility, and effortless sharing. - -Managed notebook platforms, like Google Colab and AWS SageMaker have become popular thanks to their easy integration with clouds. -With pre-configured environments, managed notebooks remove the need to worry about infrastructure. - -![](../../assets/images/dstack-google-colab.png){ width=800 } - -## Reproducibility challenge - -As the code evolves, it needs to be converted into Python scripts and stored in Git for improved organization and -version control. Notebooks alone cannot handle this task, which is why they must be a part of a developer -environment that also supports Python scripts and Git. - -The JupyterLab project attempts to address this by turning notebooks into an IDE by adding a file browser, -terminal, and Git support. - -![](../../assets/images/dstack-jupyterlab.png){ width=800 } - -## IDEs get equipped for ML - -Recently, IDEs have improved in their ability to support machine learning. They have started to combine the benefits of -traditional IDEs and managed notebooks. - -IDEs have upgraded their remote capabilities, with better SSH support. Additionally, they now offer built-in support for editing notebooks. - -Two popular IDEs, VS Code and PyCharm, have both integrated remote capabilities and seamless notebook editing features. - -![](../../assets/images/dstack-vscode.png){ width=800 } - -## The rise of app ecosystem - -Notebooks have been beneficial for their interactivity and sharing features. 
However, there are new alternatives like -Streamlit and Gradio that allow developers to build data apps using Python code. These frameworks not only simplify -app-building but also enhance reproducibility by integrating with Git. - -Hugging Face Spaces, for example, is a popular tool today for sharing Streamlit and Gradio apps with others. - -![](../../assets/images/dstack-huggingface-space.png){ width=800 class="border"} - -## Say hello to cloud dev environments! - -Remote development within IDEs is becoming increasingly popular, and as a result, cloud dev environments have emerged as -a new concept. Various managed services, such as Codespaces and GitPod, offer scalable infrastructure while maintaining -the familiar IDE experience. - -One such open-source tool is `dstack`, which enables you to define your dev environment declaratively as code and run it on any cloud. - -
- -```yaml -type: dev-environment -build: - - apt-get update - - apt-get install -y ffmpeg - - pip install -r requirements.txt -ide: vscode -``` - -
- -With this tool, provisioning the required hardware, setting up the pre-built environment (no Docker is needed), and -fetching your local code is automated. - -```shell -$ dstack run . - - RUN CONFIGURATION USER PROJECT INSTANCE SPOT POLICY - honest-jellyfish-1 .dstack.yml peter gcp a2-highgpu-1g on-demand - -Starting SSH tunnel... - -To open in VS Code Desktop, use one of these link: - vscode://vscode-remote/ssh-remote+honest-jellyfish-1/workflow - -To exit, press Ctrl+C. -``` - -You can securely access the cloud development environment with the desktop IDE of your choice. - -![](../../assets/images/dstack-vscode-jupyter.png){ width=800 } - -!!! info "Learn more" - Check out our [guide](../../docs/concepts/dev-environments.md) for running dev environments in your cloud. \ No newline at end of file diff --git a/docs/blog/index.md b/docs/blog/index.md deleted file mode 100644 index c58f16c501..0000000000 --- a/docs/blog/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# Blog - diff --git a/docs/blog/posts/dstack-research.md b/docs/blog/posts/dstack-research.md deleted file mode 100644 index 622797ece4..0000000000 --- a/docs/blog/posts/dstack-research.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -date: 2024-05-06 -description: Universities and research labs engaged in open-source AI can now access dstack Enterprise for free. -slug: dstack-research ---- - -# Launching research and open-source program - -To all universities and research labs developing open-source AI: If you're utilizing cloud GPUs or on-prem clusters -to train and deploy AI, you can now use `dstack Enterprise` for free. - -![dstack-sky-banner.png](images/dstack-research-banner-2.png){ width=650 } - - - -As recent developments show, access to and the ability to efficiently manage compute resources are key to training and -utilizing AI models. Inefficiency in these processes leads to high costs and erodes competitiveness in the race for AI -research. 
- -Universities and research labs working on open-source AI do more research than any other organizations. They require a -lot of compute and often prefer focusing on AI rather than infrastructure. - -`dstack` is building an open-source foundation for more effective orchestration of compute resources for AI workloads. - -![dstack-diagram-stack-3.png](images/dstack-diagram-stack-3.png){ width=750 } - -Leveraging containers, `dstack` facilitates AI model training and deployment at scale while allowing the effective -use of open-source models, training, and serving frameworks. - -`dstack Enterprise` is a self-hosted AI platform that extends the capabilities of the open-source `dstack` to simplify -cost optimization, and compute utilization, and workload management for multiple teams. - -> To support universities and research labs engaged in open-source AI, we're thrilled to announce that `dstack Enterprise` -> can now be used by them for free - -If you're a researcher facing GPU usage challenges and seeking a platform friendly to AI researchers, capable of -seamless integration with the open-source ecosystem, share this program with your supervisor. - -To apply to this program, fill [this](https://tally.so/r/nrl2l5) form. \ No newline at end of file diff --git a/docs/changelog/index.md b/docs/changelog/index.md deleted file mode 100644 index c58f16c501..0000000000 --- a/docs/changelog/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# Blog - diff --git a/docs/docs/concepts/dev-environments.md b/docs/docs/concepts/dev-environments.md deleted file mode 100644 index a191a0e454..0000000000 --- a/docs/docs/concepts/dev-environments.md +++ /dev/null @@ -1,107 +0,0 @@ -# Dev environments - -Before scheduling a task or deploying a model, you may want to run code interactively. Dev environments allow you to -provision a remote machine set up with your code and favorite IDE with just one command. 
- -## Configuration - -First, create a YAML file in your project folder. Its name must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` are -both acceptable). - -
- -```yaml -type: dev-environment - -# Specify the Python version, or your Docker image -python: "3.11" - -# This pre-configures the IDE with required extensions -ide: vscode - -# Specify GPU, disk, and other resource requirements -resources: - gpu: 80GB -``` - -
- -If you don't specify your Docker image, `dstack` uses the [base](https://hub.docker.com/r/dstackai/base/tags) image -(pre-configured with Python, Conda, and essential CUDA drivers). - -!!! info "Reference" - See the [.dstack.yml reference](../reference/dstack.yml/dev-environment.md) - for all supported configuration options and multiple examples. - -## Running - -To run a configuration, use the [`dstack run`](../reference/cli/index.md#dstack-run) command followed by the working directory path, -configuration file path, and other options. - -
- -```shell -$ dstack run . -f .dstack.yml - - BACKEND REGION RESOURCES SPOT PRICE - tensordock unitedkingdom 10xCPU, 80GB, 1xA100 (80GB) no $1.595 - azure westus3 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - azure westus2 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - -Continue? [y/n]: y - -Provisioning `fast-moth-1`... ----> 100% - -To open in VS Code Desktop, use this link: - vscode://vscode-remote/ssh-remote+fast-moth-1/workflow -``` - -
- -When `dstack` provisions the dev environment, it mounts the project folder contents. - -!!! info ".gitignore" - If there are large files or folders you'd like to avoid uploading, - you can list them in `.gitignore`. - -!!! info "Reference" - See the [CLI reference](../reference/cli/index.md#dstack-run) for more details - on how `dstack run` works. - -### VS Code - -To open the dev environment in your desktop IDE, use the link from the output -(such as `vscode://vscode-remote/ssh-remote+fast-moth-1/workflow`). - -![](../../assets/images/dstack-vscode-jupyter.png){ width=800 } - -### SSH - -Alternatively, while the CLI is attached to the run, you can connect to the dev environment via SSH: - -
- -```shell -$ ssh fast-moth-1 -``` - -
- -## Managing runs - -**Stopping runs** - -Once the run exceeds the max duration, -or when you use [`dstack stop`](../reference/cli/index.md#dstack-stop), -the dev environment and its cloud resources are deleted. - -**Listing runs** - -The [`dstack ps`](../reference/cli/index.md#dstack-ps) command lists all running runs and their status. - -[//]: # (TODO: Mention `dstack logs` and `dstack logs -d`) - -## What's next? - -1. Check the [`.dstack.yml` reference](../reference/dstack.yml/dev-environment.md) for more details and examples \ No newline at end of file diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md deleted file mode 100644 index 952fb2cb3f..0000000000 --- a/docs/docs/concepts/gateways.md +++ /dev/null @@ -1,85 +0,0 @@ -# Gateways - -Gateways handle the ingress traffic of running services. -They provide [services](services.md) with HTTPS domains, handle authentication, distribute load, and perform auto-scaling. -In order to run a service, you need to have at least one gateway set up. - -!!! info "dstack Sky" - If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, - the gateway is already set up for you. - -## Configuration - -First, create a YAML file in your project folder. Its name must end with `.dstack.yml` (e.g. `.dstack.yml` or `gateway.dstack.yml` -are both acceptable). - -
- -```yaml -type: gateway -name: example-gateway - -backend: aws -region: eu-west-1 -domain: example.com -``` - -
- -A domain name is required to create a gateway. - -!!! info "Reference" - See the [.dstack.yml reference](../reference/dstack.yml/gateway.md) - for all supported configuration options and examples. - -## Creating and updating gateways - -To create or update the gateway, simply call the [`dstack apply`](../reference/cli/index.md#dstack-apply) command: - -
- -```shell -$ dstack apply . -f examples/deployment/gateway.dstack.yml - -The example-gateway doesn't exist. Create it? [y/n]: y - - BACKEND REGION NAME HOSTNAME DOMAIN DEFAULT STATUS - aws eu-west-1 example-gateway example.com ✓ submitted - -``` - -
- -## Updating DNS records - -Once the gateway is assigned a hostname, go to your domain's DNS settings -and add an `A` DNS record for `*.<gateway domain>` (e.g., `*.example.com`) pointing to the gateway's hostname. - -This will allow you to access runs and models using this domain. - -## Managing gateways - -**Deleting gateways** - -To delete a gateway, pass the gateway configuration to [`dstack delete`](../reference/cli/index.md#dstack-delete): - -
- -```shell -$ dstack delete . -f examples/deployment/gateway.dstack.yml -``` - -
- -**Listing gateways** - -The [`dstack gateway list`](../reference/cli/index.md#dstack-gateway-list) command lists existing gateways and their status. - -[//]: # (TODO: Elaborate on `default`) - -[//]: # (TODO: ## Accessing endpoints) - -## What's next? - -1. See [services](services.md) on how to run services -2. Check the [`.dstack.yml` reference](../reference/dstack.yml/gateway.md) for more details and examples \ No newline at end of file diff --git a/docs/docs/concepts/pools.md b/docs/docs/concepts/pools.md deleted file mode 100644 index 19cb8e098e..0000000000 --- a/docs/docs/concepts/pools.md +++ /dev/null @@ -1,117 +0,0 @@ -# Pools - -Pools enable the efficient reuse of cloud instances and on-premises servers across runs, simplifying their management. - -## Adding instances - -### Automatic provisioning - -By default, the `dstack run` command tries to reuse an instance from a pool. If no idle instance meets the -requirements, `dstack` automatically provisions a new cloud instance and adds it to the pool. - -??? info "Reuse policy" - To avoid provisioning new cloud instances with `dstack run`, use `--reuse`. Your run will be assigned to an idle instance in - the pool. If there are no available idle instances in the pool, the run will fail. - -??? info "Idle duration" - By default, `dstack run` sets the idle duration of a newly provisioned instance to `5m`. - This means that if the run is finished and the instance remains idle for longer than five minutes, it is automatically - removed from the pool. To override the default idle duration, use `--idle-duration DURATION` with `dstack run`. - -### Manual provisioning - -To manually provision a cloud instance and add it to a pool, use [`dstack pool add`](../reference/cli/index.md#dstack-pool-add): - -
- -```shell -$ dstack pool add --gpu 80GB - - BACKEND REGION RESOURCES SPOT PRICE - tensordock unitedkingdom 10xCPU, 80GB, 1xA100 (80GB) no $1.595 - azure westus3 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - azure westus2 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - -Continue? [y/n]: y -``` - -
- -The `dstack pool add` command allows specifying resource requirements, along with the spot policy, idle duration, max -price, retry policy, and other policies. - -??? info "Idle duration" - The default idle duration if you're using `dstack pool add` is `72h`. To override it, use the `--idle-duration DURATION` argument. - -[//]: # (TODO: Mention the retry policy) - -You can also specify the policies via [`.dstack/profiles.yml`](../reference/profiles.yml.md) instead of passing them as arguments. -For more details on policies and their defaults, refer to [`.dstack/profiles.yml`](../reference/profiles.yml.md). - -??? info "Limitations" - The `dstack pool add` command is not supported for Kubernetes, VastAI, and RunPod backends yet. - -### Adding on-prem clusters - -Any on-prem server that can be accessed via SSH can be added to a pool and used to run workloads. - -To add on-prem servers to the pool, use the `dstack pool add-ssh` command and pass the hostname of your server along with -the SSH key. - -
- -```shell -$ dstack pool add-ssh -i ~/.ssh/id_rsa ubuntu@54.73.155.119 -``` - -
- -The command accepts the same arguments as the standard `ssh` command. - -!!! warning "Requirements" - The on-prem server should be pre-installed with CUDA 12.1 and NVIDIA Docker. - -Once the instance is provisioned, you'll see it in the pool and will be able to run workloads on it. - -#### Clusters - -If you want on-prem instances to run multi-node tasks, ensure these on-prem servers share the same private network. -Additionally, you need to pass the `--network` option to `dstack pool add-ssh`: - -
- -```shell -$ dstack pool add-ssh -i ~/.ssh/id_rsa ubuntu@54.73.155.119 \ - --network 10.0.0.0/24 -``` - -
- -The `--network` argument accepts the IP address range (CIDR) of the private network of the instance. - -Once you've added multiple instances with the same network value, you can use them as a cluster to run -[multi-node tasks](../reference/dstack.yml/task.md#_nodes). - -## Removing instances - -If the instance remains idle for the configured idle duration, `dstack` removes it and deletes all cloud resources. - -To remove an instance from the pool manually, use the `dstack pool rm` command. - -
- -```shell -$ dstack pool rm <instance name> -``` - -
- -## List instances - -The [`dstack pool ps`](../reference/cli/index.md#dstack-pool-ps) command lists active instances and their status (`busy` -or `idle`). - -[//]: # (#### Manage pools) - -[//]: # (TBA) - diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md deleted file mode 100644 index 1daa6ea80c..0000000000 --- a/docs/docs/concepts/services.md +++ /dev/null @@ -1,137 +0,0 @@ -# Services - -Services make it easy to deploy models and web applications as public, -secure, and scalable endpoints. They are provisioned behind a [gateway](gateways.md) that -automatically provides an HTTPS domain, handles authentication, distributes load, and performs auto-scaling. - -??? info "Gateways" - If you're using the open-source server, you must set up a [gateway](gateways.md) before you can run a service. - - If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, - the gateway is already set up for you. - -## Configuration - -First, create a YAML file in your project folder. Its name must end with `.dstack.yml` (e.g. `.dstack.yml` or `serve.dstack.yml` -are both acceptable). - -
- -```yaml -type: service - -python: "3.11" -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -port: 8000 - -resources: - gpu: 80GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf -``` - -
- -If you don't specify your Docker image, `dstack` uses the [base](https://hub.docker.com/r/dstackai/base/tags) image -(pre-configured with Python, Conda, and essential CUDA drivers). - -!!! info "Auto-scaling" - By default, the service is deployed to a single instance. However, you can specify the - [number of replicas and scaling policy](../reference/dstack.yml/service.md#replicas-and-auto-scaling). - In this case, `dstack` auto-scales it based on the load. - -!!! info "Reference" - See the [.dstack.yml reference](../reference/dstack.yml/service.md) - for all supported configuration options and multiple examples. - -## Running - -To run a configuration, use the [`dstack run`](../reference/cli/index.md#dstack-run) command followed by the working directory path, -configuration file path, and any other options. - -
- -```shell -$ dstack run . -f serve.dstack.yml - - BACKEND REGION RESOURCES SPOT PRICE - tensordock unitedkingdom 10xCPU, 80GB, 1xA100 (80GB) no $1.595 - azure westus3 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - azure westus2 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - -Continue? [y/n]: y - -Provisioning... ----> 100% - -Service is published at https://yellow-cat-1.example.com -``` - -
- -When deploying the service, `dstack run` mounts the current folder's contents. - -!!! info ".gitignore" - If there are large files or folders you'd like to avoid uploading, - you can list them in `.gitignore`. - -!!! info "Reference" - See the [CLI reference](../reference/cli/index.md#dstack-run) for more details - on how `dstack run` works. - -## Service endpoint - -Once the service is up, its endpoint is accessible at `https://<run name>.<gateway domain>`. - -By default, the service endpoint requires the `Authorization` header with `Bearer <dstack token>`. - -
- -```shell -$ curl https://yellow-cat-1.example.com/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer <dstack token>' \ - -d '{ - "model": "NousResearch/Llama-2-7b-chat-hf", - "messages": [ - { - "role": "user", - "content": "Compose a poem that explains the concept of recursion in programming." - } - ] - }' -``` - -
- -Authorization can be disabled by setting `auth` to `false` in the service configuration file. - -### Model endpoint - -In case the service has the [model mapping](../reference/dstack.yml/service.md#model-mapping) configured, you will also be able -to access the model at `https://gateway.<gateway domain>` via the OpenAI-compatible interface. - -## Managing runs - -**Stopping runs** - -When you use [`dstack stop`](../reference/cli/index.md#dstack-stop), the service and its cloud resources are deleted. - -**Listing runs** - -The [`dstack ps`](../reference/cli/index.md#dstack-ps) command lists all running runs and their status. - -## What's next? - -1. Check the [Text Generation Inference :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/tgi/README.md){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/vllm/README.md){:target="_blank"} examples -2. Check the [`.dstack.yml` reference](../reference/dstack.yml/service.md) for more details and examples -3. See [gateways](gateways.md) on how to set up a gateway -4. Browse [examples :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples){:target="_blank"} \ No newline at end of file diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md deleted file mode 100644 index 1f369d7550..0000000000 --- a/docs/docs/concepts/tasks.md +++ /dev/null @@ -1,108 +0,0 @@ -# Tasks - -Tasks allow for convenient scheduling of various batch jobs, such as training, fine-tuning, or -data processing. They can also be used to run web applications -when features offered by [services](services.md) are not needed, such as for debugging. - -You can run tasks on a single machine or on a cluster of nodes.
- -## Configuration - -First, create a YAML file in your project folder. Its name must end with `.dstack.yml` (e.g. `.dstack.yml` or `train.dstack.yml` -are both acceptable). - -
- -```yaml -type: task - -python: "3.11" -env: - - HF_HUB_ENABLE_HF_TRANSFER=1 -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - tensorboard --logdir results/runs & - - python fine-tuning/qlora/train.py -ports: - - 6006 - -# (Optional) Configure `gpu`, `memory`, `disk`, etc -resources: - gpu: 80GB -``` - -
- -If you don't specify your Docker image, `dstack` uses the [base](https://hub.docker.com/r/dstackai/base/tags) image -(pre-configured with Python, Conda, and essential CUDA drivers). - - -!!! info "Distributed tasks" - By default, tasks run on a single instance. However, you can specify - the [number of nodes](../reference/dstack.yml/task.md#_nodes). - In this case, `dstack` provisions a cluster of instances. - -!!! info "Reference" - See the [.dstack.yml reference](../reference/dstack.yml/task.md) - for all supported configuration options and multiple examples. - -## Running - -To run a configuration, use the [`dstack run`](../reference/cli/index.md#dstack-run) command followed by the working directory path, -configuration file path, and other options. - -
- -```shell -$ dstack run . -f train.dstack.yml - - BACKEND REGION RESOURCES SPOT PRICE - tensordock unitedkingdom 10xCPU, 80GB, 1xA100 (80GB) no $1.595 - azure westus3 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - azure westus2 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - -Continue? [y/n]: y - -Provisioning... ----> 100% - -TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit) - -Epoch 0: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -Epoch 1: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -Epoch 2: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -``` - -
- -If the task specifies `ports`, `dstack run` automatically forwards them to your local machine for -convenient and secure access. - -When running the task, `dstack run` mounts the current folder's contents. - -!!! info ".gitignore" - If there are large files or folders you'd like to avoid uploading, - you can list them in `.gitignore`. - -!!! info "Reference" - See the [CLI reference](../reference/cli/index.md#dstack-run) for more details - on how `dstack run` works. - -## Managing runs - -**Stopping runs** - -Once you use [`dstack stop`](../reference/cli/index.md#dstack-stop) (or when the run exceeds the -`max_duration`), the instances return to the [pool](pools.md). - -**Listing runs** - -The [`dstack ps`](../reference/cli/index.md#dstack-ps) command lists all running runs and their status. - -[//]: # (TODO: Mention `dstack logs` and `dstack logs -d`) - -## What's next? - -1. Check the [QLoRA :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/qlora/README.md){:target="_blank"} example -2. Check the [`.dstack.yml` reference](../reference/dstack.yml/task.md) for more details and examples -3. Browse [all examples :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples){:target="_blank"} \ No newline at end of file diff --git a/docs/docs/concepts/volumes.md b/docs/docs/concepts/volumes.md deleted file mode 100644 index dabb8d0576..0000000000 --- a/docs/docs/concepts/volumes.md +++ /dev/null @@ -1,108 +0,0 @@ -# Volumes - -Volumes allow you to persist data between runs. `dstack` simplifies managing volumes and lets you mount them to a specific -directory when working with dev environments, tasks, and services. - -!!! info "Experimental" - Volumes are currently experimental and only work with the `aws` backend. Support for other backends is coming soon.
- -## Configuration - -First, create a YAML file in your project folder. Its name must end with `.dstack.yml` (e.g. `.dstack.yml` or `vol.dstack.yml` -are both acceptable). - -
- -```yaml -type: volume -name: my-new-volume -backend: aws -region: eu-central-1 -size: 100GB -``` - -
- -If you use this configuration, `dstack` will create a new volume based on the specified options. - -!!! info "Registering existing volumes" - If you prefer not to create a new volume but to reuse an existing one (e.g., created manually), you can - [specify its ID via `volume_id`](../reference/dstack.yml/volume.md#register-volume). In this case, `dstack` will register the specified volume so that you can use it with development - environments, tasks, and services. - -!!! info "Reference" - See the [.dstack.yml reference](../reference/dstack.yml/volume.md) - for all supported configuration options and multiple examples. - -## Creating and registering volumes - -To create or register the volume, simply call the `dstack apply` command: - -
- -```shell -$ dstack apply -f volume.dstack.yml -Volume my-new-volume does not exist yet. Create the volume? [y/n]: y - NAME BACKEND REGION STATUS CREATED - my-new-volume aws eu-central-1 submitted now - -``` - -
- -> When creating the volume, `dstack` automatically creates an `ext4` file system on it. - -Once created, the volume can be attached to dev environments, tasks, and services. - -## Attaching volumes - -Dev environments, tasks, and services let you attach any number of volumes. -To attach a volume, simply specify its name using the `volumes` property and specify where to mount its contents: - -
- -```yaml -type: dev-environment -ide: vscode -volumes: - - name: my-new-volume - path: /volume_data -``` - -
- -Once you run this configuration, the volume will be mounted at `/volume_data` inside the dev environment, -and its contents will persist across runs. - -!!! info "Limitations" - When you're running a dev environment, task, or service with `dstack`, it automatically mounts the project folder contents - to `/workflow` (and sets that as the current working directory). Right now, `dstack` doesn't allow you to - attach volumes to `/workflow` or any of its subdirectories. - -## Managing volumes - -**Deleting volumes** - -When the volume isn't attached to any active dev environment, task, or service, you can delete it using `dstack delete`: - -```shell -$ dstack delete -f vol.dstack.yml -``` - -If the volume was created using `dstack`, it will be physically destroyed along with the data. -If you've registered an existing volume, it will be de-registered with `dstack` but will keep the data. - -**Listing volumes** - -The [`dstack volume list`](../reference/cli/index.md#dstack-volume-list) command lists created and registered volumes. - -## FAQ - -??? info "Using volumes across backends" - Since volumes are backed by cloud network disks, you can only use them within the same cloud. If you need to access - data across different backends, you should either use object storage or replicate the data across multiple volumes. - -??? info "Using volumes across regions" - Typically, network volumes are associated with specific regions, so you can't use them in other regions. Sometimes, - volumes are also linked to availability zones, but some systems allow volumes that can be used across different - availability zones within the same region. \ No newline at end of file diff --git a/docs/docs/index.md b/docs/docs/index.md deleted file mode 100644 index ee9b54a4db..0000000000 --- a/docs/docs/index.md +++ /dev/null @@ -1,49 +0,0 @@ -# What is dstack?
- -`dstack` is an open-source container orchestration engine for running AI workloads across diverse cloud providers -and on-premises data centers. It simplifies provisioning compute resources, managing dev environments, executing tasks on clusters, and deploying services. - -!!! info "Cloud and on-premises" - `dstack` allows workload orchestration on both cloud and on-premises clusters. - Supported cloud providers include AWS, GCP, Azure, OCI, Lambda, TensorDock, Vast.ai, RunPod, and CUDO. - -!!! info "Accelerators" - `dstack` supports NVIDIA GPUs and Google Cloud TPUs out of the box. - -## How does it work? - -!!! info "Installation" - Before using `dstack`, [install](installation/index.md) the `dstack` server and configure credentials - and other settings for each cloud account that you intend to use. - -#### 1. Define configurations - -`dstack` supports three types of run configurations: - -* [`dev environment`](concepts/dev-environments.md) — for interactive development using a desktop IDE -* [`task`](concepts/tasks.md) — for any kind of batch jobs or web applications (supports distributed jobs) -* [`service`](concepts/services.md) — for production-grade deployment (supports auto-scaling and authorization) - -Each type of run configuration allows you to specify commands for execution, required compute resources, retry policies, auto-scaling rules, authorization settings, and more. - -Configurations can be defined as YAML files within your repo. - -#### 2. Run configurations - -Run any defined configuration either via the `dstack` CLI or API. - -`dstack` automatically provisions compute resources (whether from the cloud or on-premises), executes commands, handles interruptions, port-forwarding, auto-scaling, network, volumes, run failures, out-of-capacity errors, and more. - -#### 3. Manage pools - -Use [pools](concepts/pools.md) to manage the lifecycle of cloud instances and add/remove on-prem clusters.
- -You can manually add or remove cloud instances from the pool, or have them provisioned on-demand and configure how long they remain idle before automatic termination. - - -## Where do I start? - -1. Proceed to [installation](installation/index.md) -2. See [quickstart](quickstart.md) -3. Browse [examples :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples){:target="_blank"} -4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} \ No newline at end of file diff --git a/docs/docs/installation/index.md b/docs/docs/installation/index.md deleted file mode 100644 index 99382d8d8e..0000000000 --- a/docs/docs/installation/index.md +++ /dev/null @@ -1,147 +0,0 @@ -# Installation - -To use the open-source version of `dstack` (which is self-hosted to use your own cloud accounts or data centers), -go ahead and [set up the server](#set-up-the-server). - -To use [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} -(a managed service that allows you to use either GPUs via marketplace, or connect to your own cloud accounts or data centers), -proceed to [dstack Sky](#dstack-sky). - -## Set up the server - -### Configure backends - -Before starting the `dstack` server, create `~/.dstack/server/config.yml` and -configure a backend for each cloud account that you'd like to use. - -
- -```yaml -projects: - - name: main - backends: - - type: aws - creds: - type: access_key - access_key: AIZKISCVKUKO5AAKLAEH - secret_key: QSbmpqJIUBn1V5U3pyM9S6lwwiu8/fOJ2dgfwFdW -``` - -
- -> Go to the [server/config.yml reference](../reference/server/config.yml.md#examples) -> for details on how to configure backends for AWS, GCP, Azure, OCI, Lambda, -> TensorDock, Vast.ai, RunPod, CUDO, Kubernetes, etc. - -### Start the server - -Once the `~/.dstack/server/config.yml` file is configured, proceed to start the server: - -=== "pip" - -
- - ```shell - $ pip install "dstack[all]" -U - $ dstack server - - Applying ~/.dstack/server/config.yml... - - The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" - The server is running at http://127.0.0.1:3000/ - ``` - -
- -=== "Docker" - -
- - ```shell - $ docker run -p 3000:3000 \ - -v $HOME/.dstack/server/:/root/.dstack/server \ - dstackai/dstack - - Applying ~/.dstack/server/config.yml... - - The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" - The server is running at http://127.0.0.1:3000/ - ``` - -
- - > For more details on how to deploy `dstack` using Docker, check its [Docker repo](https://hub.docker.com/r/dstackai/dstack). - -> By default, `dstack` stores its state in `~/.dstack/server/data` using SQLite. -> To use a database, set the [`DSTACK_DATABASE_URL`](../reference/cli/index.md#environment-variables) environment variable. - -Once the `dstack` server is up, feel free to use the CLI or API to work with it. - -### Set up the CLI - -To point the CLI to the `dstack` server, configure it -with the server address, user token and project name: - -
- -```shell -$ pip install dstack -$ dstack config --url http://127.0.0.1:3000 \ - --project main \ - --token bbae0f28-d3dd-4820-bf61-8f4bb40815da - -Configuration is updated at ~/.dstack/config.yml -``` - -
- -This configuration is stored in `~/.dstack/config.yml`. - -### Add on-prem clusters - -If you'd like to use `dstack` to run workloads on your on-prem clusters, -check out the [dstack pool add-ssh](../concepts/pools.md#adding-on-prem-clusters) command. - -## dstack Sky - -### Set up the CLI - -If you've signed up with [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, -open the project settings, and copy the `dstack config` command to point the CLI to the project. - -![](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-project-config.png){ width=800 } - -Then, install the CLI on your machine and use the copied command. - -
- -```shell -$ pip install dstack -$ dstack config --url https://sky.dstack.ai \ - --project peterschmidt85 \ - --token bbae0f28-d3dd-4820-bf61-8f4bb40815da - -Configuration is updated at ~/.dstack/config.yml -``` - -
- -### Configure backends - -By default, [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} -uses the GPU from its marketplace, which requires a credit card to be attached in your account -settings. - -To use your own cloud accounts, click the settings icon of the corresponding backend and specify credentials: - -![](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-edit-backend-config.png){ width=800 } - -[//]: # (The `dstack server` command automatically updates `~/.dstack/config.yml`) -[//]: # (with the `main` project.) - -## What's next? - -1. Check the [server/config.yml reference](../reference/server/config.yml.md) on how to configure backends -2. Follow [quickstart](../quickstart.md) -3. Browse [examples :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples) -4. Join the community via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) \ No newline at end of file diff --git a/docs/docs/protips.md b/docs/docs/protips.md deleted file mode 100644 index aadf8ded3f..0000000000 --- a/docs/docs/protips.md +++ /dev/null @@ -1,250 +0,0 @@ -# Protips - -Below are tips and tricks to use `dstack` more efficiently. - -## Dev environments - -Before running a task or service, it's recommended that you first start with a dev environment. Dev environments -allow you to run commands interactively. - -Once the commands work, go ahead and run them as a task or a service. - -??? info "Notebooks" - **VS Code** - - When you access a dev environment using your desktop VS Code, it allows you to work with Jupyter notebooks via its - pre-configured and easy-to-use extension. 
- - **JupyterLab** - - If you prefer to use JupyterLab, you can run it as a task: - - ```yaml - type: task - - commands: - - pip install jupyterlab - - jupyter lab --allow-root - - ports: - - 8888 - - ``` - -## Tasks vs Services for web applications - -Tasks can be used not only for batch jobs but also for web applications. - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip3 install streamlit - - streamlit hello - -ports: - - 8501 - -``` - -
- -While you run a task, `dstack` forwards the remote ports to `localhost`. - -
- -```shell -$ dstack run . -f app.dstack.yml - - Welcome to Streamlit. Check out our demo in your browser. - - Local URL: http://localhost:8501 -``` - -
- -This allows you to access the remote `8501` port on `localhost:8501` while the CLI is attached. - -??? info "Port mapping" - If you want to override the local port, use the `--port` option: - -
- - ```shell - $ dstack run . -f app.dstack.yml --port 3000:8501 - ``` - -
- - This will forward the remote `8501` port to `localhost:3000`. - -[Services](concepts/services.md) require a gateway but they also provide additional features for -production-grade service deployment not offered by tasks, such as HTTPS domains and auto-scaling. -If you run a web app as a task and it works, go ahead and run it as a service. - -## Environment variables - -If a configuration requires an environment variable that you don't want to hardcode in the YAML, you can define it -without assigning a value: - -
- -```yaml -type: dev-environment - -env: - - HUGGING_FACE_HUB_TOKEN - -python: "3.11" -ide: vscode -``` - -
- -Then, you can pass the environment variable either via the shell: - -```shell -HUGGING_FACE_HUB_TOKEN=... dstack run . -f .dstack.yml -``` - -Or via the `-e` option of the `dstack run` command: - -```shell -dstack run . -f .dstack.yml -e HUGGING_FACE_HUB_TOKEN=... -``` - -??? info ".env" - A better way to configure environment variables not hardcoded in YAML is by specifying them in a `.env` file: - - ``` - HUGGING_FACE_HUB_TOKEN=... - ``` - - If you install [`direnv` :material-arrow-top-right-thin:{ .external }](https://direnv.net/){:target="_blank"}, - it will automatically pass the environment variables from the `.env` file to the `dstack run` command. - - Remember to add `.env` to `.gitignore` to avoid pushing it to the repo. - -## Data and models - -`dstack` has support for [volumes](concepts/volumes.md) -to persist data across different runs and instance interruptions. -Volumes are ideal for storing intermediate work and data that should be quickly accessible. - -You can also load and save data using an object storage like S3 or HuggingFace Datasets. -For models, it's best to use services like HuggingFace Hub. -`dstack` has no explicit support for object storage. -You can load and save data directly from your code. - -## Idle instances - -By default, the `dstack` run command reuses an idle instance from the pool. If no instance matches the requirements, it creates a new one. - -When the run finishes, the instance remains idle for the configured time (by default, `5m`) before it gets destroyed. - -You can change the default idle duration by using ``--idle-duration DURATION`` with `dstack run`, or -set `termination_idle_duration` in the configuration or profile. - -An idle instance can be destroyed at any time via `dstack pool rm INSTANCE_NAME`. 
- -## Profiles - -If you don't want to specify the same parameters for each configuration, you can define them once via [profiles](reference/profiles.yml.md) -and reuse them across configurations. - -This can be handy, for example, for configuring parameters such as `max_duration`, `max_price`, `termination_idle_duration`, -`regions`, etc. - -Set `default` to `true` in your profile, and it will be applied automatically to any run. - -## Attached mode - -By default, `dstack run` runs in attached mode. -This means it streams the logs as they come in and, in the case of a task, forwards its ports to `localhost`. - -If you detach the CLI, you can re-attach it using `dstack logs -a RUN_NAME`. - -To run in detached mode, use `-d` with `dstack run`. - -## GPU - -`dstack` natively supports NVIDIA GPUs and Google Cloud TPU accelerator chips. - -The `gpu` property within `resources` (or the `--gpu` option with `dstack run`) -allows specifying not only memory size but also GPU names, their memory, and quantity. - -Examples: - -- `1` (any GPU) -- `A100` (A100) -- `24GB..` (any GPU starting from 24GB) -- `24GB..40GB:2` (two GPUs between 24GB and 40GB) -- `A10G,A100` (either A10G or A100) -- `A100:80GB` (one A100 of 80GB) -- `A100:2` (two A100) -- `A100:40GB:2` (two A100 40GB) -- `tpu-v2-8` (`v2` with 8 TPU cores) - -??? info "Google Cloud TPU" - Currently, you can't specify other than 8 TPU cores. This means only single host workloads are supported. - Support for multiple hosts is coming soon. - -## Service quotas - -If you're using your own AWS, GCP, Azure, or OCI accounts, before you can use GPUs or spot instances, you have to request the -corresponding service quotas for each type of instance in each region. - -??? info "AWS" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html){:target="_blank"} on EC2 service quotas. 
- The relevant service quotas include: - - - `Running On-Demand P instances` (on-demand V100, A100 80GB x8) - - `All P4, P3 and P2 Spot Instance Requests` (spot V100, A100 80GB x8) - - `Running On-Demand G and VT instances` (on-demand T4, A10G, L4) - - `All G and VT Spot Instance Requests` (spot T4, A10G, L4) - - `Running Dedicated p5 Hosts` (on-demand H100) - - `All P5 Spot Instance Requests` (spot H100) - -??? info "GCP" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/compute/resource-usage){:target="_blank"} on Compute Engine service quotas. - The relevant service quotas include: - - - `NVIDIA V100 GPUs` (on-demand V100) - - `Preemptible V100 GPUs` (spot V100) - - `NVIDIA T4 GPUs` (on-demand T4) - - `Preemptible T4 GPUs` (spot T4) - - `NVIDIA L4 GPUs` (on-demand L4) - - `Preemptible L4 GPUs` (spot L4) - - `NVIDIA A100 GPUs` (on-demand A100) - - `Preemptible A100 GPUs` (spot A100) - - `NVIDIA A100 80GB GPUs` (on-demand A100 80GB) - - `Preemptible A100 80GB GPUs` (spot A100 80GB) - - `NVIDIA H100 GPUs` (on-demand H100) - - `Preemptible H100 GPUs` (spot H100) - -??? info "Azure" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/azure/quotas/quickstart-increase-quota-portal){:target="_blank"} on Azure service quotas. - The relevant service quotas include: - - - `Total Regional Spot vCPUs` (any spot instances) - - `Standard NCASv3_T4 Family vCPUs` (on-demand T4) - - `Standard NVADSA10v5 Family vCPUs` (on-demand A10) - - `Standard NCADS_A100_v4 Family vCPUs` (on-demand A100 80GB) - - `Standard NDASv4_A100 Family vCPUs` (on-demand A100 40GB x8) - - `Standard NDAMSv4_A100Family vCPUs` (on-demand A100 80GB x8) - - `Standard NCadsH100v5 Family vCPUs` (on-demand H100) - - `Standard NDSH100v5 Family vCPUs` (on-demand H100 x8) - -??? 
info "OCI" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/servicelimits.htm#Requesti){:target="_blank"} on requesting OCI service limits increase. - The relevant service category is compute. The relevant resources include: - - - `GPUs for GPU.A10 based VM and BM instances` (on-demand A10) - - `GPUs for GPU2 based VM and BM instances` (on-demand P100) - - `GPUs for GPU3 based VM and BM instances` (on-demand V100) - -Note, for AWS, GCP, and Azure, service quota values are measured with the number of CPUs rather than GPUs. diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md deleted file mode 100644 index 5cb0cea6b9..0000000000 --- a/docs/docs/quickstart.md +++ /dev/null @@ -1,134 +0,0 @@ -# Quickstart - -!!! info "Installation" - Before using `dstack`, either set up the open-source server, or sign up - with `dstack Sky`. - See [Installation](installation/index.md) for more details. - -## Initialize a repo - -To use `dstack`'s CLI in a folder, first run [`dstack init`](reference/cli/index.md#dstack-init) within that folder. - -
- -```shell -$ mkdir quickstart && cd quickstart -$ dstack init -``` - -
- -Your folder can be a regular local folder or a Git repo. - -## Define a configuration - -Define what you want to run as a YAML file. The filename must end with `.dstack.yml` (e.g., `.dstack.yml` -or `train.dstack.yml` are both acceptable). - -=== "Dev environment" - - Dev environments allow you to quickly provision a machine with a pre-configured environment, resources, IDE, code, etc. - -
- - ```yaml - type: dev-environment - - # Use either `python` or `image` to configure environment - python: "3.11" - # image: ghcr.io/huggingface/text-generation-inference:latest - - ide: vscode - - # (Optional) Configure `gpu`, `memory`, `disk`, etc - resources: - gpu: 24GB - ``` - -
- -=== "Task" - - Tasks make it very easy to run any scripts, be it for training, data processing, or web apps. They allow you to pre-configure the environment, resources, code, etc. - -
- - ```yaml - type: task - - python: "3.11" - env: - - HF_HUB_ENABLE_HF_TRANSFER=1 - commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - - # (Optional) Configure `gpu`, `memory`, `disk`, etc - resources: - gpu: 24GB - ``` - -
- - Ensure `requirements.txt` and `train.py` are in your folder. You can take them from [`examples`](https://github.com/dstackai/dstack/tree/master/examples/fine-tuning/qlora). - -=== "Service" - - Services make it easy to deploy models and apps cost-effectively as public endpoints, allowing you to use any frameworks. - -
- - ```yaml - type: service - - image: ghcr.io/huggingface/text-generation-inference:latest - env: - - HUGGING_FACE_HUB_TOKEN # required to run gated models - - MODEL_ID=mistralai/Mistral-7B-Instruct-v0.1 - commands: - - text-generation-launcher --port 8000 --trust-remote-code - port: 8000 - - # (Optional) Configure `gpu`, `memory`, `disk`, etc - resources: - gpu: 24GB - ``` - -
- -## Run configuration - -Run a configuration using the [`dstack run`](reference/cli/index.md#dstack-run) command, followed by the working directory path (e.g., `.`), -and the path to the configuration file. - -
- -```shell -$ dstack run . -f train.dstack.yml - - BACKEND REGION RESOURCES SPOT PRICE - tensordock unitedkingdom 10xCPU, 80GB, 1xA100 (80GB) no $1.595 - azure westus3 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - azure westus2 24xCPU, 220GB, 1xA100 (80GB) no $3.673 - -Continue? [y/n]: y - -Provisioning... ----> 100% - -Epoch 0: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -Epoch 1: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -Epoch 2: 100% 1719/1719 [00:18<00:00, 92.32it/s, loss=0.0981, acc=0.969] -``` - -
- -The `dstack run` command automatically uploads your code, including any local uncommitted changes. -To exclude any files from uploading, use `.gitignore`. - -## What's next? - -1. Read about [dev environments](concepts/dev-environments.md), [tasks](concepts/tasks.md), - [services](concepts/services.md), and [pools](concepts/pools.md) -2. Browse [examples :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples){:target="_blank"} -3. Join the community via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) \ No newline at end of file diff --git a/docs/docs/reference/api/python/index.md b/docs/docs/reference/api/python/index.md deleted file mode 100644 index 6c7d3cf821..0000000000 --- a/docs/docs/reference/api/python/index.md +++ /dev/null @@ -1,218 +0,0 @@ -# Python API - -The Python API enables running tasks, services, and managing runs programmatically. - -## Usage example - -Below is a quick example of submitting a task for running and displaying its logs. - -```python -import sys - -from dstack.api import Task, GPU, Client, Resources - -client = Client.from_config() - -task = Task( - image="ghcr.io/huggingface/text-generation-inference:latest", - env={"MODEL_ID": "TheBloke/Llama-2-13B-chat-GPTQ"}, - commands=[ - "text-generation-launcher --trust-remote-code --quantize gptq", - ], - ports=["80"], - resources=Resources(gpu=GPU(memory="24GB")), -) - -run = client.runs.submit( - run_name="my-awesome-run", # If not specified, a random name is assigned - configuration=task, - repo=None, # Specify to mount additional files -) - -run.attach() - -try: - for log in run.logs(): - sys.stdout.buffer.write(log) - sys.stdout.buffer.flush() -except KeyboardInterrupt: - run.stop(abort=True) -finally: - run.detach() -``` - -!!! info "NOTE:" - 1. 
The `configuration` argument in the `submit` method can be either `dstack.api.Task` or `dstack.api.Service`. - 2. If you create `dstack.api.Task` or `dstack.api.Service`, you may specify the `image` argument. If `image` isn't - specified, the default image will be used. For a private Docker registry, ensure you also pass the `registry_auth` argument. - 3. The `repo` argument in the `submit` method allows the mounting of a local folder, a remote repo, or a - programmatically created repo. In this case, the `commands` argument can refer to the files within this repo. - 4. The `attach` method waits for the run to start and, for `dstack.api.Task` sets up an SSH tunnel and forwards - configured `ports` to `localhost`. - -## `dstack.api` { #dstack.api data-toc-label="dstack.api" } - -### `dstack.api.Client` { #dstack.api.Client data-toc-label="Client" } - -::: dstack.api.Client - options: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.RunCollection` { #dstack.api.Client.runs data-toc-label="RunCollection" } - -::: dstack.api.RunCollection - options: - show_bases: false - show_symbol_type_heading: true - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.RepoCollection` { #dstack.api.Client.repos data-toc-label="RepoCollection" } - -::: dstack.api.RepoCollection - options: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -[//]: # (### `dstack.api.BackendCollection` { #dstack.api.Client.backends data-toc-label="BackendCollection" }) - -[//]: # (::: dstack.api.BackendCollection) -[//]: # ( options:) -[//]: # ( show_bases: false) -[//]: # ( show_root_heading: false) -[//]: # ( show_root_toc_entry: false) -[//]: # ( heading_level: 4) - -### `dstack.api.Task` { #dstack.api.Task data-toc-label="Task" } - -#SCHEMA# dstack.api.Task - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - item_id_mapping: - registry_auth: dstack.api.RegistryAuth - resources: 
dstack.api.Resources - -### `dstack.api.Service` { #dstack.api.Service data-toc-label="Service" } - -#SCHEMA# dstack.api.Service - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - item_id_mapping: - scaling: dstack.api.Scaling - registry_auth: dstack.api.RegistryAuth - resources: dstack.api.Resources - -### `dstack.api.Run` { #dstack.api.Run data-toc-label="Run" } - -::: dstack.api.Run - options: - show_bases: false - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.Resources` { #dstack.api.Resources data-toc-label="Resources" } - -#SCHEMA# dstack.api.Resources - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - item_id_mapping: - gpu: dstack.api.GPU - memory: dstack.api.Memory - Range: dstack.api.Range - -### `dstack.api.GPU` { #dstack.api.GPU data-toc-label="GPU" } - -#SCHEMA# dstack.api.GPU - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - item_id_mapping: - memory: dstack.api.Memory - Range: dstack.api.Range - -### `dstack.api.Disk` { #dstack.api.Disk data-toc-label="Disk" } - -#SCHEMA# dstack.api.Disk - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - item_id_mapping: - memory: dstack.api.Memory - Range: dstack.api.Range - -### `dstack.api.LocalRepo` { #dstack.api.LocalRepo data-toc-label="LocalRepo" } - -::: dstack.api.LocalRepo - options: - show_bases: false - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.RemoteRepo` { #dstack.api.RemoteRepo data-toc-label="RemoteRepo" } - -::: dstack.api.RemoteRepo - options: - show_bases: false - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.VirtualRepo` { #dstack.api.VirtualRepo data-toc-label="VirtualRepo" } - -::: dstack.api.VirtualRepo - options: - show_bases: false - show_root_heading: false - show_root_toc_entry: false - 
heading_level: 4 - -### `dstack.api.RegistryAuth` { #dstack.api.RegistryAuth data-toc-label="RegistryAuth" } - -#SCHEMA# dstack.api.RegistryAuth - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.Scaling` { #dstack.api.Scaling data-toc-label="Scaling" } - -#SCHEMA# dstack.api.Scaling - overrides: - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - -### `dstack.api.BackendType` { #dstack.api.BackendType data-toc-label="BackendType" } - -::: dstack.api.BackendType - options: - show_bases: false - show_root_heading: false - show_root_toc_entry: false - heading_level: 4 - - \ No newline at end of file diff --git a/docs/docs/reference/api/rest/index.md b/docs/docs/reference/api/rest/index.md deleted file mode 100644 index 99948e1bf6..0000000000 --- a/docs/docs/reference/api/rest/index.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: REST API ---- - - -!!swagger openapi.json!! diff --git a/docs/docs/reference/cli/index.md b/docs/docs/reference/cli/index.md deleted file mode 100644 index 5b73b94348..0000000000 --- a/docs/docs/reference/cli/index.md +++ /dev/null @@ -1,359 +0,0 @@ -# CLI - -## Commands - -### dstack server - -This command starts the `dstack` server. - -
- -```shell -$ dstack server --help -#GENERATE# -``` - -
- -[//]: # (DSTACK_SERVER_ENVIRONMENT, DSTACK_SERVER_CONFIG_DISABLED, DSTACK_SENTRY_DSN, DSTACK_SENTRY_TRACES_SAMPLE_RATE, DSTACK_SERVER_BUCKET_REGION, DSTACK_SERVER_BUCKET, DSTACK_ALEMBIC_MIGRATIONS_LOCATION) - -### dstack init - -This command must be called inside a folder before you can use `dstack run` or `dstack apply`. - -**Git credentials** - -If the current folder is a remote Git repository, `dstack init` ensures that `dstack` can access it. -By default, the command uses the remote repo's default Git credentials. These can be overridden with -`--git-identity` (private SSH key) or `--token` (OAuth token). - -
- -```shell -$ dstack init --help -#GENERATE# -``` - -
- -**User SSH key** - -By default, `dstack` uses its own SSH key to access instances (`~/.dstack/ssh/id_rsa`). -It is possible to override this key via the `--ssh-identity` argument. - -### dstack run - -This command runs a given configuration. - -
- -```shell -$ dstack run . --help -#GENERATE# -``` - -
- -??? info ".gitignore" - When running anything via CLI, `dstack` uses the exact version of code from your project directory. - - If there are large files, consider creating a `.gitignore` file to exclude them for better performance. - -### dstack apply - -This command applies a given configuration. If a resource does not exist, `dstack apply` creates the resource. -If a resource exists, `dstack apply` updates the resource in-place or re-creates the resource if the update is not possible. - -
- -```shell -$ dstack apply --help -#GENERATE# -``` - -
- -!!! info "NOTE:" - The `dstack apply` command currently supports only `gateway` and `volume` configurations. - Support for other configuration types is coming soon. - -### dstack delete - -This command deletes the resources defined by a given configuration. - -
- -```shell -$ dstack delete --help -#GENERATE# -``` - -
- -!!! info "NOTE:" - The `dstack delete` command currently supports only `gateway` configurations. - Support for other configuration types is coming soon. - -### dstack ps - -This command shows the status of runs. - -
- -```shell -$ dstack ps --help -#GENERATE# -``` - -
- -### dstack stop - -This command stops run(s) within the current repository. - -
- -```shell -$ dstack stop --help -#GENERATE# -``` - -
- -### dstack logs - -This command shows the output of a given run within the current repository. - -
- -```shell -$ dstack logs --help -#GENERATE# -``` - -
- -### dstack config - -Both the CLI and API need to be configured with the server address, user token, and project name -via `~/.dstack/config.yml`. - -At startup, the server automatically configures CLI and API with the server address, user token, and -the default project name (`main`). This configuration is stored in `~/.dstack/config.yml`. - -To use CLI and API on different machines or projects, use the `dstack config` command. - -
- -```shell -$ dstack config --help -#GENERATE# -``` - -
- -### dstack pool - -Pools allow for managing the lifecycle of instances and reusing them across runs. -The default pool is created automatically. - -##### dstack pool add - -The `dstack pool add` command provisions a cloud instance and adds it to a pool. If no pool name is specified, the instance goes to the default pool. - -
- -```shell -$ dstack pool add --help -#GENERATE# -``` - -
- -##### dstack pool add-ssh - -The `dstack pool add-ssh` command adds an existing remote instance to a pool. -If no pool name is specified, the instance goes to the default pool. - -
- -```shell -$ dstack pool add-ssh --help -#GENERATE# -``` - -
- -##### dstack pool ps - -The `dstack pool ps` command lists all active instances of a pool. -If no pool name is specified, default pool instances are displayed. - -
- -```shell -$ dstack pool ps --help -#GENERATE# -``` - -
- -##### dstack pool rm - -The `dstack pool rm` command removes an instance from a pool. -Cloud instances are terminated upon removal. - -
- -```shell -$ dstack pool rm --help -#GENERATE# -``` - -
- -##### dstack pool create - -The `dstack pool create` command creates a new pool. - -
- -```shell -$ dstack pool create --help -#GENERATE# -``` - -
- -##### dstack pool list - -The `dstack pool list` command lists all existing pools. - -
- -```shell -$ dstack pool list --help -#GENERATE# -``` - -
- -##### dstack pool set-default - -The `dstack pool set-default` command sets the project's default pool. - -
- -```shell -$ dstack pool set-default --help -#GENERATE# -``` - -
- -##### dstack pool delete - -The `dstack pool delete` command deletes a specified pool. - -
- -```shell -$ dstack pool delete --help -#GENERATE# -``` - -
- -### dstack gateway - -A gateway is required for running services. It handles ingress traffic, authorization, domain mapping, model mapping -for the OpenAI-compatible endpoint, and so on. - -##### dstack gateway list - -The `dstack gateway list` command displays the names and addresses of the gateways configured in the project. - -
- -```shell -$ dstack gateway list --help -#GENERATE# -``` - -
- -##### dstack gateway create - -The `dstack gateway create` command creates a new gateway instance in the project. - -
- -```shell -$ dstack gateway create --help -#GENERATE# -``` - -
- -##### dstack gateway delete - -The `dstack gateway delete` command deletes the specified gateway. - -
- -```shell -$ dstack gateway delete --help -#GENERATE# -``` - -
- -##### dstack gateway update - -The `dstack gateway update` command updates the specified gateway. - -
- -```shell -$ dstack gateway update --help -#GENERATE# -``` - -
- -### dstack volume - -The `dstack volume` commands are used to work with volumes. - -##### dstack volume list - -The `dstack volume list` command lists volumes. - -
- -```shell -$ dstack volume list --help -#GENERATE# -``` - -
- -## Environment variables - - * `DSTACK_CLI_LOG_LEVEL` – (Optional) Configures CLI logging level. Defaults to `INFO`. - * `DSTACK_SERVER_LOG_LEVEL` – (Optional) Has the same effect as `--log-level`. Defaults to `INFO`. - * `DSTACK_SERVER_HOST` – (Optional) Has the same effect as `--host`. Defaults to `127.0.0.1`. - * `DSTACK_SERVER_PORT` – (Optional) Has the same effect as `--port`. Defaults to `3000`. - * `DSTACK_SERVER_ADMIN_TOKEN` – (Optional) Has the same effect as `--token`. Defaults to `None`. - * `DSTACK_DATABASE_URL` – (Optional) The database URL to use instead of default SQLite. Currently `dstack` supports Postgres. Example: `postgresql+asyncpg://myuser:mypassword@localhost:5432/mydatabase`. Defaults to `None`. - * `DSTACK_SERVER_DIR` – (Optional) Sets path to store data and server configs. Defaults to `~/.dstack/server`. - -??? info "Internal environment variables" - * `DSTACK_SERVER_ROOT_LOG_LEVEL` – (Optional) Sets root logger log level. Defaults to `ERROR`. - * `DSTACK_SERVER_LOG_FORMAT` – (Optional) Sets format of log output. Can be `rich`, `standard`, or `json`. Defaults to `rich`. - * `DSTACK_SERVER_UVICORN_LOG_LEVEL` – (Optional) Sets uvicorn logger log level. Defaults to `ERROR`. - * `DSTACK_PROFILE` – (Optional) Has the same effect as `--profile`. Defaults to `None`. - * `DSTACK_PROJECT` – (Optional) Has the same effect as `--project`. Defaults to `None`. - * `DSTACK_RUNNER_VERSION` – (Optional) Sets exact runner version for debug. Defaults to `latest`. - * `DSTACK_DEFAULT_CREDS_DISABLED` – (Optional) Disables default credentials detection if set. Defaults to `None`. - * `DSTACK_LOCAL_BACKEND_ENABLED` – (Optional) Enables local backend for debug if set. Defaults to `None`. 
\ No newline at end of file diff --git a/docs/docs/reference/dstack.yml.md b/docs/docs/reference/dstack.yml.md deleted file mode 100644 index 04b20b154c..0000000000 --- a/docs/docs/reference/dstack.yml.md +++ /dev/null @@ -1,5 +0,0 @@ -# .dstack.yml - -- [`dev-environment`](dstack.yml/dev-environment.md) -- [`task`](dstack.yml/task.md) -- [`service`](dstack.yml/service.md) \ No newline at end of file diff --git a/docs/docs/reference/dstack.yml/dev-environment.md b/docs/docs/reference/dstack.yml/dev-environment.md deleted file mode 100644 index 1c129ad538..0000000000 --- a/docs/docs/reference/dstack.yml/dev-environment.md +++ /dev/null @@ -1,269 +0,0 @@ -# dev-environment - -The `dev-environment` configuration type allows running [dev environments](../../concepts/dev-environments.md). - -> Configuration files must have a name ending with `.dstack.yml` (e.g., `.dstack.yml` or `serve.dstack.yml` are both acceptable) -> and can be located in the project's root directory or any nested folder. -> Any configuration can be run via [`dstack run`](../cli/index.md#dstack-run). - -## Examples - -### Python version - -If you don't specify `image`, `dstack` uses the default Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. -The `python` property determines which default Docker image is used. - -
- -```yaml -type: dev-environment - -python: "3.11" - -ide: vscode -``` - -
- -!!! info "nvcc" - Note that the default Docker image doesn't bundle `nvcc`, which is required for building custom CUDA kernels. - To install it, use `conda install cuda`. - -### Docker image - -
- -```yaml -type: dev-environment - -image: ghcr.io/huggingface/text-generation-inference:latest - -ide: vscode -``` - -
- -??? info "Private registry" - - Use the `registry_auth` property to provide credentials for a private Docker registry. - - ```yaml - type: dev-environment - - image: ghcr.io/huggingface/text-generation-inference:latest - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 - - ide: vscode - ``` - -### Resources { #_resources } - -If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a -range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). - -
- -```yaml -type: dev-environment - -ide: vscode - -resources: - # 200GB or more RAM - memory: 200GB.. - - # 4 GPUs from 40GB to 80GB - gpu: 40GB..80GB:4 - - # Shared memory - shm_size: 16GB - - disk: 500GB -``` - -
- -The `gpu` property allows specifying not only memory size but also GPU names -and their quantity. Examples: `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). - -??? info "Google Cloud TPU" - To use TPUs, specify its architecture prefixed by `tpu-` via the `gpu` property. - - ```yaml - type: dev-environment - - ide: vscode - - resources: - gpu: tpu-v2-8 - ``` - - Currently, only 8 TPU cores can be specified, supporting single TPU device workloads. Multi-TPU support is coming soon. - -??? info "Shared memory" - If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure - `shm_size`, e.g. set it to `16GB`. - -### Environment variables - -
- -```yaml -type: dev-environment - -env: - - HUGGING_FACE_HUB_TOKEN - - HF_HUB_ENABLE_HF_TRANSFER=1 - -ide: vscode -``` - -
- -If you don't assign a value to an environment variable (see `HUGGING_FACE_HUB_TOKEN` above), -`dstack` will require the value to be passed via the CLI or set in the current process. - -For instance, you can define environment variables in a `.env` file and utilize tools like `direnv`. - -#### Default environment variables - -The following environment variables are available in any run and are passed by `dstack` by default: - -| Name | Description | -|-------------------------|-----------------------------------------| -| `DSTACK_RUN_NAME` | The name of the run | -| `DSTACK_REPO_ID` | The ID of the repo | -| `DSTACK_GPUS_NUM` | The total number of GPUs in the run | - -### Spot policy - -You can choose whether to use spot instances, on-demand instances, or any available type. - -
- -```yaml -type: dev-environment - -ide: vscode - -spot_policy: auto -``` - -
- -The `spot_policy` accepts `spot`, `on-demand`, and `auto`. The default for dev environments is `on-demand`. - -### Backends - -By default, `dstack` provisions instances in all configured backends. However, you can specify the list of backends: - -
- -```yaml -type: dev-environment - -ide: vscode - -backends: [aws, gcp] -``` - -
- -### Regions - -By default, `dstack` uses all configured regions. However, you can specify the list of regions: - -
- -```yaml -type: dev-environment - -ide: vscode - -regions: [eu-west-1, eu-west-2] -``` - -
- -### Volumes - -Volumes allow you to persist data between runs. -To attach a volume, simply specify its name using the `volumes` property and specify where to mount its contents: - -
- -```yaml -type: dev-environment - -ide: vscode - -volumes: - - name: my-new-volume - path: /volume_data -``` - -
- -Once you run this configuration, the contents of the volume will be attached to `/volume_data` inside the development -environment, and its contents will persist across runs. - -!!! info "Limitations" - When you're running a dev environment, task, or service with `dstack`, it automatically mounts the project folder contents - to `/workflow` (and sets that as the current working directory). Right now, `dstack` doesn't allow you to - attach volumes to `/workflow` or any of its subdirectories. - -The `dev-environment` configuration type supports many other options. See below. - -## Root reference - -#SCHEMA# dstack._internal.core.models.configurations.DevEnvironmentConfiguration - overrides: - show_root_heading: false - type: - required: true - -## `resources` - -#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: resources- - -## `resources.gpu` { #resources-gpu data-toc-label="resources.gpu" } - -#SCHEMA# dstack._internal.core.models.resources.GPUSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `resources.disk` { #resources-disk data-toc-label="resources.disk" } - -#SCHEMA# dstack._internal.core.models.resources.DiskSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `registry_auth` - -#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth - overrides: - show_root_heading: false - type: - required: true - -## `volumes` - -#SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint - overrides: - show_root_heading: false - type: - required: true diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md deleted file mode 100644 index 6be8b4d2b0..0000000000 --- a/docs/docs/reference/dstack.yml/gateway.md +++ /dev/null @@ -1,47 +0,0 @@ -# gateway - -The `gateway` configuration type allows creating and updating [gateways](../../concepts/services.md). 
- -> Configuration files must have a name ending with `.dstack.yml` (e.g., `.dstack.yml` or `gateway.dstack.yml` are both acceptable) -> and can be located in the project's root directory or any nested folder. -> Any configuration can be applied via [`dstack apply`](../cli/index.md#dstack-apply). - -## Examples - -
- -```yaml -type: gateway -name: example-gateway - -backend: aws -region: eu-west-1 -domain: example.com -``` - -
- - -## Root reference - -#SCHEMA# dstack._internal.core.models.gateways.GatewayConfiguration - overrides: - show_root_heading: false - type: - required: true - -## `certificate[type=lets-encrypt]` - -#SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate - overrides: - show_root_heading: false - type: - required: true - -## `certificate[type=acm]` - -#SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate - overrides: - show_root_heading: false - type: - required: true diff --git a/docs/docs/reference/dstack.yml/service.md b/docs/docs/reference/dstack.yml/service.md deleted file mode 100644 index 25345e431c..0000000000 --- a/docs/docs/reference/dstack.yml/service.md +++ /dev/null @@ -1,442 +0,0 @@ -# service - -The `service` configuration type allows running [services](../../concepts/services.md). - -> Configuration files must have a name ending with `.dstack.yml` (e.g., `.dstack.yml` or `serve.dstack.yml` are both acceptable) -> and can be located in the project's root directory or any nested folder. -> Any configuration can be run via [`dstack run . -f PATH`](../cli/index.md#dstack-run). - -## Examples - -### Python version - -If you don't specify `image`, `dstack` uses the default Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. -The `python` property determines which default Docker image is used. - -
- -```yaml -type: service - -python: "3.11" - -commands: - - python3 -m http.server - -port: 8000 -``` - -
- -!!! info "nvcc" - Note that the default Docker image doesn't bundle `nvcc`, which is required for building custom CUDA kernels. - To install it, use `conda install cuda`. - -### Docker image - -
- - ```yaml - type: service - - image: dstackai/base:py3.11-0.4-cuda-12.1 - - commands: - - python3 -m http.server - - port: 8000 - ``` - -
- -??? info "Private Docker registry" - - Use the `registry_auth` property to provide credentials for a private Docker registry. - - ```yaml - type: service - - image: dstackai/base:py3.11-0.4-cuda-12.1 - - commands: - - python3 -m http.server - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 - - port: 8000 - ``` - -### OpenAI-compatible interface { #model-mapping } - -By default, if you run a service, its endpoint is accessible at `https://.`. - -If you run a model, you can optionally configure the mapping to make it accessible via the -OpenAI-compatible interface. - -
- -```yaml -type: service - -python: "3.11" - -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -port: 8000 - -resources: - gpu: 24GB - -# Enable the OpenAI-compatible endpoint -model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf -``` - -
- -In this case, with such a configuration, once the service is up, you'll be able to access the model at -`https://gateway.` via the OpenAI-compatible interface. - -The `format` supports only `tgi` (Text Generation Inference) -and `openai` (if you are using Text Generation Inference or vLLM with OpenAI-compatible mode). - -??? info "Chat template" - - By default, `dstack` loads the [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating) - from the model's repository. If it is not present there, manual configuration is required. - - ```yaml - type: service - - image: ghcr.io/huggingface/text-generation-inference:latest - env: - - MODEL_ID=TheBloke/Llama-2-13B-chat-GPTQ - commands: - - text-generation-launcher --port 8000 --trust-remote-code --quantize gptq - port: 8000 - - resources: - gpu: 80GB - - # Enable the OpenAI-compatible endpoint - model: - type: chat - name: TheBloke/Llama-2-13B-chat-GPTQ - format: tgi - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' }}{% endif %}{% endfor %}" - eos_token: "" - ``` - - ##### Limitations - - Please note that model mapping is an experimental feature with the following limitations: - - 1. 
Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself. - 2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template). - - If you encounter any other issues, please make sure to file a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). - - -### Auto-scaling - -By default, `dstack` runs a single replica of the service. -You can configure the number of replicas as well as the auto-scaling rules. - -
- -```yaml -type: service - -python: "3.11" - -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -port: 8000 - -resources: - gpu: 24GB - -# Enable the OpenAI-compatible endpoint -model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf - -replicas: 1..4 -scaling: - metric: rps - target: 10 -``` - -
- -The [`replicas`](#replicas) property can be a number or a range. - -> The [`metric`](#metric) property of [`scaling`](#scaling) only supports the `rps` metric (requests per second). In this -> case `dstack` adjusts the number of replicas (scales up or down) automatically based on the load. - -Setting the minimum number of replicas to `0` allows the service to scale down to zero when there are no requests. - -### Resources { #_resources } - -If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a -range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). - -
- -```yaml -type: service - -python: "3.11" -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server - --model mistralai/Mixtral-8X7B-Instruct-v0.1 - --host 0.0.0.0 - --tensor-parallel-size 2 # Match the number of GPUs -port: 8000 - -resources: - # 2 GPUs of 80GB - gpu: 80GB:2 - - disk: 200GB - -# Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ - format: openai -``` - -
- -The `gpu` property allows specifying not only memory size but also GPU names -and their quantity. Examples: `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). - -??? info "Shared memory" - If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure - `shm_size`, e.g. set it to `16GB`. - -### Authorization - -By default, the service endpoint requires the `Authorization` header with `"Bearer "`. -Authorization can be disabled by setting `auth` to `false`. - -
- -```yaml -type: service - -python: "3.11" - -commands: - - python3 -m http.server - -port: 8000 - -auth: false -``` - -
- -### Environment variables - -
- -```yaml -type: service - -python: "3.11" - -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL=NousResearch/Llama-2-7b-chat-hf -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -port: 8000 - -resources: - gpu: 24GB -``` - -
- -If you don't assign a value to an environment variable (see `HUGGING_FACE_HUB_TOKEN` above), -`dstack` will require the value to be passed via the CLI or set in the current process. - -For instance, you can define environment variables in a `.env` file and utilize tools like `direnv`. - -#### Default environment variables - -The following environment variables are available in any run and are passed by `dstack` by default: - -| Name | Description | -|-------------------------|-----------------------------------------| -| `DSTACK_RUN_NAME` | The name of the run | -| `DSTACK_REPO_ID` | The ID of the repo | -| `DSTACK_GPUS_NUM` | The total number of GPUs in the run | - -### Spot policy - -You can choose whether to use spot instances, on-demand instances, or any available type. - -
- -```yaml -type: service - -commands: - - python3 -m http.server - -port: 8000 - -spot_policy: auto -``` - -
- -The `spot_policy` accepts `spot`, `on-demand`, and `auto`. The default for services is `auto`. - -### Backends - -By default, `dstack` provisions instances in all configured backends. However, you can specify the list of backends: - -
- -```yaml -type: service - -commands: - - python3 -m http.server - -port: 8000 - -backends: [aws, gcp] -``` - -
- -### Regions - -By default, `dstack` uses all configured regions. However, you can specify the list of regions: - -
- -```yaml -type: service - -commands: - - python3 -m http.server - -port: 8000 - -regions: [eu-west-1, eu-west-2] -``` - -
- -### Volumes - -Volumes allow you to persist data between runs. -To attach a volume, simply specify its name using the `volumes` property and specify where to mount its contents: - -
- -```yaml -type: service - -commands: - - python3 -m http.server - -port: 8000 - -volumes: - - name: my-new-volume - path: /volume_data -``` - -
- -Once you run this configuration, the contents of the volume will be attached to `/volume_data` inside the service, -and its contents will persist across runs. - -The `service` configuration type supports many other options. See below. - -## Root reference - -#SCHEMA# dstack._internal.core.models.configurations.ServiceConfiguration - overrides: - show_root_heading: false - type: - required: true - -## `model` - -#SCHEMA# dstack._internal.core.models.gateways.BaseChatModel - overrides: - show_root_heading: false - type: - required: true - -## `scaling` - -#SCHEMA# dstack._internal.core.models.configurations.ScalingSpec - overrides: - show_root_heading: false - type: - required: true - -## `resources` - -#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: resources- - -## `resources.gpu` { #resources-gpu data-toc-label="resources.gpu" } - -#SCHEMA# dstack._internal.core.models.resources.GPUSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `resources.disk` { #resources-disk data-toc-label="resources.disk" } - -#SCHEMA# dstack._internal.core.models.resources.DiskSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `registry_auth` - -#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth - overrides: - show_root_heading: false - type: - required: true - -## `volumes` - -#SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint - overrides: - show_root_heading: false - type: - required: true diff --git a/docs/docs/reference/dstack.yml/task.md b/docs/docs/reference/dstack.yml/task.md deleted file mode 100644 index 2ee08168a4..0000000000 --- a/docs/docs/reference/dstack.yml/task.md +++ /dev/null @@ -1,416 +0,0 @@ -# task - -The `task` configuration type allows running [tasks](../../concepts/tasks.md).
- -> Configuration files must have a name ending with `.dstack.yml` (e.g., `.dstack.yml` or `serve.dstack.yml` are both acceptable) -> and can be located in the project's root directory or any nested folder. -> Any configuration can be run via [`dstack run`](../cli/index.md#dstack-run). - -## Examples - -### Python version - -If you don't specify `image`, `dstack` uses the default Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. -The `python` property determines which default Docker image is used. - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py -``` - -
- -!!! info "nvcc" - Note that the default Docker image doesn't bundle `nvcc`, which is required for building custom CUDA kernels. - To install it, use `conda install cuda`. - -### Ports { #_ports } - -A task can configure ports. In this case, if the task is running an application on a port, `dstack run` -will securely allow you to access this port from your local machine through port forwarding. - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - tensorboard --logdir results/runs & - - python fine-tuning/qlora/train.py - -ports: - - 6000 -``` - -
- -When running it, `dstack run` forwards `6000` port to `localhost:6000`, enabling secure access. -See [tasks](../../concepts/tasks.md#configure-ports) for more detail. - -### Docker image - -
- -```yaml -type: task - -image: dstackai/base:py3.11-0.4-cuda-12.1 - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py -``` - -
- -??? info "Private registry" - Use the `registry_auth` property to provide credentials for a private Docker registry. - - ```yaml - type: task - - image: dstackai/base:py3.11-0.4-cuda-12.1 - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 - - commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - ``` - -### Resources { #_resources } - -If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a -range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). - -
- -```yaml -type: task - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - -resources: - # 200GB or more RAM - memory: 200GB.. - - # 4 GPUs from 40GB to 80GB - gpu: 40GB..80GB:4 - - # Shared memory - shm_size: 16GB - - disk: 500GB -``` - -
- -The `gpu` property allows specifying not only memory size but also GPU names -and their quantity. Examples: `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). - -??? info "Google Cloud TPU" - To use TPUs, specify its architecture prefixed by `tpu-` via the `gpu` property. - - ```yaml - type: task - - python: "3.11" - - commands: - - pip install torch~=2.3.0 torch_xla[tpu]~=2.3.0 torchvision -f https://storage.googleapis.com/libtpu-releases/index.html - - git clone --recursive https://github.com/pytorch/xla.git - - python3 xla/test/test_train_mp_imagenet.py --fake_data --model=resnet50 --num_epochs=1 - - resources: - gpu: tpu-v2-8 - ``` - - Currently, only 8 TPU cores can be specified, supporting single host workloads. Multi-host support is coming soon. - -??? info "Shared memory" - If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure - `shm_size`, e.g. set it to `16GB`. - -### Environment variables - -
- -```yaml -type: task - -python: "3.11" - -env: - - HUGGING_FACE_HUB_TOKEN - - HF_HUB_ENABLE_HF_TRANSFER=1 - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py -``` - -
- -If you don't assign a value to an environment variable (see `HUGGING_FACE_HUB_TOKEN` above), -`dstack` will require the value to be passed via the CLI or set in the current process. - -For instance, you can define environment variables in a `.env` file and utilize tools like `direnv`. - -##### Default environment variables - -The following environment variables are available in any run and are passed by `dstack` by default: - -| Name | Description | -|-------------------------|-----------------------------------------| -| `DSTACK_RUN_NAME` | The name of the run | -| `DSTACK_REPO_ID` | The ID of the repo | -| `DSTACK_GPUS_NUM` | The total number of GPUs in the run | -| `DSTACK_NODES_NUM` | The number of nodes in the run | -| `DSTACK_NODE_RANK` | The rank of the node | -| `DSTACK_MASTER_NODE_IP` | The internal IP address of the master node | - -### Distributed tasks { #_nodes } - -By default, the task runs on a single node. However, you can run it on a cluster of nodes. - -
- -```yaml -type: task - -# The size of the cluster -nodes: 2 - -python: "3.11" -env: - - HF_HUB_ENABLE_HF_TRANSFER=1 -commands: - - pip install -r requirements.txt - - torchrun - --nproc_per_node=$DSTACK_GPUS_PER_NODE - --node_rank=$DSTACK_NODE_RANK - --nnodes=$DSTACK_NODES_NUM - --master_addr=$DSTACK_MASTER_NODE_IP - --master_port=8008 resnet_ddp.py - --num_epochs 20 - -resources: - gpu: 24GB -``` - -
- -If you run the task, `dstack` first provisions the master node and then runs the other nodes of the cluster. -All nodes are provisioned in the same region. - -`dstack` is easy to use with `accelerate`, `torchrun`, and other distributed frameworks. All you need to do -is pass the corresponding environment variables such as `DSTACK_GPUS_PER_NODE`, `DSTACK_NODE_RANK`, `DSTACK_NODES_NUM`, -`DSTACK_MASTER_NODE_IP`, and `DSTACK_GPUS_NUM` (see [System environment variables](#default-environment-variables)). - -??? info "Backends" - Running on multiple nodes is supported only with `aws`, `gcp`, `azure`, `oci`, and instances added via - [`dstack pool add-ssh`](../../concepts/pools.md#adding-on-prem-clusters). - -### Arguments - -You can parameterize tasks with user arguments using `${{ run.args }}` in the configuration. - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py ${{ run.args }} -``` - -
- -Now, you can pass your arguments to the `dstack run` command: - -
- -```shell -$ dstack run . -f train.dstack.yml --train_batch_size=1 --num_train_epochs=100 -``` - -
- -### Web applications - -Here's an example of using `ports` to run web apps with `tasks`. - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip3 install streamlit - - streamlit hello - -ports: - - 8501 - -``` - -
- -### Spot policy - -You can choose whether to use spot instances, on-demand instances, or any available type. - -
- -```yaml -type: task - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - -spot_policy: auto -``` - -
- -The `spot_policy` accepts `spot`, `on-demand`, and `auto`. The default for tasks is `auto`. - -### Backends - -By default, `dstack` provisions instances in all configured backends. However, you can specify the list of backends: - -
- -```yaml -type: task - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - -backends: [aws, gcp] -``` - -
- -### Regions - -By default, `dstack` uses all configured regions. However, you can specify the list of regions: - -
- -```yaml -type: task - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - -regions: [eu-west-1, eu-west-2] -``` - -
- -### Volumes - -Volumes allow you to persist data between runs. -To attach a volume, simply specify its name using the `volumes` property and specify where to mount its contents: - -
- -```yaml -type: task - -python: "3.11" - -commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - -volumes: - - name: my-new-volume - path: /volume_data -``` - -
- -Once you run this configuration, the contents of the volume will be attached to `/volume_data` inside the task, -and its contents will persist across runs. - -!!! info "Limitations" - When you're running a dev environment, task, or service with `dstack`, it automatically mounts the project folder contents - to `/workflow` (and sets that as the current working directory). Right now, `dstack` doesn't allow you to - attach volumes to `/workflow` or any of its subdirectories. - -The `task` configuration type supports many other options. See below. - -## Root reference - -#SCHEMA# dstack._internal.core.models.configurations.TaskConfiguration - overrides: - show_root_heading: false - type: - required: true - -## `resources` - -#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: resources- - -## `resources.gpu` { #resources-gpu data-toc-label="resources.gpu" } - -#SCHEMA# dstack._internal.core.models.resources.GPUSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `resources.disk` { #resources-disk data-toc-label="resources.disk" } - -#SCHEMA# dstack._internal.core.models.resources.DiskSpecSchema - overrides: - show_root_heading: false - type: - required: true - -## `registry_auth` - -#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth - overrides: - show_root_heading: false - type: - required: true - -## `volumes[n]` - -#SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint - overrides: - show_root_heading: false - type: - required: true diff --git a/docs/docs/reference/dstack.yml/volume.md b/docs/docs/reference/dstack.yml/volume.md deleted file mode 100644 index 03351fb6eb..0000000000 --- a/docs/docs/reference/dstack.yml/volume.md +++ /dev/null @@ -1,46 +0,0 @@ -# volume - -The `volume` configuration type allows creating, registering, and updating volumes.
- -> Configuration files must have a name ending with `.dstack.yml` (e.g., `.dstack.yml` or `vol.dstack.yml` are both acceptable) -> and can be located in the project's root directory or any nested folder. -> Any configuration can be applied via [`dstack apply`](../cli/index.md#dstack-apply). - -## Examples - -### Creating a new volume { #create-volume } - -
- -```yaml -type: volume -name: my-aws-volume -backend: aws -region: eu-central-1 -size: 100GB -``` - -
- -### Registering an existing volume { #register-volume } - -
- -```yaml -type: volume -name: my-external-volume -backend: aws -region: eu-central-1 -volume_id: vol1235 -``` - -
- - -## Root reference - -#SCHEMA# dstack._internal.core.models.volumes.VolumeConfiguration - overrides: - show_root_heading: false - type: - required: true diff --git a/docs/docs/reference/profiles.yml.md b/docs/docs/reference/profiles.yml.md deleted file mode 100644 index a6c7a4990b..0000000000 --- a/docs/docs/reference/profiles.yml.md +++ /dev/null @@ -1,53 +0,0 @@ -# profiles.yml - -Sometimes, you may want to reuse the same parameters across different [`.dstack.yml`](dstack.yml.md) configurations. - -This can be achieved by defining those parameters in a profile. - -Profiles can be defined on the repository level (via the `.dstack/profiles.yml` file in the root directory of the -repository) or on the global level (via the `~/.dstack/profiles.yml` file). - -Any profile can be marked as default so that it will be applied automatically for any run. Otherwise, you can refer to a specific profile -via `--profile NAME` in `dstack run`. - -### Example - -
- -```yaml -profiles: - - name: my-profile - - # The spot policy can be "spot", "on-demand", or "auto" - spot_policy: auto - - # Limit the maximum price of the instance per hour - max_price: 1.5 - - # Stop any run if it runs longer than this duration - max_duration: 1d - - # Use only these backends - backends: [azure, lambda] - - # If set to true, this profile will be applied automatically - default: true -``` - -
- -The profile configuration supports many properties. See below. - -### Root reference - -#SCHEMA# dstack._internal.core.models.profiles.Profile - overrides: - show_root_heading: false - max_price: - type: 'Optional[float]' - -### `retry` - -#SCHEMA# dstack._internal.core.models.profiles.ProfileRetry - overrides: - show_root_heading: false diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md deleted file mode 100644 index 7781bf81de..0000000000 --- a/docs/docs/reference/server/config.yml.md +++ /dev/null @@ -1,955 +0,0 @@ -# ~/.dstack/server/config.yml - -The `~/.dstack/server/config.yml` file is used by the `dstack` server -to [configure](../../installation/index.md#configure-backends) cloud accounts. - -> The `dstack` server allows you to configure backends for multiple projects. -> If you don't need multiple projects, use only the `main` project. - -Each cloud account must be configured under the `backends` property of the respective project. -See the examples below. - -## Examples - -### AWS - -There are two ways to configure AWS: using an access key or using the default credentials. - -=== "Access key" - - Create an access key by following [this guide :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/cli/latest/userguide/cli-authentication-user.html#cli-authentication-user-get). - Once you've downloaded the `.csv` file with your IAM user's Access key ID and Secret access key, proceed to - configure the backend. - -
- - ```yaml - projects: - - name: main - backends: - - type: aws - creds: - type: access_key - access_key: KKAAUKLIZ5EHKICAOASV - secret_key: pn158lMqSBJiySwpQ9ubwmI6VUU3/W2fdJdFwfgO - ``` - -
- -=== "Default credentials" - - If you have default credentials set up (e.g. in `~/.aws/credentials`), configure the backend like this: - -
- - ```yaml - projects: - - name: main - backends: - - type: aws - creds: - type: default - ``` - -
- -??? info "VPC" - By default, `dstack` uses the default VPC. It's possible to customize it: - - === "vpc_name" - - ```yaml - projects: - - name: main - backends: - - type: aws - creds: - type: default - - vpc_name: my-vpc - ``` - - === "vpc_ids" - ```yaml - projects: - - name: main - backends: - - type: aws - creds: - type: default - - default_vpcs: true - vpc_ids: - us-east-1: vpc-0a2b3c4d5e6f7g8h - us-east-2: vpc-9i8h7g6f5e4d3c2b - us-west-1: vpc-4d3c2b1a0f9e8d7 - ``` - - For the regions without configured `vpc_ids`, enable default VPCs by setting `default_vpcs` to `true`. - -??? info "Required AWS permissions" - The following AWS policy permissions are sufficient for `dstack` to work: - - ``` - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "ec2:AttachVolume", - "ec2:AuthorizeSecurityGroupEgress", - "ec2:AuthorizeSecurityGroupIngress", - "ec2:CancelSpotInstanceRequests", - "ec2:CreateSecurityGroup", - "ec2:CreateTags", - "ec2:CreateVolume", - "ec2:DeleteVolume", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeImages", - "ec2:DescribeInstances", - "ec2:DescribeInstanceAttribute", - "ec2:DescribeRouteTables", - "ec2:DescribeSecurityGroups", - "ec2:DescribeSubnets", - "ec2:DescribeVpcs", - "ec2:DescribeVolumes", - "ec2:DetachVolume", - "ec2:RunInstances", - "ec2:TerminateInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "servicequotas:ListServiceQuotas", - "servicequotas:GetServiceQuota" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "elasticloadbalancing:CreateLoadBalancer", - "elasticloadbalancing:CreateTargetGroup", - "elasticloadbalancing:CreateListener", - "elasticloadbalancing:RegisterTargets", - "elasticloadbalancing:AddTags", - "elasticloadbalancing:DeleteLoadBalancer", - "elasticloadbalancing:DeleteTargetGroup", - "elasticloadbalancing:DeleteListener", - "elasticloadbalancing:DeregisterTargets" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - 
"acm:DescribeCertificate", - "acm:ListCertificates" - ], - "Resource": "*" - } - ] - } - ``` - - The `elasticloadbalancing:*` and `acm:*` permissions are only needed for provisioning gateways with ACM (AWS Certificate Manager) certificates. - -??? info "Private subnets" - By default, `dstack` utilizes public subnets and permits inbound SSH traffic exclusively for any provisioned instances. - If you want `dstack` to use private subnets, set `public_ips` to `false`. - - ```yaml - projects: - - name: main - backends: - - type: aws - creds: - type: default - - public_ips: false - ``` - - Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets - (e.g., through VPC peering). - -### Azure - -There are two ways to configure Azure: using a client secret or using the default credentials. - -=== "Client secret" - - A client secret can be created using the [Azure CLI :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): - - ```shell - SUBSCRIPTION_ID=... - az ad sp create-for-rbac \ - --name dstack-app \ - --role $DSTACK_ROLE \ - --scopes /subscriptions/$SUBSCRIPTION_ID \ - --query "{ tenant_id: tenant, client_id: appId, client_secret: password }" - ``` - - Once you have `tenant_id`, `client_id`, and `client_secret`, go ahead and configure the backend. - -
- - ```yaml - projects: - - name: main - backends: - - type: azure - subscription_id: 06c82ce3-28ff-4285-a146-c5e981a9d808 - tenant_id: f84a7584-88e4-4fd2-8e97-623f0a715ee1 - creds: - type: client - client_id: acf3f73a-597b-46b6-98d9-748d75018ed0 - client_secret: 1Kb8Q~o3Q2hdEvrul9yaj5DJDFkuL3RG7lger2VQ - ``` - -
- -=== "Default credentials" - - Obtain the `subscription_id` and `tenant_id` via the [Azure CLI :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): - - ```shell - az account show --query "{subscription_id: id, tenant_id: tenantId}" - ``` - - Then proceed to configure the backend: - -
- - ```yaml - projects: - - name: main - backends: - - type: azure - subscription_id: 06c82ce3-28ff-4285-a146-c5e981a9d808 - tenant_id: f84a7584-88e4-4fd2-8e97-623f0a715ee1 - creds: - type: default - ``` - -
- -If you don't know your `subscription_id`, run - -```shell -az account show --query "{subscription_id: id}" -``` - -??? info "Required Azure permissions" - The following Azure permissions are sufficient for `dstack` to work: - ``` - { - "properties": { - "roleName": "dstack-role", - "description": "Minimal required permissions for using Azure with dstack", - "assignableScopes": [ - "/subscriptions/${YOUR_SUBSCRIPTION_ID}" - ], - "permissions": [ - { - "actions": [ - "Microsoft.Authorization/*/read", - "Microsoft.Compute/availabilitySets/*", - "Microsoft.Compute/locations/*", - "Microsoft.Compute/virtualMachines/*", - "Microsoft.Compute/virtualMachineScaleSets/*", - "Microsoft.Compute/cloudServices/*", - "Microsoft.Compute/disks/write", - "Microsoft.Compute/disks/read", - "Microsoft.Compute/disks/delete", - "Microsoft.Network/networkSecurityGroups/*", - "Microsoft.Network/locations/*", - "Microsoft.Network/virtualNetworks/*", - "Microsoft.Network/networkInterfaces/*", - "Microsoft.Network/publicIPAddresses/*", - "Microsoft.Resources/subscriptions/resourceGroups/read", - "Microsoft.Resources/subscriptions/resourceGroups/write", - "Microsoft.Resources/subscriptions/read" - ], - "notActions": [], - "dataActions": [], - "notDataActions": [] - } - ] - } - } - ``` - -### GCP - -??? info "Enable APIs" - First, ensure the required APIs are enabled in your GCP `project_id`. - - ```shell - PROJECT_ID=... - gcloud config set project $PROJECT_ID - gcloud services enable cloudapis.googleapis.com - gcloud services enable compute.googleapis.com - ``` - -There are two ways to configure GCP: using a service account or using the default credentials. - -=== "Service account" - - To create a service account, follow [this guide :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/iam/docs/service-accounts-create). 
After setting up the service account [create a key :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/iam/docs/keys-create-delete) for it and download the corresponding JSON file. - - Then go ahead and configure the backend by specifying the downloaded file path. - -
- - ```yaml - projects: - - name: main - backends: - - type: gcp - project_id: gcp-project-id - creds: - type: service_account - filename: ~/.dstack/server/gcp-024ed630eab5.json - ``` - -
- -=== "Default credentials" - - Enable GCP application default credentials: - - ```shell - gcloud auth application-default login - ``` - - Then configure the backend like this: - -
- - ```yaml - projects: - - name: main - backends: - - type: gcp - project_id: gcp-project-id - creds: - type: default - ``` - -
- -If you don't know your GCP project ID, run - -```shell -gcloud projects list --format="json(projectId)" -``` - -=== "VPC" - -
- - ```yaml - projects: - - name: main - backends: - - type: gcp - project_id: gcp-project-id - creds: - type: default - - vpc_name: my-custom-vpc - ``` - -
- -=== "Shared VPC" - -
- - ```yaml - projects: - - name: main - backends: - - type: gcp - project_id: gcp-project-id - creds: - type: default - - vpc_name: my-custom-vpc - vpc_project_id: another-project-id - ``` - -
- - To use a shared VPC, that VPC has to be configured with two additional firewall rules: - - * Allow `INGRESS` traffic on port `22`, with the target tag `dstack-runner-instance` - * Allow `INGRESS` traffic on ports `22`, `80`, `443`, with the target tag `dstack-gateway-instance` - -??? info "Required GCP permissions" - The following GCP permissions are sufficient for `dstack` to work: - - ``` - compute.disks.create - compute.firewalls.create - compute.images.useReadOnly - compute.instances.create - compute.instances.delete - compute.instances.get - compute.instances.setLabels - compute.instances.setMetadata - compute.instances.setTags - compute.networks.get - compute.networks.updatePolicy - compute.regions.list - compute.subnetworks.list - compute.subnetworks.use - compute.subnetworks.useExternalIp - compute.zoneOperations.get - ``` - - If you plan to use TPUs, additional permissions are required: - - ``` - tpu.nodes.create - tpu.nodes.delete - tpu.nodes.get - tpu.operations.get - tpu.operations.list - ``` - - Also, the use of TPUs requires the `serviceAccountUser` role. - For TPU VMs, dstack will use the default service account. - -??? info "Private subnets" - By default, `dstack` utilizes public subnets and permits inbound SSH traffic exclusively for any provisioned instances. - If you want `dstack` to use private subnets, set `public_ips` to `false`. - - ```yaml - projects: - - name: main - backends: - - type: gcp - creds: - type: default - - public_ips: false - ``` - - Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets (e.g., through VPC peering). Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. - -### OCI - -There are two ways to configure OCI: using client credentials or using the default credentials. 
- -=== "Client credentials" - - Log into the [OCI Console :material-arrow-top-right-thin:{ .external }](https://cloud.oracle.com), go to `My profile`, - select `API keys`, and click `Add API key`. - - Once you add a key, you'll see the configuration file. Copy its values to configure the backend as follows: - -
- - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: client - user: ocid1.user.oc1..g5vlaeqfu47akmaafq665xsgmyaqjktyfxtacfxc4ftjxuca7aohnd2ev66m - tenancy: ocid1.tenancy.oc1..ajqsftvk4qarcfaak3ha4ycdsaahxmaita5frdwg3tqo2bcokpd3n7oizwai - region: eu-frankfurt-1 - fingerprint: 77:32:77:00:49:7c:cb:56:84:75:8e:77:96:7d:53:17 - key_file: ~/.oci/private_key.pem - ``` - -
- - Make sure to include either the path to your private key via `key_file` or the contents of the key via `key_content`. - -=== "Default credentials" - If you have default credentials set up in `~/.oci/config`, configure the backend like this: - -
- - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: default - ``` - -
- -??? info "Required OCI permissions" - - This is an example of a restrictive policy for a group of `dstack` users: - - ``` - Allow group to read compartments in tenancy where target.compartment.name = '' - Allow group to read marketplace-community-listings in compartment - Allow group to manage app-catalog-listing in compartment - Allow group to manage instances in compartment - Allow group to manage compute-capacity-reports in compartment - Allow group to manage volumes in compartment - Allow group to manage volume-attachments in compartment - Allow group to manage virtual-network-family in compartment - ``` - - To use this policy, create a compartment for `dstack` and specify it in `~/.dstack/server/config.yml`. - - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: default - compartment_id: ocid1.compartment.oc1..aaaaaaaa - ``` - -### Lambda - -Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` -button to create a new API key. - -Then, go ahead and configure the backend: - -
- -```yaml -projects: -- name: main - backends: - - type: lambda - creds: - type: api_key - api_key: eersct_yrpiey-naaeedst-tk-_cb6ba38e1128464aea9bcc619e4ba2a5.iijPMi07obgt6TZ87v5qAEj61RVxhd0p -``` - -
- -### TensorDock - -Log into your [TensorDock :material-arrow-top-right-thin:{ .external }](https://marketplace.tensordock.com/) account, click API in the sidebar, and use the `Create an Authorization` -section to create a new authorization key. - -Then, go ahead and configure the backend: - -
- -```yaml -projects: - - name: main - backends: - - type: tensordock - creds: - type: api_key - api_key: 248e621d-9317-7494-dc1557fa5825b-98b - api_token: FyBI3YbnFEYXdth2xqYRnQI7hiusssBC -``` - -
- -The `tensordock` backend supports on-demand instances only. Spot instance support coming soon. - -### Vast.ai - -Log into your [Vast.ai :material-arrow-top-right-thin:{ .external }](https://cloud.vast.ai/) account, click Account in the sidebar, and copy your -API Key. - -Then, go ahead and configure the backend: - -
- -```yaml -projects: -- name: main - backends: - - type: vastai - creds: - type: api_key - api_key: d75789f22f1908e0527c78a283b523dd73051c8c7d05456516fc91e9d4efd8c5 -``` - -
- -Also, the `vastai` backend supports on-demand instances only. Spot instance support coming soon. - -### RunPod - -Log into your [RunPod :material-arrow-top-right-thin:{ .external }](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click -the button to create a key. - -Then proceed to configuring the backend. - -
- -```yaml -projects: - - name: main - backends: - - type: runpod - creds: - type: api_key - api_key: US9XTPDIV8AR42MMINY8TCKRB8S4E7LNRQ6CAUQ9 -``` - -
- -### CUDO - -Log into your [CUDO Compute :material-arrow-top-right-thin:{ .external }](https://compute.cudo.org/) account, click API keys in the sidebar, and click the `Create an API key` button. - -Ensure you've created a project with CUDO Compute, then proceed to configuring the backend. - -
- -```yaml -projects: - - name: main - backends: - - type: cudo - project_id: my-cudo-project - creds: - type: api_key - api_key: 7487240a466624b48de22865589 -``` - -
- -### DataCrunch - -Log into your [DataCrunch :material-arrow-top-right-thin:{ .external }](https://cloud.datacrunch.io/signin) account, click Account Settings in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. - -Then, go ahead and configure the backend: - -
- -```yaml -projects: - - name: main - backends: - - type: datacrunch - creds: - type: api_key - client_id: xfaHBqYEsArqhKWX-e52x3HH7w8T - client_secret: B5ZU5Qx9Nt8oGMlmMhNI3iglK8bjMhagTbylZy4WzncZe39995f7Vxh8 -``` - -
- -### Kubernetes - -`dstack` supports both self-managed and managed Kubernetes clusters. - -??? info "Prerequisite" - To use GPUs with Kubernetes, the cluster must have the - [NVIDIA GPU Operator :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) installed. - - [//]: # (TODO: Provide short yet clear instructions. Elaborate on whether it works with Kind.) - -To configure a Kubernetes backend, specify the path to the kubeconfig file, -and the port that `dstack` can use for proxying SSH traffic. -In the case of a self-managed cluster, also specify the IP address of any node in the cluster. - -[//]: # (TODO: Mention that the Kind context has to be selected via `current-context` ) - -=== "Self-managed" - - Here's how to configure the backend to use a self-managed cluster. - -
- - ```yaml - projects: - - name: main - backends: - - type: kubernetes - kubeconfig: - filename: ~/.kube/config - networking: - ssh_host: localhost # The external IP address of any node - ssh_port: 32000 # Any port accessible outside of the cluster - ``` - -
- - The port specified to `ssh_port` must be accessible outside of the cluster. - - ??? info "Kind" - For example, if you are using Kind, make sure to add it via `extraPortMappings`: - -
- - ```yaml - kind: Cluster - apiVersion: kind.x-k8s.io/v1alpha4 - nodes: - - role: control-plane - extraPortMappings: - - containerPort: 32000 # Must be same as `ssh_port` - hostPort: 32000 # Must be same as `ssh_port` - ``` - -
- -[//]: # (TODO: Elaborate on the Kind's IP address on Linux) - -=== "Managed" - Here's how to configure the backend to use a managed cluster (AWS, GCP, Azure). - -
- - ```yaml - projects: - - name: main - backends: - - type: kubernetes - kubeconfig: - filename: ~/.kube/config - networking: - ssh_port: 32000 # Any port accessible outside of the cluster - ``` - -
- - The port specified to `ssh_port` must be accessible outside of the cluster. - - ??? info "EKS" - For example, if you are using EKS, make sure to add it via an ingress rule - of the corresponding security group: - - ```shell - aws ec2 authorize-security-group-ingress --group-id --protocol tcp --port 32000 --cidr 0.0.0.0/0 - ``` - -[//]: # (TODO: Elaborate on gateways, and what backends allow configuring them) - -[//]: # (TODO: Should we automatically detect ~/.kube/config) - -## Root reference - -#SCHEMA# dstack._internal.server.services.config.ServerConfig - overrides: - show_root_heading: false - -## `projects[n]` { #projects data-toc-label="projects" } - -#SCHEMA# dstack._internal.server.services.config.ProjectConfig - overrides: - show_root_heading: false - backends: - type: 'Union[AWSConfigInfoWithCreds, AzureConfigInfoWithCreds, GCPConfigInfoWithCreds, LambdaConfigInfoWithCreds, TensorDockConfigInfoWithCreds, VastAIConfigInfoWithCreds, KubernetesConfig]' - -## `projects[n].backends[type=aws]` { #aws data-toc-label="backends[type=aws]" } - -#SCHEMA# dstack._internal.server.services.config.AWSConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: aws- - -## `projects[n].backends[type=aws].creds` { #aws-creds data-toc-label="backends[type=aws].creds" } - -=== "Access key" - #SCHEMA# dstack._internal.core.models.backends.aws.AWSAccessKeyCreds - overrides: - show_root_heading: false - type: - required: true - -=== "Default" - #SCHEMA# dstack._internal.core.models.backends.aws.AWSDefaultCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=azure]` { #azure data-toc-label="backends[type=azure]" } - -#SCHEMA# dstack._internal.server.services.config.AzureConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: azure- - -## `projects[n].backends[type=azure].creds` { #azure-creds data-toc-label="backends[type=azure].creds" } - -=== "Client" - #SCHEMA# 
dstack._internal.core.models.backends.azure.AzureClientCreds - overrides: - show_root_heading: false - type: - required: true - -=== "Default" - #SCHEMA# dstack._internal.core.models.backends.azure.AzureDefaultCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=datacrunch]` { #datacrunch data-toc-label="backends[type=datacrunch]" } - -#SCHEMA# dstack._internal.server.services.config.DataCrunchConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: datacrunch- - -## `projects[n].backends[type=datacrunch].creds` { #datacrunch-creds data-toc-label="backends[type=datacrunch].creds" } - -#SCHEMA# dstack._internal.core.models.backends.datacrunch.DataCrunchAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=gcp]` { #gcp data-toc-label="backends[type=gcp]" } - -#SCHEMA# dstack._internal.server.services.config.GCPConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: gcp- - -## `projects[n].backends[type=gcp].creds` { #gcp-creds data-toc-label="backends[type=gcp].creds" } - -=== "Service account" - #SCHEMA# dstack._internal.server.services.config.GCPServiceAccountCreds - overrides: - show_root_heading: false - type: - required: true - -=== "Default" - #SCHEMA# dstack._internal.server.services.config.GCPDefaultCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=lambda]` { #lambda data-toc-label="backends[type=lambda]" } - -#SCHEMA# dstack._internal.server.services.config.LambdaConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: lambda- - -## `projects[n].backends[type=lambda].creds` { #lambda-creds data-toc-label="backends[type=lambda].creds" } - -#SCHEMA# dstack._internal.core.models.backends.lambdalabs.LambdaAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true - -## 
`projects[n].backends[type=oci]` { #oci data-toc-label="backends[type=oci]" } - -#SCHEMA# dstack._internal.server.services.config.OCIConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: oci- - -## `projects[n].backends[type=oci].creds` { #oci-creds data-toc-label="backends[type=oci].creds" } - -=== "Client" - #SCHEMA# dstack._internal.core.models.backends.oci.OCIClientCreds - overrides: - show_root_heading: false - type: - required: true - -=== "Default" - #SCHEMA# dstack._internal.core.models.backends.oci.OCIDefaultCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=tensordock]` { #tensordock data-toc-label="backends[type=tensordock]" } - -#SCHEMA# dstack._internal.server.services.config.TensorDockConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: tensordock- - -## `projects[n].backends[type=tensordock].creds` { #tensordock-creds data-toc-label="backends[type=tensordock].creds" } - -#SCHEMA# dstack._internal.core.models.backends.tensordock.TensorDockAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=vastai]` { #vastai data-toc-label="backends[type=vastai]" } - -#SCHEMA# dstack._internal.server.services.config.VastAIConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: vastai- - -## `projects[n].backends[type=vastai].creds` { #vastai-creds data-toc-label="backends[type=vastai].creds" } - -#SCHEMA# dstack._internal.core.models.backends.vastai.VastAIAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=cudo]` { #cudo data-toc-label="backends[type=cudo]" } - -#SCHEMA# dstack._internal.server.services.config.CudoConfig - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: cudo- - -## `projects[n].backends[type=cudo].creds` { #cudo-creds 
data-toc-label="backends[type=cudo].creds" } - -#SCHEMA# dstack._internal.core.models.backends.cudo.CudoAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=kubernetes]` { #kubernetes data-toc-label="backends[type=kubernetes]" } - -#SCHEMA# dstack._internal.server.services.config.KubernetesConfig - overrides: - show_root_heading: false - type: - required: true - -## `projects[n].backends[type=kubernetes].kubeconfig` { #kubeconfig data-toc-label="kubeconfig" } - -##SCHEMA# dstack._internal.server.services.config.KubeconfigConfig - overrides: - show_root_heading: false - -## `projects[n].backends[type=kubernetes].networking` { #networking data-toc-label="networking" } - -##SCHEMA# dstack._internal.core.models.backends.kubernetes.KubernetesNetworkingConfig - overrides: - show_root_heading: false diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 55c1afa3f4..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -template: home.html -title: AI container orchestration platform for everyone -hide: - - navigation - - toc - - footer ---- diff --git a/docs/overrides/.icons/custom/colored/github.svg b/docs/overrides/.icons/custom/colored/github.svg deleted file mode 100644 index 66e5797f00..0000000000 --- a/docs/overrides/.icons/custom/colored/github.svg +++ /dev/null @@ -1,2 +0,0 @@ - \ No newline at end of file diff --git a/docs/overrides/.icons/custom/github.svg b/docs/overrides/.icons/custom/github.svg deleted file mode 100644 index fe24d0e0d5..0000000000 --- a/docs/overrides/.icons/custom/github.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/overrides/assets/images/quotes/cudopete.png b/docs/overrides/assets/images/quotes/cudopete.png deleted file mode 100644 index dd447d0f3d..0000000000 Binary files a/docs/overrides/assets/images/quotes/cudopete.png and /dev/null differ diff --git a/docs/overrides/examples.html b/docs/overrides/examples.html 
deleted file mode 100644 index 6b933ac34e..0000000000 --- a/docs/overrides/examples.html +++ /dev/null @@ -1,182 +0,0 @@ -{% extends "main.html" %} - -{% block container %} - -{% endblock %} diff --git a/docs/overrides/home.html b/docs/overrides/home.html deleted file mode 100644 index eb503da20b..0000000000 --- a/docs/overrides/home.html +++ /dev/null @@ -1,830 +0,0 @@ -{% extends "landing.html" %} - -{% block scripts %} -{{ super() }} - - - - -{% endblock %} - -{% block content %} -
-
-
-
-

AI container orchestration platform for everyone

- -

- dstack is an open-source orchestration platform for development, training, and deployment of AI models in any cloud or data center. -

-
- - - - -
- -
-
-
-

Dev environments

-

Before scheduling a task or deploying a model, you may want to run code interactively.

- -

- Dev environments allow you to provision a remote machine set up with your code and favorite IDE - with just one command. -

- -

- Learn more -

-
- -
- -
-
-
- -
-
-
- -
- -
-

Tasks

- -

Tasks allow for convenient scheduling of various batch jobs, such as training, fine-tuning, or - data processing, as well as running web applications.

- -

You can run tasks on a single machine or on a cluster of nodes.

- -

- Learn more -

-
-
-
- -
-
-
-

Services

- -

- Services make it very easy to deploy any kind of model as public, - secure, and scalable endpoints. -

- -

- Learn more -

-
- -
- -
-
-
- -
-
-
- -
- -
-

Pools

- -

Pools enable the efficient reuse of cloud instances and on-premises servers across runs, - simplifying their management.

- -

- Learn more -

-
-
-
- -
-
-

Why community dstack

- -
-
-
- -
-

Andrew Spott

- -

ML Engineer at Stealth Startup

- -

- Thanks to @dstack, I get the convenience of having a personal Slurm cluster - and using budget-friendly cloud GPUs, without paying the super-high premiums charged by the big three. -

-
- -
-
- -
-

Alvaro Bartolome

- -

ML Engineer at Argilla

- -

- With @dstack it's incredibly easy to define a configuration within a repository - and run it without worrying about GPU availability. It lets you focus on - data and your research. -

-
- -
-
- -
-

Park Chansung

- -

ML Researcher at ETRI

- -

- Thanks to @dstack, I can effortlessly access the top GPU options across different clouds, - saving me time and money while pushing my AI work forward. -

-
- -
-
- -
-

Eckart Burgwedel

- -

CEO at Uberchord

- -

- With @dstack, running an open-source LLM or a dev environment on a cloud GPU is as - easy as running a local Docker container. - It combines the ease of Docker with the auto-scaling capabilities of K8s. -

-
- -
-
- -
-

Peter Hill

- -

Co-Founder at CUDO Compute

- -

- @dstack is instrumental in simplifying infrastructure provisioning and AI - model development. - If your organization is on the lookout for a platform to speed up the adoption of AI, I - wholeheartedly recommend @dstack. -

-
-
-
-
- - - - - - - -
-
-

Get started in a minute

-
- -
-
-
Open-source
-
- Self-hosted -
-
- Use your own cloud accounts or data centers. -
- -
-
- -
AWS
-
- -
- -
Azure
-
- -
- -
GCP
-
- -
- -
OCI
-
- -
- -
Lambda
-
- -
- -
TensorDock
-
- -
- -
Vast.ai
-
- -
- -
RunPod
-
- -
- -
CUDO
-
- -
- -
K8S
-
-
- -
-
-
-
Your cloud accounts
-
Your data centers
-
CLI & API
-
-
- - Install open-source -
Always free.
-
-
- - - -
-
dstack Sky
-
- Managed by dstack -
- -
- Access GPUs at the best possible rate. -
- -
-
- -
Marketplace
-
- - -
- -
-
-
- -
GPU marketplace
-
Your cloud accounts
-
Your data centers
-
CLI & API
-
-
- - - Sign up now - - -
Pay per compute.
-
-
-
-
- -
-

- Have questions, need help, or looking for an enterprise solution? -
- - Talk to an expert - -

-
- -
-
-

FAQ

-
- -
-
-
- What is dstack? -
-
-
- -
-

- dstack is an open-source orchestration platform for efficiently training and deploying AI - models across any cloud or data center. It provides a unified interface to manage AI model - development at any scale, whether in the cloud or on-premises. -

-

- With dstack, you can utilize various cloud providers or on-prem infrastructure, along with - any hardware, and leverage open-source frameworks and tools for both training and - deployment. -

-
-
- -
-
- What is dstack Sky? -
-
-
- -
-

- dstack Sky is a managed version of dstack. Unlike the open-source - dstack where you have to - set up and manage the server yourself, dstack Sky takes care of hosting the server on your - behalf. Most importantly, dstack Sky offers access to cloud GPUs through its marketplace, - ensuring you get them at competitive rates. -

-

- If needed, you can still configure dstack Sky to use your own cloud accounts or connect to - your on-prem servers. -

-
-
- -
-
- What is the pricing for dstack? -
-
-
- -
-

- The open-source version of dstack is completely free for commercial use. You can use it with - your own cloud accounts or data centers without any costs, and you can also rely on support - from the open-source community, Discord, and other channels. -

-

- If you use cloud GPUs through dstack Sky, you're only charged based on the compute prices - offered by the marketplace provider. dstack Sky does not add any commission on top of these - prices. -

-

- If you need enterprise-grade SLAs, premium support, or additional features, please - reach out to our - team. -

-
-
-
-
-
-
-{% endblock %} diff --git a/docs/overrides/landing.html b/docs/overrides/landing.html deleted file mode 100644 index 4eefbe6542..0000000000 --- a/docs/overrides/landing.html +++ /dev/null @@ -1,61 +0,0 @@ -{% extends "base.html" %} - -{% block header %} - {% include "header.html" %} -{% endblock %} - -{% block scripts %} - - -{{ super() }} -{% endblock %} - -{% block announce %} -🔥 Access GPUs at the best rate with dstack Sky! Learn more. -{% endblock %} - -{% block footer %} - -{% endblock %} \ No newline at end of file diff --git a/docs/overrides/main.html b/docs/overrides/main.html deleted file mode 100644 index e3cd0d09bd..0000000000 --- a/docs/overrides/main.html +++ /dev/null @@ -1,116 +0,0 @@ -{% extends "base.html" %} - -{% block container %} -
- {% if "navigation.path" in features %} - {% include "path.html" %} - {% endif %} -
- {% block content %} - {% include "partials/content.html" %} - - {% if page.previous_page or page.next_page %} - {% if page.meta and page.meta.hide %} - {% set hidden = "hidden" if "footer" in page.meta.hide %} - {% endif %} - - {% endif %} - {% endblock %} -
-
-{% endblock %} - -{% block header %} - {% include "header-2.html" %} -{% endblock %} - -{% block scripts %} - - -{{ super() }} -{% endblock %} - -{% block announce %} -🔥 Access GPUs at the best rate with dstack Sky! Learn more. -{% endblock %} - -{% block footer %} - -{% endblock %} \ No newline at end of file diff --git a/docs/overrides/path.html b/docs/overrides/path.html deleted file mode 100644 index 16233bb9a4..0000000000 --- a/docs/overrides/path.html +++ /dev/null @@ -1,20 +0,0 @@ -{#- - This file was automatically generated - do not edit --#} -{% import "partials/path-item.html" as item with context %} -{% if page.meta and page.meta.hide %} - {% set hidden = "hidden" if "path" in page.meta.hide %} -{% endif %} -{% set depth = page.ancestors | length %} -{% if nav.homepage %} - {% set depth = depth + 1 %} -{% endif %} -{% if depth > 1 %} - -{% endif %} \ No newline at end of file diff --git a/docs/pricing.md b/docs/pricing.md deleted file mode 100644 index 19ce885525..0000000000 --- a/docs/pricing.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -template: pricing.html -title: AI container orchestration platform for everyone -hide: - - navigation - - toc - - footer ---- diff --git a/docs/terms.md b/docs/terms.md deleted file mode 100644 index 630079cd81..0000000000 --- a/docs/terms.md +++ /dev/null @@ -1,461 +0,0 @@ ---- -hide: - - navigation - - footer ---- - -# Terms of service - -## Agreement to terms - -We are dstack GmbH ("**Company**," "**we**," "**us**," "**our**"), a company registered in Germany at Franz-Joseph-Straße, 11, Munich, -Bayern 80801. - -These Legal Terms constitute a legally binding agreement made between you, whether personally or on behalf of an -entity ("**you**"), and dstack GmbH, concerning your access to and use of the Services. You agree that by accessing the -Services, you have read, understood, and agreed to be bound by all of these Legal Terms. 
IF YOU DO NOT AGREE WITH ALL OF -THESE LEGAL TERMS, THEN YOU ARE EXPRESSLY PROHIBITED FROM USING THE SERVICES AND YOU MUST DISCONTINUE USE IMMEDIATELY. - -Supplemental terms and conditions or documents that may be posted on the Services from time to time are hereby expressly -incorporated herein by reference. We reserve the right, in our sole discretion, to make changes or modifications to -these Legal Terms from time to time. We will alert you about any changes by updating the "Last updated" date of these -Legal Terms, and you waive any right to receive specific notice of each such change. It is your responsibility to -periodically review these Legal Terms to stay informed of updates. You will be subject to, and will be deemed to have -been made aware of and to have accepted, the changes in any revised Legal Terms by your continued use of the Services -after the date such revised Legal Terms are posted. - -## 1. Our services - -The information provided when using the Services is not intended for distribution to or use by any person or entity in -any jurisdiction or country where such distribution or use would be contrary to law or regulation or which would subject -us to any registration requirement within such jurisdiction or country. Accordingly, those persons who choose to access -the Services from other locations do so on their own initiative and are solely responsible for compliance with local -laws, if and to the extent local laws are applicable. - -The Services are not tailored to comply with industry-specific regulations (Health Insurance Portability and -Accountability Act (HIPAA), Federal Information Security Management Act (FISMA), etc.), so if your interactions would be -subjected to such laws, you may not use the Services. You may not use the Services in a way that would violate the -Gramm-Leach-Bliley Act (GLBA). - -## 2. 
Intellectual property rights - -**Our intellectual property** - -We are the owner or the licensee of all intellectual property rights in our Services, including all source code, -databases, functionality, software, website designs, audio, video, text, photographs, and graphics in the Services -(collectively, the "Content"), as well as the trademarks, service marks, and logos contained therein (the "Marks"). - -Our Content and Marks are protected by copyright and trademark laws (and various other intellectual property rights and -unfair competition laws) and treaties in the United States and around the world. - -The Content and Marks are provided in or through the Services "AS IS" for your personal, non-commercial use or internal -business purpose only. - -**Your use of our Services** - -Subject to your compliance with these Legal Terms, including the "Prohibited activities" section below, we grant you a -non-exclusive, non-transferable, revocable license to: - - * access the Services; and - * download or print a copy of any portion of the Content to which you have properly gained access. -solely for your personal, non-commercial use or internal business purpose. - -Except as set out in this section or elsewhere in our Legal Terms, no part of the Services and no Content or Marks may -be copied, reproduced, aggregated, republished, uploaded, posted, publicly displayed, encoded, translated, transmitted, -distributed, sold, licensed, or otherwise exploited for any commercial purpose whatsoever, without our express prior -written permission. - -If you wish to make any use of the Services, Content, or Marks other than as set out in this section or elsewhere in our -Legal Terms, please address your request to: hello@dstack.ai. 
If we ever grant you the permission to post, reproduce, or -publicly display any part of our Services or Content, you must identify us as the owners or licensors of the Services, -Content, or Marks and ensure that any copyright or proprietary notice appears or is visible on posting, reproducing, or -displaying our Content. - -We reserve all rights not expressly granted to you in and to the Services, Content, and Marks. - -Any breach of these Intellectual Property Rights will constitute a material breach of our Legal Terms and your right to -use our Services will terminate immediately. - -**Your submissions** - -Please review this section and the "Prohibited activities" section carefully prior to using our Services to understand -the (a) rights you give us and (b) obligations you have when you post or upload any content through the Services. - -**Submissions**: By directly sending us any question, comment, suggestion, idea, feedback, or other information about the -Services ("Submissions"), you agree to assign to us all intellectual property rights in such Submission. You agree that -we shall own this Submission and be entitled to its unrestricted use and dissemination for any lawful purpose, -commercial or otherwise, without acknowledgment or compensation to you. 
- -**You are responsible for what you post or upload**: By sending us Submissions through any part of the Services you: -* confirm that you have read and agree with our "Prohibited activities" and will not post, send, publish, upload, or - transmit through the Services any Submission that is illegal, harassing, hateful, harmful, defamatory, obscene, - bullying, abusive, discriminatory, threatening to any person or group, sexually explicit, false, inaccurate, deceitful, - or misleading; -* to the extent permissible by applicable law, waive any and all moral rights to any such Submission; -* warrant that any such Submissions are original to you or that you have the necessary rights and licenses to submit such - Submissions and that you have full authority to grant us the above-mentioned rights in relation to your Submissions; and -* warrant and represent that your Submissions do not constitute confidential information. - -You are solely responsible for your Submissions and you expressly agree to reimburse us for any and all losses that we -may suffer because of your breach of (a) this section, (b) any third party’s intellectual property rights, or (c) -applicable law. - -## 3. User representations - -By using the Services, you represent and warrant that: (1) all registration information you submit will be true, -accurate, current, and complete; (2) you will maintain the accuracy of such information and promptly update such -registration information as necessary; (3) you have the legal capacity and you agree to comply with these Legal Terms; -(4) you are not a minor in the jurisdiction in which you reside; (5) you will not access the Services through automated -or non-human means, whether through a bot, script or otherwise; (6) you will not use the Services for any illegal or -unauthorized purpose; and (7) your use of the Services will not violate any applicable law or regulation. 
- -If you provide any information that is untrue, inaccurate, not current, or incomplete, we have the right to suspend or -terminate your account and refuse any and all current or future use of the Services (or any portion thereof). - -## 4. User registration - -You may be required to register to use the Services. You agree to keep your password confidential and will be -responsible for all use of your account and password. We reserve the right to remove, reclaim, or change a username you -select if we determine, in our sole discretion, that such username is inappropriate, obscene, or otherwise -objectionable. - -## 5. Purchases and payment - -We accept the following forms of payment: - -* Visa -* Mastercard - -You agree to provide current, complete, and accurate purchase and account information for all purchases made via the -Services. You further agree to promptly update account and payment information, including email address, payment method, -and payment card expiration date, so that we can complete your transactions and contact you as needed. Sales tax will be -added to the price of purchases as deemed required by us. We may change prices at any time. All payments shall be in US -dollars. - -You agree to pay all charges at the prices then in effect for your purchases and any applicable shipping fees, and you -authorize us to charge your chosen payment provider for any such amounts upon placing your order. We reserve the right -to correct any errors or mistakes in pricing, even if we have already requested or received payment. - -We reserve the right to refuse any order placed through the Services. We may, in our sole discretion, limit or cancel -quantities purchased per person, per household, or per order. These restrictions may include orders placed by or under -the same customer account, the same payment method, and/or orders that use the same billing or shipping address. 
We -reserve the right to limit or prohibit orders that, in our sole judgment, appear to be placed by dealers, resellers, or -distributors. - -## 6. Subscriptions - -**Billing and Renewal** - -e.g. by topping up their balance manually using their credit card. - -**Cancellation** - -You can cancel your subscription at any time by contacting us using the contact information provided below. Your -cancellation will take effect at the end of the current paid term. If you have any questions or are unsatisfied with our -Services, please email us at hello@dstack.ai . - -**Fee Changes** - -We may, from time to time, make changes to the subscription fee and will communicate any price changes to you in -accordance with applicable law. - -## 7. Software - -We may include software for use in connection with our Services. If such software is accompanied by an end user license -agreement ("EULA"), the terms of the EULA will govern your use of the software. If such software is not accompanied by a -EULA, then we grant to you a non-exclusive, revocable, personal, and non-transferable license to use such software -solely in connection with our services and in accordance with these Legal Terms. Any software and any related -documentation is provided "AS IS" without warranty of any kind, either express or implied, including, without -limitation, the implied warranties of merchantability, fitness for a particular purpose, or non-infringement. You accept -any and all risk arising out of use or performance of any software. You may not reproduce or redistribute any software -except in accordance with the EULA or these Legal Terms. - -## 8. Prohibited activities - -You may not access or use the Services for any purpose other than that for which we make the Services available. The -Services may not be used in connection with any commercial endeavors except those that are specifically endorsed or -approved by us. 
- -As a user of the Services, you agree not to: - -* Systematically retrieve data or other content from the Services to create or compile, directly or indirectly, a - collection, compilation, database, or directory without written permission from us. -* Trick, defraud, or mislead us and other users, especially in any attempt to learn sensitive account information such - as user passwords. -* Circumvent, disable, or otherwise interfere with security-related features of the Services, including features that - prevent or restrict the use or copying of any Content or enforce limitations on the use of the Services and/or the - Content contained therein. -* Disparage, tarnish, or otherwise harm, in our opinion, us and/or the Services. -* Use any information obtained from the Services in order to harass, abuse, or harm another person. -* Make improper use of our support services or submit false reports of abuse or misconduct. -* Use the Services in a manner inconsistent with any applicable laws or regulations. -* Engage in unauthorized framing of or linking to the Services. -* Upload or transmit (or attempt to upload or to transmit) viruses, Trojan horses, or other material, including - excessive use of capital letters and spamming (continuous posting of repetitive text), that interferes with any - party’s uninterrupted use and enjoyment of the Services or modifies, impairs, disrupts, alters, or interferes with the - use, features, functions, operation, or maintenance of the Services. -* Engage in any automated use of the system, such as using scripts to send comments or messages, or using any data - mining, robots, or similar data gathering and extraction tools. -* Delete the copyright or other proprietary rights notice from any Content. -* Attempt to impersonate another user or person or use the username of another user. 
-* Upload or transmit (or attempt to upload or to transmit) any material that acts as a passive or active information - collection or transmission mechanism, including without limitation, clear graphics interchange formats ("gifs"), 1×1 - pixels, web bugs, cookies, or other similar devices (sometimes referred to as "spyware" or "passive collection - mechanisms" or "pcms"). -* Interfere with, disrupt, or create an undue burden on the Services or the networks or services connected to the - Services. -* Harass, annoy, intimidate, or threaten any of our employees or agents engaged in providing any portion of the Services - to you. -* Attempt to bypass any measures of the Services designed to prevent or restrict access to the Services, or any portion - of the Services. -* Copy or adapt the Services' software, including but not limited to Flash, PHP, HTML, JavaScript, or other code. -* Except as permitted by applicable law, decipher, decompile, disassemble, or reverse engineer any of the software - comprising or in any way making up a part of the Services. -* Except as may be the result of standard search engine or Internet browser usage, use, launch, develop, or distribute - any automated system, including without limitation, any spider, robot, cheat utility, scraper, or offline reader that - accesses the Services, or use or launch any unauthorized script or other software. -* Use a buying agent or purchasing agent to make purchases on the Services. -* Make any unauthorized use of the Services, including collecting usernames and/or email addresses of users by - electronic or other means for the purpose of sending unsolicited email, or creating user accounts by automated means - or under false pretenses. -* Use the Services as part of any effort to compete with us or otherwise use the Services and/or the Content for any - revenue-generating endeavor or commercial enterprise. - -## 9. 
User generated contributions - -The Services does not offer users to submit or post content. - -## 10. Contribution license - -You and Services agree that we may access, store, process, and use any information and personal data that you provide -following the terms of the Privacy Policy and your choices (including settings). - -By submitting suggestions or other feedback regarding the Services, you agree that we can use and share such feedback -for any purpose without compensation to you. - -## 11. Social media - -As part of the functionality of the Services, you may link your account with online accounts you have with third-party -service providers (each such account, a "Third-Party Account") by either: (1) providing your Third-Party Account login -information through the Services; or (2) allowing us to access your Third-Party Account, as is permitted under the -applicable terms and conditions that govern your use of each Third-Party Account. You represent and warrant that you are -entitled to disclose your Third-Party Account login information to us and/or grant us access to your Third-Party -Account, without breach by you of any of the terms and conditions that govern your use of the applicable Third-Party -Account, and without obligating us to pay any fees or making us subject to any usage limitations imposed by the -third-party service provider of the Third-Party Account. By granting us access to any Third-Party Accounts, you -understand that (1) we may access, make available, and store (if applicable) any content that you have provided to and -stored in your Third-Party Account (the "Social Network Content") so that it is available on and through the Services -via your account, including without limitation any friend lists and (2) we may submit to and receive from your -Third-Party Account additional information to the extent you are notified when you link your account with the -Third-Party Account. 
Depending on the Third-Party Accounts you choose and subject to the privacy settings that you have -set in such Third-Party Accounts, personally identifiable information that you post to your Third-Party Accounts may be -available on and through your account on the Services. Please note that if a Third-Party Account or associated service -becomes unavailable or our access to such Third-Party Account is terminated by the third-party service provider, then -Social Network Content may no longer be available on and through the Services. You will have the ability to disable the -connection between your account on the Services and your Third-Party Accounts at any time. PLEASE NOTE THAT YOUR -RELATIONSHIP WITH THE THIRD-PARTY SERVICE PROVIDERS ASSOCIATED WITH YOUR THIRD-PARTY ACCOUNTS IS GOVERNED SOLELY BY YOUR -AGREEMENT(S) WITH SUCH THIRD-PARTY SERVICE PROVIDERS. We make no effort to review any Social Network Content for any -purpose, including but not limited to, for accuracy, legality, or non-infringement, and we are not responsible for any -Social Network Content. You acknowledge and agree that we may access your email address book associated with a -Third-Party Account and your contacts list stored on your mobile device or tablet computer solely for purposes of -identifying and informing you of those contacts who have also registered to use the Services. You can deactivate the -connection between the Services and your Third-Party Account by contacting us using the contact information below or -through your account settings (if applicable). We will attempt to delete any information stored on our servers that was -obtained through such Third-Party Account, except the username and profile picture that become associated with your -account. - -## 12. 
Third-party websites and content - -The Services may contain (or you may be sent via the Site) links to other websites ("Third-Party Websites") as well as -articles, photographs, text, graphics, pictures, designs, music, sound, video, information, applications, software, and -other content or items belonging to or originating from third parties ("Third-Party Content"). Such Third-Party Websites -and Third-Party Content are not investigated, monitored, or checked for accuracy, appropriateness, or completeness by -us, and we are not responsible for any Third-Party Websites accessed through the Services or any Third-Party Content -posted on, available through, or installed from the Services, including the content, accuracy, offensiveness, opinions, -reliability, privacy practices, or other policies of or contained in the Third-Party Websites or the Third-Party -Content. Inclusion of, linking to, or permitting the use or installation of any Third-Party Websites or any Third-Party -Content does not imply approval or endorsement thereof by us. If you decide to leave the Services and access the -Third-Party Websites or to use or install any Third-Party Content, you do so at your own risk, and you should be aware -these Legal Terms no longer govern. You should review the applicable terms and policies, including privacy and data -gathering practices, of any website to which you navigate from the Services or relating to any applications you use or -install from the Services. Any purchases you make through Third-Party Websites will be through other websites and from -other companies, and we take no responsibility whatsoever in relation to such purchases which are exclusively between -you and the applicable third party. You agree and acknowledge that we do not endorse the products or services offered on -Third-Party Websites and you shall hold us blameless from any harm caused by your purchase of such products or services. 
-Additionally, you shall hold us blameless from any losses sustained by you or harm caused to you relating to or -resulting in any way from any Third-Party Content or any contact with Third-Party Websites. - -## 13. Services management - -We reserve the right, but not the obligation, to: (1) monitor the Services for violations of these Legal Terms; (2) take -appropriate legal action against anyone who, in our sole discretion, violates the law or these Legal Terms, including -without limitation, reporting such user to law enforcement authorities; (3) in our sole discretion and without -limitation, refuse, restrict access to, limit the availability of, or disable (to the extent technologically feasible) -any of your Contributions or any portion thereof; (4) in our sole discretion and without limitation, notice, or -liability, to remove from the Services or otherwise disable all files and content that are excessive in size or are in -any way burdensome to our systems; and (5) otherwise manage the Services in a manner designed to protect our rights and -property and to facilitate the proper functioning of the Services. - -## 14. Privacy policy - -We care about data privacy and security. Please review our [Privacy Policy](https://dstack.ai/privacy/). By using the -Services, you agree to be bound by our Privacy Policy, which is incorporated into these Legal Terms. Please be advised -the Services are hosted in Germany and United States. If you access the Services from any other region of the world with -laws or other requirements governing personal data collection, use, or disclosure that differ from applicable laws in -Germany and United States, then through your continued use of the Services, you are transferring your data to Germany -and United States, and you expressly consent to have your data transferred to and processed in Germany and United -States. - -## 15. 
Term and termination - -These Legal Terms shall remain in full force and effect while you use the Services. WITHOUT LIMITING ANY OTHER PROVISION OF THESE LEGAL TERMS, WE RESERVE THE RIGHT TO, IN OUR SOLE DISCRETION AND WITHOUT NOTICE OR LIABILITY, DENY ACCESS TO AND USE OF THE SERVICES (INCLUDING BLOCKING CERTAIN IP ADDRESSES), TO ANY PERSON FOR ANY REASON OR FOR NO REASON, INCLUDING WITHOUT LIMITATION FOR BREACH OF ANY REPRESENTATION, WARRANTY, OR COVENANT CONTAINED IN THESE LEGAL TERMS OR OF ANY APPLICABLE LAW OR REGULATION. WE MAY TERMINATE YOUR USE OR PARTICIPATION IN THE SERVICES OR DELETE YOUR ACCOUNT AND ANY CONTENT OR INFORMATION THAT YOU POSTED AT ANY TIME, WITHOUT WARNING, IN OUR SOLE DISCRETION. - -If we terminate or suspend your account for any reason, you are prohibited from registering and creating a new account under your name, a fake or borrowed name, or the name of any third party, even if you may be acting on behalf of the third party. In addition to terminating or suspending your account, we reserve the right to take appropriate legal action, including without limitation pursuing civil, criminal, and injunctive redress. - -## 16. Modifications and interruptions - -We reserve the right to change, modify, or remove the contents of the Services at any time or for any reason at our sole -discretion without notice. However, we have no obligation to update any information on our Services. We will not be -liable to you or any third party for any modification, price change, suspension, or discontinuance of the Services. - -We cannot guarantee the Services will be available at all times. We may experience hardware, software, or other problems -or need to perform maintenance related to the Services, resulting in interruptions, delays, or errors. We reserve the -right to change, revise, update, suspend, discontinue, or otherwise modify the Services at any time or for any reason -without notice to you. 
You agree that we have no liability whatsoever for any loss, damage, or inconvenience caused by -your inability to access or use the Services during any downtime or discontinuance of the Services. Nothing in these -Legal Terms will be construed to obligate us to maintain and support the Services or to supply any corrections, updates, -or releases in connection therewith. - -## 17. Governing law - -These Legal Terms are governed by and interpreted following the laws of Germany, and the use of the United Nations -Convention of Contracts for the International Sales of Goods is expressly excluded. If your habitual residence is in the -EU, and you are a consumer, you additionally possess the protection provided to you by obligatory provisions of the law -in your country to residence. dstack GmbH and yourself both agree to submit to the non-exclusive jurisdiction of the -courts of Bayern, which means that you may make a claim to defend your consumer protection rights in regards to these -Legal Terms in Germany, or in the EU country in which you reside. - -## 18. Dispute resolution - -**Informal Negotiations** - -To expedite resolution and control the cost of any dispute, controversy, or claim related to these Legal Terms (each a " -Dispute" and collectively, the "Disputes") brought by either you or us (individually, a "Party" and collectively, the " -Parties"), the Parties agree to first attempt to negotiate any Dispute (except those Disputes expressly provided below) -informally for at least thirty (30) days before initiating arbitration. Such informal negotiations commence upon written -notice from one Party to the other Party. 
- -**Binding Arbitration** - -Any dispute arising from the relationships between the Parties to these Legal Terms shall be determined by one -arbitrator who will be chosen in accordance with the Arbitration and Internal Rules of the European Court of Arbitration -being part of the European Centre of Arbitration having its seat in Strasbourg, and which are in force at the time the -application for arbitration is filed, and of which adoption of this clause constitutes acceptance. The seat of -arbitration shall be Munich, Germany. The language of the proceedings shall be German. Applicable rules of -substantive law shall be the law of Germany. - -**Restrictions** - -The Parties agree that any arbitration shall be limited to the Dispute between the Parties individually. To the full -extent permitted by law, (a) no arbitration shall be joined with any other proceeding; (b) there is no right or -authority for any Dispute to be arbitrated on a class-action basis or to utilize class action procedures; and (c) there -is no right or authority for any Dispute to be brought in a purported representative capacity on behalf of the general -public or any other persons. - -**Exceptions to Informal Negotiations and Arbitration** - -The Parties agree that the following Disputes are not subject to the above provisions concerning informal negotiations -and binding arbitration: (a) any Disputes seeking to enforce or protect, or concerning the validity of, any of the -intellectual property rights of a Party; (b) any Dispute related to, or arising from, allegations of theft, piracy, -invasion of privacy, or unauthorized use; and (c) any claim for injunctive relief. 
If this provision is found to be -illegal or unenforceable, then neither Party will elect to arbitrate any Dispute falling within that portion of this -provision found to be illegal or unenforceable and such Dispute shall be decided by a court of competent jurisdiction -within the courts listed for jurisdiction above, and the Parties agree to submit to the personal jurisdiction of that -court. - -## 19. Corrections - -There may be information on the Services that contains typographical errors, inaccuracies, or omissions, including -descriptions, pricing, availability, and various other information. We reserve the right to correct any errors, -inaccuracies, or omissions and to change or update the information on the Services at any time, without prior notice. - -## 20. Disclaimer - -THE SERVICES ARE PROVIDED ON AN AS-IS AND AS-AVAILABLE BASIS. YOU AGREE THAT YOUR USE OF THE SERVICES WILL BE AT YOUR -SOLE RISK. TO THE FULLEST EXTENT PERMITTED BY LAW, WE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED, IN CONNECTION WITH -THE SERVICES AND YOUR USE THEREOF, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR -A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. 
WE MAKE NO WARRANTIES OR REPRESENTATIONS ABOUT THE ACCURACY OR COMPLETENESS -OF THE SERVICES' CONTENT OR THE CONTENT OF ANY WEBSITES OR MOBILE APPLICATIONS LINKED TO THE SERVICES AND WE WILL ASSUME -NO LIABILITY OR RESPONSIBILITY FOR ANY (1) ERRORS, MISTAKES, OR INACCURACIES OF CONTENT AND MATERIALS, (2) PERSONAL -INJURY OR PROPERTY DAMAGE, OF ANY NATURE WHATSOEVER, RESULTING FROM YOUR ACCESS TO AND USE OF THE SERVICES, (3) ANY -UNAUTHORIZED ACCESS TO OR USE OF OUR SECURE SERVERS AND/OR ANY AND ALL PERSONAL INFORMATION AND/OR FINANCIAL INFORMATION -STORED THEREIN, (4) ANY INTERRUPTION OR CESSATION OF TRANSMISSION TO OR FROM THE SERVICES, (5) ANY BUGS, VIRUSES, TROJAN -HORSES, OR THE LIKE WHICH MAY BE TRANSMITTED TO OR THROUGH THE SERVICES BY ANY THIRD PARTY, AND/OR (6) ANY ERRORS OR -OMISSIONS IN ANY CONTENT AND MATERIALS OR FOR ANY LOSS OR DAMAGE OF ANY KIND INCURRED AS A RESULT OF THE USE OF ANY -CONTENT POSTED, TRANSMITTED, OR OTHERWISE MADE AVAILABLE VIA THE SERVICES. WE DO NOT WARRANT, ENDORSE, GUARANTEE, OR -ASSUME RESPONSIBILITY FOR ANY PRODUCT OR SERVICE ADVERTISED OR OFFERED BY A THIRD PARTY THROUGH THE SERVICES, ANY -HYPERLINKED WEBSITE, OR ANY WEBSITE OR MOBILE APPLICATION FEATURED IN ANY BANNER OR OTHER ADVERTISING, AND WE WILL NOT -BE A PARTY TO OR IN ANY WAY BE RESPONSIBLE FOR MONITORING ANY TRANSACTION BETWEEN YOU AND ANY THIRD-PARTY PROVIDERS OF -PRODUCTS OR SERVICES. AS WITH THE PURCHASE OF A PRODUCT OR SERVICE THROUGH ANY MEDIUM OR IN ANY ENVIRONMENT, YOU SHOULD -USE YOUR BEST JUDGMENT AND EXERCISE CAUTION WHERE APPROPRIATE. - -## 21. Limitations of liability - -IN NO EVENT WILL WE OR OUR DIRECTORS, EMPLOYEES, OR AGENTS BE LIABLE TO YOU OR ANY THIRD PARTY FOR ANY DIRECT, INDIRECT, -CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, SPECIAL, OR PUNITIVE DAMAGES, INCLUDING LOST PROFIT, LOST REVENUE, LOSS OF DATA, -OR OTHER DAMAGES ARISING FROM YOUR USE OF THE SERVICES, EVEN IF WE HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
-NOTWITHSTANDING ANYTHING TO THE CONTRARY CONTAINED HEREIN, OUR LIABILITY TO YOU FOR ANY CAUSE WHATSOEVER AND REGARDLESS -OF THE FORM OF THE ACTION, WILL AT ALL TIMES BE LIMITED TO THE LESSER OF THE AMOUNT PAID, IF ANY, BY YOU TO US DURING -THE zero (0) MONTH PERIOD PRIOR TO ANY CAUSE OF ACTION ARISING OR $0.00 USD. CERTAIN US STATE LAWS AND INTERNATIONAL -LAWS DO NOT ALLOW LIMITATIONS ON IMPLIED WARRANTIES OR THE EXCLUSION OR LIMITATION OF CERTAIN DAMAGES. IF THESE LAWS -APPLY TO YOU, SOME OR ALL OF THE ABOVE DISCLAIMERS OR LIMITATIONS MAY NOT APPLY TO YOU, AND YOU MAY HAVE ADDITIONAL -RIGHTS. - -## 22. Indemnification - -You agree to defend, indemnify, and hold us harmless, including our subsidiaries, affiliates, and all of our respective officers, agents, partners, and employees, from and against any loss, damage, liability, claim, or demand, including reasonable attorneys’ fees and expenses, made by any third party due to or arising out of: (1) use of the Services; (2) breach of these Legal Terms; (3) any breach of your representations and warranties set forth in these Legal Terms; (4) your violation of the rights of a third party, including but not limited to intellectual property rights; or (5) any overt harmful act toward any other user of the Services with whom you connected via the Services. Notwithstanding the foregoing, we reserve the right, at your expense, to assume the exclusive defense and control of any matter for which you are required to indemnify us, and you agree to cooperate, at your expense, with our defense of such claims. We will use reasonable efforts to notify you of any such claim, action, or proceeding which is subject to this indemnification upon becoming aware of it. - -## 23. User data - -We will maintain certain data that you transmit to the Services for the purpose of managing the performance of the -Services, as well as data relating to your use of the Services. 
Although we perform regular routine backups of data, you -are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the -Services. You agree that we shall have no liability to you for any loss or corruption of any such data, and you hereby -waive any right of action against us arising from any such loss or corruption of such data. - -## 24. Electronic communications, transactions, and signatures - -Visiting the Services, sending us emails, and completing online forms constitute electronic communications. You consent -to receive electronic communications, and you agree that all agreements, notices, disclosures, and other communications -we provide to you electronically, via email and on the Services, satisfy any legal requirement that such communication -be in writing. YOU HEREBY AGREE TO THE USE OF ELECTRONIC SIGNATURES, CONTRACTS, ORDERS, AND OTHER RECORDS, AND TO -ELECTRONIC DELIVERY OF NOTICES, POLICIES, AND RECORDS OF TRANSACTIONS INITIATED OR COMPLETED BY US OR VIA THE SERVICES. -You hereby waive any rights or requirements under any statutes, regulations, rules, ordinances, or other laws in any -jurisdiction which require an original signature or delivery or retention of non-electronic records, or to payments or -the granting of credits by any means other than electronic means. - -## 25. California users and residents - -If any complaint with us is not satisfactorily resolved, you can contact the Complaint Assistance Unit of the Division -of Consumer Services of the California Department of Consumer Affairs in writing at 1625 North Market Blvd., Suite N -112, Sacramento, California 95834 or by telephone at (800) 952-5210 or (916) 445-1254. - -## 26. Miscellaneous - -These Legal Terms and any policies or operating rules posted by us on the Services or in respect to the Services -constitute the entire agreement and understanding between you and us. 
Our failure to exercise or enforce any right or -provision of these Legal Terms shall not operate as a waiver of such right or provision. These Legal Terms operate to -the fullest extent permissible by law. We may assign any or all of our rights and obligations to others at any time. We -shall not be responsible or liable for any loss, damage, delay, or failure to act caused by any cause beyond our -reasonable control. If any provision or part of a provision of these Legal Terms is determined to be unlawful, void, or -unenforceable, that provision or part of the provision is deemed severable from these Legal Terms and does not affect -the validity and enforceability of any remaining provisions. There is no joint venture, partnership, employment or -agency relationship created between you and us as a result of these Legal Terms or use of the Services. You agree that -these Legal Terms will not be construed against us by virtue of having drafted them. You hereby waive any and all -defenses you may have based on the electronic form of these Legal Terms and the lack of signing by the parties hereto to -execute these Legal Terms. - -## 27. Contact us - -In order to resolve a complaint regarding the Services or to receive further information regarding use of the Services, -please contact us at **hello@dstack.ai**. diff --git a/examples/.dstack.yml b/examples/.dstack.yml index 80b0a6dd9e..dc8a9c58c3 100644 --- a/examples/.dstack.yml +++ b/examples/.dstack.yml @@ -1,6 +1,12 @@ type: dev-environment -# This configuration launches a blank dev environment +name: cursor -python: "3.11" +python: 3.12 +ide: cursor -ide: vscode \ No newline at end of file +# Mount the repo directory to `/workflow` (the default working directory) +repos: + - .. 
+ +resources: + gpu: 1 diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 6f860b0cc8..0000000000 --- a/examples/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# Examples - -This folder contains examples showing how to use `dstack`. - -> [!IMPORTANT] -> Feel free to contribute your examples or enhance the existing ones—your PRs are warmly welcomed in this repo! - -## Getting started - -### Prerequisites - -To use the open-source version, make sure to [install the server](https://dstack.ai/docs/installation/) and configure backends. - -### Run examples - -#### Init the repo - -```shell -git clone https://github.com/dstackai/dstack -cd dstack -dstack init -``` - -#### Run a dev environment - -Here's how to run a dev environment with the current repo: - -```shell -dstack run . -f examples/.dstack.yml -``` - -#### Run other examples - -Here's how to run other examples, e.g. [`deployment/vllm`](deployment/vllm/): - -```shell -dstack run . -f examples/deployment/vllm/serve.dstack.yml -``` - -## Featured - -Here are some featured examples: - -- [Llama 3](llms/llama3) -- [Alignment Handbook](fine-tuning/alignment-handbook) -- [vLLM](deployment/vllm) -- [Axolotl](fine-tuning/axolotl) -- [TGI](deployment/tgi) -- [Ollama](deployment/ollama) - -Browse [deployment](deployment), [fine-tuning](fine-tuning), [llms](llms), and [misc](misc) for more. - -> [!IMPORTANT] -> Feel free to contribute your examples or enhance the existing ones—your PRs are warmly welcomed in this repo! 
\ No newline at end of file diff --git a/gateway/src/dstack/gateway/auth/__init__.py b/examples/__init__.py similarity index 100% rename from gateway/src/dstack/gateway/auth/__init__.py rename to examples/__init__.py diff --git a/examples/deployment/infinity/README.md b/examples/deployment/infinity/README.md deleted file mode 100644 index 5053be17ad..0000000000 --- a/examples/deployment/infinity/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Infinity - -The following command deploys a text embedding model as a service: - -```shell -dstack run . -f examples/deployment/infinity/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services). \ No newline at end of file diff --git a/examples/deployment/infinity/serve.dstack.yml b/examples/deployment/infinity/serve.dstack.yml deleted file mode 100644 index 66a5289a18..0000000000 --- a/examples/deployment/infinity/serve.dstack.yml +++ /dev/null @@ -1,13 +0,0 @@ -type: service -# This service deploys an embeddings model and a reranker model with Infinity. Learn more at https://dstack.ai/examples/infinity/ - -image: michaelf34/infinity:latest -env: - - INFINITY_MODEL_ID=BAAI/bge-small-en-v1.5;BAAI/bge-reranker-base; - - INFINITY_PORT=8000 -commands: - - infinity_emb v2 -port: 8000 - -resources: - gpu: 16GB diff --git a/examples/deployment/lorax/README.md b/examples/deployment/lorax/README.md deleted file mode 100644 index 893eaf0206..0000000000 --- a/examples/deployment/lorax/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# LoRAX - -[LoRAX](https://github.com/predibase/lorax) allows serving multiple fine-tuned models optimized on -the same endpoint by dynamically loading and switching LoRA adapters. - -## Service - -The following command deploys Mistral 7B Instruct as a base model via a service: - -```shell -dstack run . 
-f examples/deployment/lorax/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -## Task - -The following command runs Mistral 7B Instruct as a base model via a task: - -```shell -dstack run . -f examples/deployment/lorax/serve-task.dstack.yml -``` - -See the configuration at [serve-task.dstack.yml](serve-task.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services) or [tasks](https://dstack.ai/docs/concepts/tasks). \ No newline at end of file diff --git a/examples/deployment/lorax/serve-task.dstack.yml b/examples/deployment/lorax/serve-task.dstack.yml deleted file mode 100644 index 36ac0e950d..0000000000 --- a/examples/deployment/lorax/serve-task.dstack.yml +++ /dev/null @@ -1,18 +0,0 @@ -type: task - -image: ghcr.io/predibase/lorax:latest - -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL_ID=mistralai/Mistral-7B-Instruct-v0.1 - -commands: - - cd /usr/src - - lorax-launcher --port 8000 --model-id $MODEL_ID - -ports: - - 8000 - -resources: - gpu: 24GB - shm_size: 1GB diff --git a/examples/deployment/lorax/serve.dstack.yml b/examples/deployment/lorax/serve.dstack.yml deleted file mode 100644 index a48513cfdc..0000000000 --- a/examples/deployment/lorax/serve.dstack.yml +++ /dev/null @@ -1,17 +0,0 @@ -type: service - -image: ghcr.io/predibase/lorax:latest - -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL_ID=mistralai/Mistral-7B-Instruct-v0.1 - -commands: - - cd /usr/src - - lorax-launcher --port 8000 --model-id $MODEL_ID - -port: 8000 - -resources: - gpu: 24GB - shm_size: 1GB diff --git a/examples/deployment/ollama/.dstack.yml b/examples/deployment/ollama/.dstack.yml deleted file mode 100644 index 7775f8d2b6..0000000000 --- a/examples/deployment/ollama/.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: dev-environment -# Launches a dev environment to play with Ollama - -image: ollama/ollama - -ide: vscode - -resources: - gpu: 48GB..80GB diff --git 
a/examples/deployment/ollama/README.md b/examples/deployment/ollama/README.md deleted file mode 100644 index ce7bb3916d..0000000000 --- a/examples/deployment/ollama/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Ollama - -The following command deploys Mixtral 8x7B as a service: - -```shell -dstack run . -f examples/deployment/ollama/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services). \ No newline at end of file diff --git a/examples/deployment/ollama/serve.dstack.yml b/examples/deployment/ollama/serve.dstack.yml deleted file mode 100644 index 219d6222ef..0000000000 --- a/examples/deployment/ollama/serve.dstack.yml +++ /dev/null @@ -1,19 +0,0 @@ -type: service -# Deploys Mixtral 8x7B with Ollama - -image: ollama/ollama -commands: - - ollama serve & - - sleep 3 - - ollama pull mixtral - - fg -port: 11434 - -resources: - gpu: 48GB..80GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: mixtral - format: openai \ No newline at end of file diff --git a/examples/deployment/tei/README.md b/examples/deployment/tei/README.md deleted file mode 100644 index 030f249f66..0000000000 --- a/examples/deployment/tei/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Text Embeddings Inference - -The following command a text embedding model as a service: - -```shell -dstack run . -f examples/deployment/tae/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services). 
\ No newline at end of file diff --git a/examples/deployment/tei/serve.dstack.yml b/examples/deployment/tei/serve.dstack.yml deleted file mode 100644 index a0ac770d86..0000000000 --- a/examples/deployment/tei/serve.dstack.yml +++ /dev/null @@ -1,12 +0,0 @@ -type: service -# This service deploys an embeddings model with TEI - -image: ghcr.io/huggingface/text-embeddings-inference:latest -env: - - MODEL_ID=thenlper/gte-base -commands: - - text-embeddings-router --port 8000 -port: 8000 - -resources: - gpu: 16GB.. diff --git a/examples/deployment/tgi/.dstack.yml b/examples/deployment/tgi/.dstack.yml deleted file mode 100644 index 856faa0a4e..0000000000 --- a/examples/deployment/tgi/.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: dev-environment -# Launches a dev environment to play with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest - -ide: vscode - -resources: - gpu: 24GB \ No newline at end of file diff --git a/examples/deployment/tgi/README.md b/examples/deployment/tgi/README.md deleted file mode 100644 index 60fb18bfbd..0000000000 --- a/examples/deployment/tgi/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Text Generation Inference - -## Service - -The following command deploys Llama 7B Instruct as a service: - -```shell -dstack run . -f examples/deployment/tgi/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -## Task - -The following command runs Llama 7B Instruct as a task: - -```shell -dstack run . -f examples/deployment/tgi/serve-task.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve-task.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services) or [tasks](https://dstack.ai/docs/concepts/tasks). 
\ No newline at end of file diff --git a/examples/deployment/tgi/serve-gptq.dstack.yml b/examples/deployment/tgi/serve-gptq.dstack.yml deleted file mode 100644 index f1a811896b..0000000000 --- a/examples/deployment/tgi/serve-gptq.dstack.yml +++ /dev/null @@ -1,20 +0,0 @@ -type: service -# This service runs Llama 2 13B (quantized) with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - MODEL_ID=TheBloke/Mistral-7B-Instruct-v0.2-GPTQ -commands: - - text-generation-launcher --port 8000 --trust-remote-code --quantize gptq -port: 8000 - -resources: - gpu: 24GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/Mistral-7B-Instruct-v0.2-GPTQ - format: tgi - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' }}{% endif %}{% endfor %}" - eos_token: "" diff --git a/examples/deployment/tgi/serve-task.dstack.yml b/examples/deployment/tgi/serve-task.dstack.yml deleted file mode 100644 index 5376e635b1..0000000000 --- a/examples/deployment/tgi/serve-task.dstack.yml +++ /dev/null @@ -1,15 +0,0 @@ -type: task -# This task runs Llama 2 with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL_ID=mistralai/Mistral-7B-Instruct-v0.2 -commands: - - 
text-generation-launcher --port 8000 --trust-remote-code -ports: - - 8000 - -resources: - gpu: 24GB - diff --git a/examples/deployment/tgi/serve.dstack.yml b/examples/deployment/tgi/serve.dstack.yml deleted file mode 100644 index 81a27e4f0d..0000000000 --- a/examples/deployment/tgi/serve.dstack.yml +++ /dev/null @@ -1,19 +0,0 @@ -type: service -# This service runs Llama 2 with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL_ID=mistralai/Mistral-7B-Instruct-v0.2 -commands: - - text-generation-launcher --port 8000 --trust-remote-code -port: 8000 - -resources: - gpu: 24GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - format: tgi - type: chat - name: mistralai/Mistral-7B-Instruct-v0.2 diff --git a/examples/deployment/vllm/.dstack.yml b/examples/deployment/vllm/.dstack.yml deleted file mode 100644 index d59d5f6dec..0000000000 --- a/examples/deployment/vllm/.dstack.yml +++ /dev/null @@ -1,11 +0,0 @@ -type: dev-environment -# Launches a dev environment to play with vllm - -image: vllm/vllm-openai:latest -env: - - PYTHONPATH=/workspace - -ide: vscode - -resources: - gpu: 24GB diff --git a/examples/deployment/vllm/README.md b/examples/deployment/vllm/README.md deleted file mode 100644 index e87766a894..0000000000 --- a/examples/deployment/vllm/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# vLLM - -## Service - -The following command deploys Llama 7B Instruct as a service: - -```shell -dstack run . -f examples/deployment/vllm/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). - -## Task - -The following command runs Llama 7B Instruct as a task: - -```shell -dstack run . -f examples/deployment/vllm/serve-task.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve-task.dstack.yml). 
- -For more details, refer to [services](https://dstack.ai/docs/concepts/services) or [tasks](https://dstack.ai/docs/concepts/tasks). \ No newline at end of file diff --git a/examples/deployment/vllm/serve-task.dstack.yml b/examples/deployment/vllm/serve-task.dstack.yml deleted file mode 100644 index 60b90cccc3..0000000000 --- a/examples/deployment/vllm/serve-task.dstack.yml +++ /dev/null @@ -1,14 +0,0 @@ -type: task -# This task runs Llama 2 with vllm - -image: vllm/vllm-openai:latest -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf - - PYTHONPATH=/workspace -commands: - - python3 -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -ports: - - 8000 - -resources: - gpu: 24GB diff --git a/examples/deployment/vllm/serve.dstack.yml b/examples/deployment/vllm/serve.dstack.yml deleted file mode 100644 index 849c187d4a..0000000000 --- a/examples/deployment/vllm/serve.dstack.yml +++ /dev/null @@ -1,19 +0,0 @@ -type: service -# This service runs Llama 2 with vllm - -image: vllm/vllm-openai:latest -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf - - PYTHONPATH=/workspace -commands: - - python3 -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -port: 8000 - -resources: - gpu: 24GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf diff --git a/examples/distributed-training/torchrun/.dstack.yml b/examples/distributed-training/torchrun/.dstack.yml new file mode 100644 index 0000000000..4eccdb263e --- /dev/null +++ b/examples/distributed-training/torchrun/.dstack.yml @@ -0,0 +1,24 @@ +type: task +name: train-distrib + +nodes: 2 + +python: 3.12 +env: + - NCCL_DEBUG=INFO +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - uv pip install -r requirements.txt + - | + torchrun \ + 
--nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + multinode.py 50 10 + +resources: + gpu: 1..8 + shm_size: 16GB diff --git a/examples/fine-tuning/alignment-handbook/README.md b/examples/fine-tuning/alignment-handbook/README.md deleted file mode 100644 index 1e3df46732..0000000000 --- a/examples/fine-tuning/alignment-handbook/README.md +++ /dev/null @@ -1,111 +0,0 @@ -# Alignment Handbook - -[Alignment Handbook](https://github.com/huggingface/alignment-handbook) provides robust recipes to continue pretraining -and to align language models with human and AI preferences. It basically comes with two types of recipes and four types -of scripts that were used to create Hugging Face [Zephyr models](https://huggingface.co/HuggingFaceH4): - -- Accelerate recipes: configurations for [DeepSpeed Zero3](https://huggingface.co/docs/accelerate/v0.11.0/en/deepspeed), -- [FSDP(Fully Sharded Data Parallel)](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and multi GPU. - -- Training recipes: configurations of how GPT2, StarChat2-15B, Zephyr-141B-A35B, Zephyr-7B-Beta, and Zephyr-7B-Gemma - models were fine-tuned. 
- -- Scripts: [`run_cpt.py`](https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_cpt.py) for continual - pre-training, [`run_sft.py`](https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_sft.py) for - supervised fine-tuning, [`run_dpo.py`](https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py) - for aligning with preferences via [DPO](https://arxiv.org/abs/2305.18290), - and [`run_orpo.py`](https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_orpo.py) aligning - with [ORPO](https://arxiv.org/abs/2403.07691). - -## Basics - -Alignment Handbook provides all the code you need to run CPT, SFT, DPO, and ORPO within Hugging Face OSS ecosystem -such as `transformers`, `peft`, `accelerate`, `trl`. All you need to do is to modify recipes for accelerate and -training, and run appropriate script. - -For instance, if you want to QLoRA fine-tune Gemma 7B model on your own SFT dataset hosted on Hugging Face Hub, you can -prepare a `yaml` config file as [config.yaml](config.yaml). This config is based on the Zephyr-7B-Gemma recipe except -the following modification: - -- `dataset_mixer` field to point which SFT dataset to be used. -- `hub_model_id` and `output_dir` fields to point where the model and its checkpoints should be saved. -- `LoRA arguments` related fields to indicate that this fine-tuning is based on QLoRA methodology. 
- -With the `config.yaml` file configured, you can run the following command to QLoRA fine-tune Gemma 7B model on 2 -GPUs: - -```shell -ACCELERATE_LOG_LEVEL=info accelerate launch \ - --config_file config.yaml \ - --num_processes=2 \ - scripts/run_sft.py \ - recipes/{model_name}/{task}/config_qlora.yaml -``` - -For more details and other alignment methods, please check out the -alignment-handbook's [official repository](https://github.com/huggingface/alignment-handbook). - -## Running via `dstack` - -This example demonstrate how to run an Alignment Handbook recipe via `dstack`. - -First, define the [`train.dstack.yaml`](train.dstack.yaml) task configuration file as following: - -```yaml -type: task - -python: "3.11" - -env: - - HUGGING_FACE_HUB_TOKEN - - WANDB_API_KEY - -commands: - - conda install cuda - - git clone https://github.com/huggingface/alignment-handbook.git - - mkdir -p alignment-handbook/recipes/custom/ - - cp config.yaml alignment-handbook/recipes/custom/config.yaml - - - cd alignment-handbook - - python -m pip install . - - python -m pip install flash-attn --no-build-isolation - - - pip install wandb - - wandb login $WANDB_API_KEY - - - ACCELERATE_LOG_LEVEL=info accelerate launch - --config_file recipes/accelerate_configs/multi_gpu.yaml - --num_processes=$DSTACK_GPUS_NUM - scripts/run_sft.py - recipes/custom/config.yaml - -ports: - - 6006 - -resources: - gpu: - memory: 40GB - name: A6000 - count: 2 -``` - -> [!NOTE] -> Feel free to adjust `resources` to specify the required resources. - -The task clones the `huggingface/alignment-handbook` repo, and copies our local `config.yaml` to the recipies subfolder. -Then, the task installs dependencies, and launches the recipe. - -Our `config.yaml` sets `report_to` to `wandb` and `tensorboard`. That's why we the task also installs `wandb`. 
- -To run the task, use the following command: - -```shell -HUGGING_FACE_HUB_TOKEN=<...> \ -WANDB_API_KEY=<...> \ -dstack run . -f examples/fine-tuning/alignment-handbook/train.dstack.yaml -``` - -## Results - -- [merged_ds_coding](https://huggingface.co/datasets/chansung/merged_ds_coding): SFT dataset for solely coding task. It roughly contains 60k training dataset. -- [chansung/coding_llamaduo_60k_v0.2](https://huggingface.co/chansung/coding_llamaduo_60k_v0.2): QLoRA adapter for Gemma 7B with the exactly the same configuration as in [`config.yaml`](./config.yaml). This adapter is fine-tuned on the `merged_ds_coding` dataset with 2xA6000 GPUs via `dstack` Sky. \ No newline at end of file diff --git a/examples/fine-tuning/alignment-handbook/config.yaml b/examples/fine-tuning/alignment-handbook/config.yaml deleted file mode 100644 index abeab49c93..0000000000 --- a/examples/fine-tuning/alignment-handbook/config.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Model arguments -model_name_or_path: google/gemma-7b -model_revision: main -tokenizer_name_or_path: philschmid/gemma-tokenizer-chatml # Custom tokenizer with <|im_start|> and <|im_end|> tokens -torch_dtype: bfloat16 -use_flash_attention_2: true - -# LoRA arguments -load_in_4bit: true -use_peft: true -lora_r: 16 -lora_alpha: 16 -lora_dropout: 0.05 -lora_target_modules: -- q_proj -- k_proj -- v_proj -- o_proj -- gate_proj -- up_proj -- down_proj - -# Data training arguments -dataset_mixer: - chansung/merged_ds_coding: 1.0 -dataset_splits: -- train_sft -- test_sft -preprocessing_num_workers: 12 - -# SFT trainer config -bf16: true -dataset_kwargs: - add_special_tokens: false # We already wrap and in the chat template - append_concat_token: false # No need to add across samples -do_eval: true -evaluation_strategy: epoch -gradient_accumulation_steps: 2 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: 
chansung/coding_llamaduo_60k_v0.2 -hub_strategy: every_save -learning_rate: 2.0e-04 -log_level: info -logging_steps: 5 -logging_strategy: steps -lr_scheduler_type: cosine -max_seq_length: 2048 -max_steps: -1 -num_train_epochs: 5 -output_dir: data/coding_llamaduo_60k_v0.2 -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -push_to_hub: true -report_to: -- tensorboard -- wandb -save_strategy: "steps" -save_steps: 100 -save_total_limit: 1 -seed: 42 -warmup_ratio: 0.1 \ No newline at end of file diff --git a/examples/fine-tuning/alignment-handbook/train.dstack.yaml b/examples/fine-tuning/alignment-handbook/train.dstack.yaml deleted file mode 100644 index 67fdb119e4..0000000000 --- a/examples/fine-tuning/alignment-handbook/train.dstack.yaml +++ /dev/null @@ -1,35 +0,0 @@ -type: task - -python: "3.11" - -env: - - HUGGING_FACE_HUB_TOKEN - - WANDB_API_KEY - -commands: - - conda install cuda - - git clone https://github.com/huggingface/alignment-handbook.git - - mkdir -p alignment-handbook/recipes/custom/ - - cp config.yaml alignment-handbook/recipes/custom/config.yaml - - - cd alignment-handbook - - python -m pip install . 
- - python -m pip install flash-attn --no-build-isolation - - - pip install wandb - - wandb login $WANDB_API_KEY - - - ACCELERATE_LOG_LEVEL=info accelerate launch - --config_file recipes/accelerate_configs/multi_gpu.yaml - --num_processes=2 - scripts/run_sft.py - recipes/custom/config.yaml - -ports: - - 6006 - -resources: - gpu: - memory: 40GB - name: A6000 - count: 2 \ No newline at end of file diff --git a/examples/fine-tuning/axolotl/README.md b/examples/fine-tuning/axolotl/README.md deleted file mode 100644 index 3f8c5afea2..0000000000 --- a/examples/fine-tuning/axolotl/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Axolotl - -[`axolotl`](https://github.com/OpenAccess-AI-Collective/axolotl) streamlines the fine-tuning of AI models, offering support for multiple configurations and architectures. - -Furthermore, `axolotl` provides a set of [`yaml` examples](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/examples) for almost all kinds of LLMs such as LLaMA2 family, Gemma family, LLaMA3 family, Jamba, and so on. It's recommended to navigate through the examples to get a sense about the role of each parameters, and adjust them for your specific use cases. Also, it is worth checking out all configs/parameters options with a brief description from [this doc](https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/docs/config.qmd). - -The example below replicates the [FSDP+QLoRA on LLaMA3 70B](https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/examples/llama-3/qlora-fsdp-70b.yaml), except that here we use Llama3 8B. You can see the [`config.yaml`](config.yaml). - -## Running with `dstack` - -Running `axolotl` with `dstack` is very straightforward. 
- -First, define the [`train.dstack.yaml`](train.dstack.yaml) task configuration file as follows: - -```yaml -type: task - -image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.1 - -env: - - HUGGING_FACE_HUB_TOKEN - - WANDB_API_KEY - -commands: - - accelerate launch -m axolotl.cli.train config.yaml - -ports: - - 6006 - -resources: - gpu: - memory: 24GB.. - count: 2 -``` - -> [!NOTE] -> Feel free to adjust `resources` to specify the required resources. - -We are using the official Docker image provided by Axolotl team (`winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.1`). If you want to see other images, their official [repo](https://hub.docker.com/r/winglian/axolotl-cloud/tags). Note, `dstack` requires the CUDA driver to be 12.1+. - -To run the task, use the following command: - -```shell -HUGGING_FACE_HUB_TOKEN=<...> \ -WANDB_API_KEY=<...> \ -dstack run . -f examples/fine-tuning/axolotl/train.dstack.yaml -``` - -To push the final fine-tuned model to Hugging Face Hub, set `hub_model_id` in [`config.yaml`](config.yaml). - -### Building `axolotl` from sources - -If you'd like to build `axolot` from sources (e.g. if you intend to modify its source code), follow its [installation guide](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#condapip-venv). - -Example: - -```yaml -type: task - -python: 3.11 - -env: - - HUGGING_FACE_HUB_TOKEN - - WANDB_API_KEY - -commands: - - conda install cuda - - pip3 install torch torchvision torchaudio - - - git clone https://github.com/OpenAccess-AI-Collective/axolotl.git - - cd axolotl - - - pip3 install packaging - - pip3 install -e '.[flash-attn,deepspeed]' - - - accelerate launch -m axolotl.cli.train ../config.yaml - -ports: - - 6006 - -resources: - gpu: - memory: 24GB.. 
- count: 2 -``` diff --git a/examples/fine-tuning/axolotl/config.yaml b/examples/fine-tuning/axolotl/config.yaml deleted file mode 100644 index 5087bc4348..0000000000 --- a/examples/fine-tuning/axolotl/config.yaml +++ /dev/null @@ -1,82 +0,0 @@ -base_model: meta-llama/Meta-Llama-3-8B -model_type: LlamaForCausalLM -tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast - -load_in_8bit: false -load_in_4bit: true -strict: false - -datasets: - - path: tatsu-lab/alpaca - type: alpaca -dataset_prepared_path: last_run_prepared -val_set_size: 0.05 -output_dir: ./out/qlora-llama3-8B - -adapter: qlora -lora_model_dir: - -sequence_len: 512 -sample_packing: false -pad_to_sequence_len: true - -lora_r: 8 -lora_alpha: 16 -lora_dropout: 0.05 -lora_target_modules: -lora_target_linear: true -lora_fan_in_fan_out: - -wandb_project: dstack+axolotl -wandb_entity: -wandb_watch: -wandb_name: llama3-8b-fp16-fsdp+qlora -wandb_log_model: - -gradient_accumulation_steps: 4 -micro_batch_size: 1 -num_epochs: 4 -optimizer: adamw_torch -lr_scheduler: cosine -learning_rate: 0.00001 - -train_on_inputs: false -group_by_length: false -bf16: auto -fp16: -tf32: false - -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: true -early_stopping_patience: -resume_from_checkpoint: -local_rank: -logging_steps: 1 -xformers_attention: -flash_attention: true - -warmup_steps: 10 -evals_per_epoch: 4 -eval_table_size: -saves_per_epoch: 1 -debug: -deepspeed: -weight_decay: 0.0 -fsdp: - - full_shard - - auto_wrap -fsdp_config: - fsdp_limit_all_gathers: true - fsdp_sync_module_states: true - fsdp_offload_params: true - fsdp_use_orig_params: false - fsdp_cpu_ram_efficient_loading: true - fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP - fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer - fsdp_state_dict_type: FULL_STATE_DICT - fsdp_sharding_strategy: FULL_SHARD -special_tokens: - pad_token: <|end_of_text|> - -hub_model_id: chansung/axolotl_llama3_8b_fsdp_qlora \ No newline at end of file 
diff --git a/examples/fine-tuning/axolotl/train.dstack.yaml b/examples/fine-tuning/axolotl/train.dstack.yaml deleted file mode 100644 index 1239b496ce..0000000000 --- a/examples/fine-tuning/axolotl/train.dstack.yaml +++ /dev/null @@ -1,18 +0,0 @@ -type: task - -image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.1 - -env: - - HUGGING_FACE_HUB_TOKEN - - WANDB_API_KEY - -commands: - - accelerate launch -m axolotl.cli.train config.yaml - -ports: - - 6006 - -resources: - gpu: - memory: 24GB.. - count: 2 \ No newline at end of file diff --git a/examples/fine-tuning/pytorch-distributed/train.dstack.yml b/examples/fine-tuning/pytorch-distributed/train.dstack.yml deleted file mode 100644 index 12f38aa5a4..0000000000 --- a/examples/fine-tuning/pytorch-distributed/train.dstack.yml +++ /dev/null @@ -1,24 +0,0 @@ -type: task - -nodes: 2 - -commands: - - git clone https://github.com/r4victor/pytorch-distributed-resnet.git - - cd pytorch-distributed-resnet - - mkdir -p data - - cd data - - wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz - - tar -xvzf cifar-10-python.tar.gz - - cd .. - - pip3 install -r requirements.txt torch - - mkdir -p saved_models - - torchrun --nproc_per_node=$DSTACK_GPUS_PER_NODE - --node_rank=$DSTACK_NODE_RANK - --nnodes=$DSTACK_NODES_NUM - --master_addr=$DSTACK_MASTER_NODE_IP - --master_port=8008 resnet_ddp.py - --num_epochs 20 - -resources: - gpu: 1..2 - shm_size: 16GB diff --git a/examples/fine-tuning/qlora/README.md b/examples/fine-tuning/qlora/README.md deleted file mode 100644 index 133ec70bb4..0000000000 --- a/examples/fine-tuning/qlora/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# QLoRA - -The following command runs the task to fine-tune an LLM using QLoRA: - -```shell -dstack run . -f examples/fine-tuning/qlora/task.dstack.yml -``` - -See the configuration at [train.dstack.yml](train.dstack.yml). 
- -For more details, refer to [tasks](https://dstack.ai/docs/concepts/tasks). \ No newline at end of file diff --git a/examples/fine-tuning/qlora/requirements.txt b/examples/fine-tuning/qlora/requirements.txt deleted file mode 100644 index 3041543739..0000000000 --- a/examples/fine-tuning/qlora/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -accelerate==0.21.0 -peft==0.4.0 -bitsandbytes==0.40.2 -transformers==4.31.0 -trl==0.4.7 -scipy -tensorboard -sentencepiece -hf-transfer -protobuf<5 \ No newline at end of file diff --git a/examples/fine-tuning/qlora/train.dstack.yml b/examples/fine-tuning/qlora/train.dstack.yml deleted file mode 100644 index 029c472de0..0000000000 --- a/examples/fine-tuning/qlora/train.dstack.yml +++ /dev/null @@ -1,18 +0,0 @@ -type: task -# This task fine-tunes Llama 2 with QLoRA. Learn more at https://dstack.ai/examples/qlora/ - -python: "3.11" - -env: - - HUGGING_FACE_HUB_TOKEN - - HF_HUB_ENABLE_HF_TRANSFER=1 - -commands: - - pip install -r examples/fine-tuning/qlora/requirements.txt - - tensorboard --logdir results/runs & - - python examples/fine-tuning/qlora/train.py --merge_and_push ${{ run.args }} -ports: - - 6006 - -resources: - gpu: 16GB..24GB diff --git a/examples/llms/chat-ui/README.md b/examples/llms/chat-ui/README.md deleted file mode 100644 index 5af67a27e8..0000000000 --- a/examples/llms/chat-ui/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# Chat UI - -This example walks you through setting up [Chat UI](https://github.com/huggingface/chat-ui) locally -to chat with an LLM deployed using `dstack`. 
- -### Clone the repo - -```shell -git clone https://github.com/huggingface/chat-ui -cd chat-ui -``` - -### Run local MongoDB - -```shell -docker run -d -p 27017:27017 --name mongo-chatui mongo:latest -``` - -### Create `.env.local` - -```shell -MONGODB_URL=mongodb://localhost:27017 - -MODELS=`[ - { - "name": "", - "displayName": "My model", - "endpoints": [ - { - "type": "openai", - "baseURL" : "https://gateway.", - "apiKey": "" - } - ] - } -]` -``` - -Replace `` with your `dstack` gateway's domain (e.g. `.sky.dstack.ai` if you are using [dstack Sky](https://sky.dstack.ai)). - -Replace `` with your `dstack` user's token. - -Replace `` with the name of the deployed mode, - -### Run Chat UI - -```shell -npm run dev -``` - -Now you can conveniently chat with your model! 🤗 - -![](images/dstack-chat-ui-llama3.png) \ No newline at end of file diff --git a/examples/llms/chat-ui/images/dstack-chat-ui-llama3.png b/examples/llms/chat-ui/images/dstack-chat-ui-llama3.png deleted file mode 100644 index d59249cd51..0000000000 Binary files a/examples/llms/chat-ui/images/dstack-chat-ui-llama3.png and /dev/null differ diff --git a/examples/llms/code-llama/tgi-gptq.dstack.yml b/examples/llms/code-llama/tgi-gptq.dstack.yml deleted file mode 100644 index 95e9cef4bd..0000000000 --- a/examples/llms/code-llama/tgi-gptq.dstack.yml +++ /dev/null @@ -1,23 +0,0 @@ -type: service -# This service deploys Code Llama 70B (GPTQ) with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - MODEL_ID=TheBloke/CodeLlama-70B-Instruct-GPTQ -commands: - - text-generation-server download-weights $MODEL_ID --trust-remote-code - - text-generation-launcher - --port 80 - --trust-remote-code - --quantize gptq -port: 80 - -resources: - gpu: 80GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/CodeLlama-70B-Instruct-GPTQ - format: tgi - eos_token: "" diff --git 
a/examples/llms/deepseek/trl/amd/.dstack.yml b/examples/llms/deepseek/trl/amd/.dstack.yml new file mode 100644 index 0000000000..fe3dbc31ae --- /dev/null +++ b/examples/llms/deepseek/trl/amd/.dstack.yml @@ -0,0 +1,41 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + +# Commands of the task +commands: + - git clone https://github.com/huggingface/trl.git + - pip install trl + - pip install "numpy<2" + - pip install peft + - pip install wandb + - cd trl/trl/scripts + - python sft.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/Capybara + --learning_rate 2.0e-4 + --num_train_epochs 1 + --packing + --per_device_train_batch_size 2 + --gradient_accumulation_steps 8 + --gradient_checkpointing + --logging_steps 25 + --eval_strategy steps + --eval_steps 100 + --use_peft + --lora_r 32 + --lora_alpha 16 + --report_to wandb + --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml new file mode 100644 index 0000000000..4c719dcd5e --- /dev/null +++ b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml @@ -0,0 +1,59 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False +# Commands of the task +commands: + - git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install "numpy<2" + - pip install peft + - pip install wandb + - cd 
peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "deepseek-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/grpo.dstack.yml b/examples/llms/deepseek/trl/amd/grpo.dstack.yml new file mode 100644 index 0000000000..c1a76e528b --- /dev/null +++ b/examples/llms/deepseek/trl/amd/grpo.dstack.yml @@ -0,0 +1,32 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train-grpo + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# Mount files +files: + - grpo_train.py +# Commands of the task +commands: + - pip install trl + - pip install datasets + # numpy version less than 2 is required for the scipy installation with AMD. 
+ - pip install "numpy<2" + - python grpo_train.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/tldr + --per_device_train_batch_size 2 + --logging_steps 25 + --output_dir Deepseek-Distill-Qwen-1.5B-GRPO + --trust_remote_code + +# GRPO fine-tuning of DeepSeek-R1-Distill-Qwen-1.5B consumes 70% of VRAM +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/grpo_train.py b/examples/llms/deepseek/trl/amd/grpo_train.py new file mode 100644 index 0000000000..ab59291de6 --- /dev/null +++ b/examples/llms/deepseek/trl/amd/grpo_train.py @@ -0,0 +1,60 @@ +import argparse + +from datasets import load_dataset +from transformers import AutoModelForCausalLM +from trl import GRPOConfig, GRPOTrainer + + +def parse_args(): + parser = argparse.ArgumentParser(description="Train a model using GRPOTrainer.") + parser.add_argument( + "--model_name_or_path", + type=str, + required=True, + help="Path to the model or model identifier from huggingface.co/models", + ) + parser.add_argument( + "--dataset_name", type=str, required=True, help="Name of the dataset to use" + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=1, + help="Batch size per device for training", + ) + parser.add_argument("--logging_steps", type=int, default=10, help="Logging steps interval") + parser.add_argument( + "--output_dir", type=str, default="output", help="Output directory for the trained model" + ) + parser.add_argument( + "--trust_remote_code", action="store_true", help="Trust remote code when loading the model" + ) + return parser.parse_args() + + +def reward_len(completions, **kwargs): + return [abs(20 - len(completion)) for completion in completions] + + +def main(): + args = parse_args() + + dataset = load_dataset(args.dataset_name, split="train") + training_args = GRPOConfig( + output_dir=args.output_dir, + logging_steps=args.logging_steps, + per_device_train_batch_size=args.per_device_train_batch_size, + ) + + model = 
AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, trust_remote_code=args.trust_remote_code + ) + trainer = GRPOTrainer( + model=model, reward_funcs=reward_len, args=training_args, train_dataset=dataset + ) + + trainer.train() + + +if __name__ == "__main__": + main() diff --git a/examples/llms/deepseek/trl/nvidia/.dstack.yml b/examples/llms/deepseek/trl/nvidia/.dstack.yml new file mode 100644 index 0000000000..23564c3c86 --- /dev/null +++ b/examples/llms/deepseek/trl/nvidia/.dstack.yml @@ -0,0 +1,38 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +python: 3.12 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# Commands of the task +commands: + - git clone https://github.com/huggingface/trl.git + - pip install trl + - pip install peft + - pip install wandb + - cd trl/trl/scripts + - python sft.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/Capybara + --learning_rate 2.0e-4 + --num_train_epochs 1 + --packing + --per_device_train_batch_size 2 + --gradient_accumulation_steps 8 + --gradient_checkpointing + --logging_steps 25 + --eval_strategy steps + --eval_steps 100 + --use_peft + --lora_r 32 + --lora_alpha 16 + --report_to wandb + --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT + +resources: + gpu: 24GB diff --git a/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml new file mode 100644 index 0000000000..4ad972e813 --- /dev/null +++ b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml @@ -0,0 +1,64 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train-deepseek-v2 + +python: "3.10" + +nvcc: true +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False +# Commands of the task +commands: + 
- git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install peft + - pip install wandb + - pip install bitsandbytes + - cd peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --bf16 True + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "deepseek-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + --use_4bit_quantization True + --use_nested_quant True + --bnb_4bit_compute_dtype "bfloat16" + +resources: + # Consumes ~25GB of VRAM for QLoRA fine-tuning deepseek-ai/DeepSeek-V2-Lite + gpu: 48GB diff --git a/examples/llms/llama3/README.md b/examples/llms/llama3/README.md deleted file mode 100644 index 7dca55df01..0000000000 --- a/examples/llms/llama3/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Llama 3 - -## Deploy Llama 8B - -The following command deploys Llama 3 8B as a service: - -```shell -dstack run . -f examples/llms/llama3/ollama-8b.dstack.yml -``` - -See the configuration at [ollama-8b.dstack.yml](ollama-8b.dstack.yml). - -## Deploy Llama 70B - -The following command deploys Llama 3 70B as a service: - -```shell -dstack run . 
-f examples/llms/llama3/ollama-70b.dstack.yml -``` - -See the configuration at [ollama-70b.dstack.yml](ollama-70b.dstack.yml). - -For more details, refer to [services](https://dstack.ai/docs/concepts/services). - -## Run Chat UI - -Refer to [chat-ui](../chat-ui/README.md) to learn how to launch a UI for chatting with the LLM you've deployed to `dstack`. - -![](../chat-ui/images/dstack-chat-ui-llama3.png) \ No newline at end of file diff --git a/examples/llms/llama3/ollama-70b.dstack.yml b/examples/llms/llama3/ollama-70b.dstack.yml deleted file mode 100644 index b65e52f736..0000000000 --- a/examples/llms/llama3/ollama-70b.dstack.yml +++ /dev/null @@ -1,19 +0,0 @@ -type: service -# Deploys Llama 3 70B with Ollama - -image: ollama/ollama -commands: - - ollama serve & - - sleep 3 - - ollama pull llama3:70b - - fg -port: 11434 - -resources: - gpu: 80GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: llama3:70b - format: openai diff --git a/examples/llms/llama3/ollama-8b.dstack.yml b/examples/llms/llama3/ollama-8b.dstack.yml deleted file mode 100644 index f5841e6235..0000000000 --- a/examples/llms/llama3/ollama-8b.dstack.yml +++ /dev/null @@ -1,19 +0,0 @@ -type: service -# Deploys Llama 3 8B with Ollama - -image: ollama/ollama -commands: - - ollama serve & - - sleep 3 - - ollama pull llama3 - - fg -port: 11434 - -resources: - gpu: 16GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: llama3 - format: openai diff --git a/examples/llms/mixtral/README.md b/examples/llms/mixtral/README.md deleted file mode 100644 index 0451dd9a5e..0000000000 --- a/examples/llms/mixtral/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Mixtral 8x7B - -The following command deploys Mixtral 8x7B as a service: - -```shell -dstack run . -f examples/llms/mixtral/serve.dstack.yml -``` - -See the configuration at [serve.dstack.yml](serve.dstack.yml). 
- -For more details, refer to [services](https://dstack.ai/docs/concepts/services). \ No newline at end of file diff --git a/examples/llms/mixtral/tgi-gptq.dstack.yml b/examples/llms/mixtral/tgi-gptq.dstack.yml deleted file mode 100644 index 6f3945c10a..0000000000 --- a/examples/llms/mixtral/tgi-gptq.dstack.yml +++ /dev/null @@ -1,21 +0,0 @@ -type: service -# This service deploys Mixtral (GPTQ) with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - MODEL_ID=TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ -commands: - - text-generation-launcher - --port 80 - --trust-remote-code - --quantize gptq -port: 80 - -resources: - gpu: 25GB..50GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ - format: tgi diff --git a/examples/llms/mixtral/tgi.dstack.yml b/examples/llms/mixtral/tgi.dstack.yml deleted file mode 100644 index 31868de97f..0000000000 --- a/examples/llms/mixtral/tgi.dstack.yml +++ /dev/null @@ -1,23 +0,0 @@ -type: service -# This service deploys Mixtral with TGI - -image: ghcr.io/huggingface/text-generation-inference:latest -env: - - HUGGING_FACE_HUB_TOKEN - - MODEL_ID=mistralai/Mixtral-8x7B-Instruct-v0.1 -commands: - - text-generation-launcher - --port 80 - --trust-remote-code - --num-shard $DSTACK_GPUS_NUM -port: 80 - -resources: - gpu: 80GB:2 - disk: 200GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ - format: tgi diff --git a/examples/llms/mixtral/vllm.dstack.yml b/examples/llms/mixtral/vllm.dstack.yml deleted file mode 100644 index 59d0376a3f..0000000000 --- a/examples/llms/mixtral/vllm.dstack.yml +++ /dev/null @@ -1,23 +0,0 @@ -type: service -# This service deploys Mixtral with vLLM - -python: "3.11" -env: - - HUGGING_FACE_HUB_TOKEN -commands: - - pip install vllm - - python -m vllm.entrypoints.openai.api_server - --model mistralai/Mixtral-8x7B-Instruct-v0.1 
- --host 0.0.0.0 - --tensor-parallel-size $DSTACK_GPUS_NUM -port: 8000 - -resources: - gpu: 80GB:2 - disk: 200GB - -# (Optional) Enable the OpenAI-compatible endpoint -model: - type: chat - name: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ - format: openai \ No newline at end of file diff --git a/examples/misc/airflow/README.md b/examples/misc/airflow/README.md new file mode 100644 index 0000000000..fa1a15f762 --- /dev/null +++ b/examples/misc/airflow/README.md @@ -0,0 +1,81 @@ +# Airflow + +This example shows how to run the `dstack` CLI and API from Airflow pipelines. +It uses Airflow 2 and the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/tutorial/taskflow.html). + +## Preparing a virtual environment + +`dstack` and Airflow may have conflicting dependencies, so it's recommended to install +`dstack` to a separate virtual environment available to Airflow. + +Ensure the virtual environment created for `dstack` is +available to all the workers in case your Airflow runs in a distributed environment. + +## Running dstack CLI + +To run the `dstack` CLI from Airflow, +we can run it as regular bash commands using [BashOperator](https://airflow.apache.org/docs/apache-airflow/stable/howto/operator/bash.html). +The only special step here is that we need to activate a virtual environment before running `dstack`: + +```python + +DSTACK_VENV_PATH = "/path/to/dstack-venv" + +@dag(...) +def pipeline(...): + ... + @task.bash + def dstack_cli_apply_venv() -> str: + return ( + f"source {DSTACK_VENV_PATH}/bin/activate" + f" && cd {DSTACK_REPO_PATH}" + " && dstack apply -y -f task.dstack.yml --repo ." + ) +``` + +## Running dstack API + +To run the `dstack` API from Airflow, we can use [ExternalPythonOperator](https://airflow.apache.org/docs/apache-airflow/stable/howto/operator/python.html#externalpythonoperator). 
Specify a path to the Python binary inside the dstack virtual environment, and +Airflow will run the code inside that virtual environment: + +```python + +DSTACK_VENV_PYTHON_BINARY_PATH = f"{DSTACK_VENV_PATH}/bin/python" + +@dag(...) +def pipeline(...): + ... + @task.external_python(task_id="external_python", python=DSTACK_VENV_PYTHON_BINARY_PATH) + def dstack_api_submit_venv(): + from dstack.api import Client, Task + + task = Task( + name="my-airflow-task", + commands=[ + "echo 'Running dstack task via Airflow'", + "sleep 10", + "echo 'Finished'", + ] + ) + # Pick up config from `~/.dstack/config.yml` + # or set explicitly from Airflow Variables. + client = Client.from_config() + + run = client.runs.apply_configuration( + configuration=task, + ) + run.attach() + try: + for log in run.logs(): + sys.stdout.buffer.write(log) + sys.stdout.buffer.flush() + except KeyboardInterrupt: + run.stop(abort=True) + finally: + run.detach() +``` + +## Source code + +The source code for this example can be found in +[`examples/misc/airflow`](https://github.com/dstackai/dstack/blob/master/examples/misc/airflow). diff --git a/examples/misc/airflow/dags/dstack-repo/task.dstack.yml b/examples/misc/airflow/dags/dstack-repo/task.dstack.yml new file mode 100644 index 0000000000..53812c25e1 --- /dev/null +++ b/examples/misc/airflow/dags/dstack-repo/task.dstack.yml @@ -0,0 +1,5 @@ +type: task +commands: + - echo "Running dstack task via Airflow" + - sleep 10 + - echo "Finished" diff --git a/examples/misc/airflow/dags/dstack_tasks.py b/examples/misc/airflow/dags/dstack_tasks.py new file mode 100644 index 0000000000..8002e123b5 --- /dev/null +++ b/examples/misc/airflow/dags/dstack_tasks.py @@ -0,0 +1,96 @@ +import os +import sys +from datetime import datetime, timedelta + +from airflow.configuration import conf +from airflow.decorators import dag, task + +# dstack repo files are stored in the dags folder as an example. 
+# Put dstack repo files in another place if appropriate. +DAGS_DIR_PATH = os.path.join(conf.get("core", "DAGS_FOLDER")) +DSTACK_REPO_PATH = f"{DAGS_DIR_PATH}/dstack-repo" + +# A separate virtual environment should be created for dstack if dstack cannot be +# installed into the main Airflow environment. For example, due to incompatible dependencies. +DSTACK_VENV_PATH = "/path/to/dstack-venv" # Change this ! +DSTACK_VENV_PYTHON_BINARY_PATH = f"{DSTACK_VENV_PATH}/bin/python" + + +default_args = { + "owner": "airflow", + "retries": 1, + "retry_delay": timedelta(minutes=5), + "start_date": datetime(2024, 11, 13), +} + + +@dag( + default_args=default_args, + schedule_interval=timedelta(days=1), + catchup=False, + description="Examples of running dstack via Airflow", +) +def dstack_tasks(): + @task.bash + def dstack_cli_apply() -> str: + """ + This task shows how to run the dstack CLI when + dstack is installed into the main Airflow environment. + NOT RECOMMENDED since dstack and Airflow may have conflicting dependencies. + """ + return f"cd {DSTACK_REPO_PATH} && dstack apply -y -f task.dstack.yml --repo ." + + @task.bash + def dstack_cli_apply_venv() -> str: + """ + This task shows how to run the dstack CLI when + dstack is installed into a separate virtual environment available to Airflow. + """ + return ( + f". {DSTACK_VENV_PATH}/bin/activate" + f" && cd {DSTACK_REPO_PATH}" + " && dstack apply -y -f task.dstack.yml --repo ." + ) + + @task.external_python(task_id="external_python", python=DSTACK_VENV_PYTHON_BINARY_PATH) + def dstack_api_submit_venv(): + """ + This task shows how to run the dstack API when + dstack is installed into a separate virtual environment available to Airflow. + Note that the venv must have the `pendulum` package installed. 
+ """ + from dstack.api import Client, Task + + task = Task( + name="my-airflow-task", + commands=[ + "echo 'Running dstack task via Airflow'", + "sleep 10", + "echo 'Finished'", + ], + ) + # Pick up config from `~/.dstack/config.yml` + # or set explicitly from Ariflow Variables. + client = Client.from_config() + + run = client.runs.apply_configuration( + configuration=task, + ) + run.attach() + try: + for log in run.logs(): + sys.stdout.buffer.write(log) + sys.stdout.buffer.flush() + except KeyboardInterrupt: + run.stop(abort=True) + finally: + run.detach() + + # Uncomment a task you want to run + + # dstack_cli_apply() + # dstack_cli_apply_venv() + dstack_api_submit_venv() + + +dstack_tasks() diff --git a/examples/misc/http.server/serve.dstack.yml b/examples/misc/http.server/.dstack.yml similarity index 100% rename from examples/misc/http.server/serve.dstack.yml rename to examples/misc/http.server/.dstack.yml diff --git a/examples/misc/http.server/README.md b/examples/misc/http.server/README.md index a521a0d074..c98b57f4be 100644 --- a/examples/misc/http.server/README.md +++ b/examples/misc/http.server/README.md @@ -5,19 +5,19 @@ The following command runs `http.server` as a service: ```shell -dstack run . -f examples/misc/http.server/serve.dstack.yml +dstack apply -f examples/misc/http.server/.dstack.yml ``` -See the configuration at [serve.dstack.yml](serve.dstack.yml). +See the configuration at [.dstack.yml](.dstack.yml). ## Task The following command runs `http.server` as a task: ```shell -dstack run . -f examples/misc/http.server/serve-task.dstack.yml +dstack apply -f examples/misc/http.server/task.dstack.yml ``` -See the configuration at [serve.dstack.yml](serve-task.dstack.yml). +See the configuration at [task.dstack.yml](task.dstack.yml). -For more details, refer to [services](https://dstack.ai/docs/concepts/services) or [tasks](https://dstack.ai/docs/concepts/tasks). 
\ No newline at end of file +For more details, refer to [services](https://dstack.ai/docs/services) or [tasks](https://dstack.ai/docs/tasks). diff --git a/examples/misc/http.server/serve-task.dstack.yml b/examples/misc/http.server/task.dstack.yml similarity index 100% rename from examples/misc/http.server/serve-task.dstack.yml rename to examples/misc/http.server/task.dstack.yml diff --git a/examples/misc/ray/README.md b/examples/misc/ray/README.md new file mode 100644 index 0000000000..e25b87c206 --- /dev/null +++ b/examples/misc/ray/README.md @@ -0,0 +1,71 @@ +# Ray + +This example shows how to use `dstack` to spin up a [Ray](https://docs.ray.io/en/latest/ray-overview/index.html) cluster and run Ray jobs on it. + +## Create a fleet + +First create a fleet for the Ray cluster. We'll use one instance for a master node and three instances for worker nodes: + +```yaml +type: fleet +name: ray-fleet +nodes: 4 +placement: cluster +backends: [gcp] +resources: + cpu: 8.. + memory: 32GB.. + gpu: 1 +``` + +```shell +dstack apply -f fleet.dstack.yaml +``` + +## Launch Ray cluster + +The following `dstack` task launches Ray master and worker nodes. +`dstack` makes the Ray dashboard available at `localhost:8265`. + +```yaml +type: task +name: ray-cluster +nodes: 4 +commands: + - pip install -U "ray[default]" + - | + if [ $DSTACK_NODE_RANK = 0 ]; then + ray start --head --port=6379; + else + ray start --address=$DSTACK_MASTER_NODE_IP:6379 + fi +ports: + - 8265 # ray dashboard port +resources: + shm_size: 8GB +``` + +```shell +dstack apply -f cluster.dstack.yaml +``` + +## Run Ray jobs + +Install Ray locally: + +```shell +pip install ray +``` + +Now you can submit Ray jobs to the cluster available at `localhost:8265`: + +```shell +RAY_ADDRESS='http://localhost:8265' ray job submit \ +--working-dir . 
\ +--runtime-env-json='{"pip": ["ray[train]", "torch", "torchvision", "tqdm", "filelock"]}' \ +-- python tasks/pytorch-mnist.py +``` + +See more examples in the [Ray docs](https://docs.ray.io/en/latest/train/examples.html). + +Using Ray via `dstack` is a powerful way to get access to the rich Ray ecosystem while benefiting from `dstack`'s provisioning capabilities. diff --git a/examples/misc/ray/cluster.dstack.yaml b/examples/misc/ray/cluster.dstack.yaml new file mode 100644 index 0000000000..c30ff6bc25 --- /dev/null +++ b/examples/misc/ray/cluster.dstack.yaml @@ -0,0 +1,15 @@ +type: task +name: ray-cluster +nodes: 4 +commands: + - pip install -U "ray[default]" + - > + if [ $DSTACK_NODE_RANK = 0 ]; then + ray start --head --port=6379; + else + ray start --address=$DSTACK_MASTER_NODE_IP:6379 + fi +ports: + - 8265 # ray dashboard port +resources: + shm_size: 8GB diff --git a/examples/misc/ray/fleet.dstack.yaml b/examples/misc/ray/fleet.dstack.yaml new file mode 100644 index 0000000000..66712afb1d --- /dev/null +++ b/examples/misc/ray/fleet.dstack.yaml @@ -0,0 +1,9 @@ +type: fleet +name: ray-fleet +nodes: 4 +placement: cluster +backends: [gcp] +resources: + cpu: 8.. + memory: 32GB.. + gpu: 1 diff --git a/examples/misc/ray/tasks/pytorch-mnist.py b/examples/misc/ray/tasks/pytorch-mnist.py new file mode 100644 index 0000000000..b1049d885d --- /dev/null +++ b/examples/misc/ray/tasks/pytorch-mnist.py @@ -0,0 +1,168 @@ +# This example runs distributed training of a PyTorch model on Fashion MNIST with Ray Train. +# Source: https://docs.ray.io/en/latest/train/examples/pytorch/torch_fashion_mnist_example.html + +# Copyright 2023 Ray Authors + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict + +import ray.train +import torch +from filelock import FileLock +from ray.train import ScalingConfig +from ray.train.torch import TorchTrainer +from torch import nn +from torch.utils.data import DataLoader +from torchvision import datasets, transforms +from torchvision.transforms import Normalize, ToTensor +from tqdm import tqdm + + +def get_dataloaders(batch_size): + # Transform to normalize the input images + transform = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))]) + + with FileLock(os.path.expanduser("~/data.lock")): + # Download training data from open datasets + training_data = datasets.FashionMNIST( + root="~/data", + train=True, + download=True, + transform=transform, + ) + + # Download test data from open datasets + test_data = datasets.FashionMNIST( + root="~/data", + train=False, + download=True, + transform=transform, + ) + + # Create data loaders + train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True) + test_dataloader = DataLoader(test_data, batch_size=batch_size) + + return train_dataloader, test_dataloader + + +# Model Definition +class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Dropout(0.25), + nn.Linear(512, 512), + nn.ReLU(), + nn.Dropout(0.25), + nn.Linear(512, 10), + nn.ReLU(), + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return 
logits + + +def train_func_per_worker(config: Dict): + lr = config["lr"] + epochs = config["epochs"] + batch_size = config["batch_size_per_worker"] + + # Get dataloaders inside the worker training function + train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size) + + # [1] Prepare Dataloader for distributed training + # Shard the datasets among workers and move batches to the correct device + # ======================================================================= + train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader) + test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader) + + model = NeuralNetwork() + + # [2] Prepare and wrap your model with DistributedDataParallel + # Move the model to the correct GPU/CPU device + # ============================================================ + model = ray.train.torch.prepare_model(model) + + loss_fn = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9) + + # Model training loop + for epoch in range(epochs): + if ray.train.get_context().get_world_size() > 1: + # Required for the distributed sampler to shuffle properly across epochs. 
+ train_dataloader.sampler.set_epoch(epoch) + + model.train() + for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"): + pred = model(X) + loss = loss_fn(pred, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + model.eval() + test_loss, num_correct, num_total = 0, 0, 0 + with torch.no_grad(): + for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"): + pred = model(X) + loss = loss_fn(pred, y) + + test_loss += loss.item() + num_total += y.shape[0] + num_correct += (pred.argmax(1) == y).sum().item() + + test_loss /= len(test_dataloader) + accuracy = num_correct / num_total + + # [3] Report metrics to Ray Train + # =============================== + ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy}) + + +def train_fashion_mnist(num_workers=2, use_gpu=False): + global_batch_size = 32 + + train_config = { + "lr": 1e-3, + "epochs": 10, + "batch_size_per_worker": global_batch_size // num_workers, + } + + # Configure computation resources + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + + # Initialize a Ray TorchTrainer + trainer = TorchTrainer( + train_loop_per_worker=train_func_per_worker, + train_loop_config=train_config, + scaling_config=scaling_config, + ) + + # [4] Start distributed training + # Run `train_func_per_worker` on all workers + # ============================================= + result = trainer.fit() + print(f"Training result: {result}") + + +if __name__ == "__main__": + train_fashion_mnist(num_workers=3, use_gpu=True) diff --git a/examples/misc/ray/tasks/quicksort.py b/examples/misc/ray/tasks/quicksort.py new file mode 100644 index 0000000000..2975499f7d --- /dev/null +++ b/examples/misc/ray/tasks/quicksort.py @@ -0,0 +1,72 @@ +# This example sorts the list in a distributed and parallel fashion. 
+# Source: https://docs.ray.io/en/latest/ray-core/patterns/nested-tasks.html#code-example + +# Copyright 2023 Ray Authors + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +import ray +from numpy import random + + +def partition(collection): + # Use the last element as the pivot + pivot = collection.pop() + greater, lesser = [], [] + for element in collection: + if element > pivot: + greater.append(element) + else: + lesser.append(element) + return lesser, pivot, greater + + +def quick_sort(collection): + if len(collection) <= 200000: # magic number + return sorted(collection) + else: + lesser, pivot, greater = partition(collection) + lesser = quick_sort(lesser) + greater = quick_sort(greater) + return lesser + [pivot] + greater + + +@ray.remote +def quick_sort_distributed(collection): + # Tiny tasks are an antipattern. + # Thus, in our example we have a "magic number" to + # toggle when distributed recursion should be used vs + # when the sorting should be done in place. The rule + # of thumb is that the duration of an individual task + # should be at least 1 second. 
+ if len(collection) <= 200000: # magic number + return sorted(collection) + else: + lesser, pivot, greater = partition(collection) + lesser = quick_sort_distributed.remote(lesser) + greater = quick_sort_distributed.remote(greater) + return ray.get(lesser) + [pivot] + ray.get(greater) + + +for size in [200000, 4000000, 8000000]: + print(f"Array size: {size}") + unsorted = random.randint(1000000, size=(size)).tolist() + s = time.time() + quick_sort(unsorted) + print(f"Sequential execution: {(time.time() - s):.3f}") + s = time.time() + ray.get(quick_sort_distributed.remote(unsorted)) + print(f"Distributed execution: {(time.time() - s):.3f}") + print("--" * 10) diff --git a/examples/misc/spark/README.md b/examples/misc/spark/README.md new file mode 100644 index 0000000000..f9a37252e8 --- /dev/null +++ b/examples/misc/spark/README.md @@ -0,0 +1,62 @@ +# Spark + +This example shows how to use `dstack` to spin up an [Apache Spark](https://spark.apache.org/docs/latest/index.html) cluster and run tasks on it. + +## Create a fleet + +First create a fleet for the Spark cluster. We'll use one instance as the Spark master node and one instance as the Spark worker node. We'll also provision a third instance that we'll use to submit Spark apps to the cluster so that we don't need to run anything locally: + +```yaml +type: fleet +name: spark-fleet +nodes: 3 +placement: cluster +backends: [gcp] +``` + +```shell +dstack apply -f fleet.dstack.yaml +``` + +## Launch Spark cluster + +The following `dstack` task launches Spark master and worker nodes. `dstack` makes the cluster UI available at `localhost:8080`, and Spark apps can be submitted to `localhost:7077`. 
+ +```yaml +type: task +name: spark-cluster +image: spark +nodes: 2 +commands: + - export SPARK_MASTER_HOST=$DSTACK_MASTER_NODE_IP + - export SPARK_NO_DAEMONIZE=true + - if [ $DSTACK_NODE_RANK = 0 ]; then /opt/spark/sbin/start-master.sh; else /opt/spark/sbin/start-worker.sh spark://$DSTACK_MASTER_NODE_IP:7077; fi +ports: + - 7077 + - 8080 +``` + +```shell +dstack apply -f cluster.dstack.yaml +``` + +## Run Spark app + +If you have Spark installed locally, you can submit your code directly to `localhost:7077`. In this example, we run another `dstack` task to submit code to the Spark cluster. You should provide it with `SPARK_CLUSTER_IP` set to the IP address of the Spark cluster master node. (You can find it in the output of `dstack apply -f cluster.dstack.yaml`.) + +```yaml +type: task +name: spark-task +image: spark +env: + - SPARK_CLUSTER_IP +commands: + - pip install pyspark + - python3 tasks/words.py +``` + +```shell +export SPARK_CLUSTER_IP= + +dstack apply -f task.dstack.yaml +``` diff --git a/examples/misc/spark/cluster.dstack.yaml b/examples/misc/spark/cluster.dstack.yaml new file mode 100644 index 0000000000..386f8d2ff7 --- /dev/null +++ b/examples/misc/spark/cluster.dstack.yaml @@ -0,0 +1,11 @@ +type: task +name: spark-cluster +image: spark +nodes: 2 +commands: + - export SPARK_MASTER_HOST=$DSTACK_MASTER_NODE_IP + - export SPARK_NO_DAEMONIZE=true + - if [ $DSTACK_NODE_RANK = 0 ]; then /opt/spark/sbin/start-master.sh; else /opt/spark/sbin/start-worker.sh spark://$DSTACK_MASTER_NODE_IP:7077; fi +ports: + - 7077 + - 8080 diff --git a/examples/misc/spark/fleet.dstack.yaml b/examples/misc/spark/fleet.dstack.yaml new file mode 100644 index 0000000000..b821874c8e --- /dev/null +++ b/examples/misc/spark/fleet.dstack.yaml @@ -0,0 +1,5 @@ +type: fleet +name: spark-fleet +nodes: 3 +placement: cluster +backends: [gcp] diff --git a/examples/misc/spark/task.dstack.yaml b/examples/misc/spark/task.dstack.yaml new file mode 100644 index 0000000000..7f30020643 --- 
/dev/null +++ b/examples/misc/spark/task.dstack.yaml @@ -0,0 +1,8 @@ +type: task +name: spark-task +image: spark +env: + - SPARK_CLUSTER_IP +commands: + - pip install pyspark + - python3 tasks/words.py diff --git a/examples/misc/spark/tasks/words.py b/examples/misc/spark/tasks/words.py new file mode 100644 index 0000000000..72f9a8d28b --- /dev/null +++ b/examples/misc/spark/tasks/words.py @@ -0,0 +1,39 @@ +# A trivial Spark app example that counts word occurrences in a text + +import os +from operator import add + +from pyspark.sql import SparkSession + +SPARK_CLUSTER_IP = os.environ["SPARK_CLUSTER_IP"] + +# Initialize a Spark session +spark = SparkSession.builder.master(f"spark://{SPARK_CLUSTER_IP}:7077").getOrCreate() + +data = [ + "Apache Spark is a unified analytics engine", + "for large-scale data processing", + "Spark runs on Hadoop, Apache Mesos, Kubernetes", + "Standalone, or in the cloud", + "It can access diverse data sources", +] + +# Parallelize the data (create an RDD) +lines = spark.sparkContext.parallelize(data) + +# Split each line into words +words = lines.flatMap(lambda line: line.split()) + +# Map each word to a pair of (word, 1) +word_pairs = words.map(lambda word: (word, 1)) + +# Reduce by key (word) to count occurrences +word_counts = word_pairs.reduceByKey(add) + +# Collect the results and print +results = word_counts.collect() +for word, count in results: + print(f"{word}: {count}") + +# Stop the Spark session +spark.stop() diff --git a/examples/misc/streamlit/serve.dstack.yml b/examples/misc/streamlit/.dstack.yml similarity index 100% rename from examples/misc/streamlit/serve.dstack.yml rename to examples/misc/streamlit/.dstack.yml diff --git a/examples/misc/streamlit/README.md b/examples/misc/streamlit/README.md index 5f758b1a9e..855c84b61d 100644 --- a/examples/misc/streamlit/README.md +++ b/examples/misc/streamlit/README.md @@ -5,19 +5,19 @@ The following command runs `streamlit hello` as a service with disabled authentication: ```shell 
-dstack run . -f examples/misc/streamlit/serve.dstack.yml +dstack apply -f examples/misc/streamlit/.dstack.yml ``` -See the configuration at [serve.dstack.yml](serve.dstack.yml). +See the configuration at [.dstack.yml](.dstack.yml). ## Task The following command runs `streamlit hello` as a task: ```shell -dstack run . -f examples/misc/streamlit/serve-task.dstack.yml +dstack apply -f examples/misc/streamlit/task.dstack.yml ``` -See the configuration at [serve-task.dstack.yml](serve-task.dstack.yml). +See the configuration at [task.dstack.yml](task.dstack.yml). -For more details, refer to [services](https://dstack.ai/docs/concepts/services) or [tasks](https://dstack.ai/docs/concepts/tasks). \ No newline at end of file +For more details, refer to [services](https://dstack.ai/docs/services) or [tasks](https://dstack.ai/docs/tasks). diff --git a/examples/misc/streamlit/serve-task.dstack.yml b/examples/misc/streamlit/task.dstack.yml similarity index 100% rename from examples/misc/streamlit/serve-task.dstack.yml rename to examples/misc/streamlit/task.dstack.yml diff --git a/examples/models/gpt-oss/amd/120b.dstack.yml b/examples/models/gpt-oss/amd/120b.dstack.yml new file mode 100644 index 0000000000..662fb9f403 --- /dev/null +++ b/examples/models/gpt-oss/amd/120b.dstack.yml @@ -0,0 +1,31 @@ +type: service +name: gpt-oss-120b + +model: openai/gpt-oss-120b + +env: + - HF_TOKEN + - MODEL=openai/gpt-oss-120b + # To enable AITER, set below to 1. Otherwise, set it to 0. 
+ - VLLM_ROCM_USE_AITER=1 + # To enable AITER Triton unified attention + - VLLM_USE_AITER_UNIFIED_ATTENTION=1 + # below is required in order to enable AITER unified attention by disabling AITER MHA + - VLLM_ROCM_USE_AITER_MHA=0 +image: rocm/vllm-dev:open-mi300-08052025 +commands: + - | + vllm serve $MODEL \ + --tensor-parallel $DSTACK_GPUS_NUM \ + --no-enable-prefix-caching \ + --disable-log-requests \ + --compilation-config '{"full_cuda_graph": true}' +port: 8000 + +volumes: + # Cache downloaded models + - /root/.cache/huggingface:/root/.cache/huggingface + +resources: + gpu: MI300X:8 + shm_size: 32GB diff --git a/gateway/src/dstack/gateway/config/__init__.py b/examples/plugins/__init__.py similarity index 100% rename from gateway/src/dstack/gateway/config/__init__.py rename to examples/plugins/__init__.py diff --git a/examples/plugins/example_plugin/.python-version b/examples/plugins/example_plugin/.python-version new file mode 100644 index 0000000000..2c0733315e --- /dev/null +++ b/examples/plugins/example_plugin/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/examples/plugins/example_plugin/Dockerfile b/examples/plugins/example_plugin/Dockerfile new file mode 100644 index 0000000000..54b5660ecb --- /dev/null +++ b/examples/plugins/example_plugin/Dockerfile @@ -0,0 +1,9 @@ +# Example of including plugins into the dstack server Docker image +FROM dstackai/dstack:latest + +# Installing plugin from Docker context +COPY . plugins/example_plugin +RUN uv tool install "dstack[all]" --with plugins/example_plugin + +# Installing some other plugins from pypi/git +# RUN uv tool install "dstack[all]" --with plugin1 --with plugin2 diff --git a/examples/plugins/example_plugin/README.md b/examples/plugins/example_plugin/README.md new file mode 100644 index 0000000000..112cb3a293 --- /dev/null +++ b/examples/plugins/example_plugin/README.md @@ -0,0 +1,52 @@ +## Overview + +This is a basic `dstack` plugin example. 
+You can use it as a reference point when implementing new `dstack` plugins. + +## Steps + +1. Init the plugin package: + + ``` + uv init --library + ``` + +2. Define `ApplyPolicy` and `Plugin` subclasses: + + ```python + from dstack.plugins import ApplyPolicy, Plugin, RunSpec, get_plugin_logger + + + logger = get_plugin_logger(__name__) + + + class ExamplePolicy(ApplyPolicy): + + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + # ... + return spec + + + class ExamplePlugin(Plugin): + + def get_apply_policies(self) -> list[ApplyPolicy]: + return [ExamplePolicy()] + + ``` + +3. Specify a "dstack.plugins" entry point in `pyproject.toml`: + + ```toml + [project.entry-points."dstack.plugins"] + example_plugin = "example_plugin:ExamplePlugin" + ``` + +4. Make sure to install the plugin and enable it in the `server/config.yml`: + + ```yaml + plugins: + - example_plugin + projects: + - name: main + # ... + ``` diff --git a/examples/plugins/example_plugin/enterprise.Dockerfile b/examples/plugins/example_plugin/enterprise.Dockerfile new file mode 100644 index 0000000000..f59d900ae1 --- /dev/null +++ b/examples/plugins/example_plugin/enterprise.Dockerfile @@ -0,0 +1,9 @@ +# Example of including plugins into the dstack Enterprise Docker image +FROM ghcr.io/dstackai/dstack-enterprise:latest + +# Installing plugin from Docker context +COPY . 
plugins/example_plugin +RUN uv pip install plugins/example_plugin + +# Installing some other plugins from pypi/git +# RUN uv pip install plugin-name diff --git a/examples/plugins/example_plugin/pyproject.toml b/examples/plugins/example_plugin/pyproject.toml new file mode 100644 index 0000000000..66f954a59a --- /dev/null +++ b/examples/plugins/example_plugin/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "example-plugin" +version = "0.1.0" +description = "A dstack plugin example" +readme = "README.md" +authors = [ + { name = "Victor Skvortsov", email = "victor@dstack.ai" } +] +requires-python = ">=3.10" +dependencies = [] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project.entry-points."dstack.plugins"] +example_plugin = "example_plugin:ExamplePlugin" diff --git a/examples/plugins/example_plugin/src/example_plugin/__init__.py b/examples/plugins/example_plugin/src/example_plugin/__init__.py new file mode 100644 index 0000000000..f431e5c282 --- /dev/null +++ b/examples/plugins/example_plugin/src/example_plugin/__init__.py @@ -0,0 +1,34 @@ +from dstack.api import Service +from dstack.plugins import ApplyPolicy, GatewaySpec, Plugin, RunSpec, get_plugin_logger + +logger = get_plugin_logger(__name__) + + +class ExamplePolicy(ApplyPolicy): + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + # Forcing some limits + spec.configuration.max_price = 2.0 + spec.configuration.max_duration = "1d" + # Setting some extra tags + if spec.configuration.tags is None: + spec.configuration.tags = {} + spec.configuration.tags |= { + "team": "my_team", + } + # Forbid something + if spec.configuration.privileged: + logger.warning("User %s tries to run privileged containers", user) + raise ValueError("Running privileged containers is forbidden") + # Set some service-specific properties + if isinstance(spec.configuration, Service): + spec.configuration.https = True + return spec + + def on_gateway_apply(self, user: str, 
project: str, spec: GatewaySpec) -> GatewaySpec: + # Forbid creating new gateways altogether + raise ValueError("Creating gateways is forbidden") + + +class ExamplePlugin(Plugin): + def get_apply_policies(self) -> list[ApplyPolicy]: + return [ExamplePolicy()] diff --git a/gateway/src/dstack/gateway/core/__init__.py b/examples/plugins/example_plugin/src/example_plugin/py.typed similarity index 100% rename from gateway/src/dstack/gateway/core/__init__.py rename to examples/plugins/example_plugin/src/example_plugin/py.typed diff --git a/examples/plugins/example_plugin_server/.python-version b/examples/plugins/example_plugin_server/.python-version new file mode 100644 index 0000000000..2c0733315e --- /dev/null +++ b/examples/plugins/example_plugin_server/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/examples/plugins/example_plugin_server/README.md b/examples/plugins/example_plugin_server/README.md new file mode 100644 index 0000000000..032cf0bdca --- /dev/null +++ b/examples/plugins/example_plugin_server/README.md @@ -0,0 +1,29 @@ +## Overview + +If you wish to hook up your own plugin server through `dstack`'s built-in `rest_plugin`, here's a basic example of how to do so. + +## Steps + +1. Install the plugin server: + + ```bash + uv pip install examples/plugins/example_plugin_server + ``` + +2. Start the plugin server: + + ```bash + python -m example_plugin_server.main + ``` + +3. Enable `rest_plugin` in `server/config.yml`: + + ```yaml + plugins: + - rest_plugin + ``` + +4. 
Point the `dstack` server to your plugin server: + ```bash + export DSTACK_PLUGIN_SERVICE_URI=http://127.0.0.1:8000 + ``` diff --git a/gateway/src/dstack/gateway/openai/__init__.py b/examples/plugins/example_plugin_server/__init__.py similarity index 100% rename from gateway/src/dstack/gateway/openai/__init__.py rename to examples/plugins/example_plugin_server/__init__.py diff --git a/examples/plugins/example_plugin_server/pyproject.toml b/examples/plugins/example_plugin_server/pyproject.toml new file mode 100644 index 0000000000..cc5d3b1c0a --- /dev/null +++ b/examples/plugins/example_plugin_server/pyproject.toml @@ -0,0 +1,18 @@ +[project] +name = "dstack-plugin-server" +version = "0.1.0" +description = "Example plugin server" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "fastapi", + "uvicorn", + "dstack", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/example_plugin_server"] diff --git a/gateway/src/dstack/gateway/registry/__init__.py b/examples/plugins/example_plugin_server/src/example_plugin_server/__init__.py similarity index 100% rename from gateway/src/dstack/gateway/registry/__init__.py rename to examples/plugins/example_plugin_server/src/example_plugin_server/__init__.py diff --git a/examples/plugins/example_plugin_server/src/example_plugin_server/main.py b/examples/plugins/example_plugin_server/src/example_plugin_server/main.py new file mode 100644 index 0000000000..1733a6e998 --- /dev/null +++ b/examples/plugins/example_plugin_server/src/example_plugin_server/main.py @@ -0,0 +1,66 @@ +import logging +import os + +import uvicorn +from fastapi import FastAPI + +from dstack.plugins.builtin.rest_plugin import ( + FleetSpecRequest, + FleetSpecResponse, + GatewaySpecRequest, + GatewaySpecResponse, + RunSpecRequest, + RunSpecResponse, + VolumeSpecRequest, + VolumeSpecResponse, +) +from example_plugin_server.utils import 
configure_logging + +configure_logging() +logger = logging.getLogger(__name__) + +app = FastAPI() + + +@app.post("/apply_policies/on_run_apply") +async def on_run_apply(request: RunSpecRequest) -> RunSpecResponse: + logger.info( + f"Received run spec request from user {request.user} and project {request.project}" + ) + response = RunSpecResponse(spec=request.spec, error=None) + return response + + +@app.post("/apply_policies/on_fleet_apply") +async def on_fleet_apply(request: FleetSpecRequest) -> FleetSpecResponse: + logger.info( + f"Received fleet spec request from user {request.user} and project {request.project}" + ) + response = FleetSpecResponse(spec=request.spec, error=None) + return response + + +@app.post("/apply_policies/on_volume_apply") +async def on_volume_apply(request: VolumeSpecRequest) -> VolumeSpecResponse: + logger.info( + f"Received volume spec request from user {request.user} and project {request.project}" + ) + response = VolumeSpecResponse(spec=request.spec, error=None) + return response + + +@app.post("/apply_policies/on_gateway_apply") +async def on_gateway_apply(request: GatewaySpecRequest) -> GatewaySpecResponse: + logger.info( + f"Received gateway spec request from user {request.user} and project {request.project}" + ) + response = GatewaySpecResponse(spec=request.spec, error=None) + return response + + +if __name__ == "__main__": + uvicorn.run( + app, + host="127.0.0.1", + port=int(os.getenv("DSTACK_REST_PLUGIN_SERVER_PORT", 8000)), + ) diff --git a/examples/plugins/example_plugin_server/src/example_plugin_server/utils.py b/examples/plugins/example_plugin_server/src/example_plugin_server/utils.py new file mode 100644 index 0000000000..b07406682f --- /dev/null +++ b/examples/plugins/example_plugin_server/src/example_plugin_server/utils.py @@ -0,0 +1,7 @@ +import logging +import os + + +def configure_logging(): + log_level = os.getenv("LOG_LEVEL", "INFO").upper() + logging.basicConfig(level=log_level) diff --git 
a/examples/server-deployment/cloudformation/README.md b/examples/server-deployment/cloudformation/README.md new file mode 100644 index 0000000000..2e2057df47 --- /dev/null +++ b/examples/server-deployment/cloudformation/README.md @@ -0,0 +1,17 @@ +# Deploying server to a private VPC via AWS CloudFormation + +If you'd like to deploy the server to a private AWS VPC, you can use +our CloudFormation [template](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml). + +First, ensure you've set up a private VPC with public and private subnets. + +![](https://dstack.ai/static-assets/static-assets/images/dstack-aws-private-vpc-example-v2.png) + +Create a stack using the template, and specify the VPC and private subnets. +Once the stack is created, go to `Outputs` for the server URL and admin token. + +> To access the server URL, ensure you're connected to the VPC, e.g. via a VPN client. + +!!! info "Source code" + If you'd like to adjust anything, the source code of the template can be found at + [`examples/server-deployment/cloudformation/template.yaml`](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml). diff --git a/examples/server-deployment/cloudformation/template.yaml b/examples/server-deployment/cloudformation/template.yaml new file mode 100644 index 0000000000..09ab404941 --- /dev/null +++ b/examples/server-deployment/cloudformation/template.yaml @@ -0,0 +1,266 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: 'CloudFormation template for deploying a dstack server' + +Parameters: + VpcId: + Description: ID of an existing Virtual Private Cloud (VPC). + Type: 'AWS::EC2::VPC::Id' + PrivateSubnetIds: + Description: IDs of existing private subnets within the specified VPC. 
+ Type: 'List<AWS::EC2::Subnet::Id>' + + DstackVersion: + Type: String + Description: The version of dstack server + Default: latest + + AdminToken: + Type: String + Description: The secure token of the admin user. If not specified, it's generated randomly. + Default: '' + +Resources: + LitestreamBucket: + Type: 'AWS::S3::Bucket' + DeletionPolicy: Retain + Properties: + BucketName: !Join + - '-' + - - 'dstack-server-litestream-bucket' + - !Select + - 0 + - !Split + - '-' + - !Select + - 2 + - !Split + - / + - !Ref AWS::StackId + + TaskDefinitionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Action: sts:AssumeRole + Policies: + - PolicyName: !Join [ '-', [ 'dstack-server-task-definition-policy', !Ref 'AWS::StackName' ] ] + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - ec2:AttachVolume + - ec2:AuthorizeSecurityGroupEgress + - ec2:AuthorizeSecurityGroupIngress + - ec2:CancelSpotInstanceRequests + - ec2:CreateSecurityGroup + - ec2:CreateTags + - ec2:CreateVolume + - ec2:DeleteVolume + - ec2:DescribeAvailabilityZones + - ec2:DescribeImages + - ec2:DescribeInstances + - ec2:DescribeInstanceAttribute + - ec2:DescribeInstanceTypes + - ec2:DescribeRouteTables + - ec2:DescribeSecurityGroups + - ec2:DescribeSubnets + - ec2:DescribeVpcs + - ec2:DescribeVolumes + - ec2:DetachVolume + - ec2:RunInstances + - ec2:TerminateInstances + Resource: '*' + - Effect: Allow + Action: + - servicequotas:ListServiceQuotas + - servicequotas:GetServiceQuota + Resource: '*' + - Effect: Allow + Action: + - elasticloadbalancing:CreateLoadBalancer + - elasticloadbalancing:CreateTargetGroup + - elasticloadbalancing:CreateListener + - elasticloadbalancing:RegisterTargets + - elasticloadbalancing:AddTags + - elasticloadbalancing:DeleteLoadBalancer + - elasticloadbalancing:DeleteTargetGroup + - elasticloadbalancing:DeleteListener + - 
elasticloadbalancing:DeregisterTargets + Resource: '*' + - Effect: Allow + Action: + - acm:DescribeCertificate + - acm:ListCertificates + Resource: '*' + - Effect: Allow + Action: + - s3:* + Resource: + - !Sub 'arn:aws:s3:::${LitestreamBucket}' + - !Sub 'arn:aws:s3:::${LitestreamBucket}/*' + + TaskExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Action: sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy + + Cluster: + Type: AWS::ECS::Cluster + Properties: + ClusterName: !Join [ '-', [ 'dstack-server', !Ref 'AWS::StackName' ] ] + CapacityProviders: + - FARGATE + DefaultCapacityProviderStrategy: + - CapacityProvider: FARGATE + Weight: 1 + Configuration: + ExecuteCommandConfiguration: + Logging: DEFAULT + + TaskDefinition: + Type: AWS::ECS::TaskDefinition + Properties: + Family: !Join [ '-', [ 'dstack-server-task-definition-family', !Ref 'AWS::StackName' ] ] + NetworkMode: awsvpc + RequiresCompatibilities: + - FARGATE + Cpu: '1024' + Memory: '2048' + ExecutionRoleArn: !GetAtt TaskExecutionRole.Arn + TaskRoleArn: !GetAtt TaskDefinitionRole.Arn + ContainerDefinitions: + - Name: dstack-server + Image: !Join [ ':', [ 'dstackai/dstack', !Ref DstackVersion ] ] + PortMappings: + - ContainerPort: 8000 + LogConfiguration: + LogDriver: awslogs + Options: + awslogs-group: !Join [ '-', [ 'dstack-server', !Ref 'AWS::StackName' ] ] + awslogs-region: !Ref AWS::Region + awslogs-stream-prefix: dstack-server + Environment: + - Name: DSTACK_SERVER_PORT + Value: '8000' + - Name: DSTACK_SERVER_ADMIN_TOKEN + Value: !If [ AdminTokenEmpty, !Select [ 2, !Split [ '/', !Ref AWS::StackId ] ], !Ref AdminToken ] + - Name: LITESTREAM_REPLICA_URL + Value: !Join ['', ['s3://', !Ref LitestreamBucket]] + - Name: DSTACK_SERVER_CLOUDWATCH_LOG_GROUP + Value: !Join [ '-', [ 'dstack-server-runs', !Ref 
'AWS::StackName' ] ] + - Name: DSTACK_SERVER_CLOUDWATCH_LOG_REGION + Value: !Ref AWS::Region + + LoadBalancerSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Access to the public facing load balancer + VpcId: !Ref VpcId + SecurityGroupIngress: + - CidrIp: 0.0.0.0/0 + FromPort: 80 + ToPort: 80 + IpProtocol: tcp + + LoadBalancer: + Type: AWS::ElasticLoadBalancingV2::LoadBalancer + Properties: + Scheme: internal + SecurityGroups: [ !Ref 'LoadBalancerSecurityGroup' ] + Subnets: !Split [',', !Join [',', !Ref PrivateSubnetIds]] + Type: application + + ListenerHTTP: + Type: AWS::ElasticLoadBalancingV2::Listener + Properties: + DefaultActions: + - TargetGroupArn: !Ref TargetGroup + Type: forward + LoadBalancerArn: !Ref LoadBalancer + Port: 80 + Protocol: HTTP + + ContainerSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Security group for container + VpcId: !Ref VpcId + SecurityGroupIngress: + - IpProtocol: tcp + FromPort: 8000 + ToPort: 8000 + SourceSecurityGroupId: !Ref LoadBalancerSecurityGroup + + Service: + Type: AWS::ECS::Service + DependsOn: ListenerHTTP + Properties: + ServiceName: dstack-server + Cluster: !Ref Cluster + TaskDefinition: !Ref TaskDefinition + DesiredCount: 1 + DeploymentConfiguration: + MinimumHealthyPercent: 0 + MaximumPercent: 100 + HealthCheckGracePeriodSeconds: 300 + LaunchType: FARGATE + NetworkConfiguration: + AwsvpcConfiguration: + Subnets: !Split [',', !Join [',', !Ref PrivateSubnetIds]] + SecurityGroups: + - !Ref ContainerSecurityGroup + LoadBalancers: + - ContainerName: dstack-server + ContainerPort: 8000 + TargetGroupArn: !Ref TargetGroup + + TargetGroup: + Type: AWS::ElasticLoadBalancingV2::TargetGroup + Properties: + TargetType: ip + Port: 8000 + Protocol: HTTP + VpcId: !Ref VpcId + HealthCheckPath: / + HealthCheckIntervalSeconds: 30 + Matcher: + HttpCode: '200' + HealthyThresholdCount: 2 + UnhealthyThresholdCount: 3 + + LogGroup: + Type: AWS::Logs::LogGroup + Properties: + 
LogGroupName: !Join [ '-', [ 'dstack-server', !Ref 'AWS::StackName' ] ] + RetentionInDays: 30 + + RunsLogGroup: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: !Join [ '-', [ 'dstack-server-runs', !Ref 'AWS::StackName' ] ] + RetentionInDays: 30 + +Conditions: + AdminTokenEmpty: !Equals [ !Ref AdminToken, '' ] + +Outputs: + ServerUrl: + Description: The URL of the dstack server + Value: !GetAtt LoadBalancer.DNSName + AdminToken: + Description: The token of the admin user + Value: !If [ AdminTokenEmpty, !Select [ 2, !Split [ '/', !Ref AWS::StackId ] ], !Ref AdminToken ] diff --git a/examples/single-node-training/axolotl/amd/.dstack.yml b/examples/single-node-training/axolotl/amd/.dstack.yml new file mode 100644 index 0000000000..1e9886faa3 --- /dev/null +++ b/examples/single-node-training/axolotl/amd/.dstack.yml @@ -0,0 +1,44 @@ +type: task +# The name is optional, if not specified, generated randomly +name: axolotl-amd-llama31-train +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 +# Required environment variables +env: + - HF_TOKEN + - WANDB_API_KEY + - WANDB_PROJECT + - WANDB_NAME=axolotl-amd-llama31-train + - HUB_MODEL_ID +# Commands of the task +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - git clone https://github.com/OpenAccess-AI-Collective/axolotl + - cd axolotl + - git checkout d4f6c65 + - pip install -e . + # Latest pynvml is not compatible with axolotl commit d4f6c65, so we need to fall back to version 11.5.3 + - pip uninstall pynvml -y + - pip install pynvml==11.5.3 + - cd .. 
+ - wget https://dstack-binaries.s3.amazonaws.com/flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - pip install flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - wget https://dstack-binaries.s3.amazonaws.com/xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - pip install xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - git clone --recurse https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - cd .. + - accelerate launch -m axolotl.cli.train -- axolotl/examples/llama-3/fft-8b.yaml + --wandb-project "$WANDB_PROJECT" + --wandb-name "$WANDB_NAME" + --hub-model-id "$HUB_MODEL_ID" + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/single-node-training/axolotl/amd/build-flash-attention.yml b/examples/single-node-training/axolotl/amd/build-flash-attention.yml new file mode 100644 index 0000000000..030729ce46 --- /dev/null +++ b/examples/single-node-training/axolotl/amd/build-flash-attention.yml @@ -0,0 +1,37 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-flash-attention + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HF_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url 
https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/flash-attention.git + - cd flash-attention + - git checkout stride_fix + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/single-node-training/axolotl/amd/build-xformers.dstack.yml b/examples/single-node-training/axolotl/amd/build-xformers.dstack.yml new file mode 100644 index 0000000000..064a3a233a --- /dev/null +++ b/examples/single-node-training/axolotl/amd/build-xformers.dstack.yml @@ -0,0 +1,38 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-flash-attention + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HF_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/xformers + - cd xformers + - git checkout dfc196d + - git submodule update --init --recursive + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB diff --git 
a/examples/single-node-training/optimum-tpu/llama31/.dstack.yml b/examples/single-node-training/optimum-tpu/llama31/.dstack.yml new file mode 100644 index 0000000000..c93862e678 --- /dev/null +++ b/examples/single-node-training/optimum-tpu/llama31/.dstack.yml @@ -0,0 +1,30 @@ +type: task +# The name is optional, if not specified, generated randomly +name: train-tpu + +python: "3.11" + +# Required environment variables +env: + - HF_TOKEN + +# Mount files +files: + - train.py + - config.yaml + +# Commands of the task +commands: + - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git + - mkdir -p optimum-tpu/examples/custom/ + - cp train.py optimum-tpu/examples/custom/train.py + - cp config.yaml optimum-tpu/examples/custom/config.yaml + - cd optimum-tpu + - pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install datasets evaluate + - pip install accelerate -U + - pip install peft + - python examples/custom/train.py examples/custom/config.yaml + +resources: + gpu: v5litepod-8 diff --git a/examples/single-node-training/optimum-tpu/llama31/config.yaml b/examples/single-node-training/optimum-tpu/llama31/config.yaml new file mode 100644 index 0000000000..2d3677f2f4 --- /dev/null +++ b/examples/single-node-training/optimum-tpu/llama31/config.yaml @@ -0,0 +1,10 @@ +per_device_train_batch_size: 24 +per_device_eval_batch_size: 8 +num_train_epochs: 1 +max_steps: -1 +output_dir: "./finetuned_models/llama3_fine_tuned" +optim: "adafactor" +dataset_name: "Abirate/english_quotes" +model_name: "meta-llama/Meta-Llama-3.1-8B" +lora_r: 4 +push_to_hub: True diff --git a/examples/single-node-training/optimum-tpu/llama31/train.py b/examples/single-node-training/optimum-tpu/llama31/train.py new file mode 100644 index 0000000000..0c8c8a614b --- /dev/null +++ b/examples/single-node-training/optimum-tpu/llama31/train.py @@ -0,0 +1,140 @@ +from dataclasses import dataclass, 
field +from typing import Optional + +from datasets import load_dataset +from optimum.tpu import AutoModelForCausalLM, fsdp_v2 +from peft import LoraConfig, TaskType, get_peft_model +from transformers import ( + AutoTokenizer, + DataCollatorForLanguageModeling, + HfArgumentParser, + Trainer, + TrainingArguments, +) + + +@dataclass +class ScriptArguments: + per_device_train_batch_size: Optional[int] = field( + default=8, metadata={"help": "Batch size per device for training."} + ) + per_device_eval_batch_size: Optional[int] = field( + default=8, metadata={"help": "Batch size per device for evaluation."} + ) + num_train_epochs: Optional[int] = field( + default=1, + metadata={"help": "The number of training epochs for the SFTTrainer."}, + ) + max_steps: int = field( + default=-1, metadata={"help": "How many optimizer update steps to take"} + ) + output_dir: str = field( + default="./results", + metadata={ + "help": "The output directory where the model predictions and checkpoints will be written." 
+ }, + ) + optim: Optional[str] = field( + default="adafactor", + metadata={"help": "The optimizer to use."}, + ) + dataset_name: Optional[str] = field( + default="Abirate/english_quotes", + metadata={"help": "The dataset to use."}, + ) + model_name: Optional[str] = field( + default="meta-llama/Meta-Llama-3.1-8B", + metadata={ + "help": "Only models Gemma 2B, Gemma 7B, Llama-2 7B and Llama-3 8B Llama-3.1 8B are tested with TPU v5e" + }, + ) + lora_r: Optional[int] = field(default=4, metadata={"help": "LoRA attention dimension."}) + max_seq_length: Optional[int] = field( + default=1024, metadata={"help": "Maximum sequence length to use."} + ) + packing: Optional[bool] = field( + default=True, + metadata={"help": "Use packing dataset creating."}, + ) + push_to_hub: Optional[bool] = field( + default=True, + metadata={"help": "Push fined tuned model to hub."}, + ) + + +def create_and_prepare_model(args): + base_model = AutoModelForCausalLM.from_pretrained(args.model_name) + lora_config = LoraConfig( + r=args.lora_r, # the dimension of the low-rank matrices + lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations + lora_dropout=0.05, + bias="none", + inference_mode=False, + task_type=TaskType.CAUSAL_LM, + target_modules=["o_proj", "v_proj"], + ) # + + model = get_peft_model(base_model, lora_config) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + # Add custom token for padding Llama + tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token}) + return model, tokenizer + + +def create_and_prepare_trainer(model, tokenizer, dataset, args): + data = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True) + fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=args.per_device_train_batch_size, + num_train_epochs=args.num_train_epochs, + max_steps=args.max_steps, + 
output_dir=args.output_dir, + optim=args.optim, + logging_steps=1, + dataloader_drop_last=True, # Required by FSDP v2 and SPMD. + **fsdp_training_args, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + + return trainer + + +def parse_config() -> ScriptArguments: + import sys + + import yaml + + # Ensure a YAML file is provided as an argument + if len(sys.argv) != 2: + sys.exit(1) + + config_path = sys.argv[1] + + # Read the YAML file + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + # Parse arguments using HfArgumentParser + parser = HfArgumentParser(ScriptArguments) + script_args = parser.parse_dict(config)[0] + return script_args + + +if __name__ == "__main__": + args = parse_config() + fsdp_v2.use_fsdp_v2() + dataset = load_dataset(args.dataset_name) + model, tokenizer = create_and_prepare_model(args) + trainer = create_and_prepare_trainer(model, tokenizer, dataset, args) + trainer.train() + if args.push_to_hub: + kwargs = { + "finetuned_from": args.model_name, + "dataset": args.dataset_name, + } + trainer.push_to_hub(**kwargs) diff --git a/examples/single-node-training/qlora/.dstack.yml b/examples/single-node-training/qlora/.dstack.yml new file mode 100644 index 0000000000..7d87f630aa --- /dev/null +++ b/examples/single-node-training/qlora/.dstack.yml @@ -0,0 +1,21 @@ +type: task + +python: "3.11" + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + +files: + - requirements.txt + - train.py + +commands: + - pip install -r requirements.txt + - tensorboard --logdir results/runs & + - python train.py --merge_and_push ${{ run.args }} +ports: + - 6006 + +resources: + gpu: 16GB..24GB diff --git a/examples/single-node-training/qlora/README.md b/examples/single-node-training/qlora/README.md new file mode 100644 index 0000000000..eda65cfe25 --- /dev/null +++ b/examples/single-node-training/qlora/README.md @@ -0,0 +1,11 @@ +# QLoRA + +The following command runs the task to fine-tune an LLM using QLoRA: + +```shell 
+dstack apply -f examples/single-node-training/qlora/.dstack.yml +``` + +See the configuration at [.dstack.yml](.dstack.yml). + +For more details, refer to [tasks](https://dstack.ai/docs/tasks). diff --git a/gateway/src/dstack/gateway/stats/__init__.py b/examples/single-node-training/qlora/requirements.txt similarity index 100% rename from gateway/src/dstack/gateway/stats/__init__.py rename to examples/single-node-training/qlora/requirements.txt diff --git a/examples/fine-tuning/qlora/train.py b/examples/single-node-training/qlora/train.py similarity index 100% rename from examples/fine-tuning/qlora/train.py rename to examples/single-node-training/qlora/train.py diff --git a/examples/single-node-training/trl/amd/.dstack.yml b/examples/single-node-training/trl/amd/.dstack.yml new file mode 100644 index 0000000000..8e6baad788 --- /dev/null +++ b/examples/single-node-training/trl/amd/.dstack.yml @@ -0,0 +1,35 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-amd-llama31-train + +# If `image` is not specified, dstack uses its default image +image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 + +# Required environment variables +env: + - HF_TOKEN + +files: + - train.py + +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - git clone https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . # Use to target specific gpu arch + - make + - pip install . + - pip install trl + - pip install peft + - pip install transformers datasets huggingface-hub scipy + - cd .. 
"""LoRA fine-tuning of a causal LM with TRL's SFTTrainer."""

from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Base model and tokenizer names.
base_model_name = "meta-llama/Meta-Llama-3.1-8B"

# Load base model to GPU memory.
device = "cuda:0"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True).to(
    device
)

# Load tokenizer. The EOS token doubles as the padding token, padded on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Dataset for fine-tuning.
training_dataset_name = "mlabonne/guanaco-llama2-1k"
training_dataset = load_dataset(training_dataset_name, split="train")


# Training parameters for SFTTrainer.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=50,
    learning_rate=4e-5,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1 leaves the run length to num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# Wrap the base model with LoRA adapters exactly once. get_peft_model injects
# the adapter layers into base_model in place, so the wrapped model (not
# base_model together with peft_config) is what must be handed to the trainer;
# otherwise the trainer would apply the same adapter config a second time.
peft_config = LoraConfig(lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM")
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

# Initialize an SFT trainer on the already-wrapped model. peft_config is
# deliberately not passed again.
sft_trainer = SFTTrainer(
    model=peft_model,
    train_dataset=training_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
)

# Run the trainer.
sft_trainer.train()
true, + "printWidth": 128, + "tabWidth": 4 +} diff --git a/frontend/eslint.config.cjs b/frontend/eslint.config.cjs new file mode 100644 index 0000000000..9750398241 --- /dev/null +++ b/frontend/eslint.config.cjs @@ -0,0 +1,77 @@ +const { defineConfig, globalIgnores } = require('eslint/config'); +const i18N = require('eslint-plugin-i18n'); +const simpleImportSort = require('eslint-plugin-simple-import-sort'); +const react = require('eslint-plugin-react'); +const { FlatCompat } = require('@eslint/eslintrc'); +const js = require('@eslint/js'); +const typescriptEslint = require('@typescript-eslint/eslint-plugin'); +const tsParser = require('@typescript-eslint/parser'); + +const compat = new FlatCompat({ + baseDirectory: __dirname, + recommendedConfig: js.configs.recommended, + allConfig: js.configs.all, +}); + +const BASE_CONFIG = { + extends: compat.extends( + 'eslint:recommended', + 'plugin:@typescript-eslint/eslint-recommended', + 'plugin:@typescript-eslint/recommended', + 'prettier', + 'plugin:prettier/recommended', + 'plugin:react/recommended', + ), + + plugins: { + '@typescript-eslint': typescriptEslint, + i18n: i18N, + 'simple-import-sort': simpleImportSort, + react: react, + }, + + languageOptions: { + parser: tsParser, + }, + settings: {}, + + rules: { + 'react/jsx-no-target-blank': 'off', + 'react/no-unescaped-entities': 'off', + 'i18n/no-russian-character': 1, + + 'simple-import-sort/imports': [ + 'error', + { + groups: [ + ['^react', 'lodash', '^\\w', '^@?\\w'], + ['^components', '^layouts'], + ['^consts', '^hooks', '^libs', '^routes', '^services', '^types'], + ['^App', '^pages'], + ['^\\./(?=.*/)(?!/?$)', '^\\.(?!/?$)', '^\\./?$'], + ['./constants/.'], + ['./definitions/.', './types'], + ['^.+\\.svg', '^.+\\.png$', '^.+\\.jpg', '^.+\\.s?css$'], + ], + }, + ], + }, +}; + +module.exports = defineConfig([ + globalIgnores([ + 'node_modules', + 'build', + 'server.js', + 'src/locale', + 'src/types', + 'src/setupProxy.js', + 'webpack/**/*', + 'webpack/env.js', + 
/*
 * For a detailed explanation regarding each configuration property and type check, visit:
 * https://jestjs.io/docs/en/configuration.html
 *
 * Project-relative paths below use Jest's `<rootDir>` token. A bare leading slash
 * (e.g. '/src/') would be resolved from the filesystem root, not from this project.
 */

export default {
    // All imported modules in your tests should be mocked automatically
    // automock: false,

    // Stop running tests after `n` failures
    // bail: 0,

    // The directory where Jest should store its cached dependency information
    // cacheDirectory: "/private/var/folders/2c/kgqgdjnd513_j7gml91lytx00000gn/T/jest_dx",

    // Automatically clear mock calls and instances between every test
    clearMocks: true,

    // Indicates whether the coverage information should be collected while executing the test
    // collectCoverage: false,

    // An array of glob patterns indicating a set of files for which coverage information should be collected
    // collectCoverageFrom: undefined,

    // The directory where Jest should output its coverage files
    coverageDirectory: '<rootDir>/tests/__coverage__/',

    // An array of regexp pattern strings used to skip coverage collection
    // coveragePathIgnorePatterns: [
    //   "/node_modules/"
    // ],

    // Indicates which provider should be used to instrument code for coverage
    // coverageProvider: "babel",

    // A list of reporter names that Jest uses when writing coverage reports
    // coverageReporters: [
    //   "json",
    //   "text",
    //   "lcov",
    //   "clover"
    // ],

    // An object that configures minimum threshold enforcement for coverage results
    // coverageThreshold: undefined,

    // A path to a custom dependency extractor
    // dependencyExtractor: undefined,

    // Make calling deprecated APIs throw helpful error messages
    // errorOnDeprecated: false,

    // Force coverage collection from ignored files using an array of glob patterns
    // forceCoverageMatch: [],

    // A path to a module which exports an async function that is triggered once before all test suites
    // globalSetup: undefined,

    // A path to a module which exports an async function that is triggered once after all test suites
    // globalTeardown: undefined,

    // A set of global variables that need to be available in all test environments
    globals: {},

    // The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
    // maxWorkers: "50%",

    // An array of directory names to be searched recursively up from the requiring module's location
    moduleDirectories: ['node_modules', 'src', 'tests'],

    // An array of file extensions your modules use
    moduleFileExtensions: ['js', 'jsx', 'ts', 'tsx'],

    // A map from regular expressions to module names or to arrays of module names that allow to stub out resources with a single module
    moduleNameMapper: {
        '\\.svg': '<rootDir>/tests/__mocks__/svgrMock.ts',
        '\\.(jpg|jpeg|png|gif|eot|otf|webp|svg|ttf|woff|woff2|mp4|webm|wav|mp3|m4a|aac|oga)$':
            '<rootDir>/tests/__mocks__/fileMock.ts',
        '\\.(css|less|scss|sss|styl)$': 'identity-obj-proxy',
    },

    // An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader
    // modulePathIgnorePatterns: [],

    // Activates notifications for test results
    // notify: false,

    // An enum that specifies notification mode. Requires { notify: true }
    // notifyMode: "failure-change",

    // A preset that is used as a base for Jest's configuration
    // preset: undefined,
    preset: 'ts-jest',

    // Run tests from one or more projects
    // projects: undefined,

    // Use this configuration option to add custom reporters to Jest
    // reporters: undefined,

    // Automatically reset mock state between every test
    // resetMocks: false,

    // Reset the module registry before running each individual test
    // resetModules: false,

    // A path to a custom resolver
    // resolver: undefined,

    // Automatically restore mock state between every test
    // restoreMocks: false,

    // The root directory that Jest should scan for tests and modules within
    rootDir: '.',

    // A list of paths to directories that Jest should use to search for files in
    roots: ['<rootDir>/src/', '<rootDir>/tests/'],

    // Allows you to use a custom runner instead of Jest's default test runner
    // runner: "jest-runner",

    // The paths to modules that run some code to configure or set up the testing environment before each test
    setupFiles: ['<rootDir>/tests/__mocks__/shim.ts'],

    // A list of paths to modules that run some code to configure or set up the testing framework before each test
    setupFilesAfterEnv: ['<rootDir>/tests/setupEnzyme.ts'],

    // The number of seconds after which a test is considered as slow and reported as such in the results.
    // slowTestThreshold: 5,

    // A list of paths to snapshot serializer modules Jest should use for snapshot testing
    // snapshotSerializers: [],

    // The test environment that will be used for testing
    testEnvironment: 'jest-environment-jsdom',

    // Options that will be passed to the testEnvironment
    // testEnvironmentOptions: {},

    // Adds a location field to test results
    // testLocationInResults: false,

    // The glob patterns Jest uses to detect test files
    // testMatch: [
    //   "**/__tests__/**/*.[jt]s?(x)",
    //   "**/?(*.)+(spec|test).[tj]s?(x)"
    // ],

    // An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
    // testPathIgnorePatterns: [
    //   "/node_modules/"
    // ],

    // The regexp pattern or array of patterns that Jest uses to detect test files
    // (a regexp matched against the test file path, so no <rootDir> token is needed here)
    testRegex: ['/src/.*\\.test.(ts|tsx)$'],

    // This option allows the use of a custom results processor
    // testResultsProcessor: undefined,

    // This option allows use of a custom test runner
    // testRunner: "jasmine2",

    // This option sets the URL for the jsdom environment. It is reflected in properties such as location.href
    // testURL: "http://localhost",

    // Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout"
    // timers: "real",

    // A map from regular expressions to paths to transformers
    transform: {
        '\\.[jt]sx?$': 'babel-jest',
    },

    // An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
    transformIgnorePatterns: ['/node_modules/'],

    // An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
    // unmockedModulePathPatterns: undefined,

    // Indicates whether each individual test should be reported during the run
    // verbose: undefined,

    // An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
    // watchPathIgnorePatterns: [],

    // Whether to use watchman for file crawling
    // watchman: true,
};
apiImport: 'mainApi', + outputFiles: FILES, + hooks: true, + tag: true, +}; + +export default config; diff --git a/frontend/package-lock.json b/frontend/package-lock.json new file mode 100644 index 0000000000..e76e6e8b13 --- /dev/null +++ b/frontend/package-lock.json @@ -0,0 +1,25177 @@ +{ + "name": "dstackai", + "version": "2.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "dstackai", + "version": "2.0.0", + "license": "Apache 2.0", + "dependencies": { + "@cloudscape-design/chat-components": "^1.0.62", + "@cloudscape-design/collection-hooks": "^1.0.74", + "@cloudscape-design/component-toolkit": "^1.0.0-beta.120", + "@cloudscape-design/components": "^3.0.1188", + "@cloudscape-design/design-tokens": "^3.0.60", + "@cloudscape-design/global-styles": "^1.0.45", + "@hookform/resolvers": "^2.9.10", + "@reduxjs/toolkit": "^1.9.1", + "@types/yup": "^0.29.14", + "ace-builds": "^1.36.3", + "classnames": "^2.5.1", + "css-minimizer-webpack-plugin": "^4.2.2", + "date-fns": "^2.29.3", + "i18next": "^24.0.2", + "js-yaml": "^4.1.0", + "lodash": "^4.17.21", + "openai": "^4.33.1", + "prismjs": "^1.30.0", + "rc-tooltip": "^5.2.2", + "react": "^18.3.1", + "react-avatar": "^5.0.3", + "react-bus": "^4.0.1", + "react-dom": "^18.3.1", + "react-helmet": "^6.1.0", + "react-hook-form": "^7.53.0", + "react-i18next": "^12.1.4", + "react-redux": "^8.0.5", + "react-router-dom": "^6.27.0", + "react-string-replace": "^1.1.1", + "redux": "^5.0.1", + "yup": "^0.32.11" + }, + "devDependencies": { + "@babel/cli": "^7.25.9", + "@babel/core": "^7.26.0", + "@babel/plugin-proposal-class-properties": "^7.18.6", + "@babel/plugin-transform-runtime": "^7.25.9", + "@babel/preset-env": "^7.26.0", + "@babel/preset-react": "^7.25.9", + "@babel/preset-typescript": "^7.26.0", + "@babel/register": "^7.25.9", + "@cfaester/enzyme-adapter-react-18": "^0.8.0", + "@eslint/eslintrc": "^3.3.1", + "@eslint/js": "^9.28.0", + "@pmmmwh/react-refresh-webpack-plugin": "^0.5.15", + 
"@rtk-query/codegen-openapi": "^2.0.0", + "@svgr/webpack": "^6.5.1", + "@testing-library/jest-dom": "^6.6.3", + "@testing-library/react": "^16.0.1", + "@testing-library/user-event": "^14.5.2", + "@types/axios": "^0.14.0", + "@types/date-fns": "^2.6.0", + "@types/enzyme": "^3.10.18", + "@types/jest": "^29.5.14", + "@types/js-yaml": "^4.0.9", + "@types/lodash": "^4.17.13", + "@types/node": "^22.10.1", + "@types/react": "^18.3.12", + "@types/react-dom": "^18.3.1", + "@types/react-helmet": "^6.1.11", + "@types/react-redux": "^7.1.34", + "@types/react-router-dom": "^5.3.3", + "@types/react-test-renderer": "^18.3.0", + "@typescript-eslint/eslint-plugin": "^8.33.1", + "@typescript-eslint/parser": "^8.33.1", + "@webpack-cli/serve": "^2.0.5", + "babel-loader": "^9.2.1", + "babel-preset-minify": "^0.5.2", + "circular-dependency-plugin": "^5.2.2", + "copy-webpack-plugin": "^11.0.0", + "cross-env": "^7.0.3", + "css-loader": "^6.7.3", + "enzyme": "^3.11.0", + "eslint": "^9.39.2", + "eslint-config-prettier": "^10.1.5", + "eslint-plugin-i18n": "^2.4.0", + "eslint-plugin-prettier": "^5.4.1", + "eslint-plugin-react": "^7.37.5", + "eslint-plugin-simple-import-sort": "^12.1.1", + "favicons": "^7.2.0", + "favicons-webpack-plugin": "^6.0.1", + "file-loader": "^6.2.0", + "html-webpack-plugin": "^5.6.3", + "http-proxy-middleware": "^2.0.6", + "identity-obj-proxy": "^3.0.0", + "jest": "^29.7.0", + "jest-styled-components": "^7.2.0", + "lint-staged": "^16.1.2", + "loader-utils": "^3.3.1", + "mini-css-extract-plugin": "^2.9.2", + "npx": "^10.2.2", + "postcss": "^8.4.49", + "postcss-loader": "^7.0.2", + "postcss-preset-env": "7.8.3", + "prettier": "^3.5.3", + "react-dev-utils": "^12.0.1", + "react-refresh": "^0.14.2", + "react-test-renderer": "^18.3.1", + "resolve-url-loader": "^5.0.0", + "sass": "^1.81.0", + "sass-loader": "^16.0.3", + "style-loader": "^4.0.0", + "ts-jest": "^29.2.5", + "ts-node": "^10.9.2", + "typescript": "^5.7.2", + "webpack": "^5.96.1", + "webpack-cli": "^5.1.4", + 
"webpack-dev-server": "^5.1.0", + "webpack-merge": "^6.0.1", + "webpack-nano": "^1.1.1" + } + }, + "node_modules/@adobe/css-tools": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/@adobe/css-tools/-/css-tools-4.4.0.tgz", + "integrity": "sha512-Ff9+ksdQQB3rMncgqDK78uLznstjyfIf2Arnh22pW8kBpLs6rpKDwgnZT46hin5Hl1WzazzK64DOrhSwYpS7bQ==", + "dev": true + }, + "node_modules/@ampproject/remapping": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", + "integrity": "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@apidevtools/json-schema-ref-parser": { + "version": "9.0.6", + "resolved": "https://registry.npmjs.org/@apidevtools/json-schema-ref-parser/-/json-schema-ref-parser-9.0.6.tgz", + "integrity": "sha512-M3YgsLjI0lZxvrpeGVk9Ap032W6TPQkH6pRAZz81Ac3WUNF79VQooAFnp8umjvVzUmD93NkogxEwbSce7qMsUg==", + "dev": true, + "dependencies": { + "@jsdevtools/ono": "^7.1.3", + "call-me-maybe": "^1.0.1", + "js-yaml": "^3.13.1" + } + }, + "node_modules/@apidevtools/json-schema-ref-parser/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "dev": true, + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/@apidevtools/json-schema-ref-parser/node_modules/js-yaml": { + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.2.tgz", + "integrity": 
"sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==", + "dev": true, + "license": "MIT", + "dependencies": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/@apidevtools/openapi-schemas": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@apidevtools/openapi-schemas/-/openapi-schemas-2.1.0.tgz", + "integrity": "sha512-Zc1AlqrJlX3SlpupFGpiLi2EbteyP7fXmUOGup6/DnkRgjP9bgMM/ag+n91rsv0U1Gpz0H3VILA/o3bW7Ua6BQ==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/@apidevtools/swagger-methods": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@apidevtools/swagger-methods/-/swagger-methods-3.0.2.tgz", + "integrity": "sha512-QAkD5kK2b1WfjDS/UQn/qQkbwF31uqRjPTrsCs5ZG9BQGAkjwvqGFjjPqAuzac/IYzpPtRzjCP1WrTuAIjMrXg==", + "dev": true + }, + "node_modules/@apidevtools/swagger-parser": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/@apidevtools/swagger-parser/-/swagger-parser-10.1.0.tgz", + "integrity": "sha512-9Kt7EuS/7WbMAUv2gSziqjvxwDbFSg3Xeyfuj5laUODX8o/k/CpsAKiQ8W7/R88eXFTMbJYg6+7uAmOWNKmwnw==", + "dev": true, + "dependencies": { + "@apidevtools/json-schema-ref-parser": "9.0.6", + "@apidevtools/openapi-schemas": "^2.1.0", + "@apidevtools/swagger-methods": "^3.0.2", + "@jsdevtools/ono": "^7.1.3", + "ajv": "^8.6.3", + "ajv-draft-04": "^1.0.0", + "call-me-maybe": "^1.0.1" + }, + "peerDependencies": { + "openapi-types": ">=7" + } + }, + "node_modules/@babel/cli": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/cli/-/cli-7.25.9.tgz", + "integrity": "sha512-I+02IfrTiSanpxJBlZQYb18qCxB6c2Ih371cVpfgIrPQrjAYkf45XxomTJOG8JBWX5GY35/+TmhCMdJ4ZPkL8Q==", + "dev": true, + "dependencies": { + "@jridgewell/trace-mapping": "^0.3.25", + "commander": "^6.2.0", + 
"convert-source-map": "^2.0.0", + "fs-readdir-recursive": "^1.1.0", + "glob": "^7.2.0", + "make-dir": "^2.1.0", + "slash": "^2.0.0" + }, + "bin": { + "babel": "bin/babel.js", + "babel-external-helpers": "bin/babel-external-helpers.js" + }, + "engines": { + "node": ">=6.9.0" + }, + "optionalDependencies": { + "@nicolo-ribaudo/chokidar-2": "2.1.8-no-fsevents.3", + "chokidar": "^3.6.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.28.6.tgz", + "integrity": "sha512-JYgintcMjRiCvS8mMECzaEn+m3PfoQiyqukOMCCVQtoJGYJw8j/8LBJEiqkHLkfwCcs74E3pbAUFNg7d9VNJ+Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.28.5", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.26.0.tgz", + "integrity": "sha512-qETICbZSLe7uXv9VE8T/RWOdIE5qqyTucOt4zLYMafj2MRO271VGgLd4RACJMeBO37UPWhXiKMBk7YlJ0fOzQA==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/core": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.26.0.tgz", + "integrity": "sha512-i1SLeK+DzNnQ3LL/CswPCa/E5u4lh1k6IAEphON8F+cXt0t9euTshDru0q7/IqMa1PMPz5RnHuHscF8/ZJsStg==", + "dev": true, + "dependencies": { + "@ampproject/remapping": "^2.2.0", + "@babel/code-frame": "^7.26.0", + "@babel/generator": "^7.26.0", + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-module-transforms": "^7.26.0", + "@babel/helpers": "^7.26.0", + "@babel/parser": "^7.26.0", + "@babel/template": "^7.25.9", + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.26.0", + "convert-source-map": "^2.0.0", + 
"debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.3", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/generator": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.26.0.tgz", + "integrity": "sha512-/AIkAmInnWwgEAJGQr9vY0c66Mj6kjkE2ZPB1PurTRaRAh3U+J45sAQMjQDJdh4WbR3l0x5xkimXBKyBXXAu2w==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.26.0", + "@babel/types": "^7.26.0", + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25", + "jsesc": "^3.0.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-annotate-as-pure": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.25.9.tgz", + "integrity": "sha512-gv7320KBUFJz1RnylIg5WWYPRXKZ884AGkYpgpWW02TH66Dl+HaC1t1CKd0z3R4b6hdYEcmrNZHUmfCP+1u3/g==", + "dev": true, + "dependencies": { + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-builder-binary-assignment-operator-visitor": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-builder-binary-assignment-operator-visitor/-/helper-builder-binary-assignment-operator-visitor-7.25.9.tgz", + "integrity": "sha512-C47lC7LIDCnz0h4vai/tpNOI95tCd5ZT3iBt/DBH5lXKHZsyNQv18yf1wIIg2ntiQNgmAvA+DgZ82iW8Qdym8g==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-compilation-targets": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.25.9.tgz", + 
"integrity": "sha512-j9Db8Suy6yV/VHa4qzrj9yZfZxhLWQdVnRlXxmKLYlhWUVB1sB2G5sxuWYXk/whHD9iW76PmNzxZ4UCnTQTVEQ==", + "dev": true, + "dependencies": { + "@babel/compat-data": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-create-class-features-plugin": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-create-class-features-plugin/-/helper-create-class-features-plugin-7.25.9.tgz", + "integrity": "sha512-UTZQMvt0d/rSz6KI+qdu7GQze5TIajwTS++GUozlw8VBJDEOAqSXwm1WvmYEZwqdqSGQshRocPDqrt4HBZB3fQ==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-member-expression-to-functions": "^7.25.9", + "@babel/helper-optimise-call-expression": "^7.25.9", + "@babel/helper-replace-supers": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9", + "@babel/traverse": "^7.25.9", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-create-regexp-features-plugin": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-create-regexp-features-plugin/-/helper-create-regexp-features-plugin-7.25.9.tgz", + "integrity": "sha512-ORPNZ3h6ZRkOyAa/SaHU+XsLZr0UQzRwuDQ0cczIA17nAzZ+85G5cVkOJIj7QavLZGSe8QXUmNFxSZzjcZF9bw==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "regexpu-core": "^6.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-define-polyfill-provider": { + "version": "0.6.2", + "resolved": 
"https://registry.npmjs.org/@babel/helper-define-polyfill-provider/-/helper-define-polyfill-provider-0.6.2.tgz", + "integrity": "sha512-LV76g+C502biUK6AyZ3LK10vDpDyCzZnhZFXkH1L75zHPj68+qc8Zfpx2th+gzwA2MzyK+1g/3EPl62yFnVttQ==", + "dev": true, + "dependencies": { + "@babel/helper-compilation-targets": "^7.22.6", + "@babel/helper-plugin-utils": "^7.22.5", + "debug": "^4.1.1", + "lodash.debounce": "^4.0.8", + "resolve": "^1.14.2" + }, + "peerDependencies": { + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/@babel/helper-member-expression-to-functions": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.25.9.tgz", + "integrity": "sha512-wbfdZ9w5vk0C0oyHqAJbc62+vet5prjj01jjJ8sKn3j9h3MQQlflEdXYvuqRWjHnM12coDEqiC1IRCi0U/EKwQ==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.25.9.tgz", + "integrity": "sha512-tnUA4RsrmflIM6W6RFTLFSXITtl0wKjgpnLgXyowocVPrbYrLUXSBXDgTs8BlbmIzIdlBySRQjINYs2BAkiLtw==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.26.0.tgz", + "integrity": "sha512-xO+xu6B5K2czEnQye6BHA7DolFFmS3LB7stHZFaOLb1pAwO1HWLS8fXA+eh0A2yIvltPVmx3eNNDBJA2SLHXFw==", + "dev": true, + "dependencies": { + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9", + "@babel/traverse": 
"^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-optimise-call-expression": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-optimise-call-expression/-/helper-optimise-call-expression-7.25.9.tgz", + "integrity": "sha512-FIpuNaz5ow8VyrYcnXQTDRGvV6tTjkNtCK/RYNDXGSLlUD6cBuQTSw43CShGxjvfBTfcUA/r6UhUCbtYqkhcuQ==", + "dev": true, + "dependencies": { + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-plugin-utils": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.25.9.tgz", + "integrity": "sha512-kSMlyUVdWe25rEsRGviIgOWnoT/nfABVWlqt9N19/dIPWViAOW2s9wznP5tURbs/IDuNk4gPy3YdYRgH3uxhBw==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-remap-async-to-generator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-remap-async-to-generator/-/helper-remap-async-to-generator-7.25.9.tgz", + "integrity": "sha512-IZtukuUeBbhgOcaW2s06OXTzVNJR0ybm4W5xC1opWFFJMZbwRj5LCk+ByYH7WdZPZTt8KnFwA8pvjN2yqcPlgw==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-wrap-function": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-replace-supers": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-replace-supers/-/helper-replace-supers-7.25.9.tgz", + "integrity": "sha512-IiDqTOTBQy0sWyeXyGSC5TBJpGFXBkRynjBeXsvbhQFKj2viwJC76Epz35YLU1fpe/Am6Vppb7W7zM4fPQzLsQ==", + "dev": true, + "dependencies": { + "@babel/helper-member-expression-to-functions": "^7.25.9", + 
"@babel/helper-optimise-call-expression": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-simple-access": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.25.9.tgz", + "integrity": "sha512-c6WHXuiaRsJTyHYLJV75t9IqsmTbItYfdj99PnzYGQZkYKvan5/2jKJ7gu31J3/BJ/A18grImSPModuyG/Eo0Q==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-skip-transparent-expression-wrappers": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-skip-transparent-expression-wrappers/-/helper-skip-transparent-expression-wrappers-7.25.9.tgz", + "integrity": "sha512-K4Du3BFa3gvyhzgPcntrkDgZzQaq6uozzcpGbOO1OEJaI+EJdqWIMTLgFgQf6lrfiDFo5FU+BxKepI9RmZqahA==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + 
"node_modules/@babel/helper-validator-option": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.25.9.tgz", + "integrity": "sha512-e/zv1co8pp55dNdEcCynfj9X7nyUKUXoUEwfXqaZt0omVOmDe9oOTdKStH4GmAw6zxMFs50ZayuMfHDKlO7Tfw==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-wrap-function": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-wrap-function/-/helper-wrap-function-7.25.9.tgz", + "integrity": "sha512-ETzz9UTjQSTmw39GboatdymDq4XIQbR8ySgVrylRhPOFpsd+JrKHIuF0de7GCWmem+T4uC5z7EZguod7Wj4A4g==", + "dev": true, + "dependencies": { + "@babel/template": "^7.25.9", + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz", + "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/template": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.28.6.tgz", + "integrity": "sha512-TeR9zWR18BvbfPmGbLampPMW+uW1NZnJlRuuHso8i87QZNq2JRF9i6RgxRqtEq+wQGsS19NNTWr2duhnE49mfQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.6" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-bugfix-firefox-class-in-computed-class-key": { + "version": "7.25.9", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-bugfix-firefox-class-in-computed-class-key/-/plugin-bugfix-firefox-class-in-computed-class-key-7.25.9.tgz", + "integrity": "sha512-ZkRyVkThtxQ/J6nv3JFYv1RYY+JT5BvU0y3k5bWrmuG4woXypRa4PXmm9RhOwodRkYFWqC0C0cqcJ4OqR7kW+g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-bugfix-safari-class-field-initializer-scope": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-class-field-initializer-scope/-/plugin-bugfix-safari-class-field-initializer-scope-7.25.9.tgz", + "integrity": "sha512-MrGRLZxLD/Zjj0gdU15dfs+HH/OXvnw/U4jJD8vpcP2CJQapPEv1IWwjc/qMg7ItBlPwSv1hRBbb7LeuANdcnw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.25.9.tgz", + "integrity": "sha512-2qUwwfAFpJLZqxd02YW9btUCZHl+RFvdDkNfZwaIJrvB8Tesjsk8pEQkTvGwZXLqXUx/2oyY3ySRhm6HOXuCug==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": { + "version": "7.25.9", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining/-/plugin-bugfix-v8-spread-parameters-in-optional-chaining-7.25.9.tgz", + "integrity": "sha512-6xWgLZTJXwilVjlnV7ospI3xi+sl8lN8rXXbBD6vYn3UYDlGsag8wrZkKcSI8G6KgqKP7vNFaDgeDnfAABq61g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9", + "@babel/plugin-transform-optional-chaining": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.13.0" + } + }, + "node_modules/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly/-/plugin-bugfix-v8-static-class-fields-redefine-readonly-7.25.9.tgz", + "integrity": "sha512-aLnMXYPnzwwqhYSCyXfKkIkYgJ8zv9RK+roo9DkTXz38ynIhd9XCbN08s3MGvqL2MYGVUGdRQLL/JqBIeJhJBg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-proposal-class-properties": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-class-properties/-/plugin-proposal-class-properties-7.18.6.tgz", + "integrity": "sha512-cumfXOF0+nzZrrN8Rf0t7M+tF6sZc7vhQwYQck9q1/5w2OExlD+b4v4RpMJFaV1Z7WcDRgO6FqvxqxGlwo+RHQ==", + "deprecated": "This proposal has been merged to the ECMAScript standard and thus this plugin is no longer maintained. 
Please use @babel/plugin-transform-class-properties instead.", + "dev": true, + "dependencies": { + "@babel/helper-create-class-features-plugin": "^7.18.6", + "@babel/helper-plugin-utils": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-proposal-private-property-in-object": { + "version": "7.21.0-placeholder-for-preset-env.2", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-private-property-in-object/-/plugin-proposal-private-property-in-object-7.21.0-placeholder-for-preset-env.2.tgz", + "integrity": "sha512-SOSkfJDddaM7mak6cPEpswyTRnuRltl429hMraQEglW+OkovnCzsiszTmsrlY//qLFjCpQDFRvjdm2wA5pPm9w==", + "dev": true, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-async-generators": { + "version": "7.8.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-async-generators/-/plugin-syntax-async-generators-7.8.4.tgz", + "integrity": "sha512-tycmZxkGfZaxhMRbXlPXuVFpdWlXpir2W4AMhSJgRKzk/eDlIXOhb2LHWoLpDF7TEHylV5zNhykX6KAgHJmTNw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-bigint": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-bigint/-/plugin-syntax-bigint-7.8.3.tgz", + "integrity": "sha512-wnTnFlG+YxQm3vDxpGE57Pj0srRU4sHE/mDkt1qv2YJJSeUAec2ma4WLUnUPeKjyrfntVwe/N6dCXpU+zL3Npg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-class-properties": { + "version": "7.12.13", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-syntax-class-properties/-/plugin-syntax-class-properties-7.12.13.tgz", + "integrity": "sha512-fm4idjKla0YahUNgFNLCB0qySdsoPiZP3iQE3rky0mBUtMZ23yDJ9SJdg6dXTSDnulOVqiF3Hgr9nbXvXTQZYA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.12.13" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-class-static-block": { + "version": "7.14.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-class-static-block/-/plugin-syntax-class-static-block-7.14.5.tgz", + "integrity": "sha512-b+YyPmr6ldyNnM6sqYeMWE+bgJcJpO6yS4QD7ymxgH34GBPNDM/THBh8iunyvKIZztiwLH4CJZ0RxTk9emgpjw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.14.5" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-import-assertions": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-assertions/-/plugin-syntax-import-assertions-7.26.0.tgz", + "integrity": "sha512-QCWT5Hh830hK5EQa7XzuqIkQU9tT/whqbDz7kuaZMHFl1inRRg7JnuAEOQ0Ur0QUl0NufCk1msK2BeY79Aj/eg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-import-attributes": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-attributes/-/plugin-syntax-import-attributes-7.26.0.tgz", + "integrity": "sha512-e2dttdsJ1ZTpi3B9UYGLw41hifAubg19AtCu/2I/F1QNVclOBr1dYpTdmdyZ84Xiz43BS/tCUkMAZNLv12Pi+A==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + 
}, + "node_modules/@babel/plugin-syntax-import-meta": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-meta/-/plugin-syntax-import-meta-7.10.4.tgz", + "integrity": "sha512-Yqfm+XDx0+Prh3VSeEQCPU81yC+JWZ2pDPFSS4ZdpfZhp4MkFMaDC1UqseovEKwSUpnIL7+vK+Clp7bfh0iD7g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-json-strings": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-json-strings/-/plugin-syntax-json-strings-7.8.3.tgz", + "integrity": "sha512-lY6kdGpWHvjoe2vk4WrAapEuBR69EMxZl+RoGRhrFGNYVK8mOPAW8VfbT/ZgrFbXlDNiiaxQnAtgVCZ6jv30EA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-jsx": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-jsx/-/plugin-syntax-jsx-7.25.9.tgz", + "integrity": "sha512-ld6oezHQMZsZfp6pWtbjaNDF2tiiCYYDqQszHt5VV437lewP9aSi2Of99CK0D0XB21k7FLgnLcmQKyKzynfeAA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-logical-assignment-operators": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-logical-assignment-operators/-/plugin-syntax-logical-assignment-operators-7.10.4.tgz", + "integrity": "sha512-d8waShlpFDinQ5MtvGU9xDAOzKH47+FFoney2baFIoMr952hKOLp1HR7VszoZvOsV/4+RRszNY7D17ba0te0ig==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + 
"node_modules/@babel/plugin-syntax-nullish-coalescing-operator": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-nullish-coalescing-operator/-/plugin-syntax-nullish-coalescing-operator-7.8.3.tgz", + "integrity": "sha512-aSff4zPII1u2QD7y+F8oDsz19ew4IGEJg9SVW+bqwpwtfFleiQDMdzA/R+UlWDzfnHFCxxleFT0PMIrR36XLNQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-numeric-separator": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-numeric-separator/-/plugin-syntax-numeric-separator-7.10.4.tgz", + "integrity": "sha512-9H6YdfkcK/uOnY/K7/aA2xpzaAgkQn37yzWUMRK7OaPOqOpGS1+n0H5hxT9AUw9EsSjPW8SVyMJwYRtWs3X3ug==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-object-rest-spread": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-object-rest-spread/-/plugin-syntax-object-rest-spread-7.8.3.tgz", + "integrity": "sha512-XoqMijGZb9y3y2XskN+P1wUGiVwWZ5JmoDRwx5+3GmEplNyVM2s2Dg8ILFQm8rWM48orGy5YpI5Bl8U1y7ydlA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-optional-catch-binding": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-catch-binding/-/plugin-syntax-optional-catch-binding-7.8.3.tgz", + "integrity": "sha512-6VPD0Pc1lpTqw0aKoeRTMiB+kWhAoT24PA+ksWSBrFtl5SIRVpZlwN3NNPQjehA2E/91FV3RjLWoVTglWcSV3Q==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": 
"^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-optional-chaining": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-chaining/-/plugin-syntax-optional-chaining-7.8.3.tgz", + "integrity": "sha512-KoK9ErH1MBlCPxV0VANkXW2/dw4vlbGDrFgz8bmUsBGYkFRcbRwMh6cIJubdPrkxRwuGdtCk0v/wPTKbQgBjkg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-private-property-in-object": { + "version": "7.14.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-private-property-in-object/-/plugin-syntax-private-property-in-object-7.14.5.tgz", + "integrity": "sha512-0wVnp9dxJ72ZUJDV27ZfbSj6iHLoytYZmh3rFcxNnvsJF3ktkzLDZPy/mA17HGsaQT3/DQsWYX1f1QGWkCoVUg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.14.5" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-top-level-await": { + "version": "7.14.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-top-level-await/-/plugin-syntax-top-level-await-7.14.5.tgz", + "integrity": "sha512-hx++upLv5U1rgYfwe1xBQUhRmU41NEvpUvrp8jkrSCdvGSnM5/qdRMtylJ6PG5OFkBaHkbTAKTnd3/YyESRHFw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.14.5" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-typescript": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-typescript/-/plugin-syntax-typescript-7.25.9.tgz", + "integrity": "sha512-hjMgRy5hb8uJJjUcdWunWVcoi9bGpJp8p5Ol1229PoN6aytsLwNMgmdftO23wnCLMfVmTwZDWMPNq/D1SY60JQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": 
"^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-unicode-sets-regex": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-unicode-sets-regex/-/plugin-syntax-unicode-sets-regex-7.18.6.tgz", + "integrity": "sha512-727YkEAPwSIQTv5im8QHz3upqp92JTWhidIC81Tdx4VJYIte/VndKf1qKrfnnhPLiPghStWfvC/iFaMCQu7Nqg==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.18.6", + "@babel/helper-plugin-utils": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-transform-arrow-functions": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-arrow-functions/-/plugin-transform-arrow-functions-7.25.9.tgz", + "integrity": "sha512-6jmooXYIwn9ca5/RylZADJ+EnSxVUS5sjeJ9UPk6RWRzXCmOJCy6dqItPJFpw2cuCangPK4OYr5uhGKcmrm5Qg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-async-generator-functions": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-generator-functions/-/plugin-transform-async-generator-functions-7.25.9.tgz", + "integrity": "sha512-RXV6QAzTBbhDMO9fWwOmwwTuYaiPbggWQ9INdZqAYeSHyG7FzQ+nOZaUUjNwKv9pV3aE4WFqFm1Hnbci5tBCAw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-remap-async-to-generator": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-async-to-generator": { + "version": "7.25.9", + 
"resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-to-generator/-/plugin-transform-async-to-generator-7.25.9.tgz", + "integrity": "sha512-NT7Ejn7Z/LjUH0Gv5KsBCxh7BH3fbLTV0ptHvpeMvrt3cPThHfJfst9Wrb7S8EvJ7vRTFI7z+VAvFVEQn/m5zQ==", + "dev": true, + "dependencies": { + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-remap-async-to-generator": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-block-scoped-functions": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoped-functions/-/plugin-transform-block-scoped-functions-7.25.9.tgz", + "integrity": "sha512-toHc9fzab0ZfenFpsyYinOX0J/5dgJVA2fm64xPewu7CoYHWEivIWKxkK2rMi4r3yQqLnVmheMXRdG+k239CgA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-block-scoping": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoping/-/plugin-transform-block-scoping-7.25.9.tgz", + "integrity": "sha512-1F05O7AYjymAtqbsFETboN1NvBdcnzMerO+zlMyJBEz6WkMdejvGWw9p05iTSjC85RLlBseHHQpYaM4gzJkBGg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-class-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-properties/-/plugin-transform-class-properties-7.25.9.tgz", + "integrity": 
"sha512-bbMAII8GRSkcd0h0b4X+36GksxuheLFjP65ul9w6C3KgAamI3JqErNgSrosX6ZPj+Mpim5VvEbawXxJCyEUV3Q==", + "dev": true, + "dependencies": { + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-class-static-block": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-static-block/-/plugin-transform-class-static-block-7.26.0.tgz", + "integrity": "sha512-6J2APTs7BDDm+UMqP1useWqhcRAXo0WIoVj26N7kPFB6S73Lgvyka4KTZYIxtgYXiN5HTyRObA72N2iu628iTQ==", + "dev": true, + "dependencies": { + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.12.0" + } + }, + "node_modules/@babel/plugin-transform-classes": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-classes/-/plugin-transform-classes-7.25.9.tgz", + "integrity": "sha512-mD8APIXmseE7oZvZgGABDyM34GUmK45Um2TXiBUt7PnuAxrgoSVf123qUzPxEr/+/BHrRn5NMZCdE2m/1F8DGg==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-replace-supers": "^7.25.9", + "@babel/traverse": "^7.25.9", + "globals": "^11.1.0" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-computed-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-computed-properties/-/plugin-transform-computed-properties-7.25.9.tgz", + "integrity": 
"sha512-HnBegGqXZR12xbcTHlJ9HGxw1OniltT26J5YpfruGqtUHlz/xKf/G2ak9e+t0rVqrjXa9WOhvYPz1ERfMj23AA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/template": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-destructuring": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-destructuring/-/plugin-transform-destructuring-7.25.9.tgz", + "integrity": "sha512-WkCGb/3ZxXepmMiX101nnGiU+1CAdut8oHyEOHxkKuS1qKpU2SMXE2uSvfz8PBuLd49V6LEsbtyPhWC7fnkgvQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-dotall-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dotall-regex/-/plugin-transform-dotall-regex-7.25.9.tgz", + "integrity": "sha512-t7ZQ7g5trIgSRYhI9pIJtRl64KHotutUJsh4Eze5l7olJv+mRSg4/MmbZ0tv1eeqRbdvo/+trvJD/Oc5DmW2cA==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-duplicate-keys": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-keys/-/plugin-transform-duplicate-keys-7.25.9.tgz", + "integrity": "sha512-LZxhJ6dvBb/f3x8xwWIuyiAHy56nrRG3PeYTpBkkzkYRRQ6tJLu68lEF5VIqMUZiAV7a8+Tb78nEoMCMcqjXBw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + 
"node_modules/@babel/plugin-transform-duplicate-named-capturing-groups-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-named-capturing-groups-regex/-/plugin-transform-duplicate-named-capturing-groups-regex-7.25.9.tgz", + "integrity": "sha512-0UfuJS0EsXbRvKnwcLjFtJy/Sxc5J5jhLHnFhy7u4zih97Hz6tJkLU+O+FMMrNZrosUPxDi6sYxJ/EA8jDiAog==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-transform-dynamic-import": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dynamic-import/-/plugin-transform-dynamic-import-7.25.9.tgz", + "integrity": "sha512-GCggjexbmSLaFhqsojeugBpeaRIgWNTcgKVq/0qIteFEqY2A+b9QidYadrWlnbWQUrW5fn+mCvf3tr7OeBFTyg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-exponentiation-operator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-exponentiation-operator/-/plugin-transform-exponentiation-operator-7.25.9.tgz", + "integrity": "sha512-KRhdhlVk2nObA5AYa7QMgTMTVJdfHprfpAk4DjZVtllqRg9qarilstTKEhpVjyt+Npi8ThRyiV8176Am3CodPA==", + "dev": true, + "dependencies": { + "@babel/helper-builder-binary-assignment-operator-visitor": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-export-namespace-from": { + "version": "7.25.9", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-transform-export-namespace-from/-/plugin-transform-export-namespace-from-7.25.9.tgz", + "integrity": "sha512-2NsEz+CxzJIVOPx2o9UsW1rXLqtChtLoVnwYHHiB04wS5sgn7mrV45fWMBX0Kk+ub9uXytVYfNP2HjbVbCB3Ww==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-for-of": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-for-of/-/plugin-transform-for-of-7.25.9.tgz", + "integrity": "sha512-LqHxduHoaGELJl2uhImHwRQudhCM50pT46rIBNvtT/Oql3nqiS3wOwP+5ten7NpYSXrrVLgtZU3DZmPtWZo16A==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-function-name": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-function-name/-/plugin-transform-function-name-7.25.9.tgz", + "integrity": "sha512-8lP+Yxjv14Vc5MuWBpJsoUCd3hD6V9DgBon2FVYL4jJgbnVQ9fTgYmonchzZJOVNgzEgbxp4OwAf6xz6M/14XA==", + "dev": true, + "dependencies": { + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-json-strings": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-json-strings/-/plugin-transform-json-strings-7.25.9.tgz", + "integrity": "sha512-xoTMk0WXceiiIvsaquQQUaLLXSW1KJ159KP87VilruQm0LNNGxWzahxSS6T6i4Zg3ezp4vA4zuwiNUR53qmQAw==", + "dev": 
true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-literals": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-literals/-/plugin-transform-literals-7.25.9.tgz", + "integrity": "sha512-9N7+2lFziW8W9pBl2TzaNht3+pgMIRP74zizeCSrtnSKVdUl8mAjjOP2OOVQAfZ881P2cNjDj1uAMEdeD50nuQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-logical-assignment-operators": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-logical-assignment-operators/-/plugin-transform-logical-assignment-operators-7.25.9.tgz", + "integrity": "sha512-wI4wRAzGko551Y8eVf6iOY9EouIDTtPb0ByZx+ktDGHwv6bHFimrgJM/2T021txPZ2s4c7bqvHbd+vXG6K948Q==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-member-expression-literals": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-member-expression-literals/-/plugin-transform-member-expression-literals-7.25.9.tgz", + "integrity": "sha512-PYazBVfofCQkkMzh2P6IdIUaCEWni3iYEerAsRWuVd8+jlM1S9S9cz1dF9hIzyoZ8IA3+OwVYIp9v9e+GbgZhA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-modules-amd": { + "version": "7.25.9", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-transform-modules-amd/-/plugin-transform-modules-amd-7.25.9.tgz", + "integrity": "sha512-g5T11tnI36jVClQlMlt4qKDLlWnG5pP9CSM4GhdRciTNMRgkfpo5cR6b4rGIOYPgRRuFAvwjPQ/Yk+ql4dyhbw==", + "dev": true, + "dependencies": { + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-modules-commonjs": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-commonjs/-/plugin-transform-modules-commonjs-7.25.9.tgz", + "integrity": "sha512-dwh2Ol1jWwL2MgkCzUSOvfmKElqQcuswAZypBSUsScMXvgdT8Ekq5YA6TtqpTVWH+4903NmboMuH1o9i8Rxlyg==", + "dev": true, + "dependencies": { + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-simple-access": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-modules-systemjs": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-systemjs/-/plugin-transform-modules-systemjs-7.25.9.tgz", + "integrity": "sha512-hyss7iIlH/zLHaehT+xwiymtPOpsiwIIRlCAOwBB04ta5Tt+lNItADdlXw3jAWZ96VJ2jlhl/c+PNIQPKNfvcA==", + "dev": true, + "dependencies": { + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9", + "@babel/traverse": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-modules-umd": { + "version": "7.25.9", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-transform-modules-umd/-/plugin-transform-modules-umd-7.25.9.tgz", + "integrity": "sha512-bS9MVObUgE7ww36HEfwe6g9WakQ0KF07mQF74uuXdkoziUPfKyu/nIm663kz//e5O1nPInPFx36z7WJmJ4yNEw==", + "dev": true, + "dependencies": { + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-named-capturing-groups-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-named-capturing-groups-regex/-/plugin-transform-named-capturing-groups-regex-7.25.9.tgz", + "integrity": "sha512-oqB6WHdKTGl3q/ItQhpLSnWWOpjUJLsOCLVyeFgeTktkBSCiurvPOsyt93gibI9CmuKvTUEtWmG5VhZD+5T/KA==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-transform-new-target": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-new-target/-/plugin-transform-new-target-7.25.9.tgz", + "integrity": "sha512-U/3p8X1yCSoKyUj2eOBIx3FOn6pElFOKvAAGf8HTtItuPyB+ZeOqfn+mvTtg9ZlOAjsPdK3ayQEjqHjU/yLeVQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-nullish-coalescing-operator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-nullish-coalescing-operator/-/plugin-transform-nullish-coalescing-operator-7.25.9.tgz", + "integrity": 
"sha512-ENfftpLZw5EItALAD4WsY/KUWvhUlZndm5GC7G3evUsVeSJB6p0pBeLQUnRnBCBx7zV0RKQjR9kCuwrsIrjWog==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-numeric-separator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-numeric-separator/-/plugin-transform-numeric-separator-7.25.9.tgz", + "integrity": "sha512-TlprrJ1GBZ3r6s96Yq8gEQv82s8/5HnCVHtEJScUj90thHQbwe+E5MLhi2bbNHBEJuzrvltXSru+BUxHDoog7Q==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-object-rest-spread": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-rest-spread/-/plugin-transform-object-rest-spread-7.25.9.tgz", + "integrity": "sha512-fSaXafEE9CVHPweLYw4J0emp1t8zYTXyzN3UuG+lylqkvYd7RMrsOQ8TYx5RF231be0vqtFC6jnx3UmpJmKBYg==", + "dev": true, + "dependencies": { + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/plugin-transform-parameters": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-object-super": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-super/-/plugin-transform-object-super-7.25.9.tgz", + "integrity": "sha512-Kj/Gh+Rw2RNLbCK1VAWj2U48yxxqL2x0k10nPtSdRa0O2xnHXalD0s+o1A6a0W43gJ00ANo38jxkQreckOzv5A==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-replace-supers": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + 
"peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-optional-catch-binding": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-catch-binding/-/plugin-transform-optional-catch-binding-7.25.9.tgz", + "integrity": "sha512-qM/6m6hQZzDcZF3onzIhZeDHDO43bkNNlOX0i8n3lR6zLbu0GN2d8qfM/IERJZYauhAHSLHy39NF0Ctdvcid7g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-optional-chaining": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-chaining/-/plugin-transform-optional-chaining-7.25.9.tgz", + "integrity": "sha512-6AvV0FsLULbpnXeBjrY4dmWF8F7gf8QnvTEoO/wX/5xm/xE1Xo8oPuD3MPS+KS9f9XBEAWN7X1aWr4z9HdOr7A==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-parameters": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-parameters/-/plugin-transform-parameters-7.25.9.tgz", + "integrity": "sha512-wzz6MKwpnshBAiRmn4jR8LYz/g8Ksg0o80XmwZDlordjwEk9SxBzTWC7F5ef1jhbrbOW2DJ5J6ayRukrJmnr0g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-private-methods": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-private-methods/-/plugin-transform-private-methods-7.25.9.tgz", + 
"integrity": "sha512-D/JUozNpQLAPUVusvqMxyvjzllRaF8/nSrP1s2YGQT/W4LHK4xxsMcHjhOGTS01mp9Hda8nswb+FblLdJornQw==", + "dev": true, + "dependencies": { + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-private-property-in-object": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-private-property-in-object/-/plugin-transform-private-property-in-object-7.25.9.tgz", + "integrity": "sha512-Evf3kcMqzXA3xfYJmZ9Pg1OvKdtqsDMSWBDzZOPLvHiTt36E75jLDQo5w1gtRU95Q4E5PDttrTf25Fw8d/uWLw==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-property-literals": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-property-literals/-/plugin-transform-property-literals-7.25.9.tgz", + "integrity": "sha512-IvIUeV5KrS/VPavfSM/Iu+RE6llrHrYIKY1yfCzyO/lMXHQ+p7uGhonmGVisv6tSBSVgWzMBohTcvkC9vQcQFA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-constant-elements": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-constant-elements/-/plugin-transform-react-constant-elements-7.25.9.tgz", + "integrity": "sha512-Ncw2JFsJVuvfRsa2lSHiC55kETQVLSnsYGQ1JDDwkUeWGTL/8Tom8aLTnlqgoeuopWrbbGndrc9AlLYrIosrow==", + "dev": true, + "dependencies": { + 
"@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-display-name": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-display-name/-/plugin-transform-react-display-name-7.25.9.tgz", + "integrity": "sha512-KJfMlYIUxQB1CJfO3e0+h0ZHWOTLCPP115Awhaz8U0Zpq36Gl/cXlpoyMRnUWlhNUBAzldnCiAZNvCDj7CrKxQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx/-/plugin-transform-react-jsx-7.25.9.tgz", + "integrity": "sha512-s5XwpQYCqGerXl+Pu6VDL3x0j2d82eiV77UJ8a2mDHAW7j9SWRqQ2y1fNo1Z74CdcYipl5Z41zvjj4Nfzq36rw==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/plugin-syntax-jsx": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-development": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-development/-/plugin-transform-react-jsx-development-7.25.9.tgz", + "integrity": "sha512-9mj6rm7XVYs4mdLIpbZnHOYdpW42uoiBCTVowg7sP1thUOiANgMb4UtpRivR0pp5iL+ocvUv7X4mZgFRpJEzGw==", + "dev": true, + "dependencies": { + "@babel/plugin-transform-react-jsx": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-pure-annotations": 
{ + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-pure-annotations/-/plugin-transform-react-pure-annotations-7.25.9.tgz", + "integrity": "sha512-KQ/Takk3T8Qzj5TppkS1be588lkbTp5uj7w6a0LeQaTMSckU/wK0oJ/pih+T690tkgI5jfmg2TqDJvd41Sj1Cg==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-regenerator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regenerator/-/plugin-transform-regenerator-7.25.9.tgz", + "integrity": "sha512-vwDcDNsgMPDGP0nMqzahDWE5/MLcX8sv96+wfX7as7LoF/kr97Bo/7fI00lXY4wUXYfVmwIIyG80fGZ1uvt2qg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "regenerator-transform": "^0.15.2" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-regexp-modifiers": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regexp-modifiers/-/plugin-transform-regexp-modifiers-7.26.0.tgz", + "integrity": "sha512-vN6saax7lrA2yA/Pak3sCxuD6F5InBjn9IcrIKQPjpsLvuHYLVroTxjdlVRHjjBWxKOqIwpTXDkOssYT4BFdRw==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/plugin-transform-reserved-words": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-reserved-words/-/plugin-transform-reserved-words-7.25.9.tgz", + "integrity": 
"sha512-7DL7DKYjn5Su++4RXu8puKZm2XBPHyjWLUidaPEkCUBbE7IPcsrkRHggAOOKydH1dASWdcUBxrkOGNxUv5P3Jg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-runtime": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-runtime/-/plugin-transform-runtime-7.25.9.tgz", + "integrity": "sha512-nZp7GlEl+yULJrClz0SwHPqir3lc0zsPrDHQUcxGspSL7AKrexNSEfTbfqnDNJUO13bgKyfuOLMF8Xqtu8j3YQ==", + "dev": true, + "dependencies": { + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "babel-plugin-polyfill-corejs2": "^0.4.10", + "babel-plugin-polyfill-corejs3": "^0.10.6", + "babel-plugin-polyfill-regenerator": "^0.6.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-shorthand-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-shorthand-properties/-/plugin-transform-shorthand-properties-7.25.9.tgz", + "integrity": "sha512-MUv6t0FhO5qHnS/W8XCbHmiRWOphNufpE1IVxhK5kuN3Td9FT1x4rx4K42s3RYdMXCXpfWkGSbCSd0Z64xA7Ng==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-spread": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-spread/-/plugin-transform-spread-7.25.9.tgz", + "integrity": "sha512-oNknIB0TbURU5pqJFVbOOFspVlrpVwo2H1+HUIsVDvp5VauGGDP1ZEvO8Nn5xyMEs3dakajOxlmkNW7kNgSm6A==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + 
"@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-sticky-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-sticky-regex/-/plugin-transform-sticky-regex-7.25.9.tgz", + "integrity": "sha512-WqBUSgeVwucYDP9U/xNRQam7xV8W5Zf+6Eo7T2SRVUFlhRiMNFdFz58u0KZmCVVqs2i7SHgpRnAhzRNmKfi2uA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-template-literals": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-template-literals/-/plugin-transform-template-literals-7.25.9.tgz", + "integrity": "sha512-o97AE4syN71M/lxrCtQByzphAdlYluKPDBzDVzMmfCobUjjhAryZV0AIpRPrxN0eAkxXO6ZLEScmt+PNhj2OTw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-typeof-symbol": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-typeof-symbol/-/plugin-transform-typeof-symbol-7.25.9.tgz", + "integrity": "sha512-v61XqUMiueJROUv66BVIOi0Fv/CUuZuZMl5NkRoCVxLAnMexZ0A3kMe7vvZ0nulxMuMp0Mk6S5hNh48yki08ZA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-typescript": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-typescript/-/plugin-transform-typescript-7.25.9.tgz", + 
"integrity": "sha512-7PbZQZP50tzv2KGGnhh82GSyMB01yKY9scIjf1a+GfZCtInOWqUH5+1EBU4t9fyR5Oykkkc9vFTs4OHrhHXljQ==", + "dev": true, + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9", + "@babel/plugin-syntax-typescript": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-unicode-escapes": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-escapes/-/plugin-transform-unicode-escapes-7.25.9.tgz", + "integrity": "sha512-s5EDrE6bW97LtxOcGj1Khcx5AaXwiMmi4toFWRDP9/y0Woo6pXC+iyPu/KuhKtfSrNFd7jJB+/fkOtZy6aIC6Q==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-unicode-property-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-property-regex/-/plugin-transform-unicode-property-regex-7.25.9.tgz", + "integrity": "sha512-Jt2d8Ga+QwRluxRQ307Vlxa6dMrYEMZCgGxoPR8V52rxPyldHu3hdlHspxaqYmE7oID5+kB+UKUB/eWS+DkkWg==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-unicode-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-regex/-/plugin-transform-unicode-regex-7.25.9.tgz", + "integrity": 
"sha512-yoxstj7Rg9dlNn9UQxzk4fcNivwv4nUYz7fYXBaKxvw/lnmPuOm/ikoELygbYq68Bls3D/D+NBPHiLwZdZZ4HA==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-unicode-sets-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-sets-regex/-/plugin-transform-unicode-sets-regex-7.25.9.tgz", + "integrity": "sha512-8BYqO3GeVNHtx69fdPshN3fnzUNLrWdHhk/icSwigksJGczKSizZ+Z6SBCxTs723Fr5VSNorTIK7a+R2tISvwQ==", + "dev": true, + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/preset-env": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/preset-env/-/preset-env-7.26.0.tgz", + "integrity": "sha512-H84Fxq0CQJNdPFT2DrfnylZ3cf5K43rGfWK4LJGPpjKHiZlk0/RzwEus3PDDZZg+/Er7lCA03MVacueUuXdzfw==", + "dev": true, + "dependencies": { + "@babel/compat-data": "^7.26.0", + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-bugfix-firefox-class-in-computed-class-key": "^7.25.9", + "@babel/plugin-bugfix-safari-class-field-initializer-scope": "^7.25.9", + "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": "^7.25.9", + "@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": "^7.25.9", + "@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": "^7.25.9", + "@babel/plugin-proposal-private-property-in-object": "7.21.0-placeholder-for-preset-env.2", + "@babel/plugin-syntax-import-assertions": "^7.26.0", + 
"@babel/plugin-syntax-import-attributes": "^7.26.0", + "@babel/plugin-syntax-unicode-sets-regex": "^7.18.6", + "@babel/plugin-transform-arrow-functions": "^7.25.9", + "@babel/plugin-transform-async-generator-functions": "^7.25.9", + "@babel/plugin-transform-async-to-generator": "^7.25.9", + "@babel/plugin-transform-block-scoped-functions": "^7.25.9", + "@babel/plugin-transform-block-scoping": "^7.25.9", + "@babel/plugin-transform-class-properties": "^7.25.9", + "@babel/plugin-transform-class-static-block": "^7.26.0", + "@babel/plugin-transform-classes": "^7.25.9", + "@babel/plugin-transform-computed-properties": "^7.25.9", + "@babel/plugin-transform-destructuring": "^7.25.9", + "@babel/plugin-transform-dotall-regex": "^7.25.9", + "@babel/plugin-transform-duplicate-keys": "^7.25.9", + "@babel/plugin-transform-duplicate-named-capturing-groups-regex": "^7.25.9", + "@babel/plugin-transform-dynamic-import": "^7.25.9", + "@babel/plugin-transform-exponentiation-operator": "^7.25.9", + "@babel/plugin-transform-export-namespace-from": "^7.25.9", + "@babel/plugin-transform-for-of": "^7.25.9", + "@babel/plugin-transform-function-name": "^7.25.9", + "@babel/plugin-transform-json-strings": "^7.25.9", + "@babel/plugin-transform-literals": "^7.25.9", + "@babel/plugin-transform-logical-assignment-operators": "^7.25.9", + "@babel/plugin-transform-member-expression-literals": "^7.25.9", + "@babel/plugin-transform-modules-amd": "^7.25.9", + "@babel/plugin-transform-modules-commonjs": "^7.25.9", + "@babel/plugin-transform-modules-systemjs": "^7.25.9", + "@babel/plugin-transform-modules-umd": "^7.25.9", + "@babel/plugin-transform-named-capturing-groups-regex": "^7.25.9", + "@babel/plugin-transform-new-target": "^7.25.9", + "@babel/plugin-transform-nullish-coalescing-operator": "^7.25.9", + "@babel/plugin-transform-numeric-separator": "^7.25.9", + "@babel/plugin-transform-object-rest-spread": "^7.25.9", + "@babel/plugin-transform-object-super": "^7.25.9", + 
"@babel/plugin-transform-optional-catch-binding": "^7.25.9", + "@babel/plugin-transform-optional-chaining": "^7.25.9", + "@babel/plugin-transform-parameters": "^7.25.9", + "@babel/plugin-transform-private-methods": "^7.25.9", + "@babel/plugin-transform-private-property-in-object": "^7.25.9", + "@babel/plugin-transform-property-literals": "^7.25.9", + "@babel/plugin-transform-regenerator": "^7.25.9", + "@babel/plugin-transform-regexp-modifiers": "^7.26.0", + "@babel/plugin-transform-reserved-words": "^7.25.9", + "@babel/plugin-transform-shorthand-properties": "^7.25.9", + "@babel/plugin-transform-spread": "^7.25.9", + "@babel/plugin-transform-sticky-regex": "^7.25.9", + "@babel/plugin-transform-template-literals": "^7.25.9", + "@babel/plugin-transform-typeof-symbol": "^7.25.9", + "@babel/plugin-transform-unicode-escapes": "^7.25.9", + "@babel/plugin-transform-unicode-property-regex": "^7.25.9", + "@babel/plugin-transform-unicode-regex": "^7.25.9", + "@babel/plugin-transform-unicode-sets-regex": "^7.25.9", + "@babel/preset-modules": "0.1.6-no-external-plugins", + "babel-plugin-polyfill-corejs2": "^0.4.10", + "babel-plugin-polyfill-corejs3": "^0.10.6", + "babel-plugin-polyfill-regenerator": "^0.6.1", + "core-js-compat": "^3.38.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/preset-modules": { + "version": "0.1.6-no-external-plugins", + "resolved": "https://registry.npmjs.org/@babel/preset-modules/-/preset-modules-0.1.6-no-external-plugins.tgz", + "integrity": "sha512-HrcgcIESLm9aIR842yhJ5RWan/gebQUJ6E/E5+rf0y9o6oj7w0Br+sWuL6kEQ/o/AdfvR1Je9jG18/gnpwjEyA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.0.0", + "@babel/types": "^7.4.4", + "esutils": "^2.0.2" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/@babel/preset-react": { + "version": "7.25.9", + 
"resolved": "https://registry.npmjs.org/@babel/preset-react/-/preset-react-7.25.9.tgz", + "integrity": "sha512-D3to0uSPiWE7rBrdIICCd0tJSIGpLaaGptna2+w7Pft5xMqLpA1sz99DK5TZ1TjGbdQ/VI1eCSZ06dv3lT4JOw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-transform-react-display-name": "^7.25.9", + "@babel/plugin-transform-react-jsx": "^7.25.9", + "@babel/plugin-transform-react-jsx-development": "^7.25.9", + "@babel/plugin-transform-react-pure-annotations": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/preset-typescript": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/preset-typescript/-/preset-typescript-7.26.0.tgz", + "integrity": "sha512-NMk1IGZ5I/oHhoXEElcm+xUnL/szL6xflkFZmoEU9xj1qSJXpiS7rsspYo92B4DRCDvZn2erT5LdsCeXAKNCkg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-syntax-jsx": "^7.25.9", + "@babel/plugin-transform-modules-commonjs": "^7.25.9", + "@babel/plugin-transform-typescript": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/register": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/register/-/register-7.25.9.tgz", + "integrity": "sha512-8D43jXtGsYmEeDvm4MWHYUpWf8iiXgWYx3fW7E7Wb7Oe6FWqJPl5K6TuFW0dOwNZzEE5rjlaSJYH9JjrUKJszA==", + "dev": true, + "dependencies": { + "clone-deep": "^4.0.1", + "find-cache-dir": "^2.0.0", + "make-dir": "^2.1.0", + "pirates": "^4.0.6", + "source-map-support": "^0.5.16" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/runtime": { + "version": "7.28.6", 
+ "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + "integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/template": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", + "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.25.9.tgz", + "integrity": "sha512-ZCuvfwOwlz/bawvAuvcj8rrithP2/N55Tzz342AkTvq4qaWbGfmCk/tKhNaV2cthijKrPAA8SRJV5WWe7IBMJw==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.25.9", + "@babel/generator": "^7.25.9", + "@babel/parser": "^7.25.9", + "@babel/template": "^7.25.9", + "@babel/types": "^7.25.9", + "debug": "^4.3.1", + "globals": "^11.1.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.28.6.tgz", + "integrity": "sha512-0ZrskXVEHSWIqZM/sQZ4EV3jZJXRkio/WCxaqKZP1g//CEWEPSfeZFcms4XeKBCHU0ZKnIkdJeU/kF+eRp5lBg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@bcoe/v8-coverage": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz", + "integrity": 
"sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", + "dev": true + }, + "node_modules/@cfaester/enzyme-adapter-react-18": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/@cfaester/enzyme-adapter-react-18/-/enzyme-adapter-react-18-0.8.0.tgz", + "integrity": "sha512-3Z3ThTUouHwz8oIyhTYQljEMNRFtlVyc3VOOHCbxs47U6cnXs8K9ygi/c1tv49s7MBlTXeIcuN+Ttd9aPtILFQ==", + "dev": true, + "dependencies": { + "enzyme-shallow-equal": "^1.0.0", + "function.prototype.name": "^1.1.6", + "has": "^1.0.4", + "react-is": "^18.2.0", + "react-shallow-renderer": "^16.15.0" + }, + "peerDependencies": { + "enzyme": "^3.11.0", + "react": ">=18", + "react-dom": ">=18" + } + }, + "node_modules/@cloudscape-design/chat-components": { + "version": "1.0.62", + "resolved": "https://registry.npmjs.org/@cloudscape-design/chat-components/-/chat-components-1.0.62.tgz", + "integrity": "sha512-8Tqc5JqLmSMQe2nG0q1I7Q8m08kfuLJCFhTqgfYZDKu9g/HVP8d8Q42FPCcHesGWq1xM+S1wSioxJ5uhdzWE8A==", + "license": "Apache-2.0", + "dependencies": { + "@cloudscape-design/component-toolkit": "^1.0.0-beta", + "@cloudscape-design/test-utils-core": "^1.0.0", + "clsx": "^1.2.1" + }, + "peerDependencies": { + "@cloudscape-design/components": "^3", + "react": ">=18.2.0" + } + }, + "node_modules/@cloudscape-design/collection-hooks": { + "version": "1.0.74", + "resolved": "https://registry.npmjs.org/@cloudscape-design/collection-hooks/-/collection-hooks-1.0.74.tgz", + "integrity": "sha512-yAcD7vjFqbwqMCamUcKRXp403u8RcmC9izyPEYiWod9elt7x0GT1ypPyo9ZRyQuFrBsv2nwubBUrChcYaWooZw==", + "license": "Apache-2.0", + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@cloudscape-design/component-toolkit": { + "version": "1.0.0-beta.120", + "resolved": "https://registry.npmjs.org/@cloudscape-design/component-toolkit/-/component-toolkit-1.0.0-beta.120.tgz", + 
"integrity": "sha512-QQfquFjubZvDpJ+Tlt3UHI3KWGvMhwoksY6tG7E41qOrS9y+YbDJuJyiqaCbm5S2PzZ33JBL0bWsXrJesZu6tA==", + "license": "Apache-2.0", + "dependencies": { + "@juggle/resize-observer": "^3.3.1", + "tslib": "^2.3.1" + } + }, + "node_modules/@cloudscape-design/components": { + "version": "3.0.1188", + "resolved": "https://registry.npmjs.org/@cloudscape-design/components/-/components-3.0.1188.tgz", + "integrity": "sha512-Ajk7wFr2boPO7v4pgBLjdLVcA7myBiCk5hzpzURGcg+oogX9lYtXHv80k60dqyn1kzx+J0xnZwEjLa0oRmflmA==", + "license": "Apache-2.0", + "dependencies": { + "@cloudscape-design/collection-hooks": "^1.0.0", + "@cloudscape-design/component-toolkit": "^1.0.0-beta", + "@cloudscape-design/test-utils-core": "^1.0.0", + "@cloudscape-design/theming-runtime": "^1.0.0", + "@dnd-kit/core": "^6.0.8", + "@dnd-kit/sortable": "^7.0.2", + "@dnd-kit/utilities": "^3.2.1", + "ace-builds": "^1.34.0", + "clsx": "^1.1.0", + "d3-shape": "^1.3.7", + "date-fns": "^2.25.0", + "intl-messageformat": "^10.3.1", + "mnth": "^2.0.0", + "react-keyed-flatten-children": "^2.2.1", + "react-transition-group": "^4.4.2", + "tslib": "^2.4.0", + "weekstart": "^1.1.0" + }, + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@cloudscape-design/design-tokens": { + "version": "3.0.60", + "resolved": "https://registry.npmjs.org/@cloudscape-design/design-tokens/-/design-tokens-3.0.60.tgz", + "integrity": "sha512-ybj8FfjdhuHZflVDA//ooHJdwc+vny9MESvB95AJpVDhf6PXoaOpWAObn4hkMC770Wk/YwXtKXbx7rjJJQr6ZA==", + "license": "Apache-2.0" + }, + "node_modules/@cloudscape-design/global-styles": { + "version": "1.0.45", + "resolved": "https://registry.npmjs.org/@cloudscape-design/global-styles/-/global-styles-1.0.45.tgz", + "integrity": "sha512-fSrbVpK9W+bg8tmUYqU9Wh2JGciUCGEByVUQDbgMY6feXtYEUKRP2MBL6kEHvoJB7lssZbHdh5/gYaiyxg+P5w==", + "license": "Apache-2.0" + }, + "node_modules/@cloudscape-design/test-utils-core": { + 
"version": "1.0.44", + "resolved": "https://registry.npmjs.org/@cloudscape-design/test-utils-core/-/test-utils-core-1.0.44.tgz", + "integrity": "sha512-2nGG763Nsbac03ct4KfqH+ec/0UVrs/sjILQTInAPg9em3E9W1IPbF4nV0MBKUd1irgxwKmmbBiZNFfTFjpiqA==", + "dependencies": { + "css-selector-tokenizer": "^0.8.0", + "css.escape": "^1.5.1" + } + }, + "node_modules/@cloudscape-design/theming-runtime": { + "version": "1.0.63", + "resolved": "https://registry.npmjs.org/@cloudscape-design/theming-runtime/-/theming-runtime-1.0.63.tgz", + "integrity": "sha512-PDiNTdnLCDGHTWwNOQkyF/o5OJEScDLglsIY01qNA4NesiVzK9AtSJBlPTgZFWH5Bjp6ukkxdRuX/Ak+28GGLw==", + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@cspotcode/source-map-support": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", + "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", + "dev": true, + "dependencies": { + "@jridgewell/trace-mapping": "0.3.9" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@cspotcode/source-map-support/node_modules/@jridgewell/trace-mapping": { + "version": "0.3.9", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", + "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", + "dev": true, + "dependencies": { + "@jridgewell/resolve-uri": "^3.0.3", + "@jridgewell/sourcemap-codec": "^1.4.10" + } + }, + "node_modules/@csstools/postcss-cascade-layers": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-cascade-layers/-/postcss-cascade-layers-1.1.1.tgz", + "integrity": "sha512-+KdYrpKC5TgomQr2DlZF4lDEpHcoxnj5IGddYYfBWJAKfj1JtuHUIqMa+E1pJJ+z3kvDViWMqyqPlG4Ja7amQA==", + "dev": true, + 
"dependencies": { + "@csstools/selector-specificity": "^2.0.2", + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-color-function": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-color-function/-/postcss-color-function-1.1.1.tgz", + "integrity": "sha512-Bc0f62WmHdtRDjf5f3e2STwRAl89N2CLb+9iAwzrv4L2hncrbDwnQD9PCq0gtAt7pOI2leIV08HIBUd4jxD8cw==", + "dev": true, + "dependencies": { + "@csstools/postcss-progressive-custom-properties": "^1.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-font-format-keywords": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-font-format-keywords/-/postcss-font-format-keywords-1.0.1.tgz", + "integrity": "sha512-ZgrlzuUAjXIOc2JueK0X5sZDjCtgimVp/O5CEqTcs5ShWBa6smhWYbS0x5cVc/+rycTDbjjzoP0KTDnUneZGOg==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-hwb-function": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@csstools/postcss-hwb-function/-/postcss-hwb-function-1.0.2.tgz", + "integrity": "sha512-YHdEru4o3Rsbjmu6vHy4UKOXZD+Rn2zmkAmLRfPet6+Jz4Ojw8cbWxe1n42VaXQhD3CQUXXTooIy8OkVbUcL+w==", + 
"dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-ic-unit": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-ic-unit/-/postcss-ic-unit-1.0.1.tgz", + "integrity": "sha512-Ot1rcwRAaRHNKC9tAqoqNZhjdYBzKk1POgWfhN4uCOE47ebGcLRqXjKkApVDpjifL6u2/55ekkpnFcp+s/OZUw==", + "dev": true, + "dependencies": { + "@csstools/postcss-progressive-custom-properties": "^1.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-is-pseudo-class": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/@csstools/postcss-is-pseudo-class/-/postcss-is-pseudo-class-2.0.7.tgz", + "integrity": "sha512-7JPeVVZHd+jxYdULl87lvjgvWldYu+Bc62s9vD/ED6/QTGjy0jy0US/f6BG53sVMTBJ1lzKZFpYmofBN9eaRiA==", + "dev": true, + "dependencies": { + "@csstools/selector-specificity": "^2.0.0", + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-nested-calc": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@csstools/postcss-nested-calc/-/postcss-nested-calc-1.0.0.tgz", + "integrity": "sha512-JCsQsw1wjYwv1bJmgjKSoZNvf7R6+wuHDAbi5f/7MbFhl2d/+v+TvBTU4BJH3G1X1H87dHl0mh6TfYogbT/dJQ==", + "dev": true, + 
"dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-normalize-display-values": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-normalize-display-values/-/postcss-normalize-display-values-1.0.1.tgz", + "integrity": "sha512-jcOanIbv55OFKQ3sYeFD/T0Ti7AMXc9nM1hZWu8m/2722gOTxFg7xYu4RDLJLeZmPUVQlGzo4jhzvTUq3x4ZUw==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-oklab-function": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-oklab-function/-/postcss-oklab-function-1.1.1.tgz", + "integrity": "sha512-nJpJgsdA3dA9y5pgyb/UfEzE7W5Ka7u0CX0/HIMVBNWzWemdcTH3XwANECU6anWv/ao4vVNLTMxhiPNZsTK6iA==", + "dev": true, + "dependencies": { + "@csstools/postcss-progressive-custom-properties": "^1.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-progressive-custom-properties": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@csstools/postcss-progressive-custom-properties/-/postcss-progressive-custom-properties-1.3.0.tgz", + "integrity": 
"sha512-ASA9W1aIy5ygskZYuWams4BzafD12ULvSypmaLJT2jvQ8G0M3I8PRQhC0h7mG0Z3LI05+agZjqSR9+K9yaQQjA==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.3" + } + }, + "node_modules/@csstools/postcss-stepped-value-functions": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@csstools/postcss-stepped-value-functions/-/postcss-stepped-value-functions-1.0.1.tgz", + "integrity": "sha512-dz0LNoo3ijpTOQqEJLY8nyaapl6umbmDcgj4AD0lgVQ572b2eqA1iGZYTTWhrcrHztWDDRAX2DGYyw2VBjvCvQ==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-text-decoration-shorthand": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@csstools/postcss-text-decoration-shorthand/-/postcss-text-decoration-shorthand-1.0.0.tgz", + "integrity": "sha512-c1XwKJ2eMIWrzQenN0XbcfzckOLLJiczqy+YvfGmzoVXd7pT9FfObiSEfzs84bpE/VqfpEuAZ9tCRbZkZxxbdw==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-trigonometric-functions": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@csstools/postcss-trigonometric-functions/-/postcss-trigonometric-functions-1.0.2.tgz", + "integrity": "sha512-woKaLO///4bb+zZC2s80l+7cm07M7268MsyG3M0ActXXEFi6SuhvriQYcb58iiKGbjwwIU7n45iRLEHypB47Og==", + "dev": true, + "dependencies": { + 
"postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/postcss-unset-value": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@csstools/postcss-unset-value/-/postcss-unset-value-1.0.2.tgz", + "integrity": "sha512-c8J4roPBILnelAsdLr4XOAR/GsTm0GJi4XpcfvoWk3U6KiTCqiFYc63KhRMQQX35jYMp4Ao8Ij9+IZRgMfJp1g==", + "dev": true, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/@csstools/selector-specificity": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@csstools/selector-specificity/-/selector-specificity-2.2.0.tgz", + "integrity": "sha512-+OJ9konv95ClSTOJCmMZqpd5+YGsB2S+x6w3E1oaM8UuR5j8nTNHYSz8c9BEPGDOCMQYIEEGlVPj/VY64iTbGw==", + "dev": true, + "engines": { + "node": "^14 || ^16 || >=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss-selector-parser": "^6.0.10" + } + }, + "node_modules/@discoveryjs/json-ext": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz", + "integrity": "sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw==", + "dev": true, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/@dnd-kit/accessibility": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@dnd-kit/accessibility/-/accessibility-3.1.0.tgz", + "integrity": 
"sha512-ea7IkhKvlJUv9iSHJOnxinBcoOI3ppGnnL+VDJ75O45Nss6HtZd8IdN8touXPDtASfeI2T2LImb8VOZcL47wjQ==", + "dependencies": { + "tslib": "^2.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/core": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/@dnd-kit/core/-/core-6.1.0.tgz", + "integrity": "sha512-J3cQBClB4TVxwGo3KEjssGEXNJqGVWx17aRTZ1ob0FliR5IjYgTxl5YJbKTzA6IzrtelotH19v6y7uoIRUZPSg==", + "dependencies": { + "@dnd-kit/accessibility": "^3.1.0", + "@dnd-kit/utilities": "^3.2.2", + "tslib": "^2.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/sortable": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/@dnd-kit/sortable/-/sortable-7.0.2.tgz", + "integrity": "sha512-wDkBHHf9iCi1veM834Gbk1429bd4lHX4RpAwT0y2cHLf246GAvU2sVw/oxWNpPKQNQRQaeGXhAVgrOl1IT+iyA==", + "dependencies": { + "@dnd-kit/utilities": "^3.2.0", + "tslib": "^2.0.0" + }, + "peerDependencies": { + "@dnd-kit/core": "^6.0.7", + "react": ">=16.8.0" + } + }, + "node_modules/@dnd-kit/utilities": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@dnd-kit/utilities/-/utilities-3.2.2.tgz", + "integrity": "sha512-+MKAJEOfaBe5SmV6t34p80MMKhjvUz0vRrvVJbPT0WElzaOJ/1xs+D+KDv+tD/NE5ujfrChEcshd4fLn0wpiqg==", + "dependencies": { + "tslib": "^2.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@emnapi/runtime": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.8.1.tgz", + "integrity": "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@emotion/is-prop-valid": { + "version": "1.2.2", + "resolved": 
"https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.2.tgz", + "integrity": "sha512-uNsoYd37AFmaCdXlg6EYD1KaPOaRWRByMCYzbKUX4+hhMfrxdVSelShywL4JVaAeM/eHUOSprYBQls+/neX3pw==", + "dev": true, + "peer": true, + "dependencies": { + "@emotion/memoize": "^0.8.1" + } + }, + "node_modules/@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==", + "dev": true, + "peer": true + }, + "node_modules/@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==", + "dev": true, + "peer": true + }, + "node_modules/@eslint-community/eslint-utils": { + "version": "4.9.1", + "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz", + "integrity": "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "eslint-visitor-keys": "^3.4.3" + }, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + }, + "peerDependencies": { + "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" + } + }, + "node_modules/@eslint-community/regexpp": { + "version": "4.12.1", + "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.1.tgz", + "integrity": "sha512-CCZCDJuduB9OUkFkY2IgppNZMi2lBQgD2qzwXkEia16cge2pijY/aXi96CJMquDMn3nJdlPV1A5KrJEXwfLNzQ==", + "dev": true, + "engines": { + "node": "^12.0.0 || ^14.0.0 || >=16.0.0" + } + }, + 
"node_modules/@eslint/config-array": { + "version": "0.21.1", + "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.1.tgz", + "integrity": "sha512-aw1gNayWpdI/jSYVgzN5pL0cfzU02GT3NBpeT/DXbx1/1x7ZKxFPd9bwrzygx/qiwIQiJ1sw/zD8qY/kRvlGHA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@eslint/object-schema": "^2.1.7", + "debug": "^4.3.1", + "minimatch": "^3.1.2" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + } + }, + "node_modules/@eslint/config-helpers": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz", + "integrity": "sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@eslint/core": "^0.17.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + } + }, + "node_modules/@eslint/core": { + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz", + "integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@types/json-schema": "^7.0.15" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + } + }, + "node_modules/@eslint/eslintrc": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.1.tgz", + "integrity": "sha512-gtF186CXhIl1p4pJNGZw8Yc6RlshoePRvE0X91oPGb3vZ8pM3qOS9W9NGPat9LziaBV7XrJWGylNQXkGcnM3IQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ajv": "^6.12.4", + "debug": "^4.3.2", + "espree": "^10.0.1", + "globals": "^14.0.0", + "ignore": "^5.2.0", + "import-fresh": "^3.2.1", + "js-yaml": "^4.1.0", + "minimatch": "^3.1.2", + "strip-json-comments": "^3.1.1" + }, + "engines": { + 
"node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/@eslint/eslintrc/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/@eslint/eslintrc/node_modules/globals": { + "version": "14.0.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz", + "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@eslint/eslintrc/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@eslint/js": { + "version": "9.39.2", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.2.tgz", + "integrity": "sha512-q1mjIoW1VX4IvSocvM/vbTiveKC4k9eLrajNEuSsmjymSDEbpGddtpfOoN7YGAqBK3NG+uqo8ia4PDTt8buCYA==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": 
"https://eslint.org/donate" + } + }, + "node_modules/@eslint/object-schema": { + "version": "2.1.7", + "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz", + "integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + } + }, + "node_modules/@eslint/plugin-kit": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz", + "integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@eslint/core": "^0.17.0", + "levn": "^0.4.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + } + }, + "node_modules/@exodus/schemasafe": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@exodus/schemasafe/-/schemasafe-1.3.0.tgz", + "integrity": "sha512-5Aap/GaRupgNx/feGBwLLTVv8OQFfv3pq2lPRzPg9R+IOBnDgghTGW7l7EuVXOvg5cc/xSAlRW8rBrjIC3Nvqw==", + "dev": true + }, + "node_modules/@formatjs/ecma402-abstract": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/@formatjs/ecma402-abstract/-/ecma402-abstract-2.2.1.tgz", + "integrity": "sha512-O4ywpkdJybrjFc9zyL8qK5aklleIAi5O4nYhBVJaOFtCkNrnU+lKFeJOFC48zpsZQmR8Aok2V79hGpHnzbmFpg==", + "dependencies": { + "@formatjs/fast-memoize": "2.2.2", + "@formatjs/intl-localematcher": "0.5.6", + "tslib": "2" + } + }, + "node_modules/@formatjs/fast-memoize": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/@formatjs/fast-memoize/-/fast-memoize-2.2.2.tgz", + "integrity": "sha512-mzxZcS0g1pOzwZTslJOBTmLzDXseMLLvnh25ymRilCm8QLMObsQ7x/rj9GNrH0iUhZMlFisVOD6J1n6WQqpKPQ==", + "dependencies": 
{ + "tslib": "2" + } + }, + "node_modules/@formatjs/icu-messageformat-parser": { + "version": "2.9.1", + "resolved": "https://registry.npmjs.org/@formatjs/icu-messageformat-parser/-/icu-messageformat-parser-2.9.1.tgz", + "integrity": "sha512-7AYk4tjnLi5wBkxst2w7qFj38JLMJoqzj7BhdEl7oTlsWMlqwgx4p9oMmmvpXWTSDGNwOKBRc1SfwMh5MOHeNg==", + "dependencies": { + "@formatjs/ecma402-abstract": "2.2.1", + "@formatjs/icu-skeleton-parser": "1.8.5", + "tslib": "2" + } + }, + "node_modules/@formatjs/icu-skeleton-parser": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/@formatjs/icu-skeleton-parser/-/icu-skeleton-parser-1.8.5.tgz", + "integrity": "sha512-zRZ/e3B5qY2+JCLs7puTzWS1Jb+t/K+8Jur/gEZpA2EjWeLDE17nsx8thyo9P48Mno7UmafnPupV2NCJXX17Dg==", + "dependencies": { + "@formatjs/ecma402-abstract": "2.2.1", + "tslib": "2" + } + }, + "node_modules/@formatjs/intl-localematcher": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/@formatjs/intl-localematcher/-/intl-localematcher-0.5.6.tgz", + "integrity": "sha512-roz1+Ba5e23AHX6KUAWmLEyTRZegM5YDuxuvkHCyK3RJddf/UXB2f+s7pOMm9ktfPGla0g+mQXOn5vsuYirnaA==", + "dependencies": { + "tslib": "2" + } + }, + "node_modules/@hookform/resolvers": { + "version": "2.9.11", + "resolved": "https://registry.npmjs.org/@hookform/resolvers/-/resolvers-2.9.11.tgz", + "integrity": "sha512-bA3aZ79UgcHj7tFV7RlgThzwSSHZgvfbt2wprldRkYBcMopdMvHyO17Wwp/twcJasNFischFfS7oz8Katz8DdQ==", + "peerDependencies": { + "react-hook-form": "^7.0.0" + } + }, + "node_modules/@humanfs/core": { + "version": "0.19.1", + "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", + "integrity": "sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.18.0" + } + }, + 
"node_modules/@humanfs/node": { + "version": "0.16.6", + "resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.6.tgz", + "integrity": "sha512-YuI2ZHQL78Q5HbhDiBA1X4LmYdXCKCMQIfw0pw7piHJwyREFebJUvrQN4cMssyES6x+vfUbx1CIpaQUKYdQZOw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@humanfs/core": "^0.19.1", + "@humanwhocodes/retry": "^0.3.0" + }, + "engines": { + "node": ">=18.18.0" + } + }, + "node_modules/@humanfs/node/node_modules/@humanwhocodes/retry": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.3.1.tgz", + "integrity": "sha512-JBxkERygn7Bv/GbN5Rv8Ul6LVknS+5Bp6RgDC/O8gEBU/yeH5Ui5C/OlWrTb6qct7LjjfT6Re2NxB0ln0yYybA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.18" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/nzakas" + } + }, + "node_modules/@humanwhocodes/module-importer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", + "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", + "dev": true, + "engines": { + "node": ">=12.22" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/nzakas" + } + }, + "node_modules/@humanwhocodes/retry": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.3.tgz", + "integrity": "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.18" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/nzakas" + } + }, + "node_modules/@img/sharp-darwin-arm64": { + "version": 
"0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.33.5.tgz", + "integrity": "sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-arm64": "1.0.4" + } + }, + "node_modules/@img/sharp-darwin-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.33.5.tgz", + "integrity": "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-x64": "1.0.4" + } + }, + "node_modules/@img/sharp-libvips-darwin-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.0.4.tgz", + "integrity": "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-darwin-x64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.0.4.tgz", + "integrity": 
"sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-arm": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.0.5.tgz", + "integrity": "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.0.4.tgz", + "integrity": "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-s390x": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.0.4.tgz", + "integrity": "sha512-u7Wz6ntiSSgGSGcjZ55im6uvTrOxSIS8/dgoVMoiGE9I6JAfU50yH5BoDlYA1tcuGS7g/QNtetJnxA6QEsCVTA==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-x64": { + "version": "1.0.4", + "resolved": 
"https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.0.4.tgz", + "integrity": "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.0.4.tgz", + "integrity": "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-x64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.0.4.tgz", + "integrity": "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-linux-arm": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.33.5.tgz", + "integrity": "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + 
"optionalDependencies": { + "@img/sharp-libvips-linux-arm": "1.0.5" + } + }, + "node_modules/@img/sharp-linux-arm64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.33.5.tgz", + "integrity": "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm64": "1.0.4" + } + }, + "node_modules/@img/sharp-linux-s390x": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.33.5.tgz", + "integrity": "sha512-y/5PCd+mP4CA/sPDKl2961b+C9d+vPAveS33s6Z3zfASk2j5upL6fXVPZi7ztePZ5CuH+1kW8JtvxgbuXHRa4Q==", + "cpu": [ + "s390x" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-s390x": "1.0.4" + } + }, + "node_modules/@img/sharp-linux-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.33.5.tgz", + "integrity": "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-x64": "1.0.4" + } + }, + "node_modules/@img/sharp-linuxmusl-arm64": 
{ + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.33.5.tgz", + "integrity": "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" + } + }, + "node_modules/@img/sharp-linuxmusl-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.33.5.tgz", + "integrity": "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-x64": "1.0.4" + } + }, + "node_modules/@img/sharp-wasm32": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.33.5.tgz", + "integrity": "sha512-ykUW4LVGaMcU9lu9thv85CbRMAwfeadCJHRsg2GmeRa/cJxsVY9Rbd57JcMxBkKHag5U/x7TSBpScF4U8ElVzg==", + "cpu": [ + "wasm32" + ], + "dev": true, + "optional": true, + "dependencies": { + "@emnapi/runtime": "^1.2.0" + }, + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-ia32": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.33.5.tgz", + 
"integrity": "sha512-T36PblLaTwuVJ/zw/LaH0PdZkRz5rd3SmMHX8GSmR7vtNSP5Z6bQkExdSK7xGWyxLw4sUknBuugTelgw2faBbQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.33.5.tgz", + "integrity": "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@istanbuljs/load-nyc-config": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", + "integrity": "sha512-VjeHSlIzpv/NyD3N0YuHfXOPDIixcA1q2ZV98wsMqcYlPmv2n3Yb2lYP9XMElnaFVXg5A7YLTeLu6V84uQDjmQ==", + "dev": true, + "dependencies": { + "camelcase": "^5.3.1", + "find-up": "^4.1.0", + "get-package-type": "^0.1.0", + "js-yaml": "^3.13.1", + "resolve-from": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "dev": true, + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/camelcase": { + "version": "5.3.1", + "resolved": 
"https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "dev": true, + "dependencies": { + "locate-path": "^5.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/js-yaml": { + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.2.tgz", + "integrity": "sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==", + "dev": true, + "license": "MIT", + "dependencies": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/locate-path": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "dev": true, + "dependencies": { + "p-locate": "^4.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "dev": true, + "dependencies": { + "p-limit": "^2.2.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/resolve-from": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", + "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/schema": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@istanbuljs/schema/-/schema-0.1.3.tgz", + "integrity": "sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/console": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/console/-/console-29.7.0.tgz", + "integrity": "sha512-5Ni4CU7XHQi32IJ398EEP4RrB8eV09sXP2ROqD4bksHrnTree52PsxvX8tpL8LvTZ3pFzXyPbNQReSN41CAhOg==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "@types/node": "*", + "chalk": "^4.0.0", + "jest-message-util": "^29.7.0", + "jest-util": "^29.7.0", + "slash": "^3.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/console/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": 
"sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/core": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/core/-/core-29.7.0.tgz", + "integrity": "sha512-n7aeXWKMnGtDA48y8TLWJPJmLmmZ642Ceo78cYWEpiD7FzDgmNDV/GCVRorPABdXLJZ/9wzzgZAlHjXjxDHGsg==", + "dev": true, + "dependencies": { + "@jest/console": "^29.7.0", + "@jest/reporters": "^29.7.0", + "@jest/test-result": "^29.7.0", + "@jest/transform": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "ansi-escapes": "^4.2.1", + "chalk": "^4.0.0", + "ci-info": "^3.2.0", + "exit": "^0.1.2", + "graceful-fs": "^4.2.9", + "jest-changed-files": "^29.7.0", + "jest-config": "^29.7.0", + "jest-haste-map": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-regex-util": "^29.6.3", + "jest-resolve": "^29.7.0", + "jest-resolve-dependencies": "^29.7.0", + "jest-runner": "^29.7.0", + "jest-runtime": "^29.7.0", + "jest-snapshot": "^29.7.0", + "jest-util": "^29.7.0", + "jest-validate": "^29.7.0", + "jest-watcher": "^29.7.0", + "micromatch": "^4.0.4", + "pretty-format": "^29.7.0", + "slash": "^3.0.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "node-notifier": "^8.0.1 || ^9.0.0 || ^10.0.0" + }, + "peerDependenciesMeta": { + "node-notifier": { + "optional": true + } + } + }, + "node_modules/@jest/core/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + 
"node_modules/@jest/core/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/core/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/environment": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/environment/-/environment-29.7.0.tgz", + "integrity": "sha512-aQIfHDq33ExsN4jP1NWGXhxgQ/wixs60gDiKO+XVMd8Mn0NWPWgc34ZQDTb2jKaUWQ7MuwoitXAsN2XVXNMpAw==", + "dev": true, + "dependencies": { + "@jest/fake-timers": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "jest-mock": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/expect": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/expect/-/expect-29.7.0.tgz", + "integrity": "sha512-8uMeAMycttpva3P1lBHB8VciS9V0XAr3GymPpipdyQXbBcuhkLQOSe8E/p92RyAdToS6ZD1tFkX+CkhoECE0dQ==", + "dev": true, + "dependencies": { + "expect": "^29.7.0", + "jest-snapshot": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/expect-utils": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/expect-utils/-/expect-utils-29.7.0.tgz", + "integrity": 
"sha512-GlsNBWiFQFCVi9QVSx7f5AgMeLxe9YCCs5PuP2O2LdjDAA8Jh9eX7lA1Jq/xdXw3Wb3hyvlFNfZIfcRetSzYcA==", + "dev": true, + "dependencies": { + "jest-get-type": "^29.6.3" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/fake-timers": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/fake-timers/-/fake-timers-29.7.0.tgz", + "integrity": "sha512-q4DH1Ha4TTFPdxLsqDXK1d3+ioSL7yL5oCMJZgDYm6i+6CygW5E5xVr/D1HdsGxjt1ZWSfUAs9OxSB/BNelWrQ==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "@sinonjs/fake-timers": "^10.0.2", + "@types/node": "*", + "jest-message-util": "^29.7.0", + "jest-mock": "^29.7.0", + "jest-util": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/globals": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/globals/-/globals-29.7.0.tgz", + "integrity": "sha512-mpiz3dutLbkW2MNFubUGUEVLkTGiqW6yLVTA+JbP6fI6J5iL9Y0Nlg8k95pcF8ctKwCS7WVxteBs29hhfAotzQ==", + "dev": true, + "dependencies": { + "@jest/environment": "^29.7.0", + "@jest/expect": "^29.7.0", + "@jest/types": "^29.6.3", + "jest-mock": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/reporters": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/reporters/-/reporters-29.7.0.tgz", + "integrity": "sha512-DApq0KJbJOEzAFYjHADNNxAE3KbhxQB1y5Kplb5Waqw6zVbuWatSnMjE5gs8FUgEPmNsnZA3NCWl9NG0ia04Pg==", + "dev": true, + "dependencies": { + "@bcoe/v8-coverage": "^0.2.3", + "@jest/console": "^29.7.0", + "@jest/test-result": "^29.7.0", + "@jest/transform": "^29.7.0", + "@jest/types": "^29.6.3", + "@jridgewell/trace-mapping": "^0.3.18", + "@types/node": "*", + "chalk": "^4.0.0", + "collect-v8-coverage": "^1.0.0", + "exit": "^0.1.2", + "glob": "^7.1.3", + "graceful-fs": "^4.2.9", + "istanbul-lib-coverage": 
"^3.0.0", + "istanbul-lib-instrument": "^6.0.0", + "istanbul-lib-report": "^3.0.0", + "istanbul-lib-source-maps": "^4.0.0", + "istanbul-reports": "^3.1.3", + "jest-message-util": "^29.7.0", + "jest-util": "^29.7.0", + "jest-worker": "^29.7.0", + "slash": "^3.0.0", + "string-length": "^4.0.1", + "strip-ansi": "^6.0.0", + "v8-to-istanbul": "^9.0.1" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "node-notifier": "^8.0.1 || ^9.0.0 || ^10.0.0" + }, + "peerDependenciesMeta": { + "node-notifier": { + "optional": true + } + } + }, + "node_modules/@jest/reporters/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/schemas": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/@jest/schemas/-/schemas-29.6.3.tgz", + "integrity": "sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==", + "dependencies": { + "@sinclair/typebox": "^0.27.8" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/source-map": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/@jest/source-map/-/source-map-29.6.3.tgz", + "integrity": "sha512-MHjT95QuipcPrpLM+8JMSzFx6eHp5Bm+4XeFDJlwsvVBjmKNiIAvasGK2fxz2WbGRlnvqehFbh07MMa7n3YJnw==", + "dev": true, + "dependencies": { + "@jridgewell/trace-mapping": "^0.3.18", + "callsites": "^3.0.0", + "graceful-fs": "^4.2.9" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/test-result": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/test-result/-/test-result-29.7.0.tgz", + 
"integrity": "sha512-Fdx+tv6x1zlkJPcWXmMDAG2HBnaR9XPSd5aDWQVsfrZmLVT3lU1cwyxLgRmXR9yrq4NBoEm9BMsfgFzTQAbJYA==", + "dev": true, + "dependencies": { + "@jest/console": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/istanbul-lib-coverage": "^2.0.0", + "collect-v8-coverage": "^1.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/test-sequencer": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/test-sequencer/-/test-sequencer-29.7.0.tgz", + "integrity": "sha512-GQwJ5WZVrKnOJuiYiAF52UNUJXgTZx1NHjFSEB0qEMmSZKAkdMoIzw/Cj6x6NF4AvV23AUqDpFzQkN/eYCYTxw==", + "dev": true, + "dependencies": { + "@jest/test-result": "^29.7.0", + "graceful-fs": "^4.2.9", + "jest-haste-map": "^29.7.0", + "slash": "^3.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/test-sequencer/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/transform": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/@jest/transform/-/transform-29.7.0.tgz", + "integrity": "sha512-ok/BTPFzFKVMwO5eOHRrvnBVHdRy9IrsrW1GpMaQ9MCnilNLXQKmAX8s1YXDFaai9xJpac2ySzV0YeRRECr2Vw==", + "dev": true, + "dependencies": { + "@babel/core": "^7.11.6", + "@jest/types": "^29.6.3", + "@jridgewell/trace-mapping": "^0.3.18", + "babel-plugin-istanbul": "^6.1.1", + "chalk": "^4.0.0", + "convert-source-map": "^2.0.0", + "fast-json-stable-stringify": "^2.1.0", + "graceful-fs": "^4.2.9", + "jest-haste-map": "^29.7.0", + "jest-regex-util": "^29.6.3", + "jest-util": "^29.7.0", + "micromatch": "^4.0.4", + "pirates": "^4.0.4", + "slash": "^3.0.0", + "write-file-atomic": "^4.0.2" + }, + 
"engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jest/transform/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/types": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/@jest/types/-/types-29.6.3.tgz", + "integrity": "sha512-u3UPsIilWKOM3F9CXtrG8LEJmNxwoCQC/XVj4IKYXvvpx7QIi/Kg1LI5uDmDpKlac62NUtX7eLjRh+jVZcLOzw==", + "dependencies": { + "@jest/schemas": "^29.6.3", + "@types/istanbul-lib-coverage": "^2.0.0", + "@types/istanbul-reports": "^3.0.0", + "@types/node": "*", + "@types/yargs": "^17.0.8", + "chalk": "^4.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "dependencies": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": 
"sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/source-map": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz", + "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@jsdevtools/ono": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/@jsdevtools/ono/-/ono-7.1.3.tgz", + "integrity": "sha512-4JQNk+3mVzK3xh2rqd6RB4J46qUR19azEHBneZyTZM+c456qOrbbM/5xcR8huNCCcbVt7+UmizG6GuUvPvKUYg==", + "dev": true + }, + "node_modules/@jsonjoy.com/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@jsonjoy.com/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-q6XAnWQDIMA3+FTiOYajoYqySkO+JSat0ytXGSuRdq9uXE7o92gzuQwQM14xaCRlBLGq3v5miDGC4vkVTn54xA==", + "dev": true, + "engines": { + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": 
"https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": "2" + } + }, + "node_modules/@jsonjoy.com/json-pack": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@jsonjoy.com/json-pack/-/json-pack-1.1.0.tgz", + "integrity": "sha512-zlQONA+msXPPwHWZMKFVS78ewFczIll5lXiVPwFPCZUsrOKdxc2AvxU1HoNBmMRhqDZUR9HkC3UOm+6pME6Xsg==", + "dev": true, + "dependencies": { + "@jsonjoy.com/base64": "^1.1.1", + "@jsonjoy.com/util": "^1.1.2", + "hyperdyperid": "^1.2.0", + "thingies": "^1.20.0" + }, + "engines": { + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": "2" + } + }, + "node_modules/@jsonjoy.com/util": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jsonjoy.com/util/-/util-1.5.0.tgz", + "integrity": "sha512-ojoNsrIuPI9g6o8UxhraZQSyF2ByJanAY4cTFbc8Mf2AXEF4aQRGY1dJxyJpuyav8r9FGflEt/Ff3u5Nt6YMPA==", + "dev": true, + "engines": { + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": "2" + } + }, + "node_modules/@juggle/resize-observer": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/@juggle/resize-observer/-/resize-observer-3.4.0.tgz", + "integrity": "sha512-dfLbk+PwWvFzSxwk3n5ySL0hfBog779o8h68wK/7/APo/7cgyWp5jcXockbxdk5kFRkbeXWm4Fbi9FrdN381sA==" + }, + "node_modules/@leichtgewicht/ip-codec": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.5.tgz", + "integrity": "sha512-Vo+PSpZG2/fmgmiNzYK9qWRh8h/CHrwD0mo1h1DzL4yzHNSfWYujGTYsWGreD000gcgmZ7K4Ys6Tx9TxtsKdDw==", + "dev": true + }, + "node_modules/@nicolo-ribaudo/chokidar-2": { + "version": "2.1.8-no-fsevents.3", + "resolved": 
"https://registry.npmjs.org/@nicolo-ribaudo/chokidar-2/-/chokidar-2-2.1.8-no-fsevents.3.tgz", + "integrity": "sha512-s88O1aVtXftvp5bCPB7WnmXc5IwOZZ7YPuwNPt+GtOOXpPvad1LfbmjYv+qII7zP6RU2QGnqve27dnLycEnyEQ==", + "dev": true, + "optional": true + }, + "node_modules/@noble/hashes": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.4.0.tgz", + "integrity": "sha512-V1JJ1WTRUqHHrOSh597hURcMqVKVGL/ea3kv0gSnEdsEZ0/+VyPghM1lMNGc00z7CIQorSvbKpuJkxvuHbvdbg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 16" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@oazapfts/runtime": { + "version": "1.0.3", + "resolved": 
"https://registry.npmjs.org/@oazapfts/runtime/-/runtime-1.0.3.tgz", + "integrity": "sha512-8tKiYffhwTGHSHYGnZ3oneLGCjX0po/XAXQ5Ng9fqKkvIdl/xz8+Vh8i+6xjzZqvZ2pLVpUcuSfnvNI/x67L0g==", + "dev": true, + "peer": true + }, + "node_modules/@parcel/watcher": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher/-/watcher-2.4.1.tgz", + "integrity": "sha512-HNjmfLQEVRZmHRET336f20H/8kOozUGwk7yajvsonjNxbj2wBTK1WsQuHkD5yYh9RxFGL2EyDHryOihOwUoKDA==", + "dev": true, + "optional": true, + "dependencies": { + "detect-libc": "^1.0.3", + "is-glob": "^4.0.3", + "micromatch": "^4.0.5", + "node-addon-api": "^7.0.0" + }, + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + }, + "optionalDependencies": { + "@parcel/watcher-android-arm64": "2.4.1", + "@parcel/watcher-darwin-arm64": "2.4.1", + "@parcel/watcher-darwin-x64": "2.4.1", + "@parcel/watcher-freebsd-x64": "2.4.1", + "@parcel/watcher-linux-arm-glibc": "2.4.1", + "@parcel/watcher-linux-arm64-glibc": "2.4.1", + "@parcel/watcher-linux-arm64-musl": "2.4.1", + "@parcel/watcher-linux-x64-glibc": "2.4.1", + "@parcel/watcher-linux-x64-musl": "2.4.1", + "@parcel/watcher-win32-arm64": "2.4.1", + "@parcel/watcher-win32-ia32": "2.4.1", + "@parcel/watcher-win32-x64": "2.4.1" + } + }, + "node_modules/@parcel/watcher-android-arm64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-android-arm64/-/watcher-android-arm64-2.4.1.tgz", + "integrity": "sha512-LOi/WTbbh3aTn2RYddrO8pnapixAziFl6SMxHM69r3tvdSm94JtCenaKgk1GRg5FJ5wpMCpHeW+7yqPlvZv7kg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + 
"node_modules/@parcel/watcher-darwin-arm64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-arm64/-/watcher-darwin-arm64-2.4.1.tgz", + "integrity": "sha512-ln41eihm5YXIY043vBrrHfn94SIBlqOWmoROhsMVTSXGh0QahKGy77tfEywQ7v3NywyxBBkGIfrWRHm0hsKtzA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-darwin-x64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-x64/-/watcher-darwin-x64-2.4.1.tgz", + "integrity": "sha512-yrw81BRLjjtHyDu7J61oPuSoeYWR3lDElcPGJyOvIXmor6DEo7/G2u1o7I38cwlcoBHQFULqF6nesIX3tsEXMg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-freebsd-x64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-freebsd-x64/-/watcher-freebsd-x64-2.4.1.tgz", + "integrity": "sha512-TJa3Pex/gX3CWIx/Co8k+ykNdDCLx+TuZj3f3h7eOjgpdKM+Mnix37RYsYU4LHhiYJz3DK5nFCCra81p6g050w==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm-glibc": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-glibc/-/watcher-linux-arm-glibc-2.4.1.tgz", + "integrity": 
"sha512-4rVYDlsMEYfa537BRXxJ5UF4ddNwnr2/1O4MHM5PjI9cvV2qymvhwZSFgXqbS8YoTk5i/JR0L0JDs69BUn45YA==", + "cpu": [ + "arm" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-glibc": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-glibc/-/watcher-linux-arm64-glibc-2.4.1.tgz", + "integrity": "sha512-BJ7mH985OADVLpbrzCLgrJ3TOpiZggE9FMblfO65PlOCdG++xJpKUJ0Aol74ZUIYfb8WsRlUdgrZxKkz3zXWYA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-musl": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-musl/-/watcher-linux-arm64-musl-2.4.1.tgz", + "integrity": "sha512-p4Xb7JGq3MLgAfYhslU2SjoV9G0kI0Xry0kuxeG/41UfpjHGOhv7UoUDAz/jb1u2elbhazy4rRBL8PegPJFBhA==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-glibc": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-glibc/-/watcher-linux-x64-glibc-2.4.1.tgz", + "integrity": "sha512-s9O3fByZ/2pyYDPoLM6zt92yu6P4E39a03zvO0qCHOTjxmt3GHRMLuRZEWhWLASTMSrrnVNWdVI/+pUElJBBBg==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": 
"opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-musl": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-musl/-/watcher-linux-x64-musl-2.4.1.tgz", + "integrity": "sha512-L2nZTYR1myLNST0O632g0Dx9LyMNHrn6TOt76sYxWLdff3cB22/GZX2UPtJnaqQPdCRoszoY5rcOj4oMTtp5fQ==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-arm64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-arm64/-/watcher-win32-arm64-2.4.1.tgz", + "integrity": "sha512-Uq2BPp5GWhrq/lcuItCHoqxjULU1QYEcyjSO5jqqOK8RNFDBQnenMMx4gAl3v8GiWa59E9+uDM7yZ6LxwUIfRg==", + "cpu": [ + "arm64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-ia32": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-ia32/-/watcher-win32-ia32-2.4.1.tgz", + "integrity": "sha512-maNRit5QQV2kgHFSYwftmPBxiuK5u4DXjbXx7q6eKjq5dsLXZ4FJiVvlcw35QXzk0KrUecJmuVFbj4uV9oYrcw==", + "cpu": [ + "ia32" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-x64": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-x64/-/watcher-win32-x64-2.4.1.tgz", + 
"integrity": "sha512-+DvS92F9ezicfswqrvIRM2njcYJbd5mb9CUgtrHCHmvn7pPPa+nMDRu1o1bYYz/l5IB2NVGNJWiH7h1E58IF2A==", + "cpu": [ + "x64" + ], + "dev": true, + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@peculiar/asn1-cms": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-cms/-/asn1-cms-2.6.0.tgz", + "integrity": "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "@peculiar/asn1-x509-attr": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-csr": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-csr/-/asn1-csr-2.6.0.tgz", + "integrity": "sha512-BeWIu5VpTIhfRysfEp73SGbwjjoLL/JWXhJ/9mo4vXnz3tRGm+NGm3KNcRzQ9VMVqwYS2RHlolz21svzRXIHPQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-ecc": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-ecc/-/asn1-ecc-2.6.0.tgz", + "integrity": "sha512-FF3LMGq6SfAOwUG2sKpPXblibn6XnEIKa+SryvUl5Pik+WR9rmRA3OCiwz8R3lVXnYnyRkSZsSLdml8H3UiOcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-pfx": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pfx/-/asn1-pfx-2.6.0.tgz", + "integrity": 
"sha512-rtUvtf+tyKGgokHHmZzeUojRZJYPxoD/jaN1+VAB4kKR7tXrnDCA/RAWXAIhMJJC+7W27IIRGe9djvxKgsldCQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-pkcs8": "^2.6.0", + "@peculiar/asn1-rsa": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-pkcs8": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pkcs8/-/asn1-pkcs8-2.6.0.tgz", + "integrity": "sha512-KyQ4D8G/NrS7Fw3XCJrngxmjwO/3htnA0lL9gDICvEQ+GJ+EPFqldcJQTwPIdvx98Tua+WjkdKHSC0/Km7T+lA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-pkcs9": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pkcs9/-/asn1-pkcs9-2.6.0.tgz", + "integrity": "sha512-b78OQ6OciW0aqZxdzliXGYHASeCvvw5caqidbpQRYW2mBtXIX2WhofNXTEe7NyxTb0P6J62kAAWLwn0HuMF1Fw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-pfx": "^2.6.0", + "@peculiar/asn1-pkcs8": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "@peculiar/asn1-x509-attr": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-rsa": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-rsa/-/asn1-rsa-2.6.0.tgz", + "integrity": "sha512-Nu4C19tsrTsCp9fDrH+sdcOKoVfdfoQQ7S3VqjJU6vedR7tY3RLkQ5oguOIB3zFW33USDUuYZnPEQYySlgha4w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-schema": { + "version": "2.6.0", + "resolved": 
"https://registry.npmjs.org/@peculiar/asn1-schema/-/asn1-schema-2.6.0.tgz", + "integrity": "sha512-xNLYLBFTBKkCzEZIw842BxytQQATQv+lDTCEMZ8C196iJcJJMBUZxrhSTxLaohMyKK8QlzRNTRkUmanucnDSqg==", + "dev": true, + "license": "MIT", + "dependencies": { + "asn1js": "^3.0.6", + "pvtsutils": "^1.3.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-x509": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-x509/-/asn1-x509-2.6.0.tgz", + "integrity": "sha512-uzYbPEpoQiBoTq0/+jZtpM6Gq6zADBx+JNFP3yqRgziWBxQ/Dt/HcuvRfm9zJTPdRcBqPNdaRHTVwpyiq6iNMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "asn1js": "^3.0.6", + "pvtsutils": "^1.3.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-x509-attr": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-x509-attr/-/asn1-x509-attr-2.6.0.tgz", + "integrity": "sha512-MuIAXFX3/dc8gmoZBkwJWxUWOSvG4MMDntXhrOZpJVMkYX+MYc/rUAU2uJOved9iJEoiUx7//3D8oG83a78UJA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/x509": { + "version": "1.14.3", + "resolved": "https://registry.npmjs.org/@peculiar/x509/-/x509-1.14.3.tgz", + "integrity": "sha512-C2Xj8FZ0uHWeCXXqX5B4/gVFQmtSkiuOolzAgutjTfseNOHT3pUjljDZsTSxXFGgio54bCzVFqmEOUrIVk8RDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-csr": "^2.6.0", + "@peculiar/asn1-ecc": "^2.6.0", + "@peculiar/asn1-pkcs9": "^2.6.0", + "@peculiar/asn1-rsa": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "pvtsutils": "^1.3.6", + "reflect-metadata": "^0.2.2", + "tslib": "^2.8.1", + "tsyringe": "^4.10.0" + }, + "engines": { + "node": 
">=20.0.0" + } + }, + "node_modules/@pkgr/core": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.2.7.tgz", + "integrity": "sha512-YLT9Zo3oNPJoBjBc4q8G2mjU4tqIbf5CEOORbUUr48dCD9q3umJ3IPlVqOqDakPfd2HuwccBaqlGhN4Gmr5OWg==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.20.0 || ^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/pkgr" + } + }, + "node_modules/@pmmmwh/react-refresh-webpack-plugin": { + "version": "0.5.15", + "resolved": "https://registry.npmjs.org/@pmmmwh/react-refresh-webpack-plugin/-/react-refresh-webpack-plugin-0.5.15.tgz", + "integrity": "sha512-LFWllMA55pzB9D34w/wXUCf8+c+IYKuJDgxiZ3qMhl64KRMBHYM1I3VdGaD2BV5FNPV2/S2596bppxHbv2ZydQ==", + "dev": true, + "dependencies": { + "ansi-html": "^0.0.9", + "core-js-pure": "^3.23.3", + "error-stack-parser": "^2.0.6", + "html-entities": "^2.1.0", + "loader-utils": "^2.0.4", + "schema-utils": "^4.2.0", + "source-map": "^0.7.3" + }, + "engines": { + "node": ">= 10.13" + }, + "peerDependencies": { + "@types/webpack": "4.x || 5.x", + "react-refresh": ">=0.10.0 <1.0.0", + "sockjs-client": "^1.4.0", + "type-fest": ">=0.17.0 <5.0.0", + "webpack": ">=4.43.0 <6.0.0", + "webpack-dev-server": "3.x || 4.x || 5.x", + "webpack-hot-middleware": "2.x", + "webpack-plugin-serve": "0.x || 1.x" + }, + "peerDependenciesMeta": { + "@types/webpack": { + "optional": true + }, + "sockjs-client": { + "optional": true + }, + "type-fest": { + "optional": true + }, + "webpack-dev-server": { + "optional": true + }, + "webpack-hot-middleware": { + "optional": true + }, + "webpack-plugin-serve": { + "optional": true + } + } + }, + "node_modules/@pmmmwh/react-refresh-webpack-plugin/node_modules/loader-utils": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-2.0.4.tgz", + "integrity": 
"sha512-xXqpXoINfFhgua9xiqD8fPFHgkoq1mmmpE92WlDbm9rNRd/EbRb+Gqf908T2DMfuHjjJlksiK2RbHVOdD/MqSw==", + "dev": true, + "dependencies": { + "big.js": "^5.2.2", + "emojis-list": "^3.0.0", + "json5": "^2.1.2" + }, + "engines": { + "node": ">=8.9.0" + } + }, + "node_modules/@reduxjs/toolkit": { + "version": "1.9.7", + "resolved": "https://registry.npmjs.org/@reduxjs/toolkit/-/toolkit-1.9.7.tgz", + "integrity": "sha512-t7v8ZPxhhKgOKtU+uyJT13lu4vL7az5aFi4IdoDs/eS548edn2M8Ik9h8fxgvMjGoAUVFSt6ZC1P5cWmQ014QQ==", + "dependencies": { + "immer": "^9.0.21", + "redux": "^4.2.1", + "redux-thunk": "^2.4.2", + "reselect": "^4.1.8" + }, + "peerDependencies": { + "react": "^16.9.0 || ^17.0.0 || ^18", + "react-redux": "^7.2.1 || ^8.0.2" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + }, + "react-redux": { + "optional": true + } + } + }, + "node_modules/@reduxjs/toolkit/node_modules/redux": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/redux/-/redux-4.2.1.tgz", + "integrity": "sha512-LAUYz4lc+Do8/g7aeRa8JkyDErK6ekstQaqWQrNRW//MY1TvCEpMtpTWvlQ+FPbWCx+Xixu/6SHt5N0HR+SB4w==", + "dependencies": { + "@babel/runtime": "^7.9.2" + } + }, + "node_modules/@reduxjs/toolkit/node_modules/redux-thunk": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/redux-thunk/-/redux-thunk-2.4.2.tgz", + "integrity": "sha512-+P3TjtnP0k/FEjcBL5FZpoovtvrTNT/UXd4/sluaSyrURlSlhLSzEdfsTBW7WsKB6yPvgd7q/iZPICFjW4o57Q==", + "peerDependencies": { + "redux": "^4" + } + }, + "node_modules/@remix-run/router": { + "version": "1.23.2", + "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.23.2.tgz", + "integrity": "sha512-Ic6m2U/rMjTkhERIa/0ZtXJP17QUi2CbWE7cqx4J58M8aA3QTfW+2UlQ4psvTX9IO1RfNVhK3pcpdjej7L+t2w==", + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@rtk-query/codegen-openapi": { + "version": "2.0.0", + 
"resolved": "https://registry.npmjs.org/@rtk-query/codegen-openapi/-/codegen-openapi-2.0.0.tgz", + "integrity": "sha512-uIOshfqX6bcsMpiwUMKAC+oFEw2fUxICMruhXunB6wq7tHpUg2b+gz+qGjiWAWw1Ly6g6jjvb3N4HRxWy9Yqew==", + "dev": true, + "dependencies": { + "@apidevtools/swagger-parser": "^10.0.2", + "commander": "^6.2.0", + "lodash.camelcase": "^4.3.0", + "oazapfts": "^6.1.0", + "prettier": "^3.2.5", + "semver": "^7.3.5", + "swagger2openapi": "^7.0.4", + "typescript": "^5.5.4" + }, + "bin": { + "rtk-query-codegen-openapi": "lib/bin/cli.mjs" + } + }, + "node_modules/@rtk-query/codegen-openapi/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@sinclair/typebox": { + "version": "0.27.8", + "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", + "integrity": "sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==" + }, + "node_modules/@sinonjs/commons": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz", + "integrity": "sha512-K3mCHKQ9sVh8o1C9cxkwxaOmXoAMlDxC1mYyHrjqOWEcBjYr76t96zL2zlj5dUGZ3HSw240X1qgH3Mjf1yJWpQ==", + "dev": true, + "dependencies": { + "type-detect": "4.0.8" + } + }, + "node_modules/@sinonjs/fake-timers": { + "version": "10.3.0", + "resolved": "https://registry.npmjs.org/@sinonjs/fake-timers/-/fake-timers-10.3.0.tgz", + "integrity": "sha512-V4BG07kuYSUkTCSBHG8G8TNhM+F19jXFWnQtzj+we8DrkpSBCee9Z3Ms8yiGer/dlmhe35/Xdgyo3/0rQKg7YA==", + "dev": true, + "dependencies": { + "@sinonjs/commons": "^3.0.0" + } + }, + 
"node_modules/@svgr/babel-plugin-add-jsx-attribute": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-add-jsx-attribute/-/babel-plugin-add-jsx-attribute-6.5.1.tgz", + "integrity": "sha512-9PYGcXrAxitycIjRmZB+Q0JaN07GZIWaTBIGQzfaZv+qr1n8X1XUEJ5rZ/vx6OVD9RRYlrNnXWExQXcmZeD/BQ==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-remove-jsx-attribute": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-remove-jsx-attribute/-/babel-plugin-remove-jsx-attribute-8.0.0.tgz", + "integrity": "sha512-BcCkm/STipKvbCl6b7QFrMh/vx00vIP63k2eM66MfHJzPr6O2U0jYEViXkHJWqXqQYjdeA9cuCl5KWmlwjDvbA==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-remove-jsx-empty-expression": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-remove-jsx-empty-expression/-/babel-plugin-remove-jsx-empty-expression-8.0.0.tgz", + "integrity": "sha512-5BcGCBfBxB5+XSDSWnhTThfI9jcO5f0Ai2V24gZpG+wXF14BzwxxdDb4g6trdOux0rhibGs385BeFMSmxtS3uA==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-replace-jsx-attribute-value": { + "version": "6.5.1", + "resolved": 
"https://registry.npmjs.org/@svgr/babel-plugin-replace-jsx-attribute-value/-/babel-plugin-replace-jsx-attribute-value-6.5.1.tgz", + "integrity": "sha512-8DPaVVE3fd5JKuIC29dqyMB54sA6mfgki2H2+swh+zNJoynC8pMPzOkidqHOSc6Wj032fhl8Z0TVn1GiPpAiJg==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-svg-dynamic-title": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-svg-dynamic-title/-/babel-plugin-svg-dynamic-title-6.5.1.tgz", + "integrity": "sha512-FwOEi0Il72iAzlkaHrlemVurgSQRDFbk0OC8dSvD5fSBPHltNh7JtLsxmZUhjYBZo2PpcU/RJvvi6Q0l7O7ogw==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-svg-em-dimensions": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-svg-em-dimensions/-/babel-plugin-svg-em-dimensions-6.5.1.tgz", + "integrity": "sha512-gWGsiwjb4tw+ITOJ86ndY/DZZ6cuXMNE/SjcDRg+HLuCmwpcjOktwRF9WgAiycTqJD/QXqL2f8IzE2Rzh7aVXA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-transform-react-native-svg": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-transform-react-native-svg/-/babel-plugin-transform-react-native-svg-6.5.1.tgz", + "integrity": 
"sha512-2jT3nTayyYP7kI6aGutkyfJ7UMGtuguD72OjeGLwVNyfPRBD8zQthlvL+fAbAKk5n9ZNcvFkp/b1lZ7VsYqVJg==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-plugin-transform-svg-component": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-plugin-transform-svg-component/-/babel-plugin-transform-svg-component-6.5.1.tgz", + "integrity": "sha512-a1p6LF5Jt33O3rZoVRBqdxL350oge54iZWHNI6LJB5tQ7EelvD/Mb1mfBiZNAan0dt4i3VArkFRjA4iObuNykQ==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/babel-preset": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/babel-preset/-/babel-preset-6.5.1.tgz", + "integrity": "sha512-6127fvO/FF2oi5EzSQOAjo1LE3OtNVh11R+/8FXa+mHx1ptAaS4cknIjnUA7e6j6fwGGJ17NzaTJFUwOV2zwCw==", + "dev": true, + "dependencies": { + "@svgr/babel-plugin-add-jsx-attribute": "^6.5.1", + "@svgr/babel-plugin-remove-jsx-attribute": "*", + "@svgr/babel-plugin-remove-jsx-empty-expression": "*", + "@svgr/babel-plugin-replace-jsx-attribute-value": "^6.5.1", + "@svgr/babel-plugin-svg-dynamic-title": "^6.5.1", + "@svgr/babel-plugin-svg-em-dimensions": "^6.5.1", + "@svgr/babel-plugin-transform-react-native-svg": "^6.5.1", + "@svgr/babel-plugin-transform-svg-component": "^6.5.1" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@svgr/core": { + "version": "6.5.1", + "resolved": 
"https://registry.npmjs.org/@svgr/core/-/core-6.5.1.tgz", + "integrity": "sha512-/xdLSWxK5QkqG524ONSjvg3V/FkNyCv538OIBdQqPNaAta3AsXj/Bd2FbvR87yMbXO2hFSWiAe/Q6IkVPDw+mw==", + "dev": true, + "dependencies": { + "@babel/core": "^7.19.6", + "@svgr/babel-preset": "^6.5.1", + "@svgr/plugin-jsx": "^6.5.1", + "camelcase": "^6.2.0", + "cosmiconfig": "^7.0.1" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + } + }, + "node_modules/@svgr/hast-util-to-babel-ast": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/hast-util-to-babel-ast/-/hast-util-to-babel-ast-6.5.1.tgz", + "integrity": "sha512-1hnUxxjd83EAxbL4a0JDJoD3Dao3hmjvyvyEV8PzWmLK3B9m9NPlW7GKjFyoWE8nM7HnXzPcmmSyOW8yOddSXw==", + "dev": true, + "dependencies": { + "@babel/types": "^7.20.0", + "entities": "^4.4.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + } + }, + "node_modules/@svgr/plugin-jsx": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/plugin-jsx/-/plugin-jsx-6.5.1.tgz", + "integrity": "sha512-+UdQxI3jgtSjCykNSlEMuy1jSRQlGC7pqBCPvkG/2dATdWo082zHTTK3uhnAju2/6XpE6B5mZ3z4Z8Ns01S8Gw==", + "dev": true, + "dependencies": { + "@babel/core": "^7.19.6", + "@svgr/babel-preset": "^6.5.1", + "@svgr/hast-util-to-babel-ast": "^6.5.1", + "svg-parser": "^2.0.4" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@svgr/core": "^6.0.0" + } + }, + "node_modules/@svgr/plugin-svgo": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/plugin-svgo/-/plugin-svgo-6.5.1.tgz", + "integrity": 
"sha512-omvZKf8ixP9z6GWgwbtmP9qQMPX4ODXi+wzbVZgomNFsUIlHA1sf4fThdwTWSsZGgvGAG6yE+b/F5gWUkcZ/iQ==", + "dev": true, + "dependencies": { + "cosmiconfig": "^7.0.1", + "deepmerge": "^4.2.2", + "svgo": "^2.8.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + }, + "peerDependencies": { + "@svgr/core": "*" + } + }, + "node_modules/@svgr/webpack": { + "version": "6.5.1", + "resolved": "https://registry.npmjs.org/@svgr/webpack/-/webpack-6.5.1.tgz", + "integrity": "sha512-cQ/AsnBkXPkEK8cLbv4Dm7JGXq2XrumKnL1dRpJD9rIO2fTIlJI9a1uCciYG1F2aUsox/hJQyNGbt3soDxSRkA==", + "dev": true, + "dependencies": { + "@babel/core": "^7.19.6", + "@babel/plugin-transform-react-constant-elements": "^7.18.12", + "@babel/preset-env": "^7.19.4", + "@babel/preset-react": "^7.18.6", + "@babel/preset-typescript": "^7.18.6", + "@svgr/core": "^6.5.1", + "@svgr/plugin-jsx": "^6.5.1", + "@svgr/plugin-svgo": "^6.5.1" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/gregberge" + } + }, + "node_modules/@testing-library/dom": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.0.tgz", + "integrity": "sha512-pemlzrSESWbdAloYml3bAJMEfNh1Z7EduzqPKprCH5S341frlpYnUEW0H72dLxa6IsYr+mPno20GiSm+h9dEdQ==", + "dev": true, + "peer": true, + "dependencies": { + "@babel/code-frame": "^7.10.4", + "@babel/runtime": "^7.12.5", + "@types/aria-query": "^5.0.1", + "aria-query": "5.3.0", + "chalk": "^4.1.0", + "dom-accessibility-api": "^0.5.9", + "lz-string": "^1.5.0", + "pretty-format": "^27.0.2" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@testing-library/jest-dom": { + "version": "6.6.3", + "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.6.3.tgz", + 
"integrity": "sha512-IteBhl4XqYNkM54f4ejhLRJiZNqcSCoXUOG2CPK7qbD322KjQozM4kHQOfkG2oln9b9HTYqs+Sae8vBATubxxA==", + "dev": true, + "dependencies": { + "@adobe/css-tools": "^4.4.0", + "aria-query": "^5.0.0", + "chalk": "^3.0.0", + "css.escape": "^1.5.1", + "dom-accessibility-api": "^0.6.3", + "lodash": "^4.17.21", + "redent": "^3.0.0" + }, + "engines": { + "node": ">=14", + "npm": ">=6", + "yarn": ">=1" + } + }, + "node_modules/@testing-library/jest-dom/node_modules/chalk": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-3.0.0.tgz", + "integrity": "sha512-4D3B6Wf41KOYRFdszmDqMCGq5VV/uMAB273JILmO+3jAlh8X4qDtdtgCR3fxtbLEMzSx22QdhnDcJvu2u1fVwg==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@testing-library/jest-dom/node_modules/dom-accessibility-api": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.6.3.tgz", + "integrity": "sha512-7ZgogeTnjuHbo+ct10G9Ffp0mif17idi0IyWNVA/wcwcm7NPOD/WEHVP3n7n3MhXqxoIYm8d6MuZohYWIZ4T3w==", + "dev": true + }, + "node_modules/@testing-library/react": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/@testing-library/react/-/react-16.0.1.tgz", + "integrity": "sha512-dSmwJVtJXmku+iocRhWOUFbrERC76TX2Mnf0ATODz8brzAZrMBbzLwQixlBSanZxR6LddK3eiwpSFZgDET1URg==", + "dev": true, + "dependencies": { + "@babel/runtime": "^7.12.5" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@testing-library/dom": "^10.0.0", + "@types/react": "^18.0.0", + "@types/react-dom": "^18.0.0", + "react": "^18.0.0", + "react-dom": "^18.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@testing-library/user-event": { + "version": "14.5.2", + "resolved": 
"https://registry.npmjs.org/@testing-library/user-event/-/user-event-14.5.2.tgz", + "integrity": "sha512-YAh82Wh4TIrxYLmfGcixwD18oIjyC1pFQC2Y01F2lzV2HTMiYrI0nze0FD0ocB//CKS/7jIUgae+adPqxK5yCQ==", + "dev": true, + "engines": { + "node": ">=12", + "npm": ">=6" + }, + "peerDependencies": { + "@testing-library/dom": ">=7.21.4" + } + }, + "node_modules/@trysound/sax": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/@trysound/sax/-/sax-0.2.0.tgz", + "integrity": "sha512-L7z9BgrNEcYyUYtF+HaEfiS5ebkh9jXqbszz7pC0hRBPaatV0XjSD3+eHrpqFemQfgwiFF0QPIarnIihIDn7OA==", + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/@tsconfig/node10": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", + "integrity": "sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==", + "dev": true + }, + "node_modules/@tsconfig/node12": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", + "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", + "dev": true + }, + "node_modules/@tsconfig/node14": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", + "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", + "dev": true + }, + "node_modules/@tsconfig/node16": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", + "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==", + "dev": true + }, + "node_modules/@types/aria-query": { + "version": "5.0.4", + "resolved": 
"https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", + "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==", + "dev": true, + "peer": true + }, + "node_modules/@types/axios": { + "version": "0.14.4", + "resolved": "https://registry.npmjs.org/@types/axios/-/axios-0.14.4.tgz", + "integrity": "sha512-9JgOaunvQdsQ/qW2OPmE5+hCeUB52lQSolecrFrthct55QekhmXEwT203s20RL+UHtCQc15y3VXpby9E7Kkh/g==", + "deprecated": "This is a stub types definition. axios provides its own type definitions, so you do not need this installed.", + "dev": true, + "dependencies": { + "axios": "*" + } + }, + "node_modules/@types/babel__core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.6.8", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.6.8.tgz", + "integrity": "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw==", + "dev": true, + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + 
"node_modules/@types/babel__traverse": { + "version": "7.20.6", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.20.6.tgz", + "integrity": "sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==", + "dev": true, + "dependencies": { + "@babel/types": "^7.20.7" + } + }, + "node_modules/@types/body-parser": { + "version": "1.19.5", + "resolved": "https://registry.npmjs.org/@types/body-parser/-/body-parser-1.19.5.tgz", + "integrity": "sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==", + "dev": true, + "dependencies": { + "@types/connect": "*", + "@types/node": "*" + } + }, + "node_modules/@types/bonjour": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@types/bonjour/-/bonjour-3.5.13.tgz", + "integrity": "sha512-z9fJ5Im06zvUL548KvYNecEVlA7cVDkGUi6kZusb04mpyEFKCIZJvloCcmpmLaIahDpOQGHaHmG6imtPMmPXGQ==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/cheerio": { + "version": "0.22.35", + "resolved": "https://registry.npmjs.org/@types/cheerio/-/cheerio-0.22.35.tgz", + "integrity": "sha512-yD57BchKRvTV+JD53UZ6PD8KWY5g5rvvMLRnZR3EQBCZXiDT/HR+pKpMzFGlWNhFrXlo7VPZXtKvIEwZkAWOIA==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/connect": { + "version": "3.4.38", + "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz", + "integrity": "sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/connect-history-api-fallback": { + "version": "1.5.4", + "resolved": 
"https://registry.npmjs.org/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.5.4.tgz", + "integrity": "sha512-n6Cr2xS1h4uAulPRdlw6Jl6s1oG8KrVilPN2yUITEs+K48EzMJJ3W1xy8K5eWuFvjp3R74AOIGSmp2UfBJ8HFw==", + "dev": true, + "dependencies": { + "@types/express-serve-static-core": "*", + "@types/node": "*" + } + }, + "node_modules/@types/date-fns": { + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/@types/date-fns/-/date-fns-2.6.3.tgz", + "integrity": "sha512-Ke1lw2Ni1t/wMUoLtKFmSNCLozcTBd6vmMqFP4hRzXn6qzkNt97bPAX0x5Y/c15DP43kKvwW1ycStD5+43jVQA==", + "deprecated": "This is a stub types definition. date-fns provides its own type definitions, so you do not need this installed.", + "dev": true, + "dependencies": { + "date-fns": "*" + } + }, + "node_modules/@types/enzyme": { + "version": "3.10.18", + "resolved": "https://registry.npmjs.org/@types/enzyme/-/enzyme-3.10.18.tgz", + "integrity": "sha512-RaO/TyyHZvXkpzinbMTZmd/S5biU4zxkvDsn22ujC29t9FMSzq8tnn8f2MxQ2P8GVhFRG5jTAL05DXKyTtpEQQ==", + "dev": true, + "dependencies": { + "@types/cheerio": "*", + "@types/react": "^16" + } + }, + "node_modules/@types/enzyme/node_modules/@types/react": { + "version": "16.14.62", + "resolved": "https://registry.npmjs.org/@types/react/-/react-16.14.62.tgz", + "integrity": "sha512-BWf7hqninZav6nerxXj+NeZT/mTpDeG6Lk2zREHAy63CrnXoOGPGtNqTFYFN/sqpSaREDP5otVV88axIXmKfGA==", + "dev": true, + "dependencies": { + "@types/prop-types": "*", + "@types/scheduler": "^0.16", + "csstype": "^3.0.2" + } + }, + "node_modules/@types/eslint": { + "version": "9.6.1", + "resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-9.6.1.tgz", + "integrity": "sha512-FXx2pKgId/WyYo2jXw63kk7/+TY7u7AziEJxJAnSFzHlqTAS3Ync6SvgYAN/k4/PQpnnVuzoMuVnByKK2qp0ag==", + "dependencies": { + "@types/estree": "*", + "@types/json-schema": "*" + } + }, + 
"node_modules/@types/eslint-scope": { + "version": "3.7.7", + "resolved": "https://registry.npmjs.org/@types/eslint-scope/-/eslint-scope-3.7.7.tgz", + "integrity": "sha512-MzMFlSLBqNF2gcHWO0G1vP/YQyfvrxZ0bF+u7mzUdZ1/xK4A4sru+nraZz5i3iEIk1l1uyicaDVTB4QbbEkAYg==", + "dependencies": { + "@types/eslint": "*", + "@types/estree": "*" + } + }, + "node_modules/@types/estree": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", + "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==" + }, + "node_modules/@types/express": { + "version": "4.17.25", + "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.25.tgz", + "integrity": "sha512-dVd04UKsfpINUnK0yBoYHDF3xu7xVH4BuDotC/xGuycx4CgbP48X/KF/586bcObxT0HENHXEU8Nqtu6NR+eKhw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/body-parser": "*", + "@types/express-serve-static-core": "^4.17.33", + "@types/qs": "*", + "@types/serve-static": "^1" + } + }, + "node_modules/@types/express-serve-static-core": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-5.0.1.tgz", + "integrity": "sha512-CRICJIl0N5cXDONAdlTv5ShATZ4HEwk6kDDIW2/w9qOWKg+NU/5F8wYRWCrONad0/UKkloNSmmyN/wX4rtpbVA==", + "dev": true, + "dependencies": { + "@types/node": "*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, + "node_modules/@types/express/node_modules/@types/express-serve-static-core": { + "version": "4.19.6", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-4.19.6.tgz", + "integrity": "sha512-N4LZ2xG7DatVqhCZzOGb1Yi5lMbXSZcmdLDe9EzSndPV2HpWYWzRbaerl2n27irrm94EPpprqa8KpskPT085+A==", + "dev": true, + "dependencies": { + "@types/node": 
"*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, + "node_modules/@types/graceful-fs": { + "version": "4.1.9", + "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz", + "integrity": "sha512-olP3sd1qOEe5dXTSaFvQG+02VdRXcdytWLAZsAq1PecU8uqQAhkrnbli7DagjtXKW/Bl7YJbUsa8MPcuc8LHEQ==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/history": { + "version": "4.7.11", + "resolved": "https://registry.npmjs.org/@types/history/-/history-4.7.11.tgz", + "integrity": "sha512-qjDJRrmvBMiTx+jyLxvLfJU7UznFuokDv4f3WRuriHKERccVpFU+8XMQUAbDzoiJCsmexxRExQeMwwCdamSKDA==", + "dev": true + }, + "node_modules/@types/hoist-non-react-statics": { + "version": "3.3.5", + "resolved": "https://registry.npmjs.org/@types/hoist-non-react-statics/-/hoist-non-react-statics-3.3.5.tgz", + "integrity": "sha512-SbcrWzkKBw2cdwRTwQAswfpB9g9LJWfjtUeW/jvNwbhC8cpmmNYVePa+ncbUe0rGTQ7G3Ff6mYUN2VMfLVr+Sg==", + "dependencies": { + "@types/react": "*", + "hoist-non-react-statics": "^3.3.0" + } + }, + "node_modules/@types/html-minifier-terser": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/@types/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz", + "integrity": "sha512-oh/6byDPnL1zeNXFrDXFLyZjkr1MsBG667IM792caf1L2UPOOMf65NFzjUH/ltyfwjAGfs1rsX1eftK0jC/KIg==", + "dev": true + }, + "node_modules/@types/http-errors": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz", + "integrity": "sha512-D0CFMMtydbJAegzOyHjtiKPLlvnm3iTZyZRSZoLq2mRhDdmLfIWOCYPfQJ4cu2erKghU++QvjcUjp/5h7hESpA==", + "dev": true + }, + "node_modules/@types/http-proxy": { + "version": "1.17.15", + "resolved": "https://registry.npmjs.org/@types/http-proxy/-/http-proxy-1.17.15.tgz", + "integrity": 
"sha512-25g5atgiVNTIv0LBDTg1H74Hvayx0ajtJPLLcYE3whFv75J0pWNtOBzaXJQgDTmrX1bx5U9YC2w/n65BN1HwRQ==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/istanbul-lib-coverage": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.6.tgz", + "integrity": "sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w==" + }, + "node_modules/@types/istanbul-lib-report": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/istanbul-lib-report/-/istanbul-lib-report-3.0.3.tgz", + "integrity": "sha512-NQn7AHQnk/RSLOxrBbGyJM/aVQ+pjj5HCgasFxc0K/KhoATfQ/47AyUl15I2yBUpihjmas+a+VJBOqecrFH+uA==", + "dependencies": { + "@types/istanbul-lib-coverage": "*" + } + }, + "node_modules/@types/istanbul-reports": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/istanbul-reports/-/istanbul-reports-3.0.4.tgz", + "integrity": "sha512-pk2B1NWalF9toCRu6gjBzR69syFjP4Od8WRAX+0mmf9lAjCRicLOWc+ZrxZHx/0XRjotgkF9t6iaMJ+aXcOdZQ==", + "dependencies": { + "@types/istanbul-lib-report": "*" + } + }, + "node_modules/@types/jest": { + "version": "29.5.14", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.14.tgz", + "integrity": "sha512-ZN+4sdnLUbo8EVvVc2ao0GFW6oVrQRPn4K2lglySj7APvSrgzxHiNNK99us4WDMi57xxA2yggblIAMNhXOotLQ==", + "dev": true, + "dependencies": { + "expect": "^29.0.0", + "pretty-format": "^29.0.0" + } + }, + "node_modules/@types/jest/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@types/jest/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/@types/js-yaml": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz", + "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/json-schema": { + "version": "7.0.15", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", + "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==" + }, + "node_modules/@types/lodash": { + "version": "4.17.13", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.13.tgz", + "integrity": "sha512-lfx+dftrEZcdBPczf9d0Qv0x+j/rfNCMuC6OcfXmO8gkfeNAY88PgKUbvG56whcN23gc27yenwF6oJZXGFpYxg==" + }, + "node_modules/@types/mime": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz", + "integrity": "sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==", + "dev": true + }, + "node_modules/@types/node": { + "version": "22.10.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.10.1.tgz", + "integrity": 
"sha512-qKgsUwfHZV2WCWLAnVP1JqnpE6Im6h3Y0+fYgMTasNQ7V++CBX5OT1as0g0f+OyubbFqhf6XVNIsmN4IIhEgGQ==", + "dependencies": { + "undici-types": "~6.20.0" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz", + "integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/parse-json": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.2.tgz", + "integrity": "sha512-dISoDXWWQwUquiKsyZ4Ng+HX2KsPL7LyHKHQwgGFEA3IaKac4Obd+h2a/a6waisAoepJlBcx9paWqjA8/HVjCw==", + "dev": true + }, + "node_modules/@types/prop-types": { + "version": "15.7.13", + "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.13.tgz", + "integrity": "sha512-hCZTSvwbzWGvhqxp/RqVqwU999pBf2vp7hzIjiYOsl8wqOmUxkQ6ddw1cV3l8811+kdUFus/q4d1Y3E3SyEifA==" + }, + "node_modules/@types/qs": { + "version": "6.9.16", + "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.16.tgz", + "integrity": "sha512-7i+zxXdPD0T4cKDuxCUXJ4wHcsJLwENa6Z3dCu8cfCK743OGy5Nu1RmAGqDPsoTDINVEcdXKRvR/zre+P2Ku1A==", + "dev": true + }, + "node_modules/@types/range-parser": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/@types/range-parser/-/range-parser-1.2.7.tgz", + "integrity": "sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ==", + "dev": true + }, + "node_modules/@types/react": { + "version": "18.3.12", + "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.12.tgz", + "integrity": "sha512-D2wOSq/d6Agt28q7rSI3jhU7G6aiuzljDGZ2hTZHIkrTLUI+AF3WMeKkEZ9nN2fkBAlcktT6vcZjDFiIhMYEQw==", + 
"dependencies": { + "@types/prop-types": "*", + "csstype": "^3.0.2" + } + }, + "node_modules/@types/react-dom": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.1.tgz", + "integrity": "sha512-qW1Mfv8taImTthu4KoXgDfLuk4bydU6Q/TkADnDWWHwi4NX4BR+LWfTp2sVmTqRrsHvyDDTelgelxJ+SsejKKQ==", + "devOptional": true, + "dependencies": { + "@types/react": "*" + } + }, + "node_modules/@types/react-helmet": { + "version": "6.1.11", + "resolved": "https://registry.npmjs.org/@types/react-helmet/-/react-helmet-6.1.11.tgz", + "integrity": "sha512-0QcdGLddTERotCXo3VFlUSWO3ztraw8nZ6e3zJSgG7apwV5xt+pJUS8ewPBqT4NYB1optGLprNQzFleIY84u/g==", + "dev": true, + "dependencies": { + "@types/react": "*" + } + }, + "node_modules/@types/react-redux": { + "version": "7.1.34", + "resolved": "https://registry.npmjs.org/@types/react-redux/-/react-redux-7.1.34.tgz", + "integrity": "sha512-GdFaVjEbYv4Fthm2ZLvj1VSCedV7TqE5y1kNwnjSdBOTXuRSgowux6J8TAct15T3CKBr63UMk+2CO7ilRhyrAQ==", + "dev": true, + "dependencies": { + "@types/hoist-non-react-statics": "^3.3.0", + "@types/react": "*", + "hoist-non-react-statics": "^3.3.0", + "redux": "^4.0.0" + } + }, + "node_modules/@types/react-redux/node_modules/redux": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/redux/-/redux-4.2.1.tgz", + "integrity": "sha512-LAUYz4lc+Do8/g7aeRa8JkyDErK6ekstQaqWQrNRW//MY1TvCEpMtpTWvlQ+FPbWCx+Xixu/6SHt5N0HR+SB4w==", + "dev": true, + "dependencies": { + "@babel/runtime": "^7.9.2" + } + }, + "node_modules/@types/react-router": { + "version": "5.1.20", + "resolved": "https://registry.npmjs.org/@types/react-router/-/react-router-5.1.20.tgz", + "integrity": "sha512-jGjmu/ZqS7FjSH6owMcD5qpq19+1RS9DeVRqfl1FeBMxTDQAGwlMWOcs52NDoXaNKyG3d1cYQFMs9rCrb88o9Q==", + "dev": true, + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": 
"*" + } + }, + "node_modules/@types/react-router-dom": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/@types/react-router-dom/-/react-router-dom-5.3.3.tgz", + "integrity": "sha512-kpqnYK4wcdm5UaWI3fLcELopqLrHgLqNsdpHauzlQktfkHL3npOSwtj1Uz9oKBAzs7lFtVkV8j83voAz2D8fhw==", + "dev": true, + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "@types/react-router": "*" + } + }, + "node_modules/@types/react-test-renderer": { + "version": "18.3.0", + "resolved": "https://registry.npmjs.org/@types/react-test-renderer/-/react-test-renderer-18.3.0.tgz", + "integrity": "sha512-HW4MuEYxfDbOHQsVlY/XtOvNHftCVEPhJF2pQXXwcUiUF+Oyb0usgp48HSgpK5rt8m9KZb22yqOeZm+rrVG8gw==", + "dev": true, + "dependencies": { + "@types/react": "*" + } + }, + "node_modules/@types/retry": { + "version": "0.12.2", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.2.tgz", + "integrity": "sha512-XISRgDJ2Tc5q4TRqvgJtzsRkFYNJzZrhTdtMoGVBttwzzQJkPnS3WWTFc7kuDRoPtPakl+T+OfdEUjYJj7Jbow==", + "dev": true + }, + "node_modules/@types/scheduler": { + "version": "0.16.8", + "resolved": "https://registry.npmjs.org/@types/scheduler/-/scheduler-0.16.8.tgz", + "integrity": "sha512-WZLiwShhwLRmeV6zH+GkbOFT6Z6VklCItrDioxUnv+u4Ll+8vKeFySoFyK/0ctcRpOmwAicELfmys1sDc/Rw+A==", + "dev": true + }, + "node_modules/@types/send": { + "version": "0.17.4", + "resolved": "https://registry.npmjs.org/@types/send/-/send-0.17.4.tgz", + "integrity": "sha512-x2EM6TJOybec7c52BX0ZspPodMsQUd5L6PRwOunVyVUhXiBSKf3AezDL8Dgvgt5o0UfKNfuA0eMLr2wLT4AiBA==", + "dev": true, + "dependencies": { + "@types/mime": "^1", + "@types/node": "*" + } + }, + "node_modules/@types/serve-index": { + "version": "1.9.4", + "resolved": "https://registry.npmjs.org/@types/serve-index/-/serve-index-1.9.4.tgz", + "integrity": 
"sha512-qLpGZ/c2fhSs5gnYsQxtDEq3Oy8SXPClIXkW5ghvAvsNuVSA8k+gCONcUCS/UjLEYvYps+e8uBtfgXgvhwfNug==", + "dev": true, + "dependencies": { + "@types/express": "*" + } + }, + "node_modules/@types/serve-static": { + "version": "1.15.7", + "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.7.tgz", + "integrity": "sha512-W8Ym+h8nhuRwaKPaDw34QUkwsGi6Rc4yYqvKFo5rm2FUEhCFbzVWrxXUxuKK8TASjWsysJY0nsmNCGhCOIsrOw==", + "dev": true, + "dependencies": { + "@types/http-errors": "*", + "@types/node": "*", + "@types/send": "*" + } + }, + "node_modules/@types/sockjs": { + "version": "0.3.36", + "resolved": "https://registry.npmjs.org/@types/sockjs/-/sockjs-0.3.36.tgz", + "integrity": "sha512-MK9V6NzAS1+Ud7JV9lJLFqW85VbC9dq3LmwZCuBe4wBDgKC0Kj/jd8Xl+nSviU+Qc3+m7umHHyHg//2KSa0a0Q==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/stack-utils": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", + "integrity": "sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==", + "dev": true + }, + "node_modules/@types/stylis": { + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/@types/stylis/-/stylis-4.2.5.tgz", + "integrity": "sha512-1Xve+NMN7FWjY14vLoY5tL3BVEQ/n42YLwaqJIPYhotZ9uBHt87VceMwWQpzmdEt2TNXIorIFG+YeCUUW7RInw==", + "dev": true, + "peer": true + }, + "node_modules/@types/use-sync-external-store": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.3.tgz", + "integrity": "sha512-EwmlvuaxPNej9+T4v5AuBPJa2x2UOJVdjCtDHgcDqitUeOtjnJKJ+apYjVcAoBEMjKW1VVFGZLUb5+qqa09XFA==" + }, + "node_modules/@types/ws": { + "version": "8.5.12", + "resolved": 
"https://registry.npmjs.org/@types/ws/-/ws-8.5.12.tgz", + "integrity": "sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/yargs": { + "version": "17.0.33", + "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.33.tgz", + "integrity": "sha512-WpxBCKWPLr4xSsHgz511rFJAM+wS28w2zEO1QDNY5zM/S8ok70NNfztH0xwhqKyaK0OHCbN98LDAZuy1ctxDkA==", + "dependencies": { + "@types/yargs-parser": "*" + } + }, + "node_modules/@types/yargs-parser": { + "version": "21.0.3", + "resolved": "https://registry.npmjs.org/@types/yargs-parser/-/yargs-parser-21.0.3.tgz", + "integrity": "sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==" + }, + "node_modules/@types/yup": { + "version": "0.29.14", + "resolved": "https://registry.npmjs.org/@types/yup/-/yup-0.29.14.tgz", + "integrity": "sha512-Ynb/CjHhE/Xp/4bhHmQC4U1Ox+I2OpfRYF3dnNgQqn1cHa6LK3H1wJMNPT02tSVZA6FYuXE2ITORfbnb6zBCSA==", + "license": "MIT" + }, + "node_modules/@typescript-eslint/eslint-plugin": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.33.1.tgz", + "integrity": "sha512-TDCXj+YxLgtvxvFlAvpoRv9MAncDLBV2oT9Bd7YBGC/b/sEURoOYuIwLI99rjWOfY3QtDzO+mk0n4AmdFExW8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@eslint-community/regexpp": "^4.10.0", + "@typescript-eslint/scope-manager": "8.33.1", + "@typescript-eslint/type-utils": "8.33.1", + "@typescript-eslint/utils": "8.33.1", + "@typescript-eslint/visitor-keys": "8.33.1", + "graphemer": "^1.4.0", + "ignore": "^7.0.0", + "natural-compare": "^1.4.0", + "ts-api-utils": "^2.1.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": 
"opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "@typescript-eslint/parser": "^8.33.1", + "eslint": "^8.57.0 || ^9.0.0", + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/eslint-plugin/node_modules/ignore": { + "version": "7.0.5", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz", + "integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, + "node_modules/@typescript-eslint/parser": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.33.1.tgz", + "integrity": "sha512-qwxv6dq682yVvgKKp2qWwLgRbscDAYktPptK4JPojCwwi3R9cwrvIxS4lvBpzmcqzR4bdn54Z0IG1uHFskW4dA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/scope-manager": "8.33.1", + "@typescript-eslint/types": "8.33.1", + "@typescript-eslint/typescript-estree": "8.33.1", + "@typescript-eslint/visitor-keys": "8.33.1", + "debug": "^4.3.4" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0", + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/project-service": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.33.1.tgz", + "integrity": "sha512-DZR0efeNklDIHHGRpMpR5gJITQpu6tLr9lDJnKdONTC7vvzOlLAG/wcfxcdxEWrbiZApcoBCzXqU/Z458Za5Iw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/tsconfig-utils": "^8.33.1", + "@typescript-eslint/types": "^8.33.1", + "debug": "^4.3.4" + }, + 
"engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/scope-manager": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.33.1.tgz", + "integrity": "sha512-dM4UBtgmzHR9bS0Rv09JST0RcHYearoEoo3pG5B6GoTR9XcyeqX87FEhPo+5kTvVfKCvfHaHrcgeJQc6mrDKrA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "8.33.1", + "@typescript-eslint/visitor-keys": "8.33.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/tsconfig-utils": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.33.1.tgz", + "integrity": "sha512-STAQsGYbHCF0/e+ShUQ4EatXQ7ceh3fBCXkNU7/MZVKulrlq1usH7t2FhxvCpuCi5O5oi1vmVaAjrGeL71OK1g==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/type-utils": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.33.1.tgz", + "integrity": "sha512-1cG37d9xOkhlykom55WVwG2QRNC7YXlxMaMzqw2uPeJixBFfKWZgaP/hjAObqMN/u3fr5BrTwTnc31/L9jQ2ww==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/typescript-estree": "8.33.1", + "@typescript-eslint/utils": 
"8.33.1", + "debug": "^4.3.4", + "ts-api-utils": "^2.1.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0", + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/types": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.33.1.tgz", + "integrity": "sha512-xid1WfizGhy/TKMTwhtVOgalHwPtV8T32MS9MaH50Cwvz6x6YqRIPdD2WvW0XaqOzTV9p5xdLY0h/ZusU5Lokg==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/typescript-estree": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.33.1.tgz", + "integrity": "sha512-+s9LYcT8LWjdYWu7IWs7FvUxpQ/DGkdjZeE/GGulHvv8rvYwQvVaUZ6DE+j5x/prADUgSbbCWZ2nPI3usuVeOA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/project-service": "8.33.1", + "@typescript-eslint/tsconfig-utils": "8.33.1", + "@typescript-eslint/types": "8.33.1", + "@typescript-eslint/visitor-keys": "8.33.1", + "debug": "^4.3.4", + "fast-glob": "^3.3.2", + "is-glob": "^4.0.3", + "minimatch": "^9.0.4", + "semver": "^7.6.0", + "ts-api-utils": "^2.1.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { + "version": "2.0.2", + "resolved": 
"https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", + "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/semver": { + "version": "7.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", + "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@typescript-eslint/utils": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.33.1.tgz", + "integrity": "sha512-52HaBiEQUaRYqAXpfzWSR2U3gxk92Kw006+xZpElaPMg3C4PgM+A5LqwoQI1f9E5aZ/qlxAZxzm42WX+vn92SQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@eslint-community/eslint-utils": "^4.7.0", + "@typescript-eslint/scope-manager": "8.33.1", + "@typescript-eslint/types": "8.33.1", + "@typescript-eslint/typescript-estree": "8.33.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0", + "typescript": ">=4.8.4 <5.9.0" + } + }, + "node_modules/@typescript-eslint/visitor-keys": { + "version": "8.33.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.33.1.tgz", + "integrity": "sha512-3i8NrFcZeeDHJ+7ZUuDkGT+UHq+XoFGsymNK2jZCOHcfEzRQ0BdpRtdpSx/Iyf3MHLWIcLS0COuOPibKQboIiQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "8.33.1", + "eslint-visitor-keys": "^4.2.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/visitor-keys/node_modules/eslint-visitor-keys": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.0.tgz", + "integrity": "sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/@webassemblyjs/ast": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz", + "integrity": "sha512-EKfMUOPRRUTy5UII4qJDGPpqfwjOmZ5jeGFwid9mnoqIFK+e0vqoi1qH56JpmZSzEL53jKnNzScdmftJyG5xWg==", + "dependencies": { + "@webassemblyjs/helper-numbers": "1.11.6", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6" + } + }, + "node_modules/@webassemblyjs/floating-point-hex-parser": { + "version": "1.11.6", + "resolved": 
"https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.6.tgz", + "integrity": "sha512-ejAj9hfRJ2XMsNHk/v6Fu2dGS+i4UaXBXGemOfQ/JfQ6mdQg/WXtwleQRLLS4OvfDhv8rYnVwH27YJLMyYsxhw==" + }, + "node_modules/@webassemblyjs/helper-api-error": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.6.tgz", + "integrity": "sha512-o0YkoP4pVu4rN8aTJgAyj9hC2Sv5UlkzCHhxqWj8butaLvnpdc2jOwh4ewE6CX0txSfLn/UYaV/pheS2Txg//Q==" + }, + "node_modules/@webassemblyjs/helper-buffer": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.12.1.tgz", + "integrity": "sha512-nzJwQw99DNDKr9BVCOZcLuJJUlqkJh+kVzVl6Fmq/tI5ZtEyWT1KZMyOXltXLZJmDtvLCDgwsyrkohEtopTXCw==" + }, + "node_modules/@webassemblyjs/helper-numbers": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.6.tgz", + "integrity": "sha512-vUIhZ8LZoIWHBohiEObxVm6hwP034jwmc9kuq5GdHZH0wiLVLIPcMCdpJzG4C11cHoQ25TFIQj9kaVADVX7N3g==", + "dependencies": { + "@webassemblyjs/floating-point-hex-parser": "1.11.6", + "@webassemblyjs/helper-api-error": "1.11.6", + "@xtuc/long": "4.2.2" + } + }, + "node_modules/@webassemblyjs/helper-wasm-bytecode": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.6.tgz", + "integrity": "sha512-sFFHKwcmBprO9e7Icf0+gddyWYDViL8bpPjJJl0WHxCdETktXdmtWLGVzoHbqUcY4Be1LkNfwTmXOJUFZYSJdA==" + }, + "node_modules/@webassemblyjs/helper-wasm-section": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.12.1.tgz", + "integrity": 
"sha512-Jif4vfB6FJlUlSbgEMHUyk1j234GTNG9dBJ4XJdOySoj518Xj0oGsNi59cUQF4RRMS9ouBUxDDdyBVfPTypa5g==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-buffer": "1.12.1", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6", + "@webassemblyjs/wasm-gen": "1.12.1" + } + }, + "node_modules/@webassemblyjs/ieee754": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.11.6.tgz", + "integrity": "sha512-LM4p2csPNvbij6U1f19v6WR56QZ8JcHg3QIJTlSwzFcmx6WSORicYj6I63f9yU1kEUtrpG+kjkiIAkevHpDXrg==", + "dependencies": { + "@xtuc/ieee754": "^1.2.0" + } + }, + "node_modules/@webassemblyjs/leb128": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.11.6.tgz", + "integrity": "sha512-m7a0FhE67DQXgouf1tbN5XQcdWoNgaAuoULHIfGFIEVKA6tu/edls6XnIlkmS6FrXAquJRPni3ZZKjw6FSPjPQ==", + "dependencies": { + "@xtuc/long": "4.2.2" + } + }, + "node_modules/@webassemblyjs/utf8": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.11.6.tgz", + "integrity": "sha512-vtXf2wTQ3+up9Zsg8sa2yWiQpzSsMyXj0qViVP6xKGCUT8p8YJ6HqI7l5eCnWx1T/FYdsv07HQs2wTFbbof/RA==" + }, + "node_modules/@webassemblyjs/wasm-edit": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.12.1.tgz", + "integrity": "sha512-1DuwbVvADvS5mGnXbE+c9NfA8QRcZ6iKquqjjmR10k6o+zzsRVesil54DKexiowcFCPdr/Q0qaMgB01+SQ1u6g==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-buffer": "1.12.1", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6", + "@webassemblyjs/helper-wasm-section": "1.12.1", + "@webassemblyjs/wasm-gen": "1.12.1", + "@webassemblyjs/wasm-opt": "1.12.1", + "@webassemblyjs/wasm-parser": "1.12.1", + "@webassemblyjs/wast-printer": "1.12.1" + } + }, + "node_modules/@webassemblyjs/wasm-gen": { + 
"version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.12.1.tgz", + "integrity": "sha512-TDq4Ojh9fcohAw6OIMXqiIcTq5KUXTGRkVxbSo1hQnSy6lAM5GSdfwWeSxpAo0YzgsgF182E/U0mDNhuA0tW7w==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6", + "@webassemblyjs/ieee754": "1.11.6", + "@webassemblyjs/leb128": "1.11.6", + "@webassemblyjs/utf8": "1.11.6" + } + }, + "node_modules/@webassemblyjs/wasm-opt": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.12.1.tgz", + "integrity": "sha512-Jg99j/2gG2iaz3hijw857AVYekZe2SAskcqlWIZXjji5WStnOpVoat3gQfT/Q5tb2djnCjBtMocY/Su1GfxPBg==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-buffer": "1.12.1", + "@webassemblyjs/wasm-gen": "1.12.1", + "@webassemblyjs/wasm-parser": "1.12.1" + } + }, + "node_modules/@webassemblyjs/wasm-parser": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.12.1.tgz", + "integrity": "sha512-xikIi7c2FHXysxXe3COrVUPSheuBtpcfhbpFj4gmu7KRLYOzANztwUU0IbsqvMqzuNK2+glRGWCEqZo1WCLyAQ==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-api-error": "1.11.6", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6", + "@webassemblyjs/ieee754": "1.11.6", + "@webassemblyjs/leb128": "1.11.6", + "@webassemblyjs/utf8": "1.11.6" + } + }, + "node_modules/@webassemblyjs/wast-printer": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.12.1.tgz", + "integrity": "sha512-+X4WAlOisVWQMikjbcvY2e0rwPsKQ9F688lksZhBcPycBBuii3O7m8FACbDMWDojpAqvjIncrG8J0XHKyQfVeA==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@xtuc/long": "4.2.2" + } + }, + "node_modules/@webpack-cli/configtest": { + "version": "2.1.1", 
+ "resolved": "https://registry.npmjs.org/@webpack-cli/configtest/-/configtest-2.1.1.tgz", + "integrity": "sha512-wy0mglZpDSiSS0XHrVR+BAdId2+yxPSoJW8fsna3ZpYSlufjvxnP4YbKTCBZnNIcGN4r6ZPXV55X4mYExOfLmw==", + "dev": true, + "engines": { + "node": ">=14.15.0" + }, + "peerDependencies": { + "webpack": "5.x.x", + "webpack-cli": "5.x.x" + } + }, + "node_modules/@webpack-cli/info": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/@webpack-cli/info/-/info-2.0.2.tgz", + "integrity": "sha512-zLHQdI/Qs1UyT5UBdWNqsARasIA+AaF8t+4u2aS2nEpBQh2mWIVb8qAklq0eUENnC5mOItrIB4LiS9xMtph18A==", + "dev": true, + "engines": { + "node": ">=14.15.0" + }, + "peerDependencies": { + "webpack": "5.x.x", + "webpack-cli": "5.x.x" + } + }, + "node_modules/@webpack-cli/serve": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@webpack-cli/serve/-/serve-2.0.5.tgz", + "integrity": "sha512-lqaoKnRYBdo1UgDX8uF24AfGMifWK19TxPmM5FHc2vAGxrJ/qtyUyFBWoY1tISZdelsQ5fBcOusifo5o5wSJxQ==", + "dev": true, + "engines": { + "node": ">=14.15.0" + }, + "peerDependencies": { + "webpack": "5.x.x", + "webpack-cli": "5.x.x" + }, + "peerDependenciesMeta": { + "webpack-dev-server": { + "optional": true + } + } + }, + "node_modules/@xtuc/ieee754": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@xtuc/ieee754/-/ieee754-1.2.0.tgz", + "integrity": "sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==" + }, + "node_modules/@xtuc/long": { + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/@xtuc/long/-/long-4.2.2.tgz", + "integrity": "sha512-NuHqBY1PB/D8xU6s/thBgOAiAP7HOYDQ32+BFZILJ8ivkUkAHQnWfn6WhL79Owj1qmUnoN/YPhktdIoucipkAQ==" + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": 
"https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/accepts": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", + "integrity": "sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==", + "dev": true, + "dependencies": { + "mime-types": "~2.1.34", + "negotiator": "0.6.3" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ace-builds": { + "version": "1.36.3", + "resolved": "https://registry.npmjs.org/ace-builds/-/ace-builds-1.36.3.tgz", + "integrity": "sha512-YcdwV2IIaJSfjkWAR1NEYN5IxBiXefTgwXsJ//UlaFrjXDX5hQpvPFvEePHz2ZBUfvO54RjHeRUQGX8MS5HaMQ==" + }, + "node_modules/acorn": { + "version": "8.15.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", + "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "license": "MIT", + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/acorn-jsx": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", + "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" + } + }, + "node_modules/acorn-walk": { + "version": "8.3.4", + "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.4.tgz", + "integrity": 
"sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==", + "dev": true, + "dependencies": { + "acorn": "^8.11.0" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/address": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/address/-/address-1.2.2.tgz", + "integrity": "sha512-4B/qKCfeE/ODUaAUpSwfzazo5x29WD4r3vXiWsB7I2mSDAihwEqKO+g8GELZUQSSAo5e1XTYh3ZVfLyxBc12nA==", + "dev": true, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/adjust-sourcemap-loader": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/adjust-sourcemap-loader/-/adjust-sourcemap-loader-4.0.0.tgz", + "integrity": "sha512-OXwN5b9pCUXNQHJpwwD2qP40byEmSgzj8B4ydSN0uMNYWiFmJ6x6KwUllMmfk8Rwu/HJDFR7U8ubsWBoN0Xp0A==", + "dev": true, + "dependencies": { + "loader-utils": "^2.0.0", + "regex-parser": "^2.2.11" + }, + "engines": { + "node": ">=8.9" + } + }, + "node_modules/adjust-sourcemap-loader/node_modules/loader-utils": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-2.0.4.tgz", + "integrity": "sha512-xXqpXoINfFhgua9xiqD8fPFHgkoq1mmmpE92WlDbm9rNRd/EbRb+Gqf908T2DMfuHjjJlksiK2RbHVOdD/MqSw==", + "dev": true, + "dependencies": { + "big.js": "^5.2.2", + "emojis-list": "^3.0.0", + "json5": "^2.1.2" + }, + "engines": { + "node": ">=8.9.0" + } + }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/ajv": { + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz", + "integrity": 
"sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/ajv-draft-04": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/ajv-draft-04/-/ajv-draft-04-1.0.0.tgz", + "integrity": "sha512-mv00Te6nmYbRp5DCwclxtt7yV/joXJPGS7nM+97GdxvuttCOfgI3K4U25zboyeX0O+myI8ERluxQe5wljMmVIw==", + "dev": true, + "peerDependencies": { + "ajv": "^8.5.0" + }, + "peerDependenciesMeta": { + "ajv": { + "optional": true + } + } + }, + "node_modules/ajv-formats": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz", + "integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==", + "dependencies": { + "ajv": "^8.0.0" + }, + "peerDependencies": { + "ajv": "^8.0.0" + }, + "peerDependenciesMeta": { + "ajv": { + "optional": true + } + } + }, + "node_modules/ajv-keywords": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz", + "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==", + "dependencies": { + "fast-deep-equal": "^3.1.3" + }, + "peerDependencies": { + "ajv": "^8.8.2" + } + }, + "node_modules/ansi-escapes": { + "version": "4.3.2", + "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", + "integrity": "sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==", + "dev": true, + "dependencies": { + "type-fest": "^0.21.3" + }, + "engines": { + "node": ">=8" + }, + "funding": { + 
"url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ansi-html": { + "version": "0.0.9", + "resolved": "https://registry.npmjs.org/ansi-html/-/ansi-html-0.0.9.tgz", + "integrity": "sha512-ozbS3LuenHVxNRh/wdnN16QapUHzauqSomAl1jwwJRRsGwFwtj644lIhxfWu0Fy0acCij2+AEgHvjscq3dlVXg==", + "dev": true, + "engines": [ + "node >= 0.8.0" + ], + "bin": { + "ansi-html": "bin/ansi-html" + } + }, + "node_modules/ansi-html-community": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/ansi-html-community/-/ansi-html-community-0.0.8.tgz", + "integrity": "sha512-1APHAyr3+PCamwNw3bXCPp4HFLONZt/yIH0sZp0/469KWNTEy+qN5jQ3GVX6DMZ1UXAi34yVwtTeaG/HpBuuzw==", + "dev": true, + "engines": [ + "node >= 0.8.0" + ], + "bin": { + "ansi-html": "bin/ansi-html" + } + }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "dev": true, + "dependencies": { + "normalize-path": "^3.0.0", + 
"picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/arg": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", + "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", + "dev": true + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, + "node_modules/aria-query": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.0.tgz", + "integrity": "sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==", + "dev": true, + "dependencies": { + "dequal": "^2.0.3" + } + }, + "node_modules/array-buffer-byte-length": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz", + "integrity": "sha512-LHE+8BuR7RYGDKvnrmcuSq3tDcKv9OFEXQt/HpbZhY7V6h0zlUXutnAD82GiFx9rdieCMjkvtcsPqBwgUl1Iiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "is-array-buffer": "^3.0.5" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==", + "dev": true, + "license": "MIT" + }, + "node_modules/array-includes": { + "version": "3.1.9", + "resolved": 
"https://registry.npmjs.org/array-includes/-/array-includes-3.1.9.tgz", + "integrity": "sha512-FmeCCAenzH0KH381SPT5FZmiA/TmpndpcaShhfgEN9eCVjnFBqq3l1xrI42y8+PPLI6hypzou4GXw00WHmPBLQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.24.0", + "es-object-atoms": "^1.1.1", + "get-intrinsic": "^1.3.0", + "is-string": "^1.1.1", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/array.prototype.filter": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/array.prototype.filter/-/array.prototype.filter-1.0.4.tgz", + "integrity": "sha512-r+mCJ7zXgXElgR4IRC+fkvNCeoaavWBs6EdCso5Tbcf+iEMKzBU/His60lt34WEZ9vlb8wDkZvQGcVI5GwkfoQ==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.2", + "es-array-method-boxes-properly": "^1.0.0", + "es-object-atoms": "^1.0.0", + "is-string": "^1.0.7" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.findlast": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/array.prototype.findlast/-/array.prototype.findlast-1.2.5.tgz", + "integrity": "sha512-CVvd6FHg1Z3POpBLxO6E6zr+rSKEQ9L6rZHAaY7lLfhKsWYUBBOuMs0e9o24oopj6H+geRCX0YJ+TJLBK2eHyQ==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.2", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.flat": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.3.tgz", + "integrity": "sha512-rwG/ja1neyLqCuGZ5YYrznA62D4mZXg0i1cIskIUKSiqF3Cje9/wXAls9B9s1Wa2fomMsIv8czB8jZcPmxCXFg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.flatmap": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/array.prototype.flatmap/-/array.prototype.flatmap-1.3.3.tgz", + "integrity": "sha512-Y7Wt51eKJSyi80hFrJCePGGNo5ktJCslFuboqJsbf57CCPcm5zztluPlc4/aD8sWsKvlwatezpV4U1efk8kpjg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.tosorted": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/array.prototype.tosorted/-/array.prototype.tosorted-1.1.4.tgz", + "integrity": "sha512-p6Fx8B7b7ZhL/gmUsAy0D15WhvDccw3mnGNbZpi3pmeJdxtWsj2jEaI4Y6oo3XiHfzuSgPwKc04MYt6KgvC/wA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": 
"^1.2.1", + "es-abstract": "^1.23.3", + "es-errors": "^1.3.0", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/arraybuffer.prototype.slice": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.4.tgz", + "integrity": "sha512-BNoCY6SXXPQ7gF2opIP4GBE+Xw7U+pHMYKuzjgCN3GwiaIR09UUeKfheyIry77QtrCBlC0KK0q5/TER/tYh3PQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-buffer-byte-length": "^1.0.1", + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "is-array-buffer": "^3.0.4" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/asn1js": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/asn1js/-/asn1js-3.0.7.tgz", + "integrity": "sha512-uLvq6KJu04qoQM6gvBfKFjlh6Gl0vOKQuR5cJMDHQkmwfMOQeN3F3SHCv9SNYSL+CRoHvOGFfllDlVz03GQjvQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "pvtsutils": "^1.3.6", + "pvutils": "^1.1.3", + "tslib": "^2.8.1" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/async": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", + "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", + "dev": true + }, + "node_modules/async-function": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/async-function/-/async-function-1.0.0.tgz", + "integrity": "sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/asynckit": { + "version": 
"0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "node_modules/at-least-node": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz", + "integrity": "sha512-+q/t7Ekv1EDY2l6Gda6LLiX14rU9TV20Wa3ofeQmwPFZbOMo9DXrLbOjFaaclkXKWidIaopwAObQDqwWtGUjqg==", + "dev": true, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/author-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/author-regex/-/author-regex-1.0.0.tgz", + "integrity": "sha512-KbWgR8wOYRAPekEmMXrYYdc7BRyhn2Ftk7KWfMUnQ43hFdojWEFRxhhRUm3/OFEdPa1r0KAvTTg9YQK57xTe0g==", + "dev": true, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/autoprefixer": { + "version": "10.4.20", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", + "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/autoprefixer" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "browserslist": "^4.23.3", + "caniuse-lite": "^1.0.30001646", + "fraction.js": "^4.3.7", + "normalize-range": "^0.1.2", + "picocolors": "^1.0.1", + "postcss-value-parser": "^4.2.0" + }, + "bin": { + "autoprefixer": "bin/autoprefixer" + }, + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/available-typed-arrays": { + "version": "1.0.7", + "resolved": 
"https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", + "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "possible-typed-array-names": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/axios": { + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.2.tgz", + "integrity": "sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==", + "dev": true, + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.4", + "proxy-from-env": "^1.1.0" + } + }, + "node_modules/babel-helper-evaluate-path": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-helper-evaluate-path/-/babel-helper-evaluate-path-0.5.0.tgz", + "integrity": "sha512-mUh0UhS607bGh5wUMAQfOpt2JX2ThXMtppHRdRU1kL7ZLRWIXxoV2UIV1r2cAeeNeU1M5SB5/RSUgUxrK8yOkA==", + "dev": true + }, + "node_modules/babel-helper-flip-expressions": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-helper-flip-expressions/-/babel-helper-flip-expressions-0.4.3.tgz", + "integrity": "sha512-rSrkRW4YQ2ETCWww9gbsWk4N0x1BOtln349Tk0dlCS90oT68WMLyGR7WvaMp3eAnsVrCqdUtC19lo1avyGPejA==", + "dev": true + }, + "node_modules/babel-helper-is-nodes-equiv": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/babel-helper-is-nodes-equiv/-/babel-helper-is-nodes-equiv-0.0.1.tgz", + "integrity": "sha512-ri/nsMFVRqXn7IyT5qW4/hIAGQxuYUFHa3qsxmPtbk6spZQcYlyDogfVpNm2XYOslH/ULS4VEJGUqQX5u7ACQw==", + "dev": true + }, + "node_modules/babel-helper-is-void-0": { + "version": "0.4.3", + "resolved": 
"https://registry.npmjs.org/babel-helper-is-void-0/-/babel-helper-is-void-0-0.4.3.tgz", + "integrity": "sha512-07rBV0xPRM3TM5NVJEOQEkECX3qnHDjaIbFvWYPv+T1ajpUiVLiqTfC+MmiZxY5KOL/Ec08vJdJD9kZiP9UkUg==", + "dev": true + }, + "node_modules/babel-helper-mark-eval-scopes": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-helper-mark-eval-scopes/-/babel-helper-mark-eval-scopes-0.4.3.tgz", + "integrity": "sha512-+d/mXPP33bhgHkdVOiPkmYoeXJ+rXRWi7OdhwpyseIqOS8CmzHQXHUp/+/Qr8baXsT0kjGpMHHofHs6C3cskdA==", + "dev": true + }, + "node_modules/babel-helper-remove-or-void": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-helper-remove-or-void/-/babel-helper-remove-or-void-0.4.3.tgz", + "integrity": "sha512-eYNceYtcGKpifHDir62gHJadVXdg9fAhuZEXiRQnJJ4Yi4oUTpqpNY//1pM4nVyjjDMPYaC2xSf0I+9IqVzwdA==", + "dev": true + }, + "node_modules/babel-helper-to-multiple-sequence-expressions": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-helper-to-multiple-sequence-expressions/-/babel-helper-to-multiple-sequence-expressions-0.5.0.tgz", + "integrity": "sha512-m2CvfDW4+1qfDdsrtf4dwOslQC3yhbgyBFptncp4wvtdrDHqueW7slsYv4gArie056phvQFhT2nRcGS4bnm6mA==", + "dev": true + }, + "node_modules/babel-jest": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-29.7.0.tgz", + "integrity": "sha512-BrvGY3xZSwEcCzKvKsCi2GgHqDqsYkOP4/by5xCgIwGXQxIEh+8ew3gmrE1y7XRR6LHZIj6yLYnUi/mm2KXKBg==", + "dev": true, + "dependencies": { + "@jest/transform": "^29.7.0", + "@types/babel__core": "^7.1.14", + "babel-plugin-istanbul": "^6.1.1", + "babel-preset-jest": "^29.6.3", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.9", + "slash": "^3.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "@babel/core": "^7.8.0" + } + }, + 
"node_modules/babel-jest/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-loader": { + "version": "9.2.1", + "resolved": "https://registry.npmjs.org/babel-loader/-/babel-loader-9.2.1.tgz", + "integrity": "sha512-fqe8naHt46e0yIdkjUZYqddSXfej3AHajX+CSO5X7oy0EmPc6o5Xh+RClNoHjnieWz9AW4kZxW9yyFMhVB1QLA==", + "dev": true, + "dependencies": { + "find-cache-dir": "^4.0.0", + "schema-utils": "^4.0.0" + }, + "engines": { + "node": ">= 14.15.0" + }, + "peerDependencies": { + "@babel/core": "^7.12.0", + "webpack": ">=5" + } + }, + "node_modules/babel-loader/node_modules/find-cache-dir": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-4.0.0.tgz", + "integrity": "sha512-9ZonPT4ZAK4a+1pUPVPZJapbi7O5qbbJPdYw/NOQWZZbVLdDTYM3A4R9z/DpAM08IDaFGsvPgiGZ82WEwUDWjg==", + "dev": true, + "dependencies": { + "common-path-prefix": "^3.0.0", + "pkg-dir": "^7.0.0" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/find-up": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-6.3.0.tgz", + "integrity": "sha512-v2ZsoEuVHYy8ZIlYqwPe/39Cy+cFDzp4dXPaxNvkEuouymu+2Jbz0PxpKarJHYJTmv2HWT3O382qY8l4jMWthw==", + "dev": true, + "dependencies": { + "locate-path": "^7.1.0", + "path-exists": "^5.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/locate-path": { + 
"version": "7.2.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-7.2.0.tgz", + "integrity": "sha512-gvVijfZvn7R+2qyPX8mAuKcFGDf6Nc61GdvGafQsHL0sBIxfKzA+usWn4GFC/bk+QdwPUD4kWFJLhElipq+0VA==", + "dev": true, + "dependencies": { + "p-locate": "^6.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/p-limit": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-4.0.0.tgz", + "integrity": "sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ==", + "dev": true, + "dependencies": { + "yocto-queue": "^1.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/p-locate": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-6.0.0.tgz", + "integrity": "sha512-wPrq66Llhl7/4AGC6I+cqxT07LhXvWL08LNXz1fENOw0Ap4sRZZ/gZpTTJ5jpurzzzfS2W/Ge9BY3LgLjCShcw==", + "dev": true, + "dependencies": { + "p-limit": "^4.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/path-exists": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-5.0.0.tgz", + "integrity": "sha512-RjhtfwJOxzcFmNOi6ltcbcu4Iu+FL3zEj83dk4kAS+fVpTxXLO1b38RvJgT/0QwvV/L3aY9TAnyv0EOqW4GoMQ==", + "dev": true, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + } + }, + "node_modules/babel-loader/node_modules/pkg-dir": { + "version": "7.0.0", + "resolved": 
"https://registry.npmjs.org/pkg-dir/-/pkg-dir-7.0.0.tgz", + "integrity": "sha512-Ie9z/WINcxxLp27BKOCHGde4ITq9UklYKDzVo1nhk5sqGEXU3FpkwP5GM2voTGJkGd9B3Otl+Q4uwSOeSUtOBA==", + "dev": true, + "dependencies": { + "find-up": "^6.3.0" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-loader/node_modules/yocto-queue": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.1.1.tgz", + "integrity": "sha512-b4JR1PFR10y1mKjhHY9LaGo6tmrgjit7hxVIeAmyMw3jegXR4dhYqLaQF5zMXZxY7tLpMyJeLjr1C4rLmkVe8g==", + "dev": true, + "engines": { + "node": ">=12.20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/babel-plugin-istanbul": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/babel-plugin-istanbul/-/babel-plugin-istanbul-6.1.1.tgz", + "integrity": "sha512-Y1IQok9821cC9onCx5otgFfRm7Lm+I+wwxOx738M/WLPZ9Q42m4IG5W0FNX8WLL2gYMZo3JkuXIH2DOpWM+qwA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.0.0", + "@istanbuljs/load-nyc-config": "^1.0.0", + "@istanbuljs/schema": "^0.1.2", + "istanbul-lib-instrument": "^5.0.4", + "test-exclude": "^6.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-plugin-istanbul/node_modules/istanbul-lib-instrument": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-5.2.1.tgz", + "integrity": "sha512-pzqtp31nLv/XFOzXGuvhCb8qhjmTVo5vjVk19XE4CRlSWz0KoeJ3bw9XsA7nOp9YBf4qHjwBxkDzKcME/J29Yg==", + "dev": true, + "dependencies": { + "@babel/core": "^7.12.3", + "@babel/parser": "^7.14.7", + "@istanbuljs/schema": "^0.1.2", + "istanbul-lib-coverage": "^3.2.0", + "semver": "^6.3.0" + }, + "engines": { + "node": ">=8" 
+ } + }, + "node_modules/babel-plugin-jest-hoist": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/babel-plugin-jest-hoist/-/babel-plugin-jest-hoist-29.6.3.tgz", + "integrity": "sha512-ESAc/RJvGTFEzRwOTT4+lNDk/GNHMkKbNzsvT0qKRfDyyYTskxB5rnU2njIDYVxXCBHHEI1c0YwHob3WaYujOg==", + "dev": true, + "dependencies": { + "@babel/template": "^7.3.3", + "@babel/types": "^7.3.3", + "@types/babel__core": "^7.1.14", + "@types/babel__traverse": "^7.0.6" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/babel-plugin-minify-builtins": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-builtins/-/babel-plugin-minify-builtins-0.5.0.tgz", + "integrity": "sha512-wpqbN7Ov5hsNwGdzuzvFcjgRlzbIeVv1gMIlICbPj0xkexnfoIDe7q+AZHMkQmAE/F9R5jkrB6TLfTegImlXag==", + "dev": true + }, + "node_modules/babel-plugin-minify-constant-folding": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-constant-folding/-/babel-plugin-minify-constant-folding-0.5.0.tgz", + "integrity": "sha512-Vj97CTn/lE9hR1D+jKUeHfNy+m1baNiJ1wJvoGyOBUx7F7kJqDZxr9nCHjO/Ad+irbR3HzR6jABpSSA29QsrXQ==", + "dev": true, + "dependencies": { + "babel-helper-evaluate-path": "^0.5.0" + } + }, + "node_modules/babel-plugin-minify-dead-code-elimination": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-dead-code-elimination/-/babel-plugin-minify-dead-code-elimination-0.5.2.tgz", + "integrity": "sha512-krq9Lwi0QIzyAlcNBXTL4usqUvevB4BzktdEsb8srcXC1AaYqRJiAQw6vdKdJSaXbz6snBvziGr6ch/aoRCfpA==", + "dev": true, + "dependencies": { + "babel-helper-evaluate-path": "^0.5.0", + "babel-helper-mark-eval-scopes": "^0.4.3", + "babel-helper-remove-or-void": "^0.4.3", + "lodash": "^4.17.11" + } + }, + "node_modules/babel-plugin-minify-flip-comparisons": { + "version": "0.4.3", + "resolved": 
"https://registry.npmjs.org/babel-plugin-minify-flip-comparisons/-/babel-plugin-minify-flip-comparisons-0.4.3.tgz", + "integrity": "sha512-8hNwgLVeJzpeLVOVArag2DfTkbKodzOHU7+gAZ8mGBFGPQHK6uXVpg3jh5I/F6gfi5Q5usWU2OKcstn1YbAV7A==", + "dev": true, + "dependencies": { + "babel-helper-is-void-0": "^0.4.3" + } + }, + "node_modules/babel-plugin-minify-guarded-expressions": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-guarded-expressions/-/babel-plugin-minify-guarded-expressions-0.4.4.tgz", + "integrity": "sha512-RMv0tM72YuPPfLT9QLr3ix9nwUIq+sHT6z8Iu3sLbqldzC1Dls8DPCywzUIzkTx9Zh1hWX4q/m9BPoPed9GOfA==", + "dev": true, + "dependencies": { + "babel-helper-evaluate-path": "^0.5.0", + "babel-helper-flip-expressions": "^0.4.3" + } + }, + "node_modules/babel-plugin-minify-infinity": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-infinity/-/babel-plugin-minify-infinity-0.4.3.tgz", + "integrity": "sha512-X0ictxCk8y+NvIf+bZ1HJPbVZKMlPku3lgYxPmIp62Dp8wdtbMLSekczty3MzvUOlrk5xzWYpBpQprXUjDRyMA==", + "dev": true + }, + "node_modules/babel-plugin-minify-mangle-names": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-mangle-names/-/babel-plugin-minify-mangle-names-0.5.1.tgz", + "integrity": "sha512-8KMichAOae2FHlipjNDTo2wz97MdEb2Q0jrn4NIRXzHH7SJ3c5TaNNBkeTHbk9WUsMnqpNUx949ugM9NFWewzw==", + "dev": true, + "dependencies": { + "babel-helper-mark-eval-scopes": "^0.4.3" + } + }, + "node_modules/babel-plugin-minify-numeric-literals": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-numeric-literals/-/babel-plugin-minify-numeric-literals-0.4.3.tgz", + "integrity": "sha512-5D54hvs9YVuCknfWywq0eaYDt7qYxlNwCqW9Ipm/kYeS9gYhJd0Rr/Pm2WhHKJ8DC6aIlDdqSBODSthabLSX3A==", + "dev": true + }, + 
"node_modules/babel-plugin-minify-replace": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-replace/-/babel-plugin-minify-replace-0.5.0.tgz", + "integrity": "sha512-aXZiaqWDNUbyNNNpWs/8NyST+oU7QTpK7J9zFEFSA0eOmtUNMU3fczlTTTlnCxHmq/jYNFEmkkSG3DDBtW3Y4Q==", + "dev": true + }, + "node_modules/babel-plugin-minify-simplify": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-simplify/-/babel-plugin-minify-simplify-0.5.1.tgz", + "integrity": "sha512-OSYDSnoCxP2cYDMk9gxNAed6uJDiDz65zgL6h8d3tm8qXIagWGMLWhqysT6DY3Vs7Fgq7YUDcjOomhVUb+xX6A==", + "dev": true, + "dependencies": { + "babel-helper-evaluate-path": "^0.5.0", + "babel-helper-flip-expressions": "^0.4.3", + "babel-helper-is-nodes-equiv": "^0.0.1", + "babel-helper-to-multiple-sequence-expressions": "^0.5.0" + } + }, + "node_modules/babel-plugin-minify-type-constructors": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-plugin-minify-type-constructors/-/babel-plugin-minify-type-constructors-0.4.3.tgz", + "integrity": "sha512-4ADB0irJ/6BeXWHubjCJmrPbzhxDgjphBMjIjxCc25n4NGJ00NsYqwYt+F/OvE9RXx8KaSW7cJvp+iZX436tnQ==", + "dev": true, + "dependencies": { + "babel-helper-is-void-0": "^0.4.3" + } + }, + "node_modules/babel-plugin-polyfill-corejs2": { + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.4.11.tgz", + "integrity": "sha512-sMEJ27L0gRHShOh5G54uAAPaiCOygY/5ratXuiyb2G46FmlSpc9eFCzYVyDiPxfNbwzA7mYahmjQc5q+CZQ09Q==", + "dev": true, + "dependencies": { + "@babel/compat-data": "^7.22.6", + "@babel/helper-define-polyfill-provider": "^0.6.2", + "semver": "^6.3.1" + }, + "peerDependencies": { + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/babel-plugin-polyfill-corejs3": { + "version": "0.10.6", + "resolved": 
"https://registry.npmjs.org/babel-plugin-polyfill-corejs3/-/babel-plugin-polyfill-corejs3-0.10.6.tgz", + "integrity": "sha512-b37+KR2i/khY5sKmWNVQAnitvquQbNdWy6lJdsr0kmquCKEEUgMKK4SboVM3HtfnZilfjr4MMQ7vY58FVWDtIA==", + "dev": true, + "dependencies": { + "@babel/helper-define-polyfill-provider": "^0.6.2", + "core-js-compat": "^3.38.0" + }, + "peerDependencies": { + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/babel-plugin-polyfill-regenerator": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-regenerator/-/babel-plugin-polyfill-regenerator-0.6.2.tgz", + "integrity": "sha512-2R25rQZWP63nGwaAswvDazbPXfrM3HwVoBXK6HcqeKrSrL/JqcC/rDcf95l4r7LXLyxDXc8uQDa064GubtCABg==", + "dev": true, + "dependencies": { + "@babel/helper-define-polyfill-provider": "^0.6.2" + }, + "peerDependencies": { + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/babel-plugin-transform-inline-consecutive-adds": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-inline-consecutive-adds/-/babel-plugin-transform-inline-consecutive-adds-0.4.3.tgz", + "integrity": "sha512-8D104wbzzI5RlxeVPYeQb9QsUyepiH1rAO5hpPpQ6NPRgQLpIVwkS/Nbx944pm4K8Z+rx7CgjPsFACz/VCBN0Q==", + "dev": true + }, + "node_modules/babel-plugin-transform-member-expression-literals": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-member-expression-literals/-/babel-plugin-transform-member-expression-literals-6.9.4.tgz", + "integrity": "sha512-Xq9/Rarpj+bjOZSl1nBbZYETsNEDDJSrb6Plb1sS3/36FukWFLLRysgecva5KZECjUJTrJoQqjJgtWToaflk5Q==", + "dev": true + }, + "node_modules/babel-plugin-transform-merge-sibling-variables": { + "version": "6.9.5", + "resolved": 
"https://registry.npmjs.org/babel-plugin-transform-merge-sibling-variables/-/babel-plugin-transform-merge-sibling-variables-6.9.5.tgz", + "integrity": "sha512-xj/KrWi6/uP+DrD844h66Qh2cZN++iugEIgH8QcIxhmZZPNP6VpOE9b4gP2FFW39xDAY43kCmYMM6U0QNKN8fw==", + "dev": true + }, + "node_modules/babel-plugin-transform-minify-booleans": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-minify-booleans/-/babel-plugin-transform-minify-booleans-6.9.4.tgz", + "integrity": "sha512-9pW9ePng6DZpzGPalcrULuhSCcauGAbn8AeU3bE34HcDkGm8Ldt0ysjGkyb64f0K3T5ilV4mriayOVv5fg0ASA==", + "dev": true + }, + "node_modules/babel-plugin-transform-property-literals": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-property-literals/-/babel-plugin-transform-property-literals-6.9.4.tgz", + "integrity": "sha512-Pf8JHTjTPxecqVyL6KSwD/hxGpoTZjiEgV7nCx0KFQsJYM0nuuoCajbg09KRmZWeZbJ5NGTySABYv8b/hY1eEA==", + "dev": true, + "dependencies": { + "esutils": "^2.0.2" + } + }, + "node_modules/babel-plugin-transform-regexp-constructors": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-regexp-constructors/-/babel-plugin-transform-regexp-constructors-0.4.3.tgz", + "integrity": "sha512-JjymDyEyRNhAoNFp09y/xGwYVYzT2nWTGrBrWaL6eCg2m+B24qH2jR0AA8V8GzKJTgC8NW6joJmc6nabvWBD/g==", + "dev": true + }, + "node_modules/babel-plugin-transform-remove-console": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-remove-console/-/babel-plugin-transform-remove-console-6.9.4.tgz", + "integrity": "sha512-88blrUrMX3SPiGkT1GnvVY8E/7A+k6oj3MNvUtTIxJflFzXTw1bHkuJ/y039ouhFMp2prRn5cQGzokViYi1dsg==", + "dev": true + }, + "node_modules/babel-plugin-transform-remove-debugger": { + "version": "6.9.4", + "resolved": 
"https://registry.npmjs.org/babel-plugin-transform-remove-debugger/-/babel-plugin-transform-remove-debugger-6.9.4.tgz", + "integrity": "sha512-Kd+eTBYlXfwoFzisburVwrngsrz4xh9I0ppoJnU/qlLysxVBRgI4Pj+dk3X8F5tDiehp3hhP8oarRMT9v2Z3lw==", + "dev": true + }, + "node_modules/babel-plugin-transform-remove-undefined": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-remove-undefined/-/babel-plugin-transform-remove-undefined-0.5.0.tgz", + "integrity": "sha512-+M7fJYFaEE/M9CXa0/IRkDbiV3wRELzA1kKQFCJ4ifhrzLKn/9VCCgj9OFmYWwBd8IB48YdgPkHYtbYq+4vtHQ==", + "dev": true, + "dependencies": { + "babel-helper-evaluate-path": "^0.5.0" + } + }, + "node_modules/babel-plugin-transform-simplify-comparison-operators": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-simplify-comparison-operators/-/babel-plugin-transform-simplify-comparison-operators-6.9.4.tgz", + "integrity": "sha512-GLInxhGAQWJ9YIdjwF6dAFlmh4U+kN8pL6Big7nkDzHoZcaDQOtBm28atEhQJq6m9GpAovbiGEShKqXv4BSp0A==", + "dev": true + }, + "node_modules/babel-plugin-transform-undefined-to-void": { + "version": "6.9.4", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-undefined-to-void/-/babel-plugin-transform-undefined-to-void-6.9.4.tgz", + "integrity": "sha512-D2UbwxawEY1xVc9svYAUZQM2xarwSNXue2qDIx6CeV2EuMGaes/0su78zlIDIAgE7BvnMw4UpmSo9fDy+znghg==", + "dev": true + }, + "node_modules/babel-preset-current-node-syntax": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/babel-preset-current-node-syntax/-/babel-preset-current-node-syntax-1.1.0.tgz", + "integrity": "sha512-ldYss8SbBlWva1bs28q78Ju5Zq1F+8BrqBZZ0VFhLBvhh6lCpC2o3gDJi/5DRLs9FgYZCnmPYIVFU4lRXCkyUw==", + "dev": true, + "dependencies": { + "@babel/plugin-syntax-async-generators": "^7.8.4", + "@babel/plugin-syntax-bigint": "^7.8.3", + 
"@babel/plugin-syntax-class-properties": "^7.12.13", + "@babel/plugin-syntax-class-static-block": "^7.14.5", + "@babel/plugin-syntax-import-attributes": "^7.24.7", + "@babel/plugin-syntax-import-meta": "^7.10.4", + "@babel/plugin-syntax-json-strings": "^7.8.3", + "@babel/plugin-syntax-logical-assignment-operators": "^7.10.4", + "@babel/plugin-syntax-nullish-coalescing-operator": "^7.8.3", + "@babel/plugin-syntax-numeric-separator": "^7.10.4", + "@babel/plugin-syntax-object-rest-spread": "^7.8.3", + "@babel/plugin-syntax-optional-catch-binding": "^7.8.3", + "@babel/plugin-syntax-optional-chaining": "^7.8.3", + "@babel/plugin-syntax-private-property-in-object": "^7.14.5", + "@babel/plugin-syntax-top-level-await": "^7.14.5" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/babel-preset-jest": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-29.6.3.tgz", + "integrity": "sha512-0B3bhxR6snWXJZtR/RliHTDPRgn1sNHOR0yVtq/IiQFyuOVjFS+wuio/R4gSNkyYmKmJB4wGZv2NZanmKmTnNA==", + "dev": true, + "dependencies": { + "babel-plugin-jest-hoist": "^29.6.3", + "babel-preset-current-node-syntax": "^1.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/babel-preset-minify": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/babel-preset-minify/-/babel-preset-minify-0.5.2.tgz", + "integrity": "sha512-v4GL+kk0TfovbRIKZnC3HPbu2cAGmPAby7BsOmuPdMJfHV+4FVdsGXTH/OOGQRKYdjemBuL1+MsE6mobobhe9w==", + "dev": true, + "dependencies": { + "babel-plugin-minify-builtins": "^0.5.0", + "babel-plugin-minify-constant-folding": "^0.5.0", + "babel-plugin-minify-dead-code-elimination": "^0.5.2", + "babel-plugin-minify-flip-comparisons": "^0.4.3", + "babel-plugin-minify-guarded-expressions": "^0.4.4", + "babel-plugin-minify-infinity": "^0.4.3", + 
"babel-plugin-minify-mangle-names": "^0.5.1", + "babel-plugin-minify-numeric-literals": "^0.4.3", + "babel-plugin-minify-replace": "^0.5.0", + "babel-plugin-minify-simplify": "^0.5.1", + "babel-plugin-minify-type-constructors": "^0.4.3", + "babel-plugin-transform-inline-consecutive-adds": "^0.4.3", + "babel-plugin-transform-member-expression-literals": "^6.9.4", + "babel-plugin-transform-merge-sibling-variables": "^6.9.5", + "babel-plugin-transform-minify-booleans": "^6.9.4", + "babel-plugin-transform-property-literals": "^6.9.4", + "babel-plugin-transform-regexp-constructors": "^0.4.3", + "babel-plugin-transform-remove-console": "^6.9.4", + "babel-plugin-transform-remove-debugger": "^6.9.4", + "babel-plugin-transform-remove-undefined": "^0.5.0", + "babel-plugin-transform-simplify-comparison-operators": "^6.9.4", + "babel-plugin-transform-undefined-to-void": "^6.9.4", + "lodash": "^4.17.11" + } + }, + "node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true + }, + "node_modules/batch": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/batch/-/batch-0.6.1.tgz", + "integrity": "sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==", + "dev": true + }, + "node_modules/big.js": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-5.2.2.tgz", + "integrity": "sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ==", + "dev": true, + "engines": { + "node": "*" + } + }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": 
"sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/body-parser": { + "version": "1.20.4", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.4.tgz", + "integrity": "sha512-ZTgYYLMOXY9qKU/57FAo8F+HA2dGX7bqGc71txDRC1rS4frdFI5R7NhluHxH6M0YItAP0sHB4uqAOcYKxO6uGA==", + "dev": true, + "license": "MIT", + "dependencies": { + "bytes": "~3.1.2", + "content-type": "~1.0.5", + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "~1.2.0", + "http-errors": "~2.0.1", + "iconv-lite": "~0.4.24", + "on-finished": "~2.4.1", + "qs": "~6.14.0", + "raw-body": "~2.5.3", + "type-is": "~1.6.18", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.8", + "npm": "1.2.8000 || >= 1.4.16" + } + }, + "node_modules/body-parser/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/body-parser/node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dev": true, + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/body-parser/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": 
"sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true, + "license": "MIT" + }, + "node_modules/bonjour-service": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.2.1.tgz", + "integrity": "sha512-oSzCS2zV14bh2kji6vNe7vrpJYCHGvcZnlffFQ1MEoX/WOeQ/teD8SYWKR942OI3INjq8OMNJlbPK5LLLUxFDw==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.3", + "multicast-dns": "^7.2.5" + } + }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, + "node_modules/brace-expansion": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", + "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "dev": true, + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/browserslist": { + "version": "4.24.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.2.tgz", + "integrity": "sha512-ZIc+Q62revdMcqC6aChtW4jz3My3klmCO1fEmINZY/8J3EpBg5/A/D0AKmBveUh6pgoeycoMkVMko84tuYS+Gg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": 
"tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "caniuse-lite": "^1.0.30001669", + "electron-to-chromium": "^1.5.41", + "node-releases": "^2.0.18", + "update-browserslist-db": "^1.1.1" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/bs-logger": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/bs-logger/-/bs-logger-0.2.6.tgz", + "integrity": "sha512-pd8DCoxmbgc7hyPKOvxtqNcjYoOsABPQdcCUjGp3d42VR2CX1ORhk2A87oqqu5R1kk+76nsxZupkmyd+MVtCog==", + "dev": true, + "dependencies": { + "fast-json-stable-stringify": "2.x" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/bser": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/bser/-/bser-2.1.1.tgz", + "integrity": "sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==", + "dev": true, + "dependencies": { + "node-int64": "^0.4.0" + } + }, + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "dev": true, + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + 
"node_modules/bytes": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/bytestreamjs": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/bytestreamjs/-/bytestreamjs-2.0.1.tgz", + "integrity": "sha512-U1Z/ob71V/bXfVABvNr/Kumf5VyeQRBEm6Txb0PQ6S7V5GpBM3w4Cbqz/xPDicR5tN0uvDifng8C+5qECeGwyQ==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/call-bind": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.8.tgz", + "integrity": "sha512-oKlSFMcMwpUg2ednkhQ454wfWiU/ul3CkJe/PEHcTKuiX6RpbehUiFMXu13HalGZxfUwCQzZG747YXBn1im9ww==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.0", + "es-define-property": "^1.0.0", + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/call-bound": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", + "integrity": 
"sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "get-intrinsic": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/call-me-maybe": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-me-maybe/-/call-me-maybe-1.0.2.tgz", + "integrity": "sha512-HpX65o1Hnr9HH25ojC1YGs7HCQLq0GCOibSaWER0eNpgJ/Z1MZv2mTc7+xh6WOPxbRVcmgbv4hGU+uSQ/2xFZQ==", + "dev": true + }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/camel-case": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/camel-case/-/camel-case-4.1.2.tgz", + "integrity": "sha512-gxGWBrTT1JuMx6R+o5PTXMmUnhnVzLQ9SNutD4YqKtI6ap897t3tKECYla6gCWEkplXnlNybEkZg9GEGxKFCgw==", + "dev": true, + "dependencies": { + "pascal-case": "^3.1.2", + "tslib": "^2.0.3" + } + }, + "node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/camelize": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/camelize/-/camelize-1.0.1.tgz", + "integrity": 
"sha512-dU+Tx2fsypxTgtLoE36npi3UqcjSSMNYfkqgmoEhtZrraP5VWq0K7FkWVTYa8eMPtnU/G2txVsfdCJTn9uzpuQ==", + "dev": true, + "peer": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/caniuse-api": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/caniuse-api/-/caniuse-api-3.0.0.tgz", + "integrity": "sha512-bsTwuIg/BZZK/vreVTYYbSWoe2F+71P7K5QGEX+pT250DZbfU1MQ5prOKpPR+LL6uWKK3KMwMCAS74QB3Um1uw==", + "dependencies": { + "browserslist": "^4.0.0", + "caniuse-lite": "^1.0.0", + "lodash.memoize": "^4.1.2", + "lodash.uniq": "^4.5.0" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001675", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001675.tgz", + "integrity": "sha512-/wV1bQwPrkLiQMjaJF5yUMVM/VdRPOCU8QZ+PmG6uW6DvYSrNY1bpwHI/3mOcUosLaJCzYDi5o91IQB51ft6cg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ] + }, + "node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/char-regex": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/char-regex/-/char-regex-1.0.2.tgz", + "integrity": 
"sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/charenc": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz", + "integrity": "sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==", + "engines": { + "node": "*" + } + }, + "node_modules/cheerio": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", + "dev": true, + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=18.17" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "dev": true, + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": 
"sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/chrome-trace-event": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/chrome-trace-event/-/chrome-trace-event-1.0.4.tgz", + "integrity": "sha512-rNjApaLzuwaOTjCiT8lSDdGN1APCiqkChLMJxJPWLunPAt5fy8xgU9/jNOchV84wfIxrA0lRQB7oCT8jrn/wrQ==", + "engines": { + "node": ">=6.0" + } + }, + "node_modules/ci-info": { + "version": "3.9.0", + "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", + "integrity": "sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/sibiraj-s" + } + ], + "engines": { + "node": ">=8" + } + }, + "node_modules/circular-dependency-plugin": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/circular-dependency-plugin/-/circular-dependency-plugin-5.2.2.tgz", + "integrity": "sha512-g38K9Cm5WRwlaH6g03B9OEz/0qRizI+2I7n+Gz+L5DxXJAPAiWQvwlYNm1V1jkdpUv95bOe/ASm2vfi/G560jQ==", + "dev": true, + "engines": { + "node": ">=6.0.0" + }, + "peerDependencies": { + "webpack": ">=4.0.1" + } + }, + "node_modules/cjs-module-lexer": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.4.1.tgz", + "integrity": "sha512-cuSVIHi9/9E/+821Qjdvngor+xpnlwnuwIyZOaLmHBVdXL+gP+I6QQB9VkO7RI77YIcTV+S1W9AreJ5eN63JBA==", + 
"dev": true + }, + "node_modules/classnames": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/classnames/-/classnames-2.5.1.tgz", + "integrity": "sha512-saHYOzhIQs6wy2sVxTM6bUDsQO4F50V9RQ22qBpEdCW+I+/Wmke2HOl6lS6dTpdxVhb88/I6+Hs+438c3lfUow==" + }, + "node_modules/clean-css": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/clean-css/-/clean-css-5.3.3.tgz", + "integrity": "sha512-D5J+kHaVb/wKSFcyyV75uCn8fiY4sV38XJoe4CUyGQ+mOU/fMVYUdH1hJC+CJQ5uY3EnW27SbJYS4X8BiLrAFg==", + "dev": true, + "dependencies": { + "source-map": "~0.6.0" + }, + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/clean-css/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/cli-cursor": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", + "integrity": "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==", + "dev": true, + "license": "MIT", + "dependencies": { + "restore-cursor": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cli-truncate": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-4.0.0.tgz", + "integrity": "sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==", + "dev": true, + "license": "MIT", + "dependencies": { + "slice-ansi": "^5.0.0", + "string-width": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cli-truncate/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/cli-truncate/node_modules/emoji-regex": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", + "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "dev": true, + "license": "MIT" + }, + "node_modules/cli-truncate/node_modules/string-width": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cli-truncate/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": 
"https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/cliui": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", + "dev": true, + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.1", + "wrap-ansi": "^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/clone-deep": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-4.0.1.tgz", + "integrity": "sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==", + "dev": true, + "dependencies": { + "is-plain-object": "^2.0.4", + "kind-of": "^6.0.2", + "shallow-clone": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/clsx": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-1.2.1.tgz", + "integrity": "sha512-EcR6r5a8bj6pu3ycsa/E/cKVGuTgZJZdsyUYHOksG/UHIiKfjxzRxYJpyVBwYaQeOvghal9fcc4PidlgzugAQg==", + "engines": { + "node": ">=6" + } + }, + "node_modules/co": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", + "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==", + "dev": true, + "engines": { + "iojs": ">= 1.0.0", + "node": ">= 0.12.0" + } + }, + "node_modules/collect-v8-coverage": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.2.tgz", + "integrity": "sha512-lHl4d5/ONEbLlJvaJNtsF/Lz+WvB07u2ycqTYbdrq7UypDXailES4valYb2eWiJFxZlVmpGekfqoxQhzyFdT4Q==", + "dev": true + }, + "node_modules/color": { + "version": "4.2.3", + "resolved": 
"https://registry.npmjs.org/color/-/color-4.2.3.tgz", + "integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1", + "color-string": "^1.9.0" + }, + "engines": { + "node": ">=12.5.0" + } + }, + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==" + }, + "node_modules/color-string": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", + "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", + "dev": true, + "dependencies": { + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" + } + }, + "node_modules/colord": { + "version": "2.9.3", + "resolved": "https://registry.npmjs.org/colord/-/colord-2.9.3.tgz", + "integrity": "sha512-jeC1axXpnb0/2nn/Y1LPuLdgXBLH7aDcHu4KEKfqw3CUhX7ZpfBSlPKyqXE6btIgEzfWtrX3/tyBCaCvXvMkOw==" + }, + "node_modules/colorette": { + "version": "2.0.20", + "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz", + "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==", + "dev": true + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": 
"https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/commander": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-6.2.1.tgz", + "integrity": "sha512-U7VdrJFnJgo4xjrHpTzu0yrHPGImdsmD95ZlgYSEajAn2JKzDhDTPG9kBTefmObL2w/ngeZnilk+OV9CG3d7UA==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, + "node_modules/common-path-prefix": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/common-path-prefix/-/common-path-prefix-3.0.0.tgz", + "integrity": "sha512-QE33hToZseCH3jS0qN96O/bSh3kaw/h+Tq7ngyY9eWDUnTlTNUyqfqvCXioLe5Na5jFsL78ra/wuBU4iuEgd4w==", + "dev": true + }, + "node_modules/commondir": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/commondir/-/commondir-1.0.1.tgz", + "integrity": "sha512-W9pAhw0ja1Edb5GVdIF1mjZw/ASI0AlShXM83UUGe2DVr5TdAPEA1OA8m/g8zWp9x6On7gqufY+FatDbC3MDQg==", + "dev": true + }, + "node_modules/compressible": { + "version": "2.0.18", + "resolved": "https://registry.npmjs.org/compressible/-/compressible-2.0.18.tgz", + "integrity": "sha512-AF3r7P5dWxL8MxyITRMlORQNaOA2IkAFaTr4k7BUumjPtRpGDTZpl0Pb1XCO6JeDCBdp126Cgs9sMxqSjgYyRg==", + "dev": true, + "license": "MIT", + "dependencies": { + "mime-db": ">= 1.43.0 < 2" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/compression": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz", + "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==", + "dev": true, + "license": "MIT", + "dependencies": { + "bytes": 
"3.1.2", + "compressible": "~2.0.18", + "debug": "2.6.9", + "negotiator": "~0.6.4", + "on-headers": "~1.1.0", + "safe-buffer": "5.2.1", + "vary": "~1.1.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/compression/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/compression/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true, + "license": "MIT" + }, + "node_modules/compression/node_modules/negotiator": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz", + "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true + }, + "node_modules/connect-history-api-fallback": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/connect-history-api-fallback/-/connect-history-api-fallback-2.0.0.tgz", + "integrity": "sha512-U73+6lQFmfiNPrYbXqr6kZ1i1wiRqXnp2nhMsINseWXO8lDau0LGEffJ8kQi4EjLZympVgRdvqjAgiZ1tgzDDA==", + "dev": true, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/content-disposition": { + "version": "0.5.4", + 
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", + "integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "5.2.1" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/content-type": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true + }, + "node_modules/cookie": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", + "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/cookie-signature": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.7.tgz", + "integrity": "sha512-NXdYc3dLr47pBkpUCHtKSwIOQXLVn8dZEuywboCOJY/osA0wFSLlSawr3KN8qXJEyX66FcONTH8EIlVuK0yyFA==", + "dev": true, + "license": "MIT" + }, + "node_modules/copy-webpack-plugin": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/copy-webpack-plugin/-/copy-webpack-plugin-11.0.0.tgz", + "integrity": 
"sha512-fX2MWpamkW0hZxMEg0+mYnA40LTosOSa5TqZ9GYIBzyJa9C3QUaMPSE2xAi/buNr8u89SfD9wHSQVBzrRa/SOQ==", + "dev": true, + "dependencies": { + "fast-glob": "^3.2.11", + "glob-parent": "^6.0.1", + "globby": "^13.1.1", + "normalize-path": "^3.0.0", + "schema-utils": "^4.0.0", + "serialize-javascript": "^6.0.0" + }, + "engines": { + "node": ">= 14.15.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.1.0" + } + }, + "node_modules/copy-webpack-plugin/node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/copy-webpack-plugin/node_modules/globby": { + "version": "13.2.2", + "resolved": "https://registry.npmjs.org/globby/-/globby-13.2.2.tgz", + "integrity": "sha512-Y1zNGV+pzQdh7H39l9zgB4PJqjRNqydvdYCDG4HFXM4XuvSaQQlEc91IU1yALL8gUTDomgBAfz3XJdmUS+oo0w==", + "dev": true, + "dependencies": { + "dir-glob": "^3.0.1", + "fast-glob": "^3.3.0", + "ignore": "^5.2.4", + "merge2": "^1.4.1", + "slash": "^4.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/copy-webpack-plugin/node_modules/slash": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-4.0.0.tgz", + "integrity": "sha512-3dOsAHXXUkQTpOYcoAxLIorMTp4gIQr5IW3iVb7A7lFIp0VHhnynm9izx6TssdrIcVIESAlVjtnO2K8bg+Coew==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } 
+ }, + "node_modules/core-js-compat": { + "version": "3.38.1", + "resolved": "https://registry.npmjs.org/core-js-compat/-/core-js-compat-3.38.1.tgz", + "integrity": "sha512-JRH6gfXxGmrzF3tZ57lFx97YARxCXPaMzPo6jELZhv88pBH5VXpQ+y0znKGlFnzuaihqhLbefxSJxWJMPtfDzw==", + "dev": true, + "dependencies": { + "browserslist": "^4.23.3" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/core-js" + } + }, + "node_modules/core-js-pure": { + "version": "3.38.1", + "resolved": "https://registry.npmjs.org/core-js-pure/-/core-js-pure-3.38.1.tgz", + "integrity": "sha512-BY8Etc1FZqdw1glX0XNOq2FDwfrg/VGqoZOZCdaL+UmdaqDwQwYXkMJT4t6In+zfEfOJDcM9T0KdbBeJg8KKCQ==", + "hasInstallScript": true, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/core-js" + } + }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "dev": true + }, + "node_modules/cosmiconfig": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-7.1.0.tgz", + "integrity": "sha512-AdmX6xUzdNASswsFtmwSt7Vj8po9IuqXm0UXz7QKPuEUmPB4XyjGfaAr2PSuELMwkRMVH1EpIkX5bTZGRB3eCA==", + "dev": true, + "dependencies": { + "@types/parse-json": "^4.0.0", + "import-fresh": "^3.2.1", + "parse-json": "^5.0.0", + "path-type": "^4.0.0", + "yaml": "^1.10.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/create-jest": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/create-jest/-/create-jest-29.7.0.tgz", + "integrity": "sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==", + "dev": true, + 
"dependencies": { + "@jest/types": "^29.6.3", + "chalk": "^4.0.0", + "exit": "^0.1.2", + "graceful-fs": "^4.2.9", + "jest-config": "^29.7.0", + "jest-util": "^29.7.0", + "prompts": "^2.0.1" + }, + "bin": { + "create-jest": "bin/create-jest.js" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/create-require": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", + "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", + "dev": true + }, + "node_modules/cross-env": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/cross-env/-/cross-env-7.0.3.tgz", + "integrity": "sha512-+/HKd6EgcQCJGh2PSjZuUitQBQynKor4wrFbRg4DtAgS1aWO+gU52xpH7M9ScGgXSYmAVS9bIJ8EzuaGw0oNAw==", + "dev": true, + "dependencies": { + "cross-spawn": "^7.0.1" + }, + "bin": { + "cross-env": "src/bin/cross-env.js", + "cross-env-shell": "src/bin/cross-env-shell.js" + }, + "engines": { + "node": ">=10.14", + "npm": ">=6", + "yarn": ">=1" + } + }, + "node_modules/cross-spawn": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", + "dev": true, + "license": "MIT", + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/crypt": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz", + "integrity": "sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==", + "engines": { + "node": "*" + } + }, + "node_modules/css-blank-pseudo": { + "version": "3.0.3", + "resolved": 
"https://registry.npmjs.org/css-blank-pseudo/-/css-blank-pseudo-3.0.3.tgz", + "integrity": "sha512-VS90XWtsHGqoM0t4KpH053c4ehxZ2E6HtGI7x68YFV0pTo/QmkV/YFA+NnlvK8guxZVNWGQhVNJGC39Q8XF4OQ==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.9" + }, + "bin": { + "css-blank-pseudo": "dist/cli.cjs" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/css-color-keywords": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/css-color-keywords/-/css-color-keywords-1.0.0.tgz", + "integrity": "sha512-FyyrDHZKEjXDpNJYvVsV960FiqQyXc/LlYmsxl2BcdMb2WPx0OGRVgTg55rPSyLSNMqP52R9r8geSp7apN3Ofg==", + "dev": true, + "peer": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/css-declaration-sorter": { + "version": "6.4.1", + "resolved": "https://registry.npmjs.org/css-declaration-sorter/-/css-declaration-sorter-6.4.1.tgz", + "integrity": "sha512-rtdthzxKuyq6IzqX6jEcIzQF/YqccluefyCYheovBOLhFT/drQA9zj/UbRAa9J7C0o6EG6u3E6g+vKkay7/k3g==", + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.0.9" + } + }, + "node_modules/css-has-pseudo": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/css-has-pseudo/-/css-has-pseudo-3.0.4.tgz", + "integrity": "sha512-Vse0xpR1K9MNlp2j5w1pgWIJtm1a8qS0JwS9goFYcImjlHEmywP9VUF05aGBXzGpDJF86QXk4L0ypBmwPhGArw==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.9" + }, + "bin": { + "css-has-pseudo": "dist/cli.cjs" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/css-loader": { + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/css-loader/-/css-loader-6.11.0.tgz", + "integrity": 
"sha512-CTJ+AEQJjq5NzLga5pE39qdiSV56F8ywCIsqNIRF0r7BDgWsN25aazToqAFg7ZrtA/U016xudB3ffgweORxX7g==", + "dev": true, + "dependencies": { + "icss-utils": "^5.1.0", + "postcss": "^8.4.33", + "postcss-modules-extract-imports": "^3.1.0", + "postcss-modules-local-by-default": "^4.0.5", + "postcss-modules-scope": "^3.2.0", + "postcss-modules-values": "^4.0.0", + "postcss-value-parser": "^4.2.0", + "semver": "^7.5.4" + }, + "engines": { + "node": ">= 12.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "@rspack/core": "0.x || 1.x", + "webpack": "^5.0.0" + }, + "peerDependenciesMeta": { + "@rspack/core": { + "optional": true + }, + "webpack": { + "optional": true + } + } + }, + "node_modules/css-loader/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/css-minimizer-webpack-plugin": { + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/css-minimizer-webpack-plugin/-/css-minimizer-webpack-plugin-4.2.2.tgz", + "integrity": "sha512-s3Of/4jKfw1Hj9CxEO1E5oXhQAxlayuHO2y/ML+C6I9sQ7FdzfEV6QgMLN3vI+qFsjJGIAFLKtQK7t8BOXAIyA==", + "dependencies": { + "cssnano": "^5.1.8", + "jest-worker": "^29.1.2", + "postcss": "^8.4.17", + "schema-utils": "^4.0.0", + "serialize-javascript": "^6.0.0", + "source-map": "^0.6.1" + }, + "engines": { + "node": ">= 14.15.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.0.0" + }, + "peerDependenciesMeta": { + "@parcel/css": { + "optional": true + }, + "@swc/css": { + "optional": 
true + }, + "clean-css": { + "optional": true + }, + "csso": { + "optional": true + }, + "esbuild": { + "optional": true + }, + "lightningcss": { + "optional": true + } + } + }, + "node_modules/css-minimizer-webpack-plugin/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/css-prefers-color-scheme": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/css-prefers-color-scheme/-/css-prefers-color-scheme-6.0.3.tgz", + "integrity": "sha512-4BqMbZksRkJQx2zAjrokiGMd07RqOa2IxIrrN10lyBe9xhn9DEvjUK79J6jkeiv9D9hQFXKb6g1jwU62jziJZA==", + "dev": true, + "bin": { + "css-prefers-color-scheme": "dist/cli.cjs" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dev": true, + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-selector-tokenizer": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", + "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==", + "dependencies": { + "cssesc": "^3.0.0", + "fastparse": "^1.1.2" + } + }, + "node_modules/css-to-react-native": { + "version": 
"3.2.0", + "resolved": "https://registry.npmjs.org/css-to-react-native/-/css-to-react-native-3.2.0.tgz", + "integrity": "sha512-e8RKaLXMOFii+02mOlqwjbD00KSEKqblnpO9e++1aXS1fPQOpS1YoqdVHBqPjHNoxeF2mimzVqawm2KCbEdtHQ==", + "dev": true, + "peer": true, + "dependencies": { + "camelize": "^1.0.0", + "css-color-keywords": "^1.0.0", + "postcss-value-parser": "^4.0.2" + } + }, + "node_modules/css-tree": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-1.1.3.tgz", + "integrity": "sha512-tRpdppF7TRazZrjJ6v3stzv93qxRcSsFmW6cX0Zm2NVKpxE1WV1HblnghVv9TreireHkqI/VDEsfolRF1p6y7Q==", + "dependencies": { + "mdn-data": "2.0.14", + "source-map": "^0.6.1" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/css-tree/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css.escape": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/css.escape/-/css.escape-1.5.1.tgz", + "integrity": "sha512-YUifsXXuknHlUsmlgyY0PKzgPOr7/FjCePfHNt0jxm83wHZi44VDMQ7/fGNkjY3/jV1MC+1CmZbaHzugyeRtpg==" + }, + "node_modules/cssdb": { + "version": "7.11.2", + "resolved": "https://registry.npmjs.org/cssdb/-/cssdb-7.11.2.tgz", + "integrity": 
"sha512-lhQ32TFkc1X4eTefGfYPvgovRSzIMofHkigfH8nWtyRL4XJLsRhJFreRvEgKzept7x1rjBuy3J/MurXLaFxW/A==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + } + ] + }, + "node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/cssnano": { + "version": "5.1.15", + "resolved": "https://registry.npmjs.org/cssnano/-/cssnano-5.1.15.tgz", + "integrity": "sha512-j+BKgDcLDQA+eDifLx0EO4XSA56b7uut3BQFH+wbSaSTuGLuiyTa/wbRYthUXX8LC9mLg+WWKe8h+qJuwTAbHw==", + "dependencies": { + "cssnano-preset-default": "^5.2.14", + "lilconfig": "^2.0.3", + "yaml": "^1.10.2" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/cssnano" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/cssnano-preset-default": { + "version": "5.2.14", + "resolved": "https://registry.npmjs.org/cssnano-preset-default/-/cssnano-preset-default-5.2.14.tgz", + "integrity": "sha512-t0SFesj/ZV2OTylqQVOrFgEh5uanxbO6ZAdeCrNsUQ6fVuXwYTxJPNAGvGTxHbD68ldIJNec7PyYZDBrfDQ+6A==", + "dependencies": { + "css-declaration-sorter": "^6.3.1", + "cssnano-utils": "^3.1.0", + "postcss-calc": "^8.2.3", + "postcss-colormin": "^5.3.1", + "postcss-convert-values": "^5.1.3", + "postcss-discard-comments": "^5.1.2", + "postcss-discard-duplicates": "^5.1.0", + "postcss-discard-empty": "^5.1.1", + "postcss-discard-overridden": "^5.1.0", + "postcss-merge-longhand": "^5.1.7", + 
"postcss-merge-rules": "^5.1.4", + "postcss-minify-font-values": "^5.1.0", + "postcss-minify-gradients": "^5.1.1", + "postcss-minify-params": "^5.1.4", + "postcss-minify-selectors": "^5.2.1", + "postcss-normalize-charset": "^5.1.0", + "postcss-normalize-display-values": "^5.1.0", + "postcss-normalize-positions": "^5.1.1", + "postcss-normalize-repeat-style": "^5.1.1", + "postcss-normalize-string": "^5.1.0", + "postcss-normalize-timing-functions": "^5.1.0", + "postcss-normalize-unicode": "^5.1.1", + "postcss-normalize-url": "^5.1.0", + "postcss-normalize-whitespace": "^5.1.1", + "postcss-ordered-values": "^5.1.3", + "postcss-reduce-initial": "^5.1.2", + "postcss-reduce-transforms": "^5.1.0", + "postcss-svgo": "^5.1.0", + "postcss-unique-selectors": "^5.1.1" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/cssnano-utils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/cssnano-utils/-/cssnano-utils-3.1.0.tgz", + "integrity": "sha512-JQNR19/YZhz4psLX/rQ9M83e3z2Wf/HdJbryzte4a3NSuafyp9w/I4U+hx5C2S9g41qlstH7DEWnZaaj83OuEA==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/csso": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/csso/-/csso-4.2.0.tgz", + "integrity": "sha512-wvlcdIbf6pwKEk7vHj8/Bkc0B4ylXZruLvOgs9doS5eOsOpuodOV2zJChSpkp+pRpYQLQMeF04nr3Z68Sta9jA==", + "dependencies": { + "css-tree": "^1.1.2" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==" + }, + "node_modules/d3-path": { + "version": "1.0.9", + "resolved": 
"https://registry.npmjs.org/d3-path/-/d3-path-1.0.9.tgz", + "integrity": "sha512-VLaYcn81dtHVTjEHd8B+pbe9yHWpXKZUC87PzoFmsFrJqgFwDe/qxfp5MlfsfM1V5E/iVt0MmEbWQ7FVIXh/bg==" + }, + "node_modules/d3-shape": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", + "integrity": "sha512-EUkvKjqPFUAZyOlhY5gzCxCeI0Aep04LwIRpsZ/mLFelJiUfnK56jo5JMDSE7yyP2kLSb6LtF+S5chMk7uqPqw==", + "dependencies": { + "d3-path": "1" + } + }, + "node_modules/data-view-buffer": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.2.tgz", + "integrity": "sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/data-view-byte-length": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.2.tgz", + "integrity": "sha512-tuhGbE6CfTM9+5ANGf+oQb72Ky/0+s3xKUpHvShfiz2RxMFgFPjsXuRLBVMtvMs15awe45SRb83D6wH4ew6wlQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/inspect-js" + } + }, + "node_modules/data-view-byte-offset": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.1.tgz", + "integrity": "sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==", + "dev": true, + "license": 
"MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/date-fns": { + "version": "2.30.0", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.30.0.tgz", + "integrity": "sha512-fnULvOpxnC5/Vg3NCiWelDsLiUc9bRwAPs/+LfTLNvetFCtCTN+yQz15C/fs4AwX1R9K5GLtLfn8QW+dWisaAw==", + "dependencies": { + "@babel/runtime": "^7.21.0" + }, + "engines": { + "node": ">=0.11" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/date-fns" + } + }, + "node_modules/debug": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", + "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/dedent": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz", + "integrity": "sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==", + "dev": true, + "peerDependencies": { + "babel-plugin-macros": "^3.1.0" + }, + "peerDependenciesMeta": { + "babel-plugin-macros": { + "optional": true + } + } + }, + "node_modules/deep-is": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", + "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", + "dev": true + }, + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": 
"https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/default-browser": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.2.1.tgz", + "integrity": "sha512-WY/3TUME0x3KPYdRRxEJJvXRHV4PyPoUsxtZa78lwItwRQRHhd2U9xOscaT/YTf8uCXIAjeJOFBVEh/7FtD8Xg==", + "dev": true, + "dependencies": { + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.0.tgz", + "integrity": "sha512-A6p/pu/6fyBcA1TRz/GqWYPViplrftcW2gZC9q79ngNCKAeR/X3gcEdXQHl4KNXV+3wgIJ1CPkJQ3IHM6lcsyA==", + "dev": true, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-data-property": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", + "dev": true, + "dependencies": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/define-lazy-prop": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz", + "integrity": 
"sha512-Ds09qNh8yw3khSjiJjiUInaGX9xlqZDY7JVryGxdxV7NPeuqQfplOpQ66yJFZut3jLa5zOwkXw1g9EI2uKh4Og==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/define-properties": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", + "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", + "dev": true, + "dependencies": { + "define-data-property": "^1.0.1", + "has-property-descriptors": "^1.0.0", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/depd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", + "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/destroy": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", + "integrity": "sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==", + "dev": true, + "license": "MIT", + 
"engines": { + "node": ">= 0.8", + "npm": "1.2.8000 || >= 1.4.16" + } + }, + "node_modules/detect-libc": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", + "integrity": "sha512-pGjwhsmsp4kL2RTz08wcOlGN83otlqHeD/Z5T8GXZB+/YcpQ/dgo+lbU8ZsGxV0HIvqqxo9l7mqYwyYMD9bKDg==", + "dev": true, + "optional": true, + "bin": { + "detect-libc": "bin/detect-libc.js" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/detect-newline": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz", + "integrity": "sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/detect-node": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz", + "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==", + "dev": true + }, + "node_modules/detect-port-alt": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/detect-port-alt/-/detect-port-alt-1.1.6.tgz", + "integrity": "sha512-5tQykt+LqfJFBEYaDITx7S7cR7mJ/zQmLXZ2qt5w04ainYZw6tBf9dBunMjVeVOdYVRUzUOE4HkY5J7+uttb5Q==", + "dev": true, + "dependencies": { + "address": "^1.0.1", + "debug": "^2.6.0" + }, + "bin": { + "detect": "bin/detect-port", + "detect-port": "bin/detect-port" + }, + "engines": { + "node": ">= 4.2.1" + } + }, + "node_modules/detect-port-alt/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + 
"node_modules/detect-port-alt/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/diff": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.4.tgz", + "integrity": "sha512-X07nttJQkwkfKfvTPG/KSnE2OMdcUCao6+eXF3wmnIQRn2aPAHH3VxDbDOdegkd6JbPsXqShpvEOHfAT+nCNwQ==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.3.1" + } + }, + "node_modules/diff-sequences": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", + "integrity": "sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==", + "dev": true, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "dev": true, + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/discontinuous-range": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/discontinuous-range/-/discontinuous-range-1.0.0.tgz", + "integrity": "sha512-c68LpLbO+7kP/b1Hr1qs8/BJ09F5khZGTxqxZuhzxpmwJKOgRFHJWIb9/KmqnqHhLdO55aOxFH/EGBvUQbL/RQ==", + "dev": true + }, + "node_modules/dns-packet": { + "version": "5.6.1", + "resolved": "https://registry.npmjs.org/dns-packet/-/dns-packet-5.6.1.tgz", + "integrity": "sha512-l4gcSouhcgIKRvyy99RNVOgxXiicE+2jZoNmaNmZ6JXiGajBOJAesk1OBlJuM5k2c+eudGdLxDqXuPCKIj6kpw==", + "dev": true, + "dependencies": 
{ + "@leichtgewicht/ip-codec": "^2.0.1" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/doctrine": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", + "integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "esutils": "^2.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/dom-accessibility-api": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", + "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==", + "dev": true, + "peer": true + }, + "node_modules/dom-align": { + "version": "1.12.4", + "resolved": "https://registry.npmjs.org/dom-align/-/dom-align-1.12.4.tgz", + "integrity": "sha512-R8LUSEay/68zE5c8/3BDxiTEvgb4xZTF0RKmAHfiEVN3klfIpXfi2/QCoiWPccVQ0J/ZGdz9OjzL4uJEP/MRAw==" + }, + "node_modules/dom-converter": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/dom-converter/-/dom-converter-0.2.0.tgz", + "integrity": "sha512-gd3ypIPfOMr9h5jIKq8E3sHOTCjeirnl0WK5ZdS1AW0Odt0b1PaWaHdJ4Qk4klv+YB9aJBS7mESXjFoDQPu6DA==", + "dev": true, + "dependencies": { + "utila": "~0.4" + } + }, + "node_modules/dom-helpers": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/dom-helpers/-/dom-helpers-5.2.1.tgz", + "integrity": "sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA==", + "dependencies": { + "@babel/runtime": "^7.8.7", + "csstype": "^3.0.2" + } + }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": 
"sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dev": true, + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dev": true, + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", + "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", + "dev": true, + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/dot-case": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/dot-case/-/dot-case-3.0.4.tgz", + "integrity": "sha512-Kv5nKlh6yRrdrGvxeJ2e5y2eRUpkUosIW4A2AS38zwSz27zu7ufDwQPi5Jhs3XAlGNetl3bmnGhQsMtkKJnj3w==", + "dev": 
true, + "dependencies": { + "no-case": "^3.0.4", + "tslib": "^2.0.3" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/duplexer": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", + "integrity": "sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==", + "dev": true + }, + "node_modules/ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==", + "dev": true + }, + "node_modules/ejs": { + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.1.10.tgz", + "integrity": "sha512-UeJmFfOrAQS8OJWPZ4qtgHyWExa088/MtK5UEyoJGFH67cDEXkZSviOiKRCZ4Xij0zxI3JECgYs3oKx+AizQBA==", + "dev": true, + "dependencies": { + "jake": "^10.8.5" + }, + "bin": { + "ejs": "bin/cli.js" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.49", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.49.tgz", + "integrity": "sha512-ZXfs1Of8fDb6z7WEYZjXpgIRF6MEu8JdeGA0A40aZq6OQbS+eJpnnV49epZRna2DU/YsEjSQuGtQPPtvt6J65A==" + }, + "node_modules/emittery": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/emittery/-/emittery-0.13.1.tgz", + "integrity": 
"sha512-DeWwawk6r5yR9jFgnDKYt4sLS0LmHJJi3ZOnb5/JdbYwj3nW+FxQnHIjhBKz8YLC7oRNPVM9NQ47I3CVx34eqQ==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sindresorhus/emittery?sponsor=1" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true + }, + "node_modules/emojis-list": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/emojis-list/-/emojis-list-3.0.0.tgz", + "integrity": "sha512-/kyM18EfinwXZbno9FyUGeFh87KC8HRQBQGildHZbEuRyWFOmv1U10o9BBp8XVZDVNNuQKyIGIu5ZYAAXJ0V2Q==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "dev": true, + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, + "node_modules/enhanced-resolve": { + "version": "5.17.1", + "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.17.1.tgz", + "integrity": 
"sha512-LMHl3dXhTcfv8gM4kEzIUeTQ+7fpdA0l2tUf34BddXPkz2A5xJ5L/Pchd5BL6rdccM9QGvu0sWZzK1Z1t4wwyg==", + "dependencies": { + "graceful-fs": "^4.2.4", + "tapable": "^2.2.0" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "dev": true, + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/envinfo": { + "version": "7.14.0", + "resolved": "https://registry.npmjs.org/envinfo/-/envinfo-7.14.0.tgz", + "integrity": "sha512-CO40UI41xDQzhLB1hWyqUKgFhs250pNcGbyGKe1l/e4FSaI/+YE4IMG76GDt0In67WLPACIITC+sOi08x4wIvg==", + "dev": true, + "bin": { + "envinfo": "dist/cli.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/environment": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/environment/-/environment-1.1.0.tgz", + "integrity": "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/enzyme": { + "version": "3.11.0", + "resolved": "https://registry.npmjs.org/enzyme/-/enzyme-3.11.0.tgz", + "integrity": "sha512-Dw8/Gs4vRjxY6/6i9wU0V+utmQO9kvh9XLnz3LIudviOnVYDEe2ec+0k+NQoMamn1VrjKgCUOWj5jG/5M5M0Qw==", + "dev": true, + "dependencies": { + "array.prototype.flat": "^1.2.3", + "cheerio": "^1.0.0-rc.3", + "enzyme-shallow-equal": "^1.0.1", + "function.prototype.name": "^1.1.2", + "has": "^1.0.3", + "html-element-map": "^1.2.0", + "is-boolean-object": "^1.0.1", + "is-callable": 
"^1.1.5", + "is-number-object": "^1.0.4", + "is-regex": "^1.0.5", + "is-string": "^1.0.5", + "is-subset": "^0.1.1", + "lodash.escape": "^4.0.1", + "lodash.isequal": "^4.5.0", + "object-inspect": "^1.7.0", + "object-is": "^1.0.2", + "object.assign": "^4.1.0", + "object.entries": "^1.1.1", + "object.values": "^1.1.1", + "raf": "^3.4.1", + "rst-selector-parser": "^2.2.3", + "string.prototype.trim": "^1.2.1" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/enzyme-shallow-equal": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/enzyme-shallow-equal/-/enzyme-shallow-equal-1.0.7.tgz", + "integrity": "sha512-/um0GFqUXnpM9SvKtje+9Tjoz3f1fpBC3eXRFrNs8kpYn69JljciYP7KZTqM/YQbUY9KUjvKB4jo/q+L6WGGvg==", + "dev": true, + "dependencies": { + "hasown": "^2.0.0", + "object-is": "^1.1.5" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/error-ex": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", + "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", + "dev": true, + "dependencies": { + "is-arrayish": "^0.2.1" + } + }, + "node_modules/error-stack-parser": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/error-stack-parser/-/error-stack-parser-2.1.4.tgz", + "integrity": "sha512-Sk5V6wVazPhq5MhpO+AUxJn5x7XSXGl1R93Vn7i+zS15KDVxQijejNCrz8340/2bgLBjR9GtEG8ZVKONDjcqGQ==", + "dev": true, + "dependencies": { + "stackframe": "^1.3.4" + } + }, + "node_modules/es-abstract": { + "version": "1.24.1", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.24.1.tgz", + "integrity": "sha512-zHXBLhP+QehSSbsS9Pt23Gg964240DPd6QCf8WpkqEXxQ7fhdZzYsocOr5u7apWonsS5EjZDmTF+/slGMyasvw==", + "dev": true, + 
"license": "MIT", + "dependencies": { + "array-buffer-byte-length": "^1.0.2", + "arraybuffer.prototype.slice": "^1.0.4", + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "data-view-buffer": "^1.0.2", + "data-view-byte-length": "^1.0.2", + "data-view-byte-offset": "^1.0.1", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "es-set-tostringtag": "^2.1.0", + "es-to-primitive": "^1.3.0", + "function.prototype.name": "^1.1.8", + "get-intrinsic": "^1.3.0", + "get-proto": "^1.0.1", + "get-symbol-description": "^1.1.0", + "globalthis": "^1.0.4", + "gopd": "^1.2.0", + "has-property-descriptors": "^1.0.2", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "internal-slot": "^1.1.0", + "is-array-buffer": "^3.0.5", + "is-callable": "^1.2.7", + "is-data-view": "^1.0.2", + "is-negative-zero": "^2.0.3", + "is-regex": "^1.2.1", + "is-set": "^2.0.3", + "is-shared-array-buffer": "^1.0.4", + "is-string": "^1.1.1", + "is-typed-array": "^1.1.15", + "is-weakref": "^1.1.1", + "math-intrinsics": "^1.1.0", + "object-inspect": "^1.13.4", + "object-keys": "^1.1.1", + "object.assign": "^4.1.7", + "own-keys": "^1.0.1", + "regexp.prototype.flags": "^1.5.4", + "safe-array-concat": "^1.1.3", + "safe-push-apply": "^1.0.0", + "safe-regex-test": "^1.1.0", + "set-proto": "^1.0.0", + "stop-iteration-iterator": "^1.1.0", + "string.prototype.trim": "^1.2.10", + "string.prototype.trimend": "^1.0.9", + "string.prototype.trimstart": "^1.0.8", + "typed-array-buffer": "^1.0.3", + "typed-array-byte-length": "^1.0.3", + "typed-array-byte-offset": "^1.0.4", + "typed-array-length": "^1.0.7", + "unbox-primitive": "^1.1.0", + "which-typed-array": "^1.1.19" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/es-array-method-boxes-properly": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/es-array-method-boxes-properly/-/es-array-method-boxes-properly-1.0.0.tgz", + "integrity": "sha512-wd6JXUmyHmt8T5a2xreUwKcGPq6f1f+WwIJkijUqiGcJz1qqnZgP6XIK+QyIWU5lT7imeNxUll48bziG+TSYcA==", + "dev": true + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-iterator-helpers": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.2.2.tgz", + "integrity": "sha512-BrUQ0cPTB/IwXj23HtwHjS9n7O4h9FX94b4xc5zlTHxeLgTAdzYUDyy6KdExAl9lbN5rtfe44xpjpmj9grxs5w==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.24.1", + "es-errors": "^1.3.0", + "es-set-tostringtag": "^2.1.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.3.0", + "globalthis": "^1.0.4", + "gopd": "^1.2.0", + "has-property-descriptors": "^1.0.2", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", + "internal-slot": "^1.1.0", + "iterator.prototype": "^1.1.5", + "safe-array-concat": "^1.1.3" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-module-lexer": { + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.5.4.tgz", + "integrity": 
"sha512-MVNK56NiMrOwitFB7cqDwq0CQutbw+0BvLshJSse0MUNU+y1FC3bUS/AQg7oUng+/wKrrki7JfmwtVHkVfPLlw==" + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-shim-unscopables": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/es-shim-unscopables/-/es-shim-unscopables-1.1.0.tgz", + "integrity": "sha512-d9T8ucsEhh8Bi1woXCf+TIKDIROLG5WCkxg8geBCbvk22kzwC5G2OnXVMO6FUsvQlgUUXQ2itephWDLqDzbeCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-to-primitive": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.3.0.tgz", + "integrity": "sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-callable": "^1.2.7", + "is-date-object": "^1.0.5", + "is-symbol": "^1.0.4" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + 
"node_modules/es6-promise": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.3.1.tgz", + "integrity": "sha512-SOp9Phqvqn7jtEUxPWdWfWoLmyt2VaJ6MpvP9Comy1MceMXqE6bxvaTu4iaxpYYPzhny28Lc+M87/c2cPK6lDg==", + "dev": true + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "engines": { + "node": ">=6" + } + }, + "node_modules/escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==", + "dev": true + }, + "node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/eslint": { + "version": "9.39.2", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.2.tgz", + "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@eslint-community/eslint-utils": "^4.8.0", + "@eslint-community/regexpp": "^4.12.1", + "@eslint/config-array": "^0.21.1", + "@eslint/config-helpers": "^0.4.2", + "@eslint/core": "^0.17.0", + "@eslint/eslintrc": "^3.3.1", + "@eslint/js": "9.39.2", + "@eslint/plugin-kit": "^0.4.1", + "@humanfs/node": "^0.16.6", + 
"@humanwhocodes/module-importer": "^1.0.1", + "@humanwhocodes/retry": "^0.4.2", + "@types/estree": "^1.0.6", + "ajv": "^6.12.4", + "chalk": "^4.0.0", + "cross-spawn": "^7.0.6", + "debug": "^4.3.2", + "escape-string-regexp": "^4.0.0", + "eslint-scope": "^8.4.0", + "eslint-visitor-keys": "^4.2.1", + "espree": "^10.4.0", + "esquery": "^1.5.0", + "esutils": "^2.0.2", + "fast-deep-equal": "^3.1.3", + "file-entry-cache": "^8.0.0", + "find-up": "^5.0.0", + "glob-parent": "^6.0.2", + "ignore": "^5.2.0", + "imurmurhash": "^0.1.4", + "is-glob": "^4.0.0", + "json-stable-stringify-without-jsonify": "^1.0.1", + "lodash.merge": "^4.6.2", + "minimatch": "^3.1.2", + "natural-compare": "^1.4.0", + "optionator": "^0.9.3" + }, + "bin": { + "eslint": "bin/eslint.js" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://eslint.org/donate" + }, + "peerDependencies": { + "jiti": "*" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + } + } + }, + "node_modules/eslint-config-prettier": { + "version": "10.1.5", + "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.5.tgz", + "integrity": "sha512-zc1UmCpNltmVY34vuLRV61r1K27sWuX39E+uyUnY8xS2Bex88VV9cugG+UZbRSRGtGyFboj+D8JODyme1plMpw==", + "dev": true, + "license": "MIT", + "bin": { + "eslint-config-prettier": "bin/cli.js" + }, + "funding": { + "url": "https://opencollective.com/eslint-config-prettier" + }, + "peerDependencies": { + "eslint": ">=7.0.0" + } + }, + "node_modules/eslint-plugin-i18n": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-i18n/-/eslint-plugin-i18n-2.4.0.tgz", + "integrity": "sha512-6RpPoj+lr0xk6SNljziOjGfDtuQSN6cw/gdds248N5MvCQUrPxo5+0s7b7TQsEl1qLr5OVnCMxsaRBy/4T62cg==", + "dev": true, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/eslint-plugin-prettier": { + 
"version": "5.4.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-5.4.1.tgz", + "integrity": "sha512-9dF+KuU/Ilkq27A8idRP7N2DH8iUR6qXcjF3FR2wETY21PZdBrIjwCau8oboyGj9b7etWmTGEeM8e7oOed6ZWg==", + "dev": true, + "license": "MIT", + "dependencies": { + "prettier-linter-helpers": "^1.0.0", + "synckit": "^0.11.7" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint-plugin-prettier" + }, + "peerDependencies": { + "@types/eslint": ">=8.0.0", + "eslint": ">=8.0.0", + "eslint-config-prettier": ">= 7.0.0 <10.0.0 || >=10.1.0", + "prettier": ">=3.0.0" + }, + "peerDependenciesMeta": { + "@types/eslint": { + "optional": true + }, + "eslint-config-prettier": { + "optional": true + } + } + }, + "node_modules/eslint-plugin-react": { + "version": "7.37.5", + "resolved": "https://registry.npmjs.org/eslint-plugin-react/-/eslint-plugin-react-7.37.5.tgz", + "integrity": "sha512-Qteup0SqU15kdocexFNAJMvCJEfa2xUKNV4CC1xsVMrIIqEy3SQ/rqyxCWNzfrd3/ldy6HMlD2e0JDVpDg2qIA==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-includes": "^3.1.8", + "array.prototype.findlast": "^1.2.5", + "array.prototype.flatmap": "^1.3.3", + "array.prototype.tosorted": "^1.1.4", + "doctrine": "^2.1.0", + "es-iterator-helpers": "^1.2.1", + "estraverse": "^5.3.0", + "hasown": "^2.0.2", + "jsx-ast-utils": "^2.4.1 || ^3.0.0", + "minimatch": "^3.1.2", + "object.entries": "^1.1.9", + "object.fromentries": "^2.0.8", + "object.values": "^1.2.1", + "prop-types": "^15.8.1", + "resolve": "^2.0.0-next.5", + "semver": "^6.3.1", + "string.prototype.matchall": "^4.0.12", + "string.prototype.repeat": "^1.0.0" + }, + "engines": { + "node": ">=4" + }, + "peerDependencies": { + "eslint": "^3 || ^4 || ^5 || ^6 || ^7 || ^8 || ^9.7" + } + }, + "node_modules/eslint-plugin-react/node_modules/estraverse": { + "version": "5.3.0", + 
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/eslint-plugin-react/node_modules/resolve": { + "version": "2.0.0-next.5", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-2.0.0-next.5.tgz", + "integrity": "sha512-U7WjGVG9sH8tvjW5SmGbQuui75FiyjAX72HX15DwBBwF9dNiQZRQAg9nnPhYy+TUnE0+VcrttuvNI8oSxZcocA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/eslint-plugin-simple-import-sort": { + "version": "12.1.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-simple-import-sort/-/eslint-plugin-simple-import-sort-12.1.1.tgz", + "integrity": "sha512-6nuzu4xwQtE3332Uz0to+TxDQYRLTKRESSc2hefVT48Zc8JthmN23Gx9lnYhu0FtkRSL1oxny3kJ2aveVhmOVA==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "eslint": ">=5.0.0" + } + }, + "node_modules/eslint-scope": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-5.1.1.tgz", + "integrity": "sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw==", + "dependencies": { + "esrecurse": "^4.3.0", + "estraverse": "^4.1.1" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/eslint-visitor-keys": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", + "integrity": 
"sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", + "dev": true, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/eslint/node_modules/eslint-scope": { + "version": "8.4.0", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz", + "integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==", + "dev": true, + "license": "BSD-2-Clause", + "dependencies": { + "esrecurse": "^4.3.0", + "estraverse": "^5.2.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint/node_modules/eslint-visitor-keys": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", + "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint/node_modules/estraverse": { + 
"version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/eslint/node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/eslint/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true + }, + "node_modules/espree": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", + "integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==", + "dev": true, + "license": "BSD-2-Clause", + "dependencies": { + "acorn": "^8.15.0", + "acorn-jsx": "^5.3.2", + "eslint-visitor-keys": "^4.2.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/espree/node_modules/eslint-visitor-keys": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", + "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", + "dev": true, + "license": 
"Apache-2.0", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/esprima": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", + "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", + "dev": true, + "license": "BSD-2-Clause", + "bin": { + "esparse": "bin/esparse.js", + "esvalidate": "bin/esvalidate.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/esquery": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz", + "integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==", + "dev": true, + "dependencies": { + "estraverse": "^5.1.0" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/esquery/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", + "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", + "dependencies": { + "estraverse": "^5.2.0" + }, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "engines": { + 
"node": ">=4.0" + } + }, + "node_modules/estraverse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz", + "integrity": "sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw==", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esutils": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", + "dev": true + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "engines": { + "node": ">=0.8.x" + } + }, + "node_modules/execa": { + "version": "5.1.1", + "resolved": 
"https://registry.npmjs.org/execa/-/execa-5.1.1.tgz", + "integrity": "sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==", + "dev": true, + "dependencies": { + "cross-spawn": "^7.0.3", + "get-stream": "^6.0.0", + "human-signals": "^2.1.0", + "is-stream": "^2.0.0", + "merge-stream": "^2.0.0", + "npm-run-path": "^4.0.1", + "onetime": "^5.1.2", + "signal-exit": "^3.0.3", + "strip-final-newline": "^2.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/execa?sponsor=1" + } + }, + "node_modules/exit": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/exit/-/exit-0.1.2.tgz", + "integrity": "sha512-Zk/eNKV2zbjpKzrsQ+n1G6poVbErQxJ0LBOJXaKZ1EViLzH+hrLu9cdXI4zw9dBQJslwBEpbQ2P1oS7nDxs6jQ==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/expect": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/expect/-/expect-29.7.0.tgz", + "integrity": "sha512-2Zks0hf1VLFYI1kbh0I5jP3KHHyCHpkfyHBzsSXRFgl/Bg9mWYfMW8oD+PdMPlEwy5HNsR9JutYy6pMeOh61nw==", + "dev": true, + "dependencies": { + "@jest/expect-utils": "^29.7.0", + "jest-get-type": "^29.6.3", + "jest-matcher-utils": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-util": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/express": { + "version": "4.22.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", + "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", + "dev": true, + "license": "MIT", + "dependencies": { + "accepts": "~1.3.8", + "array-flatten": "1.1.1", + "body-parser": "~1.20.3", + "content-disposition": "~0.5.4", + "content-type": "~1.0.4", + "cookie": "~0.7.1", + "cookie-signature": "~1.0.6", + "debug": 
"2.6.9", + "depd": "2.0.0", + "encodeurl": "~2.0.0", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "~1.3.1", + "fresh": "~0.5.2", + "http-errors": "~2.0.0", + "merge-descriptors": "1.0.3", + "methods": "~1.1.2", + "on-finished": "~2.4.1", + "parseurl": "~1.3.3", + "path-to-regexp": "~0.1.12", + "proxy-addr": "~2.0.7", + "qs": "~6.14.0", + "range-parser": "~1.2.1", + "safe-buffer": "5.2.1", + "send": "~0.19.0", + "serve-static": "~1.16.2", + "setprototypeof": "1.2.0", + "statuses": "~2.0.1", + "type-is": "~1.6.18", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + }, + "engines": { + "node": ">= 0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/express/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/express/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true, + "license": "MIT" + }, + "node_modules/fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==" + }, + "node_modules/fast-diff": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/fast-diff/-/fast-diff-1.3.0.tgz", + "integrity": "sha512-VxPP4NqbUjj6MaAOafWeUn2cXWLcCtljklUtZf0Ind4XQ+QPtmA0b18zZy0jIQx+ExRVCR/ZQpBmik5lXshNsw==", + "dev": true + 
}, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fast-json-stable-stringify": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==" + }, + "node_modules/fast-levenshtein": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", + "dev": true + }, + "node_modules/fast-safe-stringify": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz", + "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==", + "dev": true + }, + "node_modules/fast-uri": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.0.3.tgz", + "integrity": "sha512-aLrHthzCjH5He4Z2H9YZ+v6Ujb9ocRuW6ZzkJQOrTxleEijANq4v1TsaPaVG1PZcuurEzrLcWRyYBYXD5cEiaw==" + }, + "node_modules/fastest-levenshtein": { + "version": "1.0.16", + "resolved": "https://registry.npmjs.org/fastest-levenshtein/-/fastest-levenshtein-1.0.16.tgz", + "integrity": 
"sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==", + "dev": true, + "engines": { + "node": ">= 4.9.1" + } + }, + "node_modules/fastparse": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz", + "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==" + }, + "node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "dev": true, + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/favicons": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/favicons/-/favicons-7.2.0.tgz", + "integrity": "sha512-k/2rVBRIRzOeom3wI9jBPaSEvoTSQEW4iM0EveBmBBKFxO8mSyyRWtDlfC3VnEfu0avmjrMzy8/ZFPSe6F71Hw==", + "dev": true, + "dependencies": { + "escape-html": "^1.0.3", + "sharp": "^0.33.1", + "xml2js": "^0.6.1" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/favicons-webpack-plugin": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/favicons-webpack-plugin/-/favicons-webpack-plugin-6.0.1.tgz", + "integrity": "sha512-Gl0Co4zIZq74EKXdpfe8FaoJqbuf0undV4UgpsL34vqICRAYUDwQdp3D+z+uxEOV0i9o+vHDn7Q6jaSxRiDJUA==", + "dev": true, + "dependencies": { + "find-root": "^1.1.0", + "parse-author": "^2.0.0", + "parse5": "^7.1.1" + }, + "engines": { + "node": ">=16" + }, + "optionalDependencies": { + "html-webpack-plugin": "^5.5.0" + }, + "peerDependencies": { + "favicons": "^7.0.1", + "webpack": "^5.0.0" + } + }, + "node_modules/faye-websocket": { + "version": "0.11.4", + "resolved": "https://registry.npmjs.org/faye-websocket/-/faye-websocket-0.11.4.tgz", + "integrity": 
"sha512-CzbClwlXAuiRQAlUyfqPgvPoNKTckTPGfwZV4ZdAhVcP2lh9KUxJg2b5GkE7XbjKQ3YJnQ9z6D9ntLAlB+tP8g==", + "dev": true, + "dependencies": { + "websocket-driver": ">=0.5.1" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/fb-watchman": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", + "integrity": "sha512-p5161BqbuCaSnB8jIbzQHOlpgsPmK5rJVDfDKO91Axs5NC1uu3HRQm6wt9cd9/+GtQQIO53JdGXXoyDpTAsgYA==", + "dev": true, + "dependencies": { + "bser": "2.1.1" + } + }, + "node_modules/file-entry-cache": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz", + "integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "flat-cache": "^4.0.0" + }, + "engines": { + "node": ">=16.0.0" + } + }, + "node_modules/file-loader": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/file-loader/-/file-loader-6.2.0.tgz", + "integrity": "sha512-qo3glqyTa61Ytg4u73GultjHGjdRyig3tG6lPtyX/jOEJvHif9uB0/OCI2Kif6ctF3caQTW2G5gym21oAsI4pw==", + "dev": true, + "dependencies": { + "loader-utils": "^2.0.0", + "schema-utils": "^3.0.0" + }, + "engines": { + "node": ">= 10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^4.0.0 || ^5.0.0" + } + }, + "node_modules/file-loader/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + 
"uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/file-loader/node_modules/ajv-keywords": { + "version": "3.5.2", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", + "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==", + "dev": true, + "peerDependencies": { + "ajv": "^6.9.1" + } + }, + "node_modules/file-loader/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true + }, + "node_modules/file-loader/node_modules/loader-utils": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-2.0.4.tgz", + "integrity": "sha512-xXqpXoINfFhgua9xiqD8fPFHgkoq1mmmpE92WlDbm9rNRd/EbRb+Gqf908T2DMfuHjjJlksiK2RbHVOdD/MqSw==", + "dev": true, + "dependencies": { + "big.js": "^5.2.2", + "emojis-list": "^3.0.0", + "json5": "^2.1.2" + }, + "engines": { + "node": ">=8.9.0" + } + }, + "node_modules/file-loader/node_modules/schema-utils": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz", + "integrity": "sha512-pN/yOAvcC+5rQ5nERGuwrjLlYvLTbCibnZ1I7B1LaiAz9BRBlE9GMgE/eqV30P7aJQUf7Ddimy/RsbYO/GrVGg==", + "dev": true, + "dependencies": { + "@types/json-schema": "^7.0.8", + "ajv": "^6.12.5", + "ajv-keywords": "^3.5.2" + }, + "engines": { + "node": ">= 10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + } + }, + "node_modules/filelist": { + "version": "1.0.4", + "resolved": 
"https://registry.npmjs.org/filelist/-/filelist-1.0.4.tgz", + "integrity": "sha512-w1cEuf3S+DrLCQL7ET6kz+gmlJdbq9J7yXCSjK/OZCPA+qEN1WyF4ZAf0YYJa4/shHJra2t/d/r8SV4Ji+x+8Q==", + "dev": true, + "dependencies": { + "minimatch": "^5.0.1" + } + }, + "node_modules/filelist/node_modules/brace-expansion": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/filelist/node_modules/minimatch": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", + "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "dev": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/filesize": { + "version": "8.0.7", + "resolved": "https://registry.npmjs.org/filesize/-/filesize-8.0.7.tgz", + "integrity": "sha512-pjmC+bkIF8XI7fWaH8KxHcZL3DPybs1roSKP4rKDvy20tAWwIObE4+JIseG2byfGKhud5ZnM4YSGKBz7Sh0ndQ==", + "dev": true, + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "dev": true, + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/finalhandler": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.2.tgz", + "integrity": 
"sha512-aA4RyPcd3badbdABGDuTXCMTtOneUCAYH/gxoYRTZlIJdF0YPWuGqiAsIrhNnnqdXGswYk6dGujem4w80UJFhg==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "2.6.9", + "encodeurl": "~2.0.0", + "escape-html": "~1.0.3", + "on-finished": "~2.4.1", + "parseurl": "~1.3.3", + "statuses": "~2.0.2", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/finalhandler/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/finalhandler/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true, + "license": "MIT" + }, + "node_modules/find-cache-dir": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-2.1.0.tgz", + "integrity": "sha512-Tq6PixE0w/VMFfCgbONnkiQIVol/JJL7nRMi20fqzA4NRs9AfeqMGeRdPi3wIhYkxjeBaWh2rxwapn5Tu3IqOQ==", + "dev": true, + "dependencies": { + "commondir": "^1.0.1", + "make-dir": "^2.0.0", + "pkg-dir": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/find-root": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", + "integrity": "sha512-NKfW6bec6GfKc0SGx1e07QZY9PE99u0Bft/0rzSD5k3sO/vwkVUpDUKVm5Gpp5Ue3YfShPFTX2070tDs5kB9Ng==", + "dev": true + }, + "node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": 
"sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "dev": true, + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/flat": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/flat/-/flat-5.0.2.tgz", + "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==", + "dev": true, + "bin": { + "flat": "cli.js" + } + }, + "node_modules/flat-cache": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz", + "integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==", + "dev": true, + "license": "MIT", + "dependencies": { + "flatted": "^3.2.9", + "keyv": "^4.5.4" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/flatted": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz", + "integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==", + "dev": true, + "license": "ISC" + }, + "node_modules/follow-redirects": { + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", + "dev": true, + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/for-each": { + "version": "0.3.5", + "resolved": 
"https://registry.npmjs.org/for-each/-/for-each-0.3.5.tgz", + "integrity": "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-callable": "^1.2.7" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/fork-ts-checker-webpack-plugin": { + "version": "6.5.3", + "resolved": "https://registry.npmjs.org/fork-ts-checker-webpack-plugin/-/fork-ts-checker-webpack-plugin-6.5.3.tgz", + "integrity": "sha512-SbH/l9ikmMWycd5puHJKTkZJKddF4iRLyW3DeZ08HTI7NGyLS38MXd/KGgeWumQO7YNQbW2u/NtPT2YowbPaGQ==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.8.3", + "@types/json-schema": "^7.0.5", + "chalk": "^4.1.0", + "chokidar": "^3.4.2", + "cosmiconfig": "^6.0.0", + "deepmerge": "^4.2.2", + "fs-extra": "^9.0.0", + "glob": "^7.1.6", + "memfs": "^3.1.2", + "minimatch": "^3.0.4", + "schema-utils": "2.7.0", + "semver": "^7.3.2", + "tapable": "^1.0.0" + }, + "engines": { + "node": ">=10", + "yarn": ">=1.0.0" + }, + "peerDependencies": { + "eslint": ">= 6", + "typescript": ">= 2.7", + "vue-template-compiler": "*", + "webpack": ">= 4" + }, + "peerDependenciesMeta": { + "eslint": { + "optional": true + }, + "vue-template-compiler": { + "optional": true + } + } + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": 
"https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/ajv-keywords": { + "version": "3.5.2", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", + "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==", + "dev": true, + "peerDependencies": { + "ajv": "^6.9.1" + } + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/cosmiconfig": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-6.0.0.tgz", + "integrity": "sha512-xb3ZL6+L8b9JLLCx3ZdoZy4+2ECphCMo2PwqgP1tlfVq6M6YReyzBJtvWWtbDSpNr9hn96pkCiZqUcFEc+54Qg==", + "dev": true, + "dependencies": { + "@types/parse-json": "^4.0.0", + "import-fresh": "^3.1.0", + "parse-json": "^5.0.0", + "path-type": "^4.0.0", + "yaml": "^1.7.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/schema-utils": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-2.7.0.tgz", + "integrity": "sha512-0ilKFI6QQF5nxDZLFn2dMjvc4hjg/Wkg7rHd3jK6/A4a1Hl9VFdQWvgB1UMGoU94pad1P/8N7fMcEnLnSiju8A==", + "dev": true, + "dependencies": { + "@types/json-schema": "^7.0.4", + "ajv": "^6.12.2", + "ajv-keywords": "^3.4.1" + }, + "engines": { + "node": ">= 8.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + } + }, + 
"node_modules/fork-ts-checker-webpack-plugin/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/fork-ts-checker-webpack-plugin/node_modules/tapable": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-1.1.3.tgz", + "integrity": "sha512-4WK/bYZmj8xLr+HUCODHGF1ZFzsYffasLUgEiMBY4fgtltdO6B4WJtlSbPaDTLpYTcGVwM2qLnFTICEcNxs3kA==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + 
"node_modules/forwarded": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/fraction.js": { + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", + "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", + "dev": true, + "engines": { + "node": "*" + }, + "funding": { + "type": "patreon", + "url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/fs-extra": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.1.0.tgz", + "integrity": "sha512-hcg3ZmepS30/7BSFqRvoo3DOMQu7IjqxO5nCDt+zM9XWjb33Wg7ziNT+Qvqbuc3+gWpzO02JubVyk2G4Zvo1OQ==", + "dev": true, + "dependencies": { + "at-least-node": "^1.0.0", + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/fs-monkey": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/fs-monkey/-/fs-monkey-1.0.6.tgz", + "integrity": "sha512-b1FMfwetIKymC0eioW7mTywihSQE4oLzQn1dB6rZB5fx/3NpNEdAWeCSMB+60/AeT0TCXsxzAlcYVEFCTAksWg==", + "dev": true + }, + "node_modules/fs-readdir-recursive": { + "version": "1.1.0", + "resolved": 
"https://registry.npmjs.org/fs-readdir-recursive/-/fs-readdir-recursive-1.1.0.tgz", + "integrity": "sha512-GNanXlVr2pf02+sPN40XN8HG+ePaNcvM0q5mZBd668Obwb0yD5GiUbZOFgwn8kGMY6I3mdyDJzieUy3PTYyTRA==", + "dev": true + }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "dev": true + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/function.prototype.name": { + "version": "1.1.8", + "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.8.tgz", + "integrity": "sha512-e5iwyodOHhbMr/yNrc7fDYG4qlbIvI5gajyzPnb5TCwyhjApznQh1BMFou9b30SevY43gCJKXycoCBjMbsuW0Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "functions-have-names": "^1.2.3", + "hasown": "^2.0.2", + "is-callable": "^1.2.7" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/functions-have-names": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/functions-have-names/-/functions-have-names-1.2.3.tgz", + "integrity": "sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/generator-function": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/generator-function/-/generator-function-2.0.1.tgz", + "integrity": "sha512-SFdFmIJi+ybC0vjlHN0ZGVGHc3lgE0DxPAT0djjVg+kjOnSqclqmj0KQ7ykTOLP6YxoqOvuAODGdcHJn+43q3g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "dev": true, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-east-asian-width": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz", + "integrity": "sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-package-type": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/get-package-type/-/get-package-type-0.1.0.tgz", + "integrity": "sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==", + "dev": true, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/get-stream": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz", + "integrity": "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/get-symbol-description": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.1.0.tgz", + "integrity": "sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dev": true, + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/glob-to-regexp": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/glob-to-regexp/-/glob-to-regexp-0.4.1.tgz", + "integrity": "sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw==" + }, + 
"node_modules/global-modules": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/global-modules/-/global-modules-2.0.0.tgz", + "integrity": "sha512-NGbfmJBp9x8IxyJSd1P+otYK8vonoJactOogrVfFRIAEY1ukil8RSKDz2Yo7wh1oihl51l/r6W4epkeKJHqL8A==", + "dev": true, + "dependencies": { + "global-prefix": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/global-prefix": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/global-prefix/-/global-prefix-3.0.0.tgz", + "integrity": "sha512-awConJSVCHVGND6x3tmMaKcQvwXLhjdkmomy2W+Goaui8YPgYgXJZewhg3fWC+DlfqqQuWg8AwqjGTD2nAPVWg==", + "dev": true, + "dependencies": { + "ini": "^1.3.5", + "kind-of": "^6.0.2", + "which": "^1.3.1" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/global-prefix/node_modules/which": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", + "integrity": "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "which": "bin/which" + } + }, + "node_modules/globals": { + "version": "11.12.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz", + "integrity": "sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/globalthis": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz", + "integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==", + "dev": true, + "dependencies": { + "define-properties": "^1.2.1", + "gopd": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/globby": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", + "dev": true, + "dependencies": { + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.2.9", + "ignore": "^5.2.0", + "merge2": "^1.4.1", + "slash": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/globby/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/graceful-fs": { + "version": "4.2.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", + "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==" + }, + "node_modules/graphemer": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", + "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", + "dev": true + }, + "node_modules/gzip-size": { + 
"version": "6.0.0", + "resolved": "https://registry.npmjs.org/gzip-size/-/gzip-size-6.0.0.tgz", + "integrity": "sha512-ax7ZYomf6jqPTQ4+XCpUGyXKHk5WweS+e05MBO4/y3WJ5RkmPXNKvX+bx1behVILVwr6JSQvZAku021CHPXG3Q==", + "dev": true, + "dependencies": { + "duplexer": "^0.1.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/handle-thing": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/handle-thing/-/handle-thing-2.0.1.tgz", + "integrity": "sha512-9Qn4yBxelxoh2Ow62nP+Ka/kMnOXRi8BXnRaUwezLNhqelnN49xKz4F/dPP8OYLxLxq6JDtZb2i9XznUQbNPTg==", + "dev": true + }, + "node_modules/harmony-reflect": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/harmony-reflect/-/harmony-reflect-1.6.2.tgz", + "integrity": "sha512-HIp/n38R9kQjDEziXyDTuW3vvoxxyxjxFzXLrBr18uB47GnSt+G9D29fqrpM5ZkspMcPICud3XsBJQ4Y2URg8g==", + "dev": true + }, + "node_modules/has": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/has/-/has-1.0.4.tgz", + "integrity": "sha512-qdSAmqLF6209RFj4VVItywPMbm3vWylknmB3nvNiUIs72xAimcM8nVYxYr7ncvZq5qzk9MKIZR8ijqD/1QuYjQ==", + "dev": true, + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/has-bigints": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz", + "integrity": "sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": 
"sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "engines": { + "node": ">=8" + } + }, + "node_modules/has-property-descriptors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", + "dev": true, + "dependencies": { + "es-define-property": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-proto": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.2.0.tgz", + "integrity": "sha512-KIL7eQPfHQRC8+XluaIw7BHUwwqL19bQn4hzNgdr+1wXoU0KKj6rufu47lhY7KbJR2C6T6+PfyN0Ea7wkSS+qQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "dev": true, + "bin": { + "he": "bin/he" + } + }, + "node_modules/hoist-non-react-statics": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz", + "integrity": "sha512-/gGivxi8JPKWNm/W0jSmzcMPpfpPLc3dY/6GxhX2hQ9iGj3aDfklV4ET7NjKpSinLpJ5vafa9iiGIEZg10SfBw==", + "dependencies": { + "react-is": "^16.7.0" + } + }, + "node_modules/hoist-non-react-statics/node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==" + }, + "node_modules/hpack.js": { + "version": "2.1.6", + "resolved": "https://registry.npmjs.org/hpack.js/-/hpack.js-2.1.6.tgz", + "integrity": "sha512-zJxVehUdMGIKsRaNt7apO2Gqp0BdqW5yaiGHXXmbpvxgBYVZnAql+BJb4RO5ad2MgpbZKn5G6nMnegrH1FcNYQ==", + "dev": true, + "dependencies": { + "inherits": "^2.0.1", + "obuf": "^1.0.0", + "readable-stream": "^2.0.1", + "wbuf": "^1.1.0" + } + }, + "node_modules/hpack.js/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": 
"sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "dev": true + }, + "node_modules/hpack.js/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "dev": true, + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/hpack.js/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true + }, + "node_modules/hpack.js/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dev": true, + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/html-element-map": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/html-element-map/-/html-element-map-1.3.1.tgz", + "integrity": "sha512-6XMlxrAFX4UEEGxctfFnmrFaaZFNf9i5fNuV5wZ3WWQ4FVaNP1aX1LkX9j2mfEx1NpjeE/rL3nmgEn23GdFmrg==", + "dev": true, + "dependencies": { + "array.prototype.filter": "^1.0.0", + "call-bind": "^1.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/html-entities": { + "version": "2.5.2", + "resolved": 
"https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz", + "integrity": "sha512-K//PSRMQk4FZ78Kyau+mZurHn3FH0Vwr+H36eE0rPbeYkRRi9YxceYPhuN60UwWorxyKHhqoAJl2OFKa4BVtaA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/mdevils" + }, + { + "type": "patreon", + "url": "https://patreon.com/mdevils" + } + ] + }, + "node_modules/html-escaper": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", + "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", + "dev": true + }, + "node_modules/html-minifier-terser": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz", + "integrity": "sha512-YXxSlJBZTP7RS3tWnQw74ooKa6L9b9i9QYXY21eUEvhZ3u9XLfv6OnFsQq6RxkhHygsaUMvYsZRV5rU/OVNZxw==", + "dev": true, + "dependencies": { + "camel-case": "^4.1.2", + "clean-css": "^5.2.2", + "commander": "^8.3.0", + "he": "^1.2.0", + "param-case": "^3.0.4", + "relateurl": "^0.2.7", + "terser": "^5.10.0" + }, + "bin": { + "html-minifier-terser": "cli.js" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/html-minifier-terser/node_modules/commander": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", + "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", + "dev": true, + "engines": { + "node": ">= 12" + } + }, + "node_modules/html-parse-stringify": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/html-parse-stringify/-/html-parse-stringify-3.0.1.tgz", + "integrity": 
"sha512-KknJ50kTInJ7qIScF3jeaFRpMpE8/lfiTdzf/twXyPBLAGrLRTmkz3AdTnKeh40X8k9L2fdYwEp/42WGXIRGcg==", + "dependencies": { + "void-elements": "3.1.0" + } + }, + "node_modules/html-webpack-plugin": { + "version": "5.6.3", + "resolved": "https://registry.npmjs.org/html-webpack-plugin/-/html-webpack-plugin-5.6.3.tgz", + "integrity": "sha512-QSf1yjtSAsmf7rYBV7XX86uua4W/vkhIt0xNXKbsi2foEeW7vjJQz4bhnpL3xH+l1ryl1680uNv968Z+X6jSYg==", + "dev": true, + "dependencies": { + "@types/html-minifier-terser": "^6.0.0", + "html-minifier-terser": "^6.0.2", + "lodash": "^4.17.21", + "pretty-error": "^4.0.0", + "tapable": "^2.0.0" + }, + "engines": { + "node": ">=10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/html-webpack-plugin" + }, + "peerDependencies": { + "@rspack/core": "0.x || 1.x", + "webpack": "^5.20.0" + }, + "peerDependenciesMeta": { + "@rspack/core": { + "optional": true + }, + "webpack": { + "optional": true + } + } + }, + "node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "dev": true, + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, + "node_modules/http-deceiver": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/http-deceiver/-/http-deceiver-1.2.7.tgz", + "integrity": "sha512-LmpOGxTfbpgtGVxJrj5k7asXHCgNZp5nLfp+hWc8QQRqtb7fUy6kRY3BO1h9ddF6yIPYUARgxGOwB42DnxIaNw==", + "dev": true + }, + "node_modules/http-errors": { + "version": "2.0.1", + 
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" + }, + "engines": { + "node": ">= 0.8" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/http-parser-js": { + "version": "0.5.8", + "resolved": "https://registry.npmjs.org/http-parser-js/-/http-parser-js-0.5.8.tgz", + "integrity": "sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==", + "dev": true + }, + "node_modules/http-proxy": { + "version": "1.18.1", + "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", + "integrity": "sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ==", + "dev": true, + "dependencies": { + "eventemitter3": "^4.0.0", + "follow-redirects": "^1.0.0", + "requires-port": "^1.0.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/http-proxy-middleware": { + "version": "2.0.9", + "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz", + "integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/http-proxy": "^1.17.8", + "http-proxy": "^1.18.1", + "is-glob": "^4.0.1", + "is-plain-obj": "^3.0.0", + "micromatch": "^4.0.2" + }, + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "@types/express": "^4.17.13" + }, + "peerDependenciesMeta": { + "@types/express": { + "optional": true + } + } + 
}, + "node_modules/http2-client": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/http2-client/-/http2-client-1.3.5.tgz", + "integrity": "sha512-EC2utToWl4RKfs5zd36Mxq7nzHHBuomZboI0yYL6Y0RmBgT7Sgkq4rQ0ezFTYoIsSs7Tm9SJe+o2FcAg6GBhGA==", + "dev": true + }, + "node_modules/human-signals": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", + "integrity": "sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==", + "dev": true, + "engines": { + "node": ">=10.17.0" + } + }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/hyperdyperid": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/hyperdyperid/-/hyperdyperid-1.2.0.tgz", + "integrity": "sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A==", + "dev": true, + "engines": { + "node": ">=10.18" + } + }, + "node_modules/i18next": { + "version": "24.0.2", + "resolved": "https://registry.npmjs.org/i18next/-/i18next-24.0.2.tgz", + "integrity": "sha512-D88xyIGcWAKwBTAs4RSqASi8NXR/NhCVSTM4LDbdoU8qb/5dcEZjNCLDhtQBB7Epw/Cp1w2vH/3ujoTbqLSs5g==", + "funding": [ + { + "type": "individual", + "url": "https://locize.com" + }, + { + "type": "individual", + "url": "https://locize.com/i18next.html" + }, + { + "type": "individual", + "url": "https://www.i18next.com/how-to/faq#i18next-is-awesome.-how-can-i-support-the-project" + } + ], + "dependencies": { + "@babel/runtime": "^7.23.2" + }, + 
"peerDependencies": { + "typescript": "^5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dev": true, + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/icss-utils": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/icss-utils/-/icss-utils-5.1.0.tgz", + "integrity": "sha512-soFhflCVWLfRNOPU3iv5Z9VUdT44xFRbzjLsEzSr5AQmgqPMTHdU3PMT1Cf1ssx8fLNJDA1juftYl+PUcv3MqA==", + "dev": true, + "engines": { + "node": "^10 || ^12 || >= 14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/identity-obj-proxy": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/identity-obj-proxy/-/identity-obj-proxy-3.0.0.tgz", + "integrity": "sha512-00n6YnVHKrinT9t0d9+5yZC6UBNJANpYEQvL2LlX6Ab9lnmxzIRcEmTPuyGScvl1+jKuCICX1Z0Ab1pPKKdikA==", + "dev": true, + "dependencies": { + "harmony-reflect": "^1.4.6" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/immer": { + "version": "9.0.21", + "resolved": "https://registry.npmjs.org/immer/-/immer-9.0.21.tgz", + "integrity": "sha512-bc4NBHqOqSfRW7POMkHd51LvClaeMXpm8dx0e8oE2GORbq5aRK7Bxl4FyzVLdGtLmvLKL7BTDBG5ACQm4HWjTA==", + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/immer" + } + }, + "node_modules/immutable": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz", + "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==", + "dev": true + }, + "node_modules/import-fresh": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", + "integrity": "sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==", + "dev": true, + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/import-local": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.2.0.tgz", + "integrity": "sha512-2SPlun1JUPWoM6t3F0dw0FkCF/jWY8kttcY4f599GLTSjh2OCuuhdTkJQsEcZzBqbXZGKMK2OqW1oZsjtf/gQA==", + "dev": true, + "dependencies": { + "pkg-dir": "^4.2.0", + "resolve-cwd": "^3.0.0" + }, + "bin": { + "import-local-fixture": "fixtures/cli.js" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/import-local/node_modules/find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "dev": true, + "dependencies": { + "locate-path": "^5.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/import-local/node_modules/locate-path": { + "version": "5.0.0", + "resolved": 
"https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "dev": true, + "dependencies": { + "p-locate": "^4.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/import-local/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/import-local/node_modules/p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "dev": true, + "dependencies": { + "p-limit": "^2.2.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/import-local/node_modules/pkg-dir": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz", + "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==", + "dev": true, + "dependencies": { + "find-up": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/imurmurhash": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", + "dev": true, + "engines": { + "node": ">=0.8.19" + } + }, + "node_modules/indent-string": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", + "dev": true, + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true + }, + "node_modules/ini": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==", + "dev": true + }, + "node_modules/internal-slot": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.1.0.tgz", + "integrity": "sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "hasown": "^2.0.2", + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/interpret": { + "version": "3.1.1", + "resolved": 
"https://registry.npmjs.org/interpret/-/interpret-3.1.1.tgz", + "integrity": "sha512-6xwYfHbajpoF0xLW+iwLkhwgvLoZDfjYfoFNu8ftMoXINzwuymNLd9u/KmwtdT2GbR+/Cz66otEGEVVUHX9QLQ==", + "dev": true, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/intl-messageformat": { + "version": "10.7.3", + "resolved": "https://registry.npmjs.org/intl-messageformat/-/intl-messageformat-10.7.3.tgz", + "integrity": "sha512-AAo/3oyh7ROfPhDuh7DxTTydh97OC+lv7h1Eq5LuHWuLsUMKOhtzTYuyXlUReuwZ9vANDHo4CS1bGRrn7TZRtg==", + "dependencies": { + "@formatjs/ecma402-abstract": "2.2.1", + "@formatjs/fast-memoize": "2.2.2", + "@formatjs/icu-messageformat-parser": "2.9.1", + "tslib": "2" + } + }, + "node_modules/ipaddr.js": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.2.0.tgz", + "integrity": "sha512-Ag3wB2o37wslZS19hZqorUnrnzSkpOVy+IiiDEiTqNubEYpYuHWIf6K4psgN2ZWKExS4xhVCrRVfb/wfW8fWJA==", + "dev": true, + "engines": { + "node": ">= 10" + } + }, + "node_modules/is-array-buffer": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.5.tgz", + "integrity": "sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", + "dev": true + }, + "node_modules/is-async-function": { + "version": "2.1.1", + "resolved": 
"https://registry.npmjs.org/is-async-function/-/is-async-function-2.1.1.tgz", + "integrity": "sha512-9dgM/cZBnNvjzaMYHVoxxfPj2QXt22Ev7SuuPrs+xav0ukGB0S6d4ydZdEiM48kLx5kDV+QBPrpVnFyefL8kkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "async-function": "^1.0.0", + "call-bound": "^1.0.3", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-bigint": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.1.0.tgz", + "integrity": "sha512-n4ZT37wG78iz03xPRKJrHTdZbe3IicyucEtdRsV5yglwc3GyUfbAfpSeD0FJ41NbUNSt5wbhqfp1fS+BgnvDFQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "has-bigints": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-boolean-object": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", + "integrity": "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + 
"node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + }, + "node_modules/is-callable": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", + "integrity": "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-core-module": { + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", + "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", + "dev": true, + "license": "MIT", + "dependencies": { + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-data-view": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.2.tgz", + "integrity": "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "get-intrinsic": "^1.2.6", + "is-typed-array": "^1.1.13" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-date-object": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.1.0.tgz", + "integrity": 
"sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-docker": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz", + "integrity": "sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==", + "dev": true, + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-finalizationregistry": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", + "integrity": "sha512-1pC6N8qWJbWoPtEjgcL2xyhQOP491EQjeUo3qTKcmV8YSDDJrOepfG8pcC7h/QgnQHYSv0mJ3Z/ZWxmatVrysg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": 
true, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-generator-fn": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-generator-fn/-/is-generator-fn-2.1.0.tgz", + "integrity": "sha512-cTIB4yPYL/Grw0EaSzASzg6bBy9gqCofvWN8okThAYIxKJZC+udlRAmGbM0XLeniEJSs8uEgHPGuHSe1XsOLSQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/is-generator-function": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.1.2.tgz", + "integrity": "sha512-upqt1SkGkODW9tsGNG5mtXTXtECizwtS2kA161M+gJPc1xdb/Ax629af6YrTwcOeQHbewrPNlE5Dx7kzvXTizA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.4", + "generator-function": "^2.0.0", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "dev": true, + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + 
"node_modules/is-inside-container/node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "dev": true, + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-map": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", + "integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-negative-zero": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", + "integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-network-error": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-network-error/-/is-network-error-1.1.0.tgz", + "integrity": "sha512-tUdRRAnhT+OtCZR/LxZelH/C7QtjtFrTu5tXCA8pl55eTUElUHT+GPYV8MBMBvea/j+NxQqVt3LbWMRir7Gx9g==", + "dev": true, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + 
"integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/is-number-object": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.1.1.tgz", + "integrity": "sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-plain-obj": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-3.0.0.tgz", + "integrity": "sha512-gwsOE28k+23GP1B6vFl1oVh/WOzmawBrKwo5Ev6wMKzPkaXaCDIQKzLnvsA42DRlbVTWorkgTKIviAKCWkfUwA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dev": true, + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-regex": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.2.1.tgz", + "integrity": "sha512-MjYsKHO5O7mCsmRGxWcLWheFqN9DJ/2TmngvjKXihe6efViPqc274+Fx/4fYj/r03+ESvBdTXK0V6tA3rgez1g==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "gopd": "^1.2.0", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" 
+ }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-retina": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/is-retina/-/is-retina-1.0.3.tgz", + "integrity": "sha512-/tCmbIETZwCd8uHWO+GvbRa7jxwHFHdfetHfiwoP0aN9UDf3prUJMtKn7iBFYipYhqY1bSTjur8hC/Dakt8eyw==" + }, + "node_modules/is-root": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-root/-/is-root-2.1.0.tgz", + "integrity": "sha512-AGOriNp96vNBd3HtU+RzFEc75FfR5ymiYv8E553I71SCeXBiMsVDUtdio1OEFvrPyLIQ9tVR5RxXIFe5PUFjMg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/is-set": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz", + "integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-shared-array-buffer": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.4.tgz", + "integrity": "sha512-ISWac8drv4ZGfwKl5slpHG9OwPNty4jOWPRIhBpxOoD+hqITiwuipOQ2bNthAzwA3B4fIjO4Nln74N0S9byq8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-stream": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": 
"https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-string": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", + "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-subset": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-subset/-/is-subset-0.1.1.tgz", + "integrity": "sha512-6Ybun0IkarhmEqxXCNw/C0bna6Zb/TkfUX9UbwJtK6ObwAVCxmAP308WWTHviM/zAqXk05cdhYsUsZeGQh99iw==", + "dev": true + }, + "node_modules/is-symbol": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.1.1.tgz", + "integrity": "sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "has-symbols": "^1.1.0", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-typed-array": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.15.tgz", + "integrity": "sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "which-typed-array": "^1.1.16" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-weakmap": { + "version": 
"2.0.2", + "resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz", + "integrity": "sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-weakref": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.1.1.tgz", + "integrity": "sha512-6i9mGWSlqzNMEqpCp93KwRS1uUOodk2OJ6b+sq7ZPDSy2WuI5NFIxp/254TytR8ftefexkWn5xNiHUNpPOfSew==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-weakset": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.4.tgz", + "integrity": "sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-wsl": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz", + "integrity": "sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==", + "dev": true, + "dependencies": { + "is-docker": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/isarray": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.5.tgz", + "integrity": 
"sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==", + "dev": true, + "license": "MIT" + }, + "node_modules/isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", + "dev": true + }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/istanbul-lib-coverage": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", + "integrity": "sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-instrument": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-6.0.3.tgz", + "integrity": "sha512-Vtgk7L/R2JHyyGW07spoFlB8/lpjiOLTjMdms6AFMraYt3BaJauod/NGrfnVG/y4Ix1JEuMRPDPEj2ua+zz1/Q==", + "dev": true, + "dependencies": { + "@babel/core": "^7.23.9", + "@babel/parser": "^7.23.9", + "@istanbuljs/schema": "^0.1.3", + "istanbul-lib-coverage": "^3.2.0", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-lib-instrument/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + 
"engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-lib-report": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/istanbul-lib-report/-/istanbul-lib-report-3.0.1.tgz", + "integrity": "sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw==", + "dev": true, + "dependencies": { + "istanbul-lib-coverage": "^3.0.0", + "make-dir": "^4.0.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-lib-report/node_modules/make-dir": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-4.0.0.tgz", + "integrity": "sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==", + "dev": true, + "dependencies": { + "semver": "^7.5.3" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/istanbul-lib-report/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-lib-source-maps": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/istanbul-lib-source-maps/-/istanbul-lib-source-maps-4.0.1.tgz", + "integrity": "sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw==", + "dev": true, + "dependencies": { + "debug": "^4.1.1", + "istanbul-lib-coverage": "^3.0.0", + "source-map": "^0.6.1" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-lib-source-maps/node_modules/source-map": { + "version": "0.6.1", + "resolved": 
"https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/istanbul-reports": { + "version": "3.1.7", + "resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-3.1.7.tgz", + "integrity": "sha512-BewmUXImeuRk2YY0PVbxgKAysvhRPUQE0h5QRM++nVWyubKGV0l8qQ5op8+B2DOmwSe63Jivj0BjkPQVf8fP5g==", + "dev": true, + "dependencies": { + "html-escaper": "^2.0.0", + "istanbul-lib-report": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/iterator.prototype": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/iterator.prototype/-/iterator.prototype-1.1.5.tgz", + "integrity": "sha512-H0dkQoCa3b2VEeKQBOxFph+JAbcrQdE7KC0UkqwpLmv2EC4P41QXP+rqo9wYodACiG5/WM5s9oDApTU8utwj9g==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-data-property": "^1.1.4", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.6", + "get-proto": "^1.0.0", + "has-symbols": "^1.1.0", + "set-function-name": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/jake": { + "version": "10.9.2", + "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.2.tgz", + "integrity": "sha512-2P4SQ0HrLQ+fw6llpLnOaGAvN2Zu6778SJMrCUwns4fOoG9ayrTiZk3VV8sCPkVZF8ab0zksVpS8FDY5pRCNBA==", + "dev": true, + "dependencies": { + "async": "^3.2.3", + "chalk": "^4.0.2", + "filelist": "^1.0.4", + "minimatch": "^3.1.2" + }, + "bin": { + "jake": "bin/cli.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/jest": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest/-/jest-29.7.0.tgz", + "integrity": "sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==", + 
"dev": true, + "dependencies": { + "@jest/core": "^29.7.0", + "@jest/types": "^29.6.3", + "import-local": "^3.0.2", + "jest-cli": "^29.7.0" + }, + "bin": { + "jest": "bin/jest.js" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "node-notifier": "^8.0.1 || ^9.0.0 || ^10.0.0" + }, + "peerDependenciesMeta": { + "node-notifier": { + "optional": true + } + } + }, + "node_modules/jest-changed-files": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-changed-files/-/jest-changed-files-29.7.0.tgz", + "integrity": "sha512-fEArFiwf1BpQ+4bXSprcDc3/x4HSzL4al2tozwVpDFpsxALjLYdyiIK4e5Vz66GQJIbXJ82+35PtysofptNX2w==", + "dev": true, + "dependencies": { + "execa": "^5.0.0", + "jest-util": "^29.7.0", + "p-limit": "^3.1.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-circus": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-circus/-/jest-circus-29.7.0.tgz", + "integrity": "sha512-3E1nCMgipcTkCocFwM90XXQab9bS+GMsjdpmPrlelaxwD93Ad8iVEjX/vvHPdLPnFf+L40u+5+iutRdA1N9myw==", + "dev": true, + "dependencies": { + "@jest/environment": "^29.7.0", + "@jest/expect": "^29.7.0", + "@jest/test-result": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "chalk": "^4.0.0", + "co": "^4.6.0", + "dedent": "^1.0.0", + "is-generator-fn": "^2.0.0", + "jest-each": "^29.7.0", + "jest-matcher-utils": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-runtime": "^29.7.0", + "jest-snapshot": "^29.7.0", + "jest-util": "^29.7.0", + "p-limit": "^3.1.0", + "pretty-format": "^29.7.0", + "pure-rand": "^6.0.0", + "slash": "^3.0.0", + "stack-utils": "^2.0.3" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-circus/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": 
"https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-circus/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-circus/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-cli": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-cli/-/jest-cli-29.7.0.tgz", + "integrity": "sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==", + "dev": true, + "dependencies": { + "@jest/core": "^29.7.0", + "@jest/test-result": "^29.7.0", + "@jest/types": "^29.6.3", + "chalk": "^4.0.0", + "create-jest": "^29.7.0", + "exit": "^0.1.2", + "import-local": "^3.0.2", + "jest-config": "^29.7.0", + "jest-util": "^29.7.0", + "jest-validate": "^29.7.0", + "yargs": "^17.3.1" + }, + "bin": { + "jest": "bin/jest.js" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "node-notifier": "^8.0.1 || ^9.0.0 || ^10.0.0" + }, + 
"peerDependenciesMeta": { + "node-notifier": { + "optional": true + } + } + }, + "node_modules/jest-config": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-29.7.0.tgz", + "integrity": "sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==", + "dev": true, + "dependencies": { + "@babel/core": "^7.11.6", + "@jest/test-sequencer": "^29.7.0", + "@jest/types": "^29.6.3", + "babel-jest": "^29.7.0", + "chalk": "^4.0.0", + "ci-info": "^3.2.0", + "deepmerge": "^4.2.2", + "glob": "^7.1.3", + "graceful-fs": "^4.2.9", + "jest-circus": "^29.7.0", + "jest-environment-node": "^29.7.0", + "jest-get-type": "^29.6.3", + "jest-regex-util": "^29.6.3", + "jest-resolve": "^29.7.0", + "jest-runner": "^29.7.0", + "jest-util": "^29.7.0", + "jest-validate": "^29.7.0", + "micromatch": "^4.0.4", + "parse-json": "^5.2.0", + "pretty-format": "^29.7.0", + "slash": "^3.0.0", + "strip-json-comments": "^3.1.1" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "peerDependencies": { + "@types/node": "*", + "ts-node": ">=9.0.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "ts-node": { + "optional": true + } + } + }, + "node_modules/jest-config/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-config/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": 
"sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-config/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-diff": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-29.7.0.tgz", + "integrity": "sha512-LMIgiIrhigmPrs03JHpxUh2yISK3vLFPkAodPeo0+BuF7wA2FoQbkEg1u8gBYBThncu7e1oEDUfIXVuTqLRUjw==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0", + "diff-sequences": "^29.6.3", + "jest-get-type": "^29.6.3", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-diff/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-diff/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + 
}, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-docblock": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-29.7.0.tgz", + "integrity": "sha512-q617Auw3A612guyaFgsbFeYpNP5t2aoUNLwBUbc/0kD1R4t9ixDbyFTHd1nok4epoVFpr7PmeWHrhvuV3XaJ4g==", + "dev": true, + "dependencies": { + "detect-newline": "^3.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-each": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-each/-/jest-each-29.7.0.tgz", + "integrity": "sha512-gns+Er14+ZrEoC5fhOfYCY1LOHHr0TI+rQUHZS8Ttw2l7gl+80eHc/gFf2Ktkw0+SIACDTeWvpFcv3B04VembQ==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "chalk": "^4.0.0", + "jest-get-type": "^29.6.3", + "jest-util": "^29.7.0", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-each/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-each/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + 
"node_modules/jest-environment-node": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-29.7.0.tgz", + "integrity": "sha512-DOSwCRqXirTOyheM+4d5YZOrWcdu0LNZ87ewUoywbcb2XR4wKgqiG8vNeYwhjFMbEkfju7wx2GYH0P2gevGvFw==", + "dev": true, + "dependencies": { + "@jest/environment": "^29.7.0", + "@jest/fake-timers": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "jest-mock": "^29.7.0", + "jest-util": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-get-type": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-29.6.3.tgz", + "integrity": "sha512-zrteXnqYxfQh7l5FHyL38jL39di8H8rHoecLH3JNxH3BwOrBsNeabdap5e0I23lD4HHI8W5VFBZqG4Eaq5LNcw==", + "dev": true, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-haste-map": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-haste-map/-/jest-haste-map-29.7.0.tgz", + "integrity": "sha512-fP8u2pyfqx0K1rGn1R9pyE0/KTn+G7PxktWidOBTqFPLYX0b9ksaMFkhK5vrS3DVun09pckLdlx90QthlW7AmA==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "@types/graceful-fs": "^4.1.3", + "@types/node": "*", + "anymatch": "^3.0.3", + "fb-watchman": "^2.0.0", + "graceful-fs": "^4.2.9", + "jest-regex-util": "^29.6.3", + "jest-util": "^29.7.0", + "jest-worker": "^29.7.0", + "micromatch": "^4.0.4", + "walker": "^1.0.8" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + }, + "optionalDependencies": { + "fsevents": "^2.3.2" + } + }, + "node_modules/jest-leak-detector": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-leak-detector/-/jest-leak-detector-29.7.0.tgz", + "integrity": "sha512-kYA8IJcSYtST2BY9I+SMC32nDpBT3J2NvWJx8+JCuCdl/CR1I4EKUJROiP8XtCcxqgTTBGJNdbB1A8XRKbTetw==", + 
"dev": true, + "dependencies": { + "jest-get-type": "^29.6.3", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-leak-detector/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-leak-detector/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-matcher-utils": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-29.7.0.tgz", + "integrity": "sha512-sBkD+Xi9DtcChsI3L3u0+N0opgPYnCRPtGcQYrgXmR+hmt/fYfWAL0xRXYU8eWOdfuLgBe0YCW3AFtnRLagq/g==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0", + "jest-diff": "^29.7.0", + "jest-get-type": "^29.6.3", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-matcher-utils/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": 
">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-matcher-utils/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-message-util": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-message-util/-/jest-message-util-29.7.0.tgz", + "integrity": "sha512-GBEV4GRADeP+qtB2+6u61stea8mGcOT4mCtrYISZwfu9/ISHFJ/5zOMXYbpBE9RsS5+Gb63DW4FgmnKJ79Kf6w==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.12.13", + "@jest/types": "^29.6.3", + "@types/stack-utils": "^2.0.0", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.9", + "micromatch": "^4.0.4", + "pretty-format": "^29.7.0", + "slash": "^3.0.0", + "stack-utils": "^2.0.3" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-message-util/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-message-util/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": 
"sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-message-util/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-mock": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-mock/-/jest-mock-29.7.0.tgz", + "integrity": "sha512-ITOMZn+UkYS4ZFh83xYAOzWStloNzJFO2s8DWrE4lhtGD+AorgnbkiKERe4wQVBydIGPx059g6riW5Btp6Llnw==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "@types/node": "*", + "jest-util": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-pnp-resolver": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/jest-pnp-resolver/-/jest-pnp-resolver-1.2.3.tgz", + "integrity": "sha512-+3NpwQEnRoIBtx4fyhblQDPgJI0H1IEIkX7ShLUjPGA7TtUTvI1oiKi3SR4oBR0hQhQR80l4WAe5RrXBwWMA8w==", + "dev": true, + "engines": { + "node": ">=6" + }, + "peerDependencies": { + "jest-resolve": "*" + }, + "peerDependenciesMeta": { + "jest-resolve": { + "optional": true + } + } + }, + "node_modules/jest-regex-util": { + "version": "29.6.3", + "resolved": "https://registry.npmjs.org/jest-regex-util/-/jest-regex-util-29.6.3.tgz", + "integrity": "sha512-KJJBsRCyyLNWCNBOvZyRDnAIfUiRJ8v+hOBQYGn8gDyF3UegwiP4gwRR3/SDa42g1YbVycTidUF3rKjyLFDWbg==", + "dev": true, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-resolve": { + "version": "29.7.0", + 
"resolved": "https://registry.npmjs.org/jest-resolve/-/jest-resolve-29.7.0.tgz", + "integrity": "sha512-IOVhZSrg+UvVAshDSDtHyFCCBUl/Q3AAJv8iZ6ZjnZ74xzvwuzLXid9IIIPgTnY62SJjfuupMKZsZQRsCvxEgA==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0", + "graceful-fs": "^4.2.9", + "jest-haste-map": "^29.7.0", + "jest-pnp-resolver": "^1.2.2", + "jest-util": "^29.7.0", + "jest-validate": "^29.7.0", + "resolve": "^1.20.0", + "resolve.exports": "^2.0.0", + "slash": "^3.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-resolve-dependencies": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-resolve-dependencies/-/jest-resolve-dependencies-29.7.0.tgz", + "integrity": "sha512-un0zD/6qxJ+S0et7WxeI3H5XSe9lTBBR7bOHCHXkKR6luG5mwDDlIzVQ0V5cZCuoTgEdcdwzTghYkTWfubi+nA==", + "dev": true, + "dependencies": { + "jest-regex-util": "^29.6.3", + "jest-snapshot": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-resolve/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-runner": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-runner/-/jest-runner-29.7.0.tgz", + "integrity": "sha512-fsc4N6cPCAahybGBfTRcq5wFR6fpLznMg47sY5aDpsoejOcVYFb07AHuSnR0liMcPTgBsA3ZJL6kFOjPdoNipQ==", + "dev": true, + "dependencies": { + "@jest/console": "^29.7.0", + "@jest/environment": "^29.7.0", + "@jest/test-result": "^29.7.0", + "@jest/transform": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "chalk": "^4.0.0", + "emittery": "^0.13.1", + "graceful-fs": "^4.2.9", + "jest-docblock": "^29.7.0", + 
"jest-environment-node": "^29.7.0", + "jest-haste-map": "^29.7.0", + "jest-leak-detector": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-resolve": "^29.7.0", + "jest-runtime": "^29.7.0", + "jest-util": "^29.7.0", + "jest-watcher": "^29.7.0", + "jest-worker": "^29.7.0", + "p-limit": "^3.1.0", + "source-map-support": "0.5.13" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-runner/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/jest-runner/node_modules/source-map-support": { + "version": "0.5.13", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.13.tgz", + "integrity": "sha512-SHSKFHadjVA5oR4PPqhtAVdcBWwRYVd6g6cAXnIbRiIwc2EhPrTuKUBdSLvlEKyIP3GCf89fltvcZiP9MMFA1w==", + "dev": true, + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, + "node_modules/jest-runtime": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-runtime/-/jest-runtime-29.7.0.tgz", + "integrity": "sha512-gUnLjgwdGqW7B4LvOIkbKs9WGbn+QLqRQQ9juC6HndeDiezIwhDP+mhMwHWCEcfQ5RUXa6OPnFF8BJh5xegwwQ==", + "dev": true, + "dependencies": { + "@jest/environment": "^29.7.0", + "@jest/fake-timers": "^29.7.0", + "@jest/globals": "^29.7.0", + "@jest/source-map": "^29.6.3", + "@jest/test-result": "^29.7.0", + "@jest/transform": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "chalk": "^4.0.0", + "cjs-module-lexer": "^1.0.0", + "collect-v8-coverage": "^1.0.0", + "glob": "^7.1.3", + "graceful-fs": "^4.2.9", + "jest-haste-map": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-mock": "^29.7.0", + "jest-regex-util": 
"^29.6.3", + "jest-resolve": "^29.7.0", + "jest-snapshot": "^29.7.0", + "jest-util": "^29.7.0", + "slash": "^3.0.0", + "strip-bom": "^4.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-runtime/node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-snapshot": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-snapshot/-/jest-snapshot-29.7.0.tgz", + "integrity": "sha512-Rm0BMWtxBcioHr1/OX5YCP8Uov4riHvKPknOGs804Zg9JGZgmIBkbtlxJC/7Z4msKYVbIJtfU+tKb8xlYNfdkw==", + "dev": true, + "dependencies": { + "@babel/core": "^7.11.6", + "@babel/generator": "^7.7.2", + "@babel/plugin-syntax-jsx": "^7.7.2", + "@babel/plugin-syntax-typescript": "^7.7.2", + "@babel/types": "^7.3.3", + "@jest/expect-utils": "^29.7.0", + "@jest/transform": "^29.7.0", + "@jest/types": "^29.6.3", + "babel-preset-current-node-syntax": "^1.0.0", + "chalk": "^4.0.0", + "expect": "^29.7.0", + "graceful-fs": "^4.2.9", + "jest-diff": "^29.7.0", + "jest-get-type": "^29.6.3", + "jest-matcher-utils": "^29.7.0", + "jest-message-util": "^29.7.0", + "jest-util": "^29.7.0", + "natural-compare": "^1.4.0", + "pretty-format": "^29.7.0", + "semver": "^7.5.3" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-snapshot/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-snapshot/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-snapshot/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/jest-styled-components": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/jest-styled-components/-/jest-styled-components-7.2.0.tgz", + "integrity": "sha512-gwyyveNjvuRA0pyhbQoydXZllLZESs2VuL5fXCabzh0buHPAOUfANtW7n5YMPmdC0sH3VB7h2eUGZ23+tjvaBA==", + "dev": true, + "dependencies": { + "@adobe/css-tools": "^4.0.1" + }, + "engines": { + "node": ">= 12" + }, + "peerDependencies": { + "styled-components": ">= 5" + } + }, + "node_modules/jest-util": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-util/-/jest-util-29.7.0.tgz", + "integrity": "sha512-z6EbKajIpqGKU56y5KBUgy1dt1ihhQJgWzUlZHArA/+X2ad7Cb5iF+AK1EWVL/Bo7Rz9uurpqw6SiBCefUbCGA==", + "dependencies": { + "@jest/types": "^29.6.3", + "@types/node": "*", + "chalk": "^4.0.0", + "ci-info": "^3.2.0", + "graceful-fs": "^4.2.9", + "picomatch": "^2.2.3" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-validate": { + 
"version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-29.7.0.tgz", + "integrity": "sha512-ZB7wHqaRGVw/9hST/OuFUReG7M8vKeq0/J2egIGLdvjHCmYqGARhzXmtgi+gVeZ5uXFF219aOc3Ls2yLg27tkw==", + "dev": true, + "dependencies": { + "@jest/types": "^29.6.3", + "camelcase": "^6.2.0", + "chalk": "^4.0.0", + "jest-get-type": "^29.6.3", + "leven": "^3.1.0", + "pretty-format": "^29.7.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-validate/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-validate/node_modules/pretty-format": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz", + "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==", + "dev": true, + "dependencies": { + "@jest/schemas": "^29.6.3", + "ansi-styles": "^5.0.0", + "react-is": "^18.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-watcher": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-watcher/-/jest-watcher-29.7.0.tgz", + "integrity": "sha512-49Fg7WXkU3Vl2h6LbLtMQ/HyB6rXSIX7SqvBLQmssRBGN9I0PNvPmAmCWSOY6SOvrjhI/F7/bGAv9RtnsPA03g==", + "dev": true, + "dependencies": { + "@jest/test-result": "^29.7.0", + "@jest/types": "^29.6.3", + "@types/node": "*", + "ansi-escapes": "^4.2.1", + "chalk": "^4.0.0", + "emittery": "^0.13.1", + "jest-util": "^29.7.0", + "string-length": "^4.0.1" 
+ }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-worker": { + "version": "29.7.0", + "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-29.7.0.tgz", + "integrity": "sha512-eIz2msL/EzL9UFTFFx7jBTkeZfku0yUAyZZZmJ93H2TYEiroIx2PQjEXcwYtYl8zXCxb+PAmA2hLIt/6ZEkPHw==", + "dependencies": { + "@types/node": "*", + "jest-util": "^29.7.0", + "merge-stream": "^2.0.0", + "supports-color": "^8.0.0" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || >=18.0.0" + } + }, + "node_modules/jest-worker/node_modules/supports-color": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", + "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, + "node_modules/jiti": { + "version": "1.21.6", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.6.tgz", + "integrity": "sha512-2yTgeWTWzMWkHu6Jp9NKgePDaYHbntiwvYuuJLbbN9vl7DC9DvXKOB2BC3ZZ92D3cvV/aflH0osDfwpHepQ53w==", + "dev": true, + "bin": { + "jiti": "bin/jiti.js" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==" + }, + "node_modules/js-yaml": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", + "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": 
"bin/js-yaml.js" + } + }, + "node_modules/jsesc": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.0.2.tgz", + "integrity": "sha512-xKqzzWXDttJuOcawBt4KnKHHIf5oQ/Cxax+0PWFG+DFDgHNAdi+TXECADI+RYiFUMmx8792xsMbbgXj4CwnP4g==", + "dev": true, + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/json-buffer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", + "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/json-parse-even-better-errors": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", + "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==" + }, + "node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==" + }, + "node_modules/json-stable-stringify-without-jsonify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", + "dev": true + }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, + "bin": { + "json5": 
"lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/jsonfile": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", + "integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==", + "dev": true, + "dependencies": { + "universalify": "^2.0.0" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/jsx-ast-utils": { + "version": "3.3.5", + "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", + "integrity": "sha512-ZZow9HBI5O6EPgSJLUb8n2NKgmVWTwCvHGwFuJlMjvLFqlGG6pjirPhtdsseaLZjSibD8eegzmYpUZwoIlj2cQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-includes": "^3.1.6", + "array.prototype.flat": "^1.3.1", + "object.assign": "^4.1.4", + "object.values": "^1.1.6" + }, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/keyv": { + "version": "4.5.4", + "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", + "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "json-buffer": "3.0.1" + } + }, + "node_modules/kind-of": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", + "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/kleur": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/kleur/-/kleur-3.0.3.tgz", + "integrity": "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/launch-editor": { + "version": "2.9.1", + 
"resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.9.1.tgz", + "integrity": "sha512-Gcnl4Bd+hRO9P9icCP/RVVT2o8SFlPXofuCxvA2SaZuH45whSvf5p8x5oih5ftLiVhEI4sp5xDY+R+b3zJBh5w==", + "dev": true, + "dependencies": { + "picocolors": "^1.0.0", + "shell-quote": "^1.8.1" + } + }, + "node_modules/leven": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", + "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/levn": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", + "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", + "dev": true, + "dependencies": { + "prelude-ls": "^1.2.1", + "type-check": "~0.4.0" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/lilconfig": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", + "integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==", + "engines": { + "node": ">=10" + } + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "dev": true + }, + "node_modules/lint-staged": { + "version": "16.1.2", + "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.1.2.tgz", + "integrity": "sha512-sQKw2Si2g9KUZNY3XNvRuDq4UJqpHwF0/FQzZR2M7I5MvtpWvibikCjUVJzZdGE0ByurEl3KQNvsGetd1ty1/Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "chalk": "^5.4.1", + 
"commander": "^14.0.0", + "debug": "^4.4.1", + "lilconfig": "^3.1.3", + "listr2": "^8.3.3", + "micromatch": "^4.0.8", + "nano-spawn": "^1.0.2", + "pidtree": "^0.6.0", + "string-argv": "^0.3.2", + "yaml": "^2.8.0" + }, + "bin": { + "lint-staged": "bin/lint-staged.js" + }, + "engines": { + "node": ">=20.17" + }, + "funding": { + "url": "https://opencollective.com/lint-staged" + } + }, + "node_modules/lint-staged/node_modules/chalk": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", + "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/lint-staged/node_modules/commander": { + "version": "14.0.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.0.tgz", + "integrity": "sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20" + } + }, + "node_modules/lint-staged/node_modules/lilconfig": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", + "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, + "node_modules/lint-staged/node_modules/yaml": { + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.8.0.tgz", + "integrity": 
"sha512-4lLa/EcQCB0cJkyts+FpIRx5G/llPxfP6VQU5KByHEhLxY3IJCH0f0Hy1MHI8sClTvsIb8qwRJ6R/ZdlDJ/leQ==", + "dev": true, + "license": "ISC", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14.6" + } + }, + "node_modules/listr2": { + "version": "8.3.3", + "resolved": "https://registry.npmjs.org/listr2/-/listr2-8.3.3.tgz", + "integrity": "sha512-LWzX2KsqcB1wqQ4AHgYb4RsDXauQiqhjLk+6hjbaeHG4zpjjVAB6wC/gz6X0l+Du1cN3pUB5ZlrvTbhGSNnUQQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "cli-truncate": "^4.0.0", + "colorette": "^2.0.20", + "eventemitter3": "^5.0.1", + "log-update": "^6.1.0", + "rfdc": "^1.4.1", + "wrap-ansi": "^9.0.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/listr2/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/listr2/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/listr2/node_modules/emoji-regex": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", + "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "dev": true, + 
"license": "MIT" + }, + "node_modules/listr2/node_modules/eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", + "dev": true, + "license": "MIT" + }, + "node_modules/listr2/node_modules/string-width": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/listr2/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/listr2/node_modules/wrap-ansi": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", + "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.2.1", + "string-width": "^7.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": 
"https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/loader-runner": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.0.tgz", + "integrity": "sha512-3R/1M+yS3j5ou80Me59j7F9IMs4PXs3VqRrm0TU3AbKPxlmpoY1TNscJV/oGJXo8qCatFGTfDbY6W6ipGOYXfg==", + "engines": { + "node": ">=6.11.5" + } + }, + "node_modules/loader-utils": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-3.3.1.tgz", + "integrity": "sha512-FMJTLMXfCLMLfJxcX9PFqX5qD88Z5MRGaZCVzfuqeZSPsyiBzs+pahDQjbIWz2QIzPZz0NX9Zy4FX3lmK6YHIg==", + "dev": true, + "engines": { + "node": ">= 12.13.0" + } + }, + "node_modules/locate-path": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "dev": true, + "dependencies": { + "p-locate": "^5.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/lodash": { + "version": "4.17.23", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.23.tgz", + "integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==", + "license": "MIT" + }, + "node_modules/lodash-es": { + "version": "4.17.23", + "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz", + "integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==", + "license": "MIT" + }, + "node_modules/lodash.camelcase": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", + 
"integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", + "dev": true + }, + "node_modules/lodash.debounce": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", + "integrity": "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==", + "dev": true + }, + "node_modules/lodash.escape": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.escape/-/lodash.escape-4.0.1.tgz", + "integrity": "sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==", + "dev": true + }, + "node_modules/lodash.flattendeep": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/lodash.flattendeep/-/lodash.flattendeep-4.4.0.tgz", + "integrity": "sha512-uHaJFihxmJcEX3kT4I23ABqKKalJ/zDrDg0lsFtc1h+3uw49SIJ5beyhx5ExVRti3AvKoOJngIj7xz3oylPdWQ==", + "dev": true + }, + "node_modules/lodash.isequal": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.isequal/-/lodash.isequal-4.5.0.tgz", + "integrity": "sha512-pDo3lu8Jhfjqls6GkMgpahsF9kCyayhgykjyLMNFTKWrpVdAQtYyB4muAMWozBB4ig/dtWAmsMxLEI8wuz+DYQ==", + "dev": true + }, + "node_modules/lodash.memoize": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/lodash.memoize/-/lodash.memoize-4.1.2.tgz", + "integrity": "sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==" + }, + "node_modules/lodash.merge": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", + "dev": true + }, + "node_modules/lodash.uniq": { + "version": "4.5.0", + 
"resolved": "https://registry.npmjs.org/lodash.uniq/-/lodash.uniq-4.5.0.tgz", + "integrity": "sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==" + }, + "node_modules/log-update": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/log-update/-/log-update-6.1.0.tgz", + "integrity": "sha512-9ie8ItPR6tjY5uYJh8K/Zrv/RMZ5VOlOWvtZdEHYSTFKZfIBPQa9tOAEeAWhd+AnIneLJ22w5fjOYtoutpWq5w==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-escapes": "^7.0.0", + "cli-cursor": "^5.0.0", + "slice-ansi": "^7.1.0", + "strip-ansi": "^7.1.0", + "wrap-ansi": "^9.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/log-update/node_modules/ansi-escapes": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-7.0.0.tgz", + "integrity": "sha512-GdYO7a61mR0fOlAsvC9/rIHf7L96sBc6dEWzeOu+KAea5bZyQRPIpojrVoI4AXGJS/ycu/fBTdLrUkA4ODrvjw==", + "dev": true, + "license": "MIT", + "dependencies": { + "environment": "^1.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/log-update/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/log-update/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": 
"https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/log-update/node_modules/emoji-regex": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", + "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "dev": true, + "license": "MIT" + }, + "node_modules/log-update/node_modules/is-fullwidth-code-point": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz", + "integrity": "sha512-OVa3u9kkBbw7b8Xw5F9P+D/T9X+Z4+JruYVNapTjPYZYUznQ5YfWeFkOj606XYYW8yugTfC8Pj0hYqvi4ryAhA==", + "dev": true, + "license": "MIT", + "dependencies": { + "get-east-asian-width": "^1.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/log-update/node_modules/slice-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-7.1.0.tgz", + "integrity": "sha512-bSiSngZ/jWeX93BqeIAbImyTbEihizcwNjFoRUIY/T1wWQsfsm2Vw1agPKylXvQTU7iASGdHhyqRlqQzfz+Htg==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.2.1", + "is-fullwidth-code-point": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/chalk/slice-ansi?sponsor=1" + } + }, + "node_modules/log-update/node_modules/string-width": { + "version": "7.2.0", + "resolved": 
"https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/log-update/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/log-update/node_modules/wrap-ansi": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", + "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.2.1", + "string-width": "^7.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": "cli.js" + } + }, + 
"node_modules/lower-case": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/lower-case/-/lower-case-2.0.2.tgz", + "integrity": "sha512-7fm3l3NAF9WfN6W3JOmf5drwpVqX78JtoGJ3A6W0a6ZnldM41w2fV5D490psKFTpMds8TJse/eHLFFsNHHjHgg==", + "dev": true, + "dependencies": { + "tslib": "^2.0.3" + } + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/lz-string": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/lz-string/-/lz-string-1.5.0.tgz", + "integrity": "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==", + "dev": true, + "peer": true, + "bin": { + "lz-string": "bin/bin.js" + } + }, + "node_modules/make-dir": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-2.1.0.tgz", + "integrity": "sha512-LS9X+dc8KLxXCb8dni79fLIIUA5VyZoyjSMCwTluaXA0o27cCK0bhXkpgw+sTXVpPy/lSO57ilRixqk0vDmtRA==", + "dev": true, + "dependencies": { + "pify": "^4.0.1", + "semver": "^5.6.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/make-dir/node_modules/semver": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "dev": true, + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/make-error": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", + "integrity": 
"sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", + "dev": true + }, + "node_modules/makeerror": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz", + "integrity": "sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==", + "dev": true, + "dependencies": { + "tmpl": "1.0.5" + } + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/md5": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz", + "integrity": "sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==", + "dependencies": { + "charenc": "0.0.2", + "crypt": "0.0.2", + "is-buffer": "~1.1.6" + } + }, + "node_modules/mdn-data": { + "version": "2.0.14", + "resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.0.14.tgz", + "integrity": "sha512-dn6wd0uw5GsdswPFfsgMp5NSB0/aDe6fK94YJV/AJDYXL6HVLWBsxeq7js7Ad+mU2K9LAlwpk6kN2D5mwCPVow==" + }, + "node_modules/media-typer": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/memfs": { + "version": "3.5.3", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-3.5.3.tgz", + "integrity": 
"sha512-UERzLsxzllchadvbPs5aolHh65ISpKpM+ccLbOJ8/vvpBKmAWf+la7dXFy7Mr0ySHbdHrFv5kGFCUHHe6GFEmw==", + "dev": true, + "dependencies": { + "fs-monkey": "^1.0.4" + }, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/merge-descriptors": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", + "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/merge-stream": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", + "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==" + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "dev": true, + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + 
"node_modules/mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "dev": true, + "license": "MIT", + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mimic-fn": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", + "integrity": "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/mimic-function": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz", + "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/min-indent": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/min-indent/-/min-indent-1.0.1.tgz", + "integrity": 
"sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/mini-css-extract-plugin": { + "version": "2.9.2", + "resolved": "https://registry.npmjs.org/mini-css-extract-plugin/-/mini-css-extract-plugin-2.9.2.tgz", + "integrity": "sha512-GJuACcS//jtq4kCtd5ii/M0SZf7OZRH+BxdqXZHaJfb8TJiVl+NgQRPwiYt2EuqeSkNydn/7vP+bcE27C5mb9w==", + "dev": true, + "dependencies": { + "schema-utils": "^4.0.0", + "tapable": "^2.2.1" + }, + "engines": { + "node": ">= 12.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.0.0" + } + }, + "node_modules/minimalistic-assert": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz", + "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==", + "dev": true + }, + "node_modules/minimatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mitt": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", + 
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", + "license": "MIT" + }, + "node_modules/mnth": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mnth/-/mnth-2.0.0.tgz", + "integrity": "sha512-3ZH4UWBGpAwCKdfjynLQpUDVZWMe6vRHwarIpMdGLUp89CVR9hjzgyWERtMyqx+fPEqQ/PsAxFwvwPxLFxW40A==", + "dependencies": { + "@babel/runtime": "^7.8.0" + }, + "engines": { + "node": ">=12.13.0" + } + }, + "node_modules/moo": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/moo/-/moo-0.5.2.tgz", + "integrity": "sha512-iSAJLHYKnX41mKcJKjqvnAN9sf0LMDTXDEvFv+ffuRR9a1MIuXLjMNL6EsnDHSkKLTWNqQQ5uo61P4EbU4NU+Q==", + "dev": true + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, + "node_modules/multicast-dns": { + "version": "7.2.5", + "resolved": "https://registry.npmjs.org/multicast-dns/-/multicast-dns-7.2.5.tgz", + "integrity": "sha512-2eznPJP8z2BFLX50tf0LuODrpINqP1RVIm/CObbTcBRITQgmC/TjcREF1NeTBzIcR5XO/ukWo+YHOjBbFwIupg==", + "dev": true, + "dependencies": { + "dns-packet": "^5.2.2", + "thunky": "^1.0.2" + }, + "bin": { + "multicast-dns": "cli.js" + } + }, + "node_modules/nano-spawn": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/nano-spawn/-/nano-spawn-1.0.2.tgz", + "integrity": "sha512-21t+ozMQDAL/UGgQVBbZ/xXvNO10++ZPuTmKRO8k9V3AClVRht49ahtDjfY8l1q6nSHOrE5ASfthzH3ol6R/hg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20.17" + }, + "funding": { + "url": "https://github.com/sindresorhus/nano-spawn?sponsor=1" + } + }, + "node_modules/nanoclone": { + "version": "0.2.1", + "resolved": 
"https://registry.npmjs.org/nanoclone/-/nanoclone-0.2.1.tgz", + "integrity": "sha512-wynEP02LmIbLpcYw8uBKpcfF6dmg2vcpKqxeH5UcoKEYdExslsdUA4ugFauuaeYdTB76ez6gJW8XAZ6CgkXYxA==" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/natural-compare": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", + "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", + "dev": true + }, + "node_modules/nearley": { + "version": "2.20.1", + "resolved": "https://registry.npmjs.org/nearley/-/nearley-2.20.1.tgz", + "integrity": "sha512-+Mc8UaAebFzgV+KpI5n7DasuuQCHA89dmwm7JXw3TV43ukfNQ9DnBH3Mdb2g/I4Fdxc26pwimBWvjIw0UAILSQ==", + "dev": true, + "dependencies": { + "commander": "^2.19.0", + "moo": "^0.5.0", + "railroad-diagrams": "^1.0.0", + "randexp": "0.4.6" + }, + "bin": { + "nearley-railroad": "bin/nearley-railroad.js", + "nearley-test": "bin/nearley-test.js", + "nearley-unparse": "bin/nearley-unparse.js", + "nearleyc": "bin/nearleyc.js" + }, + "funding": { + "type": "individual", + "url": "https://nearley.js.org/#give-to-nearley" + } + }, + "node_modules/nearley/node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": 
"sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true + }, + "node_modules/negotiator": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", + "integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/neo-async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==" + }, + "node_modules/no-case": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/no-case/-/no-case-3.0.4.tgz", + "integrity": "sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg==", + "dev": true, + "dependencies": { + "lower-case": "^2.0.2", + "tslib": "^2.0.3" + } + }, + "node_modules/node-addon-api": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-7.1.1.tgz", + "integrity": "sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==", + "dev": true, + "optional": true + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + 
"version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch-h2": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/node-fetch-h2/-/node-fetch-h2-2.3.0.tgz", + "integrity": "sha512-ofRW94Ab0T4AOh5Fk8t0h8OBWrmjb0SSB20xh1H8YnPV9EJ+f5AMoYSUQ2zgJ4Iq2HAK0I2l5/Nequ8YzFS3Hg==", + "dev": true, + "dependencies": { + "http2-client": "^1.2.5" + }, + "engines": { + "node": "4.x || >=6.0.0" + } + }, + "node_modules/node-int64": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", + "integrity": "sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==", + "dev": true + }, + "node_modules/node-readfiles": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/node-readfiles/-/node-readfiles-0.2.0.tgz", + "integrity": "sha512-SU00ZarexNlE4Rjdm83vglt5Y9yiQ+XI1XpflWlb7q7UTN1JUItm69xMeiQCTxtTfnzt+83T8Cx+vI2ED++VDA==", + "dev": true, + "dependencies": { + "es6-promise": "^3.2.1" + } + }, + "node_modules/node-releases": { + "version": "2.0.18", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.18.tgz", + "integrity": "sha512-d9VeXT4SJ7ZeOqGX6R5EM022wpL+eWPooLI+5UpWn2jCT1aosUQEhQP214x33Wkwx3JQMvIm+tIoVOdodFS40g==" + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": 
"sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/normalize-range": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz", + "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/normalize-url": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz", + "integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/npm-run-path": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz", + "integrity": "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==", + "dev": true, + "dependencies": { + "path-key": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/npx": { + "version": "10.2.2", + "resolved": "https://registry.npmjs.org/npx/-/npx-10.2.2.tgz", + "integrity": "sha512-eImmySusyeWphzs5iNh791XbZnZG0FSNvM4KSah34pdQQIDsdTDhIwg1sjN3AIVcjGLpbQ/YcfqHPshKZQK1fA==", + "bundleDependencies": [ + "npm", + "libnpx" + ], + "deprecated": "This package is now part of the npm CLI.", + "dev": true, + "dependencies": { + "libnpx": "10.2.2", + "npm": "5.1.0" + }, + "bin": { + "npx": "index.js" + } + }, + "node_modules/npx/node_modules/ansi-align": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "string-width": "^2.0.0" + } + }, + 
"node_modules/npx/node_modules/ansi-regex": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/ansi-styles": { + "version": "3.2.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "color-convert": "^1.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/balanced-match": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/boxen": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-align": "^2.0.0", + "camelcase": "^4.0.0", + "chalk": "^2.0.1", + "cli-boxes": "^1.0.0", + "string-width": "^2.0.0", + "term-size": "^1.2.0", + "widest-line": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/brace-expansion": { + "version": "1.1.11", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/npx/node_modules/builtins": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/camelcase": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/capture-stack-trace": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/chalk": { + "version": "2.4.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/ci-info": { + "version": "1.6.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + 
"node_modules/npx/node_modules/cli-boxes": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/cliui": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "string-width": "^2.1.1", + "strip-ansi": "^4.0.0", + "wrap-ansi": "^2.0.0" + } + }, + "node_modules/npx/node_modules/code-point-at": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/color-convert": { + "version": "1.9.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/npx/node_modules/color-name": { + "version": "1.1.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/concat-map": { + "version": "0.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/configstore": { + "version": "3.1.2", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "dot-prop": "^4.1.0", + "graceful-fs": "^4.1.2", + "make-dir": "^1.0.0", + "unique-string": "^1.0.0", + "write-file-atomic": "^2.0.0", + "xdg-basedir": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/create-error-class": { + "version": "3.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "capture-stack-trace": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/cross-spawn": { + "version": "5.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "lru-cache": "^4.0.1", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + } + }, + "node_modules/npx/node_modules/crypto-random-string": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": 
">=4" + } + }, + "node_modules/npx/node_modules/decamelize": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/deep-extend": { + "version": "0.6.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/npx/node_modules/dot-prop": { + "version": "4.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "is-obj": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/dotenv": { + "version": "5.0.1", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.6.0" + } + }, + "node_modules/npx/node_modules/duplexer3": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause" + }, + "node_modules/npx/node_modules/end-of-stream": { + "version": "1.4.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/npx/node_modules/escape-string-regexp": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/npx/node_modules/execa": { + "version": "0.7.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "cross-spawn": "^5.0.1", + "get-stream": "^3.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/find-up": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "locate-path": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/fs.realpath": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/get-caller-file": { + 
"version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/get-stream": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/glob": { + "version": "7.1.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/npx/node_modules/global-dirs": { + "version": "0.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ini": "^1.3.4" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/got": { + "version": "6.7.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "create-error-class": "^3.0.0", + "duplexer3": "^0.1.4", + "get-stream": "^3.0.0", + "is-redirect": "^1.0.0", + "is-retry-allowed": "^1.0.0", + "is-stream": "^1.0.0", + "lowercase-keys": "^1.0.0", + "safe-buffer": "^5.0.1", + "timed-out": "^4.0.0", + "unzip-response": "^2.0.1", + "url-parse-lax": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/graceful-fs": { + "version": "4.2.3", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/has-flag": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/hosted-git-info": { + "version": "2.8.5", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/import-lazy": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + 
"node_modules/npx/node_modules/imurmurhash": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8.19" + } + }, + "node_modules/npx/node_modules/inflight": { + "version": "1.0.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/inherits": { + "version": "2.0.4", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/ini": { + "version": "1.3.5", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/invert-kv": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/is-ci": { + "version": "1.2.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ci-info": "^1.5.0" + }, + "bin": { + "is-ci": "bin.js" + } + }, + "node_modules/npx/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/is-installed-globally": { + "version": "0.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "global-dirs": "^0.1.0", + "is-path-inside": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/is-npm": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/is-obj": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/is-path-inside": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "path-is-inside": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + 
} + }, + "node_modules/npx/node_modules/is-redirect": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/is-retry-allowed": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/is-stream": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/isexe": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/latest-version": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "package-json": "^4.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/lcid": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "invert-kv": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/libnpx": { + "version": "10.2.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "dotenv": "^5.0.1", + "npm-package-arg": "^6.0.0", + "rimraf": "^2.6.2", + "safe-buffer": "^5.1.0", + "update-notifier": "^2.3.0", + "which": "^1.3.0", + "y18n": "^4.0.0", + "yargs": "^11.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/locate-path": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "p-locate": "^2.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/lowercase-keys": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/lru-cache": { + "version": "4.1.5", + "dev": true, + "inBundle": true, + "license": "ISC", + 
"dependencies": { + "pseudomap": "^1.0.2", + "yallist": "^2.1.2" + } + }, + "node_modules/npx/node_modules/make-dir": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "pify": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/map-age-cleaner": { + "version": "0.1.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "p-defer": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/mem": { + "version": "4.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "map-age-cleaner": "^0.1.1", + "mimic-fn": "^2.0.0", + "p-is-promise": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/mimic-fn": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/minimatch": { + "version": "3.0.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/minimist": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/nice-try": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm": { + "version": "5.1.0", + "bundleDependencies": [ + "abbrev", + "ansi-regex", + "ansicolors", + "ansistyles", + "aproba", + "archy", + "cacache", + "call-limit", + "bluebird", + "chownr", + "cmd-shim", + "columnify", + "config-chain", + "debuglog", + "detect-indent", + "dezalgo", + "editor", + "fs-vacuum", + "fs-write-stream-atomic", + "fstream", + "fstream-npm", + "glob", + "graceful-fs", + "has-unicode", + "hosted-git-info", + "iferr", + "imurmurhash", + "inflight", + "inherits", + "ini", + "init-package-json", + "JSONStream", + "lazy-property", + 
"lockfile", + "lodash._baseindexof", + "lodash._baseuniq", + "lodash._bindcallback", + "lodash._cacheindexof", + "lodash._createcache", + "lodash._getnative", + "lodash.clonedeep", + "lodash.restparam", + "lodash.union", + "lodash.uniq", + "lodash.without", + "lru-cache", + "mkdirp", + "mississippi", + "move-concurrently", + "node-gyp", + "nopt", + "normalize-package-data", + "npm-cache-filename", + "npm-install-checks", + "npm-package-arg", + "npm-registry-client", + "npm-user-validate", + "npmlog", + "once", + "opener", + "osenv", + "pacote", + "path-is-inside", + "promise-inflight", + "read", + "read-cmd-shim", + "read-installed", + "read-package-json", + "read-package-tree", + "readable-stream", + "readdir-scoped-modules", + "request", + "retry", + "rimraf", + "semver", + "sha", + "slide", + "sorted-object", + "sorted-union-stream", + "ssri", + "strip-ansi", + "tar", + "text-table", + "uid-number", + "umask", + "unique-filename", + "unpipe", + "update-notifier", + "uuid", + "validate-npm-package-license", + "validate-npm-package-name", + "which", + "wrappy", + "write-file-atomic", + "safe-buffer", + "worker-farm" + ], + "dev": true, + "inBundle": true, + "license": "Artistic-2.0", + "dependencies": { + "abbrev": "~1.1.0", + "ansi-regex": "~3.0.0", + "ansicolors": "~0.3.2", + "ansistyles": "~0.1.3", + "aproba": "~1.1.2", + "archy": "~1.0.0", + "bluebird": "~3.5.0", + "cacache": "~9.2.9", + "call-limit": "~1.1.0", + "chownr": "~1.0.1", + "cmd-shim": "~2.0.2", + "columnify": "~1.5.4", + "config-chain": "~1.1.11", + "debuglog": "*", + "detect-indent": "~5.0.0", + "dezalgo": "~1.0.3", + "editor": "~1.0.0", + "fs-vacuum": "~1.2.10", + "fs-write-stream-atomic": "~1.0.10", + "fstream": "~1.0.11", + "fstream-npm": "~1.2.1", + "glob": "~7.1.2", + "graceful-fs": "~4.1.11", + "has-unicode": "~2.0.1", + "hosted-git-info": "~2.5.0", + "iferr": "~0.1.5", + "imurmurhash": "*", + "inflight": "~1.0.6", + "inherits": "~2.0.3", + "ini": "~1.3.4", + "init-package-json": "~1.10.1", 
+ "JSONStream": "~1.3.1", + "lazy-property": "~1.0.0", + "lockfile": "~1.0.3", + "lodash._baseindexof": "*", + "lodash._baseuniq": "~4.6.0", + "lodash._bindcallback": "*", + "lodash._cacheindexof": "*", + "lodash._createcache": "*", + "lodash._getnative": "*", + "lodash.clonedeep": "~4.5.0", + "lodash.restparam": "*", + "lodash.union": "~4.6.0", + "lodash.uniq": "~4.5.0", + "lodash.without": "~4.4.0", + "lru-cache": "~4.1.1", + "mississippi": "~1.3.0", + "mkdirp": "~0.5.1", + "move-concurrently": "~1.0.1", + "node-gyp": "~3.6.2", + "nopt": "~4.0.1", + "normalize-package-data": "~2.4.0", + "npm-cache-filename": "~1.0.2", + "npm-install-checks": "~3.0.0", + "npm-package-arg": "~5.1.2", + "npm-registry-client": "~8.4.0", + "npm-user-validate": "~1.0.0", + "npmlog": "~4.1.2", + "once": "~1.4.0", + "opener": "~1.4.3", + "osenv": "~0.1.4", + "pacote": "~2.7.38", + "path-is-inside": "~1.0.2", + "promise-inflight": "~1.0.1", + "read": "~1.0.7", + "read-cmd-shim": "~1.0.1", + "read-installed": "~4.0.3", + "read-package-json": "~2.0.9", + "read-package-tree": "~5.1.6", + "readable-stream": "~2.3.2", + "readdir-scoped-modules": "*", + "request": "~2.81.0", + "retry": "~0.10.1", + "rimraf": "~2.6.1", + "safe-buffer": "~5.1.1", + "semver": "~5.3.0", + "sha": "~2.0.1", + "slide": "~1.1.6", + "sorted-object": "~2.0.1", + "sorted-union-stream": "~2.1.3", + "ssri": "~4.1.6", + "strip-ansi": "~4.0.0", + "tar": "~2.2.1", + "text-table": "~0.2.0", + "uid-number": "0.0.6", + "umask": "~1.1.0", + "unique-filename": "~1.1.0", + "unpipe": "~1.0.0", + "update-notifier": "~2.2.0", + "uuid": "~3.1.0", + "validate-npm-package-license": "*", + "validate-npm-package-name": "~3.0.0", + "which": "~1.2.14", + "worker-farm": "~1.3.1", + "wrappy": "~1.0.2", + "write-file-atomic": "~2.1.0" + }, + "bin": { + "npm": "bin/npm-cli.js" + } + }, + "node_modules/npx/node_modules/npm-package-arg": { + "version": "6.1.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + 
"hosted-git-info": "^2.7.1", + "osenv": "^0.1.5", + "semver": "^5.6.0", + "validate-npm-package-name": "^3.0.0" + } + }, + "node_modules/npx/node_modules/npm-run-path": { + "version": "2.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "path-key": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/abbrev": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/ansi-regex": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/ansicolors": { + "version": "0.3.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/ansistyles": { + "version": "0.1.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/aproba": { + "version": "1.1.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/archy": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/bluebird": { + "version": "3.5.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/cacache": { + "version": "9.2.9", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "bluebird": "^3.5.0", + "chownr": "^1.0.1", + "glob": "^7.1.2", + "graceful-fs": "^4.1.11", + "lru-cache": "^4.1.1", + "mississippi": "^1.3.0", + "mkdirp": "^0.5.1", + "move-concurrently": "^1.0.1", + "promise-inflight": "^1.0.1", + "rimraf": "^2.6.1", + "ssri": "^4.1.6", + "unique-filename": "^1.1.0", + "y18n": "^3.2.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/cacache/node_modules/lru-cache": { + "version": "4.1.1", + "dev": true, + 
"inBundle": true, + "license": "ISC", + "dependencies": { + "pseudomap": "^1.0.2", + "yallist": "^2.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/cacache/node_modules/lru-cache/node_modules/pseudomap": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/cacache/node_modules/lru-cache/node_modules/yallist": { + "version": "2.1.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/cacache/node_modules/y18n": { + "version": "3.2.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/call-limit": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/chownr": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/cmd-shim": { + "version": "2.0.2", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "graceful-fs": "^4.1.2", + "mkdirp": "~0.5.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify": { + "version": "1.5.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "strip-ansi": "^3.0.0", + "wcwidth": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify/node_modules/strip-ansi": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify/node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify/node_modules/wcwidth": { + "version": "1.0.1", + "dev": true, 
+ "inBundle": true, + "license": "MIT", + "dependencies": { + "defaults": "^1.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify/node_modules/wcwidth/node_modules/defaults": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "clone": "^1.0.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/columnify/node_modules/wcwidth/node_modules/defaults/node_modules/clone": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/config-chain": { + "version": "1.1.11", + "dev": true, + "inBundle": true, + "dependencies": { + "ini": "^1.3.4", + "proto-list": "~1.2.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/config-chain/node_modules/proto-list": { + "version": "1.2.4", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/debuglog": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/detect-indent": { + "version": "5.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/dezalgo": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "asap": "^2.0.0", + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/dezalgo/node_modules/asap": { + "version": "2.0.5", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/editor": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/fs-vacuum": { + "version": "1.2.10", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.2", 
+ "path-is-inside": "^1.0.1", + "rimraf": "^2.5.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fs-write-stream-atomic": { + "version": "1.0.10", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.2", + "iferr": "^0.1.5", + "imurmurhash": "^0.1.4", + "readable-stream": "1 || 2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream": { + "version": "1.0.11", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.2", + "inherits": "~2.0.0", + "mkdirp": ">=0.5 0", + "rimraf": "2" + }, + "engines": { + "node": ">=0.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream-npm": { + "version": "1.2.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "fstream-ignore": "^1.0.0", + "inherits": "2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream-npm/node_modules/fstream-ignore": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "fstream": "^1.0.0", + "inherits": "2", + "minimatch": "^3.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream-npm/node_modules/fstream-ignore/node_modules/minimatch": { + "version": "3.0.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream-npm/node_modules/fstream-ignore/node_modules/minimatch/node_modules/brace-expansion": { + "version": "1.1.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/fstream-npm/node_modules/fstream-ignore/node_modules/minimatch/node_modules/brace-expansion/node_modules/balanced-match": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + 
"node_modules/npx/node_modules/npm/node_modules/fstream-npm/node_modules/fstream-ignore/node_modules/minimatch/node_modules/brace-expansion/node_modules/concat-map": { + "version": "0.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/glob": { + "version": "7.1.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/fs.realpath": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/minimatch": { + "version": "3.0.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/minimatch/node_modules/brace-expansion": { + "version": "1.1.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/minimatch/node_modules/brace-expansion/node_modules/balanced-match": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/minimatch/node_modules/brace-expansion/node_modules/concat-map": { + "version": "0.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/glob/node_modules/path-is-absolute": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/graceful-fs": { + 
"version": "4.1.11", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/has-unicode": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/hosted-git-info": { + "version": "2.5.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/iferr": { + "version": "0.1.5", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/imurmurhash": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8.19" + } + }, + "node_modules/npx/node_modules/npm/node_modules/inflight": { + "version": "1.0.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/inherits": { + "version": "2.0.3", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/ini": { + "version": "1.3.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/init-package-json": { + "version": "1.10.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "glob": "^7.1.1", + "npm-package-arg": "^4.0.0 || ^5.0.0", + "promzard": "^0.3.0", + "read": "~1.0.1", + "read-package-json": "1 || 2", + "semver": "2.x || 3.x || 4 || 5", + "validate-npm-package-license": "^3.0.1", + "validate-npm-package-name": "^3.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/init-package-json/node_modules/promzard": { + "version": "0.3.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "read": "1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/JSONStream": { + "version": 
"1.3.1", + "dev": true, + "inBundle": true, + "license": "(MIT OR Apache-2.0)", + "dependencies": { + "jsonparse": "^1.2.0", + "through": ">=2.2.7 <3" + }, + "bin": { + "JSONStream": "index.js" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/JSONStream/node_modules/jsonparse": { + "version": "1.3.1", + "dev": true, + "engines": [ + "node >= 0.2.0" + ], + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/JSONStream/node_modules/through": { + "version": "2.3.8", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lazy-property": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lockfile": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._baseindexof": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._baseuniq": { + "version": "4.6.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "lodash._createset": "~4.0.0", + "lodash._root": "~3.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._baseuniq/node_modules/lodash._createset": { + "version": "4.0.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._baseuniq/node_modules/lodash._root": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._bindcallback": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._cacheindexof": { + "version": "3.0.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + 
"node_modules/npx/node_modules/npm/node_modules/lodash._createcache": { + "version": "3.1.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "lodash._getnative": "^3.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/lodash._getnative": { + "version": "3.9.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash.clonedeep": { + "version": "4.5.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash.restparam": { + "version": "3.6.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash.union": { + "version": "4.6.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash.uniq": { + "version": "4.5.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lodash.without": { + "version": "4.4.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/lru-cache": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "pseudomap": "^1.0.2", + "yallist": "^2.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/lru-cache/node_modules/pseudomap": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/lru-cache/node_modules/yallist": { + "version": "2.1.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "concat-stream": "^1.5.0", + "duplexify": "^3.4.2", + "end-of-stream": "^1.1.0", + "flush-write-stream": "^1.0.0", + "from2": "^2.1.0", + "parallel-transform": 
"^1.1.0", + "pump": "^1.0.0", + "pumpify": "^1.3.3", + "stream-each": "^1.1.0", + "through2": "^2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/concat-stream": { + "version": "1.6.0", + "dev": true, + "engines": [ + "node >= 0.8" + ], + "inBundle": true, + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/concat-stream/node_modules/typedarray": { + "version": "0.0.6", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/duplexify": { + "version": "3.5.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "1.0.0", + "inherits": "^2.0.1", + "readable-stream": "^2.0.0", + "stream-shift": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/duplexify/node_modules/end-of-stream": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "once": "~1.3.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/duplexify/node_modules/end-of-stream/node_modules/once": { + "version": "1.3.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/duplexify/node_modules/stream-shift": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/end-of-stream": { + "version": "1.4.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/flush-write-stream": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + 
"license": "MIT", + "dependencies": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/from2": { + "version": "2.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/parallel-transform": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "cyclist": "~0.2.2", + "inherits": "^2.0.3", + "readable-stream": "^2.1.5" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/parallel-transform/node_modules/cyclist": { + "version": "0.2.2", + "dev": true, + "inBundle": true + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/pump": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/pumpify": { + "version": "1.3.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "duplexify": "^3.1.2", + "inherits": "^2.0.1", + "pump": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/stream-each": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "stream-shift": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/stream-each/node_modules/stream-shift": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/through2": { + "version": "2.0.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "readable-stream": "^2.1.5", + "xtend": "~4.0.1" + } + 
}, + "node_modules/npx/node_modules/npm/node_modules/mississippi/node_modules/through2/node_modules/xtend": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mkdirp": { + "version": "0.5.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "minimist": "0.0.8" + }, + "bin": { + "mkdirp": "bin/cmd.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/mkdirp/node_modules/minimist": { + "version": "0.0.8", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/move-concurrently": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "aproba": "^1.1.1", + "copy-concurrently": "^1.0.0", + "fs-write-stream-atomic": "^1.0.8", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/move-concurrently/node_modules/copy-concurrently": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "aproba": "^1.1.1", + "fs-write-stream-atomic": "^1.0.8", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/move-concurrently/node_modules/run-queue": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "aproba": "^1.1.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp": { + "version": "3.6.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "fstream": "^1.0.0", + "glob": "^7.0.3", + "graceful-fs": "^4.1.2", + "minimatch": "^3.0.2", + "mkdirp": "^0.5.0", + "nopt": "2 || 3", + "npmlog": "0 || 1 || 2 || 3 || 4", + "osenv": "0", + "request": "2", + "rimraf": "2", + "semver": "~5.3.0", + "tar": "^2.0.0", + "which": "1" + }, + "bin": { 
+ "node-gyp": "bin/node-gyp.js" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp/node_modules/minimatch": { + "version": "3.0.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp/node_modules/minimatch/node_modules/brace-expansion": { + "version": "1.1.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp/node_modules/minimatch/node_modules/brace-expansion/node_modules/balanced-match": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp/node_modules/minimatch/node_modules/brace-expansion/node_modules/concat-map": { + "version": "0.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/node-gyp/node_modules/nopt": { + "version": "3.0.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "abbrev": "1" + }, + "bin": { + "nopt": "bin/nopt.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/nopt": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "abbrev": "1", + "osenv": "^0.1.4" + }, + "bin": { + "nopt": "bin/nopt.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/normalize-package-data": { + "version": "2.4.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "hosted-git-info": "^2.1.4", + "is-builtin-module": "^1.0.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/normalize-package-data/node_modules/is-builtin-module": { + "version": 
"1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "builtin-modules": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/normalize-package-data/node_modules/is-builtin-module/node_modules/builtin-modules": { + "version": "1.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npm-cache-filename": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/npm-install-checks": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "semver": "^2.3.0 || 3.x || 4 || 5" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npm-package-arg": { + "version": "5.1.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "hosted-git-info": "^2.4.2", + "osenv": "^0.1.4", + "semver": "^5.1.0", + "validate-npm-package-name": "^3.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npm-registry-client": { + "version": "8.4.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "concat-stream": "^1.5.2", + "graceful-fs": "^4.1.6", + "normalize-package-data": "~1.0.1 || ^2.0.0", + "npm-package-arg": "^3.0.0 || ^4.0.0 || ^5.0.0", + "once": "^1.3.3", + "request": "^2.74.0", + "retry": "^0.10.0", + "semver": "2 >=2.2.1 || 3.x || 4 || 5", + "slide": "^1.1.3", + "ssri": "^4.1.2" + }, + "optionalDependencies": { + "npmlog": "2 || ^3.1.0 || ^4.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npm-registry-client/node_modules/concat-stream": { + "version": "1.6.0", + "dev": true, + "engines": [ + "node >= 0.8" + ], + "inBundle": true, + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/npm-registry-client/node_modules/concat-stream/node_modules/typedarray": { + "version": "0.0.6", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/npm-user-validate": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause" + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog": { + "version": "4.1.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "are-we-there-yet": "~1.1.2", + "console-control-strings": "~1.1.0", + "gauge": "~2.7.3", + "set-blocking": "~2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/are-we-there-yet": { + "version": "1.1.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "delegates": "^1.0.0", + "readable-stream": "^2.0.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/are-we-there-yet/node_modules/delegates": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/console-control-strings": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge": { + "version": "2.7.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "aproba": "^1.0.3", + "console-control-strings": "^1.0.0", + "has-unicode": "^2.0.0", + "object-assign": "^4.1.0", + "signal-exit": "^3.0.0", + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1", + "wide-align": "^1.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/object-assign": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/signal-exit": { + "version": "3.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/string-width": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/string-width/node_modules/code-point-at": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/string-width/node_modules/is-fullwidth-code-point": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "number-is-nan": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/string-width/node_modules/is-fullwidth-code-point/node_modules/number-is-nan": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/strip-ansi": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/gauge/node_modules/wide-align": { + "version": "1.1.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "string-width": "^1.0.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/npmlog/node_modules/set-blocking": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/once": { + "version": "1.4.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/opener": { + "version": "1.4.3", + "dev": true, + "inBundle": true, + "license": "(WTFPL OR MIT)", + "bin": { + "opener": "opener.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/osenv": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/osenv/node_modules/os-homedir": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/osenv/node_modules/os-tmpdir": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote": { + "version": "2.7.38", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "bluebird": "^3.5.0", + "cacache": "^9.2.9", + "glob": "^7.1.2", + "lru-cache": "^4.1.1", + "make-fetch-happen": "^2.4.13", + "minimatch": "^3.0.4", + "mississippi": "^1.2.0", + "normalize-package-data": "^2.4.0", + "npm-package-arg": "^5.1.2", + "npm-pick-manifest": "^1.0.4", + "osenv": "^0.1.4", + "promise-inflight": "^1.0.1", + "promise-retry": "^1.1.1", + "protoduck": "^4.0.0", + "safe-buffer": "^5.1.1", + "semver": 
"^5.3.0", + "ssri": "^4.1.6", + "tar-fs": "^1.15.3", + "tar-stream": "^1.5.4", + "unique-filename": "^1.1.0", + "which": "^1.2.12" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen": { + "version": "2.4.13", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "agentkeepalive": "^3.3.0", + "cacache": "^9.2.9", + "http-cache-semantics": "^3.7.3", + "http-proxy-agent": "^2.0.0", + "https-proxy-agent": "^2.0.0", + "lru-cache": "^4.1.1", + "mississippi": "^1.2.0", + "node-fetch-npm": "^2.0.1", + "promise-retry": "^1.1.1", + "socks-proxy-agent": "^3.0.0", + "ssri": "^4.1.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/agentkeepalive": { + "version": "3.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/agentkeepalive/node_modules/humanize-ms": { + "version": "1.2.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/agentkeepalive/node_modules/humanize-ms/node_modules/ms": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-cache-semantics": { + "version": "3.7.3", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "agent-base": "4", + "debug": "2" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent/node_modules/agent-base": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promisify": "^5.0.0" + }, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent/node_modules/agent-base/node_modules/es6-promisify": { + "version": "5.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promise": "^4.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent/node_modules/agent-base/node_modules/es6-promisify/node_modules/es6-promise": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent/node_modules/debug": { + "version": "2.6.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/http-proxy-agent/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "agent-base": "^4.1.0", + "debug": "^2.4.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent/node_modules/agent-base": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promisify": "^5.0.0" + }, + "engines": { + "node": ">= 4.0.0" + } + 
}, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent/node_modules/agent-base/node_modules/es6-promisify": { + "version": "5.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promise": "^4.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent/node_modules/agent-base/node_modules/es6-promisify/node_modules/es6-promise": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent/node_modules/debug": { + "version": "2.6.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/https-proxy-agent/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/node-fetch-npm": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "encoding": "^0.1.11", + "json-parse-helpfulerror": "^1.0.3", + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/node-fetch-npm/node_modules/encoding": { + "version": "0.1.12", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "iconv-lite": "~0.4.13" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/node-fetch-npm/node_modules/encoding/node_modules/iconv-lite": { + "version": "0.4.18", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" 
+ } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/node-fetch-npm/node_modules/json-parse-helpfulerror": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "jju": "^1.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/node-fetch-npm/node_modules/json-parse-helpfulerror/node_modules/jju": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "WTFPL" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "agent-base": "^4.0.1", + "socks": "^1.1.10" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/agent-base": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promisify": "^5.0.0" + }, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/agent-base/node_modules/es6-promisify": { + "version": "5.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "es6-promise": "^4.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/agent-base/node_modules/es6-promisify/node_modules/es6-promise": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/socks": { + "version": "1.1.10", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ip": "^1.1.4", + 
"smart-buffer": "^1.0.13" + }, + "engines": { + "node": ">= 0.10.0", + "npm": ">= 1.3.5" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/socks/node_modules/ip": { + "version": "1.1.5", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/make-fetch-happen/node_modules/socks-proxy-agent/node_modules/socks/node_modules/smart-buffer": { + "version": "1.1.15", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">= 0.10.15", + "npm": ">= 1.3.5" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/minimatch": { + "version": "3.0.4", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/minimatch/node_modules/brace-expansion": { + "version": "1.1.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/minimatch/node_modules/brace-expansion/node_modules/balanced-match": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/minimatch/node_modules/brace-expansion/node_modules/concat-map": { + "version": "0.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/npm-pick-manifest": { + "version": "1.0.4", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "npm-package-arg": "^5.1.2", + "semver": "^5.3.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/promise-retry": { + "version": "1.1.1", + "dev": true, + 
"inBundle": true, + "license": "MIT", + "dependencies": { + "err-code": "^1.0.0", + "retry": "^0.10.0" + }, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/promise-retry/node_modules/err-code": { + "version": "1.1.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/protoduck": { + "version": "4.0.0", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "genfun": "^4.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/protoduck/node_modules/genfun": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "CC0-1.0" + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-fs": { + "version": "1.15.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "chownr": "^1.0.1", + "mkdirp": "^0.5.1", + "pump": "^1.0.0", + "tar-stream": "^1.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-fs/node_modules/pump": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-fs/node_modules/pump/node_modules/end-of-stream": { + "version": "1.4.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-stream": { + "version": "1.5.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "bl": "^1.0.0", + "end-of-stream": "^1.0.0", + "readable-stream": "^2.0.0", + "xtend": "^4.0.0" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-stream/node_modules/bl": { + "version": "1.2.1", + "dev": true, + 
"inBundle": true, + "license": "MIT", + "dependencies": { + "readable-stream": "^2.0.5" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-stream/node_modules/end-of-stream": { + "version": "1.4.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/pacote/node_modules/tar-stream/node_modules/xtend": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/path-is-inside": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "(WTFPL OR MIT)" + }, + "node_modules/npx/node_modules/npm/node_modules/promise-inflight": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/read": { + "version": "1.0.7", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "mute-stream": "~0.0.4" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read-cmd-shim": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read-installed": { + "version": "4.0.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "debuglog": "^1.0.1", + "read-package-json": "^2.0.0", + "readdir-scoped-modules": "^1.0.0", + "semver": "2 || 3 || 4 || 5", + "slide": "~1.1.3", + "util-extend": "^1.0.1" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read-installed/node_modules/util-extend": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/read-package-json": { + "version": "2.0.9", + "dev": true, + 
"inBundle": true, + "license": "ISC", + "dependencies": { + "glob": "^7.1.1", + "json-parse-helpfulerror": "^1.0.2", + "normalize-package-data": "^2.0.0" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read-package-json/node_modules/json-parse-helpfulerror": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "jju": "^1.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read-package-json/node_modules/json-parse-helpfulerror/node_modules/jju": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "WTFPL" + }, + "node_modules/npx/node_modules/npm/node_modules/read-package-tree": { + "version": "5.1.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "debuglog": "^1.0.1", + "dezalgo": "^1.0.0", + "once": "^1.3.0", + "read-package-json": "^2.0.0", + "readdir-scoped-modules": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/read/node_modules/mute-stream": { + "version": "0.0.7", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream": { + "version": "2.3.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~1.0.6", + "safe-buffer": "~5.1.0", + "string_decoder": "~1.0.0", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream/node_modules/core-util-is": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream/node_modules/isarray": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream/node_modules/process-nextick-args": { + "version": "1.0.7", + 
"dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream/node_modules/string_decoder": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/readable-stream/node_modules/util-deprecate": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/readdir-scoped-modules": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "debuglog": "^1.0.1", + "dezalgo": "^1.0.0", + "graceful-fs": "^4.1.2", + "once": "^1.3.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request": { + "version": "2.81.0", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "dependencies": { + "aws-sign2": "~0.6.0", + "aws4": "^1.2.1", + "caseless": "~0.12.0", + "combined-stream": "~1.0.5", + "extend": "~3.0.0", + "forever-agent": "~0.6.1", + "form-data": "~2.1.1", + "har-validator": "~4.2.1", + "hawk": "~3.1.3", + "http-signature": "~1.1.0", + "is-typedarray": "~1.0.0", + "isstream": "~0.1.2", + "json-stringify-safe": "~5.0.1", + "mime-types": "~2.1.7", + "oauth-sign": "~0.8.1", + "performance-now": "^0.2.0", + "qs": "~6.4.0", + "safe-buffer": "^5.0.1", + "stringstream": "~0.0.4", + "tough-cookie": "~2.3.0", + "tunnel-agent": "^0.6.0", + "uuid": "^3.0.0" + }, + "engines": { + "node": ">= 4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/aws-sign2": { + "version": "0.6.0", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/aws4": { + "version": "1.6.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/caseless": { + "version": 
"0.12.0", + "dev": true, + "inBundle": true, + "license": "Apache-2.0" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/combined-stream": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/combined-stream/node_modules/delayed-stream": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/extend": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/forever-agent": { + "version": "0.6.1", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/form-data": { + "version": "2.1.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.5", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 0.12" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/form-data/node_modules/asynckit": { + "version": "0.4.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator": { + "version": "4.2.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "ajv": "^4.9.1", + "har-schema": "^1.0.5" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator/node_modules/ajv": { + "version": "4.11.8", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "co": "^4.6.0", + "json-stable-stringify": "^1.0.1" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator/node_modules/ajv/node_modules/co": { + "version": "4.6.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "iojs": ">= 1.0.0", + "node": ">= 0.12.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator/node_modules/ajv/node_modules/json-stable-stringify": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "jsonify": "~0.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator/node_modules/ajv/node_modules/json-stable-stringify/node_modules/jsonify": { + "version": "0.0.0", + "dev": true, + "inBundle": true, + "license": "Public Domain", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/har-validator/node_modules/har-schema": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/hawk": { + "version": "3.1.3", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "dependencies": { + "boom": "2.x.x", + "cryptiles": "2.x.x", + "hoek": "2.x.x", + "sntp": "1.x.x" + }, + "engines": { + "node": ">=0.10.32" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/hawk/node_modules/boom": { + "version": "2.10.1", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "dependencies": { + "hoek": "2.x.x" + }, + "engines": { + "node": ">=0.10.40" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/hawk/node_modules/cryptiles": { + "version": "2.0.5", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "dependencies": { + "boom": "2.x.x" + }, + "engines": { + "node": ">=0.10.40" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/request/node_modules/hawk/node_modules/hoek": { + "version": "2.16.3", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.40" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/hawk/node_modules/sntp": { + "version": "1.0.9", + "dev": true, + "inBundle": true, + "dependencies": { + "hoek": "2.x.x" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature": { + "version": "1.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "assert-plus": "^0.2.0", + "jsprim": "^1.2.2", + "sshpk": "^1.7.0" + }, + "engines": { + "node": ">=0.8", + "npm": ">=1.3.7" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/assert-plus": { + "version": "0.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/jsprim": { + "version": "1.4.0", + "dev": true, + "engines": [ + "node >=0.6.0" + ], + "inBundle": true, + "license": "MIT", + "dependencies": { + "assert-plus": "1.0.0", + "extsprintf": "1.0.2", + "json-schema": "0.2.3", + "verror": "1.3.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/jsprim/node_modules/assert-plus": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/jsprim/node_modules/extsprintf": { + "version": "1.0.2", + "dev": true, + "engines": [ + "node >=0.6.0" + ], + "inBundle": true + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/jsprim/node_modules/json-schema": { 
+ "version": "0.2.3", + "dev": true, + "inBundle": true + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/jsprim/node_modules/verror": { + "version": "1.3.6", + "dev": true, + "engines": [ + "node >=0.6.0" + ], + "inBundle": true, + "dependencies": { + "extsprintf": "1.0.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk": { + "version": "1.13.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "asn1": "~0.2.3", + "assert-plus": "^1.0.0", + "dashdash": "^1.12.0", + "getpass": "^0.1.1" + }, + "bin": { + "sshpk-conv": "bin/sshpk-conv", + "sshpk-sign": "bin/sshpk-sign", + "sshpk-verify": "bin/sshpk-verify" + }, + "engines": { + "node": ">=0.10.0" + }, + "optionalDependencies": { + "bcrypt-pbkdf": "^1.0.0", + "ecc-jsbn": "~0.1.1", + "jsbn": "~0.1.0", + "tweetnacl": "~0.14.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/asn1": { + "version": "0.2.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/assert-plus": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/bcrypt-pbkdf": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "optional": true, + "dependencies": { + "tweetnacl": "^0.14.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/dashdash": { + "version": "1.14.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "assert-plus": "^1.0.0" + }, + "engines": { + "node": ">=0.10" + } + }, 
+ "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/ecc-jsbn": { + "version": "0.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true, + "dependencies": { + "jsbn": "~0.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/getpass": { + "version": "0.1.7", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "assert-plus": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/jsbn": { + "version": "0.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "optional": true + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/http-signature/node_modules/sshpk/node_modules/tweetnacl": { + "version": "0.14.5", + "dev": true, + "inBundle": true, + "license": "Unlicense", + "optional": true + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/is-typedarray": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/isstream": { + "version": "0.1.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/json-stringify-safe": { + "version": "5.0.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/mime-types": { + "version": "2.1.15", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "mime-db": "~1.27.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/mime-types/node_modules/mime-db": { + "version": "1.27.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, 
+ "node_modules/npx/node_modules/npm/node_modules/request/node_modules/oauth-sign": { + "version": "0.8.2", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/performance-now": { + "version": "0.2.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/qs": { + "version": "6.4.0", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.6" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/stringstream": { + "version": "0.0.5", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/tough-cookie": { + "version": "2.3.2", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause", + "dependencies": { + "punycode": "^1.4.1" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/tough-cookie/node_modules/punycode": { + "version": "1.4.1", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/request/node_modules/tunnel-agent": { + "version": "0.6.0", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/retry": { + "version": "0.10.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/rimraf": { + "version": "2.6.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "glob": "^7.0.5" + }, + "bin": { + "rimraf": "bin.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/safe-buffer": { + "version": "5.1.1", + "dev": true, + "inBundle": true, + 
"license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/semver": { + "version": "5.3.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sha": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "(BSD-2-Clause OR MIT)", + "dependencies": { + "graceful-fs": "^4.1.2", + "readable-stream": "^2.0.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/slide": { + "version": "1.1.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-object": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "(WTFPL OR MIT)" + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream": { + "version": "2.1.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "from2": "^1.3.0", + "stream-iterate": "^1.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/from2": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "inherits": "~2.0.1", + "readable-stream": "~1.1.10" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/from2/node_modules/readable-stream": { + "version": "1.1.14", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "0.0.1", + "string_decoder": "~0.10.x" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/from2/node_modules/readable-stream/node_modules/core-util-is": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/from2/node_modules/readable-stream/node_modules/isarray": { + "version": "0.0.1", + "dev": 
true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/from2/node_modules/readable-stream/node_modules/string_decoder": { + "version": "0.10.31", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/stream-iterate": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "readable-stream": "^2.1.5", + "stream-shift": "^1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/sorted-union-stream/node_modules/stream-iterate/node_modules/stream-shift": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/ssri": { + "version": "4.1.6", + "dev": true, + "inBundle": true, + "license": "CC0-1.0", + "dependencies": { + "safe-buffer": "^5.1.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/strip-ansi": { + "version": "4.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/tar": { + "version": "2.2.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "block-stream": "*", + "fstream": "^1.0.2", + "inherits": "2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/tar/node_modules/block-stream": { + "version": "0.0.9", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "inherits": "~2.0.0" + }, + "engines": { + "node": "0.4 || >=0.5.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/text-table": { + "version": "0.2.0", + "dev": true, + "inBundle": 
true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/uid-number": { + "version": "0.0.6", + "dev": true, + "inBundle": true, + "license": "ISC", + "engines": { + "node": "*" + } + }, + "node_modules/npx/node_modules/npm/node_modules/umask": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/unique-filename": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "unique-slug": "^2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/unique-filename/node_modules/unique-slug": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "imurmurhash": "^0.1.4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/unpipe": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier": { + "version": "2.2.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "boxen": "^1.0.0", + "chalk": "^1.0.0", + "configstore": "^3.0.0", + "import-lazy": "^2.1.0", + "is-npm": "^1.0.0", + "latest-version": "^3.0.0", + "semver-diff": "^2.0.0", + "xdg-basedir": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-align": "^2.0.0", + "camelcase": "^4.0.0", + "chalk": "^1.1.1", + "cli-boxes": "^1.0.0", + "string-width": "^2.0.0", + "term-size": "^0.1.0", + "widest-line": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/ansi-align": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { 
+ "string-width": "^2.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/camelcase": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/cli-boxes": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/string-width": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^4.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/string-width/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/string-width/node_modules/strip-ansi": { + "version": "4.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size": { + "version": "0.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "execa": "^0.4.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa": { + "version": "0.4.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "cross-spawn-async": "^2.1.1", + "is-stream": "^1.1.0", + "npm-run-path": "^1.0.0", + 
"object-assign": "^4.0.1", + "path-key": "^1.0.0", + "strip-eof": "^1.0.0" + }, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/cross-spawn-async": { + "version": "2.2.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "lru-cache": "^4.0.0", + "which": "^1.2.8" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/is-stream": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/npm-run-path": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "path-key": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/object-assign": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/path-key": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/term-size/node_modules/execa/node_modules/strip-eof": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line": { + "version": 
"1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "string-width": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width/node_modules/code-point-at": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width/node_modules/is-fullwidth-code-point": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "number-is-nan": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width/node_modules/is-fullwidth-code-point/node_modules/number-is-nan": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width/node_modules/strip-ansi": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/boxen/node_modules/widest-line/node_modules/string-width/node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk": { + "version": "1.1.3", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^2.2.1", + "escape-string-regexp": "^1.0.2", + "has-ansi": "^2.0.0", + "strip-ansi": "^3.0.0", + "supports-color": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/ansi-styles": { + "version": "2.2.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/escape-string-regexp": { + "version": "1.0.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/has-ansi": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/has-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/strip-ansi": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/chalk/node_modules/supports-color": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "dot-prop": "^4.1.0", + "graceful-fs": "^4.1.2", + "make-dir": "^1.0.0", + "unique-string": "^1.0.0", + "write-file-atomic": "^2.0.0", + "xdg-basedir": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/dot-prop": { + "version": "4.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "is-obj": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/dot-prop/node_modules/is-obj": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/make-dir": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "pify": "^2.3.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/make-dir/node_modules/pify": { + "version": "2.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/unique-string": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "crypto-random-string": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/configstore/node_modules/unique-string/node_modules/crypto-random-string": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/import-lazy": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/is-npm": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "package-json": "^4.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "got": "^6.7.1", + "registry-auth-token": "^3.0.1", + "registry-url": "^3.0.3", + "semver": "^5.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got": { + "version": "6.7.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "create-error-class": "^3.0.0", + "duplexer3": "^0.1.4", + "get-stream": "^3.0.0", + "is-redirect": "^1.0.0", + 
"is-retry-allowed": "^1.0.0", + "is-stream": "^1.0.0", + "lowercase-keys": "^1.0.0", + "safe-buffer": "^5.0.1", + "timed-out": "^4.0.0", + "unzip-response": "^2.0.1", + "url-parse-lax": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/create-error-class": { + "version": "3.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "capture-stack-trace": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/create-error-class/node_modules/capture-stack-trace": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/duplexer3": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "BSD-3-Clause" + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/get-stream": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/is-redirect": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/is-retry-allowed": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + 
"engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/is-stream": { + "version": "1.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/lowercase-keys": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/timed-out": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/unzip-response": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/url-parse-lax": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "prepend-http": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/got/node_modules/url-parse-lax/node_modules/prepend-http": { + "version": "1.0.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-auth-token": { + "version": "3.3.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "rc": "^1.1.6", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-auth-token/node_modules/rc": { + "version": "1.2.1", + "dev": true, + "inBundle": true, + "license": "(BSD-2-Clause OR MIT OR Apache-2.0)", + "dependencies": { + "deep-extend": "~0.4.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "index.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-auth-token/node_modules/rc/node_modules/deep-extend": { + "version": "0.4.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.12.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-auth-token/node_modules/rc/node_modules/minimist": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-auth-token/node_modules/rc/node_modules/strip-json-comments": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-url": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "rc": "^1.0.1" + }, + "engines": { + "node": 
">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-url/node_modules/rc": { + "version": "1.2.1", + "dev": true, + "inBundle": true, + "license": "(BSD-2-Clause OR MIT OR Apache-2.0)", + "dependencies": { + "deep-extend": "~0.4.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "index.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-url/node_modules/rc/node_modules/deep-extend": { + "version": "0.4.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.12.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-url/node_modules/rc/node_modules/minimist": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/latest-version/node_modules/package-json/node_modules/registry-url/node_modules/rc/node_modules/strip-json-comments": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/semver-diff": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "semver": "^5.0.3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/update-notifier/node_modules/xdg-basedir": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/uuid": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + 
"bin": { + "uuid": "bin/uuid" + } + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-license": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "dependencies": { + "spdx-correct": "~1.0.0", + "spdx-expression-parse": "~1.0.0" + } + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-license/node_modules/spdx-correct": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "Apache-2.0", + "dependencies": { + "spdx-license-ids": "^1.0.2" + } + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-license/node_modules/spdx-correct/node_modules/spdx-license-ids": { + "version": "1.2.2", + "dev": true, + "inBundle": true, + "license": "Unlicense" + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-license/node_modules/spdx-expression-parse": { + "version": "1.0.4", + "dev": true, + "inBundle": true, + "license": "(MIT AND CC-BY-3.0)" + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-name": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "builtins": "^1.0.3" + } + }, + "node_modules/npx/node_modules/npm/node_modules/validate-npm-package-name/node_modules/builtins": { + "version": "1.0.3", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/which": { + "version": "1.2.14", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "which": "bin/which" + } + }, + "node_modules/npx/node_modules/npm/node_modules/which/node_modules/isexe": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/worker-farm": { + "version": "1.3.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "errno": ">=0.1.1 <0.2.0-0", + "xtend": ">=4.0.0 <4.1.0-0" + } 
+ }, + "node_modules/npx/node_modules/npm/node_modules/worker-farm/node_modules/errno": { + "version": "0.1.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "prr": "~0.0.0" + }, + "bin": { + "errno": "cli.js" + } + }, + "node_modules/npx/node_modules/npm/node_modules/worker-farm/node_modules/errno/node_modules/prr": { + "version": "0.0.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/npm/node_modules/worker-farm/node_modules/xtend": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "engines": { + "node": ">=0.4" + } + }, + "node_modules/npx/node_modules/npm/node_modules/wrappy": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/npm/node_modules/write-file-atomic": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.11", + "imurmurhash": "^0.1.4", + "slide": "^1.1.5" + } + }, + "node_modules/npx/node_modules/number-is-nan": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/once": { + "version": "1.4.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/npx/node_modules/os-homedir": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/os-locale": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "execa": "^1.0.0", + "lcid": "^2.0.0", + "mem": "^4.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/os-locale/node_modules/cross-spawn": { + "version": "6.0.5", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "nice-try": "^1.0.4", + "path-key": "^2.0.1", + "semver": 
"^5.5.0", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + }, + "engines": { + "node": ">=4.8" + } + }, + "node_modules/npx/node_modules/os-locale/node_modules/execa": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "cross-spawn": "^6.0.0", + "get-stream": "^4.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/os-locale/node_modules/get-stream": { + "version": "4.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/os-tmpdir": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/osenv": { + "version": "0.1.5", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.0" + } + }, + "node_modules/npx/node_modules/p-defer": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/p-finally": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/p-is-promise": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/npx/node_modules/p-limit": { + "version": "1.3.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "p-try": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/p-locate": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "p-limit": "^1.1.0" + }, + "engines": { + "node": ">=4" 
+ } + }, + "node_modules/npx/node_modules/p-try": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/package-json": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "got": "^6.7.1", + "registry-auth-token": "^3.0.1", + "registry-url": "^3.0.3", + "semver": "^5.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/path-exists": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/path-is-absolute": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/path-is-inside": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "(WTFPL OR MIT)" + }, + "node_modules/npx/node_modules/path-key": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/pify": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/prepend-http": { + "version": "1.0.4", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/pseudomap": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/pump": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/npx/node_modules/rc": { + "version": "1.2.8", + "dev": true, + "inBundle": true, + "license": "(BSD-2-Clause OR MIT OR Apache-2.0)", + "dependencies": { + "deep-extend": "^0.6.0", + "ini": "~1.3.0", + "minimist": 
"^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "cli.js" + } + }, + "node_modules/npx/node_modules/registry-auth-token": { + "version": "3.4.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "rc": "^1.1.6", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/npx/node_modules/registry-url": { + "version": "3.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "rc": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/require-directory": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/require-main-filename": { + "version": "1.0.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/rimraf": { + "version": "2.7.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + } + }, + "node_modules/npx/node_modules/safe-buffer": { + "version": "5.2.0", + "dev": true, + "inBundle": true, + "license": "MIT" + }, + "node_modules/npx/node_modules/semver": { + "version": "5.7.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/npx/node_modules/semver-diff": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "semver": "^5.0.3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/set-blocking": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/shebang-command": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "shebang-regex": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/shebang-regex": { + "version": "1.0.0", + "dev": true, + 
"inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/signal-exit": { + "version": "3.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/string-width": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^4.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/strip-ansi": { + "version": "4.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/strip-eof": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/strip-json-comments": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/supports-color": { + "version": "5.5.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "has-flag": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/term-size": { + "version": "1.2.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "execa": "^0.7.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/timed-out": { + "version": "4.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/unique-string": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "crypto-random-string": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/unzip-response": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { 
+ "node": ">=4" + } + }, + "node_modules/npx/node_modules/update-notifier": { + "version": "2.5.0", + "dev": true, + "inBundle": true, + "license": "BSD-2-Clause", + "dependencies": { + "boxen": "^1.2.1", + "chalk": "^2.0.1", + "configstore": "^3.0.0", + "import-lazy": "^2.1.0", + "is-ci": "^1.0.10", + "is-installed-globally": "^0.1.0", + "is-npm": "^1.0.0", + "latest-version": "^3.0.0", + "semver-diff": "^2.0.0", + "xdg-basedir": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/url-parse-lax": { + "version": "1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "prepend-http": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/validate-npm-package-name": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "builtins": "^1.0.3" + } + }, + "node_modules/npx/node_modules/which": { + "version": "1.3.1", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "which": "bin/which" + } + }, + "node_modules/npx/node_modules/which-module": { + "version": "2.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/widest-line": { + "version": "2.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "string-width": "^2.1.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/wrap-ansi": { + "version": "2.1.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/wrap-ansi/node_modules/ansi-regex": { + "version": "2.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/wrap-ansi/node_modules/is-fullwidth-code-point": { + "version": 
"1.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "number-is-nan": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/wrap-ansi/node_modules/string-width": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/wrap-ansi/node_modules/strip-ansi": { + "version": "3.0.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npx/node_modules/wrappy": { + "version": "1.0.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/write-file-atomic": { + "version": "2.4.3", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "graceful-fs": "^4.1.11", + "imurmurhash": "^0.1.4", + "signal-exit": "^3.0.2" + } + }, + "node_modules/npx/node_modules/xdg-basedir": { + "version": "3.0.0", + "dev": true, + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/npx/node_modules/y18n": { + "version": "4.0.0", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/yallist": { + "version": "2.1.2", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/npx/node_modules/yargs": { + "version": "11.1.1", + "dev": true, + "inBundle": true, + "license": "MIT", + "dependencies": { + "cliui": "^4.0.0", + "decamelize": "^1.1.1", + "find-up": "^2.1.0", + "get-caller-file": "^1.0.1", + "os-locale": "^3.1.0", + "require-directory": "^2.1.1", + "require-main-filename": "^1.0.1", + "set-blocking": "^2.0.0", + "string-width": "^2.0.0", + "which-module": "^2.0.0", + "y18n": "^3.2.1", + "yargs-parser": "^9.0.2" + } + }, + 
"node_modules/npx/node_modules/yargs-parser": { + "version": "9.0.2", + "dev": true, + "inBundle": true, + "license": "ISC", + "dependencies": { + "camelcase": "^4.1.0" + } + }, + "node_modules/npx/node_modules/yargs/node_modules/y18n": { + "version": "3.2.1", + "dev": true, + "inBundle": true, + "license": "ISC" + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, + "node_modules/oas-kit-common": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/oas-kit-common/-/oas-kit-common-1.0.8.tgz", + "integrity": "sha512-pJTS2+T0oGIwgjGpw7sIRU8RQMcUoKCDWFLdBqKB2BNmGpbBMH2sdqAaOXUg8OzonZHU0L7vfJu1mJFEiYDWOQ==", + "dev": true, + "dependencies": { + "fast-safe-stringify": "^2.0.7" + } + }, + "node_modules/oas-linter": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/oas-linter/-/oas-linter-3.2.2.tgz", + "integrity": "sha512-KEGjPDVoU5K6swgo9hJVA/qYGlwfbFx+Kg2QB/kd7rzV5N8N5Mg6PlsoCMohVnQmo+pzJap/F610qTodKzecGQ==", + "dev": true, + "dependencies": { + "@exodus/schemasafe": "^1.0.0-rc.2", + "should": "^13.2.1", + "yaml": "^1.10.0" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-resolver": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/oas-resolver/-/oas-resolver-2.5.6.tgz", + "integrity": "sha512-Yx5PWQNZomfEhPPOphFbZKi9W93CocQj18NlD2Pa4GWZzdZpSJvYwoiuurRI7m3SpcChrnO08hkuQDL3FGsVFQ==", + "dev": true, + "dependencies": { + "node-fetch-h2": "^2.3.0", + "oas-kit-common": "^1.0.8", + "reftools": 
"^1.1.9", + "yaml": "^1.10.0", + "yargs": "^17.0.1" + }, + "bin": { + "resolve": "resolve.js" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-schema-walker": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/oas-schema-walker/-/oas-schema-walker-1.1.5.tgz", + "integrity": "sha512-2yucenq1a9YPmeNExoUa9Qwrt9RFkjqaMAA1X+U7sbb0AqBeTIdMHky9SQQ6iN94bO5NW0W4TRYXerG+BdAvAQ==", + "dev": true, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oas-validator": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/oas-validator/-/oas-validator-5.0.8.tgz", + "integrity": "sha512-cu20/HE5N5HKqVygs3dt94eYJfBi0TsZvPVXDhbXQHiEityDN+RROTleefoKRKKJ9dFAF2JBkDHgvWj0sjKGmw==", + "dev": true, + "dependencies": { + "call-me-maybe": "^1.0.1", + "oas-kit-common": "^1.0.8", + "oas-linter": "^3.2.2", + "oas-resolver": "^2.5.6", + "oas-schema-walker": "^1.1.5", + "reftools": "^1.1.9", + "should": "^13.2.1", + "yaml": "^1.10.0" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/oazapfts": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/oazapfts/-/oazapfts-6.1.0.tgz", + "integrity": "sha512-+E0db72jn0AMJ36ZzEXF3qCZ+T4pnOem/tmiJVql1Kx4qkWBE4YGeiUicp3gkzLJ/OrmpvSyAufI8eZ7sODCYg==", + "dev": true, + "dependencies": { + "@apidevtools/swagger-parser": "^10.1.0", + "lodash": "^4.17.21", + "minimist": "^1.2.8", + "swagger2openapi": "^7.0.8", + "tapable": "^2.2.1", + "typescript": "^5.4.5" + }, + "bin": { + "oazapfts": "cli.js" + }, + "peerDependencies": { + "@oazapfts/runtime": "*" + } + }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": 
"https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-inspect": { + "version": "1.13.4", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", + "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object-is": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/object-is/-/object-is-1.1.6.tgz", + "integrity": "sha512-F8cZ+KfGlSGi09lJT7/Nd6KJZ9ygtvYC0/UYYLI9nmQKLMnydpB9yvbv9K1uSkEu7FU9vYPmVwLg328tX+ot3Q==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object-keys": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", + "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.assign": { + "version": "4.1.7", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.7.tgz", + "integrity": "sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0", 
+ "has-symbols": "^1.1.0", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object.entries": { + "version": "1.1.9", + "resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.1.9.tgz", + "integrity": "sha512-8u/hfXFRBD1O0hPUjioLhoWFHRmt6tKA4/vZPyckBr18l1KE9uHrFaFaUi8MDRTpi4uak2goyPTSNJLXX2k2Hw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.fromentries": { + "version": "2.0.8", + "resolved": "https://registry.npmjs.org/object.fromentries/-/object.fromentries-2.0.8.tgz", + "integrity": "sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.2", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object.values": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.1.tgz", + "integrity": "sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": 
"https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "dev": true + }, + "node_modules/on-finished": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", + "integrity": "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==", + "dev": true, + "dependencies": { + "ee-first": "1.1.1" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/on-headers": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz", + "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/onetime": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", + "integrity": "sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==", + "dev": true, + "dependencies": { + "mimic-fn": "^2.1.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/open": { + "version": "8.4.2", + "resolved": "https://registry.npmjs.org/open/-/open-8.4.2.tgz", + "integrity": "sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==", + "dev": true, + 
"dependencies": { + "define-lazy-prop": "^2.0.0", + "is-docker": "^2.1.1", + "is-wsl": "^2.2.0" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "4.68.4", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.68.4.tgz", + "integrity": "sha512-LRinV8iU9VQplkr25oZlyrsYGPGasIwYN8KFMAAFTHHLHjHhejtJ5BALuLFrkGzY4wfbKhOhuT+7lcHZ+F3iEA==", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/@types/node": { + "version": "18.19.61", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.61.tgz", + "integrity": "sha512-z8fH66NcVkDzBItOao+Nyh0fiy7CYdxIyxnNCcZ60aY0I+EA/y4TSi/S/W9i8DIQvwVo7a0pgzAxmDeNnqrpkw==", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/openai/node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, + "node_modules/openapi-types": { + "version": "12.1.3", + "resolved": "https://registry.npmjs.org/openapi-types/-/openapi-types-12.1.3.tgz", + "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", + "dev": true, + "peer": true + }, + "node_modules/optionator": { + "version": "0.9.4", + "resolved": 
"https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", + "integrity": "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==", + "dev": true, + "dependencies": { + "deep-is": "^0.1.3", + "fast-levenshtein": "^2.0.6", + "levn": "^0.4.1", + "prelude-ls": "^1.2.1", + "type-check": "^0.4.0", + "word-wrap": "^1.2.5" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/own-keys": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz", + "integrity": "sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==", + "dev": true, + "license": "MIT", + "dependencies": { + "get-intrinsic": "^1.2.6", + "object-keys": "^1.1.1", + "safe-push-apply": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/p-limit": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "dev": true, + "dependencies": { + "yocto-queue": "^0.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-locate": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "dev": true, + "dependencies": { + "p-limit": "^3.0.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-retry": { + "version": "6.2.1", + "resolved": 
"https://registry.npmjs.org/p-retry/-/p-retry-6.2.1.tgz", + "integrity": "sha512-hEt02O4hUct5wtwg4H4KcWgDdm+l1bOaEy/hWzd8xtXB9BqxTWBBhb+2ImAtH4Cv4rPjV76xN3Zumqk3k3AhhQ==", + "dev": true, + "dependencies": { + "@types/retry": "0.12.2", + "is-network-error": "^1.0.0", + "retry": "^0.13.1" + }, + "engines": { + "node": ">=16.17" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-try": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/param-case": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/param-case/-/param-case-3.0.4.tgz", + "integrity": "sha512-RXlj7zCYokReqWpOPH9oYivUzLYZ5vAPIfEmCTNViosC78F8F0H9y7T7gG2M39ymgutxF5gcFEsyZQSph9Bp3A==", + "dev": true, + "dependencies": { + "dot-case": "^3.0.4", + "tslib": "^2.0.3" + } + }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "dev": true, + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-author": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/parse-author/-/parse-author-2.0.0.tgz", + "integrity": "sha512-yx5DfvkN8JsHL2xk2Os9oTia467qnvRgey4ahSm2X8epehBLx/gWLcy5KI+Y36ful5DzGbCS6RazqZGgy1gHNw==", + "dev": true, + "dependencies": { + "author-regex": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/parse-json": { + "version": "5.2.0", + "resolved": 
"https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/parse5": { + "version": "7.2.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", + "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==", + "dev": true, + "dependencies": { + "entities": "^4.5.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "dev": true, + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "dev": true, + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parseurl": { + "version": "1.3.3", + 
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", + "dev": true, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/pascal-case": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/pascal-case/-/pascal-case-3.1.2.tgz", + "integrity": "sha512-uWlGT3YSnK9x3BQJaOdcZwrnV6hPpd8jFH1/ucpiLRPh/2zCVJKS19E4GvYHvaCcACn3foXZ0cLB9Wrx1KGe5g==", + "dev": true, + "dependencies": { + "no-case": "^3.0.4", + "tslib": "^2.0.3" + } + }, + "node_modules/path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true + }, + "node_modules/path-to-regexp": { + "version": "0.1.12", + "resolved": 
"https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz", + "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/performance-now": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", + "integrity": "sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==", + "dev": true + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==" + }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pidtree": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/pidtree/-/pidtree-0.6.0.tgz", + "integrity": "sha512-eG2dWTVw5bzqGRztnHExczNxt5VGsE6OwTeCG3fdUf9KBsZzO3R5OIIIzWR+iZA0NtZ+RDVdaoE2dK1cn6jH4g==", + "dev": true, + "license": "MIT", + "bin": { + "pidtree": "bin/pidtree.js" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/pify": { + "version": 
"4.0.1", + "resolved": "https://registry.npmjs.org/pify/-/pify-4.0.1.tgz", + "integrity": "sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/pirates": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz", + "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, + "node_modules/pkg-dir": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-3.0.0.tgz", + "integrity": "sha512-/E57AYkoeQ25qkxMj5PBOVgF8Kiu/h7cYS30Z5+R7WaiCCBfLq58ZI/dSeaEKb9WVJV5n/03QwrN3IeWIFllvw==", + "dev": true, + "dependencies": { + "find-up": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-dir/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-dir/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-dir/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": 
"sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/pkg-dir/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-dir/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/pkg-up": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/pkg-up/-/pkg-up-3.1.0.tgz", + "integrity": "sha512-nDywThFk1i4BQK4twPQ6TA4RT8bDY96yeuCVBWL3ePARCiEKDRSrNGbFIgUJpLp+XeIR65v8ra7WuJOFUBtkMA==", + "dev": true, + "dependencies": { + "find-up": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/pkg-up/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": 
"sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/pkg-up/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/pkijs": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/pkijs/-/pkijs-3.3.3.tgz", + "integrity": "sha512-+KD8hJtqQMYoTuL1bbGOqxb4z+nZkTAwVdNtWwe8Tc2xNbEmdJYIYoc6Qt0uF55e6YW6KuTHw1DjQ18gMhzepw==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "@noble/hashes": "1.4.0", + "asn1js": "^3.0.6", + "bytestreamjs": "^2.0.1", + "pvtsutils": "^1.3.6", + "pvutils": "^1.1.3", + "tslib": "^2.8.1" + }, + "engines": { + "node": ">=16.0.0" + } + }, + 
"node_modules/possible-typed-array-names": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz", + "integrity": "sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-attribute-case-insensitive": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/postcss-attribute-case-insensitive/-/postcss-attribute-case-insensitive-5.0.2.tgz", + "integrity": "sha512-XIidXV8fDr0kKt28vqki84fRK8VW8eTuIa4PChv2MqKuT6C9UjmSKzen6KaWhWEoYvwxFCa7n/tC1SZ3tyq4SQ==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-calc": { + "version": "8.2.4", + "resolved": "https://registry.npmjs.org/postcss-calc/-/postcss-calc-8.2.4.tgz", + "integrity": 
"sha512-SmWMSJmB8MRnnULldx0lQIyhSNvuDl9HfrZkaqqE/WHAhToYsAvDq+yAsA/kIyINDszOp3Rh0GFoNuH5Ypsm3Q==", + "dependencies": { + "postcss-selector-parser": "^6.0.9", + "postcss-value-parser": "^4.2.0" + }, + "peerDependencies": { + "postcss": "^8.2.2" + } + }, + "node_modules/postcss-clamp": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/postcss-clamp/-/postcss-clamp-4.1.0.tgz", + "integrity": "sha512-ry4b1Llo/9zz+PKC+030KUnPITTJAHeOwjfAyyB60eT0AorGLdzp52s31OsPRHRf8NchkgFoG2y6fCfn1IV1Ow==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": ">=7.6.0" + }, + "peerDependencies": { + "postcss": "^8.4.6" + } + }, + "node_modules/postcss-color-functional-notation": { + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/postcss-color-functional-notation/-/postcss-color-functional-notation-4.2.4.tgz", + "integrity": "sha512-2yrTAUZUab9s6CpxkxC4rVgFEVaR6/2Pipvi6qcgvnYiVqZcbDHEoBDhrXzyb7Efh2CCfHQNtcqWcIruDTIUeg==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-color-hex-alpha": { + "version": "8.0.4", + "resolved": "https://registry.npmjs.org/postcss-color-hex-alpha/-/postcss-color-hex-alpha-8.0.4.tgz", + "integrity": "sha512-nLo2DCRC9eE4w2JmuKgVA3fGL3d01kGq752pVALF68qpGLmx2Qrk91QTKkdUqqp45T1K1XV8IhQpcu1hoAQflQ==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + 
"node_modules/postcss-color-rebeccapurple": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/postcss-color-rebeccapurple/-/postcss-color-rebeccapurple-7.1.1.tgz", + "integrity": "sha512-pGxkuVEInwLHgkNxUc4sdg4g3py7zUeCQ9sMfwyHAT+Ezk8a4OaaVZ8lIY5+oNqA/BXXgLyXv0+5wHP68R79hg==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-colormin": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/postcss-colormin/-/postcss-colormin-5.3.1.tgz", + "integrity": "sha512-UsWQG0AqTFQmpBegeLLc1+c3jIqBNB0zlDGRWR+dQ3pRKJL1oeMzyqmH3o2PIfn9MBdNrVPWhDbT769LxCTLJQ==", + "dependencies": { + "browserslist": "^4.21.4", + "caniuse-api": "^3.0.0", + "colord": "^2.9.1", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-convert-values": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/postcss-convert-values/-/postcss-convert-values-5.1.3.tgz", + "integrity": "sha512-82pC1xkJZtcJEfiLw6UXnXVXScgtBrjlO5CBmuDQc+dlb88ZYheFsjTn40+zBVi3DkfF7iezO0nJUPLcJK3pvA==", + "dependencies": { + "browserslist": "^4.21.4", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-custom-media": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/postcss-custom-media/-/postcss-custom-media-8.0.2.tgz", + "integrity": "sha512-7yi25vDAoHAkbhAzX9dHx2yc6ntS4jQvejrNcC+csQJAXjj15e7VcWfMgLqBNAbOvqi5uIa9huOVwdHbf+sKqg==", + "dev": true, + "dependencies": 
{ + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.3" + } + }, + "node_modules/postcss-custom-properties": { + "version": "12.1.11", + "resolved": "https://registry.npmjs.org/postcss-custom-properties/-/postcss-custom-properties-12.1.11.tgz", + "integrity": "sha512-0IDJYhgU8xDv1KY6+VgUwuQkVtmYzRwu+dMjnmdMafXYv86SWqfxkc7qdDvWS38vsjaEtv8e0vGOUQrAiMBLpQ==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-custom-selectors": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/postcss-custom-selectors/-/postcss-custom-selectors-6.0.3.tgz", + "integrity": "sha512-fgVkmyiWDwmD3JbpCmB45SvvlCD6z9CG6Ie6Iere22W5aHea6oWa7EM2bpnv2Fj3I94L3VbtvX9KqwSi5aFzSg==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.4" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.3" + } + }, + "node_modules/postcss-dir-pseudo-class": { + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/postcss-dir-pseudo-class/-/postcss-dir-pseudo-class-6.0.5.tgz", + "integrity": "sha512-eqn4m70P031PF7ZQIvSgy9RSJ5uI2171O/OO/zcRNYpJbvaeKFUlar1aJ7rmgiQtbm0FSPsRewjpdS0Oew7MPA==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + 
"url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-discard-comments": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/postcss-discard-comments/-/postcss-discard-comments-5.1.2.tgz", + "integrity": "sha512-+L8208OVbHVF2UQf1iDmRcbdjJkuBF6IS29yBDSiWUIzpYaAhtNl6JYnYm12FnkeCwQqF5LeklOu6rAqgfBZqQ==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-discard-duplicates": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-discard-duplicates/-/postcss-discard-duplicates-5.1.0.tgz", + "integrity": "sha512-zmX3IoSI2aoenxHV6C7plngHWWhUOV3sP1T8y2ifzxzbtnuhk1EdPwm0S1bIUNaJ2eNbWeGLEwzw8huPD67aQw==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-discard-empty": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-discard-empty/-/postcss-discard-empty-5.1.1.tgz", + "integrity": "sha512-zPz4WljiSuLWsI0ir4Mcnr4qQQ5e1Ukc3i7UfE2XcrwKK2LIPIqE5jxMRxO6GbI3cv//ztXDsXwEWT3BHOGh3A==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-discard-overridden": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-discard-overridden/-/postcss-discard-overridden-5.1.0.tgz", + "integrity": "sha512-21nOL7RqWR1kasIVdKs8HNqQJhFxLsyRfAnUDm4Fe4t4mCWL9OJiHvlHPjcd8zc5Myu89b/7wZDnOSjFgeWRtw==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-double-position-gradients": { + "version": "3.1.2", + "resolved": 
"https://registry.npmjs.org/postcss-double-position-gradients/-/postcss-double-position-gradients-3.1.2.tgz", + "integrity": "sha512-GX+FuE/uBR6eskOK+4vkXgT6pDkexLokPaz/AbJna9s5Kzp/yl488pKPjhy0obB475ovfT1Wv8ho7U/cHNaRgQ==", + "dev": true, + "dependencies": { + "@csstools/postcss-progressive-custom-properties": "^1.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-env-function": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/postcss-env-function/-/postcss-env-function-4.0.6.tgz", + "integrity": "sha512-kpA6FsLra+NqcFnL81TnsU+Z7orGtDTxcOhl6pwXeEq1yFPpRMkCDpHhrz8CFQDr/Wfm0jLiNQ1OsGGPjlqPwA==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/postcss-focus-visible": { + "version": "6.0.4", + "resolved": "https://registry.npmjs.org/postcss-focus-visible/-/postcss-focus-visible-6.0.4.tgz", + "integrity": "sha512-QcKuUU/dgNsstIK6HELFRT5Y3lbrMLEOwG+A4s5cA+fx3A3y/JTq3X9LaOj3OC3ALH0XqyrgQIgey/MIZ8Wczw==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.9" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/postcss-focus-within": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/postcss-focus-within/-/postcss-focus-within-5.0.4.tgz", + "integrity": "sha512-vvjDN++C0mu8jz4af5d52CB184ogg/sSxAFS+oUJQq2SuCe7T5U2iIsVJtsCp2d6R4j0jr5+q3rPkBVZkXD9fQ==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.9" + }, + "engines": { + "node": "^12 || ^14 
|| >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/postcss-font-variant": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/postcss-font-variant/-/postcss-font-variant-5.0.0.tgz", + "integrity": "sha512-1fmkBaCALD72CK2a9i468mA/+tr9/1cBxRRMXOUaZqO43oWPR5imcyPjXwuv7PXbCid4ndlP5zWhidQVVa3hmA==", + "dev": true, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-gap-properties": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/postcss-gap-properties/-/postcss-gap-properties-3.0.5.tgz", + "integrity": "sha512-IuE6gKSdoUNcvkGIqdtjtcMtZIFyXZhmFd5RUlg97iVEvp1BZKV5ngsAjCjrVy+14uhGBQl9tzmi1Qwq4kqVOg==", + "dev": true, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-image-set-function": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/postcss-image-set-function/-/postcss-image-set-function-4.0.7.tgz", + "integrity": "sha512-9T2r9rsvYzm5ndsBE8WgtrMlIT7VbtTfE7b3BQnudUqnBcBo7L758oc+o+pdj/dUV0l5wjwSdjeOH2DZtfv8qw==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-initial": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/postcss-initial/-/postcss-initial-4.0.1.tgz", + "integrity": "sha512-0ueD7rPqX8Pn1xJIjay0AZeIuDoF+V+VvMt/uOnn+4ezUKhZM/NokDeP6DwMNyIoYByuN/94IQnt5FEkaN59xQ==", + "dev": true, + "peerDependencies": { + "postcss": "^8.0.0" + } + }, + 
"node_modules/postcss-lab-function": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/postcss-lab-function/-/postcss-lab-function-4.2.1.tgz", + "integrity": "sha512-xuXll4isR03CrQsmxyz92LJB2xX9n+pZJ5jE9JgcnmsCammLyKdlzrBin+25dy6wIjfhJpKBAN80gsTlCgRk2w==", + "dev": true, + "dependencies": { + "@csstools/postcss-progressive-custom-properties": "^1.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-loader": { + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/postcss-loader/-/postcss-loader-7.3.4.tgz", + "integrity": "sha512-iW5WTTBSC5BfsBJ9daFMPVrLT36MrNiC6fqOZTTaHjBNX6Pfd5p+hSBqe/fEeNd7pc13QiAyGt7VdGMw4eRC4A==", + "dev": true, + "dependencies": { + "cosmiconfig": "^8.3.5", + "jiti": "^1.20.0", + "semver": "^7.5.4" + }, + "engines": { + "node": ">= 14.15.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "postcss": "^7.0.0 || ^8.0.1", + "webpack": "^5.0.0" + } + }, + "node_modules/postcss-loader/node_modules/cosmiconfig": { + "version": "8.3.6", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-8.3.6.tgz", + "integrity": "sha512-kcZ6+W5QzcJ3P1Mt+83OUv/oHFqZHIx8DuxG6eZ5RGMERoLqp4BuGjhHLYGK+Kf5XVkQvqBSmAy/nGWN3qDgEA==", + "dev": true, + "dependencies": { + "import-fresh": "^3.3.0", + "js-yaml": "^4.1.0", + "parse-json": "^5.2.0", + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/d-fischer" + }, + "peerDependencies": { + "typescript": ">=4.9.5" + }, + "peerDependenciesMeta": { + "typescript": { + 
"optional": true + } + } + }, + "node_modules/postcss-loader/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/postcss-logical": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/postcss-logical/-/postcss-logical-5.0.4.tgz", + "integrity": "sha512-RHXxplCeLh9VjinvMrZONq7im4wjWGlRJAqmAVLXyZaXwfDWP73/oq4NdIp+OZwhQUMj0zjqDfM5Fj7qby+B4g==", + "dev": true, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.4" + } + }, + "node_modules/postcss-media-minmax": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/postcss-media-minmax/-/postcss-media-minmax-5.0.0.tgz", + "integrity": "sha512-yDUvFf9QdFZTuCUg0g0uNSHVlJ5X1lSzDZjPSFaiCWvjgsvu8vEVxtahPrLMinIDEEGnx6cBe6iqdx5YWz08wQ==", + "dev": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-merge-longhand": { + "version": "5.1.7", + "resolved": "https://registry.npmjs.org/postcss-merge-longhand/-/postcss-merge-longhand-5.1.7.tgz", + "integrity": "sha512-YCI9gZB+PLNskrK0BB3/2OzPnGhPkBEwmwhfYk1ilBHYVAZB7/tkTHFBAnCrvBBOmeYyMYw3DMjT55SyxMBzjQ==", + "dependencies": { + "postcss-value-parser": "^4.2.0", + "stylehacks": "^5.1.1" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-merge-rules": { + "version": "5.1.4", + "resolved": "https://registry.npmjs.org/postcss-merge-rules/-/postcss-merge-rules-5.1.4.tgz", + "integrity": 
"sha512-0R2IuYpgU93y9lhVbO/OylTtKMVcHb67zjWIfCiKR9rWL3GUk1677LAqD/BcHizukdZEjT8Ru3oHRoAYoJy44g==", + "dependencies": { + "browserslist": "^4.21.4", + "caniuse-api": "^3.0.0", + "cssnano-utils": "^3.1.0", + "postcss-selector-parser": "^6.0.5" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-minify-font-values": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-minify-font-values/-/postcss-minify-font-values-5.1.0.tgz", + "integrity": "sha512-el3mYTgx13ZAPPirSVsHqFzl+BBBDrXvbySvPGFnQcTI4iNslrPaFq4muTkLZmKlGk4gyFAYUBMH30+HurREyA==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-minify-gradients": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-minify-gradients/-/postcss-minify-gradients-5.1.1.tgz", + "integrity": "sha512-VGvXMTpCEo4qHTNSa9A0a3D+dxGFZCYwR6Jokk+/3oB6flu2/PnPXAh2x7x52EkY5xlIHLm+Le8tJxe/7TNhzw==", + "dependencies": { + "colord": "^2.9.1", + "cssnano-utils": "^3.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-minify-params": { + "version": "5.1.4", + "resolved": "https://registry.npmjs.org/postcss-minify-params/-/postcss-minify-params-5.1.4.tgz", + "integrity": "sha512-+mePA3MgdmVmv6g+30rn57USjOGSAyuxUmkfiWpzalZ8aiBkdPYjXWtHuwJGm1v5Ojy0Z0LaSYhHaLJQB0P8Jw==", + "dependencies": { + "browserslist": "^4.21.4", + "cssnano-utils": "^3.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-minify-selectors": { + "version": "5.2.1", + "resolved": 
"https://registry.npmjs.org/postcss-minify-selectors/-/postcss-minify-selectors-5.2.1.tgz", + "integrity": "sha512-nPJu7OjZJTsVUmPdm2TcaiohIwxP+v8ha9NehQ2ye9szv4orirRU3SDdtUmKH+10nzn0bAyOXZ0UEr7OpvLehg==", + "dependencies": { + "postcss-selector-parser": "^6.0.5" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-modules-extract-imports": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/postcss-modules-extract-imports/-/postcss-modules-extract-imports-3.1.0.tgz", + "integrity": "sha512-k3kNe0aNFQDAZGbin48pL2VNidTF0w4/eASDsxlyspobzU3wZQLOGj7L9gfRe0Jo9/4uud09DsjFNH7winGv8Q==", + "dev": true, + "engines": { + "node": "^10 || ^12 || >= 14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-modules-local-by-default": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/postcss-modules-local-by-default/-/postcss-modules-local-by-default-4.0.5.tgz", + "integrity": "sha512-6MieY7sIfTK0hYfafw1OMEG+2bg8Q1ocHCpoWLqOKj3JXlKu4G7btkmM/B7lFubYkYWmRSPLZi5chid63ZaZYw==", + "dev": true, + "dependencies": { + "icss-utils": "^5.0.0", + "postcss-selector-parser": "^6.0.2", + "postcss-value-parser": "^4.1.0" + }, + "engines": { + "node": "^10 || ^12 || >= 14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-modules-scope": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/postcss-modules-scope/-/postcss-modules-scope-3.2.0.tgz", + "integrity": "sha512-oq+g1ssrsZOsx9M96c5w8laRmvEu9C3adDSjI8oTcbfkrTE8hx/zfyobUoWIxaKPO8bt6S62kxpw5GqypEw1QQ==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.4" + }, + "engines": { + "node": "^10 || ^12 || >= 14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-modules-values": { + "version": 
"4.0.0", + "resolved": "https://registry.npmjs.org/postcss-modules-values/-/postcss-modules-values-4.0.0.tgz", + "integrity": "sha512-RDxHkAiEGI78gS2ofyvCsu7iycRv7oqw5xMWn9iMoR0N/7mf9D50ecQqUo5BZ9Zh2vH4bCUR/ktCqbB9m8vJjQ==", + "dev": true, + "dependencies": { + "icss-utils": "^5.0.0" + }, + "engines": { + "node": "^10 || ^12 || >= 14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/postcss-nesting": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/postcss-nesting/-/postcss-nesting-10.2.0.tgz", + "integrity": "sha512-EwMkYchxiDiKUhlJGzWsD9b2zvq/r2SSubcRrgP+jujMXFzqvANLt16lJANC+5uZ6hjI7lpRmI6O8JIl+8l1KA==", + "dev": true, + "dependencies": { + "@csstools/selector-specificity": "^2.0.0", + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-normalize-charset": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-normalize-charset/-/postcss-normalize-charset-5.1.0.tgz", + "integrity": "sha512-mSgUJ+pd/ldRGVx26p2wz9dNZ7ji6Pn8VWBajMXFf8jk7vUoSrZ2lt/wZR7DtlZYKesmZI680qjr2CeFF2fbUg==", + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-display-values": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-normalize-display-values/-/postcss-normalize-display-values-5.1.0.tgz", + "integrity": "sha512-WP4KIM4o2dazQXWmFaqMmcvsKmhdINFblgSeRgn8BJ6vxaMyaJkwAzpPpuvSIoG/rmX3M+IrRZEz2H0glrQNEA==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + 
"node_modules/postcss-normalize-positions": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-normalize-positions/-/postcss-normalize-positions-5.1.1.tgz", + "integrity": "sha512-6UpCb0G4eofTCQLFVuI3EVNZzBNPiIKcA1AKVka+31fTVySphr3VUgAIULBhxZkKgwLImhzMR2Bw1ORK+37INg==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-repeat-style": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-normalize-repeat-style/-/postcss-normalize-repeat-style-5.1.1.tgz", + "integrity": "sha512-mFpLspGWkQtBcWIRFLmewo8aC3ImN2i/J3v8YCFUwDnPu3Xz4rLohDO26lGjwNsQxB3YF0KKRwspGzE2JEuS0g==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-string": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-normalize-string/-/postcss-normalize-string-5.1.0.tgz", + "integrity": "sha512-oYiIJOf4T9T1N4i+abeIc7Vgm/xPCGih4bZz5Nm0/ARVJ7K6xrDlLwvwqOydvyL3RHNf8qZk6vo3aatiw/go3w==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-timing-functions": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-normalize-timing-functions/-/postcss-normalize-timing-functions-5.1.0.tgz", + "integrity": "sha512-DOEkzJ4SAXv5xkHl0Wa9cZLF3WCBhF3o1SKVxKQAa+0pYKlueTpCgvkFAHfk+Y64ezX9+nITGrDZeVGgITJXjg==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + 
"node_modules/postcss-normalize-unicode": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-normalize-unicode/-/postcss-normalize-unicode-5.1.1.tgz", + "integrity": "sha512-qnCL5jzkNUmKVhZoENp1mJiGNPcsJCs1aaRmURmeJGES23Z/ajaln+EPTD+rBeNkSryI+2WTdW+lwcVdOikrpA==", + "dependencies": { + "browserslist": "^4.21.4", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-url": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-normalize-url/-/postcss-normalize-url-5.1.0.tgz", + "integrity": "sha512-5upGeDO+PVthOxSmds43ZeMeZfKH+/DKgGRD7TElkkyS46JXAUhMzIKiCa7BabPeIy3AQcTkXwVVN7DbqsiCew==", + "dependencies": { + "normalize-url": "^6.0.1", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-normalize-whitespace": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-normalize-whitespace/-/postcss-normalize-whitespace-5.1.1.tgz", + "integrity": "sha512-83ZJ4t3NUDETIHTa3uEg6asWjSBYL5EdkVB0sDncx9ERzOKBVJIUeDO9RyA9Zwtig8El1d79HBp0JEi8wvGQnA==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-opacity-percentage": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/postcss-opacity-percentage/-/postcss-opacity-percentage-1.1.3.tgz", + "integrity": "sha512-An6Ba4pHBiDtyVpSLymUUERMo2cU7s+Obz6BTrS+gxkbnSBNKSuD0AVUc+CpBMrpVPKKfoVz0WQCX+Tnst0i4A==", + "dev": true, + "funding": [ + { + "type": "kofi", + "url": "https://ko-fi.com/mrcgrtz" + }, + { + "type": "liberapay", + "url": 
"https://liberapay.com/mrcgrtz" + } + ], + "engines": { + "node": "^12 || ^14 || >=16" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-ordered-values": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/postcss-ordered-values/-/postcss-ordered-values-5.1.3.tgz", + "integrity": "sha512-9UO79VUhPwEkzbb3RNpqqghc6lcYej1aveQteWY+4POIwlqkYE21HKWaLDF6lWNuqCobEAyTovVhtI32Rbv2RQ==", + "dependencies": { + "cssnano-utils": "^3.1.0", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-overflow-shorthand": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/postcss-overflow-shorthand/-/postcss-overflow-shorthand-3.0.4.tgz", + "integrity": "sha512-otYl/ylHK8Y9bcBnPLo3foYFLL6a6Ak+3EQBPOTR7luMYCOsiVTUk1iLvNf6tVPNGXcoL9Hoz37kpfriRIFb4A==", + "dev": true, + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-page-break": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/postcss-page-break/-/postcss-page-break-3.0.4.tgz", + "integrity": "sha512-1JGu8oCjVXLa9q9rFTo4MbeeA5FMe00/9C7lN4va606Rdb+HkxXtXsmEDrIraQ11fGz/WvKWa8gMuCKkrXpTsQ==", + "dev": true, + "peerDependencies": { + "postcss": "^8" + } + }, + "node_modules/postcss-place": { + "version": "7.0.5", + "resolved": "https://registry.npmjs.org/postcss-place/-/postcss-place-7.0.5.tgz", + "integrity": "sha512-wR8igaZROA6Z4pv0d+bvVrvGY4GVHihBCBQieXFY3kuSuMyOmEnnfFzHl/tQuqHZkfkIVBEbDvYcFfHmpSet9g==", + "dev": true, + "dependencies": { + 
"postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-preset-env": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/postcss-preset-env/-/postcss-preset-env-7.8.3.tgz", + "integrity": "sha512-T1LgRm5uEVFSEF83vHZJV2z19lHg4yJuZ6gXZZkqVsqv63nlr6zabMH3l4Pc01FQCyfWVrh2GaUeCVy9Po+Aag==", + "dev": true, + "dependencies": { + "@csstools/postcss-cascade-layers": "^1.1.1", + "@csstools/postcss-color-function": "^1.1.1", + "@csstools/postcss-font-format-keywords": "^1.0.1", + "@csstools/postcss-hwb-function": "^1.0.2", + "@csstools/postcss-ic-unit": "^1.0.1", + "@csstools/postcss-is-pseudo-class": "^2.0.7", + "@csstools/postcss-nested-calc": "^1.0.0", + "@csstools/postcss-normalize-display-values": "^1.0.1", + "@csstools/postcss-oklab-function": "^1.1.1", + "@csstools/postcss-progressive-custom-properties": "^1.3.0", + "@csstools/postcss-stepped-value-functions": "^1.0.1", + "@csstools/postcss-text-decoration-shorthand": "^1.0.0", + "@csstools/postcss-trigonometric-functions": "^1.0.2", + "@csstools/postcss-unset-value": "^1.0.2", + "autoprefixer": "^10.4.13", + "browserslist": "^4.21.4", + "css-blank-pseudo": "^3.0.3", + "css-has-pseudo": "^3.0.4", + "css-prefers-color-scheme": "^6.0.3", + "cssdb": "^7.1.0", + "postcss-attribute-case-insensitive": "^5.0.2", + "postcss-clamp": "^4.1.0", + "postcss-color-functional-notation": "^4.2.4", + "postcss-color-hex-alpha": "^8.0.4", + "postcss-color-rebeccapurple": "^7.1.1", + "postcss-custom-media": "^8.0.2", + "postcss-custom-properties": "^12.1.10", + "postcss-custom-selectors": "^6.0.3", + "postcss-dir-pseudo-class": "^6.0.5", + "postcss-double-position-gradients": "^3.1.2", + "postcss-env-function": "^4.0.6", + "postcss-focus-visible": "^6.0.4", + 
"postcss-focus-within": "^5.0.4", + "postcss-font-variant": "^5.0.0", + "postcss-gap-properties": "^3.0.5", + "postcss-image-set-function": "^4.0.7", + "postcss-initial": "^4.0.1", + "postcss-lab-function": "^4.2.1", + "postcss-logical": "^5.0.4", + "postcss-media-minmax": "^5.0.0", + "postcss-nesting": "^10.2.0", + "postcss-opacity-percentage": "^1.1.2", + "postcss-overflow-shorthand": "^3.0.4", + "postcss-page-break": "^3.0.4", + "postcss-place": "^7.0.5", + "postcss-pseudo-class-any-link": "^7.1.6", + "postcss-replace-overflow-wrap": "^4.0.0", + "postcss-selector-not": "^6.0.1", + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-pseudo-class-any-link": { + "version": "7.1.6", + "resolved": "https://registry.npmjs.org/postcss-pseudo-class-any-link/-/postcss-pseudo-class-any-link-7.1.6.tgz", + "integrity": "sha512-9sCtZkO6f/5ML9WcTLcIyV1yz9D1rf0tWc+ulKcvV30s0iZKS/ONyETvoWsr6vnrmW+X+KmuK3gV/w5EWnT37w==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-reduce-initial": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/postcss-reduce-initial/-/postcss-reduce-initial-5.1.2.tgz", + "integrity": "sha512-dE/y2XRaqAi6OvjzD22pjTUQ8eOfc6m/natGHgKFBK9DxFmIm69YmaRVQrGgFlEfc1HePIurY0TmDeROK05rIg==", + "dependencies": { + "browserslist": "^4.21.4", + "caniuse-api": "^3.0.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + 
"node_modules/postcss-reduce-transforms": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/postcss-reduce-transforms/-/postcss-reduce-transforms-5.1.0.tgz", + "integrity": "sha512-2fbdbmgir5AvpW9RLtdONx1QoYG2/EtqpNQbFASDlixBbAYuTcJ0dECwlqNqH7VbaUnEnh8SrxOe2sRIn24XyQ==", + "dependencies": { + "postcss-value-parser": "^4.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-replace-overflow-wrap": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/postcss-replace-overflow-wrap/-/postcss-replace-overflow-wrap-4.0.0.tgz", + "integrity": "sha512-KmF7SBPphT4gPPcKZc7aDkweHiKEEO8cla/GjcBK+ckKxiZslIu3C4GCRW3DNfL0o7yW7kMQu9xlZ1kXRXLXtw==", + "dev": true, + "peerDependencies": { + "postcss": "^8.0.3" + } + }, + "node_modules/postcss-selector-not": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/postcss-selector-not/-/postcss-selector-not-6.0.1.tgz", + "integrity": "sha512-1i9affjAe9xu/y9uqWH+tD4r6/hDaXJruk8xn2x1vzxC2U3J3LKO3zJW4CyxlNhA56pADJ/djpEwpH1RClI2rQ==", + "dev": true, + "dependencies": { + "postcss-selector-parser": "^6.0.10" + }, + "engines": { + "node": "^12 || ^14 || >=16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + }, + "peerDependencies": { + "postcss": "^8.2" + } + }, + "node_modules/postcss-selector-parser": { + "version": "6.1.2", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", + "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", + "dependencies": { + "cssesc": "^3.0.0", + "util-deprecate": "^1.0.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-svgo": { + "version": "5.1.0", + "resolved": 
"https://registry.npmjs.org/postcss-svgo/-/postcss-svgo-5.1.0.tgz", + "integrity": "sha512-D75KsH1zm5ZrHyxPakAxJWtkyXew5qwS70v56exwvw542d9CRtTo78K0WeFxZB4G7JXKKMbEZtZayTGdIky/eA==", + "dependencies": { + "postcss-value-parser": "^4.2.0", + "svgo": "^2.7.0" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-unique-selectors": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/postcss-unique-selectors/-/postcss-unique-selectors-5.1.1.tgz", + "integrity": "sha512-5JiODlELrz8L2HwxfPnhOWZYWDxVHWL83ufOv84NrcgipI7TaeRsatAhK4Tr2/ZiYldpK/wBvw5BD3qfaK96GA==", + "dependencies": { + "postcss-selector-parser": "^6.0.5" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==" + }, + "node_modules/prelude-ls": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", + "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/prettier": { + "version": "3.5.3", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz", + "integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==", + "dev": true, + "license": "MIT", + "bin": { + "prettier": "bin/prettier.cjs" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": 
"https://github.com/prettier/prettier?sponsor=1" + } + }, + "node_modules/prettier-linter-helpers": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/prettier-linter-helpers/-/prettier-linter-helpers-1.0.0.tgz", + "integrity": "sha512-GbK2cP9nraSSUF9N2XwUwqfzlAFlMNYYl+ShE/V+H8a9uNl/oUqB1w2EL54Jh0OlyRSd8RfWYJ3coVS4TROP2w==", + "dev": true, + "dependencies": { + "fast-diff": "^1.1.2" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/pretty-error": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/pretty-error/-/pretty-error-4.0.0.tgz", + "integrity": "sha512-AoJ5YMAcXKYxKhuJGdcvse+Voc6v1RgnsR3nWcYU7q4t6z0Q6T86sv5Zq8VIRbOWWFpvdGE83LtdSMNd+6Y0xw==", + "dev": true, + "dependencies": { + "lodash": "^4.17.20", + "renderkid": "^3.0.0" + } + }, + "node_modules/pretty-format": { + "version": "27.5.1", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz", + "integrity": "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==", + "dev": true, + "peer": true, + "dependencies": { + "ansi-regex": "^5.0.1", + "ansi-styles": "^5.0.0", + "react-is": "^17.0.1" + }, + "engines": { + "node": "^10.13.0 || ^12.13.0 || ^14.15.0 || >=15.0.0" + } + }, + "node_modules/pretty-format/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "peer": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/pretty-format/node_modules/react-is": { + "version": "17.0.2", + "resolved": 
"https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz", + "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==", + "dev": true, + "peer": true + }, + "node_modules/prismjs": { + "version": "1.30.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz", + "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "dev": true + }, + "node_modules/prompts": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", + "integrity": "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==", + "dev": true, + "dependencies": { + "kleur": "^3.0.3", + "sisteransi": "^1.0.5" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/prop-types": { + "version": "15.8.1", + "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", + "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", + "dependencies": { + "loose-envify": "^1.4.0", + "object-assign": "^4.1.1", + "react-is": "^16.13.1" + } + }, + "node_modules/prop-types/node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==" + }, + "node_modules/property-expr": { + 
"version": "2.0.6", + "resolved": "https://registry.npmjs.org/property-expr/-/property-expr-2.0.6.tgz", + "integrity": "sha512-SVtmxhRE/CGkn3eZY1T6pC8Nln6Fr/lu1mKSgRud0eC73whjGfoAogbn78LkD8aFL0zz3bAFerKSnOl7NlErBA==" + }, + "node_modules/proxy-addr": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", + "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", + "dev": true, + "license": "MIT", + "dependencies": { + "forwarded": "0.2.0", + "ipaddr.js": "1.9.1" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/proxy-addr/node_modules/ipaddr.js": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "dev": true + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "engines": { + "node": ">=6" + } + }, + "node_modules/pure-rand": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", + "integrity": "sha512-bVWawvoZoBYpp6yIoQtQXHZjmz35RSVHnUOTefl8Vcjr8snTPY1wnpSPMWekcFwbxI6gtmT7rSYPFvz71ldiOA==", + "dev": true, + "funding": [ + { + "type": "individual", + "url": 
"https://github.com/sponsors/dubzzz" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fast-check" + } + ] + }, + "node_modules/pvtsutils": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/pvtsutils/-/pvtsutils-1.3.6.tgz", + "integrity": "sha512-PLgQXQ6H2FWCaeRak8vvk1GW462lMxB5s3Jm673N82zI4vqtVUPuZdffdZbPDFRoU8kAhItWFtPCWiPpp4/EDg==", + "dev": true, + "license": "MIT", + "dependencies": { + "tslib": "^2.8.1" + } + }, + "node_modules/pvutils": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/pvutils/-/pvutils-1.1.5.tgz", + "integrity": "sha512-KTqnxsgGiQ6ZAzZCVlJH5eOjSnvlyEgx1m8bkRJfOhmGRqfo5KLvmAlACQkrjEtOQ4B7wF9TdSLIs9O90MX9xA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=16.0.0" + } + }, + "node_modules/qs": { + "version": "6.14.1", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.1.tgz", + "integrity": "sha512-4EK3+xJl8Ts67nLYNwqw/dsFVnCf+qR7RgXSK9jEEm9unao3njwMDdmsdvoKBKHzxd7tCYz5e5M+SnMjdtXGQQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + 
"node_modules/raf": { + "version": "3.4.1", + "resolved": "https://registry.npmjs.org/raf/-/raf-3.4.1.tgz", + "integrity": "sha512-Sq4CW4QhwOHE8ucn6J34MqtZCeWFP2aQSmrlroYgqAV1PjStIhJXxYuTgUIfkEk7zTLjmIjLmU5q+fbD1NnOJA==", + "dev": true, + "dependencies": { + "performance-now": "^2.1.0" + } + }, + "node_modules/railroad-diagrams": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/railroad-diagrams/-/railroad-diagrams-1.0.0.tgz", + "integrity": "sha512-cz93DjNeLY0idrCNOH6PviZGRN9GJhsdm9hpn1YCS879fj4W+x5IFJhhkRZcwVgMmFF7R82UA/7Oh+R8lLZg6A==", + "dev": true + }, + "node_modules/randexp": { + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/randexp/-/randexp-0.4.6.tgz", + "integrity": "sha512-80WNmd9DA0tmZrw9qQa62GPPWfuXJknrmVmLcxvq4uZBdYqb1wYoKTmnlGUchvVWe0XiLupYkBoXVOxz3C8DYQ==", + "dev": true, + "dependencies": { + "discontinuous-range": "1.0.0", + "ret": "~0.1.10" + }, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/randombytes": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", + "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", + "dependencies": { + "safe-buffer": "^5.1.0" + } + }, + "node_modules/range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/raw-body": { + "version": "2.5.3", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.3.tgz", + "integrity": "sha512-s4VSOf6yN0rvbRZGxs8Om5CWj6seneMwK3oDb4lWDH0UPhWcxwOWw5+qk24bxq87szX1ydrwylIOp2uG1ojUpA==", + "dev": true, + "license": 
"MIT", + "dependencies": { + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.4.24", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/raw-body/node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dev": true, + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/rc-align": { + "version": "4.0.15", + "resolved": "https://registry.npmjs.org/rc-align/-/rc-align-4.0.15.tgz", + "integrity": "sha512-wqJtVH60pka/nOX7/IspElA8gjPNQKIx/ZqJ6heATCkXpe1Zg4cPVrMD2vC96wjsFFL8WsmhPbx9tdMo1qqlIA==", + "dependencies": { + "@babel/runtime": "^7.10.1", + "classnames": "2.x", + "dom-align": "^1.7.0", + "rc-util": "^5.26.0", + "resize-observer-polyfill": "^1.5.1" + }, + "peerDependencies": { + "react": ">=16.9.0", + "react-dom": ">=16.9.0" + } + }, + "node_modules/rc-motion": { + "version": "2.9.3", + "resolved": "https://registry.npmjs.org/rc-motion/-/rc-motion-2.9.3.tgz", + "integrity": "sha512-rkW47ABVkic7WEB0EKJqzySpvDqwl60/tdkY7hWP7dYnh5pm0SzJpo54oW3TDUGXV5wfxXFmMkxrzRRbotQ0+w==", + "dependencies": { + "@babel/runtime": "^7.11.1", + "classnames": "^2.2.1", + "rc-util": "^5.43.0" + }, + "peerDependencies": { + "react": ">=16.9.0", + "react-dom": ">=16.9.0" + } + }, + "node_modules/rc-tooltip": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/rc-tooltip/-/rc-tooltip-5.3.1.tgz", + "integrity": "sha512-e6H0dMD38EPaSPD2XC8dRfct27VvT2TkPdoBSuNl3RRZ5tspiY/c5xYEmGC0IrABvMBgque4Mr2SMZuliCvoiQ==", + "dependencies": { + "@babel/runtime": "^7.11.2", + "classnames": "^2.3.1", + "rc-trigger": "^5.3.1" + }, + "peerDependencies": { + "react": 
">=16.9.0", + "react-dom": ">=16.9.0" + } + }, + "node_modules/rc-trigger": { + "version": "5.3.4", + "resolved": "https://registry.npmjs.org/rc-trigger/-/rc-trigger-5.3.4.tgz", + "integrity": "sha512-mQv+vas0TwKcjAO2izNPkqR4j86OemLRmvL2nOzdP9OWNWA1ivoTt5hzFqYNW9zACwmTezRiN8bttrC7cZzYSw==", + "dependencies": { + "@babel/runtime": "^7.18.3", + "classnames": "^2.2.6", + "rc-align": "^4.0.0", + "rc-motion": "^2.0.0", + "rc-util": "^5.19.2" + }, + "engines": { + "node": ">=8.x" + }, + "peerDependencies": { + "react": ">=16.9.0", + "react-dom": ">=16.9.0" + } + }, + "node_modules/rc-util": { + "version": "5.43.0", + "resolved": "https://registry.npmjs.org/rc-util/-/rc-util-5.43.0.tgz", + "integrity": "sha512-AzC7KKOXFqAdIBqdGWepL9Xn7cm3vnAmjlHqUnoQaTMZYhM4VlXGLkkHHxj/BZ7Td0+SOPKB4RGPboBVKT9htw==", + "dependencies": { + "@babel/runtime": "^7.18.3", + "react-is": "^18.2.0" + }, + "peerDependencies": { + "react": ">=16.9.0", + "react-dom": ">=16.9.0" + } + }, + "node_modules/react": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", + "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", + "dependencies": { + "loose-envify": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-avatar": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/react-avatar/-/react-avatar-5.0.3.tgz", + "integrity": "sha512-DNc+qkWH9QehSEZqHBhqpXWsPY+rU9W7kD68QFHfu8Atfsvx/3ML0DzAePgTUd96nCXQQ3KZMcC3LKYT8FiBIg==", + "dependencies": { + "is-retina": "^1.0.3", + "md5": "^2.0.0" + }, + "peerDependencies": { + "@babel/runtime": ">=7", + "core-js-pure": ">=3", + "prop-types": "^15.0.0 || ^16.0.0", + "react": "^15.0.0 || ^16.0.0 || ^17.0.0 || ^18.0.0" + } + }, + "node_modules/react-bus": { + "version": "4.0.1", + "resolved": 
"https://registry.npmjs.org/react-bus/-/react-bus-4.0.1.tgz", + "integrity": "sha512-tzPWE23WN0U9v3YaGKAlLW7GXYv1YkCUgWcnzm8HDtfBeD2vDvK8PYHkJVrwMdzg5BleHcxejtdKYfIYZxD7PQ==", + "license": "MIT", + "dependencies": { + "@types/react": "^18.0.8", + "mitt": "^3.0.1" + }, + "peerDependencies": { + "react": ">=17.0.0 || ^19.0.0-0" + } + }, + "node_modules/react-dev-utils": { + "version": "12.0.1", + "resolved": "https://registry.npmjs.org/react-dev-utils/-/react-dev-utils-12.0.1.tgz", + "integrity": "sha512-84Ivxmr17KjUupyqzFode6xKhjwuEJDROWKJy/BthkL7Wn6NJ8h4WE6k/exAv6ImS+0oZLRRW5j/aINMHyeGeQ==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.16.0", + "address": "^1.1.2", + "browserslist": "^4.18.1", + "chalk": "^4.1.2", + "cross-spawn": "^7.0.3", + "detect-port-alt": "^1.1.6", + "escape-string-regexp": "^4.0.0", + "filesize": "^8.0.6", + "find-up": "^5.0.0", + "fork-ts-checker-webpack-plugin": "^6.5.0", + "global-modules": "^2.0.0", + "globby": "^11.0.4", + "gzip-size": "^6.0.0", + "immer": "^9.0.7", + "is-root": "^2.1.0", + "loader-utils": "^3.2.0", + "open": "^8.4.0", + "pkg-up": "^3.1.0", + "prompts": "^2.4.2", + "react-error-overlay": "^6.0.11", + "recursive-readdir": "^2.2.2", + "shell-quote": "^1.7.3", + "strip-ansi": "^6.0.1", + "text-table": "^0.2.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/react-dom": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", + "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", + "dependencies": { + "loose-envify": "^1.1.0", + "scheduler": "^0.23.2" + }, + "peerDependencies": { + "react": "^18.3.1" + } + }, + "node_modules/react-error-overlay": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/react-error-overlay/-/react-error-overlay-6.0.11.tgz", + "integrity": 
"sha512-/6UZ2qgEyH2aqzYZgQPxEnz33NJ2gNsnHA2o5+o4wW9bLM/JYQitNP9xPhsXwC08hMMovfGe/8retsdDsczPRg==", + "dev": true + }, + "node_modules/react-fast-compare": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/react-fast-compare/-/react-fast-compare-3.2.2.tgz", + "integrity": "sha512-nsO+KSNgo1SbJqJEYRE9ERzo7YtYbou/OqjSQKxV7jcKox7+usiUVZOAC+XnDOABXggQTno0Y1CpVnuWEc1boQ==" + }, + "node_modules/react-helmet": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/react-helmet/-/react-helmet-6.1.0.tgz", + "integrity": "sha512-4uMzEY9nlDlgxr61NL3XbKRy1hEkXmKNXhjbAIOVw5vcFrsdYbH2FEwcNyWvWinl103nXgzYNlns9ca+8kFiWw==", + "dependencies": { + "object-assign": "^4.1.1", + "prop-types": "^15.7.2", + "react-fast-compare": "^3.1.1", + "react-side-effect": "^2.1.0" + }, + "peerDependencies": { + "react": ">=16.3.0" + } + }, + "node_modules/react-hook-form": { + "version": "7.53.1", + "resolved": "https://registry.npmjs.org/react-hook-form/-/react-hook-form-7.53.1.tgz", + "integrity": "sha512-6aiQeBda4zjcuaugWvim9WsGqisoUk+etmFEsSUMm451/Ic8L/UAb7sRtMj3V+Hdzm6mMjU1VhiSzYUZeBm0Vg==", + "engines": { + "node": ">=18.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/react-hook-form" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17 || ^18 || ^19" + } + }, + "node_modules/react-i18next": { + "version": "12.3.1", + "resolved": "https://registry.npmjs.org/react-i18next/-/react-i18next-12.3.1.tgz", + "integrity": "sha512-5v8E2XjZDFzK7K87eSwC7AJcAkcLt5xYZ4+yTPDAW1i7C93oOY1dnr4BaQM7un4Hm+GmghuiPvevWwlca5PwDA==", + "dependencies": { + "@babel/runtime": "^7.20.6", + "html-parse-stringify": "^3.0.1" + }, + "peerDependencies": { + "i18next": ">= 19.0.0", + "react": ">= 16.8.0" + }, + "peerDependenciesMeta": { + "react-dom": { + "optional": true + }, + "react-native": { + "optional": true + } + } 
+ }, + "node_modules/react-is": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz", + "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==" + }, + "node_modules/react-keyed-flatten-children": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/react-keyed-flatten-children/-/react-keyed-flatten-children-2.2.1.tgz", + "integrity": "sha512-6yBLVO6suN8c/OcJk1mzIrUHdeEzf5rtRVBhxEXAHO49D7SlJ70cG4xrSJrBIAG7MMeQ+H/T151mM2dRDNnFaA==", + "license": "MIT", + "dependencies": { + "react-is": "^18.2.0" + }, + "peerDependencies": { + "react": ">=15.0.0" + } + }, + "node_modules/react-redux": { + "version": "8.1.3", + "resolved": "https://registry.npmjs.org/react-redux/-/react-redux-8.1.3.tgz", + "integrity": "sha512-n0ZrutD7DaX/j9VscF+uTALI3oUPa/pO4Z3soOBIjuRn/FzVu6aehhysxZCLi6y7duMf52WNZGMl7CtuK5EnRw==", + "dependencies": { + "@babel/runtime": "^7.12.1", + "@types/hoist-non-react-statics": "^3.3.1", + "@types/use-sync-external-store": "^0.0.3", + "hoist-non-react-statics": "^3.3.2", + "react-is": "^18.0.0", + "use-sync-external-store": "^1.0.0" + }, + "peerDependencies": { + "@types/react": "^16.8 || ^17.0 || ^18.0", + "@types/react-dom": "^16.8 || ^17.0 || ^18.0", + "react": "^16.8 || ^17.0 || ^18.0", + "react-dom": "^16.8 || ^17.0 || ^18.0", + "react-native": ">=0.59", + "redux": "^4 || ^5.0.0-beta.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + }, + "react-dom": { + "optional": true + }, + "react-native": { + "optional": true + }, + "redux": { + "optional": true + } + } + }, + "node_modules/react-refresh": { + "version": "0.14.2", + "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.14.2.tgz", + "integrity": 
"sha512-jCvmsr+1IUSMUyzOkRcvnVbX3ZYC6g9TDrDbFuFmRDq7PD4yaGbLKNQL6k2jnArV8hjYxh7hVhAZB6s9HDGpZA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-router": { + "version": "6.30.3", + "resolved": "https://registry.npmjs.org/react-router/-/react-router-6.30.3.tgz", + "integrity": "sha512-XRnlbKMTmktBkjCLE8/XcZFlnHvr2Ltdr1eJX4idL55/9BbORzyZEaIkBFDhFGCEWBBItsVrDxwx3gnisMitdw==", + "license": "MIT", + "dependencies": { + "@remix-run/router": "1.23.2" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "react": ">=16.8" + } + }, + "node_modules/react-router-dom": { + "version": "6.30.3", + "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.30.3.tgz", + "integrity": "sha512-pxPcv1AczD4vso7G4Z3TKcvlxK7g7TNt3/FNGMhfqyntocvYKj+GCatfigGDjbLozC4baguJ0ReCigoDJXb0ag==", + "license": "MIT", + "dependencies": { + "@remix-run/router": "1.23.2", + "react-router": "6.30.3" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "react": ">=16.8", + "react-dom": ">=16.8" + } + }, + "node_modules/react-shallow-renderer": { + "version": "16.15.0", + "resolved": "https://registry.npmjs.org/react-shallow-renderer/-/react-shallow-renderer-16.15.0.tgz", + "integrity": "sha512-oScf2FqQ9LFVQgA73vr86xl2NaOIX73rh+YFqcOp68CWj56tSfgtGKrEbyhCj0rSijyG9M1CYprTh39fBi5hzA==", + "dev": true, + "dependencies": { + "object-assign": "^4.1.1", + "react-is": "^16.12.0 || ^17.0.0 || ^18.0.0" + }, + "peerDependencies": { + "react": "^16.0.0 || ^17.0.0 || ^18.0.0" + } + }, + "node_modules/react-side-effect": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/react-side-effect/-/react-side-effect-2.1.2.tgz", + "integrity": "sha512-PVjOcvVOyIILrYoyGEpDN3vmYNLdy1CajSFNt4TDsVQC5KpTijDvWVoR+/7Rz2xT978D8/ZtFceXxzsPwZEDvw==", + "peerDependencies": { + "react": "^16.3.0 || ^17.0.0 || ^18.0.0" + } + }, 
+ "node_modules/react-string-replace": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/react-string-replace/-/react-string-replace-1.1.1.tgz", + "integrity": "sha512-26TUbLzLfHQ5jO5N7y3Mx88eeKo0Ml0UjCQuX4BMfOd/JX+enQqlKpL1CZnmjeBRvQE8TR+ds9j1rqx9CxhKHQ==", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/react-test-renderer": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-test-renderer/-/react-test-renderer-18.3.1.tgz", + "integrity": "sha512-KkAgygexHUkQqtvvx/otwxtuFu5cVjfzTCtjXLH9boS19/Nbtg84zS7wIQn39G8IlrhThBpQsMKkq5ZHZIYFXA==", + "dev": true, + "dependencies": { + "react-is": "^18.3.1", + "react-shallow-renderer": "^16.15.0", + "scheduler": "^0.23.2" + }, + "peerDependencies": { + "react": "^18.3.1" + } + }, + "node_modules/react-transition-group": { + "version": "4.4.5", + "resolved": "https://registry.npmjs.org/react-transition-group/-/react-transition-group-4.4.5.tgz", + "integrity": "sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==", + "dependencies": { + "@babel/runtime": "^7.5.5", + "dom-helpers": "^5.0.1", + "loose-envify": "^1.4.0", + "prop-types": "^15.6.2" + }, + "peerDependencies": { + "react": ">=16.6.0", + "react-dom": ">=16.6.0" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dev": true, + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": 
"sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/rechoir": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.8.0.tgz", + "integrity": "sha512-/vxpCXddiX8NGfGO/mTafwjq4aFa/71pvamip0++IQk3zG8cbCj0fifNPrjjF1XMXUne91jL9OoxmdykoEtifQ==", + "dev": true, + "dependencies": { + "resolve": "^1.20.0" + }, + "engines": { + "node": ">= 10.13.0" + } + }, + "node_modules/recursive-readdir": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/recursive-readdir/-/recursive-readdir-2.2.3.tgz", + "integrity": "sha512-8HrF5ZsXk5FAH9dgsx3BlUer73nIhuj+9OrQwEbLTPOBzGkL1lsFCR01am+v+0m2Cmbs1nP12hLDl5FA7EszKA==", + "dev": true, + "dependencies": { + "minimatch": "^3.0.5" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/redent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redent/-/redent-3.0.0.tgz", + "integrity": "sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==", + "dev": true, + "dependencies": { + "indent-string": "^4.0.0", + "strip-indent": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/redux": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/redux/-/redux-5.0.1.tgz", + "integrity": "sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==" + }, + "node_modules/reflect-metadata": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.2.2.tgz", + "integrity": "sha512-urBwgfrvVP/eAyXx4hluJivBKzuEbSQs9rKWCrCkbSxNv8mxPcUZKeuoF3Uy4mJl3Lwprp6yy5/39VWigZ4K6Q==", + "dev": true, + "license": "Apache-2.0" + }, + 
"node_modules/reflect.getprototypeof": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz", + "integrity": "sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.9", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.7", + "get-proto": "^1.0.1", + "which-builtin-type": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/reftools": { + "version": "1.1.9", + "resolved": "https://registry.npmjs.org/reftools/-/reftools-1.1.9.tgz", + "integrity": "sha512-OVede/NQE13xBQ+ob5CKd5KyeJYU2YInb1bmV4nRoOfquZPkAkxuOXicSe1PvqIuZZ4kD13sPKBbR7UFDmli6w==", + "dev": true, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/regenerate": { + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.4.2.tgz", + "integrity": "sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==", + "dev": true + }, + "node_modules/regenerate-unicode-properties": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/regenerate-unicode-properties/-/regenerate-unicode-properties-10.2.0.tgz", + "integrity": "sha512-DqHn3DwbmmPVzeKj9woBadqmXxLvQoQIwu7nopMc72ztvxVmVk2SBhSnx67zuye5TP+lJsb/TBQsjLKhnDf3MA==", + "dev": true, + "dependencies": { + "regenerate": "^1.4.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/regenerator-transform": { + "version": "0.15.2", + "resolved": 
"https://registry.npmjs.org/regenerator-transform/-/regenerator-transform-0.15.2.tgz", + "integrity": "sha512-hfMp2BoF0qOk3uc5V20ALGDS2ddjQaLrdl7xrGXvAIow7qeWRM2VA2HuCHkUKk9slq3VwEwLNK3DFBqDfPGYtg==", + "dev": true, + "dependencies": { + "@babel/runtime": "^7.8.4" + } + }, + "node_modules/regex-parser": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/regex-parser/-/regex-parser-2.3.0.tgz", + "integrity": "sha512-TVILVSz2jY5D47F4mA4MppkBrafEaiUWJO/TcZHEIuI13AqoZMkK1WMA4Om1YkYbTx+9Ki1/tSUXbceyr9saRg==", + "dev": true + }, + "node_modules/regexp.prototype.flags": { + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz", + "integrity": "sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-errors": "^1.3.0", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "set-function-name": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/regexpu-core": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-6.1.1.tgz", + "integrity": "sha512-k67Nb9jvwJcJmVpw0jPttR1/zVfnKf8Km0IPatrU/zJ5XeG3+Slx0xLXs9HByJSzXzrlz5EDvN6yLNMDc2qdnw==", + "dev": true, + "dependencies": { + "regenerate": "^1.4.2", + "regenerate-unicode-properties": "^10.2.0", + "regjsgen": "^0.8.0", + "regjsparser": "^0.11.0", + "unicode-match-property-ecmascript": "^2.0.0", + "unicode-match-property-value-ecmascript": "^2.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/regjsgen": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.8.0.tgz", 
+ "integrity": "sha512-RvwtGe3d7LvWiDQXeQw8p5asZUmfU1G/l6WbUXeHta7Y2PEIvBTwH6E2EfmYUK8pxcxEdEmaomqyp0vZZ7C+3Q==", + "dev": true + }, + "node_modules/regjsparser": { + "version": "0.11.2", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.11.2.tgz", + "integrity": "sha512-3OGZZ4HoLJkkAZx/48mTXJNlmqTGOzc0o9OWQPuWpkOlXXPbyN6OafCcoXUnBqE2D3f/T5L+pWc1kdEmnfnRsA==", + "dev": true, + "dependencies": { + "jsesc": "~3.0.2" + }, + "bin": { + "regjsparser": "bin/parser" + } + }, + "node_modules/relateurl": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/relateurl/-/relateurl-0.2.7.tgz", + "integrity": "sha512-G08Dxvm4iDN3MLM0EsP62EDV9IuhXPR6blNz6Utcp7zyV3tr4HVNINt6MpaRWbxoOHT3Q7YN2P+jaHX8vUbgog==", + "dev": true, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/renderkid": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/renderkid/-/renderkid-3.0.0.tgz", + "integrity": "sha512-q/7VIQA8lmM1hF+jn+sFSPWGlMkSAeNYcPLmDQx2zzuiDfaLrOmumR8iaUKlenFgh0XRPIUeSPlH3A+AW3Z5pg==", + "dev": true, + "dependencies": { + "css-select": "^4.1.3", + "dom-converter": "^0.2.0", + "htmlparser2": "^6.1.0", + "lodash": "^4.17.21", + "strip-ansi": "^6.0.1" + } + }, + "node_modules/renderkid/node_modules/css-select": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-4.3.0.tgz", + "integrity": "sha512-wPpOYtnsVontu2mODhA19JrqWxNsfdatRKd64kmpRbQgh1KtItko5sTnEpPdpSaJszTOhEMlF/RPz28qj4HqhQ==", + "dev": true, + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.0.1", + "domhandler": "^4.3.1", + "domutils": "^2.8.0", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/renderkid/node_modules/dom-serializer": { + "version": "1.4.1", + "resolved": 
"https://registry.npmjs.org/dom-serializer/-/dom-serializer-1.4.1.tgz", + "integrity": "sha512-VHwB3KfrcOOkelEG2ZOfxqLZdfkil8PtJi4P8N2MMXucZq2yLp75ClViUlOVwyoHEDjYU433Aq+5zWP61+RGag==", + "dev": true, + "dependencies": { + "domelementtype": "^2.0.1", + "domhandler": "^4.2.0", + "entities": "^2.0.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/renderkid/node_modules/domhandler": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-4.3.1.tgz", + "integrity": "sha512-GrwoxYN+uWlzO8uhUXRl0P+kHE4GtVPfYzVLcUxPL7KNdHKj66vvlhiweIHqYYXWlw+T8iLMp42Lm67ghw4WMQ==", + "dev": true, + "dependencies": { + "domelementtype": "^2.2.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/renderkid/node_modules/domutils": { + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-2.8.0.tgz", + "integrity": "sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==", + "dev": true, + "dependencies": { + "dom-serializer": "^1.0.1", + "domelementtype": "^2.2.0", + "domhandler": "^4.2.0" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/renderkid/node_modules/entities": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-2.2.0.tgz", + "integrity": "sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==", + "dev": true, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/renderkid/node_modules/htmlparser2": { + "version": "6.1.0", + "resolved": 
"https://registry.npmjs.org/htmlparser2/-/htmlparser2-6.1.0.tgz", + "integrity": "sha512-gyyPk6rgonLFEDGoeRgQNaEUvdJ4ktTmmUh/h2t7s+M8oPpIPxgNACWa+6ESR57kXstwqPiCut0V8NRpcwgU7A==", + "dev": true, + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.0.1", + "domhandler": "^4.0.0", + "domutils": "^2.5.2", + "entities": "^2.0.0" + } + }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==", + "dev": true + }, + "node_modules/reselect": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/reselect/-/reselect-4.1.8.tgz", + "integrity": "sha512-ab9EmR80F/zQTMNeneUr4cv+jSwPJgIlvEmVwLerwrWVbpLlBuls9XHzIeTFy4cegU2NHBp3va0LKOzU5qFEYQ==" + }, + "node_modules/resize-observer-polyfill": { + "version": "1.5.1", + "resolved": 
"https://registry.npmjs.org/resize-observer-polyfill/-/resize-observer-polyfill-1.5.1.tgz", + "integrity": "sha512-LwZrotdHOo12nQuZlHEmtuXdqGoOD0OhaxopaNFxWzInpEgaLWoVuAMbTzixuosCx2nEG58ngzW3vxdWoxIgdg==" + }, + "node_modules/resolve": { + "version": "1.22.8", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", + "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==", + "dev": true, + "dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve-cwd": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz", + "integrity": "sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==", + "dev": true, + "dependencies": { + "resolve-from": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/resolve-cwd/node_modules/resolve-from": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", + "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/resolve-url-loader": { + "version": "5.0.0", + "resolved": 
"https://registry.npmjs.org/resolve-url-loader/-/resolve-url-loader-5.0.0.tgz", + "integrity": "sha512-uZtduh8/8srhBoMx//5bwqjQ+rfYOUq8zC9NrMUGtjBiGTtFJM42s58/36+hTqeqINcnYe08Nj3LkK9lW4N8Xg==", + "dev": true, + "dependencies": { + "adjust-sourcemap-loader": "^4.0.0", + "convert-source-map": "^1.7.0", + "loader-utils": "^2.0.0", + "postcss": "^8.2.14", + "source-map": "0.6.1" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/resolve-url-loader/node_modules/convert-source-map": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.9.0.tgz", + "integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A==", + "dev": true + }, + "node_modules/resolve-url-loader/node_modules/loader-utils": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-2.0.4.tgz", + "integrity": "sha512-xXqpXoINfFhgua9xiqD8fPFHgkoq1mmmpE92WlDbm9rNRd/EbRb+Gqf908T2DMfuHjjJlksiK2RbHVOdD/MqSw==", + "dev": true, + "dependencies": { + "big.js": "^5.2.2", + "emojis-list": "^3.0.0", + "json5": "^2.1.2" + }, + "engines": { + "node": ">=8.9.0" + } + }, + "node_modules/resolve-url-loader/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/resolve.exports": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/resolve.exports/-/resolve.exports-2.0.2.tgz", + "integrity": "sha512-X2UW6Nw3n/aMgDVy+0rSqgHlv39WZAlZrXCdnbyEiKm17DSqHX4MmQMaST3FbeWR5FTuRcUwYAziZajji0Y7mg==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + 
"node_modules/restore-cursor": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-5.1.0.tgz", + "integrity": "sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==", + "dev": true, + "license": "MIT", + "dependencies": { + "onetime": "^7.0.0", + "signal-exit": "^4.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/restore-cursor/node_modules/onetime": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-7.0.0.tgz", + "integrity": "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "mimic-function": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/restore-cursor/node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/ret": { + "version": "0.1.15", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz", + "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==", + "dev": true, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/retry": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", + "integrity": 
"sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "dev": true, + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/rfdc": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/rfdc/-/rfdc-1.4.1.tgz", + "integrity": "sha512-q1b3N5QkRUWUl7iyylaaj3kOpIT0N2i9MqIEQXP73GVsN9cw3fdx8X63cEmWhJGi2PPCF23Ijp7ktmd39rawIA==", + "dev": true, + "license": "MIT" + }, + "node_modules/rst-selector-parser": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/rst-selector-parser/-/rst-selector-parser-2.2.3.tgz", + "integrity": "sha512-nDG1rZeP6oFTLN6yNDV/uiAvs1+FS/KlrEwh7+y7dpuApDBy6bI2HTBcc0/V8lv9OTqfyD34eF7au2pm8aBbhA==", + "dev": true, + "dependencies": { + "lodash.flattendeep": "^4.4.0", + "nearley": "^2.7.10" + } + }, + "node_modules/run-applescript": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.0.0.tgz", + "integrity": "sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==", + "dev": true, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/safe-array-concat": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz", + "integrity": "sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "get-intrinsic": "^1.2.6", + "has-symbols": "^1.1.0", + "isarray": "^2.0.5" + }, + "engines": { + "node": ">=0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/safe-push-apply": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/safe-push-apply/-/safe-push-apply-1.0.0.tgz", + "integrity": "sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "isarray": "^2.0.5" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/safe-regex-test": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.1.0.tgz", + "integrity": "sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "is-regex": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "dev": true + }, + "node_modules/sass": { + "version": "1.81.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.81.0.tgz", + "integrity": "sha512-Q4fOxRfhmv3sqCLoGfvrC9pRV8btc0UtqL9mN6Yrv6Qi9ScL55CVH1vlPP863ISLEEMNLLuu9P+enCeGHlnzhA==", + "dev": true, + "dependencies": { + "chokidar": "^4.0.0", + "immutable": "^5.0.2", + "source-map-js": ">=0.6.2 <2.0.0" + }, + "bin": { + "sass": "sass.js" + }, + "engines": { + "node": ">=14.0.0" + }, + "optionalDependencies": { + "@parcel/watcher": "^2.4.1" + } + }, + "node_modules/sass-loader": { + "version": "16.0.3", + "resolved": "https://registry.npmjs.org/sass-loader/-/sass-loader-16.0.3.tgz", + "integrity": "sha512-gosNorT1RCkuCMyihv6FBRR7BMV06oKRAs+l4UMp1mlcVg9rWN6KMmUj3igjQwmYys4mDP3etEYJgiHRbgHCHA==", + "dev": true, + "dependencies": { + "neo-async": "^2.6.2" + }, + "engines": { + "node": ">= 18.12.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "@rspack/core": 
"0.x || 1.x", + "node-sass": "^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0 || ^9.0.0", + "sass": "^1.3.0", + "sass-embedded": "*", + "webpack": "^5.0.0" + }, + "peerDependenciesMeta": { + "@rspack/core": { + "optional": true + }, + "node-sass": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "webpack": { + "optional": true + } + } + }, + "node_modules/sass/node_modules/chokidar": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.1.tgz", + "integrity": "sha512-n8enUVCED/KVRQlab1hr3MVpcVMvxtZjmEa956u+4YijlmQED223XMSYj2tLuKvr4jcCTzNNMpQDUer72MMmzA==", + "dev": true, + "dependencies": { + "readdirp": "^4.0.1" + }, + "engines": { + "node": ">= 14.16.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, + "node_modules/sass/node_modules/readdirp": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.0.2.tgz", + "integrity": "sha512-yDMz9g+VaZkqBYS/ozoBJwaBhTbZo3UNYQHNRw1D3UFQB8oHB4uS/tAODO+ZLjGWmUbKnIlOWO+aaIiAxrUWHA==", + "dev": true, + "engines": { + "node": ">= 14.16.0" + }, + "funding": { + "type": "individual", + "url": "https://paulmillr.com/funding/" + } + }, + "node_modules/sax": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz", + "integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==", + "dev": true + }, + "node_modules/scheduler": { + "version": "0.23.2", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", + "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==", + "dependencies": { + "loose-envify": "^1.1.0" + } + }, + "node_modules/schema-utils": { + "version": "4.2.0", + 
"resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.2.0.tgz", + "integrity": "sha512-L0jRsrPpjdckP3oPug3/VxNKt2trR8TcabrM6FOAAlvC/9Phcmm+cuAgTlxBqdBR1WJx7Naj9WHw+aOmheSVbw==", + "dependencies": { + "@types/json-schema": "^7.0.9", + "ajv": "^8.9.0", + "ajv-formats": "^2.1.1", + "ajv-keywords": "^5.1.0" + }, + "engines": { + "node": ">= 12.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + } + }, + "node_modules/select-hose": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/select-hose/-/select-hose-2.0.0.tgz", + "integrity": "sha512-mEugaLK+YfkijB4fx0e6kImuJdCIt2LxCRcbEYPqRGCs4F2ogyfZU5IAZRdjCP8JPq2AtdNoC/Dux63d9Kiryg==", + "dev": true + }, + "node_modules/selfsigned": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-5.5.0.tgz", + "integrity": "sha512-ftnu3TW4+3eBfLRFnDEkzGxSF/10BJBkaLJuBHZX0kiPS7bRdlpZGu6YGt4KngMkdTwJE6MbjavFpqHvqVt+Ew==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/x509": "^1.14.2", + "pkijs": "^3.3.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/send": { + "version": "0.19.2", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.2.tgz", + "integrity": "sha512-VMbMxbDeehAxpOtWJXlcUS5E8iXh6QmN+BkRX1GARS3wRaXEEgzCcB10gTQazO42tpNIya8xIyNx8fll1OFPrg==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~2.0.0", + "escape-html": "~1.0.3", + "etag": 
"~1.8.1", + "fresh": "~0.5.2", + "http-errors": "~2.0.1", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "~2.4.1", + "range-parser": "~1.2.1", + "statuses": "~2.0.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/send/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/send/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true, + "license": "MIT" + }, + "node_modules/serialize-javascript": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-6.0.2.tgz", + "integrity": "sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==", + "dependencies": { + "randombytes": "^2.1.0" + } + }, + "node_modules/serve-index": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/serve-index/-/serve-index-1.9.1.tgz", + "integrity": "sha512-pXHfKNP4qujrtteMrSBb0rc8HJ9Ms/GrXwcUtUtD5s4ewDJI8bT3Cz2zTVRMKtri49pLx2e0Ya8ziP5Ya2pZZw==", + "dev": true, + "dependencies": { + "accepts": "~1.3.4", + "batch": "0.6.1", + "debug": "2.6.9", + "escape-html": "~1.0.3", + "http-errors": "~1.6.2", + "mime-types": "~2.1.17", + "parseurl": "~1.3.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/serve-index/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": 
"sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/serve-index/node_modules/depd": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", + "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/serve-index/node_modules/http-errors": { + "version": "1.6.3", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", + "integrity": "sha512-lks+lVC8dgGyh97jxvxeYTWQFvh4uw4yC12gVl63Cg30sjPX4wuGcdkICVXDAESr6OJGjqGA8Iz5mkeN6zlD7A==", + "dev": true, + "dependencies": { + "depd": "~1.1.2", + "inherits": "2.0.3", + "setprototypeof": "1.1.0", + "statuses": ">= 1.4.0 < 2" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/serve-index/node_modules/inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha512-x00IRNXNy63jwGkJmzPigoySHbaqpNuzKbBOmzK+g2OdZpQ9w+sxCN+VSB3ja7IAge2OP2qpfxTjeNcyjmW1uw==", + "dev": true + }, + "node_modules/serve-index/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/serve-index/node_modules/setprototypeof": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", + "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==", + "dev": true + }, + "node_modules/serve-index/node_modules/statuses": { + "version": "1.5.0", + "resolved": 
"https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", + "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/serve-static": { + "version": "1.16.3", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.3.tgz", + "integrity": "sha512-x0RTqQel6g5SY7Lg6ZreMmsOzncHFU7nhnRWkKgWuMTu5NN0DR5oruckMqRvacAN9d5w6ARnRBXl9xhDCgfMeA==", + "dev": true, + "license": "MIT", + "dependencies": { + "encodeurl": "~2.0.0", + "escape-html": "~1.0.3", + "parseurl": "~1.3.3", + "send": "~0.19.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/set-function-length": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", + "dev": true, + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "gopd": "^1.0.1", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/set-function-name": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", + "integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "functions-have-names": "^1.2.3", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/set-proto": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/set-proto/-/set-proto-1.0.0.tgz", + 
"integrity": "sha512-RJRdvCo6IAnPdsvP/7m6bsQqNnn1FCBX5ZNtFL98MmFF/4xAIJTIg1YbHW5DC2W5SKZanrC6i4HsJqlajw/dZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/setprototypeof": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", + "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", + "dev": true, + "license": "ISC" + }, + "node_modules/shallow-clone": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-3.0.1.tgz", + "integrity": "sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/shallowequal": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/shallowequal/-/shallowequal-1.1.0.tgz", + "integrity": "sha512-y0m1JoUZSlPAjXVtPPW70aZWfIL/dSP7AFkRnniLCrK/8MDKog3TySTBmckD+RObVxH0v4Tox67+F14PdED2oQ==", + "dev": true, + "peer": true + }, + "node_modules/sharp": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.33.5.tgz", + "integrity": "sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw==", + "dev": true, + "hasInstallScript": true, + "dependencies": { + "color": "^4.2.3", + "detect-libc": "^2.0.3", + "semver": "^7.6.3" + }, + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-darwin-arm64": "0.33.5", + "@img/sharp-darwin-x64": "0.33.5", + 
"@img/sharp-libvips-darwin-arm64": "1.0.4", + "@img/sharp-libvips-darwin-x64": "1.0.4", + "@img/sharp-libvips-linux-arm": "1.0.5", + "@img/sharp-libvips-linux-arm64": "1.0.4", + "@img/sharp-libvips-linux-s390x": "1.0.4", + "@img/sharp-libvips-linux-x64": "1.0.4", + "@img/sharp-libvips-linuxmusl-arm64": "1.0.4", + "@img/sharp-libvips-linuxmusl-x64": "1.0.4", + "@img/sharp-linux-arm": "0.33.5", + "@img/sharp-linux-arm64": "0.33.5", + "@img/sharp-linux-s390x": "0.33.5", + "@img/sharp-linux-x64": "0.33.5", + "@img/sharp-linuxmusl-arm64": "0.33.5", + "@img/sharp-linuxmusl-x64": "0.33.5", + "@img/sharp-wasm32": "0.33.5", + "@img/sharp-win32-ia32": "0.33.5", + "@img/sharp-win32-x64": "0.33.5" + } + }, + "node_modules/sharp/node_modules/detect-libc": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz", + "integrity": "sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/sharp/node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "dev": true, + "dependencies": { + "shebang-regex": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + 
"integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/shell-quote": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.1.tgz", + "integrity": "sha512-6j1W9l1iAs/4xYBI1SYOVZyFcCis9b4KCLQ8fgAGG07QvzaRLVVRQvAy85yNmmZSjYjg4MWh4gNvlPujU/5LpA==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/should": { + "version": "13.2.3", + "resolved": "https://registry.npmjs.org/should/-/should-13.2.3.tgz", + "integrity": "sha512-ggLesLtu2xp+ZxI+ysJTmNjh2U0TsC+rQ/pfED9bUZZ4DKefP27D+7YJVVTvKsmjLpIi9jAa7itwDGkDDmt1GQ==", + "dev": true, + "dependencies": { + "should-equal": "^2.0.0", + "should-format": "^3.0.3", + "should-type": "^1.4.0", + "should-type-adaptors": "^1.0.1", + "should-util": "^1.0.0" + } + }, + "node_modules/should-equal": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/should-equal/-/should-equal-2.0.0.tgz", + "integrity": "sha512-ZP36TMrK9euEuWQYBig9W55WPC7uo37qzAEmbjHz4gfyuXrEUgF8cUvQVO+w+d3OMfPvSRQJ22lSm8MQJ43LTA==", + "dev": true, + "dependencies": { + "should-type": "^1.4.0" + } + }, + "node_modules/should-format": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/should-format/-/should-format-3.0.3.tgz", + "integrity": "sha512-hZ58adtulAk0gKtua7QxevgUaXTTXxIi8t41L3zo9AHvjXO1/7sdLECuHeIN2SRtYXpNkmhoUP2pdeWgricQ+Q==", + "dev": true, + "dependencies": { + "should-type": "^1.3.0", + "should-type-adaptors": "^1.0.1" + } + }, + "node_modules/should-type": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/should-type/-/should-type-1.4.0.tgz", + "integrity": 
"sha512-MdAsTu3n25yDbIe1NeN69G4n6mUnJGtSJHygX3+oN0ZbO3DTiATnf7XnYJdGT42JCXurTb1JI0qOBR65shvhPQ==", + "dev": true + }, + "node_modules/should-type-adaptors": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/should-type-adaptors/-/should-type-adaptors-1.1.0.tgz", + "integrity": "sha512-JA4hdoLnN+kebEp2Vs8eBe9g7uy0zbRo+RMcU0EsNy+R+k049Ki+N5tT5Jagst2g7EAja+euFuoXFCa8vIklfA==", + "dev": true, + "dependencies": { + "should-type": "^1.3.0", + "should-util": "^1.0.0" + } + }, + "node_modules/should-util": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/should-util/-/should-util-1.0.1.tgz", + "integrity": "sha512-oXF8tfxx5cDk8r2kYqlkUJzZpDBqVY/II2WhvU0n9Y3XYvAYRmeaf1PvvIvTgPnv4KJ+ES5M0PyDq5Jp+Ygy2g==", + "dev": true + }, + "node_modules/side-channel": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz", + "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3", + "side-channel-list": "^1.0.0", + "side-channel-map": "^1.0.1", + "side-channel-weakmap": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-list": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", + "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + 
"node_modules/side-channel-map": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz", + "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-weakmap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz", + "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3", + "side-channel-map": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/signal-exit": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", + "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==", + "dev": true + }, + "node_modules/simple-swizzle": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", + "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "dev": true, + "dependencies": { + "is-arrayish": "^0.3.1" + } + }, + "node_modules/simple-swizzle/node_modules/is-arrayish": { + "version": "0.3.2", + "resolved": 
"https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", + "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", + "dev": true + }, + "node_modules/sisteransi": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz", + "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==", + "dev": true + }, + "node_modules/slash": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-2.0.0.tgz", + "integrity": "sha512-ZYKh3Wh2z1PpEXWr0MpSBZ0V6mZHAQfYevttO11c51CaWjGTaadiKZ+wVt1PbMlDV5qhMFslpZCemhwOK7C89A==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/slice-ansi": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-5.0.0.tgz", + "integrity": "sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.0.0", + "is-fullwidth-code-point": "^4.0.0" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/slice-ansi?sponsor=1" + } + }, + "node_modules/slice-ansi/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/slice-ansi/node_modules/is-fullwidth-code-point": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-4.0.0.tgz", + "integrity": "sha512-O4L094N2/dZ7xqVdrXhh9r1KODPJpFms8B5sGdJLPy664AgvXsreZUyCQQNItZRDlYug4xStLjNp/sz3HvBowQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/sockjs": { + "version": "0.3.24", + "resolved": "https://registry.npmjs.org/sockjs/-/sockjs-0.3.24.tgz", + "integrity": "sha512-GJgLTZ7vYb/JtPSSZ10hsOYIvEYsjbNU+zPdIHcUaWVNUEPivzxku31865sSSud0Da0W4lEeOPlmw93zLQchuQ==", + "dev": true, + "dependencies": { + "faye-websocket": "^0.11.3", + "uuid": "^8.3.2", + "websocket-driver": "^0.7.4" + } + }, + "node_modules/source-map": { + "version": "0.7.4", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.4.tgz", + "integrity": "sha512-l3BikUxvPOcn5E74dZiq5BGsTb5yEwhaTSzccU6t4sDOH8NWJCstKO5QT2CvtFoK6F0saL7p9xHAqHOlCPJygA==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, + "node_modules/source-map-support/node_modules/source-map": { + "version": "0.6.1", + "resolved": 
"https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/spdy": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/spdy/-/spdy-4.0.2.tgz", + "integrity": "sha512-r46gZQZQV+Kl9oItvl1JZZqJKGr+oEkB08A6BzkiR7593/7IbtuncXHd2YoYeTsG4157ZssMu9KYvUHLcjcDoA==", + "dev": true, + "dependencies": { + "debug": "^4.1.0", + "handle-thing": "^2.0.0", + "http-deceiver": "^1.2.7", + "select-hose": "^2.0.0", + "spdy-transport": "^3.0.0" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/spdy-transport": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/spdy-transport/-/spdy-transport-3.0.0.tgz", + "integrity": "sha512-hsLVFE5SjA6TCisWeJXFKniGGOpBgMLmerfO2aCyCU5s7nJ/rpAepqmFifv/GCbSbueEeAJJnmSQ2rKC/g8Fcw==", + "dev": true, + "dependencies": { + "debug": "^4.1.0", + "detect-node": "^2.0.4", + "hpack.js": "^2.1.6", + "obuf": "^1.1.2", + "readable-stream": "^3.0.6", + "wbuf": "^1.7.3" + } + }, + "node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/stable": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/stable/-/stable-0.1.8.tgz", + "integrity": "sha512-ji9qxRnOVfcuLDySj9qzhGSEFVobyt1kIOSkj1qZzYLzq7Tos/oUUWvotUPQLlrsidqsK6tBH89Bc9kL5zHA6w==", + "deprecated": "Modern JS already guarantees Array#sort() is a stable sort, so this library is deprecated. 
See the compatibility table on MDN: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort#browser_compatibility" + }, + "node_modules/stack-utils": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.6.tgz", + "integrity": "sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==", + "dev": true, + "dependencies": { + "escape-string-regexp": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/stack-utils/node_modules/escape-string-regexp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz", + "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/stackframe": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/stackframe/-/stackframe-1.3.4.tgz", + "integrity": "sha512-oeVtt7eWQS+Na6F//S4kJ2K2VbRlS9D43mAlMyVpVWovy9o+jfgH8O9agzANzaiLjclA0oYzUXEM4PurhSUChw==", + "dev": true + }, + "node_modules/statuses": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/stop-iteration-iterator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", + "integrity": "sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + 
"internal-slot": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dev": true, + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/string-argv": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/string-argv/-/string-argv-0.3.2.tgz", + "integrity": "sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.6.19" + } + }, + "node_modules/string-length": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", + "integrity": "sha512-+l6rNN5fYHNhZZy41RXsYptCjA2Igmq4EG7kZAYFQI1E1VTXarr6ZPXBg6eq7Y6eK4FEhY6AJlyuFIb/v/S0VQ==", + "dev": true, + "dependencies": { + "char-regex": "^1.0.2", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/string.prototype.matchall": { + "version": "4.0.12", + "resolved": "https://registry.npmjs.org/string.prototype.matchall/-/string.prototype.matchall-4.0.12.tgz", + "integrity": "sha512-6CC9uyBL+/48dYizRf7H7VAYCMCNTBeM78x/VTUe9bFEaxBepPJDa1Ow99LqI/1yF7kuy7Q3cQsYMrcjGUcskA==", + "dev": true, + "license": "MIT", 
+ "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.6", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.6", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "internal-slot": "^1.1.0", + "regexp.prototype.flags": "^1.5.3", + "set-function-name": "^2.0.2", + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.repeat": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/string.prototype.repeat/-/string.prototype.repeat-1.0.0.tgz", + "integrity": "sha512-0u/TldDbKD8bFCQ/4f5+mNRrXwZ8hg2w7ZR8wa16e8z9XpePWl3eGEcUD0OXpEH/VJH/2G3gjUtR3ZOiBe2S/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.5" + } + }, + "node_modules/string.prototype.trim": { + "version": "1.2.10", + "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz", + "integrity": "sha512-Rs66F0P/1kedk5lyYyH9uBzuiI/kNRmwJAR9quK6VOtIpZ2G+hMZd+HQbbv25MgCA6gEffoMZYxlTod4WcdrKA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "define-data-property": "^1.1.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-object-atoms": "^1.0.0", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.trimend": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.9.tgz", + "integrity": 
"sha512-G7Ok5C6E/j4SGfyLCloXTrngQIQU3PWtXGst3yM7Bea9FRURf1S42ZHlZZtsNque2FN2PoUhfZXYLNWwEr4dLQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.trimstart": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.8.tgz", + "integrity": "sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-bom": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz", + "integrity": "sha512-3xurFv5tEgii33Zi8Jtp55wEIILR9eh34FAW00PZf+JnSsTmV/ioewSgQl97JHvgjoRGwPShsWm+IdrxB35d0w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-final-newline": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz", + "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", + 
"dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/strip-indent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-3.0.0.tgz", + "integrity": "sha512-laJTa3Jb+VQpaC6DseHhF7dXVqHTfJPCRDaEbid/drOhgitgYku/letMUqOXFoWV0zIIUbjpdH2t+tYj4bQMRQ==", + "dev": true, + "dependencies": { + "min-indent": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/style-loader": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/style-loader/-/style-loader-4.0.0.tgz", + "integrity": "sha512-1V4WqhhZZgjVAVJyt7TdDPZoPBPNHbekX4fWnCJL1yQukhCeZhJySUL+gL9y6sNdN95uEOS83Y55SqHcP7MzLA==", + "dev": true, + "engines": { + "node": ">= 18.12.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.27.0" + } + }, + "node_modules/styled-components": { + "version": "6.1.13", + "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.13.tgz", + "integrity": "sha512-M0+N2xSnAtwcVAQeFEsGWFFxXDftHUD7XrKla06QbpUMmbmtFBMMTcKWvFXtWxuD5qQkB8iU5gk6QASlx2ZRMw==", + "dev": true, + "peer": true, + "dependencies": { + "@emotion/is-prop-valid": "1.2.2", + "@emotion/unitless": "0.8.1", + "@types/stylis": "4.2.5", + "css-to-react-native": "3.2.0", + "csstype": "3.1.3", + "postcss": "8.4.38", + "shallowequal": "1.1.0", + "stylis": "4.3.2", + "tslib": "2.6.2" + }, + 
"engines": { + "node": ">= 16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/styled-components" + }, + "peerDependencies": { + "react": ">= 16.8.0", + "react-dom": ">= 16.8.0" + } + }, + "node_modules/styled-components/node_modules/postcss": { + "version": "8.4.38", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.38.tgz", + "integrity": "sha512-Wglpdk03BSfXkHoQa3b/oulrotAkwrlLDRSOb9D0bN86FdRyE9lppSp33aHNPgBa0JKCoB+drFLZkQoRRYae5A==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "peer": true, + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.0.0", + "source-map-js": "^1.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/styled-components/node_modules/tslib": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", + "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==", + "dev": true, + "peer": true + }, + "node_modules/stylehacks": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/stylehacks/-/stylehacks-5.1.1.tgz", + "integrity": "sha512-sBpcd5Hx7G6seo7b1LkpttvTz7ikD0LlH5RmdcBNb6fFR0Fl7LQwHDFr300q4cwUqi+IYrFGmsIHieMBfnN/Bw==", + "dependencies": { + "browserslist": "^4.21.4", + "postcss-selector-parser": "^6.0.4" + }, + "engines": { + "node": "^10 || ^12 || >=14.0" + }, + "peerDependencies": { + "postcss": "^8.2.15" + } + }, + "node_modules/stylis": { + "version": "4.3.2", + "resolved": 
"https://registry.npmjs.org/stylis/-/stylis-4.3.2.tgz", + "integrity": "sha512-bhtUjWd/z6ltJiQwg0dUfxEJ+W+jdqQd8TbWLWyeIJHlnsqmGLRFFd8e5mA0AZi/zx90smXRlN66YMTcaSFifg==", + "dev": true, + "peer": true + }, + "node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/svg-parser": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/svg-parser/-/svg-parser-2.0.4.tgz", + "integrity": "sha512-e4hG1hRwoOdRb37cIMSgzNsxyzKfayW6VOflrwvR+/bzrkyxY/31WkbgnQpgtrNp1SdpJvpUAGTa/ZoiPNDuRQ==", + "dev": true + }, + "node_modules/svgo": { + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/svgo/-/svgo-2.8.0.tgz", + "integrity": "sha512-+N/Q9kV1+F+UeWYoSiULYo4xYSDQlTgb+ayMobAXPwMnLvop7oxKMo9OzIrX5x3eS4L4f2UHhc9axXwY8DpChg==", + "dependencies": { + "@trysound/sax": "0.2.0", + "commander": "^7.2.0", + "css-select": "^4.1.3", + "css-tree": "^1.1.3", + "csso": "^4.2.0", + "picocolors": "^1.0.0", + "stable": "^0.1.8" + }, + "bin": { + "svgo": "bin/svgo" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/svgo/node_modules/commander": { + "version": "7.2.0", + "resolved": 
"https://registry.npmjs.org/commander/-/commander-7.2.0.tgz", + "integrity": "sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==", + "engines": { + "node": ">= 10" + } + }, + "node_modules/svgo/node_modules/css-select": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-4.3.0.tgz", + "integrity": "sha512-wPpOYtnsVontu2mODhA19JrqWxNsfdatRKd64kmpRbQgh1KtItko5sTnEpPdpSaJszTOhEMlF/RPz28qj4HqhQ==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.0.1", + "domhandler": "^4.3.1", + "domutils": "^2.8.0", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/svgo/node_modules/dom-serializer": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-1.4.1.tgz", + "integrity": "sha512-VHwB3KfrcOOkelEG2ZOfxqLZdfkil8PtJi4P8N2MMXucZq2yLp75ClViUlOVwyoHEDjYU433Aq+5zWP61+RGag==", + "dependencies": { + "domelementtype": "^2.0.1", + "domhandler": "^4.2.0", + "entities": "^2.0.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/svgo/node_modules/domhandler": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-4.3.1.tgz", + "integrity": "sha512-GrwoxYN+uWlzO8uhUXRl0P+kHE4GtVPfYzVLcUxPL7KNdHKj66vvlhiweIHqYYXWlw+T8iLMp42Lm67ghw4WMQ==", + "dependencies": { + "domelementtype": "^2.2.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/svgo/node_modules/domutils": { + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-2.8.0.tgz", + "integrity": 
"sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==", + "dependencies": { + "dom-serializer": "^1.0.1", + "domelementtype": "^2.2.0", + "domhandler": "^4.2.0" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/svgo/node_modules/entities": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-2.2.0.tgz", + "integrity": "sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==", + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/swagger2openapi": { + "version": "7.0.8", + "resolved": "https://registry.npmjs.org/swagger2openapi/-/swagger2openapi-7.0.8.tgz", + "integrity": "sha512-upi/0ZGkYgEcLeGieoz8gT74oWHA0E7JivX7aN9mAf+Tc7BQoRBvnIGHoPDw+f9TXTW4s6kGYCZJtauP6OYp7g==", + "dev": true, + "dependencies": { + "call-me-maybe": "^1.0.1", + "node-fetch": "^2.6.1", + "node-fetch-h2": "^2.3.0", + "node-readfiles": "^0.2.0", + "oas-kit-common": "^1.0.8", + "oas-resolver": "^2.5.6", + "oas-schema-walker": "^1.1.5", + "oas-validator": "^5.0.8", + "reftools": "^1.1.9", + "yaml": "^1.10.0", + "yargs": "^17.0.1" + }, + "bin": { + "boast": "boast.js", + "oas-validate": "oas-validate.js", + "swagger2openapi": "swagger2openapi.js" + }, + "funding": { + "url": "https://github.com/Mermade/oas-kit?sponsor=1" + } + }, + "node_modules/synckit": { + "version": "0.11.8", + "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", + "integrity": "sha512-+XZ+r1XGIJGeQk3VvXhT6xx/VpbHsRzsTkGgF6E5RX9TTXD0118l87puaEBZ566FhqblC6U0d4XnubznJDm30A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@pkgr/core": "^0.2.4" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "funding": { + "url": 
"https://opencollective.com/synckit" + } + }, + "node_modules/tapable": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.1.tgz", + "integrity": "sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==", + "engines": { + "node": ">=6" + } + }, + "node_modules/terser": { + "version": "5.36.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.36.0.tgz", + "integrity": "sha512-IYV9eNMuFAV4THUspIRXkLakHnV6XO7FEdtKjf/mDyrnqUg9LnlOn6/RwRvM9SZjR4GUq8Nk8zj67FzVARr74w==", + "dependencies": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.8.2", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + }, + "bin": { + "terser": "bin/terser" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/terser-webpack-plugin": { + "version": "5.3.10", + "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.10.tgz", + "integrity": "sha512-BKFPWlPDndPs+NGGCr1U59t0XScL5317Y0UReNrHaw9/FwhPENlq6bfgs+4yPfyP51vqC1bQ4rp1EfXW5ZSH9w==", + "dependencies": { + "@jridgewell/trace-mapping": "^0.3.20", + "jest-worker": "^27.4.5", + "schema-utils": "^3.1.1", + "serialize-javascript": "^6.0.1", + "terser": "^5.26.0" + }, + "engines": { + "node": ">= 10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.1.0" + }, + "peerDependenciesMeta": { + "@swc/core": { + "optional": true + }, + "esbuild": { + "optional": true + }, + "uglify-js": { + "optional": true + } + } + }, + "node_modules/terser-webpack-plugin/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": 
"sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/terser-webpack-plugin/node_modules/ajv-keywords": { + "version": "3.5.2", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", + "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==", + "peerDependencies": { + "ajv": "^6.9.1" + } + }, + "node_modules/terser-webpack-plugin/node_modules/jest-worker": { + "version": "27.5.1", + "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-27.5.1.tgz", + "integrity": "sha512-7vuh85V5cdDofPyxn58nrPjBktZo0u9x1g8WtjQol+jZDaE+fhN+cIvTj11GndBnMnyfrUOG1sZQxCdjKh+DKg==", + "dependencies": { + "@types/node": "*", + "merge-stream": "^2.0.0", + "supports-color": "^8.0.0" + }, + "engines": { + "node": ">= 10.13.0" + } + }, + "node_modules/terser-webpack-plugin/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" + }, + "node_modules/terser-webpack-plugin/node_modules/schema-utils": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz", + "integrity": "sha512-pN/yOAvcC+5rQ5nERGuwrjLlYvLTbCibnZ1I7B1LaiAz9BRBlE9GMgE/eqV30P7aJQUf7Ddimy/RsbYO/GrVGg==", + "dependencies": { + "@types/json-schema": "^7.0.8", + "ajv": "^6.12.5", + "ajv-keywords": "^3.5.2" + }, + "engines": { + "node": ">= 
10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + } + }, + "node_modules/terser-webpack-plugin/node_modules/supports-color": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", + "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, + "node_modules/terser/node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==" + }, + "node_modules/test-exclude": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz", + "integrity": "sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==", + "dev": true, + "dependencies": { + "@istanbuljs/schema": "^0.1.2", + "glob": "^7.1.4", + "minimatch": "^3.0.4" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/text-table": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", + "integrity": "sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==", + "dev": true + }, + "node_modules/thingies": { + "version": "1.21.0", + "resolved": "https://registry.npmjs.org/thingies/-/thingies-1.21.0.tgz", + "integrity": "sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g==", + "dev": true, + "engines": { + "node": ">=10.18" + }, 
+ "peerDependencies": { + "tslib": "^2" + } + }, + "node_modules/thunky": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/thunky/-/thunky-1.1.0.tgz", + "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==", + "dev": true + }, + "node_modules/tmpl": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", + "integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==", + "dev": true + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/toidentifier": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", + "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.6" + } + }, + "node_modules/toposort": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/toposort/-/toposort-2.0.2.tgz", + "integrity": "sha512-0a5EOkAUp8D4moMi2W8ZF8jcga7BgZd91O/yabJCFY8az+XSzeGyTKs0Aoo897iV1Nj6guFq8orWDS96z91oGg==" + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, + "node_modules/tree-dump": { + "version": "1.0.2", + "resolved": 
"https://registry.npmjs.org/tree-dump/-/tree-dump-1.0.2.tgz", + "integrity": "sha512-dpev9ABuLWdEubk+cIaI9cHwRNNDjkBBLXTwI4UCUFdQ5xXKqNXoK4FEciw/vxf+NQ7Cb7sGUyeUtORvHIdRXQ==", + "dev": true, + "engines": { + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": "2" + } + }, + "node_modules/ts-api-utils": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.1.0.tgz", + "integrity": "sha512-CUgTZL1irw8u29bzrOD/nH85jqyc74D6SshFgujOIA7osm2Rz7dYH77agkx7H4FBNxDq7Cjf+IjaX/8zwFW+ZQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.12" + }, + "peerDependencies": { + "typescript": ">=4.8.4" + } + }, + "node_modules/ts-jest": { + "version": "29.2.5", + "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-29.2.5.tgz", + "integrity": "sha512-KD8zB2aAZrcKIdGk4OwpJggeLcH1FgrICqDSROWqlnJXGCXK4Mn6FcdK2B6670Xr73lHMG1kHw8R87A0ecZ+vA==", + "dev": true, + "dependencies": { + "bs-logger": "^0.2.6", + "ejs": "^3.1.10", + "fast-json-stable-stringify": "^2.1.0", + "jest-util": "^29.0.0", + "json5": "^2.2.3", + "lodash.memoize": "^4.1.2", + "make-error": "^1.3.6", + "semver": "^7.6.3", + "yargs-parser": "^21.1.1" + }, + "bin": { + "ts-jest": "cli.js" + }, + "engines": { + "node": "^14.15.0 || ^16.10.0 || ^18.0.0 || >=20.0.0" + }, + "peerDependencies": { + "@babel/core": ">=7.0.0-beta.0 <8", + "@jest/transform": "^29.0.0", + "@jest/types": "^29.0.0", + "babel-jest": "^29.0.0", + "jest": "^29.0.0", + "typescript": ">=4.3 <6" + }, + "peerDependenciesMeta": { + "@babel/core": { + "optional": true + }, + "@jest/transform": { + "optional": true + }, + "@jest/types": { + "optional": true + }, + "babel-jest": { + "optional": true + }, + "esbuild": { + "optional": true + } + } + }, + "node_modules/ts-jest/node_modules/semver": { + 
"version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/ts-node": { + "version": "10.9.2", + "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz", + "integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==", + "dev": true, + "dependencies": { + "@cspotcode/source-map-support": "^0.8.0", + "@tsconfig/node10": "^1.0.7", + "@tsconfig/node12": "^1.0.7", + "@tsconfig/node14": "^1.0.0", + "@tsconfig/node16": "^1.0.2", + "acorn": "^8.4.1", + "acorn-walk": "^8.1.1", + "arg": "^4.1.0", + "create-require": "^1.1.0", + "diff": "^4.0.1", + "make-error": "^1.1.1", + "v8-compile-cache-lib": "^3.0.1", + "yn": "3.1.1" + }, + "bin": { + "ts-node": "dist/bin.js", + "ts-node-cwd": "dist/bin-cwd.js", + "ts-node-esm": "dist/bin-esm.js", + "ts-node-script": "dist/bin-script.js", + "ts-node-transpile-only": "dist/bin-transpile.js", + "ts-script": "dist/bin-script-deprecated.js" + }, + "peerDependencies": { + "@swc/core": ">=1.2.50", + "@swc/wasm": ">=1.2.50", + "@types/node": "*", + "typescript": ">=2.7" + }, + "peerDependenciesMeta": { + "@swc/core": { + "optional": true + }, + "@swc/wasm": { + "optional": true + } + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/tsyringe": { + "version": "4.10.0", + "resolved": "https://registry.npmjs.org/tsyringe/-/tsyringe-4.10.0.tgz", + "integrity": 
"sha512-axr3IdNuVIxnaK5XGEUFTu3YmAQ6lllgrvqfEoR16g/HGnYY/6We4oWENtAnzK6/LpJ2ur9PAb80RBt7/U4ugw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tslib": "^1.9.3" + }, + "engines": { + "node": ">= 6.0.0" + } + }, + "node_modules/tsyringe/node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "dev": true, + "license": "0BSD" + }, + "node_modules/type-check": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", + "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", + "dev": true, + "dependencies": { + "prelude-ls": "^1.2.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/type-detect": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", + "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/type-fest": { + "version": "0.21.3", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.21.3.tgz", + "integrity": "sha512-t0rzBq87m3fVcduHDUFhKmyyX+9eo6WQjZvf51Ea/M0Q7+T374Jp1aUiyUl0GKxp8M/OETVHSDvmkyPgvX+X2w==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/type-is": { + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "media-typer": "0.3.0", + "mime-types": "~2.1.24" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/typed-array-buffer": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", + "integrity": "sha512-nAYYwfY3qnzX30IkA6AQZjVbtK6duGontcQm1WSG1MD94YLqK0515GNApXkoxKOWMusVssAHWLh9SeaoefYFGw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-typed-array": "^1.1.14" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/typed-array-byte-length": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.3.tgz", + "integrity": "sha512-BaXgOuIxz8n8pIq3e7Atg/7s+DpiYrxn4vdot3w9KbnBhcRQq6o3xemQdIfynqSeXeDrF32x+WvfzmOjPiY9lg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "for-each": "^0.3.3", + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + "is-typed-array": "^1.1.14" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/typed-array-byte-offset": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.4.tgz", + "integrity": "sha512-bTlAFB/FBYMcuX81gbL4OcpH5PmlFHqlCCpAl8AlEzMz5k53oNDvN8p1PNOWLEmI2x4orp3raOFB51tv9X+MFQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.8", + "for-each": "^0.3.3", + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + "is-typed-array": "^1.1.15", + "reflect.getprototypeof": "^1.0.9" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/typed-array-length": { + "version": "1.0.7", + 
"resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.7.tgz", + "integrity": "sha512-3KS2b+kL7fsuk/eJZ7EQdnEmQoaho/r6KUef7hxvltNA5DR8NAUM+8wJMbJyZ4G9/7i3v5zPBIMN5aybAh2/Jg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "for-each": "^0.3.3", + "gopd": "^1.0.1", + "is-typed-array": "^1.1.13", + "possible-typed-array-names": "^1.0.0", + "reflect.getprototypeof": "^1.0.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/typescript": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.2.tgz", + "integrity": "sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==", + "devOptional": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/unbox-primitive": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", + "integrity": "sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-bigints": "^1.0.2", + "has-symbols": "^1.1.0", + "which-boxed-primitive": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/undici": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.23.0.tgz", + "integrity": "sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, + "node_modules/undici-types": { + "version": 
"6.20.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz", + "integrity": "sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==" + }, + "node_modules/unicode-canonical-property-names-ecmascript": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.1.tgz", + "integrity": "sha512-dA8WbNeb2a6oQzAQ55YlT5vQAWGV9WXOsi3SskE3bcCdM0P4SDd+24zS/OCacdRq5BkdsRj9q3Pg6YyQoxIGqg==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-match-property-ecmascript": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/unicode-match-property-ecmascript/-/unicode-match-property-ecmascript-2.0.0.tgz", + "integrity": "sha512-5kaZCrbp5mmbz5ulBkDkbY0SsPOjKqVS35VpL9ulMPfSl0J0Xsm+9Evphv9CoIZFwre7aJoa94AY6seMKGVN5Q==", + "dev": true, + "dependencies": { + "unicode-canonical-property-names-ecmascript": "^2.0.0", + "unicode-property-aliases-ecmascript": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-match-property-value-ecmascript": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/unicode-match-property-value-ecmascript/-/unicode-match-property-value-ecmascript-2.2.0.tgz", + "integrity": "sha512-4IehN3V/+kkr5YeSSDDQG8QLqO26XpL2XP3GQtqwlT/QYSECAwFztxVHjlbh0+gjJ3XmNLS0zDsbgs9jWKExLg==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-property-aliases-ecmascript": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-2.1.0.tgz", + "integrity": "sha512-6t3foTQI9qne+OZoVQB/8x8rk2k1eVy1gRXhV3oFQ5T6R1dqQ1xtin3XqSlx3+ATBkliTaR/hHyJBm+LVPNM8w==", + "dev": true, + "engines": { + "node": 
">=4" + } + }, + "node_modules/universalify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", + "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", + "dev": true, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz", + "integrity": "sha512-R8UzCaa9Az+38REPiJ1tXlImTJXlVfgHZsglwBD/k6nj76ctsH1E3q4doGrukiLQd3sGQYu56r5+lo5r94l29A==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.0" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/uri-js": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", + "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", + "dependencies": { + "punycode": "^2.1.0" + } + }, + "node_modules/use-sync-external-store": { + "version": "1.2.2", + "resolved": 
"https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.2.tgz", + "integrity": "sha512-PElTlVMwpblvbNqQ82d2n6RjStvdSoNe9FG28kNfz3WiXilJm4DdNkEzRhCZuIDwY8U08WVihhGR5iRqAwfDiw==", + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" + }, + "node_modules/utila": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/utila/-/utila-0.4.0.tgz", + "integrity": "sha512-Z0DbgELS9/L/75wZbro8xAnT50pBVFQZ+hUEueGDU5FN51YSCYM+jdxsfCiHjwNP/4LCDD0i/graKpeBnOXKRA==", + "dev": true + }, + "node_modules/utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/uuid": { + "version": "8.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", + "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==", + "dev": true, + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/v8-compile-cache-lib": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", + "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", + "dev": true + }, + "node_modules/v8-to-istanbul": { + "version": "9.3.0", + "resolved": 
"https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.3.0.tgz", + "integrity": "sha512-kiGUalWN+rgBJ/1OHZsBtU4rXZOfj/7rKQxULKlIzwzQSvMJUUNgPwJEEh7gU6xEVxC0ahoOBvN2YI8GH6FNgA==", + "dev": true, + "dependencies": { + "@jridgewell/trace-mapping": "^0.3.12", + "@types/istanbul-lib-coverage": "^2.0.1", + "convert-source-map": "^2.0.0" + }, + "engines": { + "node": ">=10.12.0" + } + }, + "node_modules/vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/void-elements": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/void-elements/-/void-elements-3.1.0.tgz", + "integrity": "sha512-Dhxzh5HZuiHQhbvTW9AMetFfBHDMYpo23Uo9btPXgdYP+3T5S+p+jgNy7spra+veYhBP2dCSgxR/i2Y02h5/6w==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/walker": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz", + "integrity": "sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==", + "dev": true, + "dependencies": { + "makeerror": "1.0.12" + } + }, + "node_modules/watchpack": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.2.tgz", + "integrity": "sha512-TnbFSbcOCcDgjZ4piURLCbJ3nJhznVh9kw6F6iokjiFPl8ONxe9A6nMDVXDiNbrSfLILs6vB07F7wLBrwPYzJw==", + "dependencies": { + "glob-to-regexp": "^0.4.1", + "graceful-fs": "^4.1.2" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/wbuf": { + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/wbuf/-/wbuf-1.7.3.tgz", + "integrity": 
"sha512-O84QOnr0icsbFGLS0O3bI5FswxzRr8/gHwWkDlQFskhSPryQXvrTMxjxGP4+iWYoauLoBvfDpkrOauZ+0iZpDA==", + "dev": true, + "dependencies": { + "minimalistic-assert": "^1.0.0" + } + }, + "node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "engines": { + "node": ">= 14" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/webpack": { + "version": "5.96.1", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.96.1.tgz", + "integrity": "sha512-l2LlBSvVZGhL4ZrPwyr8+37AunkcYj5qh8o6u2/2rzoPc8gxFJkLj1WxNgooi9pnoc06jh0BjuXnamM4qlujZA==", + "dependencies": { + "@types/eslint-scope": "^3.7.7", + "@types/estree": "^1.0.6", + "@webassemblyjs/ast": "^1.12.1", + "@webassemblyjs/wasm-edit": "^1.12.1", + "@webassemblyjs/wasm-parser": "^1.12.1", + "acorn": "^8.14.0", + "browserslist": "^4.24.0", + "chrome-trace-event": "^1.0.2", + "enhanced-resolve": "^5.17.1", + "es-module-lexer": "^1.2.1", + "eslint-scope": "5.1.1", + "events": "^3.2.0", + "glob-to-regexp": "^0.4.1", + "graceful-fs": "^4.2.11", + "json-parse-even-better-errors": "^2.3.1", + "loader-runner": "^4.2.0", + "mime-types": "^2.1.27", + "neo-async": "^2.6.2", + "schema-utils": "^3.2.0", + "tapable": "^2.1.1", + "terser-webpack-plugin": "^5.3.10", + "watchpack": "^2.4.1", + "webpack-sources": "^3.2.3" + }, + "bin": { + "webpack": "bin/webpack.js" + }, + "engines": { + "node": ">=10.13.0" + }, + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/webpack" + }, + "peerDependenciesMeta": { + "webpack-cli": { + "optional": true + } + } + }, + "node_modules/webpack-cli": { + "version": "5.1.4", + "resolved": "https://registry.npmjs.org/webpack-cli/-/webpack-cli-5.1.4.tgz", + "integrity": "sha512-pIDJHIEI9LR0yxHXQ+Qh95k2EvXpWzZ5l+d+jIo+RdSm9MiHfzazIxwwni/p7+x4eJZuvG1AJwgC4TNQ7NRgsg==", + "dev": true, + "dependencies": { + "@discoveryjs/json-ext": "^0.5.0", + "@webpack-cli/configtest": "^2.1.1", + "@webpack-cli/info": "^2.0.2", + "@webpack-cli/serve": "^2.0.5", + "colorette": "^2.0.14", + "commander": "^10.0.1", + "cross-spawn": "^7.0.3", + "envinfo": "^7.7.3", + "fastest-levenshtein": "^1.0.12", + "import-local": "^3.0.2", + "interpret": "^3.1.1", + "rechoir": "^0.8.0", + "webpack-merge": "^5.7.3" + }, + "bin": { + "webpack-cli": "bin/cli.js" + }, + "engines": { + "node": ">=14.15.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "5.x.x" + }, + "peerDependenciesMeta": { + "@webpack-cli/generators": { + "optional": true + }, + "webpack-bundle-analyzer": { + "optional": true + }, + "webpack-dev-server": { + "optional": true + } + } + }, + "node_modules/webpack-cli/node_modules/commander": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-10.0.1.tgz", + "integrity": "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==", + "dev": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/webpack-cli/node_modules/webpack-merge": { + "version": "5.10.0", + "resolved": "https://registry.npmjs.org/webpack-merge/-/webpack-merge-5.10.0.tgz", + "integrity": "sha512-+4zXKdx7UnO+1jaN4l2lHVD+mFvnlZQP/6ljaJVb4SZiwIKeUnrT5l0gkT8z+n4hKpC+jpOv6O9R+gLtag7pSA==", + "dev": true, + "dependencies": { + 
"clone-deep": "^4.0.1", + "flat": "^5.0.2", + "wildcard": "^2.0.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/webpack-dev-middleware": { + "version": "7.4.2", + "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-7.4.2.tgz", + "integrity": "sha512-xOO8n6eggxnwYpy1NlzUKpvrjfJTvae5/D6WOK0S2LSo7vjmo5gCM1DbLUmFqrMTJP+W/0YZNctm7jasWvLuBA==", + "dev": true, + "dependencies": { + "colorette": "^2.0.10", + "memfs": "^4.6.0", + "mime-types": "^2.1.31", + "on-finished": "^2.4.1", + "range-parser": "^1.2.1", + "schema-utils": "^4.0.0" + }, + "engines": { + "node": ">= 18.12.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.0.0" + }, + "peerDependenciesMeta": { + "webpack": { + "optional": true + } + } + }, + "node_modules/webpack-dev-middleware/node_modules/memfs": { + "version": "4.14.0", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.14.0.tgz", + "integrity": "sha512-JUeY0F/fQZgIod31Ja1eJgiSxLn7BfQlCnqhwXFBzFHEw63OdLK7VJUJ7bnzNsWgCyoUP5tEp1VRY8rDaYzqOA==", + "dev": true, + "dependencies": { + "@jsonjoy.com/json-pack": "^1.0.3", + "@jsonjoy.com/util": "^1.3.0", + "tree-dump": "^1.0.1", + "tslib": "^2.0.0" + }, + "engines": { + "node": ">= 4.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + } + }, + "node_modules/webpack-dev-server": { + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-5.2.3.tgz", + "integrity": "sha512-9Gyu2F7+bg4Vv+pjbovuYDhHX+mqdqITykfzdM9UyKqKHlsE5aAjRhR+oOEfXW5vBeu8tarzlJFIZva4ZjAdrQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/bonjour": "^3.5.13", + "@types/connect-history-api-fallback": "^1.5.4", + "@types/express": "^4.17.25", + 
"@types/express-serve-static-core": "^4.17.21", + "@types/serve-index": "^1.9.4", + "@types/serve-static": "^1.15.5", + "@types/sockjs": "^0.3.36", + "@types/ws": "^8.5.10", + "ansi-html-community": "^0.0.8", + "bonjour-service": "^1.2.1", + "chokidar": "^3.6.0", + "colorette": "^2.0.10", + "compression": "^1.8.1", + "connect-history-api-fallback": "^2.0.0", + "express": "^4.22.1", + "graceful-fs": "^4.2.6", + "http-proxy-middleware": "^2.0.9", + "ipaddr.js": "^2.1.0", + "launch-editor": "^2.6.1", + "open": "^10.0.3", + "p-retry": "^6.2.0", + "schema-utils": "^4.2.0", + "selfsigned": "^5.5.0", + "serve-index": "^1.9.1", + "sockjs": "^0.3.24", + "spdy": "^4.0.2", + "webpack-dev-middleware": "^7.4.2", + "ws": "^8.18.0" + }, + "bin": { + "webpack-dev-server": "bin/webpack-dev-server.js" + }, + "engines": { + "node": ">= 18.12.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + }, + "peerDependencies": { + "webpack": "^5.0.0" + }, + "peerDependenciesMeta": { + "webpack": { + "optional": true + }, + "webpack-cli": { + "optional": true + } + } + }, + "node_modules/webpack-dev-server/node_modules/@types/express-serve-static-core": { + "version": "4.19.8", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-4.19.8.tgz", + "integrity": "sha512-02S5fmqeoKzVZCHPZid4b8JH2eM5HzQLZWN2FohQEy/0eXTq8VXZfSN6Pcr3F6N9R/vNrj7cpgbhjie6m/1tCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, + "node_modules/webpack-dev-server/node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "dev": true, + 
"engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/webpack-dev-server/node_modules/is-wsl": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz", + "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==", + "dev": true, + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/webpack-dev-server/node_modules/open": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.1.0.tgz", + "integrity": "sha512-mnkeQ1qP5Ue2wd+aivTD3NHd/lZ96Lu0jgf0pwktLPtx6cTZiH7tyeGRRHs0zX0rbrahXPnXlUnbeXyaBBuIaw==", + "dev": true, + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/webpack-merge": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/webpack-merge/-/webpack-merge-6.0.1.tgz", + "integrity": "sha512-hXXvrjtx2PLYx4qruKl+kyRSLc52V+cCvMxRjmKwoA+CBbbF5GfIBtR6kCvl0fYGqTUPKB+1ktVmTHqMOzgCBg==", + "dev": true, + "dependencies": { + "clone-deep": "^4.0.1", + "flat": "^5.0.2", + "wildcard": "^2.0.1" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/webpack-nano": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/webpack-nano/-/webpack-nano-1.1.1.tgz", + "integrity": "sha512-3dypjHtWQylpgsYmaovsUgFmXmDBoArq8aNMvDjmhVRpaA8MQsYxMiXgsp09GYL3n5y+1PfscytrFW3IXknlIQ==", + "dev": true, + "dependencies": { + "chalk": "^4.1.0", 
+ "import-local": "^3.0.2", + "rechoir": "^0.7.0", + "yargs-parser": "^20.2.1" + }, + "bin": { + "wp": "bin/wp.js" + }, + "engines": { + "node": ">= 10.0.0" + }, + "peerDependencies": { + "webpack": ">=4.20.2" + } + }, + "node_modules/webpack-nano/node_modules/rechoir": { + "version": "0.7.1", + "resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.7.1.tgz", + "integrity": "sha512-/njmZ8s1wVeR6pjTZ+0nCnv8SpZNRMT2D1RLOJQESlYFDBvwpTA4KWJpZ+sBJ4+vhjILRcK7JIFdGCdxEAAitg==", + "dev": true, + "dependencies": { + "resolve": "^1.9.0" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/webpack-nano/node_modules/yargs-parser": { + "version": "20.2.9", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.9.tgz", + "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/webpack-sources": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.2.3.tgz", + "integrity": "sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==", + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/webpack/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/webpack/node_modules/ajv-keywords": { + "version": "3.5.2", + "resolved": 
"https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", + "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==", + "peerDependencies": { + "ajv": "^6.9.1" + } + }, + "node_modules/webpack/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" + }, + "node_modules/webpack/node_modules/schema-utils": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz", + "integrity": "sha512-pN/yOAvcC+5rQ5nERGuwrjLlYvLTbCibnZ1I7B1LaiAz9BRBlE9GMgE/eqV30P7aJQUf7Ddimy/RsbYO/GrVGg==", + "dependencies": { + "@types/json-schema": "^7.0.8", + "ajv": "^6.12.5", + "ajv-keywords": "^3.5.2" + }, + "engines": { + "node": ">= 10.13.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" + } + }, + "node_modules/websocket-driver": { + "version": "0.7.4", + "resolved": "https://registry.npmjs.org/websocket-driver/-/websocket-driver-0.7.4.tgz", + "integrity": "sha512-b17KeDIQVjvb0ssuSDF2cYXSg2iztliJ4B9WdsuB6J952qCPKmnVq4DyW5motImXHDC1cBT/1UezrJVsKw5zjg==", + "dev": true, + "dependencies": { + "http-parser-js": ">=0.5.1", + "safe-buffer": ">=5.1.0", + "websocket-extensions": ">=0.1.1" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/websocket-extensions": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/websocket-extensions/-/websocket-extensions-0.1.4.tgz", + "integrity": "sha512-OqedPIGOfsDlo31UNwYbCFMSaO9m9G/0faIHj5/dZFDMFqPTcx6UwqyOy3COEaEOg/9VsGIpdqn62W5KhoKSpg==", + "dev": true, + "engines": { + "node": ">=0.8.0" + } + }, + 
"node_modules/weekstart": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/weekstart/-/weekstart-1.1.0.tgz", + "integrity": "sha512-ZO3I7c7J9nwGN1PZKZeBYAsuwWEsCOZi5T68cQoVNYrzrpp5Br0Bgi0OF4l8kH/Ez7nKfxa5mSsXjsgris3+qg==" + }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "dev": true, + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "dev": true, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/which-boxed-primitive": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.1.1.tgz", + "integrity": 
"sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-bigint": "^1.1.0", + "is-boolean-object": "^1.2.1", + "is-number-object": "^1.1.1", + "is-string": "^1.1.1", + "is-symbol": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-builtin-type": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.2.1.tgz", + "integrity": "sha512-6iBczoX+kDQ7a3+YJBnh3T+KZRxM/iYNPXicqk66/Qfm1b93iu+yOImkg0zHbj5LNOcNv1TEADiZ0xa34B4q6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "function.prototype.name": "^1.1.6", + "has-tostringtag": "^1.0.2", + "is-async-function": "^2.0.0", + "is-date-object": "^1.1.0", + "is-finalizationregistry": "^1.1.0", + "is-generator-function": "^1.0.10", + "is-regex": "^1.2.1", + "is-weakref": "^1.0.2", + "isarray": "^2.0.5", + "which-boxed-primitive": "^1.1.0", + "which-collection": "^1.0.2", + "which-typed-array": "^1.1.16" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-collection": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz", + "integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-map": "^2.0.3", + "is-set": "^2.0.3", + "is-weakmap": "^2.0.2", + "is-weakset": "^2.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-typed-array": { + "version": "1.1.20", + "resolved": 
"https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.20.tgz", + "integrity": "sha512-LYfpUkmqwl0h9A2HL09Mms427Q1RZWuOHsukfVcKRq9q95iQxdw0ix1JQrqbcDR9PH1QDwf5Qo8OZb5lksZ8Xg==", + "dev": true, + "license": "MIT", + "dependencies": { + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "for-each": "^0.3.5", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/wildcard": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/wildcard/-/wildcard-2.0.1.tgz", + "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==", + "dev": true + }, + "node_modules/word-wrap": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz", + "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/wrap-ansi": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": 
"sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true + }, + "node_modules/write-file-atomic": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-4.0.2.tgz", + "integrity": "sha512-7KxauUdBmSdWnmpaGFg+ppNjKF8uNLry8LyzjauQDOVONfFLNKrKvQOxZ/VuTIcS/gge/YNahf5RIIQWTSarlg==", + "dev": true, + "dependencies": { + "imurmurhash": "^0.1.4", + "signal-exit": "^3.0.7" + }, + "engines": { + "node": "^12.13.0 || ^14.15.0 || >=16.0.0" + } + }, + "node_modules/ws": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "dev": true, + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, + "node_modules/xml2js": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.6.2.tgz", + "integrity": "sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==", + "dev": true, + "dependencies": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": 
"sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true + }, + "node_modules/yaml": { + "version": "1.10.2", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-1.10.2.tgz", + "integrity": "sha512-r3vXyErRCYJ7wg28yvBY5VSoAF8ZvlcW9/BwUzEtUsjvX/DKs24dIkuwjtuprwJJHsbyUbLApepYTR1BN4uHrg==", + "engines": { + "node": ">= 6" + } + }, + "node_modules/yargs": { + "version": "17.7.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", + "dev": true, + "dependencies": { + "cliui": "^8.0.1", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "require-directory": "^2.1.1", + "string-width": "^4.2.3", + "y18n": "^5.0.5", + "yargs-parser": "^21.1.1" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs-parser": { + "version": "21.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", + "dev": true, + "engines": { + "node": ">=12" + } + }, + "node_modules/yn": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", + "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/yocto-queue": { + "version": "0.1.0", + "resolved": 
"https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/yup": { + "version": "0.32.11", + "resolved": "https://registry.npmjs.org/yup/-/yup-0.32.11.tgz", + "integrity": "sha512-Z2Fe1bn+eLstG8DRR6FTavGD+MeAwyfmouhHsIUgaADz8jvFKbO/fXc2trJKZg+5EBjh4gGm3iU/t3onKlXHIg==", + "dependencies": { + "@babel/runtime": "^7.15.4", + "@types/lodash": "^4.14.175", + "lodash": "^4.17.21", + "lodash-es": "^4.17.21", + "nanoclone": "^0.2.1", + "property-expr": "^2.0.4", + "toposort": "^2.0.2" + }, + "engines": { + "node": ">=10" + } + } + } +} diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000000..f4dfc7c09e --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,139 @@ +{ + "name": "dstackai", + "version": "2.0.0", + "main": "src/index.ts", + "repository": "git@gitlab.com:dstackai/dstackai-website.git", + "author": "Oleg Vavilov", + "license": "Apache 2.0", + "private": true, + "scripts": { + "start": "cross-env NODE_ENV=development webpack serve --config webpack.config.js", + "start-sky": "cross-env NODE_ENV=development UI_VERSION=sky webpack serve --config webpack.config.js", + "build": "cross-env NODE_ENV=production webpack build --config webpack.config.js", + "build-sky": "cross-env NODE_ENV=production UI_VERSION=sky webpack build --config webpack.config.js", + "eslint": "eslint ./src --ext .js,.jsx,.ts,.tsx", + "eslint-fix": "eslint ./src --ext .js,.jsx,.ts,.tsx --fix", + "test": "jest", + "test:update-snapshots": "jest -u", + "generate-api": "npx @rtk-query/codegen-openapi openapi-config.ts", + "pre-commit": "lint-staged" + }, + "devDependencies": { + "@babel/cli": "^7.25.9", + "@babel/core": 
"^7.26.0", + "@babel/plugin-proposal-class-properties": "^7.18.6", + "@babel/plugin-transform-runtime": "^7.25.9", + "@babel/preset-env": "^7.26.0", + "@babel/preset-react": "^7.25.9", + "@babel/preset-typescript": "^7.26.0", + "@babel/register": "^7.25.9", + "@cfaester/enzyme-adapter-react-18": "^0.8.0", + "@eslint/eslintrc": "^3.3.1", + "@eslint/js": "^9.28.0", + "@pmmmwh/react-refresh-webpack-plugin": "^0.5.15", + "@rtk-query/codegen-openapi": "^2.0.0", + "@svgr/webpack": "^6.5.1", + "@testing-library/jest-dom": "^6.6.3", + "@testing-library/react": "^16.0.1", + "@testing-library/user-event": "^14.5.2", + "@types/axios": "^0.14.0", + "@types/date-fns": "^2.6.0", + "@types/enzyme": "^3.10.18", + "@types/jest": "^29.5.14", + "@types/js-yaml": "^4.0.9", + "@types/lodash": "^4.17.13", + "@types/node": "^22.10.1", + "@types/react": "^18.3.12", + "@types/react-dom": "^18.3.1", + "@types/react-helmet": "^6.1.11", + "@types/react-redux": "^7.1.34", + "@types/react-router-dom": "^5.3.3", + "@types/react-test-renderer": "^18.3.0", + "@typescript-eslint/eslint-plugin": "^8.33.1", + "@typescript-eslint/parser": "^8.33.1", + "@webpack-cli/serve": "^2.0.5", + "babel-loader": "^9.2.1", + "babel-preset-minify": "^0.5.2", + "circular-dependency-plugin": "^5.2.2", + "copy-webpack-plugin": "^11.0.0", + "cross-env": "^7.0.3", + "css-loader": "^6.7.3", + "enzyme": "^3.11.0", + "eslint": "^9.39.2", + "eslint-config-prettier": "^10.1.5", + "eslint-plugin-i18n": "^2.4.0", + "eslint-plugin-prettier": "^5.4.1", + "eslint-plugin-react": "^7.37.5", + "eslint-plugin-simple-import-sort": "^12.1.1", + "favicons": "^7.2.0", + "favicons-webpack-plugin": "^6.0.1", + "file-loader": "^6.2.0", + "html-webpack-plugin": "^5.6.3", + "http-proxy-middleware": "^2.0.6", + "identity-obj-proxy": "^3.0.0", + "jest": "^29.7.0", + "jest-styled-components": "^7.2.0", + "lint-staged": "^16.1.2", + "loader-utils": "^3.3.1", + "mini-css-extract-plugin": "^2.9.2", + "npx": "^10.2.2", + "postcss": "^8.4.49", + 
"postcss-loader": "^7.0.2", + "postcss-preset-env": "7.8.3", + "prettier": "^3.5.3", + "react-dev-utils": "^12.0.1", + "react-refresh": "^0.14.2", + "react-test-renderer": "^18.3.1", + "resolve-url-loader": "^5.0.0", + "sass": "^1.81.0", + "sass-loader": "^16.0.3", + "style-loader": "^4.0.0", + "ts-jest": "^29.2.5", + "ts-node": "^10.9.2", + "typescript": "^5.7.2", + "webpack": "^5.96.1", + "webpack-cli": "^5.1.4", + "webpack-dev-server": "^5.1.0", + "webpack-merge": "^6.0.1", + "webpack-nano": "^1.1.1" + }, + "dependencies": { + "@cloudscape-design/chat-components": "^1.0.62", + "@cloudscape-design/collection-hooks": "^1.0.74", + "@cloudscape-design/component-toolkit": "^1.0.0-beta.120", + "@cloudscape-design/components": "^3.0.1188", + "@cloudscape-design/design-tokens": "^3.0.60", + "@cloudscape-design/global-styles": "^1.0.45", + "@hookform/resolvers": "^2.9.10", + "@reduxjs/toolkit": "^1.9.1", + "@types/yup": "^0.29.14", + "ace-builds": "^1.36.3", + "classnames": "^2.5.1", + "css-minimizer-webpack-plugin": "^4.2.2", + "date-fns": "^2.29.3", + "i18next": "^24.0.2", + "js-yaml": "^4.1.0", + "lodash": "^4.17.21", + "openai": "^4.33.1", + "prismjs": "^1.30.0", + "rc-tooltip": "^5.2.2", + "react": "^18.3.1", + "react-avatar": "^5.0.3", + "react-bus": "^4.0.1", + "react-dom": "^18.3.1", + "react-helmet": "^6.1.0", + "react-hook-form": "^7.53.0", + "react-i18next": "^12.1.4", + "react-redux": "^8.0.5", + "react-router-dom": "^6.27.0", + "react-string-replace": "^1.1.1", + "redux": "^5.0.1", + "yup": "^0.32.11" + }, + "lint-staged": { + "*.{js,jsx,ts,tsx}": [ + "eslint --fix --max-warnings=0 --no-warn-ignored", + "git add" + ] + } +} diff --git a/frontend/public/index.html b/frontend/public/index.html new file mode 100644 index 0000000000..c661f9de7b --- /dev/null +++ b/frontend/public/index.html @@ -0,0 +1,43 @@ + + + + + + + + <%= htmlWebpackPlugin.options.title %> + + + + + + + + <%= htmlWebpackPlugin.options.customHeadHTML %> + + + + +
+ + + + diff --git a/docs/assets/images/dstack-logo-notext.svg b/frontend/public/logo-notext.svg similarity index 100% rename from docs/assets/images/dstack-logo-notext.svg rename to frontend/public/logo-notext.svg diff --git a/frontend/public/manifest.json b/frontend/public/manifest.json new file mode 100644 index 0000000000..fb612dc348 --- /dev/null +++ b/frontend/public/manifest.json @@ -0,0 +1,16 @@ +{ + "short_name": "dstack", + "name": "dstack", + "icons": [ + { + "src": "favicon.ico", + "sizes": "64x64 32x32 24x24 16x16", + "type": "image/x-icon" + } + ], + "start_url": ".", + "display": "standalone", + "theme_color": "#000000", + "background_color": "#ffffff", + "permissions": ["clipboardWrite", "clipboardRead"] +} diff --git a/frontend/public/robots.txt b/frontend/public/robots.txt new file mode 100644 index 0000000000..e9e57dc4d4 --- /dev/null +++ b/frontend/public/robots.txt @@ -0,0 +1,3 @@ +# https://www.robotstxt.org/robotstxt.html +User-agent: * +Disallow: diff --git a/frontend/setupEnzyme.ts b/frontend/setupEnzyme.ts new file mode 100644 index 0000000000..aedc1b24cf --- /dev/null +++ b/frontend/setupEnzyme.ts @@ -0,0 +1,4 @@ +import { configure } from 'enzyme'; +import Adapter from '@cfaester/enzyme-adapter-react-18'; + +configure({ adapter: new Adapter() }); diff --git a/frontend/src/App/AuthErrorMessage/index.tsx b/frontend/src/App/AuthErrorMessage/index.tsx new file mode 100644 index 0000000000..f336f8c665 --- /dev/null +++ b/frontend/src/App/AuthErrorMessage/index.tsx @@ -0,0 +1,30 @@ +import React, { PropsWithChildren } from 'react'; + +import { Box, SpaceBetween } from 'components'; + +import styles from './styles.module.scss'; + +export interface Props extends PropsWithChildren { + title: string; + text?: string; +} + +export const AuthErrorMessage: React.FC = ({ title, text, children }) => { + return ( + + +
+ {title} + + {text && ( + + {text} + + )} +
+
+ +
{children}
+
+ ); +}; diff --git a/frontend/src/App/AuthErrorMessage/styles.module.scss b/frontend/src/App/AuthErrorMessage/styles.module.scss new file mode 100644 index 0000000000..bdee5af260 --- /dev/null +++ b/frontend/src/App/AuthErrorMessage/styles.module.scss @@ -0,0 +1,3 @@ +.content { + margin-top: 20px; +} diff --git a/frontend/src/App/Loading/index.tsx b/frontend/src/App/Loading/index.tsx new file mode 100644 index 0000000000..d6e06913d1 --- /dev/null +++ b/frontend/src/App/Loading/index.tsx @@ -0,0 +1,20 @@ +import React from 'react'; + +import { AppLayout, Spinner } from 'components'; + +import styles from './styles.module.scss'; + +export const Loading: React.FC = () => { + return ( + + +
+ } + /> + ); +}; diff --git a/frontend/src/App/Loading/styles.module.scss b/frontend/src/App/Loading/styles.module.scss new file mode 100644 index 0000000000..b2a1e1fbe3 --- /dev/null +++ b/frontend/src/App/Loading/styles.module.scss @@ -0,0 +1,6 @@ +.spinner { + height: 100%; + display: flex; + align-items: center; + justify-content: center; +} diff --git a/frontend/src/App/Login/EnterpriseLogin/index.tsx b/frontend/src/App/Login/EnterpriseLogin/index.tsx new file mode 100644 index 0000000000..7cb9b530af --- /dev/null +++ b/frontend/src/App/Login/EnterpriseLogin/index.tsx @@ -0,0 +1,53 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import cn from 'classnames'; + +import { Box, NavigateLink, SpaceBetween } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { ROUTES } from 'routes'; +import { useGetEntraInfoQuery, useGetGoogleInfoQuery, useGetOktaInfoQuery } from 'services/auth'; + +import { LoginByEntraID } from '../EntraID/LoginByEntraID'; +import { LoginByGoogle } from '../LoginByGoogle'; +import { LoginByOkta } from '../LoginByOkta'; +import { LoginByTokenForm } from '../LoginByTokenForm'; + +import styles from './styles.module.scss'; + +export const EnterpriseLogin: React.FC = () => { + const { t } = useTranslation(); + const { data: oktaData, isLoading: isLoadingOkta } = useGetOktaInfoQuery(); + const { data: entraData, isLoading: isLoadingEntra } = useGetEntraInfoQuery(); + const { data: googleData, isLoading: isLoadingGoogle } = useGetGoogleInfoQuery(); + + const oktaEnabled = oktaData?.enabled; + const entraEnabled = entraData?.enabled; + const googleEnabled = googleData?.enabled; + + const isLoading = isLoadingOkta || isLoadingEntra; + const isShowTokenForm = !oktaEnabled && !entraEnabled; + + return ( + +
+ + + {t('auth.sign_in_to_dstack_enterprise')} + + + {!isLoading && isShowTokenForm && } + {!isLoadingOkta && oktaEnabled && } + {!isLoadingEntra && entraEnabled && } + {!isLoadingGoogle && googleEnabled && } + + {!isLoading && !isShowTokenForm && ( + + {t('auth.login_by_token')} + + )} + +
+
+ ); +}; diff --git a/frontend/src/App/Login/EnterpriseLogin/styles.module.scss b/frontend/src/App/Login/EnterpriseLogin/styles.module.scss new file mode 100644 index 0000000000..72a104a848 --- /dev/null +++ b/frontend/src/App/Login/EnterpriseLogin/styles.module.scss @@ -0,0 +1,16 @@ +.form { + max-width: 440px; + width: 100%; + margin-left: auto; + margin-right: auto; + padding-top: 120px; +} +.token { + +} +.okta { + margin-top: 20px; +} +.entra { + margin-top: 20px; +} diff --git a/frontend/src/App/Login/EntraID/LoginByEntraID/index.tsx b/frontend/src/App/Login/EntraID/LoginByEntraID/index.tsx new file mode 100644 index 0000000000..c52d14fe10 --- /dev/null +++ b/frontend/src/App/Login/EntraID/LoginByEntraID/index.tsx @@ -0,0 +1,39 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import cn from 'classnames'; + +import { Button } from 'components'; + +import { goToUrl } from 'libs'; +import { useEntraAuthorizeMutation } from 'services/auth'; + +import { getBaseUrl } from 'App/helpers'; + +import { ReactComponent as EntraIdIcon } from 'assets/icons/entraID.svg'; +import styles from './styles.module.scss'; + +export const LoginByEntraID: React.FC<{ className?: string }> = ({ className }) => { + const { t } = useTranslation(); + + const [entraAuthorize, { isLoading }] = useEntraAuthorizeMutation(); + + const signInClick = () => { + entraAuthorize({ base_url: getBaseUrl() }) + .unwrap() + .then((data) => { + goToUrl(data.authorization_url); + }) + .catch(console.log); + }; + + return ( +
+ +
+ ); +}; diff --git a/frontend/src/App/Login/EntraID/LoginByEntraID/styles.module.scss b/frontend/src/App/Login/EntraID/LoginByEntraID/styles.module.scss new file mode 100644 index 0000000000..cf258c4c3a --- /dev/null +++ b/frontend/src/App/Login/EntraID/LoginByEntraID/styles.module.scss @@ -0,0 +1,20 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.entraSignIn { + display: flex; + justify-content: center; + + button { + .loginButtonInner { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + } + + .loginButtonLabel { + height: 20px; + line-height: 21px; + } + } +} diff --git a/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx b/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx new file mode 100644 index 0000000000..036851c3cf --- /dev/null +++ b/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx @@ -0,0 +1,76 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useSearchParams } from 'react-router-dom'; + +import { NavigateLink } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { useAppDispatch } from 'hooks'; +import { ROUTES } from 'routes'; +import { useEntraCallbackMutation, useGetNextRedirectMutation } from 'services/auth'; + +import { AuthErrorMessage } from 'App/AuthErrorMessage'; +import { getBaseUrl } from 'App/helpers'; +import { Loading } from 'App/Loading'; +import { setAuthData } from 'App/slice'; + +export const LoginByEntraIDCallback: React.FC = () => { + const { t } = useTranslation(); + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const code = searchParams.get('code'); + const state = searchParams.get('state'); + const [isInvalidCode, setIsInvalidCode] = useState(false); + const dispatch = useAppDispatch(); + + const [getNextRedirect] = useGetNextRedirectMutation(); + const [entraCallback] = 
useEntraCallbackMutation(); + + const checkCode = () => { + if (code && state) { + getNextRedirect({ code, state }) + .unwrap() + .then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + entraCallback({ code, state, base_url: getBaseUrl() }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); + }) + .catch(() => { + setIsInvalidCode(true); + }); + } + }; + + useEffect(() => { + if (code && state) { + checkCode(); + } else { + setIsInvalidCode(true); + } + }, []); + + if (isInvalidCode) + return ( + + + {t('auth.try_again')} + + + ); + + return ( + + ; + + ); +}; diff --git a/frontend/src/App/Login/LoginByGithub/index.tsx b/frontend/src/App/Login/LoginByGithub/index.tsx new file mode 100644 index 0000000000..61ce317cc8 --- /dev/null +++ b/frontend/src/App/Login/LoginByGithub/index.tsx @@ -0,0 +1,504 @@ +import React from 'react'; +import { createPortal } from 'react-dom'; +import { useTranslation } from 'react-i18next'; +import enMessages from '@cloudscape-design/components/i18n/messages/all.en.json'; +import { Mode } from '@cloudscape-design/global-styles'; + +import { + AnchorNavigation, + Box, + BreadcrumbGroup, + Button, + Container, + ContentLayout, + ExpandableSection, + Grid, + Header, + I18nProvider, + Icon, + Link, + Popover, + SpaceBetween, + TextContent, + TopNavigation, +} from 'components'; +import { DarkThemeIcon, LightThemeIcon } from 'layouts/AppLayout/themeIcons'; + +import { DISCORD_URL } from 'consts'; +import { useAppDispatch, useAppSelector } from 'hooks'; +import { goToUrl } from 'libs'; +import { ROUTES } from 'routes'; +import { useGithubAuthorizeMutation } from 'services/auth'; + +import { selectSystemMode, setSystemMode } from 'App/slice'; + +import logo from 'assets/images/logo.svg'; +import styles from './styles.module.scss'; + +type PortalProps = { + children: React.ReactNode; +}; + 
+const i18nStrings = { + overflowMenuTriggerText: '', + overflowMenuTitleText: '', + overflowMenuBackIconAriaLabel: '', + overflowMenuDismissIconAriaLabel: '', +}; + +const THEME_ICON_MAP: Record = { + [Mode.Dark]: DarkThemeIcon, + [Mode.Light]: LightThemeIcon, +}; + +const GitHubIcon: React.FC = () => ( + + + + + +); + +const askAi = () => { + window.document.body.focus(); + window?.Kapa?.open(); +}; + +const HeaderPortal = ({ children }: PortalProps) => { + const domNode = document.querySelector('#header'); + if (domNode) return createPortal(children, domNode); + return null; +}; + +function OnThisPageNavigation({ variant }: { variant: 'mobile' | 'side' }) { + const anchorNavigation = ( + + ); + + return variant === 'side' ? ( +
+ + On this page + + {anchorNavigation} +
+ ) : ( + + {anchorNavigation} + + ); +} + +function HeroHeader() { + const [githubAuthorize, { isLoading }] = useGithubAuthorizeMutation(); + + const signInClick = () => { + githubAuthorize() + .unwrap() + .then((data) => { + goToUrl(data.authorization_url); + }) + .catch(console.log); + }; + + return ( + + +
+ Welcome to dstack Sky + + Enjoy the full power of dstack without the hassle of hosting it yourself or managing + your own infrastructure.
+ Sign up for dstack Sky to use the cheapest GPUs from our marketplace or connect it to + your own cloud accounts. +
+ + By clicking Sign up you agree to the{' '} + + Terms + {' '} + and{' '} + + Privacy policy + + +
+ + + + + + + No credit card required + + + +
+ + + + + + +
+
+
+ ); +} + +function ProductOverview() { + return ( +
+ +
+ Overview +
+
+ + dstack is an open-source container orchestrator that lets ML teams easily manage + clusters, volumes, dev environments, training, and inference. Its container-native interface boosts + productivity, maximizes GPU efficiency, and lowers costs. + + + dstack Sky adds a managed service, letting you use the cheapest GPUs from our + marketplace or connect your own cloud accounts. + +
+ +
+ + Features + + +
+
+
+ + Open-source + +
+
dstack Sky
+ +
+ Bring your own cloud{' '} + + + Use compute from your own cloud account(s) by providing your credentials. + + + You pay for compute and storage usage directly to the configured cloud + provider(s) through their billing. dstack won't bill or charge you. + + + } + > + + + + +
+
+ +
+
+ +
+ +
+ GPU marketplace{' '} + + + Use compute from multiple cloud providers without needing your own cloud + account(s). + + + You pay for compute and storage usage directly to dstack. You can + top up your balance in your dstack user settings. + + When you sign up, you get $5 in credits. + + } + > + + + + +
+
+
+ +
+ +
+ SSH fleets{' '} + + + + + +
+
+ +
+
+ +
+ +
+ Gateway{' '} + + + + + +
+
Configure your own domain
+
+ Pre-configured *.sky.dstack.ai +
+ +
Pricing
+
Free
+
Pay only if you use GPU marketplace
+ +
+
Self-hosted
+
Hosted by dstack
+
+
+
+ +
+
+ Highlights +
+ +
    +
  • Use compute from your own cloud account(s) or through GPU marketplace.
  • +
  • Create dev environments, run training tasks, and deploy inference services.
  • +
  • Manage volumes and fleets.
  • +
  • Manage multiple projects and teams.
  • +
+
+
+ +
+
+ Documentation +
+ + + Want to learn more about dstack? Check out the{' '} + + documentation + + + +
+
+
+ ); +} + +function OtherVersions() { + return ( +
+ + Other versions + +
    +
  • + + + + Open-source + Self-hosted + + Fully customizable and self-hosted open-source version. + + + +
  • +
  • + + + + dstack Enterprise + Self-hosted + + Single sign-on, advanced governance controls, and dedicated support. + + + +
  • +
+
+ ); +} + +export const LoginByGithub: React.FC = () => { + const { t } = useTranslation(); + const dispatch = useAppDispatch(); + const systemMode = useAppSelector(selectSystemMode) ?? ''; + const ThemeIcon = THEME_ICON_MAP[systemMode]; + + const onChangeSystemModeToggle = (event: React.MouseEvent) => { + event.preventDefault(); + switch (systemMode) { + case Mode.Light: + dispatch(setSystemMode(Mode.Dark)); + return; + default: + dispatch(setSystemMode(Mode.Light)); + } + }; + + return ( + <> + +
+ goToUrl('https://dstack.ai/docs/', true), + }, + { + type: 'button', + text: t('common.discord'), + external: true, + onClick: () => goToUrl(DISCORD_URL, true), + }, + { + href: 'theme-button', + type: 'button', + iconSvg: , + onClick: onChangeSystemModeToggle, + }, + { + type: 'button', + iconName: 'gen-ai', + text: t('common.ask_ai'), + title: t('common.ask_ai'), + onClick: askAi, + }, + ]} + /> +
+
+ + + + } + headerVariant="high-contrast" + header={} + defaultPadding={true} + maxContentWidth={1040} + disableOverlap={true} + > +
+
+ +
+ + + +
+ + +
+
+
+
+ + ); +}; diff --git a/frontend/src/App/Login/LoginByGithub/styles.module.scss b/frontend/src/App/Login/LoginByGithub/styles.module.scss new file mode 100644 index 0000000000..d3efe6e389 --- /dev/null +++ b/frontend/src/App/Login/LoginByGithub/styles.module.scss @@ -0,0 +1,121 @@ +@use '~@cloudscape-design/design-tokens' as cs; + +body { + background: cs.$color-background-layout-main; + position: relative; +} + +$viewport-breakpoint-s: 912px; + +body { + // Note: This token will be themed (see the product page index.tsx) + background: cs.$color-background-layout-main; +} + +.productPageContentGrid { + display: grid; + grid-template-columns: 3fr 1fr; + grid-template-rows: 0 auto 0; + margin-block-start: cs.$space-static-xxl; +} + +.onThisPageMobile { + grid-row: 1; + grid-column: 1 / 3; + display: none; + margin-block-end: cs.$space-static-xxl; +} + +.productPageAside { + grid-row: 2; + grid-column: 2 / 3; + padding-inline-start: calc(#{cs.$space-scaled-xxxl} /2); +} + +.productPageContent { + grid-row: 2; + grid-column: 1 / 2; + padding-inline-end: calc(#{cs.$space-scaled-xxxl} /2); + margin-bottom: 20px; +} + + +.productPageAsideSticky { + position: sticky; + inset-block-start: 40px; +} + +@media only screen and (max-width: $viewport-breakpoint-s) { + .productPageContentGrid { + grid-template-columns: 100%; + grid-template-rows: auto auto auto; + } + + .onThisPageMobile { + display: block; + } + + .productPageMobile { + display: block; + } + + .productPageAside { + display: none; + } +} + + +/* High-level sections of the main content area */ +.pageSection { + padding-block-end: cs.$space-static-xxxl; + margin-block-end: cs.$space-static-xxl; + border-bottom: 1px solid cs.$color-border-divider-default; + + &:last-child { + border: none; + margin-block-end: 0; + } +} + +/* Product details list containing keys and values */ +.productDetails { + display: grid; + grid-template-columns: 30% 35% 35%; + margin: 0; + padding: 0; + + dt { + color: 
cs.$color-text-body-default; + font-weight: bold; + } + + dt, + dd { + margin: 0; + padding: 0; + padding-block: cs.$space-scaled-xs; + border-block-end: 1px solid cs.$color-border-divider-default; + } +} + +/* List of product cards */ +.productCardsList { + display: flex; + flex-wrap: wrap; + column-gap: cs.$space-scaled-l; + row-gap: cs.$space-scaled-l; + + list-style-type: none; + margin: 0; + padding: 0; +} + +.productCardsListItem { + flex: 1; + flex-basis: 250px; + max-inline-size: 312px; + + list-style-type: none; + margin: 0; + padding: 0; +} diff --git a/frontend/src/App/Login/LoginByGithubCallback/index.tsx b/frontend/src/App/Login/LoginByGithubCallback/index.tsx new file mode 100644 index 0000000000..45be311b6b --- /dev/null +++ b/frontend/src/App/Login/LoginByGithubCallback/index.tsx @@ -0,0 +1,84 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useSearchParams } from 'react-router-dom'; + +import { NavigateLink } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { useAppDispatch } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetNextRedirectMutation, useGithubCallbackMutation } from 'services/auth'; +import { useLazyGetProjectsQuery } from 'services/project'; + +import { AuthErrorMessage } from 'App/AuthErrorMessage'; +import { Loading } from 'App/Loading'; +import { setAuthData } from 'App/slice'; + +export const LoginByGithubCallback: React.FC = () => { + const { t } = useTranslation(); + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const code = searchParams.get('code'); + const state = searchParams.get('state'); + const [isInvalidCode, setIsInvalidCode] = useState(false); + const dispatch = useAppDispatch(); + + const [getNextRedirect] = useGetNextRedirectMutation(); + const [githubCallback] = useGithubCallbackMutation(); + const [getProjects] = useLazyGetProjectsQuery(); + + 
const checkCode = () => { // Resolve the OAuth callback: follow the server-provided redirect if there is one, otherwise complete the GitHub login and store the token. + if (code && state) { + getNextRedirect({ code: code, state: state }) + .unwrap() + .then(async ({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + githubCallback({ code, state }) + .unwrap() + .then(async ({ creds: { token } }) => { + dispatch(setAuthData({ token })); + if (process.env.UI_VERSION === 'sky') { + const result = await getProjects({}).unwrap(); + if (result?.length === 0) { + navigate(ROUTES.PROJECT.ADD); + return; + } + } + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); + }) + .catch(() => { + setIsInvalidCode(true); + }); + } + }; + + useEffect(() => { + if (code && state) { // checkCode() does nothing without `state`; a callback missing it is reported as invalid instead of showing the loader forever. + checkCode(); + } else { + setIsInvalidCode(true); + } + }, []); + + if (isInvalidCode) + return ( + + + {t('auth.try_again')} + + + ); + + return ( + + ; + + ); +}; diff --git a/frontend/src/App/Login/LoginByGoogle/index.tsx b/frontend/src/App/Login/LoginByGoogle/index.tsx new file mode 100644 index 0000000000..b83a93f187 --- /dev/null +++ b/frontend/src/App/Login/LoginByGoogle/index.tsx @@ -0,0 +1,37 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import cn from 'classnames'; + +import { Button } from 'components'; + +import { goToUrl } from 'libs'; +import { useGoogleAuthorizeMutation } from 'services/auth'; + +import { ReactComponent as GoogleIcon } from 'assets/icons/google.svg'; +import styles from './styles.module.scss'; + +export const LoginByGoogle: React.FC<{ className?: string }> = ({ className }) => { + const { t } = useTranslation(); + + const [googleAuthorize, { isLoading }] = useGoogleAuthorizeMutation(); + + const signInClick = () => { + googleAuthorize() + .unwrap() + .then((data) => { + goToUrl(data.authorization_url); + }) + .catch(console.log); + }; + + return ( +
+ +
+ ); +}; diff --git a/frontend/src/App/Login/LoginByGoogle/styles.module.scss b/frontend/src/App/Login/LoginByGoogle/styles.module.scss new file mode 100644 index 0000000000..93407bda8d --- /dev/null +++ b/frontend/src/App/Login/LoginByGoogle/styles.module.scss @@ -0,0 +1,20 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.signIn { + display: flex; + justify-content: center; + + button { + .loginButtonInner { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + } + + .loginButtonLabel { + height: 20px; + line-height: 21px; + } + } +} diff --git a/frontend/src/App/Login/LoginByGoogleCallback/index.tsx b/frontend/src/App/Login/LoginByGoogleCallback/index.tsx new file mode 100644 index 0000000000..4f95f94e27 --- /dev/null +++ b/frontend/src/App/Login/LoginByGoogleCallback/index.tsx @@ -0,0 +1,75 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useSearchParams } from 'react-router-dom'; + +import { NavigateLink } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { useAppDispatch } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetNextRedirectMutation, useGoogleCallbackMutation } from 'services/auth'; + +import { AuthErrorMessage } from 'App/AuthErrorMessage'; +import { Loading } from 'App/Loading'; +import { setAuthData } from 'App/slice'; + +export const LoginByGoogleCallback: React.FC = () => { + const { t } = useTranslation(); + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const code = searchParams.get('code'); + const state = searchParams.get('state'); + const [isInvalidCode, setIsInvalidCode] = useState(false); + const dispatch = useAppDispatch(); + + const [getNextRedirect] = useGetNextRedirectMutation(); + const [googleCallback] = useGoogleCallbackMutation(); + + const checkCode = () => { + if (code && state) { + getNextRedirect({ code, state }) 
+ .unwrap() + .then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + googleCallback({ code, state }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); + }) + .catch(() => { + setIsInvalidCode(true); + }); + } + }; + + useEffect(() => { + if (code && state) { + checkCode(); + } else { + setIsInvalidCode(true); + } + }, []); + + if (isInvalidCode) + return ( + + + {t('auth.try_again')} + + + ); + + return ( + + ; + + ); +}; diff --git a/frontend/src/App/Login/LoginByOkta/index.tsx b/frontend/src/App/Login/LoginByOkta/index.tsx new file mode 100644 index 0000000000..5b01b6dd53 --- /dev/null +++ b/frontend/src/App/Login/LoginByOkta/index.tsx @@ -0,0 +1,37 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import cn from 'classnames'; + +import { Button } from 'components'; + +import { goToUrl } from 'libs'; +import { useOktaAuthorizeMutation } from 'services/auth'; + +import { ReactComponent as OktaIcon } from 'assets/icons/okta.svg'; +import styles from './styles.module.scss'; + +export const LoginByOkta: React.FC<{ className?: string }> = ({ className }) => { + const { t } = useTranslation(); + + const [oktaAuthorize, { isLoading }] = useOktaAuthorizeMutation(); + + const signInClick = () => { + oktaAuthorize() + .unwrap() + .then((data) => { + goToUrl(data.authorization_url); + }) + .catch(console.log); + }; + + return ( +
+ +
+ ); +}; diff --git a/frontend/src/App/Login/LoginByOkta/styles.module.scss b/frontend/src/App/Login/LoginByOkta/styles.module.scss new file mode 100644 index 0000000000..93407bda8d --- /dev/null +++ b/frontend/src/App/Login/LoginByOkta/styles.module.scss @@ -0,0 +1,20 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.signIn { + display: flex; + justify-content: center; + + button { + .loginButtonInner { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + } + + .loginButtonLabel { + height: 20px; + line-height: 21px; + } + } +} diff --git a/frontend/src/App/Login/LoginByOktaCallback/index.tsx b/frontend/src/App/Login/LoginByOktaCallback/index.tsx new file mode 100644 index 0000000000..72cdc96185 --- /dev/null +++ b/frontend/src/App/Login/LoginByOktaCallback/index.tsx @@ -0,0 +1,75 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useSearchParams } from 'react-router-dom'; + +import { NavigateLink } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { useAppDispatch } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetNextRedirectMutation, useOktaCallbackMutation } from 'services/auth'; + +import { AuthErrorMessage } from 'App/AuthErrorMessage'; +import { Loading } from 'App/Loading'; +import { setAuthData } from 'App/slice'; + +export const LoginByOktaCallback: React.FC = () => { + const { t } = useTranslation(); + const [searchParams] = useSearchParams(); + const navigate = useNavigate(); + const code = searchParams.get('code'); + const state = searchParams.get('state'); + const [isInvalidCode, setIsInvalidCode] = useState(false); + const dispatch = useAppDispatch(); + + const [getNextRedirect] = useGetNextRedirectMutation(); + const [oktaCallback] = useOktaCallbackMutation(); + + const checkCode = () => { + if (code && state) { + getNextRedirect({ code, state }) + .unwrap() + 
.then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + oktaCallback({ code, state }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); + }) + .catch(() => { + setIsInvalidCode(true); + }); + } + }; + + useEffect(() => { + if (code && state) { + checkCode(); + } else { + setIsInvalidCode(true); + } + }, []); + + if (isInvalidCode) + return ( + + + {t('auth.try_again')} + + + ); + + return ( + + ; + + ); +}; diff --git a/frontend/src/App/Login/LoginByTokenForm/index.tsx b/frontend/src/App/Login/LoginByTokenForm/index.tsx new file mode 100644 index 0000000000..d5b7a76111 --- /dev/null +++ b/frontend/src/App/Login/LoginByTokenForm/index.tsx @@ -0,0 +1,71 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; +import cn from 'classnames'; + +import { Button, FormInput } from 'components'; + +import { useAppDispatch } from 'hooks'; +import { useCheckAuthTokenMutation } from 'services/user'; + +import { setAuthData } from 'App/slice'; + +import styles from './styles.module.scss'; + +type FormValues = Pick; + +export interface Props { + className?: string; +} + +export const LoginByTokenForm: React.FC = ({ className }) => { + const { t } = useTranslation(); + const { handleSubmit, control, setError } = useForm(); + const dispatch = useAppDispatch(); + const navigate = useNavigate(); + + const [checkToken, { isLoading }] = useCheckAuthTokenMutation(); + const onSubmit = (data: FormValues) => { + checkToken(data) + .unwrap() + .then(() => { + dispatch(setAuthData(data)); + navigate('/'); + }) + .catch((error) => { + if (error?.status === 401) { + setError('token', { type: 'custom', message: t('auth.invalid_token') }); + return; + } + + setError('token', { type: 'custom', message: t('common.server_error', 
{ error: error?.msg }) }); + }); + }; + + return ( +
+
+
+
+ +
+ +
+ +
+
+
+
+ ); +}; diff --git a/frontend/src/App/Login/LoginByTokenForm/styles.module.scss b/frontend/src/App/Login/LoginByTokenForm/styles.module.scss new file mode 100644 index 0000000000..859b39ebcc --- /dev/null +++ b/frontend/src/App/Login/LoginByTokenForm/styles.module.scss @@ -0,0 +1,24 @@ +.form { + max-width: 440px; + margin-left: auto; + margin-right: auto; +} +.token { + display: flex; + align-items: flex-start; + gap: 12px; +} +.fieldWrap { + flex-grow: 1; + min-width: 0; +} +.buttonWrap { + display: flex; + flex-shrink: 0; + width: 104px; + margin-right: -20px; + + button { + white-space: nowrap !important; + } +} diff --git a/frontend/src/App/Login/TokenLogin/index.tsx b/frontend/src/App/Login/TokenLogin/index.tsx new file mode 100644 index 0000000000..b41393ba39 --- /dev/null +++ b/frontend/src/App/Login/TokenLogin/index.tsx @@ -0,0 +1,34 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import cn from 'classnames'; + +import { Box, NavigateLink, SpaceBetween } from 'components'; +import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; + +import { ROUTES } from 'routes'; + +import { LoginByTokenForm } from '../LoginByTokenForm'; + +import styles from './styles.module.scss'; + +export const TokenLogin: React.FC = () => { + const { t } = useTranslation(); + + return ( + +
+ + + {t('auth.sign_in_to_dstack_enterprise')} + + + + + + {t('auth.another_login_methods')} + + +
+
+ ); +}; diff --git a/frontend/src/App/Login/TokenLogin/styles.module.scss b/frontend/src/App/Login/TokenLogin/styles.module.scss new file mode 100644 index 0000000000..b648c81066 --- /dev/null +++ b/frontend/src/App/Login/TokenLogin/styles.module.scss @@ -0,0 +1,7 @@ +.form { + max-width: 440px; + width: 100%; + margin-left: auto; + margin-right: auto; + padding-top: 120px; +} diff --git a/frontend/src/App/Logout/index.tsx b/frontend/src/App/Logout/index.tsx new file mode 100644 index 0000000000..0526c92cca --- /dev/null +++ b/frontend/src/App/Logout/index.tsx @@ -0,0 +1,22 @@ +import React, { useEffect } from 'react'; +import { Navigate } from 'react-router-dom'; + +import { useAppDispatch } from 'hooks'; +import { ROUTES } from 'routes'; +import { projectApi } from 'services/project'; +import { userApi } from 'services/user'; + +import { removeAuthData } from '../slice'; + +export const Logout: React.FC = () => { + const dispatch = useAppDispatch(); + + useEffect(() => { + dispatch(removeAuthData()); + + dispatch(userApi.util.resetApiState()); + dispatch(projectApi.util.resetApiState()); + }, []); + + return ; +}; diff --git a/frontend/src/App/constants.ts b/frontend/src/App/constants.ts new file mode 100644 index 0000000000..d861606c36 --- /dev/null +++ b/frontend/src/App/constants.ts @@ -0,0 +1,3 @@ +export const AUTH_DATA_STORAGE_KEY = 'authData'; +export const MODE_STORAGE_KEY = 'mode'; +export const TUTORIAL_SHOW_STARTUP_STORAGE_KEY = 'tutorial-show-startup'; diff --git a/frontend/src/App/helpers.ts b/frontend/src/App/helpers.ts new file mode 100644 index 0000000000..d02a016e1e --- /dev/null +++ b/frontend/src/App/helpers.ts @@ -0,0 +1,8 @@ +import { Mode } from '@cloudscape-design/global-styles'; + +export const getThemeMode = (): Mode => (window?.matchMedia('(prefers-color-scheme: dark)').matches ? 
Mode.Dark : Mode.Light); + +export function getBaseUrl(): string { + const { protocol, hostname, port } = window.location; + return `${protocol}//${hostname}${port ? `:${port}` : ''}`; +} diff --git a/frontend/src/App/index.tsx b/frontend/src/App/index.tsx new file mode 100644 index 0000000000..de0f919079 --- /dev/null +++ b/frontend/src/App/index.tsx @@ -0,0 +1,86 @@ +import React, { useEffect } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useLocation } from 'react-router-dom'; + +import AppLayout from 'layouts/AppLayout'; + +import { useAppDispatch, useAppSelector } from 'hooks'; +import { useGetUserDataQuery } from 'services/user'; + +import { EnterpriseLogin } from './Login/EnterpriseLogin'; +import { LoginByGithub } from './Login/LoginByGithub'; +import { ROUTES } from '../routes'; +import { AuthErrorMessage } from './AuthErrorMessage'; +import { selectAuthToken, setUserData } from './slice'; + +const localStorageIsAvailable = 'localStorage' in window; + +const IGNORED_AUTH_PATHS = [ + ROUTES.AUTH.GITHUB_CALLBACK, + ROUTES.AUTH.OKTA_CALLBACK, + ROUTES.AUTH.ENTRA_CALLBACK, + ROUTES.AUTH.GOOGLE_CALLBACK, + ROUTES.AUTH.TOKEN, +]; + +const LoginFormComponent = process.env.UI_VERSION === 'enterprise' ? 
EnterpriseLogin : LoginByGithub; + +const App: React.FC = () => { + const { t } = useTranslation(); + const token = useAppSelector(selectAuthToken); + const isAuthenticated = Boolean(token); + const dispatch = useAppDispatch(); + const { pathname } = useLocation(); + + const { + isLoading, + data: userData, + error: getUserError, + } = useGetUserDataQuery( + { token }, + { + skip: !isAuthenticated || !localStorageIsAvailable, + }, + ); + + useEffect(() => { + if (userData?.username || getUserError) { + if (userData?.username) { + dispatch(setUserData(userData)); + } + } + }, [userData, getUserError, isLoading]); + + const renderLocalstorageError = () => { + return ( + + ); + }; + + const renderTokenError = () => { + return ; + }; + + const renderNotAuthorizedError = () => { + return ; + }; + + if (IGNORED_AUTH_PATHS.includes(pathname)) { + return ; + } + + if (!localStorageIsAvailable) return renderLocalstorageError(); + if (getUserError) return renderTokenError(); + if (!isAuthenticated) return renderNotAuthorizedError(); + + return ( + + + + ); +}; + +export default App; diff --git a/frontend/src/App/slice.ts b/frontend/src/App/slice.ts new file mode 100644 index 0000000000..9d684d5850 --- /dev/null +++ b/frontend/src/App/slice.ts @@ -0,0 +1,193 @@ +import type { RootState } from 'store'; +import { applyMode, Mode } from '@cloudscape-design/global-styles'; +import { createSlice, PayloadAction } from '@reduxjs/toolkit'; + +import { AUTH_DATA_STORAGE_KEY, MODE_STORAGE_KEY, TUTORIAL_SHOW_STARTUP_STORAGE_KEY } from './constants'; +import { getThemeMode } from './helpers'; + +import { IAppState, ToolsTabs } from './types'; + +const getInitialState = (): IAppState => { + let authData = null; + let storageData = null; + let hideStartUp: null | boolean = null; + let activeMode = getThemeMode(); + + try { + storageData = localStorage.getItem(AUTH_DATA_STORAGE_KEY); + } catch (e) { + console.log(e); + } + + try { + hideStartUp = (() => { + if 
(!localStorage.getItem(TUTORIAL_SHOW_STARTUP_STORAGE_KEY)) { + return null; + } + + return localStorage.getItem(TUTORIAL_SHOW_STARTUP_STORAGE_KEY) === 'true'; + })(); + } catch (e) { + console.log(e); + } + + try { + const modeStorageData = localStorage.getItem(MODE_STORAGE_KEY); + + if (modeStorageData) { + activeMode = modeStorageData as Mode; + } + } catch (e) { + console.log(e); + } + + applyMode(activeMode); + + if (storageData) authData = JSON.parse(storageData) as IUserAuthData; + + return { + authData, + userData: null, + breadcrumbs: null, + systemMode: activeMode, + + toolsPanelState: { + isOpen: false, + tab: ToolsTabs.TUTORIAL, + }, + + helpPanel: { + content: {}, + }, + + tutorialPanel: { + createProjectCompleted: false, + billingCompleted: false, + configureCLICompleted: false, + discordCompleted: false, + tallyCompleted: false, + quickStartCompleted: false, + hideStartUp, + }, + }; +}; + +const initialState: IAppState = getInitialState(); + +export const appSlice = createSlice({ + name: 'app', + initialState, + + reducers: { + setAuthData: (state, action: PayloadAction) => { + state.authData = action.payload; + + try { + localStorage.setItem(AUTH_DATA_STORAGE_KEY, JSON.stringify(action.payload)); + } catch (e) { + console.log(e); + } + }, + + setSystemMode: (state, action: PayloadAction) => { + state.systemMode = action.payload; + applyMode(action.payload); + try { + localStorage.setItem(MODE_STORAGE_KEY, action.payload); + } catch (e) { + console.log(e); + } + }, + + removeAuthData: (state) => { + state.authData = null; + + try { + localStorage.removeItem(AUTH_DATA_STORAGE_KEY); + } catch (e) { + console.log(e); + } + }, + + setUserData: (state, action: PayloadAction) => { + state.userData = action.payload; + }, + + setBreadcrumb: (state, action: PayloadAction) => { + state.breadcrumbs = action.payload; + }, + + openHelpPanel: (state, action: PayloadAction) => { + state.toolsPanelState = { + isOpen: true, + tab: ToolsTabs.INFO, + }; + + 
state.helpPanel = { content: action.payload }; + }, + + openTutorialPanel: (state) => { + state.toolsPanelState = { + isOpen: true, + tab: ToolsTabs.TUTORIAL, + }; + }, + + closeToolsPanel: (state) => { + state.toolsPanelState = { + ...state.toolsPanelState, + isOpen: false, + }; + }, + + setToolsTab: (state, action: PayloadAction) => { + state.toolsPanelState = { + ...state.toolsPanelState, + tab: action.payload, + }; + }, + + updateTutorialPanelState: (state, action: PayloadAction>) => { + state.tutorialPanel = { + ...state.tutorialPanel, + ...action.payload, + }; + }, + + setHideAtStartup: (state, action: PayloadAction) => { + state.tutorialPanel = { + ...state.tutorialPanel, + hideStartUp: action.payload, + }; + + try { + localStorage.setItem(TUTORIAL_SHOW_STARTUP_STORAGE_KEY, JSON.stringify(action.payload)); + } catch (e) { + console.log(e); + } + }, + }, +}); + +export const { + setAuthData, + setSystemMode, + removeAuthData, + setUserData, + setBreadcrumb, + openHelpPanel, + closeToolsPanel, + setToolsTab, + openTutorialPanel, + updateTutorialPanelState, + setHideAtStartup, +} = appSlice.actions; +export const selectUserData = (state: RootState) => state.app.userData; +export const selectAuthToken = (state: RootState) => state.app.authData?.token; +export const selectUserName = (state: RootState) => state.app.userData?.username; +export const selectBreadcrumbs = (state: RootState) => state.app.breadcrumbs; +export const selectToolsPanelState = (state: RootState) => state.app.toolsPanelState; +export const selectHelpPanelContent = (state: RootState) => state.app.helpPanel.content; +export const selectTutorialPanel = (state: RootState) => state.app.tutorialPanel; +export const selectSystemMode = (state: RootState) => state.app.systemMode; +export default appSlice.reducer; diff --git a/frontend/src/App/types.ts b/frontend/src/App/types.ts new file mode 100644 index 0000000000..262c1b156c --- /dev/null +++ b/frontend/src/App/types.ts @@ -0,0 +1,43 @@ +import { 
TutorialPanelProps } from '@cloudscape-design/components'; +import { Mode } from '@cloudscape-design/global-styles'; + +import { HelpPanelProps } from 'components'; + +export type THelpPanelContent = Pick & { body?: HelpPanelProps['children'] }; + +export enum ToolsTabs { + INFO = 'info', + TUTORIAL = 'tutorial', +} + +export interface ITutorialItem extends TutorialPanelProps.Tutorial { + id: number; + startCallback?: (tutorial: ITutorialItem) => void; + startWithoutActivation?: boolean; + finishCallback?: (tutorial: ITutorialItem) => void; +} + +export interface IAppState { + userData: IUser | null; + authData: IUserAuthData | null; + breadcrumbs: TBreadcrumb[] | null; + systemMode: Mode; + toolsPanelState: { + isOpen: boolean; + tab: ToolsTabs; + }; + + helpPanel: { + content: THelpPanelContent; + }; + + tutorialPanel: { + createProjectCompleted: boolean; + billingCompleted: boolean; + configureCLICompleted: boolean; + discordCompleted: boolean; + tallyCompleted: boolean; + quickStartCompleted: boolean; + hideStartUp: boolean | null; + }; +} diff --git a/frontend/src/api.ts b/frontend/src/api.ts new file mode 100644 index 0000000000..78fcd72c33 --- /dev/null +++ b/frontend/src/api.ts @@ -0,0 +1,193 @@ +const BASE_URL = process.env.API_URL; + +export const API = { + BASE: () => `${BASE_URL}`, + + AUTH: { + BASE: () => `${API.BASE()}/auth`, + NEXT_REDIRECT: () => `${API.AUTH.BASE()}/get_next_redirect`, + GITHUB: { + BASE: () => `${API.AUTH.BASE()}/github`, + AUTHORIZE: () => `${API.AUTH.GITHUB.BASE()}/authorize`, + CALLBACK: () => `${API.AUTH.GITHUB.BASE()}/callback`, + }, + OKTA: { + BASE: () => `${API.AUTH.BASE()}/okta`, + INFO: () => `${API.AUTH.OKTA.BASE()}/info`, + AUTHORIZE: () => `${API.AUTH.OKTA.BASE()}/authorize`, + CALLBACK: () => `${API.AUTH.OKTA.BASE()}/callback`, + }, + ENTRA: { + BASE: () => `${API.AUTH.BASE()}/entra`, + INFO: () => `${API.AUTH.ENTRA.BASE()}/info`, + AUTHORIZE: () => `${API.AUTH.ENTRA.BASE()}/authorize`, + CALLBACK: () => 
`${API.AUTH.ENTRA.BASE()}/callback`, + }, + GOOGLE: { + BASE: () => `${API.AUTH.BASE()}/google`, + INFO: () => `${API.AUTH.GOOGLE.BASE()}/info`, + AUTHORIZE: () => `${API.AUTH.GOOGLE.BASE()}/authorize`, + CALLBACK: () => `${API.AUTH.GOOGLE.BASE()}/callback`, + }, + }, + + USERS: { + BASE: () => `${API.BASE()}/users`, + LIST: () => `${API.USERS.BASE()}/list`, + CREATE: () => `${API.USERS.BASE()}/create`, + UPDATE: () => `${API.USERS.BASE()}/update`, + DETAILS: () => `${API.USERS.BASE()}/get_user`, + CURRENT_USER: () => `${API.USERS.BASE()}/get_my_user`, + REFRESH_TOKEN: () => `${API.USERS.BASE()}/refresh_token`, + DELETE: () => `${API.USERS.BASE()}/delete`, + }, + + USER_PAYMENTS: { + BASE: (username: string) => `${API.BASE()}/user/${username}/payments`, + LIST: (username: string) => `${API.USER_PAYMENTS.BASE(username)}/list`, + ADD: (username: string) => `${API.USER_PAYMENTS.BASE(username)}/add`, + }, + + USER_BILLING: { + BASE: (username: string) => `${API.BASE()}/user/${username}/billing`, + INFO: (username: string) => `${API.USER_BILLING.BASE(username)}/info`, + CHECKOUT_SESSION: (username: string) => `${API.USER_BILLING.BASE(username)}/checkout_session`, + PORTAL_SESSION: (username: string) => `${API.USER_BILLING.BASE(username)}/portal_session`, + }, + + EVENTS: { + BASE: () => `${API.BASE()}/events`, + LIST: () => `${API.EVENTS.BASE()}/list`, + }, + + PROJECTS: { + BASE: () => `${API.BASE()}/projects`, + LIST: () => `${API.PROJECTS.BASE()}/list`, + LIST_ONLY_NO_FLEETS: () => `${API.PROJECTS.BASE()}/list_only_no_fleets`, + CREATE: () => `${API.PROJECTS.BASE()}/create`, + CREATE_WIZARD: () => `${API.PROJECTS.BASE()}/create_wizard`, + DELETE: () => `${API.PROJECTS.BASE()}/delete`, + DETAILS: (name: IProject['project_name']) => `${API.PROJECTS.BASE()}/${name}`, + DETAILS_INFO: (name: IProject['project_name']) => `${API.PROJECTS.DETAILS(name)}/get`, + SET_MEMBERS: (name: IProject['project_name']) => `${API.PROJECTS.DETAILS(name)}/set_members`, + ADD_MEMBERS: (name: 
IProject['project_name']) => `${API.PROJECTS.DETAILS(name)}/add_members`, + REMOVE_MEMBERS: (name: IProject['project_name']) => `${API.PROJECTS.DETAILS(name)}/remove_members`, + UPDATE: (name: IProject['project_name']) => `${API.PROJECTS.DETAILS(name)}/update`, + + // Repos + REPOS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/repos`, + REPOS_LIST: (projectName: IProject['project_name']) => `${API.PROJECTS.REPOS(projectName)}/list`, + GET_REPO: (projectName: IProject['project_name']) => `${API.PROJECTS.REPOS(projectName)}/get`, + INIT_REPO: (projectName: IProject['project_name']) => `${API.PROJECTS.REPOS(projectName)}/init`, + + // Runs + RUNS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/runs`, + RUNS_LIST: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/list`, + RUN_DETAILS: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/get`, + RUN_GET_PLAN: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/get_plan`, + RUNS_DELETE: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/delete`, + RUNS_STOP: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/stop`, + RUNS_SUBMIT: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/submit`, + RUNS_APPLY: (projectName: IProject['project_name']) => `${API.PROJECTS.RUNS(projectName)}/apply`, + + // Logs + LOGS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/logs/poll`, + + // Logs + ARTIFACTS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/artifacts/list`, + + // Fleets + FLEETS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/list`, + FLEETS_DETAILS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/get`, + FLEETS_APPLY: (projectName: 
IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/apply`, + FLEETS_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/delete`, + FLEET_INSTANCES_DELETE: (projectName: IProject['project_name']) => + `${API.BASE()}/project/${projectName}/fleets/delete_instances`, + + // Volumes + VOLUMES_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/volumes/delete`, + + // METRICS + JOB_METRICS: (projectName: IProject['project_name'], runName: IRun['run_spec']['run_name']) => + `${API.BASE()}/project/${projectName}/metrics/job/${runName}`, + + // SECRETS + SECRETS_LIST: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/secrets/list`, + SECRET_GET: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/secrets/get`, + SECRETS_UPDATE: (projectName: IProject['project_name']) => + `${API.BASE()}/project/${projectName}/secrets/create_or_update`, + SECRETS_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/secrets/delete`, + // GPUS + GPUS_LIST: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/gpus/list`, + // TEMPLATES + TEMPLATES_LIST: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/templates/list`, + }, + + BACKENDS: { + BASE: () => `${API.BASE()}/backends`, + LIST_TYPES: () => `${API.BACKENDS.BASE()}/list_types`, + LIST_BASE_TYPES: () => `${API.BACKENDS.BASE()}/list_base_types`, + CONFIG_VALUES: () => `${API.BACKENDS.BASE()}/config_values`, + }, + + PROJECT_BACKENDS: { + BASE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/backends`, + LIST: (projectName: IProject['project_name']) => `${API.PROJECT_BACKENDS.BASE(projectName)}/list`, + CREATE: (projectName: IProject['project_name']) => `${API.PROJECT_BACKENDS.BASE(projectName)}/create`, + UPDATE: (projectName: IProject['project_name']) =>
`${API.PROJECT_BACKENDS.BASE(projectName)}/update`, + DELETE: (projectName: IProject['project_name']) => `${API.PROJECT_BACKENDS.BASE(projectName)}/delete`, + BACKEND_CONFIG_INFO: (projectName: IProject['project_name'], backendName: string) => + `${API.PROJECT_BACKENDS.BASE(projectName)}/${backendName}/config_info`, + CREATE_YAML: (projectName: IProject['project_name']) => `${API.PROJECT_BACKENDS.BASE(projectName)}/create_yaml`, + UPDATE_YAML: (projectName: IProject['project_name']) => `${API.PROJECT_BACKENDS.BASE(projectName)}/update_yaml`, + GET_YAML: (projectName: IProject['project_name'], backendName: string) => + `${API.PROJECT_BACKENDS.BASE(projectName)}/${backendName}/get_yaml`, + }, + + PROJECT_GATEWAYS: { + BASE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/gateways`, + LIST: (projectName: IProject['project_name']) => `${API.PROJECT_GATEWAYS.BASE(projectName)}/list`, + CREATE: (projectName: IProject['project_name']) => `${API.PROJECT_GATEWAYS.BASE(projectName)}/create`, + DELETE: (projectName: IProject['project_name']) => `${API.PROJECT_GATEWAYS.BASE(projectName)}/delete`, + DETAILS: (projectName: IProject['project_name']) => `${API.PROJECT_GATEWAYS.BASE(projectName)}/get`, + SET_DEFAULT: (projectName: IProject['project_name']) => `${API.PROJECT_GATEWAYS.BASE(projectName)}/set_default`, + SET_WILDCARD_DOMAIN: (projectName: IProject['project_name']) => + `${API.PROJECT_GATEWAYS.BASE(projectName)}/set_wildcard_domain`, + + // TEST_DOMAIN: (projectName: IProject['project_name'], instanceName: string) => + // `${API.PROJECT_GATEWAYS.DETAILS(projectName, instanceName)}/test_domain`, + }, + + RUNS: { + BASE: () => `${API.BASE()}/runs`, + LIST: () => `${API.RUNS.BASE()}/list`, + }, + + FLEETS: { + BASE: () => `${API.BASE()}/fleets`, + LIST: () => `${API.FLEETS.BASE()}/list`, + }, + + INSTANCES: { + BASE: () => `${API.BASE()}/instances`, + LIST: () => `${API.INSTANCES.BASE()}/list`, + DETAILS: (projectName: 
IProject['project_name']) => `${API.BASE()}/project/${projectName}/instances/get`, + }, + + SERVER: { + BASE: () => `${API.BASE()}/server`, + INFO: () => `${API.SERVER.BASE()}/get_info`, + }, + + VOLUME: { + BASE: () => `${API.BASE()}/volumes`, + LIST: () => `${API.VOLUME.BASE()}/list`, + }, + + USER_PUBLIC_KEYS: { + BASE: () => `${API.BASE()}/users/public_keys`, + LIST: () => `${API.USER_PUBLIC_KEYS.BASE()}/list`, + ADD: () => `${API.USER_PUBLIC_KEYS.BASE()}/add`, + DELETE: () => `${API.USER_PUBLIC_KEYS.BASE()}/delete`, + }, +}; diff --git a/frontend/src/assets/css/index.css b/frontend/src/assets/css/index.css new file mode 100644 index 0000000000..d309dec325 --- /dev/null +++ b/frontend/src/assets/css/index.css @@ -0,0 +1,4 @@ +.b-page-header { + position: relative; + z-index: 1002; +} diff --git a/gateway/src/tests/core/__init__.py b/frontend/src/assets/css/mixins.css similarity index 100% rename from gateway/src/tests/core/__init__.py rename to frontend/src/assets/css/mixins.css diff --git a/frontend/src/assets/css/variables.css b/frontend/src/assets/css/variables.css new file mode 100644 index 0000000000..76e0780a3a --- /dev/null +++ b/frontend/src/assets/css/variables.css @@ -0,0 +1,8 @@ +@custom-media --mobile (width <= 640px); +@custom-media --no-mobile (width > 640px); +@custom-media --no-desktop (width < 980px); +@custom-media --desktop (width >= 980px); + +:root { + +} diff --git a/src/dstack/_internal/core/backends/remote/__init__.py b/frontend/src/assets/icons/.gitkeep similarity index 100% rename from src/dstack/_internal/core/backends/remote/__init__.py rename to frontend/src/assets/icons/.gitkeep diff --git a/frontend/src/assets/icons/entraID.svg b/frontend/src/assets/icons/entraID.svg new file mode 100644 index 0000000000..0b8d726be7 --- /dev/null +++ b/frontend/src/assets/icons/entraID.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/frontend/src/assets/icons/github.svg b/frontend/src/assets/icons/github.svg new file mode 100644 index 
0000000000..31b3210206 --- /dev/null +++ b/frontend/src/assets/icons/github.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/assets/icons/google.svg b/frontend/src/assets/icons/google.svg new file mode 100644 index 0000000000..b352dde5d2 --- /dev/null +++ b/frontend/src/assets/icons/google.svg @@ -0,0 +1 @@ + diff --git a/frontend/src/assets/icons/okta.svg b/frontend/src/assets/icons/okta.svg new file mode 100644 index 0000000000..b36435a638 --- /dev/null +++ b/frontend/src/assets/icons/okta.svg @@ -0,0 +1,2 @@ + + diff --git a/frontend/src/assets/icons/theme.svg b/frontend/src/assets/icons/theme.svg new file mode 100644 index 0000000000..b3c4d3b3bc --- /dev/null +++ b/frontend/src/assets/icons/theme.svg @@ -0,0 +1,3 @@ + + + diff --git a/src/dstack/_internal/server/background/tasks/__init__.py b/frontend/src/assets/images/.gitkeep similarity index 100% rename from src/dstack/_internal/server/background/tasks/__init__.py rename to frontend/src/assets/images/.gitkeep diff --git a/docs/assets/images/dstack-logo-notext.png b/frontend/src/assets/images/favicon.png similarity index 100% rename from docs/assets/images/dstack-logo-notext.png rename to frontend/src/assets/images/favicon.png diff --git a/frontend/src/assets/images/logo.svg b/frontend/src/assets/images/logo.svg new file mode 100644 index 0000000000..f02e82d421 --- /dev/null +++ b/frontend/src/assets/images/logo.svg @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/frontend/src/components/Button/index.tsx b/frontend/src/components/Button/index.tsx new file mode 100644 index 0000000000..770fc6bf7d --- /dev/null +++ b/frontend/src/components/Button/index.tsx @@ -0,0 +1,22 @@ +import React from 'react'; +import classNames from 'classnames'; +import ButtonGeneral, { ButtonProps } from 
'@cloudscape-design/components/button'; + +import styles from './styles.module.scss'; + +type Variant = ButtonProps.Variant | 'danger-normal'; + +export interface IProps extends Omit { + variant?: Variant; +} + +export const Button: React.FC = ({ children, variant, ...props }) => { + const componentVariant: ButtonProps.Variant | undefined = variant === 'danger-normal' ? 'normal' : variant; + return ( + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + + {children} + + ); +}; diff --git a/frontend/src/components/Button/styles.module.scss b/frontend/src/components/Button/styles.module.scss new file mode 100644 index 0000000000..02157f7200 --- /dev/null +++ b/frontend/src/components/Button/styles.module.scss @@ -0,0 +1,15 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.button { + &.danger-normal { + &:not([disabled]) { + color: awsui.$color-charts-red-600 !important; + border-color: awsui.$color-charts-red-600 !important; + + &:hover { + color: awsui.$color-charts-red-500 !important; + border-color: awsui.$color-charts-red-500 !important; + } + } + } +} diff --git a/frontend/src/components/ButtonWithConfirmation/index.tsx b/frontend/src/components/ButtonWithConfirmation/index.tsx new file mode 100644 index 0000000000..56ae78ad59 --- /dev/null +++ b/frontend/src/components/ButtonWithConfirmation/index.tsx @@ -0,0 +1,56 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import Box from '@cloudscape-design/components/box'; + +import { Button } from '../Button'; +import { ConfirmationDialog } from '../ConfirmationDialog'; + +import { IProps } from './types'; + +export const ButtonWithConfirmation: React.FC = ({ + confirmTitle, + confirmContent, + onClick, + confirmButtonLabel, + ...props +}) => { + const { t } = useTranslation(); + const [showDeleteConfirm, setShowConfirmDelete] = useState(false); + + const toggleDeleteConfirm = () => { + setShowConfirmDelete((val) => !val); + }; + + 
const onConfirm = () => { + if (onClick) onClick(); + + setShowConfirmDelete(false); + }; + + const getContent = () => { + if (!confirmContent) { + return {t('confirm_dialog.message')}; + } + + if (typeof confirmContent === 'string') { + return {confirmContent}; + } + + return confirmContent; + }; + + return ( + <> + + + + + + } + > + {content} + + ); +}; diff --git a/frontend/src/components/ConfirmationDialog/slice.ts b/frontend/src/components/ConfirmationDialog/slice.ts new file mode 100644 index 0000000000..d662c6b5fa --- /dev/null +++ b/frontend/src/components/ConfirmationDialog/slice.ts @@ -0,0 +1,34 @@ +import type { RootState } from 'store'; +import { createSlice, PayloadAction } from '@reduxjs/toolkit'; + +import { IProps as ConfirmationDialogProps } from './types'; + +type ConfirmationDialogPropsWithUuid = ConfirmationDialogProps & { uuid: string }; + +type ConfirmationDialogsStata = { + dialogs: Array; +}; + +const initialState: ConfirmationDialogsStata = { + dialogs: [], +}; + +export const confirmationSlice = createSlice({ + name: 'confirmation', + initialState, + + reducers: { + open: (state, action: PayloadAction) => { + state.dialogs = [...state.dialogs, action.payload]; + }, + close: (state, action: PayloadAction) => { + state.dialogs = state.dialogs.filter((i) => i.uuid !== action.payload); + }, + }, +}); + +export const { open, close } = confirmationSlice.actions; + +export const selectConfirmationDialogs = (state: RootState) => state.confirmation.dialogs; + +export default confirmationSlice.reducer; diff --git a/src/dstack/_internal/server/services/backends/configurators/__init__.py b/frontend/src/components/ConfirmationDialog/styles.module.scss similarity index 100% rename from src/dstack/_internal/server/services/backends/configurators/__init__.py rename to frontend/src/components/ConfirmationDialog/styles.module.scss diff --git a/frontend/src/components/ConfirmationDialog/types.ts b/frontend/src/components/ConfirmationDialog/types.ts new file 
mode 100644 index 0000000000..cdfb6bd070 --- /dev/null +++ b/frontend/src/components/ConfirmationDialog/types.ts @@ -0,0 +1,14 @@ +import { ReactNode } from 'react'; + +import { ButtonProps } from 'components'; + +export interface IProps { + title?: string; + content?: ReactNode; + visible?: boolean; + onDiscard: ButtonProps['onClick']; + onConfirm: ButtonProps['onClick']; + + cancelButtonLabel?: string; + confirmButtonLabel?: string; +} diff --git a/frontend/src/components/DetailsHeader/index.tsx b/frontend/src/components/DetailsHeader/index.tsx new file mode 100644 index 0000000000..b10df3ec5b --- /dev/null +++ b/frontend/src/components/DetailsHeader/index.tsx @@ -0,0 +1,43 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import Button from '@cloudscape-design/components/button'; +import Header from '@cloudscape-design/components/header'; +import SpaceBetween from '@cloudscape-design/components/space-between'; + +import { IProps } from './types'; + +export const DetailsHeader: React.FC = ({ + title, + actionButtons, + editAction, + deleteAction, + editDisabled, + deleteDisabled, +}) => { + const { t } = useTranslation(); + + return ( +
+ {actionButtons} + + {editAction && ( + + )} + + {deleteAction && ( + + )} + + } + > + {title} +
+ ); +}; diff --git a/frontend/src/components/DetailsHeader/types.ts b/frontend/src/components/DetailsHeader/types.ts new file mode 100644 index 0000000000..c917b58553 --- /dev/null +++ b/frontend/src/components/DetailsHeader/types.ts @@ -0,0 +1,10 @@ +import React from 'react'; + +export interface IProps { + title: React.ReactNode; + editAction?: () => void; + deleteAction?: () => void; + editDisabled?: boolean; + deleteDisabled?: boolean; + actionButtons?: React.ReactNode; +} diff --git a/frontend/src/components/FileUploader/FileEntry/index.tsx b/frontend/src/components/FileUploader/FileEntry/index.tsx new file mode 100644 index 0000000000..21ba14a2f7 --- /dev/null +++ b/frontend/src/components/FileUploader/FileEntry/index.tsx @@ -0,0 +1,62 @@ +import React, { useState } from 'react'; +import Box from '@cloudscape-design/components/box'; +import SpaceBetween from '@cloudscape-design/components/space-between'; +import StatusIndicator from '@cloudscape-design/components/status-indicator'; + +export interface IProps { + file: File; + showImage?: boolean; + truncateLength?: number; + i18nStrings?: { + numberOfBytes: (n: number) => string; + lastModified: (d: Date) => string; + }; +} + +export const FileEntry: React.FC = ({ file, showImage = false, truncateLength = 20, i18nStrings }) => { + const [imageData, setImageData] = useState(); + + const reader = new FileReader(); + reader.onload = (event) => { + setImageData(event.target.result as string); + }; + reader.readAsDataURL(file); + + const ext = file.name.split('.').pop()!; + const displayFileName = + file.name.length - ext.length - 1 > truncateLength ? `${file.name.slice(0, truncateLength)}... .${ext}` : file.name; + const lastModifiedDate = new Date(file.lastModified); + const fileSize = file.size; + + return ( + + + {showImage && ( + {file.name} + )} + + {displayFileName} + + {/* eslint-disable-next-line no-constant-binary-expression */} + {false && ( + <> + {i18nStrings ? 
i18nStrings.numberOfBytes(fileSize) : `${fileSize} bytes`} + + {i18nStrings + ? i18nStrings.lastModified(lastModifiedDate) + : `Last modified: ${lastModifiedDate.toDateString()}`} + + + )} + + + ); +}; diff --git a/frontend/src/components/FileUploader/Token/index.tsx b/frontend/src/components/FileUploader/Token/index.tsx new file mode 100644 index 0000000000..0f5bcf6ae5 --- /dev/null +++ b/frontend/src/components/FileUploader/Token/index.tsx @@ -0,0 +1,23 @@ +import React, { ReactNode } from 'react'; +import Button from '@cloudscape-design/components/button'; +import SpaceBetween from '@cloudscape-design/components/space-between'; + +import styles from './styles.module.scss'; + +export interface IProps { + children: ReactNode; + onClose?: () => void; +} + +export const Token: React.FC = ({ children, onClose }) => { + return ( +
+
{children}
+ {!!onClose && ( + +
+ ); +}; diff --git a/frontend/src/components/FileUploader/Token/styles.module.scss b/frontend/src/components/FileUploader/Token/styles.module.scss new file mode 100644 index 0000000000..09e14fa8d7 --- /dev/null +++ b/frontend/src/components/FileUploader/Token/styles.module.scss @@ -0,0 +1,8 @@ +.tokenPanel { + background-color: var(--color-background-item-selected-ebt4bi); + display: flex; + border: 0.2em solid var(--color-border-item-selected-ppkssz); + padding: 1em; + margin: 1em; + border-radius: var(--border-radius-token-wohc9e); +} diff --git a/frontend/src/components/FileUploader/index.tsx b/frontend/src/components/FileUploader/index.tsx new file mode 100644 index 0000000000..c41f971a6e --- /dev/null +++ b/frontend/src/components/FileUploader/index.tsx @@ -0,0 +1,99 @@ +import React from 'react'; +import Box from '@cloudscape-design/components/box'; +import Button from '@cloudscape-design/components/button'; +import FormField from '@cloudscape-design/components/form-field'; + +import { FileEntry } from './FileEntry'; +import { Token } from './Token'; + +export interface IProp { + fileInputId: string; + text: string; + label?: string; + description?: string; + info?: React.ReactNode; + constraintText?: string; + errorText?: string; + files: File[]; + accept?: string; + onFilesUploaded: (uploadedFiles: File[]) => void; + onFileRemoved?: (fileIdx: number) => void; + multiple?: boolean; + i18nStrings?: { + numberOfBytes: (n: number) => string; + lastModified: (d: Date) => string; + }; +} + +export const FileUploader = ({ + fileInputId, + text, + label, + description, + info, + constraintText, + errorText, + files, + onFilesUploaded, + onFileRemoved, + multiple = false, + i18nStrings, + accept, +}: IProp) => { + return ( + <> + + + + + {files && + files.length > 0 && + (multiple ? ( + Array.from(files).map((file, fileIdx) => ( +
+ { + onFileRemoved(fileIdx); + } + : undefined + } + > + + +
+ )) + ) : ( + + ))} +
+ + ); +}; diff --git a/frontend/src/components/Hotspot/index.tsx b/frontend/src/components/Hotspot/index.tsx new file mode 100644 index 0000000000..12fa1ffffe --- /dev/null +++ b/frontend/src/components/Hotspot/index.tsx @@ -0,0 +1,14 @@ +import React from 'react'; +import HotspotGeneral, { HotspotProps } from '@cloudscape-design/components/hotspot'; + +export interface IProps extends HotspotProps { + renderHotspot?: boolean; +} + +export const Hotspot: React.FC = ({ renderHotspot = true, children, ...props }) => { + if (!renderHotspot) { + return children; + } + + return {children}; +}; diff --git a/frontend/src/components/InfoLink/index.tsx b/frontend/src/components/InfoLink/index.tsx new file mode 100644 index 0000000000..e795c6c0fc --- /dev/null +++ b/frontend/src/components/InfoLink/index.tsx @@ -0,0 +1,15 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import Link from '@cloudscape-design/components/link'; + +import { IProps } from './types'; + +export const InfoLink: React.FC = (props) => { + const { t } = useTranslation(); + + return ( + + {t('common.info')} + + ); +}; diff --git a/frontend/src/components/InfoLink/types.ts b/frontend/src/components/InfoLink/types.ts new file mode 100644 index 0000000000..07f19efe82 --- /dev/null +++ b/frontend/src/components/InfoLink/types.ts @@ -0,0 +1,7 @@ +import { LinkProps } from '@cloudscape-design/components/link'; + +export interface IProps { + id?: string; + ariaLabel?: string; + onFollow: LinkProps['onFollow']; +} diff --git a/frontend/src/components/ListEmptyMessage/index.tsx b/frontend/src/components/ListEmptyMessage/index.tsx new file mode 100644 index 0000000000..e62a643d29 --- /dev/null +++ b/frontend/src/components/ListEmptyMessage/index.tsx @@ -0,0 +1,16 @@ +import React from 'react'; +import Box from '@cloudscape-design/components/box'; + +import { IProps } from './types'; + +export const ListEmptyMessage: React.FC = ({ title, message, children }) => { + return ( + + 
{title && {title}} + + {message} + + {children} + + ); +}; diff --git a/frontend/src/components/ListEmptyMessage/types.ts b/frontend/src/components/ListEmptyMessage/types.ts new file mode 100644 index 0000000000..03c2d2ccb3 --- /dev/null +++ b/frontend/src/components/ListEmptyMessage/types.ts @@ -0,0 +1,7 @@ +import { ReactNode } from 'react'; + +export interface IProps { + title?: string; + message?: string; + children?: ReactNode; +} diff --git a/frontend/src/components/Loader/index.tsx b/frontend/src/components/Loader/index.tsx new file mode 100644 index 0000000000..9c6eddb92f --- /dev/null +++ b/frontend/src/components/Loader/index.tsx @@ -0,0 +1,33 @@ +import React from 'react'; +import classNames from 'classnames'; +import Box, { BoxProps } from '@cloudscape-design/components/box'; +import SpaceBetween from '@cloudscape-design/components/space-between'; +import Spinner from '@cloudscape-design/components/spinner'; + +import styles from './styles.module.scss'; + +export interface Props { + show?: boolean; + className?: string; + padding?: BoxProps['padding']; + loadingText?: string; +} + +export const Loader: React.FC = ({ + className, + show = true, + loadingText = 'Loading', + padding = { vertical: 'xxxl' }, +}) => { + return ( +
+ + + + + {loadingText} + + +
+ ); +}; diff --git a/frontend/src/components/Loader/styles.module.scss b/frontend/src/components/Loader/styles.module.scss new file mode 100644 index 0000000000..9160174513 --- /dev/null +++ b/frontend/src/components/Loader/styles.module.scss @@ -0,0 +1,9 @@ +.loader { + display: flex; + justify-content: center; + transition: opacity .2s ease; + + &:not(.show) { + opacity: 0; + } +} diff --git a/frontend/src/components/NavigateLink/index.tsx b/frontend/src/components/NavigateLink/index.tsx new file mode 100644 index 0000000000..7bac639685 --- /dev/null +++ b/frontend/src/components/NavigateLink/index.tsx @@ -0,0 +1,21 @@ +import React from 'react'; +import { useNavigate } from 'react-router-dom'; +import Link, { LinkProps } from '@cloudscape-design/components/link'; + +import styles from './style.module.scss'; + +export const NavigateLink: React.FC = ({ onFollow, ...props }) => { + const navigate = useNavigate(); + const onFollowHandler: LinkProps['onFollow'] = (event) => { + event.preventDefault(); + + if (onFollow) onFollow(event); + if (event.detail.href) navigate(event.detail.href); + }; + + return ( + + + + ); +}; diff --git a/frontend/src/components/NavigateLink/style.module.scss b/frontend/src/components/NavigateLink/style.module.scss new file mode 100644 index 0000000000..27eca2e170 --- /dev/null +++ b/frontend/src/components/NavigateLink/style.module.scss @@ -0,0 +1,5 @@ +.link { + & > a { + text-decoration: underline !important; + } +} diff --git a/frontend/src/components/Notifications/index.tsx b/frontend/src/components/Notifications/index.tsx new file mode 100644 index 0000000000..190788dda2 --- /dev/null +++ b/frontend/src/components/Notifications/index.tsx @@ -0,0 +1,12 @@ +import React from 'react'; +import Flashbar from '@cloudscape-design/components/flashbar'; + +import { useAppSelector } from 'hooks'; + +import { selectNotifications } from './slice'; + +export const Notifications: React.FC = () => { + const notifications = 
useAppSelector(selectNotifications); + + return ; +}; diff --git a/frontend/src/components/Notifications/slice.ts b/frontend/src/components/Notifications/slice.ts new file mode 100644 index 0000000000..bf51846193 --- /dev/null +++ b/frontend/src/components/Notifications/slice.ts @@ -0,0 +1,30 @@ +import type { RootState } from 'store'; +import { createSlice, PayloadAction } from '@reduxjs/toolkit'; + +import { Notification } from './types'; + +interface NotificationsState { + items: Notification[]; +} + +const initialState: NotificationsState = { + items: [], +}; + +export const notificationsSlice = createSlice({ + name: 'notifications', + initialState, + + reducers: { + push: (state, action: PayloadAction) => { + state.items = [...state.items, action.payload]; + }, + remove: (state, action: PayloadAction) => { + state.items = state.items.filter((i) => i.id !== action.payload); + }, + }, +}); + +export const { remove, push } = notificationsSlice.actions; +export const selectNotifications = (state: RootState) => state.notifications.items; +export default notificationsSlice.reducer; diff --git a/frontend/src/components/Notifications/types.ts b/frontend/src/components/Notifications/types.ts new file mode 100644 index 0000000000..b406b6e8dc --- /dev/null +++ b/frontend/src/components/Notifications/types.ts @@ -0,0 +1,3 @@ +import { FlashbarProps } from '@cloudscape-design/components/flashbar'; + +export type Notification = FlashbarProps.MessageDefinition; diff --git a/frontend/src/components/PermissionGuard/index.tsx b/frontend/src/components/PermissionGuard/index.tsx new file mode 100644 index 0000000000..4e73ca51eb --- /dev/null +++ b/frontend/src/components/PermissionGuard/index.tsx @@ -0,0 +1,13 @@ +import React from 'react'; + +import { usePermissionGuard } from 'hooks'; + +import { IProps } from './types'; + +export const PermissionGuard: React.FC = ({ children, ...props }) => { + const [isAvailable] = usePermissionGuard(props); + + if (!isAvailable) return null; 
+ + return <>{children}; +}; diff --git a/frontend/src/components/PermissionGuard/types.ts b/frontend/src/components/PermissionGuard/types.ts new file mode 100644 index 0000000000..bf85822f39 --- /dev/null +++ b/frontend/src/components/PermissionGuard/types.ts @@ -0,0 +1,10 @@ +import React from 'react'; + +import { GlobalUserRole, ProjectUserRole } from 'types'; + +export interface IProps { + allowedGlobalRoles?: GlobalUserRole[]; + allowedProjectRoles?: ProjectUserRole[]; + projectRole?: string; + children: React.ReactNode; +} diff --git a/frontend/src/components/Tabs/index.tsx b/frontend/src/components/Tabs/index.tsx new file mode 100644 index 0000000000..e51196db9c --- /dev/null +++ b/frontend/src/components/Tabs/index.tsx @@ -0,0 +1,46 @@ +import React, { useMemo } from 'react'; +import { useLocation, useNavigate } from 'react-router-dom'; +import classNames from 'classnames'; +import type { TabsProps } from '@cloudscape-design/components/tabs'; +import GeneralTabs from '@cloudscape-design/components/tabs'; + +import styles from './styles.module.scss'; + +export interface IProps extends TabsProps { + className?: string; + withNavigation?: boolean; +} + +export const Tabs: React.FC = ({ className, withNavigation, onChange, activeTabId: activeTabIdProp, ...props }) => { + const navigate = useNavigate(); + const { pathname } = useLocation(); + + const hasContent = useMemo(() => { + return props.tabs.some((tab) => !!tab.content); + }, [props.tabs]); + + const activeTabId = useMemo(() => { + if (activeTabIdProp) return activeTabIdProp; + + if (withNavigation) { + const tab = props.tabs.find((t) => pathname === t.href); + return tab?.id; + } + }, [pathname, activeTabIdProp]); + + const onChangeTab: TabsProps['onChange'] = (event) => { + if (withNavigation) { + const { detail } = event; + + navigate(detail.activeTabHref!); + } + + if (onChange) onChange(event); + }; + + return ( +
+ +
+ ); +}; diff --git a/frontend/src/components/Tabs/styles.module.scss b/frontend/src/components/Tabs/styles.module.scss new file mode 100644 index 0000000000..f527927646 --- /dev/null +++ b/frontend/src/components/Tabs/styles.module.scss @@ -0,0 +1,9 @@ +.tabs { + &:not(.hasContent) { + :global { + [class^="awsui_tabs-content-wrapper"] { + display: none; + } + } + } +} diff --git a/frontend/src/components/form/Cards/index.tsx b/frontend/src/components/form/Cards/index.tsx new file mode 100644 index 0000000000..17b12f1934 --- /dev/null +++ b/frontend/src/components/form/Cards/index.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import Cards from '@cloudscape-design/components/cards'; +import { CardsProps } from '@cloudscape-design/components/cards'; + +import { FormCardsProps } from './types'; + +export const FormCards = ({ + name, + control, + onSelectionChange: onSelectionChangeProp, + ...props +}: FormCardsProps) => { + return ( + { + const onSelectionChange: CardsProps['onSelectionChange'] = (event) => { + onChange(event.detail.selectedItems.map(({ value }) => value)); + onSelectionChangeProp?.(event); + }; + + const selectedItems = props.items.filter((item) => fieldRest.value?.includes(item.value)); + + return ( + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Cards/types.ts b/frontend/src/components/form/Cards/types.ts new file mode 100644 index 0000000000..857ac77a5b --- /dev/null +++ b/frontend/src/components/form/Cards/types.ts @@ -0,0 +1,7 @@ +import { Control, FieldValues, Path } from 'react-hook-form'; +import { CardsProps } from '@cloudscape-design/components/cards'; + +export type FormCardsProps = CardsProps & { + control: Control; + name: Path; +}; diff --git a/frontend/src/components/form/Checkbox/index.tsx b/frontend/src/components/form/Checkbox/index.tsx new file mode 100644 index 0000000000..ac64f8f54c --- /dev/null +++ 
b/frontend/src/components/form/Checkbox/index.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import CheckboxCSD from '@cloudscape-design/components/checkbox'; +import FormField from '@cloudscape-design/components/form-field'; + +import { FormCheckboxProps } from './types'; + +export const FormCheckbox = ({ + name, + control, + rules, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + leftContent, + checkboxLabel, + onChange: onChangeProp, + ...props +}: FormCheckboxProps) => { + return ( + { + return ( + + {leftContent} + { + onChange(event.detail.checked); + onChangeProp?.(event); + }} + > + {checkboxLabel} + + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Checkbox/types.ts b/frontend/src/components/form/Checkbox/types.ts new file mode 100644 index 0000000000..22b474bf71 --- /dev/null +++ b/frontend/src/components/form/Checkbox/types.ts @@ -0,0 +1,11 @@ +import { ReactNode } from 'react'; +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { CheckboxProps } from '@cloudscape-design/components/checkbox'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; + +export type FormCheckboxProps = Omit & + Omit & + Pick, 'control' | 'name' | 'rules'> & { + leftContent?: ReactNode; + checkboxLabel?: string; + }; diff --git a/frontend/src/components/form/CodeEditor/index.tsx b/frontend/src/components/form/CodeEditor/index.tsx new file mode 100644 index 0000000000..254c960d00 --- /dev/null +++ b/frontend/src/components/form/CodeEditor/index.tsx @@ -0,0 +1,51 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; + +import { CodeEditor } from '../../CodeEditor'; + +import { FormCodeEditorProps } from './types'; + +export const FormCodeEditor = ({ + name, + control, + rules, + label, + info, + constraintText, + 
description, + secondaryControl, + stretch, + onChange: onChangeProp, + ...props +}: FormCodeEditorProps) => { + return ( + { + return ( + + { + onChange(event.detail.value); + onChangeProp?.(event); + }} + /> + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/CodeEditor/types.ts b/frontend/src/components/form/CodeEditor/types.ts new file mode 100644 index 0000000000..baedd567b8 --- /dev/null +++ b/frontend/src/components/form/CodeEditor/types.ts @@ -0,0 +1,11 @@ +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; + +import { CodeEditorProps } from '../../CodeEditor'; + +export type FormCodeEditorProps = Omit< + CodeEditorProps, + 'value' | 'name' | 'i18nStrings' | 'ace' | 'onPreferencesChange' | 'preferences' +> & + Omit & + Pick, 'control' | 'name' | 'rules'>; diff --git a/frontend/src/components/form/Input/index.tsx b/frontend/src/components/form/Input/index.tsx new file mode 100644 index 0000000000..b0c745b105 --- /dev/null +++ b/frontend/src/components/form/Input/index.tsx @@ -0,0 +1,76 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import Hotspot from '@cloudscape-design/components/hotspot'; +import InputCSD, { InputProps } from '@cloudscape-design/components/input'; + +import { FormInputProps } from './types'; + +export const FormInput = ({ + name, + control, + rules, + defaultValue, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + leftContent, + hotspotId, + onChange: onChangeProp, + ...props +}: FormInputProps) => { + const renderInput = (renderProps: InputProps) => { + return ; + }; + + return ( + { + return ( + + {leftContent} + + {hotspotId ? 
( + + {renderInput({ + ...fieldRest, + ...props, + invalid: !!error, + onChange: (event) => { + onChange(event.detail.value); + onChangeProp?.(event); + }, + })} + + ) : ( + renderInput({ + ...fieldRest, + ...props, + invalid: !!error, + onChange: (event) => { + onChange(event.detail.value); + onChangeProp?.(event); + }, + }) + )} + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Input/types.ts b/frontend/src/components/form/Input/types.ts new file mode 100644 index 0000000000..74db1fd036 --- /dev/null +++ b/frontend/src/components/form/Input/types.ts @@ -0,0 +1,11 @@ +import { ReactNode } from 'react'; +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { InputProps } from '@cloudscape-design/components/input'; + +export type FormInputProps = Omit & + Omit & + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { + leftContent?: ReactNode; + hotspotId?: string; + }; diff --git a/frontend/src/components/form/Multiselect/index.tsx b/frontend/src/components/form/Multiselect/index.tsx new file mode 100644 index 0000000000..93166c010b --- /dev/null +++ b/frontend/src/components/form/Multiselect/index.tsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import MultiselectCSD from '@cloudscape-design/components/multiselect'; +import { MultiselectProps } from '@cloudscape-design/components/multiselect/interfaces'; + +import { FormMultiselectProps } from './types'; + +export const FormMultiselect = ({ + name, + rules, + control, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + onChange: onChangeProp, + ...props +}: FormMultiselectProps) => { + return ( + { + const selectedOptions = props.options?.filter((i) => fieldRest.value?.includes(i.value)) ?? 
null; + + const onChangeSelect: MultiselectProps['onChange'] = (event) => { + const value = event.detail.selectedOptions.map((item) => item.value); + onChange(value); + onChangeProp?.(event); + }; + + return ( + + + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Multiselect/types.ts b/frontend/src/components/form/Multiselect/types.ts new file mode 100644 index 0000000000..310bc0bf4d --- /dev/null +++ b/frontend/src/components/form/Multiselect/types.ts @@ -0,0 +1,15 @@ +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { MultiselectProps } from '@cloudscape-design/components/multiselect'; + +export type FormMultiselectOption = MultiselectProps.Option; +export type FormMultiselectOptions = ReadonlyArray; + +export type FormMultiselectProps = Omit< + MultiselectProps, + 'value' | 'name' | 'selectedOptions' | 'options' +> & + Omit & + Pick, 'control' | 'name' | 'rules'> & { + options: ReadonlyArray; + }; diff --git a/frontend/src/components/form/RadioButtons/index.tsx b/frontend/src/components/form/RadioButtons/index.tsx new file mode 100644 index 0000000000..aa56652d62 --- /dev/null +++ b/frontend/src/components/form/RadioButtons/index.tsx @@ -0,0 +1,48 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import RadioGroup, { RadioGroupProps } from '@cloudscape-design/components/radio-group'; + +import { FormRadioButtonsProps } from './types'; + +export const FormRadioButtons = ({ + name, + rules, + control, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + onChange: onChangeProp, + ...props +}: FormRadioButtonsProps) => { + return ( + { + const onChangeSelect: RadioGroupProps['onChange'] = (event) => { + onChange(event.detail.value); + onChangeProp?.(event); + }; + + return ( + + + + ); + }} + /> + ); +}; diff --git 
a/frontend/src/components/form/RadioButtons/types.ts b/frontend/src/components/form/RadioButtons/types.ts new file mode 100644 index 0000000000..9184b4841e --- /dev/null +++ b/frontend/src/components/form/RadioButtons/types.ts @@ -0,0 +1,7 @@ +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { RadioGroupProps } from '@cloudscape-design/components/radio-group'; + +export type FormRadioButtonsProps = Omit & + Omit & + Pick, 'control' | 'name' | 'rules'>; diff --git a/frontend/src/components/form/S3BucketSelector/index.tsx b/frontend/src/components/form/S3BucketSelector/index.tsx new file mode 100644 index 0000000000..90b930af6e --- /dev/null +++ b/frontend/src/components/form/S3BucketSelector/index.tsx @@ -0,0 +1,94 @@ +import React, { useMemo, useRef } from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import cn from 'classnames'; +import FormField from '@cloudscape-design/components/form-field'; +import S3ResourceSelector from '@cloudscape-design/components/s3-resource-selector'; +import { S3ResourceSelectorProps } from '@cloudscape-design/components/s3-resource-selector'; + +import { getResourceSelectorI18n } from './utils'; + +import { FormS3BucketSelectorProps } from './types'; + +import styles from './styles.module.scss'; + +export const FormS3BucketSelector = ({ + name, + rules, + control, + label, + buckets: bucketsProp, + info, + constraintText, + description, + secondaryControl, + stretch, + onChange: onChangeProp, + disabled, + prefix = 's3://', + i18nStrings, + ...props +}: FormS3BucketSelectorProps) => { + const fetch = async () => Promise.resolve([]); + const lastValue = useRef(null); + + const buckets = useMemo(() => { + return bucketsProp.map((i) => ({ + Name: i.name, + CreationDate: i.created, + Region: i.region, + })); + }, [bucketsProp]); + + const fetchBuckets = (): Promise => Promise.resolve(buckets); + + const customProps = 
{ + bucketsVisibleColumns: ['Name'], + fetchBuckets: fetchBuckets, + fetchObjects: fetch, + fetchVersions: fetch, + i18nStrings: getResourceSelectorI18n(prefix, i18nStrings), + }; + + return ( + { + const resource = { uri: value }; + const onChangeSelect: S3ResourceSelectorProps['onChange'] = (event) => { + const bucket = event.detail.resource.uri.replace(/^s3:\/\//, ''); + + if (lastValue.current === bucket) return; + lastValue.current = bucket; + + onChange(bucket); + onChangeProp?.(event); + }; + + return ( + +
+ +
+
+ ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/S3BucketSelector/styles.module.scss b/frontend/src/components/form/S3BucketSelector/styles.module.scss new file mode 100644 index 0000000000..f0fb84b7f6 --- /dev/null +++ b/frontend/src/components/form/S3BucketSelector/styles.module.scss @@ -0,0 +1,24 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.bucketSelector { + &:global(.disabled) { + & * { + pointer-events: none!important; + } + & input { + background-color: awsui.$color-background-input-disabled !important; + border: 2px solid awsui.$color-background-input-disabled !important; + color: awsui.$color-text-input-disabled !important; + cursor: default !important; + } + + & button { + background-color: awsui.$color-background-input-disabled !important; + border-color: awsui.$color-border-button-normal-disabled !important; + color: awsui.$color-text-interactive-disabled; + text-decoration: none !important; + pointer-events: none !important; + cursor: auto !important; + } + } +} diff --git a/frontend/src/components/form/S3BucketSelector/types.ts b/frontend/src/components/form/S3BucketSelector/types.ts new file mode 100644 index 0000000000..82d2c2c732 --- /dev/null +++ b/frontend/src/components/form/S3BucketSelector/types.ts @@ -0,0 +1,16 @@ +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { S3ResourceSelectorProps } from '@cloudscape-design/components/s3-resource-selector'; + +export type FormS3BucketSelectorProps = Omit< + S3ResourceSelectorProps, + 'resource' | 'fetchBuckets' | 'fetchVersions' | 'fetchObjects' | 'name' | 'i18nStrings' +> & + Omit & + Pick, 'control' | 'name' | 'rules'> & { + prefix?: string; + label: string; + buckets: TAwsBucket[]; + disabled?: boolean; + i18nStrings?: Partial; + }; diff --git a/frontend/src/components/form/S3BucketSelector/utils.ts b/frontend/src/components/form/S3BucketSelector/utils.ts new file 
mode 100644 index 0000000000..8e24fb9187 --- /dev/null +++ b/frontend/src/components/form/S3BucketSelector/utils.ts @@ -0,0 +1,81 @@ +import { S3ResourceSelectorProps } from '@cloudscape-design/components/s3-resource-selector'; + +export const getResourceSelectorI18n = ( + prefix: string, + params?: Partial, +): S3ResourceSelectorProps['i18nStrings'] => ({ + inContextInputPlaceholder: 'my-bucket-name', + inContextSelectPlaceholder: '', + inContextBrowseButton: 'Browse S3', + inContextViewButton: 'View', + inContextViewButtonAriaLabel: 'View (opens in a new tab)', + inContextLoadingText: '', + inContextUriLabel: '', + inContextVersionSelectLabel: '', + modalTitle: 'Choose an archive in S3', + modalCancelButton: 'Cancel', + modalSubmitButton: 'Choose', + modalBreadcrumbRootItem: 'S3 buckets', + selectionBuckets: 'Buckets', + selectionObjects: '', + selectionVersions: '', + selectionBucketsSearchPlaceholder: 'Find bucket', + selectionObjectsSearchPlaceholder: 'Find object by prefix', + selectionVersionsSearchPlaceholder: 'Find version', + selectionBucketsLoading: 'Loading buckets', + selectionBucketsNoItems: 'No buckets', + selectionObjectsLoading: '', + selectionObjectsNoItems: '', + selectionVersionsLoading: '', + selectionVersionsNoItems: '', + filteringCounterText: (count: number) => '' + count + (count === 1 ? 
' match' : ' matches'), + filteringNoMatches: 'No matches', + filteringCantFindMatch: "We can't find a match.", + clearFilterButtonText: 'Clear filter', + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + columnBucketID: 'ID', + columnBucketName: 'Name', + columnBucketCreationDate: 'Creation date', + columnBucketRegion: 'Region', + columnBucketAccess: 'Access', + // columnObjectID: 'ID', + // columnObjectKey: 'Key', + // columnObjectLastModified: 'Last modified', + // columnObjectSize: 'Size', + // columnVersionID: 'Version ID', + // columnVersionLastModified: 'Last modified', + // columnVersionSize: 'Size', + validationPathMustBegin: `The path must begin with ${prefix}`, + validationBucketLowerCase: 'The bucket name must start with a lowercase character or number.', + validationBucketMustNotContain: 'The bucket name must not contain uppercase characters.', + validationBucketMustComplyDns: 'The bucket name must comply with DNS naming conventions', + validationBucketLength: 'The bucket name must be from 3 to 63 characters.', + labelSortedDescending: (columnName: string) => columnName + ', sorted descending', + labelSortedAscending: (columnName: string) => columnName + ', sorted ascending', + labelNotSorted: (columnName: string) => columnName + ', not sorted', + labelsPagination: { + nextPageLabel: 'Next page', + previousPageLabel: 'Previous page', + pageLabel: (pageNumber: number) => 'Page ' + pageNumber + ' of all pages', + }, + labelsBucketsSelection: { + itemSelectionLabel: (_, item) => item?.Name ?? 
'', + selectionGroupLabel: 'Buckets', + }, + labelsObjectsSelection: { + itemSelectionLabel: () => '', + selectionGroupLabel: 'Objects', + }, + labelsVersionsSelection: { + itemSelectionLabel: () => '', + selectionGroupLabel: 'Versions', + }, + labelFiltering: (itemsType: string) => 'Find ' + itemsType, + labelRefresh: 'Refresh the data', + labelAlertDismiss: 'Dismiss the alert', + labelModalDismiss: 'Dismiss the modal', + labelBreadcrumbs: 'S3 navigation', + + ...params, +}); diff --git a/frontend/src/components/form/Select/index.tsx b/frontend/src/components/form/Select/index.tsx new file mode 100644 index 0000000000..77dee60e11 --- /dev/null +++ b/frontend/src/components/form/Select/index.tsx @@ -0,0 +1,59 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import SelectCSD from '@cloudscape-design/components/select'; +import { SelectProps } from '@cloudscape-design/components/select/interfaces'; + +import { FormSelectProps } from './types'; + +export const FormSelect = ({ + name, + rules, + defaultValue, + control, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + onChange: onChangeProp, + ...props +}: FormSelectProps) => { + return ( + { + const selectedOption = props.options?.find((i) => i.value === fieldRest.value) ?? 
null; + + const onChangeSelect: SelectProps['onChange'] = (event) => { + onChange(event.detail.selectedOption.value); + onChangeProp?.(event); + }; + + return ( + + + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Select/types.ts b/frontend/src/components/form/Select/types.ts new file mode 100644 index 0000000000..91bfd32427 --- /dev/null +++ b/frontend/src/components/form/Select/types.ts @@ -0,0 +1,12 @@ +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { SelectProps } from '@cloudscape-design/components/select'; + +export type FormSelectOption = SelectProps.Option; +export type FormSelectOptions = ReadonlyArray; + +export type FormSelectProps = Omit & + Omit & + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { + options: ReadonlyArray; + }; diff --git a/frontend/src/components/form/Textarea/index.tsx b/frontend/src/components/form/Textarea/index.tsx new file mode 100644 index 0000000000..e34ba2e6b5 --- /dev/null +++ b/frontend/src/components/form/Textarea/index.tsx @@ -0,0 +1,53 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import TextareaCSD from '@cloudscape-design/components/textarea'; + +import { FormTextareaProps } from './types'; + +export const FormTextarea = ({ + name, + control, + rules, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + leftContent, + onChange: onChangeProp, + ...props +}: FormTextareaProps) => { + return ( + { + return ( + + {leftContent} + { + onChange(event.detail.value); + onChangeProp?.(event); + }} + /> + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Textarea/types.ts b/frontend/src/components/form/Textarea/types.ts new file mode 100644 index 0000000000..5614525cae --- /dev/null +++ b/frontend/src/components/form/Textarea/types.ts @@ -0,0 +1,10 
@@ +import { ReactNode } from 'react'; +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { TextareaProps } from '@cloudscape-design/components/textarea'; + +export type FormTextareaProps = Omit & + Omit & + Pick, 'control' | 'name' | 'rules'> & { + leftContent?: ReactNode; + }; diff --git a/frontend/src/components/form/Tiles/index.tsx b/frontend/src/components/form/Tiles/index.tsx new file mode 100644 index 0000000000..82515115fa --- /dev/null +++ b/frontend/src/components/form/Tiles/index.tsx @@ -0,0 +1,23 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import Tiles from '@cloudscape-design/components/tiles'; +import { TilesProps } from '@cloudscape-design/components/tiles'; + +import { FormTilesProps } from './types'; + +export const FormTiles = ({ name, control, onChange: onChangeProp, ...props }: FormTilesProps) => { + return ( + { + const onChangeSelect: TilesProps['onChange'] = (event) => { + onChange(event.detail.value); + onChangeProp?.(event); + }; + + return ; + }} + /> + ); +}; diff --git a/frontend/src/components/form/Tiles/types.ts b/frontend/src/components/form/Tiles/types.ts new file mode 100644 index 0000000000..4e9a28daff --- /dev/null +++ b/frontend/src/components/form/Tiles/types.ts @@ -0,0 +1,7 @@ +import { Control, FieldValues, Path } from 'react-hook-form'; +import { TilesProps } from '@cloudscape-design/components/tiles'; + +export type FormTilesProps = Omit & { + control: Control; + name: Path; +}; diff --git a/frontend/src/components/form/Toogle/index.module.scss b/frontend/src/components/form/Toogle/index.module.scss new file mode 100644 index 0000000000..b57b20a850 --- /dev/null +++ b/frontend/src/components/form/Toogle/index.module.scss @@ -0,0 +1,17 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.labelWithInfo { + display: inline-flex; + align-items: center; +} + +.divider { + 
display: inline-block; + height: 16px; + border-left: 1px solid awsui.$color-border-divider-default; + margin: 0 8px; +} + +.info { + display: inline-flex; +} diff --git a/frontend/src/components/form/Toogle/index.tsx b/frontend/src/components/form/Toogle/index.tsx new file mode 100644 index 0000000000..1cdb3c62ad --- /dev/null +++ b/frontend/src/components/form/Toogle/index.tsx @@ -0,0 +1,81 @@ +import React from 'react'; +import { Controller, FieldValues } from 'react-hook-form'; +import FormField from '@cloudscape-design/components/form-field'; +import ToggleCSD from '@cloudscape-design/components/toggle'; + +import { FormToggleProps } from './types'; + +import styles from './index.module.scss'; + +export const FormToggle = ({ + name, + control, + rules, + defaultValue, + label, + info, + constraintText, + description, + secondaryControl, + stretch, + leftContent, + toggleLabel, + onChange: onChangeProp, + toggleDescription, + toggleInfo, + errorText: externalErrorText, + ...props +}: FormToggleProps) => { + return ( + { + return ( + + {leftContent} + + { + onChange(event.detail.checked); + onChangeProp?.(event); + }} + description={toggleDescription} + > + {(toggleLabel || toggleInfo) && ( + + {toggleLabel} + {toggleLabel && toggleInfo && + )} + + + ); + }} + /> + ); +}; diff --git a/frontend/src/components/form/Toogle/types.ts b/frontend/src/components/form/Toogle/types.ts new file mode 100644 index 0000000000..bc037c23f7 --- /dev/null +++ b/frontend/src/components/form/Toogle/types.ts @@ -0,0 +1,13 @@ +import { ReactNode } from 'react'; +import { ControllerProps, FieldValues } from 'react-hook-form'; +import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { ToggleProps } from '@cloudscape-design/components/toggle'; + +export type FormToggleProps = Omit & + FormFieldProps & + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { + toggleDescription?: ReactNode; + leftContent?: ReactNode; + toggleLabel?: ReactNode | string; + 
toggleInfo?: ReactNode; + }; diff --git a/frontend/src/components/index.ts b/frontend/src/components/index.ts new file mode 100644 index 0000000000..6acd2a2b05 --- /dev/null +++ b/frontend/src/components/index.ts @@ -0,0 +1,104 @@ +export { default as Alert } from '@cloudscape-design/components/alert'; +export type { AlertProps } from '@cloudscape-design/components/alert'; +export { default as Icon } from '@cloudscape-design/components/icon'; +export { default as ButtonDropdown } from '@cloudscape-design/components/button-dropdown'; +export type { ButtonDropdownProps } from '@cloudscape-design/components/button-dropdown'; +export { default as AppLayout } from '@cloudscape-design/components/app-layout'; +export type { AppLayoutProps } from '@cloudscape-design/components/app-layout'; +export { default as SideNavigation } from '@cloudscape-design/components/side-navigation'; +export type { SideNavigationProps } from '@cloudscape-design/components/side-navigation'; +export { default as TopNavigation } from '@cloudscape-design/components/top-navigation'; +export type { TopNavigationProps } from '@cloudscape-design/components/top-navigation'; +export { default as Box } from '@cloudscape-design/components/box'; +export { default as SpaceBetween } from '@cloudscape-design/components/space-between'; +export { default as Container } from '@cloudscape-design/components/container'; +export { default as Spinner } from '@cloudscape-design/components/spinner'; +export { default as Cards } from '@cloudscape-design/components/cards'; +export type { CardsProps } from '@cloudscape-design/components/cards'; +export { default as Header } from '@cloudscape-design/components/header'; +export { default as Link } from '@cloudscape-design/components/link'; +export type { LinkProps } from '@cloudscape-design/components/link'; +export { default as TextFilter } from '@cloudscape-design/components/text-filter'; +export { default as Pagination } from '@cloudscape-design/components/pagination'; 
+export { default as Table } from '@cloudscape-design/components/table'; +export type { TableProps } from '@cloudscape-design/components/table'; +export { default as CollectionPreferences } from '@cloudscape-design/components/collection-preferences'; +export type { CollectionPreferencesProps } from '@cloudscape-design/components/collection-preferences'; +export { default as ContentLayout } from '@cloudscape-design/components/content-layout'; +export { default as ColumnLayout } from '@cloudscape-design/components/column-layout'; +export { default as BreadcrumbGroup } from '@cloudscape-design/components/breadcrumb-group'; +export type { BreadcrumbGroupProps } from '@cloudscape-design/components/breadcrumb-group'; +export { default as FormUI } from '@cloudscape-design/components/form'; +export { default as FormField } from '@cloudscape-design/components/form-field'; +export { default as CheckboxCSD } from '@cloudscape-design/components/checkbox'; +export { default as InputCSD } from '@cloudscape-design/components/input'; +export { default as SelectCSD } from '@cloudscape-design/components/select'; +export type { SelectProps as SelectCSDProps } from '@cloudscape-design/components/select'; +export { default as MultiselectCSD } from '@cloudscape-design/components/multiselect'; +export type { MultiselectProps } from '@cloudscape-design/components/multiselect'; +export { default as StatusIndicator } from '@cloudscape-design/components/status-indicator'; +export type { StatusIndicatorProps } from '@cloudscape-design/components/status-indicator'; +export { default as Popover } from '@cloudscape-design/components/popover'; +export { default as Autosuggest } from '@cloudscape-design/components/autosuggest'; +export type { AutosuggestProps } from '@cloudscape-design/components/autosuggest'; +export { default as Grid } from '@cloudscape-design/components/grid'; +export { default as HelpPanel } from '@cloudscape-design/components/help-panel'; +export type { HelpPanelProps } from 
'@cloudscape-design/components/help-panel'; +export { default as TextContent } from '@cloudscape-design/components/text-content'; +export { default as Toggle } from '@cloudscape-design/components/toggle'; +export type { ToggleProps } from '@cloudscape-design/components/toggle'; +export { default as Modal } from '@cloudscape-design/components/modal'; +export { default as TutorialPanel } from '@cloudscape-design/components/tutorial-panel'; +export type { TutorialPanelProps } from '@cloudscape-design/components/tutorial-panel'; +export { default as AnnotationContext } from '@cloudscape-design/components/annotation-context'; +export type { AnnotationContextProps } from '@cloudscape-design/components/annotation-context'; +export { default as ChatBubble } from '@cloudscape-design/chat-components/chat-bubble'; +export type { ChatBubbleProps } from '@cloudscape-design/chat-components/chat-bubble'; +export { default as Avatar } from '@cloudscape-design/chat-components/avatar'; +export type { AvatarProps } from '@cloudscape-design/chat-components/avatar'; +export { default as LineChart } from '@cloudscape-design/components/line-chart'; +export { default as PropertyFilter } from '@cloudscape-design/components/property-filter'; +export type { PropertyFilterProps } from '@cloudscape-design/components/property-filter'; +export type { LineChartProps } from '@cloudscape-design/components/line-chart/interfaces'; +export type { ModalProps } from '@cloudscape-design/components/modal'; +export { default as AnchorNavigation } from '@cloudscape-design/components/anchor-navigation'; +export { default as ExpandableSection } from '@cloudscape-design/components/expandable-section'; +export { default as KeyValuePairs } from '@cloudscape-design/components/key-value-pairs'; +export { I18nProvider } from '@cloudscape-design/components/i18n'; +export { default as Wizard } from '@cloudscape-design/components/wizard'; +export { default as SegmentedControl } from 
'@cloudscape-design/components/segmented-control'; +export type { SegmentedControlProps } from '@cloudscape-design/components/segmented-control'; + +// custom components +export { NavigateLink } from './NavigateLink'; +export { ListEmptyMessage } from './ListEmptyMessage'; +export { DetailsHeader } from './DetailsHeader'; +export { Loader } from './Loader'; +export { FormCheckbox } from './form/Checkbox'; +export { FormToggle } from './form/Toogle'; +export { FormInput } from './form/Input'; +export { FormMultiselect } from './form/Multiselect'; +export { FormSelect } from './form/Select'; +export { FormTextarea } from './form/Textarea'; +export { FormCodeEditor } from './form/CodeEditor'; +export { FormRadioButtons } from './form/RadioButtons'; +export type { FormSelectOptions, FormSelectProps } from './form/Select/types'; +export type { FormMultiselectOptions, FormMultiselectProps } from './form/Multiselect/types'; +export { FormS3BucketSelector } from './form/S3BucketSelector'; +export type { FormTilesProps } from './form/Tiles/types'; +export { FormTiles } from './form/Tiles'; +export type { FormCardsProps } from './form/Cards/types'; +export { FormCards } from './form/Cards'; +export { Notifications } from './Notifications'; +export { ConfirmationDialog } from './ConfirmationDialog'; +export { CodeEditor } from './CodeEditor'; +export type { CodeEditorProps } from './CodeEditor'; +export { FileUploader } from './FileUploader'; +export { InfoLink } from './InfoLink'; +export { ButtonWithConfirmation } from './ButtonWithConfirmation'; +export { Tabs } from './Tabs'; +export { Button } from './Button'; +export type { IProps as ButtonProps } from './Button'; +export { Code } from './Code'; +export { Hotspot } from './Hotspot'; +export type { IProps as TabsProps } from './Tabs'; diff --git a/frontend/src/consts.ts b/frontend/src/consts.ts new file mode 100644 index 0000000000..06715082d1 --- /dev/null +++ b/frontend/src/consts.ts @@ -0,0 +1,6 @@ +export const 
DATE_TIME_FORMAT = 'MM/dd/yyyy HH:mm'; +export const DISCORD_URL = 'https://discord.gg/u8SmfwPpMd'; +export const QUICK_START_URL = 'https://dstack.ai/docs/quickstart/'; +export const TALLY_FORM_ID = '3xYlYG'; +export const DOCS_URL = 'https://dstack.ai/docs/'; +export const DEFAULT_TABLE_PAGE_SIZE = 20; diff --git a/frontend/src/consts/index.ts b/frontend/src/consts/index.ts new file mode 100644 index 0000000000..663e7dedc1 --- /dev/null +++ b/frontend/src/consts/index.ts @@ -0,0 +1 @@ +export const TEMP = 'temp'; diff --git a/frontend/src/hooks/index.ts b/frontend/src/hooks/index.ts new file mode 100644 index 0000000000..19a109d52c --- /dev/null +++ b/frontend/src/hooks/index.ts @@ -0,0 +1,12 @@ +export { default as useAppDispatch } from './useAppDispatch'; +export { default as useAppSelector } from './useAppSelector'; +export { useBreadcrumbs } from './useBreadcrumbs'; +export { useNotifications } from './useNotifications'; +export { useConfirmationDialog } from './useConfirmationDialog'; +export { useHelpPanel } from './useHelpPanel'; +export { usePermissionGuard } from './usePermissionGuard'; +export { useInfiniteScroll } from './useInfiniteScroll'; +export { useLocalStorageState } from './useLocalStorageState'; + +// cloudscape +export { useCollection } from '@cloudscape-design/collection-hooks'; diff --git a/frontend/src/hooks/useAppDispatch.ts b/frontend/src/hooks/useAppDispatch.ts new file mode 100644 index 0000000000..5abcc639ac --- /dev/null +++ b/frontend/src/hooks/useAppDispatch.ts @@ -0,0 +1,6 @@ +import { useDispatch } from 'react-redux'; +import { AppDispatch } from 'store'; + +const useAppDispatch = () => useDispatch(); + +export default useAppDispatch; diff --git a/frontend/src/hooks/useAppSelector.ts b/frontend/src/hooks/useAppSelector.ts new file mode 100644 index 0000000000..a125d0e9f2 --- /dev/null +++ b/frontend/src/hooks/useAppSelector.ts @@ -0,0 +1,6 @@ +import 
{ TypedUseSelectorHook, useSelector } from 'react-redux'; +import { RootState } from 'store'; + +const useAppSelector: TypedUseSelectorHook = useSelector; + +export default useAppSelector; diff --git a/frontend/src/hooks/useBreadcrumbs.ts b/frontend/src/hooks/useBreadcrumbs.ts new file mode 100644 index 0000000000..7d47520bca --- /dev/null +++ b/frontend/src/hooks/useBreadcrumbs.ts @@ -0,0 +1,17 @@ +import { useEffect } from 'react'; + +import { setBreadcrumb } from 'App/slice'; + +import useAppDispatch from './useAppDispatch'; + +export const useBreadcrumbs = (breadcrumbs: TBreadcrumb[]) => { + const dispatch = useAppDispatch(); + + useEffect(() => { + dispatch(setBreadcrumb(breadcrumbs)); + + return () => { + dispatch(setBreadcrumb(null)); + }; + }, [breadcrumbs]); +}; diff --git a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts new file mode 100644 index 0000000000..b330358b46 --- /dev/null +++ b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts @@ -0,0 +1,40 @@ +import { useMemo } from 'react'; + +import { useGetOnlyNoFleetsProjectsQuery, useGetProjectsQuery } from 'services/project'; + +type Args = { projectNames?: IProject['project_name'][] }; + +export const useCheckingForFleetsInProjects = ({ projectNames }: Args) => { + const { data: projectsData } = useGetProjectsQuery( + {}, + { + skip: !!projectNames?.length, + }, + ); + + const { data: noFleetsProjectsData } = useGetOnlyNoFleetsProjectsQuery(); + + const projectNameForChecking = useMemo(() => { + if (projectNames) { + return projectNames; + } + + if (projectsData?.data) { + return projectsData.data.map((project) => project.project_name); + } + + return []; + }, [projectNames, projectsData]); + + const projectHavingFleetMap = useMemo>(() => { + const map: Record = {}; + + projectNameForChecking.forEach((projectName) => { + map[projectName] = !noFleetsProjectsData?.some((i) => i.project_name === projectName); + }); + + 
return map; + }, [projectNameForChecking, noFleetsProjectsData]); + + return projectHavingFleetMap; +}; diff --git a/frontend/src/hooks/useConfirmationDialog.ts b/frontend/src/hooks/useConfirmationDialog.ts new file mode 100644 index 0000000000..f6ba67cdc5 --- /dev/null +++ b/frontend/src/hooks/useConfirmationDialog.ts @@ -0,0 +1,27 @@ +import { close, open } from 'components/ConfirmationDialog/slice'; +import { IProps as ConfirmationDialogProps } from 'components/ConfirmationDialog/types'; + +import { getUid } from '../libs'; +import useAppDispatch from './useAppDispatch'; + +export const useConfirmationDialog = () => { + const dispatch = useAppDispatch(); + + const onDiscard = (uuid: string) => { + dispatch(close(uuid)); + }; + + const openConfirmationDialog = (props: Omit) => { + const uuid = getUid(); + + dispatch( + open({ + uuid, + ...props, + onDiscard: () => onDiscard(uuid), + }), + ); + }; + + return [openConfirmationDialog]; +}; diff --git a/frontend/src/hooks/useHelpPanel.ts b/frontend/src/hooks/useHelpPanel.ts new file mode 100644 index 0000000000..fb88916065 --- /dev/null +++ b/frontend/src/hooks/useHelpPanel.ts @@ -0,0 +1,15 @@ +import { openHelpPanel } from 'App/slice'; + +import useAppDispatch from './useAppDispatch'; + +import { THelpPanelContent } from 'App/types'; + +export const useHelpPanel = () => { + const dispatch = useAppDispatch(); + + const openPanel = (content: THelpPanelContent) => { + dispatch(openHelpPanel(content)); + }; + + return [openPanel]; +}; diff --git a/frontend/src/hooks/useInfiniteScroll.ts b/frontend/src/hooks/useInfiniteScroll.ts new file mode 100644 index 0000000000..9064e6b346 --- /dev/null +++ b/frontend/src/hooks/useInfiniteScroll.ts @@ -0,0 +1,175 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +import { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'; +import { isEqual } from 'lodash'; +import { UseLazyQuery /*, UseQueryStateOptions*/ } from 
'@reduxjs/toolkit/dist/query/react/buildHooks'; +import { QueryDefinition } from '@reduxjs/toolkit/query'; + +const SCROLL_POSITION_GAP = 400; + +type InfinityListArgs = Partial>; + +type ListResponse = DataItem[]; +type ResponseWithDataProp = { data: ListResponse; total_count: number }; + +type LazyQueryResponse = ResponseWithDataProp | ListResponse; + +type UseInfinityParams = { + useLazyQuery: UseLazyQuery, any>>; + args: { limit?: number } & Args; + getResponseItems?: (listItem: DataItem) => Partial; + getPaginationParams: (listItem: DataItem) => Partial; + skip?: boolean; + // options?: UseQueryStateOptions, Record>; +}; + +export const useInfiniteScroll = ({ + useLazyQuery, + getPaginationParams, + // options, + args, + skip, +}: UseInfinityParams) => { + const [data, setData] = useState>([]); + const [totalCount, setTotalCount] = useState(); + const scrollElement = useRef(document.documentElement); + const isLoadingRef = useRef(false); + const isDisabledMoreRef = useRef(false); + const lastRequestParams = useRef(undefined); + const { limit, ...argsProp } = args; + const lastArgsProps = useRef>(null); + + const [getItems, { isLoading, isFetching }] = useLazyQuery({ ...args } as Args); + + const getDataRequest = (params: Args) => { + const request = getItems({ + limit, + ...params, + } as Args).unwrap(); + + /* Side branch that only records the params of the last successful request. Callers handle failures on the returned promise, so swallow them here to avoid an unhandled rejection. 
*/ + request + .then(() => { + lastRequestParams.current = { ...params }; + }) + .catch(() => undefined); + + return request; + }; + + const getEmptyList = () => { + isLoadingRef.current = true; + + setData([]); + + getDataRequest(argsProp as Args) + .then((result: LazyQueryResponse) => { + // setDisabledMore(false); + isDisabledMoreRef.current = false; + + if ('data' in result) { + setData(result.data as ListResponse); + setTotalCount(result.total_count); + } else { + setData(result as ListResponse); + setTotalCount(); + } + + isLoadingRef.current = false; + }) + .catch((e) => { + /* Without this reset a failed first page would leave the loading flag set, and getMore/onScroll would bail out until a manual refresh. */ + console.log(e); + isLoadingRef.current = false; + }); + }; + + useEffect(() => { + if (!isEqual(argsProp, lastArgsProps.current) && !skip) { + getEmptyList(); + lastArgsProps.current = 
argsProp as Args; + } + }, [argsProp, lastArgsProps, skip]); + + const getMore = async () => { + if (isLoadingRef.current || isDisabledMoreRef.current || skip) { + return; + } + + const requestParams = { + ...argsProp, + ...getPaginationParams(data[data.length - 1]), + }; + + if (!isEqual(requestParams, lastRequestParams.current)) { + try { + isLoadingRef.current = true; + + const result = await getDataRequest(requestParams as Args); + + let listResponse: ListResponse; + + if ('data' in result) { + listResponse = result.data; + setTotalCount(result.total_count); + } else { + listResponse = result; + setTotalCount(); + } + + if (listResponse.length > 0) { + setData((prev) => [...prev, ...listResponse]); + } else { + isDisabledMoreRef.current = true; + } + } catch (e) { + console.log(e); + } + + setTimeout(() => { + isLoadingRef.current = false; + }, 50); + } + }; + + useLayoutEffect(() => { + const element = scrollElement.current; + + if (isLoadingRef.current || !data.length) return; + + if (element.scrollHeight - element.clientHeight <= 0) { + getMore().catch(console.log); + } + }, [data]); + + const onScroll = useCallback(() => { + if (isDisabledMoreRef.current || isLoadingRef.current) { + return; + } + + const element = scrollElement.current; + + const scrollPositionFromBottom = element.scrollHeight - (element.clientHeight + element.scrollTop); + + if (scrollPositionFromBottom < SCROLL_POSITION_GAP) { + getMore().catch(console.log); + } + }, [getMore]); + + useEffect(() => { + document.addEventListener('scroll', onScroll); + + return () => { + document.removeEventListener('scroll', onScroll); + }; + }, [onScroll]); + + const isLoadingMore = data.length > 0 && isFetching; + + return { + data, + totalCount, + isLoading: isLoading || (data.length === 0 && isFetching), + isLoadingMore, + refreshList: getEmptyList, + } as const; +}; diff --git a/frontend/src/hooks/useIsMounted.ts b/frontend/src/hooks/useIsMounted.ts new file mode 100644 index 0000000000..8eb727af34 
--- /dev/null +++ b/frontend/src/hooks/useIsMounted.ts @@ -0,0 +1,17 @@ +import { useCallback, useEffect, useRef } from 'react'; + +function useIsMounted() { + const isMounted = useRef(false); + + useEffect(() => { + isMounted.current = true; + + return () => { + isMounted.current = false; + }; + }, []); + + return useCallback(() => isMounted.current, []); +} + +export default useIsMounted; diff --git a/frontend/src/hooks/useLocalStorageState.ts b/frontend/src/hooks/useLocalStorageState.ts new file mode 100644 index 0000000000..c2decd3ddd --- /dev/null +++ b/frontend/src/hooks/useLocalStorageState.ts @@ -0,0 +1,37 @@ +import { useCallback, useEffect, useState } from 'react'; + +export const useLocalStorageState = (key: string, defaultState?: T): [T, (state: T) => void] => { + const storageItem = localStorage.getItem(key); + const [state, setState] = useState(storageItem ? JSON.parse(storageItem) : defaultState); + + useEffect(() => { + const listener = (event: StorageEvent) => { + if (event.key === key) { + setState(event.newValue ? 
JSON.parse(event.newValue) : defaultState); + } + }; + + window.addEventListener('storage', listener); + + return () => { + window.removeEventListener('storage', listener); + }; + }, [key, defaultState]); + + const setStorage = useCallback( + (newState: T) => { + const storageState = JSON.stringify(newState); + + window.dispatchEvent( + new StorageEvent('storage', { + key, + newValue: storageState, + }), + ); + localStorage.setItem(key, storageState); + }, + [key], + ); + + return [state, setStorage]; +}; diff --git a/frontend/src/hooks/useNotifications.ts b/frontend/src/hooks/useNotifications.ts new file mode 100644 index 0000000000..61189303eb --- /dev/null +++ b/frontend/src/hooks/useNotifications.ts @@ -0,0 +1,65 @@ +import { useEffect, useRef } from 'react'; + +import { push, remove } from 'components/Notifications/slice'; +import { Notification } from 'components/Notifications/types'; + +import { getUid } from 'libs'; + +import useAppDispatch from './useAppDispatch'; + +const NOTIFICATION_LIFE_TIME = 6000; + +type TUseNotificationsArgs = { temporary?: boolean; liveTime?: number } | undefined; + +const defaultArgs: NonNullable> = { temporary: true, liveTime: NOTIFICATION_LIFE_TIME }; + +export const useNotifications = (args: TUseNotificationsArgs = defaultArgs) => { + const dispatch = useAppDispatch(); + const notificationIdsSet = useRef(new Set>()); + + const { temporary, liveTime } = { + ...defaultArgs, + ...args, + }; + + const removeNotification = (id: NonNullable) => { + dispatch(remove(id)); + + if (notificationIdsSet.current.has(id)) { + notificationIdsSet.current.delete(id); + } + }; + + const pushNotification = (notification: Omit) => { + const id = getUid(); + + dispatch( + push({ + id, + ...notification, + dismissible: true, + onDismiss: () => { + removeNotification(id); + }, + }), + ); + + if (temporary) { + setTimeout(() => { + removeNotification(id); + }, liveTime); + } else { + notificationIdsSet.current.add(id); + } + }; + + useEffect(() => { + 
return () => { + notificationIdsSet.current.forEach((notificationId) => { + removeNotification(notificationId); + }); + }; + }, []); + + return [pushNotification]; +}; diff --git a/frontend/src/hooks/useOnClickOutside.ts b/frontend/src/hooks/useOnClickOutside.ts new file mode 100644 index 0000000000..cd36d38b03 --- /dev/null +++ b/frontend/src/hooks/useOnClickOutside.ts @@ -0,0 +1,24 @@ +import React, { useEffect } from 'react'; + +function useOnClickOutside( + ref: React.RefObject, + handler: (event: MouseEvent | TouchEvent) => void, +) { + useEffect(() => { + const listener = (event: MouseEvent | TouchEvent) => { + if (!ref.current || ref.current.contains(event.target as Node)) return; + + handler(event); + }; + + document.addEventListener('mousedown', listener); + document.addEventListener('touchstart', listener); + + return () => { + document.removeEventListener('mousedown', listener); + document.removeEventListener('touchstart', listener); + }; + }, [ref, handler]); +} + +export default useOnClickOutside; diff --git a/frontend/src/hooks/usePermissionGuard.ts b/frontend/src/hooks/usePermissionGuard.ts new file mode 100644 index 0000000000..c794403a5a --- /dev/null +++ b/frontend/src/hooks/usePermissionGuard.ts @@ -0,0 +1,42 @@ +import { useMemo } from 'react'; + +import { selectUserData } from 'App/slice'; + +import useAppSelector from './useAppSelector'; + +import { GlobalUserRole, ProjectUserRole, UserPermission } from '../types'; + +interface Args { + allowedGlobalRoles?: GlobalUserRole[]; + allowedProjectRoles?: ProjectUserRole[]; + allowedPermissions?: UserPermission[]; + projectRole?: string; +} +/** Returns `[isAvailableContent]`: true when the user's global role, the supplied project role, or any of the user's permissions appears in the matching allowed list. An empty or missing allowed list never grants access. 
*/ +export const usePermissionGuard = ({ allowedGlobalRoles, allowedProjectRoles, allowedPermissions, projectRole }: Args) => { + const userData = useAppSelector(selectUserData); + const userGlobalRole = userData?.global_role ?? ''; + const userPermissions = userData?.permissions ?? 
[]; + + const isAvailableForGlobalUser = useMemo(() => { + if (!allowedGlobalRoles?.length) return false; + + return allowedGlobalRoles.includes(userGlobalRole as GlobalUserRole); + }, [allowedGlobalRoles, userGlobalRole]); + + const isAvailableForProjectUser = useMemo(() => { + if (!allowedProjectRoles?.length) return false; + + return allowedProjectRoles.includes(projectRole as ProjectUserRole); + }, [allowedProjectRoles, projectRole]); + + const hasPermission = useMemo(() => { + if (!allowedPermissions?.length) return false; + + return userPermissions.some((userPermission) => allowedPermissions.includes(userPermission as UserPermission)); + }, [allowedPermissions, userPermissions]); + + const isAvailableContent = isAvailableForGlobalUser || isAvailableForProjectUser || hasPermission; + + return [isAvailableContent] as const; +}; diff --git a/frontend/src/hooks/usePrevious.ts b/frontend/src/hooks/usePrevious.ts new file mode 100644 index 0000000000..dadc595ee5 --- /dev/null +++ b/frontend/src/hooks/usePrevious.ts @@ -0,0 +1,13 @@ +import { useEffect, useRef } from 'react'; + +function usePrevious(value: T): T { + const ref = useRef(value); + + useEffect(() => { + ref.current = value; + }, [value]); + + return ref.current; +} + +export default usePrevious; diff --git a/frontend/src/hooks/useProjectFilter.ts b/frontend/src/hooks/useProjectFilter.ts new file mode 100644 index 0000000000..5c54fd28e0 --- /dev/null +++ b/frontend/src/hooks/useProjectFilter.ts @@ -0,0 +1,45 @@ +import { useEffect, useMemo } from 'react'; + +import { SelectCSDProps } from 'components'; + +import { useGetProjectsQuery } from 'services/project'; + +import { useLocalStorageState } from './useLocalStorageState'; + +type Args = { + localStorePrefix: string; +}; + +export const useProjectFilter = ({ localStorePrefix }: Args) => { + const [selectedProject, setSelectedProject] = useLocalStorageState( + `${localStorePrefix}-project_name`, + null, + ); + + const { data: projectsData, isLoading } 
= useGetProjectsQuery({}); + + const projectOptions = useMemo(() => { + if (!projectsData?.data?.length) return []; + + return projectsData.data.map((project) => ({ label: project.project_name, value: project.project_name })); + }, [projectsData]); + + useEffect(() => { + if (!projectsData?.data || !selectedProject) { + return; + } + + const hasSelectedProject = projectsData.data.some(({ project_name }) => selectedProject?.value === project_name); + + if (!hasSelectedProject) { + setSelectedProject(null); + } + }, [projectsData]); + + return { + projectOptions, + selectedProject, + setSelectedProject, + isLoadingProjectOptions: isLoading, + } as const; +}; diff --git a/frontend/src/index.tsx b/frontend/src/index.tsx new file mode 100644 index 0000000000..c02da1084c --- /dev/null +++ b/frontend/src/index.tsx @@ -0,0 +1,46 @@ +import React from 'react'; +import { Provider as BusProvider } from 'react-bus'; +import { createRoot } from 'react-dom/client'; +import { Provider } from 'react-redux'; +import { RouterProvider } from 'react-router-dom'; +import { applyTheme, Theme } from '@cloudscape-design/components/theming'; + +import { router } from './router'; +import { store } from './store'; + +import '@cloudscape-design/global-styles/index.css'; +import 'ace-builds/css/ace.css'; +import 'ace-builds/css/theme/cloud_editor.css'; +import 'ace-builds/css/theme/cloud_editor_dark.css'; +import 'assets/css/index.css'; + +import 'locale'; + +const container = document.getElementById('root'); + +const theme: Theme = { + tokens: { + fontFamilyBase: + 'metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif', + fontSizeHeadingS: '15px', + fontSizeHeadingL: '19px', + fontSizeHeadingXl: '22px', + fontSizeDisplayL: '40px', + }, +}; + +applyTheme({ theme }); + +if (container) { + const root = createRoot(container); + + root.render( + + + + + + + , + ); +} diff --git 
a/frontend/src/layouts/AppLayout/AnnotationContext/index.tsx b/frontend/src/layouts/AppLayout/AnnotationContext/index.tsx new file mode 100644 index 0000000000..4ecde1c8a7 --- /dev/null +++ b/frontend/src/layouts/AppLayout/AnnotationContext/index.tsx @@ -0,0 +1,61 @@ +import React, { PropsWithChildren, useMemo, useState } from 'react'; + +import { AnnotationContext as AnnotationContextGeneric, AnnotationContextProps } from 'components'; + +import { useAppDispatch, useAppSelector } from 'hooks'; + +import { selectToolsPanelState, setToolsTab } from 'App/slice'; + +import { overlayI18nStrings } from '../TutorialPanel/constants'; +import { useTutorials } from '../TutorialPanel/hooks'; + +import { ITutorialItem, ToolsTabs } from 'App/types'; + +export const AnnotationContext: React.FC = ({ children }) => { + const dispatch = useAppDispatch(); + const [activeTutorialId, setActiveTutorialId] = useState(); + const { tab } = useAppSelector(selectToolsPanelState); + + const { tutorials } = useTutorials(); + + const currentTutorial = useMemo(() => { + if (!activeTutorialId) return null; + + return tutorials.find((t) => t.id === activeTutorialId) ?? 
null; + }, [activeTutorialId, tutorials]); + + const onStepChange: AnnotationContextProps['onStepChange'] = () => { + if (tab !== ToolsTabs.TUTORIAL) dispatch(setToolsTab(ToolsTabs.TUTORIAL)); + }; + + return ( + { + const tutorial = event.detail.tutorial as ITutorialItem; + + if (tutorial.startCallback) { + tutorial.startCallback(tutorial); + } + + if (!tutorial.startWithoutActivation) { + setActiveTutorialId(tutorial.id); + } + }} + onExitTutorial={() => { + setActiveTutorialId(undefined); + }} + onFinish={() => { + if (currentTutorial && currentTutorial.finishCallback) { + currentTutorial.finishCallback(currentTutorial); + } + + setActiveTutorialId(undefined); + }} + i18nStrings={overlayI18nStrings} + > + {children} + + ); +}; diff --git a/frontend/src/layouts/AppLayout/Tally/index.tsx b/frontend/src/layouts/AppLayout/Tally/index.tsx new file mode 100644 index 0000000000..638b3259bb --- /dev/null +++ b/frontend/src/layouts/AppLayout/Tally/index.tsx @@ -0,0 +1,16 @@ +import React, { useEffect } from 'react'; + +export const TallyComponent: React.FC = () => { + useEffect(() => { + const widgetScriptSrc = 'https://tally.so/widgets/embed.js'; + + if (document.querySelector(`script[src="${widgetScriptSrc}"]`) === null) { + const script = document.createElement('script'); + script.src = widgetScriptSrc; + document.body.appendChild(script); + return; + } + }, []); + + return null; +}; diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx b/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx new file mode 100644 index 0000000000..4ee909c0cf --- /dev/null +++ b/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx @@ -0,0 +1,177 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT-0 +import React from 'react'; + +import { AnnotationContextProps, TutorialPanelProps } from 'components'; +import { Box } from 'components'; + +export const tutorialPanelI18nStrings: TutorialPanelProps.I18nStrings = { + labelsTaskStatus: { pending: 'Pending', 'in-progress': 'In progress', success: 'Success' }, + loadingText: 'Loading', + tutorialListTitle: '', + tutorialListDescription: 'Follow the tutorials below to get up to speed with dstack Sky.', + tutorialListDownloadLinkText: 'Download PDF version', + tutorialCompletedText: 'Completed', + labelExitTutorial: 'dismiss tutorial', + learnMoreLinkText: 'Learn more', + startTutorialButtonText: 'Start', + restartTutorialButtonText: 'Restart', + completionScreenTitle: 'Congratulations! You completed it.', + feedbackLinkText: 'Feedback', + dismissTutorialButtonText: 'Dismiss', + taskTitle: (taskIndex, taskTitle) => `Task ${taskIndex + 1}: ${taskTitle}`, + stepTitle: (stepIndex, stepTitle) => `Step ${stepIndex + 1}: ${stepTitle}`, + labelTotalSteps: (totalStepCount) => `Total steps: ${totalStepCount}`, + labelLearnMoreExternalIcon: 'Opens in a new tab', + labelTutorialListDownloadLink: 'Download PDF version of this tutorial', + labelLearnMoreLink: 'Learn more about transcribe audio (opens new tab)', +}; + +export const overlayI18nStrings: AnnotationContextProps.I18nStrings = { + stepCounterText: (stepIndex, totalStepCount) => `Step ${stepIndex + 1}/${totalStepCount}`, + taskTitle: (taskIndex, taskTitle) => `Task ${taskIndex + 1}: ${taskTitle}`, + labelHotspot: (openState, stepIndex, totalStepCount) => + openState + ? 
`close annotation for step ${stepIndex + 1} of ${totalStepCount}` + : `open annotation for step ${stepIndex + 1} of ${totalStepCount}`, + nextButtonText: 'Next', + previousButtonText: 'Previous', + finishButtonText: 'Finish', + labelDismissAnnotation: 'hide annotation', +}; + +export enum HotspotIds { + ADD_TOP_UP_BALANCE = 'billing-top-up-balance', + PAYMENT_CONTINUE_BUTTON = 'billing-payment-continue-button', + INSTALL_CLI_COMMAND = 'install-cli-command', + CONFIGURE_CLI_COMMAND = 'configure-cli-command', + CREATE_FIRST_PROJECT = 'create-first-project', +} + +export const BILLING_TUTORIAL: TutorialPanelProps.Tutorial = { + completed: false, + title: 'Billing', + description: ( + <> + + If you plan to use the GPU marketplace, top up your balance with a credit card. + + + ), + completedScreenDescription: 'TBA', + tasks: [ + { + title: 'Add payment method', + steps: [ + { + title: 'Click Top up balance button', + content: 'Click Top up balance button', + hotspotId: HotspotIds.ADD_TOP_UP_BALANCE, + }, + { + title: 'Click continue', + content: 'Please, click continue', + hotspotId: HotspotIds.PAYMENT_CONTINUE_BUTTON, + }, + ], + }, + ], +}; + +export const CONFIGURE_CLI_TUTORIAL: TutorialPanelProps.Tutorial = { + completed: false, + title: 'CLI', + prerequisitesAlert: 'Please, create a project before set up the CLI', + description: ( + <> + + Configure the CLI on your local machine to submit workload to dstack Sky. 
+ + + ), + completedScreenDescription: 'TBA', + tasks: [ + { + title: 'Configure the CLI', + steps: [ + { + title: 'Run the CLI install command', + content: 'Run this command on your local machine to install the CLI.', + hotspotId: HotspotIds.INSTALL_CLI_COMMAND, + }, + { + title: 'Run the dstack project add command', + content: 'Run this command on your local machine to configure the dstack CLI.', + hotspotId: HotspotIds.CONFIGURE_CLI_COMMAND, + }, + ], + }, + ], +}; + +export const CREATE_FIRST_PROJECT: TutorialPanelProps.Tutorial = { + completed: false, + title: 'Project', + description: ( + <> + + Create your first project. Choose to use the GPU marketplace or configure your own cloud credentials. + + + ), + completedScreenDescription: 'TBA', + tasks: [ + { + title: 'Create the first project', + steps: [ + { + title: 'Create the first project', + content: 'Create the first project', + hotspotId: HotspotIds.CREATE_FIRST_PROJECT, + }, + ], + }, + ], +}; + +export const JOIN_DISCORD_TUTORIAL: TutorialPanelProps.Tutorial = { + completed: false, + title: 'Discord', + description: ( + <> + + Need help or want to chat with other users of dstack? Join our Discord server! + + + ), + completedScreenDescription: 'TBA', + tasks: [], +}; + +export const QUICKSTART_TUTORIAL: TutorialPanelProps.Tutorial = { + completed: false, + title: 'Quickstart', + description: ( + <> + + Check out the quickstart guide to get started with dstack + + + ), + completedScreenDescription: 'TBA', + tasks: [], +}; + +export const CREDITS_TUTORIAL: TutorialPanelProps.Tutorial = { + completed: false, + title: 'Get free credits', + description: ( + <> + + Tell us about your project and get some free credits to try dstack Sky! 
+ + + ), + completedScreenDescription: 'TBA', + tasks: [], +}; diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts new file mode 100644 index 0000000000..305a711a05 --- /dev/null +++ b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts @@ -0,0 +1,214 @@ +import { useCallback, useEffect, useMemo, useRef } from 'react'; +import { useLocation, useNavigate } from 'react-router-dom'; + +import { + DISCORD_URL, + QUICK_START_URL, + // TALLY_FORM_ID +} from 'consts'; +import { useAppDispatch, useAppSelector } from 'hooks'; +import { goToUrl } from 'libs'; +import { ROUTES } from 'routes'; +import { useGetProjectsQuery } from 'services/project'; +import { useGetRunsQuery } from 'services/run'; +import { useGetUserBillingInfoQuery } from 'services/user'; + +import { openTutorialPanel, selectTutorialPanel, selectUserName, updateTutorialPanelState } from 'App/slice'; + +import { useSideNavigation } from '../hooks'; +import { + BILLING_TUTORIAL, + CONFIGURE_CLI_TUTORIAL, + CREATE_FIRST_PROJECT, + // CREDITS_TUTORIAL, + JOIN_DISCORD_TUTORIAL, + QUICKSTART_TUTORIAL, +} from './constants'; + +import { ITutorialItem } from 'App/types'; + +export const useTutorials = () => { + const navigate = useNavigate(); + const location = useLocation(); + const dispatch = useAppDispatch(); + const { billingUrl } = useSideNavigation(); + const userName = useAppSelector(selectUserName); + const { + billingCompleted, + createProjectCompleted, + configureCLICompleted, + discordCompleted, + tallyCompleted, + quickStartCompleted, + hideStartUp, + } = useAppSelector(selectTutorialPanel); + + const { data: userBillingData } = useGetUserBillingInfoQuery({ username: userName ?? 
'' }, { skip: !userName }); + const { data: projectData } = useGetProjectsQuery({}); + const { data: runsData } = useGetRunsQuery({ + limit: 1, + }); + + const completeIsChecked = useRef(false); + + useEffect(() => { + if ( + userBillingData && + projectData?.data && + runsData && + !completeIsChecked.current && + location.pathname !== ROUTES.PROJECT.ADD + ) { + const billingCompleted = userBillingData.balance > 0; + const configureCLICompleted = runsData.length > 0; + const createProjectCompleted = projectData.data.length > 0; + + let tempHideStartUp = hideStartUp; + + if (hideStartUp === null) { + tempHideStartUp = billingCompleted && configureCLICompleted && createProjectCompleted; + } + + // Set hideStartUp without updating localstorage + dispatch( + updateTutorialPanelState({ + billingCompleted, + configureCLICompleted, + createProjectCompleted, + hideStartUp: tempHideStartUp, + }), + ); + + if (!tempHideStartUp && process.env.UI_VERSION === 'sky') { + dispatch(openTutorialPanel()); + } + + completeIsChecked.current = true; + } + }, [userBillingData, runsData, projectData, location.pathname]); + + useEffect(() => { + if (projectData?.data && projectData.data.length > 0 && !createProjectCompleted) { + dispatch( + updateTutorialPanelState({ + createProjectCompleted: true, + }), + ); + } + }, [projectData]); + + const startBillingTutorial = useCallback(() => { + navigate(billingUrl); + }, [billingUrl]); + + const finishBillingTutorial = useCallback(() => { + dispatch(updateTutorialPanelState({ billingCompleted: true })); + }, []); + + const startFirstProjectTutorial = useCallback(() => { + navigate(ROUTES.PROJECT.ADD); + }, []); + + const finishFirstProjectTutorial = useCallback(() => { + dispatch(updateTutorialPanelState({ createProjectCompleted: true })); + }, []); + + const startConfigCliTutorial = useCallback(() => { + if (projectData?.data?.length) { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(projectData.data[0].project_name)); + } + }, [projectData]); 
+ + const finishConfigCliTutorial = useCallback(() => { + dispatch(updateTutorialPanelState({ configureCLICompleted: true })); + }, []); + + const startDiscordTutorial = useCallback(() => { + goToUrl(DISCORD_URL, true); + dispatch(updateTutorialPanelState({ discordCompleted: true })); + }, []); + + const startQuickStartTutorial = useCallback(() => { + goToUrl(QUICK_START_URL, true); + dispatch(updateTutorialPanelState({ quickStartCompleted: true })); + }, []); + + // const startCreditsTutorial = useCallback(() => { + // if (typeof Tally !== 'undefined') { + // Tally.openPopup(TALLY_FORM_ID); + // dispatch(updateTutorialPanelState({ tallyCompleted: true })); + // } + // }, []); + + /* Every start/finish callback is listed as a dependency so the items never hold a stale closure (startConfigCliTutorial is re-created whenever projectData changes). */ + const tutorials = useMemo(() => { + return [ + // { + // ...CREDITS_TUTORIAL, + // id: 1, + // startWithoutActivation: true, + // completed: tallyCompleted, + // startCallback: startCreditsTutorial, + // }, + + { + ...CREATE_FIRST_PROJECT, + id: 2, + completed: createProjectCompleted, + startCallback: startFirstProjectTutorial, + finishCallback: finishFirstProjectTutorial, + }, + + { + ...CONFIGURE_CLI_TUTORIAL, + id: 3, + completed: configureCLICompleted, + startCallback: startConfigCliTutorial, + finishCallback: finishConfigCliTutorial, + prerequisitesNeeded: !createProjectCompleted, + }, + + { + ...QUICKSTART_TUTORIAL, + id: 5, + startWithoutActivation: true, + completed: quickStartCompleted, + startCallback: startQuickStartTutorial, + }, + + { + ...JOIN_DISCORD_TUTORIAL, + id: 6, + startWithoutActivation: true, + completed: discordCompleted, + startCallback: startDiscordTutorial, + }, + + { + ...BILLING_TUTORIAL, + id: 4, + completed: billingCompleted, + startCallback: startBillingTutorial, + finishCallback: finishBillingTutorial, + }, + ]; + }, [ + billingUrl, + createProjectCompleted, + quickStartCompleted, + 
discordCompleted, + tallyCompleted, + billingCompleted, + configureCLICompleted, + startFirstProjectTutorial, + startConfigCliTutorial, + startQuickStartTutorial, + startDiscordTutorial, + startBillingTutorial, + finishFirstProjectTutorial, + finishBillingTutorial, + finishConfigCliTutorial, + ]); + + 
return { tutorials } as const; +}; diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/index.tsx b/frontend/src/layouts/AppLayout/TutorialPanel/index.tsx new file mode 100644 index 0000000000..9a614d14e4 --- /dev/null +++ b/frontend/src/layouts/AppLayout/TutorialPanel/index.tsx @@ -0,0 +1,54 @@ +import React, { useLayoutEffect, useRef } from 'react'; +import { createRoot, Root } from 'react-dom/client'; + +import { Box, Toggle, TutorialPanel as TutorialPanelGeneric, TutorialPanelProps } from 'components'; + +import { useAppDispatch, useAppSelector } from 'hooks'; + +import { selectTutorialPanel, setHideAtStartup } from 'App/slice'; + +import { tutorialPanelI18nStrings } from './constants'; +import { useTutorials } from './hooks'; + +export interface Props extends Partial { + test?: string; +} + +export const TutorialPanel: React.FC = () => { + const dispatch = useAppDispatch(); + const { tutorials } = useTutorials(); + const tutorialRootRef = useRef(null); + const { hideStartUp } = useAppSelector(selectTutorialPanel); + + const onChangeShowStartUp = (value: boolean) => { + dispatch(setHideAtStartup(!value)); + }; + + const renderShowAtStartup = () => { + return ( + + onChangeShowStartUp(detail.checked)} checked={!hideStartUp}> + Show at startup + + + ); + }; + + useLayoutEffect(() => { + const tutorialPanelElement = document.querySelector('[class*="awsui_tutorial-panel"]'); + + if (tutorialPanelElement && !tutorialRootRef.current) { + const divElement = document.createElement('div'); + tutorialPanelElement.appendChild(divElement); + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + tutorialRootRef.current = createRoot(divElement); + } + + if (tutorialRootRef.current) { + tutorialRootRef.current.render(renderShowAtStartup()); + } + }, [hideStartUp]); + + return ; +}; diff --git a/frontend/src/layouts/AppLayout/hooks.ts b/frontend/src/layouts/AppLayout/hooks.ts new file mode 100644 index 0000000000..f46366fcd6 --- /dev/null +++ 
b/frontend/src/layouts/AppLayout/hooks.ts @@ -0,0 +1,101 @@ +import { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useLocation, useMatch } from 'react-router-dom'; + +import { SideNavigationProps } from 'components'; + +import { useAppSelector, usePermissionGuard } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetServerInfoQuery } from 'services/server'; +import { GlobalUserRole } from 'types'; + +import { selectUserName } from 'App/slice'; + +export const useSideNavigation = () => { /* Builds the side-navigation items and resolves which link is active for the current path; returns { navLinks, activeHref, billingUrl }. */ + const { t } = useTranslation(); + const userName = useAppSelector(selectUserName) ?? ''; + const { pathname } = useLocation(); + const [isGlobalAdmin] = usePermissionGuard({ allowedGlobalRoles: [GlobalUserRole.ADMIN] }); + const { data: serverInfoData } = useGetServerInfoQuery(); + + const isPoolDetails = Boolean(useMatch(ROUTES.FLEETS.DETAILS.TEMPLATE)); + const billingUrl = ROUTES.USER.BILLING.LIST.FORMAT(userName); + const userProjectsUrl = ROUTES.USER.PROJECTS.FORMAT(userName); + + const generalLinks = [ + { type: 'link', text: t('navigation.runs'), href: ROUTES.RUNS.LIST }, + { type: 'link', text: t('navigation.offers'), href: ROUTES.OFFERS.LIST }, + { type: 'link', text: t('navigation.fleets'), href: ROUTES.FLEETS.LIST }, + { type: 'link', text: t('navigation.instances'), href: ROUTES.INSTANCES.LIST }, + { type: 'link', text: t('navigation.volumes'), href: ROUTES.VOLUMES.LIST }, + { type: 'link', text: t('navigation.events'), href: ROUTES.EVENTS.LIST }, + { type: 'link', text: t('navigation.models'), href: ROUTES.MODELS.LIST }, + { type: 'link', text: t('navigation.project_other'), href: ROUTES.PROJECT.LIST }, + + isGlobalAdmin && { /* users link is only included for global admins; the false entry is removed by filter(Boolean) */ + type: 'link', + text: t('navigation.users'), + href: 
ROUTES.USER.LIST, + }, + ].filter(Boolean); + + const userSettingsLinks = [ + { + type: 'link', + text: t('navigation.settings'), + href: ROUTES.USER.DETAILS.FORMAT(userName), + }, + { + type: 'link', + text: t('users.projects'), + href: 
userProjectsUrl, + }, + process.env.UI_VERSION === 'sky' && { /* billing link is only included when UI_VERSION is 'sky' */ + type: 'link', + text: t('navigation.billing'), + href: billingUrl, + }, + ].filter(Boolean); + + const navLinks: SideNavigationProps['items'] = [ + { + type: 'section-group', + title: t('navigation.general'), + items: generalLinks, + }, + + { type: 'divider' }, + + { + type: 'section-group', + title: t('navigation.account'), + items: userSettingsLinks, + }, + + { type: 'divider' }, + + { /* version entry rendered as a link to '#version' */ + type: 'link', + href: '#version', + text: `dstack version: ${serverInfoData?.server_version ?? 'No version'}`, + }, + ].filter(Boolean) as SideNavigationProps['items']; + + const activeHref = useMemo(() => { /* fleet details routes highlight the fleets list; a matching general or settings link keeps the full pathname; otherwise fall back to the first path segment */ + if (isPoolDetails) { + return ROUTES.FLEETS.LIST; + } + + const generalActiveLink = generalLinks.find((linkItem) => pathname.indexOf(linkItem.href) === 0); + + if (generalActiveLink) return pathname; + + const settingsActiveLink = userSettingsLinks.find((linkItem) => linkItem.href === pathname); + + if (settingsActiveLink) return pathname; + + return '/' + pathname.split('/')[1]; + }, [pathname, userName]); + + return { navLinks, activeHref, billingUrl } as const; +}; diff --git a/frontend/src/layouts/AppLayout/index.module.scss b/frontend/src/layouts/AppLayout/index.module.scss new file mode 100644 index 0000000000..dcf3c8858f --- /dev/null +++ b/frontend/src/layouts/AppLayout/index.module.scss @@ -0,0 +1,103 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +:global { + .b-page-header { + .awsui-context-top-navigation { + border-bottom: 1px solid awsui.$color-border-divider-default; + + [class*="awsui_utility-type-button"] { + [class*="awsui_link"][href="theme-button"] { + [class^="awsui_icon"] { + width: 48px !important; + } + } + } + } 
+ } +} + +[class^='awsui_navigation'] { + [class^='awsui_list-container'] { + a[href='#version'] { /* the '#version' href is the one used by the version entry in useSideNavigation; it is made non-interactive here */ + pointer-events: none; + color: awsui.$color-text-status-inactive !important; + } + } +} + +.appHeader { + height: 48px; + @media (screen and min-width: 913px) { /* NOTE(review): the media type sits inside the parentheses; the conventional form is "screen and (min-width: 913px)" - confirm the compiled CSS actually applies this rule */ + 
height: 56px; + } + .userAvatar { + width: 40px; + height: 40px; + border-radius: 50%; + overflow: hidden; + } + [data-class="user-menu"] { + [class^="awsui_dropdown-trigger"] { + [class^="awsui_text"] { + margin-right: 0!important; + } + [class*="awsui_icon"] { + display: none !important; + } + } + } + & > * { + position: fixed; + top: 0; + left: 0; + right: 0; + } +} + +.themeIcon { + display: flex; + align-items: center; + width: 48px; + gap: 6px; + + .switcher { /* pill-shaped track; the ::before pseudo-element is the knob, shifted right when .on is set */ + position: relative; + flex-shrink: 0; + width: 24px; + height: 16px; + border-radius: 8px; + background-color: awsui.$color-background-layout-toggle-default; + transition: background-color .2s ease; + + &::before { + content: ""; + position: absolute; + top: 2px; + left: 0; + transform: translateX(2px); + width: 12px; + height: 12px; + border-radius: 50%; + background-color: awsui.$color-foreground-control-default; + transition: transform .2s ease; + } + + &.on { + background-color: awsui.$color-background-control-checked; + &::before { + transform: translateX(10px); + } + } + } + + .icon { + flex-shrink: 0; + } +} + +.dstackVersion { + z-index: 9999; + position: absolute; + bottom: 6px; + left: 28px; +} diff --git a/frontend/src/layouts/AppLayout/index.tsx b/frontend/src/layouts/AppLayout/index.tsx new file mode 100644 index 0000000000..3dd4db470b --- /dev/null +++ b/frontend/src/layouts/AppLayout/index.tsx @@ -0,0 +1,268 @@ +import React from 'react'; +import Avatar from 'react-avatar'; +import { createPortal } from 'react-dom'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; +import { SideNavigationProps } from '@cloudscape-design/components/side-navigation'; +import { Mode } from '@cloudscape-design/global-styles'; + +import { + AppLayout as GenericAppLayout, + AppLayoutProps as GenericAppLayoutProps, + BreadcrumbGroup, + ConfirmationDialog, + HelpPanel, + 
Notifications, + SideNavigation, + Tabs, + TopNavigation, +} from 'components'; + +import { 
DISCORD_URL, DOCS_URL } from 'consts'; +import { useAppDispatch, useAppSelector } from 'hooks'; +import { goToUrl } from 'libs'; +import { ROUTES } from 'routes'; +import { useGetServerInfoQuery } from 'services/server'; + +import { + closeToolsPanel, + openTutorialPanel, + selectBreadcrumbs, + selectHelpPanelContent, + selectSystemMode, + selectToolsPanelState, + selectUserName, + setSystemMode, + setToolsTab, +} from 'App/slice'; + +import { selectConfirmationDialogs } from '../../components/ConfirmationDialog/slice'; +import { AnnotationContext } from './AnnotationContext'; +import { useSideNavigation } from './hooks'; +import { TallyComponent } from './Tally'; +import { DarkThemeIcon, LightThemeIcon } from './themeIcons'; +import { TutorialPanel } from './TutorialPanel'; + +import { ToolsTabs } from 'App/types'; + +import logo from 'assets/images/logo.svg'; +import styles from './index.module.scss'; + +type PortalProps = { children: React.ReactNode }; + +const HeaderPortal = ({ children }: PortalProps) => { + const domNode = document.querySelector('#header'); + if (domNode) return createPortal(children, domNode); + return null; +}; + +const THEME_ICON_MAP: Record = { + [Mode.Dark]: DarkThemeIcon, + [Mode.Light]: LightThemeIcon, +}; + +const AppLayout: React.FC<{ children: React.ReactNode }> = ({ children }) => { + const { t } = useTranslation(); + const navigate = useNavigate(); + useGetServerInfoQuery(); + + const userName = useAppSelector(selectUserName) ?? ''; + const systemMode = useAppSelector(selectSystemMode) ?? 
''; + const breadcrumbs = useAppSelector(selectBreadcrumbs); + const { isOpen: toolsIsOpen, tab: toolsActiveTab } = useAppSelector(selectToolsPanelState); + const helpPanelContent = useAppSelector(selectHelpPanelContent); + const dispatch = useAppDispatch(); + const { navLinks, activeHref } = useSideNavigation(); + const confirmationDialogs = useAppSelector(selectConfirmationDialogs); + + const onFollowHandler: SideNavigationProps['onFollow'] = (event) => { + event.preventDefault(); + + if (event.detail.external) { + goToUrl(event.detail.href, true); + } else { + navigate(event.detail.href); + } + }; + + const renderBreadcrumbs = () => { + if (breadcrumbs) return ; + }; + + const i18nStrings = { + overflowMenuTriggerText: '', + overflowMenuTitleText: '', + overflowMenuBackIconAriaLabel: '', + overflowMenuDismissIconAriaLabel: '', + }; + + const profileActions = [ + { type: 'button', href: ROUTES.USER.DETAILS.FORMAT(userName), id: 'settings', text: t('common.settings') }, + { type: 'button', href: ROUTES.LOGOUT, id: 'signout', text: t('common.sign_out') }, + ]; + + const onChangeToolHandler: GenericAppLayoutProps['onToolsChange'] = ({ detail: { open } }) => { + if (!open) dispatch(closeToolsPanel()); + }; + + const onChangeToolsTab = (tabName: ToolsTabs) => { + dispatch(setToolsTab(tabName)); + }; + + const toggleTutorialPanel = () => { + if (process.env.UI_VERSION !== 'sky') { + return; + } + + if (toolsIsOpen) { + dispatch(closeToolsPanel()); + return; + } + + dispatch(openTutorialPanel()); + }; + + const isVisibleInfoTab = helpPanelContent.header || helpPanelContent.footer || helpPanelContent.body; + + const avatarProps = process.env.UI_VERSION === 'enterprise' ? 
{ name: userName } : { githubHandle: userName }; + + const onChangeSystemModeToggle: SideNavigationProps['onFollow'] = (event) => { + event.preventDefault(); + + switch (systemMode) { + case Mode.Light: + dispatch(setSystemMode(Mode.Dark)); + return; + default: + dispatch(setSystemMode(Mode.Light)); + } + }; + + const ThemeIcon = THEME_ICON_MAP[systemMode]; + + const askAi = () => { + window.document.body.focus(); + window?.Kapa?.open(); + }; + + return ( + + +
+ goToUrl(DOCS_URL, true), + }, + { + type: 'button', + text: t('common.discord'), + external: true, + onClick: () => goToUrl(DISCORD_URL, true), + }, + { + href: 'theme-button', + type: 'button', + iconSvg: , + onClick: onChangeSystemModeToggle, + }, + process.env.UI_VERSION === 'sky' && { + type: 'button', + iconName: 'gen-ai', + text: t('common.ask_ai'), + title: t('common.ask_ai'), + onClick: askAi, + }, + process.env.UI_VERSION === 'sky' && { + type: 'button', + iconName: 'support', + title: t('common.tutorial_other'), + onClick: toggleTutorialPanel, + }, + { + 'data-class': 'user-menu', + type: 'menu-dropdown', + text: ( +
+ +
+ ), + items: profileActions, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + onItemFollow: onFollowHandler, + }, + ].filter(Boolean)} + /> +
+
+ + } + navigation={ + + } + tools={ + <> + onChangeToolsTab(event.detail.activeTabId as ToolsTabs)} + tabs={[ + isVisibleInfoTab && { + id: ToolsTabs.INFO, + label: t('common.info'), + content: ( + + {helpPanelContent.body} + + ), + }, + process.env.UI_VERSION === 'sky' && { + id: ToolsTabs.TUTORIAL, + label: t('common.tutorial_other'), + content: ( + + window.prompt('Please enter your feedback here (this will not be saved):') + } + /> + ), + }, + ].filter(Boolean)} + /> + + } + toolsHide={!toolsIsOpen} + toolsOpen={toolsIsOpen} + toolsWidth={330} + onToolsChange={onChangeToolHandler} + /> + + + + {confirmationDialogs.map(({ uuid, ...props }) => ( + + ))} +
+ ); +}; + +export default AppLayout; diff --git a/frontend/src/layouts/AppLayout/themeIcons.tsx b/frontend/src/layouts/AppLayout/themeIcons.tsx new file mode 100644 index 0000000000..214ef9b27c --- /dev/null +++ b/frontend/src/layouts/AppLayout/themeIcons.tsx @@ -0,0 +1,23 @@ +import React from 'react'; +import cn from 'classnames'; + +import { ReactComponent as ThemeIcon } from 'assets/icons/theme.svg'; +import styles from './index.module.scss'; + +export const DarkThemeIcon: React.FC = () => { + return ( +
+
+ +
+ ); +}; + +export const LightThemeIcon: React.FC = () => { + return ( +
+
+ +
+ ); +}; diff --git a/frontend/src/layouts/UnauthorizedLayout/index.tsx b/frontend/src/layouts/UnauthorizedLayout/index.tsx new file mode 100644 index 0000000000..7286699082 --- /dev/null +++ b/frontend/src/layouts/UnauthorizedLayout/index.tsx @@ -0,0 +1,11 @@ +import React from 'react'; + +import styles from './styles.module.scss'; + +export interface UnauthorizedLayoutProps { + children?: React.ReactNode; +} + +export const UnauthorizedLayout: React.FC = ({ children }) => { + return
{children}
; +}; diff --git a/frontend/src/layouts/UnauthorizedLayout/styles.module.scss b/frontend/src/layouts/UnauthorizedLayout/styles.module.scss new file mode 100644 index 0000000000..78d14c2f10 --- /dev/null +++ b/frontend/src/layouts/UnauthorizedLayout/styles.module.scss @@ -0,0 +1,9 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.layout { + background: awsui.$color-background-layout-main; + padding-top: 40px; + padding-bottom: 40px; + min-height: 100vh; + box-sizing: border-box; +} diff --git a/frontend/src/libs/fetchBaseQueryHeaders.ts b/frontend/src/libs/fetchBaseQueryHeaders.ts new file mode 100644 index 0000000000..d517358204 --- /dev/null +++ b/frontend/src/libs/fetchBaseQueryHeaders.ts @@ -0,0 +1,18 @@ +import type { BaseQueryApi } from '@reduxjs/toolkit/query'; + +import { RootState } from '../store'; + +function baseQueryHeaders(headers: Headers, { getState }: Pick): Headers { + const token = (getState() as RootState).app.authData?.token; + const authorizationHeader = headers.get('Authorization'); + + if (token && !authorizationHeader) { + headers.set('Authorization', `Bearer ${token}`); + } + + headers.set('X-API-VERSION', 'latest'); + + return headers; +} + +export default baseQueryHeaders; diff --git a/frontend/src/libs/filters.test.ts b/frontend/src/libs/filters.test.ts new file mode 100644 index 0000000000..1efeee9890 --- /dev/null +++ b/frontend/src/libs/filters.test.ts @@ -0,0 +1,36 @@ +import { getTokenAwareNamePatternFilterRequestParams } from './filters'; + +describe('filters helpers', () => { + test('loads the full list when reopening an existing token value', () => { + expect( + getTokenAwareNamePatternFilterRequestParams({ + filteringText: 'main', + limit: 100, + propertyKey: 'project_name', + tokens: [{ propertyKey: 'project_name', operator: '=', value: 'main' }], + }), + ).toEqual({ limit: 100 }); + }); + + test('keeps the typed text when the value is being edited', () => { + expect( + 
getTokenAwareNamePatternFilterRequestParams({ + filteringText: 'mai', + limit: 100, + propertyKey: 'project_name', + tokens: [{ propertyKey: 'project_name', operator: '=', value: 'main' }], + }), + ).toEqual({ limit: 100, name_pattern: 'mai' }); + }); + + test('ignores matching values from other properties', () => { + expect( + getTokenAwareNamePatternFilterRequestParams({ + filteringText: 'main', + limit: 100, + propertyKey: 'project_name', + tokens: [{ propertyKey: 'username', operator: '=', value: 'main' }], + }), + ).toEqual({ limit: 100, name_pattern: 'main' }); + }); +}); diff --git a/frontend/src/libs/filters.ts b/frontend/src/libs/filters.ts new file mode 100644 index 0000000000..13b3605372 --- /dev/null +++ b/frontend/src/libs/filters.ts @@ -0,0 +1,169 @@ +import type { PropertyFilterProps } from 'components'; + +export const tokensToSearchParams = ( + tokens: PropertyFilterProps.Query['tokens'], + onlyActive?: boolean, +) => { + const params = new URLSearchParams(); + + tokens.forEach((token) => { + if (token.propertyKey) { + params.append(token.propertyKey as RequestParamsKeys, token.value); + } + }); + + if (onlyActive) { + params.append('only_active', 'true'); + } + + return params; +}; + +export type RequestParam = string | { min: number } | { max: number }; + +export const getNamePatternFilterRequestParams = (filteringText: string, limit: number) => { + return { + ...(filteringText ? { name_pattern: filteringText } : {}), + limit, + }; +}; + +export const getTokenAwareNamePatternFilterRequestParams = ({ + filteringText, + limit, + propertyKey, + tokens, +}: { + filteringText: string; + limit: number; + propertyKey: PropertyKey; + tokens: PropertyFilterProps.Query['tokens']; +}) => { + const matchingExistingToken = tokens.some((token) => { + return token.propertyKey === propertyKey && typeof token.value === 'string' && token.value === filteringText; + }); + + return getNamePatternFilterRequestParams(matchingExistingToken ? 
'' : filteringText, limit); +}; + +const convertTokenValueToRequestParam = (token: PropertyFilterProps.Query['tokens'][number]): RequestParam => { + const { value, operator } = token; + + if (operator === '>=') { + return { min: Number(value) }; + } + + if (operator === '<=') { + return { max: Number(value) }; + } + + return value; +}; + +export const tokensToRequestParams = ({ + tokens, + arrayFieldKeys, +}: { + tokens: PropertyFilterProps.Query['tokens']; + arrayFieldKeys?: RequestParamsKeys[]; +}) => { + return tokens.reduce>( + (acc, token) => { + const propertyKey = token.propertyKey as RequestParamsKeys; + + if (!propertyKey) { + return acc; + } + + const convertedValue = convertTokenValueToRequestParam(token); + + if (arrayFieldKeys?.includes(propertyKey)) { + if (Array.isArray(acc[propertyKey])) { + acc[propertyKey].push(convertedValue as string); + } else { + acc[propertyKey] = [convertedValue as string]; + } + + return acc; + } + + acc[propertyKey] = convertedValue; + + return acc; + }, + {} as Record, + ); +}; + +export const EMPTY_QUERY: PropertyFilterProps.Query = { + tokens: [], + operation: 'and', +}; + +export const requestParamsToTokens = ({ + searchParams, + filterKeys, + defaultFilterValues, +}: { + searchParams: URLSearchParams; + filterKeys: Record; + defaultFilterValues?: Partial>; +}): PropertyFilterProps.Query => { + const tokens = []; + const filterKeysValues = Object.values(filterKeys); + + if (defaultFilterValues) { + Object.keys(defaultFilterValues).forEach((defaultFilterKey) => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + const defaultFilterValue: string[] = Array.isArray(defaultFilterValues[defaultFilterKey]) + ? 
// eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + defaultFilterValues[defaultFilterKey] + : // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + [defaultFilterValues[defaultFilterKey]]; + + defaultFilterValue.forEach((value) => { + tokens.push({ propertyKey: defaultFilterKey, operator: '=', value: value }); + }); + }); + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + for (const [paramKey, paramValue] of searchParams.entries()) { + if (filterKeysValues.includes(paramKey)) { + tokens.push({ propertyKey: paramKey, operator: '=', value: paramValue }); + } + } + + if (!tokens.length) { + return EMPTY_QUERY; + } + + return { + ...EMPTY_QUERY, + tokens, + }; +}; + +export const requestParamsToArray = ({ + searchParams, + paramName, +}: { + searchParams: URLSearchParams; + paramName: Key; +}) => { + const paramValues: string[] = []; + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + + for (const [paramKey, paramValue] of searchParams.entries()) { + if (paramKey === paramName) { + paramValues.push(paramValue); + } + } + + return paramValues; +}; diff --git a/frontend/src/libs/fleet.ts b/frontend/src/libs/fleet.ts new file mode 100644 index 0000000000..3a56e23b96 --- /dev/null +++ b/frontend/src/libs/fleet.ts @@ -0,0 +1,176 @@ +import { isEqual } from 'lodash'; +import { StatusIndicatorProps } from '@cloudscape-design/components'; + +export const formatBackend = (backend: TBackendType | string | null | undefined): string => { + if (!backend) return '-'; + if (backend === 'remote') return 'ssh'; + return backend; +}; + +export const getStatusIconType = (status: IInstance['status']): StatusIndicatorProps['type'] => { + switch (status) { + case 'pending': + case 'creating': + return 'pending'; + case 'terminated': + return 'stopped'; + case 'terminating': + case 'provisioning': + case 'starting': + case 'busy': + return 'in-progress'; + case 
'idle': + return 'success'; + default: + console.error(new Error('Undefined fleet status')); + } +}; + +export const getStatusIconColor = (status: IInstance['status']): StatusIndicatorProps.Color | undefined => { + switch (status) { + case 'busy': + case 'provisioning': + case 'starting': + case 'terminating': + return 'blue'; + default: + return undefined; + } +}; + +export const getFleetStatusIconType = (status: IFleet['status']): StatusIndicatorProps['type'] => { + switch (status) { + case 'submitted': + return 'pending'; + case 'failed': + case 'terminated': + return 'stopped'; + case 'terminating': + return 'in-progress'; + case 'active': + return 'success'; + default: + console.error(new Error('Undefined fleet status')); + } +}; + +export const getFleetPrice = (fleet: IFleet): number | null => { + return fleet.instances.reduce((acc, instance) => { + if (typeof instance.price === 'number' && instance.status !== 'terminated') { + if (acc === null) return instance.price; + + acc += instance.price; + } + + return acc; + }, null); +}; + +const getInstanceFields = (instance: IInstance) => ({ + backend: instance.backend, + region: instance.region, + type: instance.instance_type?.name, + spot: instance.instance_type?.resources.spot, +}); + +const formatRange = (min: unknown, max: unknown, suffix = ''): string => { + if (min == null && max == null) return ''; + if (min === max) return `${min}${suffix}`; + if (max == null) return `${min}${suffix}..`; + if (min == null) return `..${max}${suffix}`; + return `${min}${suffix}..${max}${suffix}`; +}; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +const formatCpu = (cpu: any): string | null => { + if (!cpu) return null; + if (typeof cpu === 'number') return `cpu=${cpu}`; + if (cpu.min != null || cpu.max != null) return `cpu=${formatRange(cpu.min, cpu.max)}`; + const arch = cpu.arch; + const count = cpu.count; + if (!count) return null; + const prefix = arch === 'arm' ? 
'arm:' : ''; + return `cpu=${prefix}${formatRange(count.min, count.max)}`; +}; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +const formatGpu = (gpu: any): string | null => { + if (!gpu) return null; + const count = gpu.count; + if (!count || (count.min === 0 && (count.max == null || count.max === 0))) return null; + + const gpuParts: string[] = []; + + if (gpu.memory) { + const memStr = formatRange(gpu.memory.min, gpu.memory.max, 'GB'); + if (memStr) gpuParts.push(memStr); + } + + const countStr = formatRange(count.min, count.max); + if (countStr) gpuParts.push(countStr); + + if (gpu.total_memory) { + const tmStr = formatRange(gpu.total_memory.min, gpu.total_memory.max, 'GB'); + if (tmStr) gpuParts.push(tmStr); + } + + let label: string; + if (gpu.name && gpu.name.length > 0) { + label = gpu.name.join(','); + } else if (gpu.vendor) { + label = gpu.vendor; + } else { + label = ''; + } + + return 'gpu=' + [label, ...gpuParts].filter(Boolean).join(':'); +}; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export const formatFleetResources = (resources: any): string => { + if (!resources) return '-'; + + const parts: string[] = []; + + const cpuStr = formatCpu(resources.cpu); + if (cpuStr) parts.push(cpuStr); + + if (resources.memory) { + const memStr = formatRange(resources.memory.min, resources.memory.max, 'GB'); + if (memStr) parts.push(`mem=${memStr}`); + } + + if (resources.disk?.size) { + const diskStr = formatRange(resources.disk.size.min, resources.disk.size.max, 'GB'); + if (diskStr) parts.push(`disk=${diskStr}`); + } + + const gpuStr = formatGpu(resources.gpu); + if (gpuStr) parts.push(gpuStr); + + return parts.length > 0 ? 
parts.join(' ') : '-'; +}; + +export const formatFleetBackend = (config: IFleetConfigurationRequest): string => { + if (config.ssh_config) return 'ssh'; + if (!config.backends || config.backends.length === 0) return '-'; + return config.backends.map((b) => formatBackend(b)).join(', '); +}; + +export const getFleetInstancesLinkText = (fleet: IFleet): string => { + const instances = fleet.instances.filter((i) => i.status !== 'terminated'); + const hasPending = instances.some((i) => i.status === 'pending'); + + if (!instances.length) return '0 instances'; + + if (hasPending) return `${instances.length} instances`; + + const isSameInstances = instances.every((i) => isEqual(getInstanceFields(instances[0]), getInstanceFields(i))); + + if (isSameInstances) + return `${instances.length}x ${instances[0].instance_type?.name}${ + instances[0].instance_type?.resources.spot ? ' (spot)' : '' + } @ ${formatBackend(instances[0].backend)} (${instances[0].region})`; + + return `${instances.length} instances`; +}; diff --git a/frontend/src/libs/form.ts b/frontend/src/libs/form.ts new file mode 100644 index 0000000000..86b34a1c5c --- /dev/null +++ b/frontend/src/libs/form.ts @@ -0,0 +1,8 @@ +import { FormFieldError } from './types'; + +export const getFieldErrorFromServerResponse = (error: FormFieldError): { fieldNamePath: string; message: string } => { + const fieldNamePath = error.loc.filter((key) => key !== 'body').join('.'); + const message = error.msg; + + return { fieldNamePath, message }; +}; diff --git a/frontend/src/libs/index.test.ts b/frontend/src/libs/index.test.ts new file mode 100644 index 0000000000..9eaf7b3b56 --- /dev/null +++ b/frontend/src/libs/index.test.ts @@ -0,0 +1,53 @@ +import { arrayToRecordByKeyName, getDateAgoSting, getUid, isErrorWithMessage, MINUTE } from './index'; + +describe('test libs', () => { + test('Check is error with message', () => { + expect(isErrorWithMessage({})).toBeFalsy(); + expect(isErrorWithMessage(null)).toBeFalsy(); + 
expect(isErrorWithMessage({ data: { test: 'test' } })).toBeFalsy(); + expect(isErrorWithMessage({ data: { message: 'error message' } })).toBeTruthy(); + }); + + test('array to record by name', () => { + const mockData = [ + { name: 'test', lastname: 'test_lastname' }, + { name: 'test2', lastname: 'test_lastname2' }, + ]; + + expect(arrayToRecordByKeyName(mockData, 'name')).toEqual({ + test: mockData[0], + test2: mockData[1], + }); + + expect(arrayToRecordByKeyName(mockData, 'lastname')).toEqual({ + test_lastname: mockData[0], + test_lastname2: mockData[1], + }); + }); + + test('getDateAgoSting', () => { + const date = new Date(); + const timestamp = date.getTime(); + date.setDate(date.getDate() - 1); + const day: string = date.getDate() < 10 ? `0${date.getDate()}` : `${date.getDate()}`; + const month: string = date.getMonth() < 9 ? `0${date.getMonth() + 1}` : `${date.getMonth() + 1}`; + const year: string = date.getFullYear().toString(); + + expect(getDateAgoSting(timestamp)).toEqual('Just now'); + expect(getDateAgoSting(timestamp - MINUTE + 100)).toEqual('Just now'); + expect(getDateAgoSting(timestamp - MINUTE)).toEqual('1 minute ago'); + expect(getDateAgoSting(timestamp - MINUTE * 2)).toEqual('2 minutes ago'); + expect(getDateAgoSting(timestamp - MINUTE * 60)).toEqual('1 hour ago'); + expect(getDateAgoSting(timestamp - MINUTE * 60 * 2)).toEqual('2 hours ago'); + expect(getDateAgoSting(timestamp - MINUTE * 60 * 24)).toEqual(`${day}/${month}/${year}`); + }); + + test('get unique id', () => { + const set = new Set(); + const iterationCount = 20; + + for (let i = 0; i < iterationCount; i++) set.add(getUid()); + + expect(set.size).toBe(iterationCount); + }); +}); diff --git a/frontend/src/libs/index.ts b/frontend/src/libs/index.ts new file mode 100644 index 0000000000..4d5123787c --- /dev/null +++ b/frontend/src/libs/index.ts @@ -0,0 +1,117 @@ +export { + default as isErrorWithMessage, + isResponseServerFormFieldError, + isResponseServerError, + getServerError, +} 
/**
 * Writes text to the system clipboard.
 *
 * @param copyText text to place on the clipboard
 * @param success  called once the text has been written
 * @param failed   called when the write is rejected or when the Clipboard API
 *                 is not available
 */
export const copyToClipboard = (copyText: string, success?: () => void, failed?: () => void) => {
    // The Clipboard API is only exposed in secure contexts, so `navigator.clipboard`
    // can be undefined (e.g. a page served over plain HTTP from a non-localhost host).
    // Report that through `failed` instead of throwing a TypeError at the call site.
    const clipboard = typeof navigator !== 'undefined' ? navigator.clipboard : undefined;

    if (!clipboard || typeof clipboard.writeText !== 'function') {
        failed?.();
        return;
    }

    // Called as a method on `clipboard`: a detached reference would lose its receiver.
    clipboard.writeText(copyText).then(success, failed);
};
/**
 * Formats a byte count using binary (1024-based) units, e.g. `1536` -> `"1.5KB"`.
 *
 * @param bytes    number of bytes; negative values keep their sign
 * @param decimals maximum number of fraction digits (values <= 0 mean none)
 * @returns the formatted size, `"0Bytes"` for zero, or `"-"` for NaN/Infinity
 */
export const formatBytes = (bytes: number, decimals = 2): string => {
    if (bytes === 0) return '0Bytes';

    // NaN and +/-Infinity have no meaningful unit.
    if (!Number.isFinite(bytes)) return '-';

    const k = 1024;

    const dm = decimals <= 0 ? 0 : decimals;

    const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'];

    // The unit is chosen from the magnitude so negative values do not produce NaN.
    // The index is clamped so values below 1 byte or beyond the largest unit still
    // map to an existing entry of `sizes`.
    const rawIndex = Math.floor(Math.log(Math.abs(bytes)) / Math.log(k));
    const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);

    // parseFloat drops trailing zeros left by toFixed ("1.50" -> 1.5).
    return parseFloat((bytes / Math.pow(k, i)).toFixed(dm)) + sizes[i];
};

/**
 * Formats an amount given in cents as a decimal string with two fraction digits.
 *
 * @param cents    amount in cents; negative amounts are prefixed with `-`
 * @param currency optional currency symbol placed before the digits; defaults to
 *                 an empty string so an omitted symbol is not rendered as "undefined"
 * @returns e.g. `"$12.34"`, `"-$0.50"`, or `"1.00"` when no symbol is given
 */
export const centsToFormattedString = (cents: number, currency = ''): string => {
    const floatValue = cents / 100;

    return `${floatValue < 0 ? '-' : ''}${currency}${Math.abs(floatValue).toFixed(2)}`;
};
const UPPERCASE_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
const LOWERCASE_LETTERS = 'abcdefghijklmnopqrstuvwxyz';
const NUMBERS = '0123456789';
const SPECIAL_CHARACTERS = '@#$^_+-';

interface PasswordOptions {
    length: number;
    includeUppercase?: boolean;
    includeLowercase?: boolean;
    includeNumbers?: boolean;
    includeSpecial?: boolean;
}

/**
 * Returns a uniformly distributed integer in `[0, bound)` drawn from the
 * platform CSPRNG.
 *
 * Values at or above the largest multiple of `bound` that fits in 32 bits are
 * rejected and redrawn, so the final modulo introduces no bias.
 */
function secureRandomIndex(bound: number): number {
    const range = 0x100000000;
    const limit = range - (range % bound);
    const buffer = new Uint32Array(1);

    do {
        crypto.getRandomValues(buffer);
    } while (buffer[0] >= limit);

    return buffer[0] % bound;
}

/** Picks one character of `alphabet` using the CSPRNG. */
function secureRandomChar(alphabet: string): string {
    return alphabet[secureRandomIndex(alphabet.length)];
}

/**
 * Generates a random password from the character classes enabled in `options`
 * (all classes are enabled by default).
 *
 * @throws Error when every character class is disabled
 * @throws Error when `options.length` is below 4
 */
function generatePassword(options: PasswordOptions): string {
    const { length, includeUppercase = true, includeLowercase = true, includeNumbers = true, includeSpecial = true } = options;

    let allowedChars = '';

    if (includeUppercase) allowedChars += UPPERCASE_LETTERS;
    if (includeLowercase) allowedChars += LOWERCASE_LETTERS;
    if (includeNumbers) allowedChars += NUMBERS;
    if (includeSpecial) allowedChars += SPECIAL_CHARACTERS;

    if (allowedChars.length === 0) {
        throw new Error('No character type is selected for the password');
    }

    if (length < 4) {
        throw new Error('The password must be at least 4 characters long');
    }

    let password = '';

    for (let i = 0; i < length; i++) {
        password += secureRandomChar(allowedChars);
    }

    return password;
}

/**
 * Generates a random password of `length` characters drawn from all character
 * classes, without guaranteeing that each class is present.
 *
 * @throws Error when `length` is below 1
 */
function generateSimplePassword(length: number): string {
    const ALL_CHARS = UPPERCASE_LETTERS + LOWERCASE_LETTERS + NUMBERS + SPECIAL_CHARACTERS;

    if (length < 1) {
        throw new Error('The password length must be a positive number');
    }

    let password = '';

    for (let i = 0; i < length; i++) {
        password += secureRandomChar(ALL_CHARS);
    }

    return password;
}

/**
 * Generates a password that contains at least one uppercase letter, one
 * lowercase letter, one digit and one special character.
 *
 * Every random choice, including the final shuffle, comes from the CSPRNG:
 * `Math.random()` is not suitable for secrets, and sorting with a random
 * comparator does not produce a uniform permutation.
 *
 * @throws Error when `length` is below 4
 */
function generateSecurePassword(length: number): string {
    if (length < 4) {
        throw new Error('The minimum length for a secure password is 4 characters');
    }

    const charSets = [UPPERCASE_LETTERS, LOWERCASE_LETTERS, NUMBERS, SPECIAL_CHARACTERS];
    const ALL_CHARS = charSets.join('');

    // One guaranteed character per class, then fill up from the full alphabet.
    const chars = charSets.map((set) => secureRandomChar(set));

    while (chars.length < length) {
        chars.push(secureRandomChar(ALL_CHARS));
    }

    // Fisher-Yates shuffle so the guaranteed characters do not stay in the first four positions.
    for (let i = chars.length - 1; i > 0; i--) {
        const j = secureRandomIndex(i + 1);
        const swapped = chars[i];
        chars[i] = chars[j];
        chars[j] = swapped;
    }

    return chars.join('');
}

export {
    generatePassword,
    generateSimplePassword,
    generateSecurePassword,
    UPPERCASE_LETTERS,
    LOWERCASE_LETTERS,
    NUMBERS,
    SPECIAL_CHARACTERS,
};

export type { PasswordOptions };
/**
 * Returns the repository URL with a trailing `:/<dir>` suffix removed.
 *
 * The suffix is located the same way `getRepoDirFromUrl` locates it: the first
 * `:/` that follows the optional `scheme://` prefix. A colon that is not
 * followed by `/` (a port such as `host:8080`, or the `host:org` part of an
 * scp-style address) is therefore kept as part of the URL.
 *
 * @param url repository URL, optionally followed by `:/<dir>`
 * @returns the URL without the directory suffix, or `url` unchanged when it has none
 */
export function getRepoUrlWithOutDir(url: string): string {
    // Skip the scheme so its own "://" is not mistaken for the directory separator.
    const scheme = url.match(/^[a-z][a-z0-9+.-]*:\/\//i)?.[0] ?? '';
    const rest = url.slice(scheme.length);
    const dirSeparatorIndex = rest.indexOf(':/');

    if (dirSeparatorIndex === -1) return url;

    return scheme + rest.slice(0, dirSeparatorIndex);
}
/**
 * Maps a run or job status to the type of status indicator to render.
 *
 * A status listed in `finishedRunStatuses` whose termination reason is
 * `interrupted_by_no_capacity` is always shown as `stopped`. Otherwise the
 * type depends on the status alone. A status that is not handled is logged
 * with `console.error` and yields no type.
 *
 * @param status            run or job status
 * @param terminationReason termination reason reported for the run/job, if any
 */
export const getStatusIconType = (
    status: IRun['status'] | TJobStatus,
    terminationReason: string | null | undefined,
): StatusIndicatorProps['type'] => {
    const isFinished = finishedRunStatuses.includes(status);

    if (isFinished && terminationReason === 'interrupted_by_no_capacity') {
        return 'stopped';
    }

    if (status === 'failed') return 'error';

    if (status === 'done' || status === 'running') return 'success';

    if (status === 'aborted' || status === 'terminated') return 'stopped';

    if (status === 'terminating' || status === 'pulling' || status === 'provisioning') return 'in-progress';

    if (status === 'submitted' || status === 'pending') return 'pending';

    console.error(new Error('Undefined run status'));
};
/**
 * Type guard for a server error payload of the form `{ detail: [...] }` whose
 * first `detail` entry is an object carrying a `msg` property.
 *
 * @param formErrors value to inspect (typically the `data` of a failed request)
 * @returns `true` when the value has that shape, `false` otherwise; never throws
 */
export function isResponseServerError(formErrors: unknown): formErrors is ResponseServerError {
    if (typeof formErrors !== 'object' || formErrors === null || !('detail' in formErrors)) {
        return false;
    }

    const detail: unknown = (formErrors as { detail: unknown }).detail;

    if (!Array.isArray(detail) || detail.length === 0) {
        return false;
    }

    const firstItem: unknown = detail[0];

    // The `in` operator throws a TypeError for primitives and null, so the first
    // entry has to be verified as an object before probing it for `msg`.
    return typeof firstItem === 'object' && firstItem !== null && 'msg' in firstItem;
}
/**
 * Maps a volume status to the type of status indicator to render.
 *
 * `submitted` is a regular status and is shown as `pending`. Only a status
 * that is not handled here is logged with `console.error` and yields no type.
 *
 * @param status volume status
 */
export const getStatusIconType = (status: IVolume['status']): StatusIndicatorProps['type'] => {
    switch (status) {
        case 'failed':
            return 'error';
        case 'active':
            return 'success';
        case 'provisioning':
            return 'in-progress';
        case 'submitted':
            // Previously fell through to `default`, which logged an error for a valid status.
            return 'pending';
        default:
            console.error(new Error('Undefined volume status'));
            return undefined;
    }
};
"stop": "Stop", + "abort": "Abort", + "close": "Close", + "clearFilter": "Clear filter", + "server_error": "Server error: {{error}}", + "login": "Sign in", + "login_github": "Sign in with GitHub", + "login_okta": "Sign in with Okta", + "login_entra": "Sign in with EntraID", + "login_google": "Sign in with Google", + "general": "General", + "test": "Test", + "local_storage_unavailable": "Local Storage is unavailable", + "local_storage_unavailable_message": "Your browser doesn't support local storage", + "object": "Object", + "objects_other": "Objects", + "continue": "Continue", + "select_visible_columns": "Select visible columns", + "tutorial": "Tutorials", + "tutorial_other": "Take a tour", + "docs": "Docs", + "discord": "Discord", + "danger_zone": "Danger zone", + "control_plane": "Control plane", + "refresh": "Refresh", + "quickstart": "Quickstart", + "ask_ai": "Ask AI", + "new": "New", + "full_view": "Full view" + }, + + "auth": { + "invalid_token": "Invalid token", + "you_are_not_logged_in": "You are not logged in", + "contact_to_administrator": "For getting the authorization token, contact to the administrator", + "sign_in_to_dstack": "Welcome to dstack Sky", + "sign_in_to_dstack_enterprise": "Welcome to dstack", + "authorization_failed": "Authorization is failed", + "try_again": "Please try again", + "login_by_token": "Sign in via a token", + "another_login_methods": "Other sign in options" + }, + + "navigation": { + "settings": "Settings", + "runs": "Runs", + "models": "Models", + "fleets": "Fleets", + "fleet": "Fleet", + "project": "project", + "project_other": "Projects", + "general": "General", + "users": "Users", + "user_settings": "User settings", + "account": "User", + "billing": "Billing", + "resources": "Resources", + "volumes": "Volumes", + "instances": "Instances", + "offers": "Offers", + "events": "Events" + }, + + "backend": { + "page_title_one": "Backend", + "page_title_other": "Backends", + "add_backend": "Add backend", + "edit_backend": "Edit 
backend", + "empty_message_title": "No backends", + "empty_message_text": "No backends to display.", + "type": { + "aws": "AWS", + "aws_description": "Run workflows and store data in Amazon Web Services ", + "gcp": "GCP", + "gcp_description": "Run workflows and store data in Google Cloud Platform", + "azure": "Azure", + "azure_description": "Run workflows and store data in Microsoft Azure", + "lambda": "Lambda", + "lambda_description": "Run workflows and store data in Lambda", + "local": "Local", + "local_description": "Run workflows and store data locally via Docker" + }, + + "table": { + "region": "Region", + "bucket": "Storage" + }, + + "edit": { + "success_notification": "Project updating is successful", + "delete_backend_confirm_title": "Delete backend", + "delete_backend_confirm_message": "Are you sure you want to delete this backend?", + "delete_backends_confirm_title": "Delete backends", + "delete_backends_confirm_message": "Are you sure you want to delete these backends?" + }, + + "create": { + "success_notification": "Backend is created" + } + }, + + "gateway": { + "page_title_one": "Gateway", + "page_title_other": "Gateways", + "add_gateway": "Add gateway", + "edit_gateway": "Edit gateway", + "empty_message_title": "No gateways", + "empty_message_text": "No gateways to display.", + + "edit": { + "name": "Name", + "backend": "Backend", + "backend_description": "Select a backend", + "region": "Region", + "region_description": "Select a region", + "default": "Default", + "default_checkbox": "Turn on default", + "hostname": "Hostname", + "external_ip": "External IP", + "wildcard_domain": "Wildcard domain", + "wildcard_domain_description": "Specify the wildcard domain mapped to the external IP.", + "wildcard_domain_placeholder": "*.mydomain.com", + "delete_gateway_confirm_title": "Delete gateway", + "delete_gateway_confirm_message": "Are you sure you want to delete this gateway?", + "delete_gateways_confirm_title": "Delete gateways", + 
"delete_gateways_confirm_message": "Are you sure you want to delete these gateways?", + + "validation": { + "wildcard_domain_format": "Should use next format: {{pattern}}" + } + }, + + "create": { + "success_notification": "Gateway is created", + "creating_notification": "The gateway is creating. It may take some time" + }, + + "update": { + "success_notification": "Gateway is updated" + }, + + "test_domain": { + "success_notification": "Domain is valid" + } + }, + + "projects": { + "page_title": "Projects", + "search_placeholder": "Find projects", + "empty_message_title": "No projects", + "empty_message_text": "No projects to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", + "repositories": "Repositories", + "runs": "Runs", + "tags": "Tags", + "events": "Events", + "settings": "Settings", + "join": "Join", + "leave_confirm_title": "Leave project", + "leave_confirm_message": "Are you sure you want to leave this project?", + "leave": "Leave", + "join_success": "Successfully joined the project", + "leave_success": "Successfully left the project", + "join_error": "Failed to join project", + "leave_error": "Failed to leave project", + "card": { + "backend": "Backend", + "settings": "Settings" + }, + "wizard": { + "submit": "Create" + }, + "edit": { + "general": "General", + "project_name": "Name", + "owner": "Owner", + "project_name_description": "Only latin characters, dashes, underscores, and digits", + "project_type": "Project type", + "project_type_description": "Choose which project type you want to create", + "backends": "Backends", + "base_backends_description": "dstack will automatically collect offers from the following providers. 
Deselect providers you don’t want to use.", + "backends_description": "The following backends can be configured with your own cloud credentials in the project settings after the project is created.", + "create_default_fleet": "Create a default fleet", + "default_fleet": "Default fleet", + "default_fleet_description": "At least one fleet is required to run dev environments, tasks, or services.", + "is_public": "Public", + "is_public_description": "Allow any user join the project as a member", + "backend": "Backend", + "backend_config": "Backend config", + "backend_config_description": "Specify the backend config in the YAML format. Click Info for examples.", + "backend_type": "Type", + "backend_type_description": "Select a backend type", + "members_empty_message_title": "No members", + "members_empty_message_text": "Select project's members", + "update_members_success": "Members are updated", + "update_visibility_success": "Project visibility updated successfully", + "update_templates_repo_success": "Templates updated successfully", + "update_visibility_confirm_title": "Change visibility", + "update_visibility_confirm_message": "Are you sure you want to change the project visibility? 
This will affect who can access this project.", + "change_visibility": "Change", + "project_visibility": "Visibility", + "project_visibility_settings": "Visibility", + "templates_repo": "Templates", + "override_project_templates": "Templates", + "transfer_ownership": "Ownership", + "templates_repo_description": "Set a project-level templates repository URL", + "templates_repo_placeholder": "https://github.com/org/templates.git", + "templates_repo_not_set": "not set", + "templates_repo_required": "Templates repo URL cannot be empty", + "save_templates_repo": "Save", + "configure_templates_repo": "Configure", + "change_templates_repo_title": "Override project templates", + "change_templates_repo_message": "Specify a new templates Git repo URL:", + "reset_templates_repo": "Reset", + "reset_templates_repo_title": "Reset templates", + "reset_templates_repo_message": "Are you sure you want to reset templates for this project?", + "project_visibility_description": "Control who can access this project", + "make_project_public": "Make project public", + "delete_project_confirm_title": "Delete project", + "delete_project_confirm_message": "Are you sure you want to delete this project?", + "delete_projects_confirm_title": "Delete projects", + "delete_projects_confirm_message": "Are you sure you want to delete these projects?", + "delete_this_project": "Delete this project", + "cli": "CLI", + "aws": { + "authorization": "Authorization", + "authorization_default": "Default credentials", + "authorization_access_key": "Access key", + "access_key": "Access key", + "access_key_id": "Access key ID", + "access_key_id_description": "Specify the AWS access key ID", + "secret_key": "Secret key", + "secret_key_id": "Secret access key", + "secret_key_id_description": "Specify the AWS secret access key", + "regions": "Regions", + "regions_description": "Select regions to run workflows and store artifacts", + "regions_placeholder": "Select regions", + 
"s3_bucket_name": "Bucket", + "s3_bucket_name_description": "Select an S3 bucket to store artifacts", + "ec2_subnet_id": "Subnet", + "ec2_subnet_id_description": "Select a subnet to run workflows in", + "ec2_subnet_id_placeholder": "Not selected", + "vpc_name": "VPC", + "vpc_name_description": "Enter a vpc" + }, + "azure": { + "authorization": "Authorization", + "authorization_default": "Default credentials", + "authorization_client": "Client secret", + "tenant_id": "Tenant ID", + "tenant_id_description": "Specify an Azure tenant ID", + "tenant_id_placeholder": "Not selected", + "client_id": "Client ID", + "client_id_description": "Specify an Azure client (application) ID", + "client_secret": "Client secret", + "client_secret_description": "Specify an Azure client (application) secret", + "subscription_id": "Subscription ID", + "subscription_id_description": "Select an Azure subscription ID", + "subscription_id_placeholder": "Not selected", + "locations": "Locations", + "locations_description": "Select locations to run workflows", + "locations_placeholder": "Select locations", + "storage_account": "Storage account", + "storage_account_description": "Select an Azure storage account to store artifacts", + "storage_account_placeholder": "Not selected" + }, + "gcp": { + "authorization": "Authorization", + "authorization_default": "Default credentials", + "service_account": "Service account key", + "credentials_description": "Credentials description", + "credentials_placeholder": "Credentials placeholder", + "regions": "Regions", + "regions_description": "Select regions to run workflows and store artifacts", + "regions_placeholder": "Select regions", + "project_id": "Project Id", + "project_id_description": "Select a project id", + "project_id_placeholder": "Select a project Id" + }, + "lambda": { + "api_key": "API key", + "api_key_description": "Specify the Lambda API key", + "regions": "Regions", + "regions_description": "Select regions to run workflows", + 
"regions_placeholder": "Select regions", + "storage_backend": { + "type": "Storage", + "type_description": "Select backend storage", + "type_placeholder": "Select type", + "credentials": { + "access_key_id": "Access key ID", + "access_key_id_description": "Specify the AWS access key ID", + "secret_key_id": "Secret access key", + "secret_key_id_description": "Specify the AWS secret access key" + }, + "s3_bucket_name": "Bucket", + "s3_bucket_name_description": "Select an S3 bucket to store artifacts" + } + }, + "local": { + "path": "Files path" + }, + "members": { + "section_title": "Members", + "name": "User name", + "role": "Project role" + }, + "secrets": { + "section_title": "Secrets", + "empty_message_title": "No secrets", + "empty_message_text": "No secrets to display.", + "name": "Secret name", + "value": "Secret value", + "create_secret": "Create secret", + "update_secret": "Update secret", + "delete_confirm_title": "Delete secret", + "delete_confirm_message": "Are you sure you want to delete the {{name}} secret?", + "multiple_delete_confirm_title": "Delete secrets", + "multiple_delete_confirm_message": "Are you sure you want to delete {{count}} secrets?", + "not_permissions_title": "No permissions", + "not_permissions_description": "You don't have permissions for managing secrets", + "validation": { + "secret_name_format": "Invalid secret name" + } + }, + "error_notification": "Update project error", + "validation": { + "user_name_format": "Only letters, numbers, - or _" + }, + "visibility": { + "private": "Private", + "public": "Public" + } + }, + "create": { + "page_title": "Create project", + "error_notification": "Create project error", + "success_notification": "Project is created" + }, + "repo": { + "search_placeholder": "Find repositories", + "empty_message_title": "No repositories", + "empty_message_text": "No repositories to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "card": { + "owner": 
"Owner", + "last_run": "Last run", + "tags_count": "Tags count", + "directory": "Directory" + }, + "secrets": { + "table_title": "Secrets", + "add_modal_title": "Add secret", + "update_modal_title": "Update secret", + "name": "Secret name", + "name_description": "Secret name", + "value": "Secret value", + "value_description": "Secret value", + "search_placeholder": "Find secrets", + "empty_message_title": "No secrets", + "empty_message_text": "No secrets to display." + } + }, + "run": { + "list_page_title": "Runs", + "search_placeholder": "Find runs", + "empty_message_title": "No runs", + "empty_message_text": "No runs to display.", + "quickstart_message_text": "Check out the quickstart guide to get started with dstack", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match. Try to change project or clear filter", + "filter_property_placeholder": "Filter by properties", + "project": "Project", + "project_placeholder": "Filtering by project", + "repo": "Repository", + "repo_placeholder": "Filtering by repository", + "user": "User", + "user_placeholder": "Filtering by user", + "active_only": "Active runs", + "log": "Logs", + "log_empty_message_title": "No logs", + "log_empty_message_text": "No logs to display.", + "inspect": "Inspect", + "run_name": "Name", + "workflow_name": "Workflow", + "configuration": "Configuration", + "instance": "Instance", + "priority": "Priority", + "provider_name": "Provider", + "status": "Status", + "probe": "Probes", + "submitted_at": "Submitted", + "finished_at": "Finished", + "metrics": { + "title": "Metrics", + "show_metrics": "Show metrics", + "cpu_utilization": "CPU utilization %", + "memory_used": "System memory used", + "per_each_cpu_utilization": "GPU utilization %", + "per_each_memory_used": "GPU memory used" + }, + "jobs": "Jobs", + "job_name": "Job Name", + "cost": "Cost", + "backend": "Backend", + "region": "Region", + "instance_id": "Instance ID", + "schedule": "Schedule", + "next_run": 
"Next run", + "resources": "Resources", + "spot": "Spot", + "termination_reason": "Termination reason", + "price": "Price", + "error": "Error", + "artifacts": "Artifacts", + "artifacts_count": "Artifacts", + "hub_user_name": "User", + "service_url": "Service URL", + "statuses": { + "pending": "Pending", + "submitted": "Submitted", + "provisioning": "Provisioning", + "pulling": "Pulling", + "downloading": "Downloading", + "running": "Running", + "uploading": "Uploading", + "stopping": "Stopping", + "stopped": "Stopped", + "terminating": "Terminating", + "terminated": "Terminated", + "aborting": "Aborting", + "aborted": "Aborted", + "failed": "Failed", + "done": "Done", + "building": "Building" + } + }, + "tag": { + "list_page_title": "Artifacts", + "search_placeholder": "Find tags", + "empty_message_title": "No tags", + "empty_message_text": "No tags to display.", + "tag_name": "Tag", + "run_name": "Run", + "artifacts": "Files" + }, + "artifact": { + "list_page_title": "Artifacts", + "search_placeholder": "Find objects", + "empty_message_title": "No objects", + "empty_message_text": "No objects to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "name": "Name", + "type": "Type", + "size": "Size" + } + }, + "runs": { + "launch_button": "Launch", + "no_templates_alert": { + "title": "No templates configured", + "description": "The selected project has no templates available for Launch.", + "action": "Settings" + }, + "launch": { + "wizard": { + "title": "Launch", + "submit": "Apply", + "project": "Project", + "project_description": "Select a project", + "project_empty": "No options", + "project_loading": "Loading options", + "template": "Template", + "template_description": "Select a template", + "template_empty": "No options", + "template_loading": "Loading options", + "template_placeholder": "Select a project to select a template", + "template_card_type": "Type", + "gpu": "GPU", + "gpu_description_enabled": 
"Choose a specific offer, or let dstack select it automatically.", + "gpu_description_disabled": "Enable GPU for this run.", + "offer": "Offer", + "offer_description": "Select an offer for the run.", + "name": "Name", + "name_description": "The name of the run, e.g. 'my-dev-env'", + "name_constraint": "Example: 'my-fleet' or 'default'. If not specified, generated automatically.", + "name_placeholder": "Optional", + "ide": "IDE", + "ide_description": "Optionally select an IDE to pre-install in the dev environment.", + "docker": "Docker", + "docker_image": "Image", + "docker_image_description": "A Docker image name, e.g. 'lmsysorg/sglang:latest'", + "docker_image_constraint": "The image must be public", + "docker_image_placeholder": "Required", + "python": "Python", + "python_description": "The version of Python, e.g. '3.12'", + "python_placeholder": "Optional", + "repo": "Repo", + "working_dir": "Working dir", + "working_dir_description": "The absolute path to the working directory inside the container, e.g. '/home/user/project'", + "working_dir_placeholder": "Optional", + "working_dir_constraint": "By default, set to '/workflow'", + "repo_url": "URL", + "repo_url_description": "A URL of a Git repository, e.g. 'https://github.com/user/repo'", + "repo_url_constraint": "The repo must be public", + "repo_url_placeholder": "Required", + "repo_path": "Path", + "repo_path_description": "The path inside the container to clone the repository, e.g. '/home/user/project'", + "repo_path_placeholder": "Optional", + "repo_path_constraint": "By default, set to '/workflow'", + "config": "Configuration file", + "configuration_label": "Configuration", + "configuration_description": "Review and adjust the configuration if needed.", + "success_notification": "The run is submitted!" 
+ } + } + }, + "offer": { + "title": "Offers", + "filter_property_placeholder": "Filter by properties", + "backend": "Backend", + "backend_plural": "Backends", + "availability_not_available": "Not available", + "availability_no_quota": "No quota", + "availability_no_balance": "No balance", + "groupBy": "Group by properties", + "region": "Region", + "count": "Count", + "price": "$/GPU", + "memory_mib": "Memory", + "spot": "Spot policy", + "empty_message_title_select_project": "Select a project", + "empty_message_text_select_project": "Use the filter above to select a project", + "empty_message_title_select_groupBy": "Select a group by", + "empty_message_text_select_groupBy": "Use the field above to select a group by", + "empty_message_title": "No offers", + "empty_message_text": "No offers to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match." + }, + + "models": { + "model_name": "Name", + "url": "URL", + "gateway": "Gateway", + "type": "Type", + "run": "Run", + "resources": "Resources", + "price": "Price", + "submitted_at": "Submitted", + "user": "User", + "repository": "Repository", + "backend": "Backend", + "code": "Code", + "empty_message_title": "No models", + "empty_message_text": "No models to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", + + "details": { + "instructions": "System", + "instructions_description": "Specify system", + "message_placeholder": "Enter your question", + "chat_empty_title": "No messages yet", + "chat_empty_message": "Please start a chat", + "run_name": "Run name", + "view_code": "View code", + "view_code_description": "You can use the following code to start integrating your current prompt and settings into your application." + } + }, + + "fleets": { + "no_alert": { + "title": "No fleets", + "description": "The project has no fleets. 
Create one before submitting a run.", + "button_title": "Create a fleet" + }, + "fleet": "Fleet", + "fleet_column_name": "Name", + "fleet_placeholder": "Filtering by fleet", + "fleet_name": "Fleet name", + "total_instances": "Number of instances", + "inspect": "Inspect", + "empty_message_title": "No fleets", + "empty_message_text": "No fleets to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", + "active_only": "Active fleets", + "filter_property_placeholder": "Filter by properties", + "statuses": { + "active": "Active", + "submitted": "Submitted", + "failed": "Failed", + "terminating": "Terminating", + "terminated": "Terminated" + }, + "create": { + "success_notification": "The fleet is created!" + }, + "instances": { + "active_only": "Active instances", + "filter_property_placeholder": "Filter by properties", + "title": "Instances", + "empty_message_title": "No instances", + "empty_message_text": "No instances to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "instance_name": "Name", + "instance_num": "Num", + "created": "Created", + "status": "Status", + "project": "Project", + "hostname": "Hostname", + "instance_type": "Type", + "statuses": { + "pending": "Pending", + "provisioning": "Provisioning", + "idle": "Idle", + "busy": "Busy", + "terminating": "Terminating", + "terminated": "Terminated" + }, + "resources": "Resources", + "backend": "Backend", + "region": "Region", + "spot": "Spot", + "started": "Started", + "finished_at": "Finished", + "price": "Price", + "termination_reason": "Termination reason", + "health": "Health", + "blocks": "Blocks", + "inspect": "Inspect" + }, + "edit": { + "name": "Name", + "name_description": "The name of the fleet, e.g. 'my-fleet'", + "name_placeholder": "Optional", + "name_constraint": "Example: 'my-fleet' or 'default'. 
If not specified, generated automatically.", + "min_instances": "Min number of instances", + "min_instances_description": "Set it '0' to provision instances only when required", + "max_instances": "Max number of instances", + "max_instances_description": "Required only if you want to set an upper limit", + "max_instances_placeholder": "Optional", + "idle_duration": "Idle duration", + "idle_duration_description": "Example: '0s', '1m', '1h'", + "spot_policy": "Spot policy", + "spot_policy_description": "Set it to 'auto' to allow the use of both on-demand and spot instances" + } + }, + "volume": { + "volumes": "Volumes", + "empty_message_title": "No volumes", + "empty_message_text": "No volumes to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "delete_volumes_confirm_title": "Delete volumes", + "delete_volumes_confirm_message": "Are you sure you want to delete these volumes?", + "active_only": "Active volumes", + "filter_property_placeholder": "Filter by properties", + + "name": "Name", + "project": "Project name", + "region": "Region", + "backend": "Backend", + "status": "Status", + "created": "Created", + "finished": "Finished", + "price": "Price (per month)", + "cost": "Cost", + "statuses": { + "failed": "Failed", + "submitted": "Submitted", + "provisioning": "Provisioning", + "active": "Active", + "deleted": "Deleted" + } + }, + + "events": { + "recorded_at": "Recorded At", + "actor": "Actor", + "targets": "Targets", + "message": "Message" + }, + + "users": { + "page_title": "Users", + "search_placeholder": "Find members", + "empty_message_title": "No members", + "empty_message_text": "No members to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "user_name": "User name", + "user_name_description": "Only latin characters, dashes, underscores, and digits", + "global_role_description": "Whether the user is an administrator or not", + 
"email_description": "Enter user email", + "token": "Token", + "token_description": "Specify your personal access token", + "global_role": "Global role", + "active": "Active", + "active_description": "Specify user activation", + "activated": "Activated", + "deactivated": "Deactivated", + "email": "Email", + "created_at": "Created at", + "account": "User", + "account_settings": "User settings", + "settings": "Settings", + "projects": "Projects", + "events": "Events", + "public_keys": { + "title": "SSH keys", + "add_key": "Add SSH key", + "name": "Title", + "fingerprint": "Fingerprint", + "key_type": "Key type", + "added": "Added", + "empty_title": "No SSH keys", + "empty_message": "You haven't added any SSH keys yet.", + "key_name_label": "Title", + "key_name_description": "A label to identify this key", + "key_name_placeholder": "My SSH key", + "key_label": "Key", + "key_description": "Paste your public key content (e.g. the contents of ~/.ssh/id_ed25519.pub)", + "key_required": "Key content is required", + "key_already_exists": "This public key is already added to your account", + "delete_confirm_title": "Delete SSH key", + "delete_confirm_message": "Are you sure you want to delete the selected SSH key(s)?" 
+ }, + "create": { + "page_title": "Create user", + "error_notification": "Create user error", + "success_notification": "User is created" + }, + "edit": { + "error_notification": "Update user error", + "success_notification": "User updating is successful", + "refresh_token_success_notification": "Token rotating is successful", + "refresh_token_error_notification": "Token rotating error", + "refresh_token_confirm_title": "Rotate token", + "refresh_token_confirm_message": "Are you sure you want to rotate token?", + "refresh_token_button_label": "Rotate", + "validation": { + "user_name_format": "Only letters, numbers, - or _", + "email_format": "Incorrect email" + } + }, + + "manual_payments": { + "title": "Credits history", + "add_payment": "Add payment", + "empty_message_title": "No payments", + "empty_message_text": "No payments to display.", + + "create": { + "success_notification": "Payment creating is successful" + }, + + "edit": { + "value": "Amount", + "value_description": "Enter amount here", + "description": "Description", + "description_description": "Describe payment here", + "created_at": "Created at" + } + }, + + "token_copied": "Token copied" + }, + "billing": { + "title": "Billing", + "balance": "Balance", + "billing_history": "Billing history", + "payment_method": "Payment method", + "no_payment_method": "No payment method attached", + "top_up_balance": "Top up balance", + "edit_payment_method": "Edit payment method", + "payment_amount": "Payment amount", + "amount_description": "Minimum: ${{value}}", + "make_payment": "Make a payment", + "min_amount_error_message": "The amount is allowed to be more than {{value}}", + "payment_success_message": "Payment succeeded. There can be a short delay before the balance is updated." 
+ }, + "validation": { + "required": "This is required field" + }, + "users_autosuggest": { + "placeholder": "Enter username or email to add member", + "entered_text": "Add member", + "loading": "Loading users", + "no_match": "No matches found" + }, + "roles": { + "admin": "Admin", + "manager": "Manager", + "user": "User" + }, + "confirm_dialog": { + "title": "Confirm delete", + "message": "Are you sure you want to delete?" + } +} diff --git a/frontend/src/locale/index.ts b/frontend/src/locale/index.ts new file mode 100644 index 0000000000..7b2f70ba0f --- /dev/null +++ b/frontend/src/locale/index.ts @@ -0,0 +1,18 @@ +import i18n from 'i18next'; +import { initReactI18next } from 'react-i18next'; + +import en from './en.json'; + +i18n.use(initReactI18next).init({ + returnNull: false, + resources: { + en: { + translation: en, + }, + }, + fallbackLng: 'en', + + interpolation: { + escapeValue: false, + }, +}); diff --git a/frontend/src/pages/Events/List/ListPage.tsx b/frontend/src/pages/Events/List/ListPage.tsx new file mode 100644 index 0000000000..9117b37bf0 --- /dev/null +++ b/frontend/src/pages/Events/List/ListPage.tsx @@ -0,0 +1,46 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, Header, SpaceBetween } from 'components'; + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; + +import { EventList } from './index'; + +export const ListPage: React.FC = () => { + const { t } = useTranslation(); + + useBreadcrumbs([ + { + text: t('navigation.events'), + href: ROUTES.EVENTS.LIST, + }, + ]); + + return ( + { + return ( +
+
+ ); + }} + /> + ); +}; diff --git a/frontend/src/pages/Events/List/helpers.ts b/frontend/src/pages/Events/List/helpers.ts new file mode 100644 index 0000000000..702c8a5617 --- /dev/null +++ b/frontend/src/pages/Events/List/helpers.ts @@ -0,0 +1,23 @@ +import type { PropertyFilterProps } from 'components'; + +export function filterLastElementByPrefix( + arr: PropertyFilterProps.Query['tokens'], + prefix: string, +): PropertyFilterProps.Query['tokens'] { + // Find the index of the last token whose propertyKey starts with `prefix` (stays -1 if there is none) + let lastTestIndex = -1; + for (let i = arr.length - 1; i >= 0; i--) { + if (arr[i].propertyKey?.startsWith(prefix)) { + lastTestIndex = i; + break; + } + } + + // Return a filtered copy of the tokens; the input array is not mutated + return arr.filter((item, index) => { + // Keep a token if: + // 1. its propertyKey is missing or does not start with `prefix`, + // 2. OR its propertyKey starts with `prefix` AND it is the last such token in the array + return !item.propertyKey?.startsWith(prefix) || index === lastTestIndex; + }); +} diff --git a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx new file mode 100644 index 0000000000..be4ec19a5e --- /dev/null +++ b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx @@ -0,0 +1,171 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { format } from 'date-fns'; + +import { NavigateLink, TableProps } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { ROUTES } from 'routes'; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns: TableProps.ColumnDefinition[] = [ + { + id: 'recorded_at', + header: t('events.recorded_at'), + cell: (item) => format(new Date(item.recorded_at), DATE_TIME_FORMAT), + }, + { + id: 'actor', + header: t('events.actor'), + cell: (item) => + item.actor_user ? 
( + {item.actor_user} + ) : ( + '-' + ), + }, + { + id: 'target', + header: t('events.targets'), + cell: (item) => { + return item.targets.map((target) => { + switch (target.type) { + case 'project': + return ( +
+ Project{' '} + {target.project_name && ( + + {target.project_name} + + )} +
+ ); + + case 'fleet': + return ( +
+ Fleet{' '} + {target.project_name && ( + + {target.project_name} + + )} + / + + {target.name} + +
+ ); + + case 'user': + return ( +
+ User{' '} + {target.name} +
+ ); + + case 'instance': + return ( +
+ Instance{' '} + {target.project_name && ( + + {target.project_name} + + )} + / + + {target.name} + +
+ ); + + case 'run': + return ( +
+ Run{' '} + {target.project_name && ( + + {target.project_name} + + )} + / + + {target.name} + +
+ ); + + case 'job': + return ( +
+ Job{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + + case 'volume': + return ( +
+ Volume{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + + case 'gateway': + return ( +
+ Gateway{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + + case 'secret': + return ( +
+ Secret{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + + default: + return '---'; + } + }); + }, + }, + { + id: 'message', + header: t('events.message'), + cell: ({ message }) => message, + }, + ]; + + return { columns } as const; +}; diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts new file mode 100644 index 0000000000..d160caa17f --- /dev/null +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -0,0 +1,392 @@ +import { useMemo, useState } from 'react'; +import { useSearchParams } from 'react-router-dom'; +import { omit } from 'lodash'; + +import type { PropertyFilterProps } from 'components'; + +import { + EMPTY_QUERY, + getNamePatternFilterRequestParams, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { useLazyGetProjectsQuery } from 'services/project'; +import { useLazyGetUserListQuery } from 'services/user'; + +import { filterLastElementByPrefix } from '../helpers'; + +type RequestParamsKeys = keyof TEventListFilters; + +const filterKeys: Record = { + TARGET_PROJECTS: 'target_projects', + TARGET_USERS: 'target_users', + TARGET_FLEETS: 'target_fleets', + TARGET_INSTANCES: 'target_instances', + TARGET_RUNS: 'target_runs', + TARGET_JOBS: 'target_jobs', + TARGET_VOLUMES: 'target_volumes', + TARGET_GATEWAYS: 'target_gateways', + TARGET_SECRETS: 'target_secrets', + WITHIN_PROJECTS: 'within_projects', + WITHIN_FLEETS: 'within_fleets', + WITHIN_RUNS: 'within_runs', + INCLUDE_TARGET_TYPES: 'include_target_types', + ACTORS: 'actors', +}; + +const onlyOneFilterGroupPrefixes = ['target_', 'within_']; + +const multipleChoiseKeys: RequestParamsKeys[] = [ + 'target_projects', + 'target_users', + 'target_fleets', + 'target_instances', + 'target_runs', + 'target_jobs', + 'target_volumes', + 'target_gateways', + 'target_secrets', + 'within_projects', + 'within_fleets', + 'within_runs', + 'include_target_types', + 'actors', +]; + +const targetTypes = [ + { label: 'Project', value: 'project' }, 
+ { label: 'User', value: 'user' }, + { label: 'Fleet', value: 'fleet' }, + { label: 'Instance', value: 'instance' }, + { label: 'Run', value: 'run' }, + { label: 'Job', value: 'job' }, + { label: 'Volume', value: 'volume' }, + { label: 'Gateway', value: 'gateway' }, + { label: 'Secret', value: 'secret' }, +]; + +const baseFilteringProperties = [ + { + key: filterKeys.TARGET_PROJECTS, + operators: ['='], + propertyLabel: 'Target projects', + groupValuesLabel: 'Project ids', + }, + { + key: filterKeys.TARGET_USERS, + operators: ['='], + propertyLabel: 'Target users', + groupValuesLabel: 'User ids', + }, + { + key: filterKeys.TARGET_FLEETS, + operators: ['='], + propertyLabel: 'Target fleet IDs', + groupValuesLabel: 'Fleet ids', + }, + { + key: filterKeys.TARGET_INSTANCES, + operators: ['='], + propertyLabel: 'Target instance IDs', + groupValuesLabel: 'Instance ids', + }, + { + key: filterKeys.TARGET_RUNS, + operators: ['='], + propertyLabel: 'Target run IDs', + groupValuesLabel: 'Run ids', + }, + { + key: filterKeys.TARGET_JOBS, + operators: ['='], + propertyLabel: 'Target job IDs', + groupValuesLabel: 'Job ids', + }, + { + key: filterKeys.TARGET_VOLUMES, + operators: ['='], + propertyLabel: 'Target volume IDs', + groupValuesLabel: 'Volume ids', + }, + { + key: filterKeys.TARGET_GATEWAYS, + operators: ['='], + propertyLabel: 'Target gateway IDs', + groupValuesLabel: 'Gateway ids', + }, + { + key: filterKeys.TARGET_SECRETS, + operators: ['='], + propertyLabel: 'Target secret IDs', + groupValuesLabel: 'Secret ids', + }, + + { + key: filterKeys.WITHIN_PROJECTS, + operators: ['='], + propertyLabel: 'Within projects', + groupValuesLabel: 'Project ids', + }, + + { + key: filterKeys.WITHIN_FLEETS, + operators: ['='], + propertyLabel: 'Within fleet IDs', + groupValuesLabel: 'Fleet ids', + }, + + { + key: filterKeys.WITHIN_RUNS, + operators: ['='], + propertyLabel: 'Within run IDs', + groupValuesLabel: 'Run ids', + }, + + { + key: filterKeys.INCLUDE_TARGET_TYPES, + 
operators: ['='], + propertyLabel: 'Target types', + groupValuesLabel: 'Target type values', + }, + + { + key: filterKeys.ACTORS, + operators: ['='], + propertyLabel: 'Actors', + groupValuesLabel: 'User names', + }, +]; + +const limit = 100; + +export const useFilters = ({ + permanentFilters, + withSearchParams, +}: { + permanentFilters?: Partial; + withSearchParams?: boolean; +}) => { + const [searchParams, setSearchParams] = useSearchParams(); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getUsers] = useLazyGetUserListQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => + requestParamsToTokens({ searchParams, filterKeys }), + ); + + const clearFilter = () => { + if (withSearchParams) { + setSearchParams({}); + } + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const filteringOptions = useMemo(() => { + const options: PropertyFilterProps.FilteringOption[] = [...dynamicFilteringOptions]; + + targetTypes?.forEach((targetType) => { + options.push({ + propertyKey: filterKeys.INCLUDE_TARGET_TYPES, + value: targetType.label, + }); + }); + + return options; + }, [dynamicFilteringOptions]); + + const setSearchParamsHandle = ({ tokens }: { tokens: PropertyFilterProps.Query['tokens'] }) => { + const searchParams = tokensToSearchParams(tokens); + + setSearchParams(searchParams); + }; + + const onChangePropertyFilterHandle = ({ tokens, operation }: PropertyFilterProps.Query) => { + let filteredTokens = [...tokens]; + + onlyOneFilterGroupPrefixes.forEach((prefix) => { + try { + filteredTokens = filterLastElementByPrefix(filteredTokens, prefix); + } catch (_) { + console.error(_); + } + }); + + if (withSearchParams) { + setSearchParamsHandle({ tokens: filteredTokens }); + } + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const onChangePropertyFilter: 
PropertyFilterProps['onChange'] = ({ detail }) => { + onChangePropertyFilterHandle(detail); + }; + + const filteringProperties = useMemo(() => { + const permanentFiltersKeysMap = new Map(); + + for (const prefix of onlyOneFilterGroupPrefixes) { + const permanentFilterKey = Object.keys(permanentFilters ?? {}).find((filterKey) => filterKey.startsWith(prefix)); + + if (permanentFilterKey) { + permanentFiltersKeysMap.set(prefix, permanentFilterKey); + } + } + + if (permanentFiltersKeysMap.size === 0) { + return baseFilteringProperties; + } + + return baseFilteringProperties.filter(({ key }) => { + const propertyPrefix = onlyOneFilterGroupPrefixes.find((prefix) => key.startsWith(prefix)); + + if (!propertyPrefix) { + return true; + } + + if (permanentFiltersKeysMap.has(propertyPrefix)) { + return key === permanentFiltersKeysMap.get(propertyPrefix); + } + + return true; + }); + }, [permanentFilters]); + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + arrayFieldKeys: multipleChoiseKeys, + }); + + const filterParamsWithPermanentFitters = (filterKey: RequestParamsKeys): string[] => { + let paramsFilter = params[filterKey] ?? ''; + const permanentFilter = permanentFilters?.[filterKey] ?? 
''; + + if (!Array.isArray(paramsFilter) && typeof paramsFilter === 'object') { + paramsFilter = ''; + } + + if (Array.isArray(paramsFilter) && Array.isArray(permanentFilter)) { + return [...paramsFilter, ...permanentFilter]; + } + + if (Array.isArray(paramsFilter) && !Array.isArray(permanentFilter)) { + return [...paramsFilter, permanentFilter]; + } + + if (!Array.isArray(paramsFilter) && Array.isArray(permanentFilter)) { + return [paramsFilter, ...permanentFilter]; + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + return [paramsFilter, permanentFilter]; + }; + + const targetProjects = filterParamsWithPermanentFitters(filterKeys.TARGET_PROJECTS).filter(Boolean); + + const withInProjects = filterParamsWithPermanentFitters(filterKeys.WITHIN_PROJECTS).filter(Boolean); + + const targetUsers = filterParamsWithPermanentFitters(filterKeys.TARGET_USERS).filter(Boolean); + + const actors = filterParamsWithPermanentFitters(filterKeys.ACTORS).filter(Boolean); + + const includeTargetTypes = filterParamsWithPermanentFitters(filterKeys.INCLUDE_TARGET_TYPES).filter(Boolean); + + const mappedFields = { + ...(targetProjects?.length + ? { + [filterKeys.TARGET_PROJECTS]: targetProjects, + } + : {}), + ...(withInProjects?.length + ? { + [filterKeys.WITHIN_PROJECTS]: withInProjects, + } + : {}), + + ...(targetUsers?.length + ? { + [filterKeys.TARGET_USERS]: targetUsers, + } + : {}), + + ...(actors?.length + ? { + [filterKeys.ACTORS]: actors, + } + : {}), + + ...(includeTargetTypes?.length + ? 
{ + [filterKeys.INCLUDE_TARGET_TYPES]: includeTargetTypes, + } + : {}), + }; + + return { + ...omit(params, [ + filterKeys.TARGET_PROJECTS, + filterKeys.WITHIN_PROJECTS, + filterKeys.TARGET_USERS, + filterKeys.ACTORS, + filterKeys.INCLUDE_TARGET_TYPES, + ]), + ...permanentFilters, + ...mappedFields, + } as TEventListFilters; + }, [propertyFilterQuery, permanentFilters]); + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.TARGET_PROJECTS || filteringProperty?.key === filterKeys.WITHIN_PROJECTS) { + await getProjects(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ project_name, project_id }) => ({ + propertyKey: filteringProperty?.key, + label: project_name, + value: project_id, + hiddenValue: 'test', + })), + ) + .then(setDynamicFilteringOptions); + } + + if (filteringProperty?.key === filterKeys.TARGET_USERS || filteringProperty?.key === filterKeys.ACTORS) { + await getUsers(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ username, id }) => ({ + propertyKey: filteringProperty?.key, + label: username, + value: id, + hiddenValue: 'test2', + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Events/List/index.tsx b/frontend/src/pages/Events/List/index.tsx new file mode 100644 index 0000000000..5b22d245ad --- /dev/null +++ b/frontend/src/pages/Events/List/index.tsx @@ -0,0 +1,116 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Loader, 
PropertyFilter, Table } from 'components'; +import { TableProps } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useInfiniteScroll } from 'hooks'; +import { useCollection } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from './hooks/useColumnDefinitions'; +import { useFilters } from './hooks/useFilters'; + +import styles from 'pages/Runs/List/styles.module.scss'; + +type RenderHeaderArgs = { + refreshAction?: () => void; + disabledRefresh?: boolean; +}; + +type EventListProps = Pick & { + withSearchParams?: boolean; + renderHeader?: (args: RenderHeaderArgs) => React.ReactNode; + permanentFilters?: Partial; + showFilters?: boolean; +}; + +export const EventList: React.FC = ({ + withSearchParams, + permanentFilters, + renderHeader, + showFilters = true, + ...props +}) => { + const { t } = useTranslation(); + + useBreadcrumbs([ + { + text: t('navigation.events'), + href: ROUTES.EVENTS.LIST, + }, + ]); + + const { + filteringRequestParams, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringStatusType, + handleLoadItems, + } = useFilters({ permanentFilters, withSearchParams }); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const { items, collectionProps } = useCollection(data, { + filtering: { + // empty: renderEmptyMessage(), + // noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + const { columns } = useColumnsDefinitions(); + + const loading = isLoading; + + return ( + +
+ `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
+ + ) + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Events/index.tsx b/frontend/src/pages/Events/index.tsx new file mode 100644 index 0000000000..7171443bc1 --- /dev/null +++ b/frontend/src/pages/Events/index.tsx @@ -0,0 +1 @@ +export { ListPage as EventList } from './List/ListPage'; diff --git a/frontend/src/pages/Fleets/Add/FleetFormFields/constants.tsx b/frontend/src/pages/Fleets/Add/FleetFormFields/constants.tsx new file mode 100644 index 0000000000..7904a17746 --- /dev/null +++ b/frontend/src/pages/Fleets/Add/FleetFormFields/constants.tsx @@ -0,0 +1,143 @@ +import React from 'react'; +import { get } from 'lodash'; +import * as yup from 'yup'; + +import { FleetFormFields } from './type'; + +export const fleetFormDefaultValues: FleetFormFields = { + min_instances: 0, + idle_duration: '5m', + spot_policy: 'auto', +}; + +export const FLEET_MIN_INSTANCES_INFO = { + header:

Min number of instances

, + body: ( + <> +

+ If you create a fleet here, it's recommended to set Min number of instances to 0. In + this case, dstack will provision instances only when you run a dev environment, task, or service. +

+ +

+ If you set Min number of instances above 0, dstack will try to provision + them right away. Note, setting Min number of instances above 0 is supported for + VM-based backends only. +

+ +

+ To learn more about fleets, see the{' '} + + documentation + + . +

+ + ), +}; + +export const FLEET_MAX_INSTANCES_INFO = { + header:

Max number of instances

, + body: ( + <> +

+ Set Max number of instances only if you need to limit the number of instances in the fleet. +

+ +

+ To learn more about fleets, see the{' '} + + documentation + + . +

+ + ), +}; + +export const FLEET_IDLE_DURATION_INFO = { + header:

Idle duration

, + body: ( + <> +

Idle instances can be reused when you submit a dev environment, task, or service.

+ +

+ Set Idle duration to control how long instances stay idle before they are terminated. +

+ +

+ Set Idle duration to 0s if you want instances to be terminated immediately after they + are no longer needed. +

+ +

+ Note, dstack doesn't terminate instances if their total number would be below{' '} + Min number of instances. +

+ +

+ To learn more about fleets, see the{' '} + + documentation + + . +

+ + ), +}; + +export const FLEET_SPOT_POLICY_INFO = { + header:

Spot policy

, + body: ( + <> +

+ Some backends may support spot instances, also known as preemptible instances. Such instances come at a + significantly lower price but can be interrupted by the cloud provider at any time. +

+

+ If you set spot_policy to auto, the fleet will allow the use of both types of + instances: on-demand and spot. +

+

+ Note that run configurations must specify their own spot_policy, which by default is always{' '} + on-demand. +

+ + ), +}; + +const requiredFieldError = 'This is required field'; +const numberFieldError = 'This is number field'; + +export const getMinInstancesValidator = (maxInstancesFieldPath: string) => + yup + .number() + .required(requiredFieldError) + .typeError(numberFieldError) + .min(0) + .test('is-smaller-than-max', 'The minimum value must be less than the maximum value.', (value, context) => { + const maxInstances = get(context.parent, maxInstancesFieldPath); + + if (typeof maxInstances !== 'number' || typeof value !== 'number') { + return true; + } + + return value <= maxInstances; + }); + +export const getMaxInstancesValidator = (minInstancesFieldPath: string) => + yup + .number() + .typeError(numberFieldError) + .min(1) + .test('is-greater-than-min', 'The maximum value must be greater than the minimum value', (value, context) => { + const minInstances = get(context.parent, minInstancesFieldPath); + + if (typeof minInstances !== 'number' || typeof value !== 'number') { + return true; + } + + return value >= minInstances; + }); + +export const idleDurationValidator = yup.string().matches(/^\d+[smhdw]$/, 'Invalid duration'); diff --git a/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx b/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx new file mode 100644 index 0000000000..4aa04c3935 --- /dev/null +++ b/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx @@ -0,0 +1,100 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { FormInput, FormSelect, InfoLink, SpaceBetween } from 'components'; + +import { useHelpPanel } from 'hooks'; + +import { + FLEET_IDLE_DURATION_INFO, + FLEET_MAX_INSTANCES_INFO, + FLEET_MIN_INSTANCES_INFO, + FLEET_SPOT_POLICY_INFO, +} from './constants'; +import { FleetFormFieldsProps } from './type'; + +import type { FieldValues } from 'react-hook-form/dist/types/fields'; + +export function FleetFormFields({ + control, + disabledAllFields, + fieldNamePrefix, +}: FleetFormFieldsProps) { + 
const { t } = useTranslation(); + const [openHelpPanel] = useHelpPanel(); + + const getFieldNameWitPrefix = (name: string): string => { + if (!fieldNamePrefix) { + return name; + } + + return [fieldNamePrefix, name].join('.'); + }; + + return ( + + + + openHelpPanel(FLEET_MIN_INSTANCES_INFO)} />} + label={t('fleets.edit.min_instances')} + constraintText={t('fleets.edit.min_instances_description')} + control={control} + //eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + name={getFieldNameWitPrefix(`min_instances`)} + disabled={disabledAllFields} + type="number" + /> + + openHelpPanel(FLEET_MAX_INSTANCES_INFO)} />} + label={t('fleets.edit.max_instances')} + constraintText={t('fleets.edit.max_instances_description')} + placeholder={t('fleets.edit.max_instances_placeholder')} + control={control} + //eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + name={getFieldNameWitPrefix(`max_instances`)} + disabled={disabledAllFields} + type="number" + /> + + openHelpPanel(FLEET_SPOT_POLICY_INFO)} />} + label={t('fleets.edit.spot_policy')} + constraintText={t('fleets.edit.spot_policy_description')} + control={control} + //eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + name={getFieldNameWitPrefix(`spot_policy`)} + disabled={disabledAllFields} + options={[ + { label: 'auto', value: 'auto' }, + { label: 'on-demand', value: 'on-demand' }, + { label: 'spot', value: 'spot' }, + ]} + /> + + openHelpPanel(FLEET_IDLE_DURATION_INFO)} />} + label={t('fleets.edit.idle_duration')} + constraintText={t('fleets.edit.idle_duration_description')} + control={control} + //eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + name={getFieldNameWitPrefix(`idle_duration`)} + disabled={disabledAllFields} + /> + + ); +} diff --git a/frontend/src/pages/Fleets/Add/FleetFormFields/type.ts b/frontend/src/pages/Fleets/Add/FleetFormFields/type.ts new file mode 100644 index 
0000000000..835ca83732 --- /dev/null +++ b/frontend/src/pages/Fleets/Add/FleetFormFields/type.ts @@ -0,0 +1,16 @@ +import type { FieldValues } from 'react-hook-form/dist/types/fields'; +import type { UseFormReturn } from 'react-hook-form/dist/types/form'; + +export interface FleetFormFieldsProps + extends Pick, 'control'> { + fieldNamePrefix?: string; + disabledAllFields?: boolean; +} + +export type FleetFormFields = { + name?: string; + min_instances: number; + max_instances?: number; + idle_duration?: string; + spot_policy: TSpotPolicy; +}; diff --git a/frontend/src/pages/Fleets/Add/index.tsx b/frontend/src/pages/Fleets/Add/index.tsx new file mode 100644 index 0000000000..d7390fc933 --- /dev/null +++ b/frontend/src/pages/Fleets/Add/index.tsx @@ -0,0 +1,266 @@ +import React, { useState } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import { isNil } from 'lodash'; +import * as yup from 'yup'; +import { Box, WizardProps } from '@cloudscape-design/components'; + +import { Container, InfoLink, KeyValuePairs, SpaceBetween, Wizard } from 'components'; + +import { useBreadcrumbs, useConfirmationDialog, useHelpPanel, useNotifications } from 'hooks'; +import { ROUTES } from 'routes'; +import { useApplyFleetMutation } from 'services/fleet'; + +import { DEFAULT_FLEET_INFO } from 'pages/Project/constants'; +import { useYupValidationResolver } from 'pages/Project/hooks/useYupValidationResolver'; + +import { + fleetFormDefaultValues, + getMaxInstancesValidator, + getMinInstancesValidator, + idleDurationValidator, +} from './FleetFormFields/constants'; +import { FleetFormFields } from './FleetFormFields'; + +import { IFleetWizardForm } from './types'; + +const requiredFieldError = 'This is required field'; +const namesFieldError = 'Only latin characters, dashes, underscores, and digits'; + +const fleetStepIndex = 0; + +const fleetValidationSchema = 
yup.object({ + project_name: yup + .string() + .required(requiredFieldError) + .matches(/^[a-zA-Z0-9-_]+$/, namesFieldError), + min_instances: getMinInstancesValidator('max_instances'), + max_instances: getMaxInstancesValidator('min_instances'), + idle_duration: idleDurationValidator, + spot_policy: yup.string().required(requiredFieldError), +}); + +export const FleetAdd: React.FC = () => { + const { t } = useTranslation(); + const navigate = useNavigate(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const [openHelpPanel] = useHelpPanel(); + const [pushNotification] = useNotifications(); + const [openConfirmationDialog] = useConfirmationDialog(); + const [applyFleet, { isLoading: isApplyingFleet }] = useApplyFleetMutation(); + const [activeStepIndex, setActiveStepIndex] = useState(0); + const resolver = useYupValidationResolver(fleetValidationSchema); + + const loading = isApplyingFleet; + + const formMethods = useForm({ + resolver, + defaultValues: { + ...fleetFormDefaultValues, + project_name: paramProjectName, + }, + }); + + const { handleSubmit, control, clearErrors, trigger, watch, getValues } = formMethods; + const formValues = watch(); + + const getFormValuesForFleetApplying = (): IApplyFleetPlanRequestRequest => { + const { min_instances, max_instances, idle_duration, name, spot_policy } = getValues(); + + return { + plan: { + spec: { + configuration: { + ...(name ? { name } : {}), + nodes: { + min: min_instances, + ...(max_instances ? { max: max_instances } : {}), + }, + ...(idle_duration ? 
{ idle_duration } : {}), + spot_policy, + }, + profile: {}, + }, + }, + force: false, + }; + }; + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('navigation.fleets'), + href: ROUTES.FLEETS.LIST, + }, + { + text: t('common.create_wit_text', { text: t('navigation.fleet') }), + href: ROUTES.FLEETS.ADD.FORMAT(paramProjectName), + }, + ]); + + const validateFleet = async () => { + return await trigger(['min_instances', 'max_instances', 'idle_duration']); + }; + + const emptyValidator = async () => Promise.resolve(true); + + const onNavigate = ({ + requestedStepIndex, + reason, + }: { + requestedStepIndex: number; + reason: WizardProps.NavigationReason; + }) => { + const stepValidators = [validateFleet, emptyValidator]; + + if (reason === 'next') { + stepValidators[activeStepIndex]?.().then((isValid) => { + if (isValid) { + if (activeStepIndex === fleetStepIndex && formValues?.['min_instances'] > 0) { + openConfirmationDialog({ + title: 'Are sure want to set min instances above than 0?', + content: null, + onConfirm: () => setActiveStepIndex(requestedStepIndex), + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + } + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + }; + + const onNavigateHandler: WizardProps['onNavigate'] = ({ detail: { requestedStepIndex, reason } }) => { + onNavigate({ requestedStepIndex, reason }); + }; + + const onCancelHandler = () => { + navigate(ROUTES.FLEETS.LIST); + }; + + const onSubmitWizard = async () => { + const isValid = await trigger(); + + const { project_name } = getValues(); + + if (!isValid) { + return; + } + + clearErrors(); + + const request = applyFleet({ + projectName: project_name, + ...getFormValuesForFleetApplying(), + }).unwrap(); + + request + .then((data) => { + pushNotification({ + type: 'success', + content: 
t('fleets.create.success_notification'), + }); + + navigate(ROUTES.FLEETS.DETAILS.FORMAT(data.project_name, data.id)); + }) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error?.error ?? error }), + }); + }); + }; + + const onSubmit = () => { + if (activeStepIndex < 2) { + onNavigate({ requestedStepIndex: activeStepIndex + 1, reason: 'next' }); + } else { + onSubmitWizard().catch(console.log); + } + }; + + const getDefaultFleetSummary = () => { + const summaryFields: Array = [ + 'name', + 'min_instances', + 'max_instances', + 'idle_duration', + 'spot_policy', + ]; + + const result: string[] = []; + + summaryFields.forEach((fieldName) => { + if (!isNil(formValues?.[fieldName])) { + result.push(`${t(`fleets.edit.${fieldName}`)}: ${formValues?.[fieldName]}`); + } + }); + + return result.join(', '); + }; + + return ( +
+ `Step ${stepNumber}`, + navigationAriaLabel: 'Steps', + cancelButton: t('common.cancel'), + previousButton: t('common.previous'), + nextButton: t('common.next'), + optional: 'optional', + }} + onCancel={onCancelHandler} + submitButtonText={t('projects.wizard.submit')} + steps={[ + { + title: 'Settings', + description: ( + + At least one fleet is required to run dev environments, tasks, or services. Create it here, or + create it using the dstack apply command via the CLI.{' '} + openHelpPanel(DEFAULT_FLEET_INFO)} /> + + ), + content: ( + + + control={control} disabledAllFields={loading} /> + + + ), + }, + { + title: 'Summary', + content: ( + + + + ), + }, + ]} + /> + + ); +}; diff --git a/frontend/src/pages/Fleets/Add/types.ts b/frontend/src/pages/Fleets/Add/types.ts new file mode 100644 index 0000000000..9e36ef72a6 --- /dev/null +++ b/frontend/src/pages/Fleets/Add/types.ts @@ -0,0 +1,5 @@ +import { FleetFormFields } from './FleetFormFields/type'; + +export interface IFleetWizardForm extends FleetFormFields { + project_name: IProject['project_name']; +} diff --git a/frontend/src/pages/Fleets/Details/Events/index.tsx b/frontend/src/pages/Fleets/Details/Events/index.tsx new file mode 100644 index 0000000000..9a81c7dec3 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/Events/index.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramFleetId = 
params.fleetId ?? ''; + const navigate = useNavigate(); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, within_fleets: [paramFleetId] }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?within_fleets=${paramFleetId}`); + }; + + const { columns } = useColumnsDefinitions(); + + return ( +
{t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx b/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx new file mode 100644 index 0000000000..97a09e2f38 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx @@ -0,0 +1,114 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import { format } from 'date-fns'; + +import { Box, ColumnLayout, Container, Header, Loader, NavigateLink, StatusIndicator } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { + formatFleetBackend, + formatFleetResources, + getFleetInstancesLinkText, + getFleetPrice, + getFleetStatusIconType, +} from 'libs/fleet'; +import { ROUTES } from 'routes'; +import { useGetFleetDetailsQuery } from 'services/fleet'; + +export const FleetDetails = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramFleetId = params.fleetId ?? ''; + const paramProjectName = params.projectName ?? ''; + + const { data, isLoading } = useGetFleetDetailsQuery( + { + projectName: paramProjectName, + fleetId: paramFleetId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + return ( + <> + {isLoading && ( + + + + )} + + {data && ( + {t('common.general')}}> + +
+ {t('fleets.fleet')} +
{data.name}
+
+ +
+ {t('fleets.instances.status')} + +
+ + {t(`fleets.statuses.${data.status}`)} + +
+
+ +
+ {t('fleets.instances.project')} + +
+ + {data.project_name} + +
+
+ +
+ {t('fleets.instances.backend')} +
{formatFleetBackend(data.spec.configuration)}
+
+ +
+ {t('fleets.instances.resources')} +
+ {data.spec.configuration.ssh_config + ? '-' + : formatFleetResources(data.spec.configuration.resources)} +
+
+ +
+ {t('fleets.instances.title')} + +
+ + {getFleetInstancesLinkText(data)} + +
+
+ +
+ {t('fleets.instances.created')} +
{format(new Date(data.created_at), DATE_TIME_FORMAT)}
+
+ +
+ {t('fleets.instances.price')} +
+ {(() => { + const p = getFleetPrice(data); + return typeof p === 'number' ? `$${p}` : '-'; + })()} +
+
+
+
+ )} + + ); +}; diff --git a/frontend/src/pages/Fleets/Details/Inspect/index.tsx b/frontend/src/pages/Fleets/Details/Inspect/index.tsx new file mode 100644 index 0000000000..8d9c5d5095 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/Inspect/index.tsx @@ -0,0 +1,73 @@ +import React, { useEffect, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { CodeEditor, Container, Header, Loader } from 'components'; + +import { useGetFleetDetailsQuery } from 'services/fleet'; + +interface AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const FleetInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramFleetId = params.fleetId ?? ''; + + const { data: fleetData, isLoading } = useGetFleetDetailsQuery( + { + projectName: paramProjectName, + fleetId: paramFleetId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + const jsonContent = useMemo(() => { + if (!fleetData) return ''; + return JSON.stringify(fleetData, null, 2); + }, [fleetData]); + + // Set editor to read-only after it loads + useEffect(() => { + const timer = setTimeout(() => { + // Find the ace editor instance in the DOM + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('fleets.inspect')}}> + { + // Prevent editing - onChange is required but we ignore changes + }} + /> + + ); +}; diff --git a/frontend/src/pages/Fleets/Details/index.tsx b/frontend/src/pages/Fleets/Details/index.tsx new file mode 100644 index 0000000000..6e5d9e6d7a --- /dev/null 
+++ b/frontend/src/pages/Fleets/Details/index.tsx @@ -0,0 +1,112 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useNavigate, useParams } from 'react-router-dom'; + +import { Button, ContentLayout, DetailsHeader, Tabs } from 'components'; + +enum CodeTab { + Details = 'details', + Events = 'events', + Inspect = 'inspect', +} + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetFleetDetailsQuery } from 'services/fleet'; + +import { useDeleteFleet } from '../List/useDeleteFleet'; + +import styles from './styles.module.scss'; + +export const FleetDetails: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramFleetId = params.fleetId ?? ''; + const paramProjectName = params.projectName ?? ''; + const navigate = useNavigate(); + + const { deleteFleets, isDeleting } = useDeleteFleet(); + + const { data } = useGetFleetDetailsQuery( + { + projectName: paramProjectName, + fleetId: paramFleetId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('navigation.fleets'), + href: ROUTES.FLEETS.LIST, + }, + { + text: data?.name ?? '', + href: ROUTES.FLEETS.DETAILS.FORMAT(paramProjectName, paramFleetId), + }, + ]); + + const deleteClickHandle = () => { + if (!data) return; + + deleteFleets([data]) + .then(() => { + navigate(ROUTES.FLEETS.LIST); + }) + .catch(console.log); + }; + + const isDisabledDeleteButton = !data || isDeleting; + + return ( +
+ + + + } + /> + } + > + + + + +
+ ); +}; diff --git a/frontend/src/pages/Fleets/Details/styles.module.scss b/frontend/src/pages/Fleets/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Fleets/List/hooks.tsx b/frontend/src/pages/Fleets/List/hooks.tsx new file mode 100644 index 0000000000..73286d247a --- /dev/null +++ b/frontend/src/pages/Fleets/List/hooks.tsx @@ -0,0 +1,229 @@ +import React, { useCallback, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useSearchParams } from 'react-router-dom'; +import { format } from 'date-fns'; +import { ToggleProps } from '@cloudscape-design/components'; + +import type { PropertyFilterProps } from 'components'; +import { Button, ListEmptyMessage, NavigateLink, StatusIndicator, TableProps } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { useLocalStorageState } from 'hooks'; +import { + EMPTY_QUERY, + getNamePatternFilterRequestParams, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { + formatFleetBackend, + formatFleetResources, + getFleetInstancesLinkText, + getFleetPrice, + getFleetStatusIconType, +} from 'libs/fleet'; +import { ROUTES } from 'routes'; +import { useLazyGetProjectsQuery } from 'services/project'; + +const limit = 100; + +export const useEmptyMessages = ({ + clearFilter, + isDisabledClearFilter, +}: { + clearFilter?: () => void; + isDisabledClearFilter?: boolean; +}) => { + const { t } = useTranslation(); + + const renderEmptyMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, 
isDisabledClearFilter]); + + const renderNoMatchMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + return { renderEmptyMessage, renderNoMatchMessage } as const; +}; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns: TableProps.ColumnDefinition[] = [ + { + id: 'fleet_name', + header: t('fleets.fleet_column_name'), + cell: (item) => ( + {item.name} + ), + }, + { + id: 'status', + header: t('fleets.instances.status'), + cell: (item) => ( + + {t(`fleets.statuses.${item.status}`)} + + ), + }, + { + id: 'project', + header: t('fleets.instances.project'), + cell: (item) => ( + {item.project_name} + ), + }, + { + id: 'backend', + header: t('fleets.instances.backend'), + cell: (item) => formatFleetBackend(item.spec.configuration), + }, + { + id: 'resources', + header: t('fleets.instances.resources'), + cell: (item) => + item.spec.configuration.ssh_config ? '-' : formatFleetResources(item.spec.configuration.resources), + }, + { + id: 'instances', + header: t('fleets.instances.title'), + cell: (item) => ( + + {getFleetInstancesLinkText(item)} + + ), + }, + { + id: 'created', + header: t('fleets.instances.created'), + cell: (item) => format(new Date(item.created_at), DATE_TIME_FORMAT), + }, + { + id: 'price', + header: t('fleets.instances.price'), + cell: (item) => { + const price = getFleetPrice(item); + return typeof price === 'number' ? 
`$${price}` : '-'; + }, + }, + ]; + + return { columns } as const; +}; + +type RequestParamsKeys = keyof Pick; + +const filterKeys: Record = { + PROJECT_NAME: 'project_name', +}; + +export const useFilters = () => { + const [searchParams, setSearchParams] = useSearchParams(); + const [onlyActive, setOnlyActive] = useLocalStorageState('fleet-list-filter-only-active', true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => + requestParamsToTokens({ searchParams, filterKeys }), + ); + + const clearFilter = () => { + setSearchParams({}); + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const filteringOptions = useMemo(() => { + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); + + const filteringProperties = [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + ]; + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + const { tokens, operation } = detail; + + const filteredTokens = tokens.filter((token, tokenIndex) => { + return !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex); + }); + + setSearchParams(tokensToSearchParams(filteredTokens, onlyActive)); + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { + setOnlyActive(detail.checked); + }; + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + }); + + return { + ...params, + only_active: onlyActive, + include_imported: true, + } as Partial; + }, [propertyFilterQuery, onlyActive]); + + const isDisabledClearFilter = !propertyFilterQuery.tokens.length && 
!onlyActive; + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Fleets/List/index.tsx b/frontend/src/pages/Fleets/List/index.tsx new file mode 100644 index 0000000000..7e5ef21cf5 --- /dev/null +++ b/frontend/src/pages/Fleets/List/index.tsx @@ -0,0 +1,151 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; +import { ROUTES } from 'routes'; +import { useLazyGetFleetsQuery } from 'services/fleet'; + +import { NoFleetProjectAlert } from 'pages/Project/components/NoFleetProjectAlert'; + +import { useColumnsDefinitions, useEmptyMessages, useFilters } from './hooks'; +import { useDeleteFleet } from './useDeleteFleet'; + +import styles from './styles.module.scss'; + +export const FleetList: React.FC = () => { + const { t } = useTranslation(); + + useBreadcrumbs([ + { + text: t('navigation.fleets'), + href: ROUTES.FLEETS.LIST, + 
}, + ]); + + const { + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringRequestParams, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } = useFilters(); + + const projectHavingFleetMap = useCheckingForFleetsInProjects({}); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetFleetsQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastFleet) => ({ + prev_created_at: lastFleet.created_at, + prev_id: lastFleet.id, + }), + }); + + const { columns } = useColumnsDefinitions(); + const { deleteFleets, isDeleting } = useDeleteFleet(); + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ clearFilter, isDisabledClearFilter }); + + const { items, collectionProps } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const isDisabledDeleteButton = !selectedItems?.length || isDeleting; + + const deleteClickHandle = () => { + if (!selectedItems?.length) return; + + deleteFleets([...selectedItems]).catch(console.log); + }; + + const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); + + return ( +
+ + +
+ + +
+ + } + filter={ +
+
+ `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
+ +
+ + {t('fleets.active_only')} + +
+
+ } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Fleets/List/styles.module.scss b/frontend/src/pages/Fleets/List/styles.module.scss new file mode 100644 index 0000000000..ec38338c42 --- /dev/null +++ b/frontend/src/pages/Fleets/List/styles.module.scss @@ -0,0 +1,23 @@ +.noFleetAlert { + margin-bottom: 12px; +} +.filters { + display: flex; + flex-wrap: wrap; + gap: 0 20px; + + .propertyFilter { + max-width: 640px; + flex-grow: 1; + min-width: 0; + } + + .activeOnly { + display: flex; + padding-top: 7px; + } + + .clear { + + } +} diff --git a/frontend/src/pages/Fleets/List/useDeleteFleet.ts b/frontend/src/pages/Fleets/List/useDeleteFleet.ts new file mode 100644 index 0000000000..7e38dd2777 --- /dev/null +++ b/frontend/src/pages/Fleets/List/useDeleteFleet.ts @@ -0,0 +1,51 @@ +import { useCallback, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { useDeleteFleetMutation } from 'services/fleet'; + +export const useDeleteFleet = () => { + const { t } = useTranslation(); + const [deleteFleet] = useDeleteFleetMutation(); + const [pushNotification] = useNotifications(); + const [isDeleting, setIsDeleting] = useState(() => false); + + const namesOfFleetsGroupByProjectName = (volumes: IFleet[]) => { + return volumes.reduce>((acc, fleet) => { + if (acc[fleet.project_name]) { + acc[fleet.project_name].push(fleet.name); + } else { + acc[fleet.project_name] = [fleet.name]; + } + + return acc; + }, {}); + }; + + const deleteFleets = useCallback(async (fleets: IFleet[]) => { + if (!fleets.length) return Promise.reject('No fleets'); + + setIsDeleting(true); + + const groupedFleets = namesOfFleetsGroupByProjectName(fleets); + + const requests = Object.keys(groupedFleets).map((projectName) => { + return deleteFleet({ + projectName: projectName, + fleetNames: groupedFleets[projectName], + }).unwrap(); + }); + + return Promise.all(requests) + .finally(() => 
setIsDeleting(false)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + }, []); + + return { deleteFleets, isDeleting } as const; +}; diff --git a/frontend/src/pages/Fleets/index.ts b/frontend/src/pages/Fleets/index.ts new file mode 100644 index 0000000000..e388010498 --- /dev/null +++ b/frontend/src/pages/Fleets/index.ts @@ -0,0 +1,3 @@ +export { FleetList } from './List'; +export { FleetDetails } from './Details'; +export { FleetAdd } from './Add'; diff --git a/frontend/src/pages/Instances/Details/Events/index.tsx b/frontend/src/pages/Instances/Details/Events/index.tsx new file mode 100644 index 0000000000..53a07f1cdc --- /dev/null +++ b/frontend/src/pages/Instances/Details/Events/index.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramInstanceId = params.instanceId ?? 
''; + const navigate = useNavigate(); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, target_instances: [paramInstanceId] }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?target_instances=${paramInstanceId}`); + }; + + const { columns } = useColumnsDefinitions(); + + return ( +
{t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Instances/Details/Inspect/index.tsx b/frontend/src/pages/Instances/Details/Inspect/index.tsx new file mode 100644 index 0000000000..09205dad75 --- /dev/null +++ b/frontend/src/pages/Instances/Details/Inspect/index.tsx @@ -0,0 +1,64 @@ +import React, { useEffect, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { CodeEditor, Container, Header, Loader } from 'components'; + +import { useGetInstanceDetailsQuery } from 'services/instance'; + +interface AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const InstanceInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramInstanceId = params.instanceId ?? ''; + + const { data, isLoading } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + const jsonContent = useMemo(() => { + if (!data) return ''; + return JSON.stringify(data, null, 2); + }, [data]); + + useEffect(() => { + const timer = setTimeout(() => { + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('fleets.instances.inspect')}}> + {}} /> + + ); +}; diff --git a/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx b/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx new file mode 100644 index 0000000000..6b048729af --- /dev/null +++ 
b/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx @@ -0,0 +1,158 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import { format } from 'date-fns'; + +import { Box, ColumnLayout, Container, Header, Loader, NavigateLink, StatusIndicator } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { formatBackend, getStatusIconColor, getStatusIconType } from 'libs/fleet'; +import { getHealthStatusIconType, prettyEnumValue } from 'libs/instance'; +import { formatResources } from 'libs/resources'; +import { ROUTES } from 'routes'; +import { useGetInstanceDetailsQuery } from 'services/instance'; + +export const InstanceDetails = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramInstanceId = params.instanceId ?? ''; + const paramProjectName = params.projectName ?? ''; + + const { data, isLoading } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + return ( + <> + {isLoading && ( + + + + )} + + {data && ( + {t('common.general')}}> + +
+ {t('fleets.instances.project')} +
+ + {data.project_name} + +
+
+ +
+ {t('fleets.fleet')} +
+ {data.fleet_name && data.fleet_id ? ( + + {data.fleet_name} + + ) : ( + '-' + )} +
+
+ +
+ {t('fleets.instances.status')} +
+ + {(data.status === 'idle' || data.status === 'busy') && + data.total_blocks !== null && + data.total_blocks > 1 + ? `${data.busy_blocks}/${data.total_blocks} Busy` + : prettyEnumValue(data.status)} + +
+
+ +
+ {t('projects.run.error')} +
+ {data.unreachable ? ( + Unreachable + ) : data.health_status !== 'healthy' ? ( + + {prettyEnumValue(data.health_status)} + + ) : ( + '-' + )} +
+
+ +
+ {t('fleets.instances.started')} +
{format(new Date(data.created), DATE_TIME_FORMAT)}
+
+ +
+ {t('fleets.instances.finished_at')} +
{data.finished_at ? format(new Date(data.finished_at), DATE_TIME_FORMAT) : '-'}
+
+ + {data.termination_reason && ( +
+ {t('fleets.instances.termination_reason')} +
{data.termination_reason_message ?? prettyEnumValue(data.termination_reason)}
+
+ )} + +
+ {t('fleets.instances.resources')} +
{data.instance_type ? formatResources(data.instance_type.resources) : '-'}
+
+ +
+ {t('fleets.instances.backend')} +
{formatBackend(data.backend)}
+
+ +
+ {t('fleets.instances.region')} +
{data.region ?? '-'}
+
+ +
+ {t('fleets.instances.instance_type')} +
{data.instance_type?.name ?? '-'}
+
+ +
+ {t('fleets.instances.spot')} +
{data.instance_type?.resources.spot ? t('common.yes') : t('common.no')}
+
+ +
+ {t('fleets.instances.price')} +
{typeof data.price === 'number' ? `$${data.price}` : '-'}
+
+ + {data.total_blocks !== null && ( +
+ {t('fleets.instances.blocks')} +
{data.total_blocks}
+
+ )} + +
+ {t('fleets.instances.hostname')} +
{data.hostname ?? '-'}
+
+
+
+ )} + + ); +}; diff --git a/frontend/src/pages/Instances/Details/index.tsx b/frontend/src/pages/Instances/Details/index.tsx new file mode 100644 index 0000000000..2c7f1a2b5c --- /dev/null +++ b/frontend/src/pages/Instances/Details/index.tsx @@ -0,0 +1,110 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useNavigate, useParams } from 'react-router-dom'; + +import { Button, ContentLayout, DetailsHeader, Tabs } from 'components'; + +enum InstanceTab { + Details = 'details', + Events = 'events', + Inspect = 'inspect', +} + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetInstanceDetailsQuery } from 'services/instance'; + +import { useDeleteInstance } from './useDeleteInstance'; + +import styles from './styles.module.scss'; + +export const InstanceDetailsPage: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramInstanceId = params.instanceId ?? ''; + const paramProjectName = params.projectName ?? ''; + const navigate = useNavigate(); + + const { deleteInstance, isDeleting } = useDeleteInstance(); + + const { data } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('navigation.instances'), + href: ROUTES.INSTANCES.LIST, + }, + { + text: data?.name ?? '', + href: ROUTES.INSTANCES.DETAILS.FORMAT(paramProjectName, paramInstanceId), + }, + ]); + + const deleteClickHandle = () => { + if (!data) return; + + deleteInstance(data) + .then(() => { + navigate(ROUTES.INSTANCES.LIST); + }) + .catch(console.log); + }; + + const isDisabledDeleteButton = !data || isDeleting || data.status === 'terminated'; + + return ( +
+ + {t('common.delete')} + + } + /> + } + > + + + + +
+ ); +}; diff --git a/frontend/src/pages/Instances/Details/styles.module.scss b/frontend/src/pages/Instances/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Instances/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Instances/Details/useDeleteInstance.ts b/frontend/src/pages/Instances/Details/useDeleteInstance.ts new file mode 100644 index 0000000000..b460ed4373 --- /dev/null +++ b/frontend/src/pages/Instances/Details/useDeleteInstance.ts @@ -0,0 +1,50 @@
+import { useCallback, useState } from 'react';
+import { useTranslation } from 'react-i18next';
+
+import { useNotifications } from 'hooks';
+import { getServerError } from 'libs';
+import { useDeleteInstancesMutation } from 'services/instance';
+
+/**
+ * Hook for deleting a single instance.
+ *
+ * `deleteInstance` rejects without sending a request when the instance has no
+ * project or fleet name. A failed request is shown as an error notification
+ * and then rethrown, so callers can react to it. `isDeleting` is true while
+ * the request is in flight.
+ */
+export const useDeleteInstance = () => {
+    const { t } = useTranslation();
+    const [deleteInstances] = useDeleteInstancesMutation();
+    const [pushNotification] = useNotifications();
+    const [isDeleting, setIsDeleting] = useState(false);
+
+    const deleteInstance = useCallback(
+        async (instance: IInstance) => {
+            if (!instance.project_name || !instance.fleet_name) {
+                return Promise.reject('Missing project or fleet name');
+            }
+
+            setIsDeleting(true);
+
+            return deleteInstances({
+                projectName: instance.project_name,
+                fleetName: instance.fleet_name,
+                instancesNums: [instance.instance_num],
+            })
+                .unwrap()
+                .finally(() => setIsDeleting(false))
+                .catch((error) => {
+                    pushNotification({
+                        type: 'error',
+                        content: t('common.server_error', { error: getServerError(error) }),
+                    });
+                    throw error;
+                });
+        },
+        // Every value captured by the callback is listed, so it never keeps a stale `t`.
+        [deleteInstances, pushNotification, t],
+    );
+
+    return { deleteInstance, isDeleting } as const;
+};
diff --git a/frontend/src/pages/Instances/List/hooks/useActions.ts 
b/frontend/src/pages/Instances/List/hooks/useActions.ts new file mode 100644 index 0000000000..619bdbae5e --- /dev/null +++ b/frontend/src/pages/Instances/List/hooks/useActions.ts @@ -0,0 +1,74 @@
+import { useCallback, useState } from 'react';
+import { useTranslation } from 'react-i18next';
+
+import { useNotifications } from 'hooks';
+import { getServerError } from 'libs';
+import { useDeleteInstancesMutation } from 'services/instance';
+
+/**
+ * Groups instance numbers under a `<project_name>/<fleet_name>` key, so that a
+ * single delete request can be issued per fleet.
+ */
+const instancesGroupByFleetName = (instances: IInstance[]) => {
+    const numsByFleet: Record<string, IInstance['instance_num'][]> = {};
+
+    instances.forEach((instance) => {
+        const key = `${instance.project_name}/${instance.fleet_name}`;
+        const nums = numsByFleet[key] ?? [];
+
+        nums.push(instance.instance_num);
+        numsByFleet[key] = nums;
+    });
+
+    return numsByFleet;
+};
+
+/**
+ * Hook with the bulk actions of the instance list.
+ *
+ * Despite its name, `deleteFleets` deletes the given instances, one request per
+ * fleet. `isDeleting` is true from the moment the requests are started until
+ * they have all succeeded or the first one has failed.
+ */
+export const useActions = () => {
+    const { t } = useTranslation();
+    const [deleteInstances] = useDeleteInstancesMutation();
+    const [pushNotification] = useNotifications();
+    const [isDeleting, setIsDeleting] = useState(false);
+
+    const deleteFleets = useCallback(
+        async (instances: IInstance[]) => {
+            if (!instances.length) return Promise.reject('No instances');
+
+            setIsDeleting(true);
+
+            const groupedInstances = instancesGroupByFleetName(instances);
+
+            const requests = Object.keys(groupedInstances).map((key) => {
+                // NOTE(review): assumes neither name contains '/' — confirm against name validation.
+                const [projectName, fleetName] = key.split('/');
+
+                return deleteInstances({
+                    projectName,
+                    fleetName,
+                    instancesNums: groupedInstances[key],
+                }).unwrap();
+            });
+
+            // A failed request is reported through a notification and then swallowed:
+            // the returned promise resolves (with undefined) instead of rejecting.
+            return Promise.all(requests)
+                .finally(() => setIsDeleting(false))
+                .catch((error) => {
+                    pushNotification({
+                        type: 'error',
+                        content: t('common.server_error', { error: getServerError(error) }),
+                    });
+                });
+        },
+        // Every value captured by the callback is listed, so it never keeps a stale `t`.
+        [deleteInstances, pushNotification, t],
+    );
+
+    return { deleteFleets, isDeleting } as const;
+};
diff --git a/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx new file mode 100644 index 0000000000..98b1ddd2b4 --- /dev/null +++ 
b/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx @@ -0,0 +1,120 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { format } from 'date-fns'; + +import { Icon, NavigateLink, StatusIndicator, TableProps } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { formatBackend, getStatusIconColor, getStatusIconType } from 'libs/fleet'; +import { formatInstanceStatusText, getHealthStatusIconType, prettyEnumValue } from 'libs/instance'; +import { formatResources } from 'libs/resources'; +import { ROUTES } from 'routes'; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns: TableProps.ColumnDefinition[] = [ + { + id: 'name', + header: t('fleets.instances.instance_name'), + cell: (item) => + item.project_name ? ( + {item.name} + ) : ( + item.name + ), + }, + { + id: 'fleet_name', + header: t('fleets.fleet'), + cell: (item) => + item.fleet_name && item.project_name ? ( + + {item.fleet_name} + + ) : ( + '-' + ), + }, + { + id: 'project_name', + header: t('fleets.instances.project'), + cell: (item) => + item.project_name ? ( + {item.project_name} + ) : ( + item.project_name + ), + }, + { + id: 'status', + header: t('fleets.instances.status'), + cell: (item) => ( + + {formatInstanceStatusText(item)} + + ), + }, + { + id: 'error', + header: t('projects.run.error'), + cell: (item) => { + if (item.unreachable) return Unreachable; + if (item.health_status !== 'healthy') + return ( + + {prettyEnumValue(item.health_status)} + + ); + return null; + }, + }, + { + id: 'hostname', + header: t('fleets.instances.hostname'), + cell: (item) => item.hostname, + }, + { + id: 'backend', + header: t('fleets.instances.backend'), + cell: (item) => formatBackend(item.backend), + }, + { + id: 'price', + header: t('fleets.instances.price'), + cell: (item) => (typeof item.price === 'number' ? 
`$${item.price}` : '-'), + }, + { + id: 'region', + header: t('fleets.instances.region'), + cell: (item) => item.region, + }, + { + id: 'instance_type', + header: t('fleets.instances.instance_type'), + cell: (item) => item.instance_type?.name ?? '-', + }, + { + id: 'resources', + header: t('fleets.instances.resources'), + cell: (item) => (item.instance_type ? formatResources(item.instance_type.resources) : '-'), + }, + { + id: 'spot', + header: t('fleets.instances.spot'), + cell: (item) => item.instance_type?.resources.spot && , + }, + { + id: 'started', + header: t('fleets.instances.started'), + cell: (item) => format(new Date(item.created), DATE_TIME_FORMAT), + }, + { + id: 'finished_at', + header: t('fleets.instances.finished_at'), + cell: (item) => (item.finished_at ? format(new Date(item.finished_at), DATE_TIME_FORMAT) : '-'), + }, + ]; + + return { columns } as const; +}; diff --git a/frontend/src/pages/Instances/List/hooks/useEmptyMessage.tsx b/frontend/src/pages/Instances/List/hooks/useEmptyMessage.tsx new file mode 100644 index 0000000000..a2909c7637 --- /dev/null +++ b/frontend/src/pages/Instances/List/hooks/useEmptyMessage.tsx @@ -0,0 +1,42 @@ +import React, { useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ListEmptyMessage } from 'components'; + +export const useEmptyMessages = ({ + clearFilter, + isDisabledClearFilter, +}: { + clearFilter?: () => void; + isDisabledClearFilter?: boolean; +}) => { + const { t } = useTranslation(); + + const renderEmptyMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + const renderNoMatchMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + return { renderEmptyMessage, renderNoMatchMessage } as const; +}; diff --git a/frontend/src/pages/Instances/List/hooks/useFilters.ts b/frontend/src/pages/Instances/List/hooks/useFilters.ts new file 
mode 100644 index 0000000000..4239ef50c3 --- /dev/null +++ b/frontend/src/pages/Instances/List/hooks/useFilters.ts @@ -0,0 +1,124 @@ +import { useMemo, useState } from 'react'; +import { useSearchParams } from 'react-router-dom'; +import { ToggleProps } from '@cloudscape-design/components'; + +import type { PropertyFilterProps } from 'components'; + +import { useLocalStorageState } from 'hooks'; +import { + EMPTY_QUERY, + getNamePatternFilterRequestParams, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { useLazyGetProjectsQuery } from 'services/project'; + +type RequestParamsKeys = keyof Pick; + +const filterKeys: Record = { + PROJECT_NAMES: 'project_names', + FLEET_IDS: 'fleet_ids', +}; + +const limit = 100; + +export const useFilters = () => { + const [searchParams, setSearchParams] = useSearchParams(); + const [onlyActive, setOnlyActive] = useLocalStorageState('instance-list-filter-only-active', true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => { + return requestParamsToTokens({ searchParams, filterKeys }); + }); + + const clearFilter = () => { + setSearchParams({}); + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const filteringOptions = useMemo(() => { + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); + + const filteringProperties = [ + { + key: filterKeys.PROJECT_NAMES, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + { + key: filterKeys.FLEET_IDS, + operators: ['='], + propertyLabel: 'Fleet ID', + groupValuesLabel: 'Fleet ID values', + }, + ]; + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + const { tokens, operation } = detail; + + 
setSearchParams(tokensToSearchParams(tokens, onlyActive)); + + setPropertyFilterQuery({ + operation, + tokens, + }); + }; + + const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { + setOnlyActive(detail.checked); + }; + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + arrayFieldKeys: [filterKeys.PROJECT_NAMES, filterKeys.FLEET_IDS], + }); + + return { + ...params, + only_active: onlyActive, + include_imported: true, + } as Partial; + }, [propertyFilterQuery, onlyActive]); + + const isDisabledClearFilter = !propertyFilterQuery.tokens.length && !onlyActive; + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAMES) { + await getProjects(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAMES, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Instances/List/index.tsx b/frontend/src/pages/Instances/List/index.tsx new file mode 100644 index 0000000000..a0cd2be951 --- /dev/null +++ b/frontend/src/pages/Instances/List/index.tsx @@ -0,0 +1,140 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, 
useInfiniteScroll } from 'hooks'; +import { useCollection } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetInstancesQuery } from 'services/instance'; + +import { useActions } from './hooks/useActions'; +import { useColumnsDefinitions } from './hooks/useColumnDefinitions'; +import { useEmptyMessages } from './hooks/useEmptyMessage'; +import { useFilters } from './hooks/useFilters'; + +import styles from './styles.module.scss'; + +export const List: React.FC = () => { + const { t } = useTranslation(); + + useBreadcrumbs([ + { + text: t('navigation.instances'), + href: ROUTES.INSTANCES.LIST, + }, + ]); + + const { columns } = useColumnsDefinitions(); + + const { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } = useFilters(); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetInstancesQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastInstance) => ({ + prev_created_at: lastInstance.created, + prev_id: lastInstance.id, + }), + }); + + const { deleteFleets, isDeleting } = useActions(); + + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ clearFilter, isDisabledClearFilter }); + + const { items, collectionProps } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const isDisabledDeleteButton = !selectedItems?.length || isDeleting; + + const deleteClickHandle = () => { + if (!selectedItems?.length) return; + + deleteFleets([...selectedItems]).catch(console.log); + }; + + return ( +
+ + + + } + /> + } + > + {isLoadingRun && ( + + + + )} + + {modelData && ( + <> +
+
+ {t('common.general')}}> + +
+ {t('models.details.run_name')} + +
+ + {modelData.run_name} + +
+
+ +
+ {t('models.model_name')} +
{modelData.name}
+
+ +
+ {t('models.type')} +
{modelData.type}
+
+
+
+
+ +
+
+
+ + + +
+ {!messageForShowing.length && ( + + )} + + {messageForShowing.map((message, index) => ( +
+ + } + > + {renderMessageBody(message.content || '...')} + +
+ ))} +
+ +
+ + +
+ +
+
+ +
+ + + + + } + onDismiss={() => setViewCodeVisible(false)} + > + + {t('models.details.view_code_description')} + +
+
+ +
+ + setCodeTab(detail.activeTabId as CodeTab)} + activeTabId={codeTab} + tabs={[ + { + label: 'python', + id: CodeTab.Python, + content: {pythonCode}, + }, + { + label: 'curl', + id: CodeTab.Curl, + content: {curlCode}, + }, + ]} + /> +
+
+
+ + )} + + ); +}; diff --git a/frontend/src/pages/Models/Details/styles.module.scss b/frontend/src/pages/Models/Details/styles.module.scss new file mode 100644 index 0000000000..7196f62526 --- /dev/null +++ b/frontend/src/pages/Models/Details/styles.module.scss @@ -0,0 +1,91 @@ +.modelDetailsLayout { + display: flex; + flex-direction: column; + height: calc(100vh - 159px); + margin-bottom: -40px; +} + +.general { + flex-shrink: 0; +} + +.modelForm { + position: relative; + flex-grow: 1; + min-height: 0; + display: grid; + grid-template-columns: 300px 1fr; + grid-template-rows: 60px 1fr auto; + grid-template-areas: "sidebar buttons" + "sidebar chat" + "sidebar form"; +} + +.side { + grid-area: sidebar; + border-right: 1px solid #b6bec9; + padding-right: 20px; + padding-top: 20px; + + textarea { + resize: none !important; + } +} + +.buttons { + display: flex; + justify-content: flex-end; + align-items: center; + gap: 12px; + + & > button { + flex-shrink: 0; + } +} + +.chat { + overflow-y: auto; + grid-area: chat; + padding: 0 20px 20px; + display: flex; + flex-direction: column; + gap: 20px; + + .message { + p { + white-space: pre-wrap; + } + } +} + +.messageForm { + display: grid; + grid-template-columns: 1fr auto; + column-gap: 20px; + grid-area: form; + border-top: 1px solid #b6bec9; + margin-right: -20px; + padding: 20px; + + textarea { + resize: none !important; + max-height: calc(100vh - 550px); + } + + .buttons { + align-self: flex-end; + } +} + +.viewCodeControls { + position: relative; + display: flex; + flex-direction: column; + gap: 20px; + + .copyButton { + position: absolute; + right: 20px; + top: 80px; + } +} diff --git a/frontend/src/pages/Models/Details/types.ts b/frontend/src/pages/Models/Details/types.ts new file mode 100644 index 0000000000..82722b7ff6 --- /dev/null +++ b/frontend/src/pages/Models/Details/types.ts @@ -0,0 +1,11 @@ +export interface FormValues { + instructions?: string; + message: string; +} + +export type Role = 'system' | 'user' | 
'assistant' | 'tool'; + +export interface Message { + role: Role; + content: string; +} diff --git a/frontend/src/pages/Models/List/Preferences/consts.ts b/frontend/src/pages/Models/List/Preferences/consts.ts new file mode 100644 index 0000000000..d58d869abc --- /dev/null +++ b/frontend/src/pages/Models/List/Preferences/consts.ts @@ -0,0 +1,21 @@ +import { CollectionPreferencesProps } from 'components'; + +export const DEFAULT_PREFERENCES: CollectionPreferencesProps.Preferences = { + pageSize: 30, + contentDisplay: [ + { id: 'model_name', visible: true }, + { id: 'type', visible: true }, + { id: 'run', visible: true }, + { id: 'resources', visible: true }, + { id: 'price', visible: true }, + { id: 'submitted_at', visible: true }, + { id: 'user', visible: true }, + { id: 'repository', visible: true }, + { id: 'backend', visible: true }, + // hidden by default + { id: 'url', visible: false }, + ], + wrapLines: false, + stripedRows: false, + contentDensity: 'comfortable', +}; diff --git a/frontend/src/pages/Models/List/Preferences/index.tsx b/frontend/src/pages/Models/List/Preferences/index.tsx new file mode 100644 index 0000000000..ed546d5949 --- /dev/null +++ b/frontend/src/pages/Models/List/Preferences/index.tsx @@ -0,0 +1,37 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { CollectionPreferences } from 'components'; + +import { useModelListPreferences } from './useModelListPreferences'; + +export const Preferences: React.FC = () => { + const { t } = useTranslation(); + const [preferences, setPreferences] = useModelListPreferences(); + + return ( + setPreferences(detail)} + cancelLabel={t('common.cancel')} + confirmLabel={t('common.save')} + contentDisplayPreference={{ + title: t('common.select_visible_columns'), + options: [ + { id: 'model_name', label: t('models.model_name'), alwaysVisible: true }, + { id: 'type', label: `${t('models.type')}` }, + + { id: 'run', label: `${t('models.run')}` }, + { id: 'resources', label: 
`${t('models.resources')}` }, + { id: 'price', label: `${t('models.price')}` }, + { id: 'submitted_at', label: `${t('models.submitted_at')}` }, + { id: 'user', label: `${t('models.user')}` }, + { id: 'repository', label: `${t('models.repository')}` }, + { id: 'backend', label: `${t('models.backend')}` }, + // hidden by default + { id: 'url', label: `${t('models.url')}` }, + ], + }} + /> + ); +}; diff --git a/frontend/src/pages/Models/List/Preferences/useModelListPreferences.ts b/frontend/src/pages/Models/List/Preferences/useModelListPreferences.ts new file mode 100644 index 0000000000..14f17a8850 --- /dev/null +++ b/frontend/src/pages/Models/List/Preferences/useModelListPreferences.ts @@ -0,0 +1,14 @@ +import { CollectionPreferencesProps } from 'components'; + +import { useLocalStorageState } from 'hooks/useLocalStorageState'; + +import { DEFAULT_PREFERENCES } from './consts'; + +export const useModelListPreferences = () => { + const [preferences, setPreferences] = useLocalStorageState( + 'model-list-preferences', + DEFAULT_PREFERENCES, + ); + + return [preferences, setPreferences] as const; +}; diff --git a/frontend/src/pages/Models/List/constants.ts b/frontend/src/pages/Models/List/constants.ts new file mode 100644 index 0000000000..225f1c0e7f --- /dev/null +++ b/frontend/src/pages/Models/List/constants.ts @@ -0,0 +1,24 @@ +import { CollectionPreferencesProps } from 'components'; + +export const DEFAULT_PREFERENCES: CollectionPreferencesProps.Preferences = { + pageSize: 30, + contentDisplay: [ + { id: 'model_name', visible: true }, + { id: 'gateway', visible: false }, + { id: 'spot', visible: true }, + { id: 'price', visible: true }, + { id: 'submitted_at', visible: true }, + { id: 'status', visible: true }, + { id: 'cost', visible: true }, + // hidden by default + { id: 'project', visible: false }, + { id: 'hub_user_name', visible: false }, + { id: 'repo', visible: false }, + { id: 'instance', visible: false }, + { id: 'region', visible: false }, + { id: 
'backend', visible: false }, + ], + wrapLines: false, + stripedRows: false, + contentDensity: 'comfortable', +}; diff --git a/frontend/src/pages/Models/List/hooks.tsx b/frontend/src/pages/Models/List/hooks.tsx new file mode 100644 index 0000000000..7994b6e78f --- /dev/null +++ b/frontend/src/pages/Models/List/hooks.tsx @@ -0,0 +1,232 @@ +import React, { useCallback, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useSearchParams } from 'react-router-dom'; +import { format } from 'date-fns'; + +import type { PropertyFilterProps } from 'components'; +import { Button, ListEmptyMessage, NavigateLink, TableProps } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { + EMPTY_QUERY, + getNamePatternFilterRequestParams, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { ROUTES } from 'routes'; +import { useLazyGetProjectsQuery } from 'services/project'; +import { useLazyGetUserListQuery } from 'services/user'; + +import { getModelGateway } from '../helpers'; + +import { IModelExtended } from './types'; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns: TableProps.ColumnDefinition[] = [ + { + id: 'model_name', + header: t('models.model_name'), + cell: (item) => { + return ( + + {item.name} + + ); + }, + }, + { + id: 'type', + header: `${t('models.type')}`, + cell: (item) => item.type, + }, + { + id: 'url', + header: `${t('models.url')}`, + cell: (item) => getModelGateway(item.base_url), + }, + { + id: 'run', + header: `${t('models.run')}`, + cell: (item) => ( + + {item.run_name} + + ), + }, + { + id: 'resources', + header: `${t('models.resources')}`, + cell: (item) => item.resources, + }, + { + id: 'price', + header: `${t('models.price')}`, + cell: (item) => (item.price ? 
`$${item.price}` : null), + }, + { + id: 'submitted_at', + header: `${t('models.submitted_at')}`, + cell: (item) => format(new Date(item.submitted_at), DATE_TIME_FORMAT), + }, + { + id: 'user', + header: `${t('models.user')}`, + cell: (item) => {item.user}, + }, + { + id: 'repository', + header: `${t('models.repository')}`, + cell: (item) => item.repository, + }, + { + id: 'backend', + header: `${t('models.backend')}`, + cell: (item) => item.backend, + }, + ]; + + return { columns } as const; +}; + +export const useEmptyMessages = ({ + clearFilter, + isDisabledClearFilter, +}: { + clearFilter?: () => void; + isDisabledClearFilter?: boolean; +}) => { + const { t } = useTranslation(); + + const renderEmptyMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + const renderNoMatchMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + return { renderEmptyMessage, renderNoMatchMessage } as const; +}; + +type RequestParamsKeys = keyof Pick; + +const filterKeys: Record = { + PROJECT_NAME: 'project_name', + USER_NAME: 'username', +}; + +const limit = 100; + +export const useFilters = () => { + const [searchParams, setSearchParams] = useSearchParams(); + + const [filteringOptions, setFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getUsers] = useLazyGetUserListQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => + requestParamsToTokens({ searchParams, filterKeys }), + ); + + const clearFilter = () => { + setSearchParams({}); + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const filteringProperties = [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + { + key: filterKeys.USER_NAME, + operators: ['='], + propertyLabel: 
'User', + groupValuesLabel: 'User values', + }, + ]; + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + const { tokens, operation } = detail; + + const filteredTokens = tokens.filter((token, tokenIndex) => { + return !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex); + }); + + setSearchParams(tokensToSearchParams(filteredTokens)); + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const filteringRequestParams = useMemo(() => { + return tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + }) as Partial; + }, [propertyFilterQuery]); + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setFilteringOptions); + } + + if (filteringProperty?.key === filterKeys.USER_NAME) { + await getUsers(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ username }) => ({ + propertyKey: filterKeys.USER_NAME, + value: username, + })), + ) + .then(setFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Models/List/index.tsx b/frontend/src/pages/Models/List/index.tsx new file mode 100644 index 0000000000..769c8bc105 --- /dev/null +++ b/frontend/src/pages/Models/List/index.tsx @@ -0,0 +1,117 @@ +import React from 'react'; +import { useTranslation } 
from 'react-i18next'; + +import { Button, Header, Loader, PropertyFilter, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetModelsQuery } from 'services/run'; + +import { useModelListPreferences } from './Preferences/useModelListPreferences'; +import { useColumnsDefinitions, useEmptyMessages, useFilters } from './hooks'; +import { Preferences } from './Preferences'; + +import { IModelExtended } from './types'; + +import styles from './styles.module.scss'; + +export const List: React.FC = () => { + const { t } = useTranslation(); + + const { + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringRequestParams, + filteringStatusType, + handleLoadItems, + } = useFilters(); + + useBreadcrumbs([ + { + text: t('navigation.models'), + href: ROUTES.PROJECT.LIST, + }, + ]); + + const { columns } = useColumnsDefinitions(); + const [preferences] = useModelListPreferences(); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetModelsQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastModel) => ({ prev_submitted_at: lastModel.submitted_at }), + }); + + const isDisabledClearFilter = !propertyFilterQuery.tokens.length; + + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ + clearFilter, + isDisabledClearFilter, + }); + + const { items, collectionProps } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + return ( +
+ } + > + {t('navigation.models')} + + } + filter={ +
+
+ `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
+
+ } + preferences={} + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Models/List/styles.module.scss b/frontend/src/pages/Models/List/styles.module.scss new file mode 100644 index 0000000000..9f5eb3b217 --- /dev/null +++ b/frontend/src/pages/Models/List/styles.module.scss @@ -0,0 +1,17 @@ +.selectFilters { + --select-width: calc((688px - 3 * 20px) / 2); + display: flex; + flex-wrap: wrap; + gap: 0 20px; + + .propertyFilter { + max-width: 640px; + flex-grow: 1; + min-width: 0; + } + + .activeOnly { + display: flex; + padding-top: 7px; + } +} diff --git a/frontend/src/pages/Models/List/types.ts b/frontend/src/pages/Models/List/types.ts new file mode 100644 index 0000000000..196aec7387 --- /dev/null +++ b/frontend/src/pages/Models/List/types.ts @@ -0,0 +1,12 @@ +export interface IModelExtended extends Partial { + id: string; + run_name: string; + project_name: string; + submitted_at: string; + user: string; + resources: string | null; + price: number | null; + region: string | null; + repository: string | null; + backend: TBackendType | null; +} diff --git a/frontend/src/pages/Models/helpers.ts b/frontend/src/pages/Models/helpers.ts new file mode 100644 index 0000000000..24d297dd26 --- /dev/null +++ b/frontend/src/pages/Models/helpers.ts @@ -0,0 +1,13 @@ +import { isValidUrl } from 'libs'; + +export const getModelGateway = (baseUrl?: IModel['base_url']) => { + if (!baseUrl) { + return ''; + } + + if (isValidUrl(baseUrl)) { + return baseUrl; + } + + return document.location.origin + baseUrl; +}; diff --git a/frontend/src/pages/Models/index.ts b/frontend/src/pages/Models/index.ts new file mode 100644 index 0000000000..e4439beab7 --- /dev/null +++ b/frontend/src/pages/Models/index.ts @@ -0,0 +1 @@ +export { List as ModelsList } from './List'; diff --git a/frontend/src/pages/Offers/List/helpers.test.ts b/frontend/src/pages/Offers/List/helpers.test.ts new file mode 100644 index 0000000000..6d7c77132b --- /dev/null +++ 
b/frontend/src/pages/Offers/List/helpers.test.ts @@ -0,0 +1,20 @@ +import { rangeToObject } from './helpers'; + +describe('Offers helpers', () => { + test('rangeToObject parses open and closed ranges', () => { + expect(rangeToObject('1..')).toEqual({ min: 1 }); + expect(rangeToObject('..4')).toEqual({ max: 4 }); + expect(rangeToObject('1..4')).toEqual({ min: 1, max: 4 }); + }); + + test('rangeToObject parses GB ranges for memory', () => { + expect(rangeToObject('24GB..', { requireUnit: true })).toEqual({ min: 24 }); + expect(rangeToObject('..80GB', { requireUnit: true })).toEqual({ max: 80 }); + expect(rangeToObject('40GB..80GB', { requireUnit: true })).toEqual({ min: 40, max: 80 }); + }); + + test('rangeToObject rejects unitless memory when unit is required', () => { + expect(rangeToObject('24..80', { requireUnit: true })).toBeUndefined(); + expect(rangeToObject(24, { requireUnit: true })).toBeUndefined(); + }); +}); diff --git a/frontend/src/pages/Offers/List/helpers.tsx b/frontend/src/pages/Offers/List/helpers.tsx new file mode 100644 index 0000000000..531eb0544f --- /dev/null +++ b/frontend/src/pages/Offers/List/helpers.tsx @@ -0,0 +1,125 @@ +import React from 'react'; + +import { RequestParam } from '../../../libs/filters'; + +import styles from './styles.module.scss'; + +const rangeSeparator = '..'; + +export function convertMiBToGB(mib: number) { + return mib / 1024; +} + +export const getPropertyFilterOptions = (gpus: IGpu[]) => { + const names = new Set(); + const backends = new Set(); + const counts = new Set(); + + gpus.forEach((gp) => { + names.add(gp.name); + + if (gp.backend) { + backends.add(gp.backend); + } + + if (gp.backends?.length) { + gp.backends.forEach((i) => backends.add(i)); + } + + const countRange = renderRange(gp.count); + + if (gp.count && countRange) { + counts.add(countRange); + } + }); + + return { + names, + backends, + counts, + }; +}; + +export const getFleetFilterValue = (fleet: IFleet, selectedProjectName?: string) => { + if 
(selectedProjectName && fleet.project_name === selectedProjectName) { + return fleet.name; + } + + return `${fleet.project_name}/${fleet.name}`; +}; + +export const round = (number: number) => Math.round(number * 100) / 100; + +export const renderRange = (range: { min?: number; max?: number }) => { + if (typeof range.min === 'number' && typeof range.max === 'number' && range.max != range.min) { + return `${round(range.min)}${rangeSeparator}${round(range.max)}`; + } + + return range.min?.toString() ?? range.max?.toString(); +}; + +export const renderRangeJSX = (range: { min?: number; max?: number }) => { + if (typeof range.min === 'number' && typeof range.max === 'number' && range.max != range.min) { + return ( + <> + {round(range.min)} + {rangeSeparator} + {round(range.max)} + + ); + } + + return range.min?.toString() ?? range.max?.toString(); +}; + +/** + * Converts a range filter value (`min..max`, `min..` or `..max`) into numeric bounds. + * + * With `requireUnit`, string bounds that do not contain `gb` (case-insensitive) are ignored + * and a bare number is rejected. A string with no parsable bound yields `undefined`; + * a number yields `{ min, max }` with both set to that number. + */ +export const rangeToObject = ( + range: RequestParam, + { + requireUnit = false, + }: { + requireUnit?: boolean; + } = {}, +): { min?: number; max?: number } | undefined => { + const hasGbUnit = (value?: string) => /gb/i.test(value ?? ''); + + if (!range) return; + + if (typeof range === 'string') { + const [minString, maxString] = range.split(rangeSeparator); + // Keep only digits and dots, which drops any unit suffix before the numeric conversion. + const normalizeNumericPart = (value?: string) => (value ?? '').replace(/[^\d.]/g, ''); + const parseBound = (value?: string): number | undefined => { + if (requireUnit && value && !hasGbUnit(value)) { + return undefined; + } + const normalized = normalizeNumericPart(value); + if (!normalized) { + return undefined; + } + const parsed = Number(normalized); + return isNaN(parsed) ? undefined : parsed; + }; + + const min = parseBound(minString); + const max = parseBound(maxString); + + if (typeof min === 'number' && typeof max === 'number') { + return { min, max }; + } + + if (typeof min === 'number') { + return { min }; + } + + if (typeof max === 'number') { + return { max }; + } + + // Neither bound could be parsed (e.g. unitless '24..80' with requireUnit): + // report "no range" instead of falling through and returning the raw string. + return undefined; + } + + if (typeof range === 'number') { + return requireUnit ? 
undefined : { min: range, max: range }; + } + + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + return range; +}; diff --git a/frontend/src/pages/Offers/List/hooks/useEmptyMessages.tsx b/frontend/src/pages/Offers/List/hooks/useEmptyMessages.tsx new file mode 100644 index 0000000000..1d6ba07efb --- /dev/null +++ b/frontend/src/pages/Offers/List/hooks/useEmptyMessages.tsx @@ -0,0 +1,58 @@ +import React, { useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ListEmptyMessage } from 'components'; + +/** + * Builds the memoized renderers for the offers list empty and no-match states. + * + * `renderEmptyMessage` returns a different node when no project is selected, + * when no group-by is selected, and otherwise. + */ +export const useEmptyMessages = ({ + clearFilter, + isDisabledClearFilter, + projectNameSelected, + groupBySelected, +}: { + clearFilter?: () => void; + isDisabledClearFilter?: boolean; + projectNameSelected?: boolean; + groupBySelected?: boolean; +}) => { + const { t } = useTranslation(); + + const renderEmptyMessage = useCallback<() => React.ReactNode>(() => { + if (!projectNameSelected) { + return ( + + ); + } + + if (!groupBySelected) { + return ( + + ); + } + + return ( + + + + ); + // The selection flags are read above, so they must be dependencies to avoid a stale closure. + }, [clearFilter, isDisabledClearFilter, projectNameSelected, groupBySelected]); + + const renderNoMatchMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [clearFilter, isDisabledClearFilter]); + + return { renderEmptyMessage, renderNoMatchMessage } as const; +}; diff --git a/frontend/src/pages/Offers/List/hooks/useFilters.ts b/frontend/src/pages/Offers/List/hooks/useFilters.ts new file mode 100644 index 0000000000..5d331aa6d6 --- /dev/null +++ b/frontend/src/pages/Offers/List/hooks/useFilters.ts @@ -0,0 +1,399 @@ +import { useEffect, useMemo, useRef, useState } from 'react'; +import { useSearchParams } from 'react-router-dom'; + +import type { MultiselectProps, PropertyFilterProps } from 'components'; + +import { + EMPTY_QUERY, + getTokenAwareNamePatternFilterRequestParams, + requestParamsToArray, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { 
useLazyGetProjectFleetsQuery } from 'services/fleet'; +import { useGetProjectsQuery, useLazyGetProjectsQuery } from 'services/project'; + +import { getFleetFilterValue, getPropertyFilterOptions } from '../helpers'; + +type RequestParamsKeys = + | 'project_name' + | 'gpu_name' + | 'gpu_count' + | 'gpu_memory' + | 'backend' + | 'fleet' + | 'spot_policy' + | 'group_by'; + +export type UseFiltersArgs = { + gpus: IGpu[]; + withSearchParams?: boolean; + showFleetFilter?: boolean; + permanentFilters?: Partial>; + defaultFilters?: Partial>; +}; + +export const filterKeys: Record = { + PROJECT_NAME: 'project_name', + GPU_NAME: 'gpu_name', + GPU_COUNT: 'gpu_count', + GPU_MEMORY: 'gpu_memory', + BACKEND: 'backend', + FLEET: 'fleet', + SPOT_POLICY: 'spot_policy', +}; + +const multipleChoiceKeys: RequestParamsKeys[] = ['gpu_name', 'backend', 'fleet']; + +const spotPolicyOptions = [ + { + propertyKey: filterKeys.SPOT_POLICY, + value: 'spot', + }, + { + propertyKey: filterKeys.SPOT_POLICY, + value: 'on-demand', + }, + { + propertyKey: filterKeys.SPOT_POLICY, + value: 'auto', + }, +]; + +const filteringProperties = [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + { + key: filterKeys.GPU_NAME, + operators: ['='], + propertyLabel: 'GPU name', + groupValuesLabel: 'GPU name values', + }, + { + key: filterKeys.GPU_COUNT, + operators: ['<=', '>='], + propertyLabel: 'GPU count', + groupValuesLabel: 'GPU count values', + }, + { + key: filterKeys.GPU_MEMORY, + operators: ['<=', '>='], + propertyLabel: 'GPU memory', + groupValuesLabel: 'GPU memory values', + }, + { + key: filterKeys.BACKEND, + operators: ['='], + propertyLabel: 'Backend', + groupValuesLabel: 'Backend values', + }, + { + key: filterKeys.FLEET, + operators: ['='], + propertyLabel: 'Fleet', + groupValuesLabel: 'Fleet values', + }, + { + key: filterKeys.SPOT_POLICY, + operators: ['='], + propertyLabel: 'Spot policy', + groupValuesLabel: 'Spot 
policy values', + }, +]; + +const gpuFilterOption = { label: 'GPU', value: 'gpu' }; +const defaultGroupByOptions = [{ ...gpuFilterOption }, { label: 'Backend', value: 'backend' }]; +const groupByRequestParamName: RequestParamsKeys = 'group_by'; +const limit = 100; + +export const useFilters = ({ + gpus, + withSearchParams = true, + showFleetFilter = false, + permanentFilters = {}, + defaultFilters, +}: UseFiltersArgs) => { + const [searchParams, setSearchParams] = useSearchParams(); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getProjectFleets] = useLazyGetProjectFleetsQuery(); + const { data: projectsData } = useGetProjectsQuery({ limit: 1 }); + const projectNameIsChecked = useRef(false); + const prevSelectedProjectName = useRef(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => { + const queryFromSearchParams = requestParamsToTokens({ + searchParams, + filterKeys, + defaultFilterValues: defaultFilters, + }); + + const tokens = showFleetFilter + ? 
queryFromSearchParams.tokens + : queryFromSearchParams.tokens.filter((token) => token.propertyKey !== filterKeys.FLEET); + + const query = { + ...queryFromSearchParams, + tokens, + }; + + if (query.tokens.length > 0) { + return query; + } + + return EMPTY_QUERY; + }); + + const [groupBy, setGroupBy] = useState(() => { + const selectedGroupBy = requestParamsToArray({ + searchParams, + paramName: groupByRequestParamName, + }); + + if (selectedGroupBy.length) { + return defaultGroupByOptions.filter(({ value }) => selectedGroupBy.includes(value)); + } + + return [gpuFilterOption]; + }); + + const clearFilter = () => { + if (withSearchParams) { + setSearchParams({}); + } + setPropertyFilterQuery(EMPTY_QUERY); + setGroupBy([]); + }; + + const filteringOptions = useMemo(() => { + const options: PropertyFilterProps.FilteringOption[] = [...spotPolicyOptions, ...dynamicFilteringOptions]; + + const { names, backends } = getPropertyFilterOptions(gpus); + + Array.from(names).forEach((name) => { + options.push({ + propertyKey: filterKeys.GPU_NAME, + value: name, + }); + }); + + Array.from(backends).forEach((backend) => { + options.push({ + propertyKey: filterKeys.BACKEND, + value: backend, + }); + }); + + return options; + }, [gpus, dynamicFilteringOptions]); + + const groupByOptions: MultiselectProps.Options = useMemo(() => { + return defaultGroupByOptions.map((option) => { + if (option.value === 'gpu' && groupBy.some(({ value }) => value === 'backend')) { + return { + ...option, + disabled: true, + }; + } + + if (option.value === 'backend' && !groupBy.some(({ value }) => value === 'gpu')) { + return { + ...option, + disabled: true, + }; + } + + return option; + }); + }, [groupBy]); + + const filteringPropertiesForShowing = useMemo(() => { + const permanentFilterKeys = Object.keys(permanentFilters); + return filteringProperties.filter(({ key }) => { + if (key === filterKeys.FLEET && !showFleetFilter) { + return false; + } + + return !permanentFilterKeys.includes(key); + }); + 
}, [permanentFilters, showFleetFilter]); + + const setSearchParamsHandle = ({ + tokens, + groupBy, + }: { + tokens: PropertyFilterProps.Query['tokens']; + groupBy: MultiselectProps.Options; + }) => { + if (!withSearchParams) { + return; + } + + const searchParams = tokensToSearchParams(tokens); + + groupBy.forEach(({ value }) => searchParams.append(groupByRequestParamName, value as string)); + + setSearchParams(searchParams); + }; + + const onChangePropertyFilterHandle = ({ tokens, operation }: PropertyFilterProps.Query) => { + const filteredTokens = tokens.filter((token, tokenIndex) => { + return ( + multipleChoiceKeys.includes(token.propertyKey as RequestParamsKeys) || + !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex) + ); + }); + + setSearchParamsHandle({ + tokens: filteredTokens, + groupBy: [...groupBy], + }); + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + onChangePropertyFilterHandle(detail); + }; + + const onChangeGroupBy: MultiselectProps['onChange'] = ({ detail }) => { + const selectedGpu = detail.selectedOptions.some(({ value }) => value === 'gpu'); + + let tempSelectedOptions: MultiselectProps.Options = detail.selectedOptions; + + if (!selectedGpu) { + tempSelectedOptions = detail.selectedOptions.filter(({ value }) => value !== 'backend'); + } + + setSearchParamsHandle({ + tokens: propertyFilterQuery.tokens, + groupBy: tempSelectedOptions, + }); + + setGroupBy(tempSelectedOptions); + }; + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + arrayFieldKeys: multipleChoiceKeys, + }); + + return { + ...params, + ...permanentFilters, + }; + }, [propertyFilterQuery, permanentFilters]); + + const selectedProjectName = useMemo(() => { + const projectName = filteringRequestParams['project_name']; + + return typeof projectName 
=== 'string' ? projectName : undefined; + }, [filteringRequestParams]); + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects( + getTokenAwareNamePatternFilterRequestParams({ + filteringText, + limit, + propertyKey: filterKeys.PROJECT_NAME, + tokens: propertyFilterQuery.tokens, + }), + ) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + if (showFleetFilter && filteringProperty?.key === filterKeys.FLEET && selectedProjectName) { + await getProjectFleets({ + projectName: selectedProjectName, + includeImported: true, + }) + .unwrap() + .then((fleets) => + fleets + .map((fleet) => ({ + propertyKey: filterKeys.FLEET, + value: getFleetFilterValue(fleet, selectedProjectName), + })) + .filter(({ value }) => value.toLowerCase().includes(filteringText.toLowerCase())) + .slice(0, limit), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + useEffect(() => { + if (!projectNameIsChecked.current && projectsData?.data?.length) { + projectNameIsChecked.current = true; + + if (!filteringRequestParams['project_name']) { + onChangePropertyFilterHandle({ + tokens: [ + ...propertyFilterQuery.tokens, + { + operator: '=', + propertyKey: filterKeys.PROJECT_NAME, + value: projectsData.data[0].project_name, + }, + ], + operation: 'and', + }); + } + } + }, [projectsData]); + + useEffect(() => { + const prevProjectName = prevSelectedProjectName.current; + prevSelectedProjectName.current = selectedProjectName; + + if (!showFleetFilter || prevProjectName === selectedProjectName) { + return; + } + + if (!propertyFilterQuery.tokens.some((token) => token.propertyKey === filterKeys.FLEET)) { + 
return; + } + + onChangePropertyFilterHandle({ + tokens: propertyFilterQuery.tokens.filter((token) => token.propertyKey !== filterKeys.FLEET), + operation: propertyFilterQuery.operation, + }); + }, [propertyFilterQuery, selectedProjectName, showFleetFilter]); + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties: filteringPropertiesForShowing, + groupBy, + groupByOptions, + onChangeGroupBy, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Offers/List/index.tsx b/frontend/src/pages/Offers/List/index.tsx new file mode 100644 index 0000000000..60b2ed147a --- /dev/null +++ b/frontend/src/pages/Offers/List/index.tsx @@ -0,0 +1,313 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Alert, Cards, CardsProps, MultiselectCSD, Popover, PropertyFilter } from 'components'; + +import { useCollection } from 'hooks'; +import { useGetGpusListQuery } from 'services/gpu'; + +import { useEmptyMessages } from './hooks/useEmptyMessages'; +import { useFilters, UseFiltersArgs } from './hooks/useFilters'; +import { convertMiBToGB, rangeToObject, renderRange, renderRangeJSX, round } from './helpers'; + +import styles from './styles.module.scss'; + +const getRequestParams = ({ + project_name, + gpu_name, + backend, + fleet, + gpu_count, + gpu_memory, + spot_policy, + group_by, +}: { + project_name: string; + gpu_name?: string[]; + backend?: string[]; + fleet?: string[]; + gpu_count?: string; + gpu_memory?: string; + spot_policy?: TSpot; + group_by?: TGpuGroupBy[]; +}): TGpusListQueryParams => { + const gpuCountMinMax = rangeToObject(gpu_count ?? ''); + const gpuMemoryMinMax = rangeToObject(gpu_memory ?? 
'', { requireUnit: true }); + + return { + project_name, + group_by, + run_spec: { + configuration: { + nodes: 1, + ports: [], + commands: [':'], + type: 'task', + privileged: false, + home_dir: '/root', + env: {}, + resources: { + // cpu: { min: 2 }, + // memory: { min: 8.0 }, + // disk: { size: { min: 100.0 } }, + gpu: { + ...(gpu_name?.length ? { name: gpu_name } : {}), + ...(gpuCountMinMax ? { count: gpuCountMinMax as unknown as TRange } : {}), + ...(gpuMemoryMinMax ? { memory: gpuMemoryMinMax as unknown as TRange } : {}), + }, + }, + spot_policy, + volumes: [], + files: [], + setup: [], + ...(backend?.length ? { backends: backend as TBackendType[] } : {}), + ...(fleet?.length ? { fleets: fleet } : {}), + }, + profile: { name: 'default', default: false }, + ssh_key_pub: '(dummy)', + }, + }; +}; + +type OfferListProps = Pick & { + permanentFilters?: UseFiltersArgs['permanentFilters']; + defaultFilters?: UseFiltersArgs['defaultFilters']; + withSearchParams?: boolean; + showFleetFilter?: boolean; + disabled?: boolean; + onChangeProjectName?: (value: string) => void; + onChangeBackendFilter?: (backends: string[]) => void; + onChangeFleetFilter?: (fleets: string[]) => void; +}; + +export const OfferList: React.FC = ({ + withSearchParams, + showFleetFilter, + disabled, + onChangeProjectName, + onChangeBackendFilter, + onChangeFleetFilter, + permanentFilters, + defaultFilters, + ...props +}) => { + const { t } = useTranslation(); + const [requestParams, setRequestParams] = useState(); + + const { data, error, isError, isLoading, isFetching } = useGetGpusListQuery( + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + requestParams, + { + skip: disabled || !requestParams || !requestParams['project_name'] || !requestParams['group_by']?.length, + }, + ); + + const { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + groupBy, + groupByOptions, + 
onChangeGroupBy, + filteringStatusType, + handleLoadItems, + } = useFilters({ gpus: data?.gpus ?? [], withSearchParams, showFleetFilter, permanentFilters, defaultFilters }); + + useEffect(() => { + setRequestParams( + getRequestParams({ + ...filteringRequestParams, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + group_by: groupBy.map(({ value }) => value), + }), + ); + }, [JSON.stringify(filteringRequestParams), groupBy]); + + useEffect(() => { + const projectName = typeof filteringRequestParams.project_name === 'string' ? filteringRequestParams.project_name : ''; + onChangeProjectName?.(projectName); + }, [filteringRequestParams.project_name]); + + useEffect(() => { + const backend = filteringRequestParams.backend; + const backendValues = backend + ? (Array.isArray(backend) ? backend : [backend]).filter((value): value is string => typeof value === 'string') + : []; + onChangeBackendFilter?.(backendValues); + }, [filteringRequestParams.backend]); + + useEffect(() => { + const fleet = filteringRequestParams.fleet; + const fleetValues = fleet + ? (Array.isArray(fleet) ? fleet : [fleet]).filter((value): value is string => typeof value === 'string') + : []; + onChangeFleetFilter?.(fleetValues); + }, [filteringRequestParams.fleet]); + + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ + clearFilter, + projectNameSelected: Boolean(requestParams?.['project_name']), + groupBySelected: Boolean(requestParams?.['group_by']?.length), + }); + + const { items, collectionProps } = useCollection( + requestParams?.['project_name'] && requestParams?.['group_by']?.length ? (data?.gpus ?? []) : [], + { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }, + ); + + const groupByBackend = groupBy.some(({ value }) => value === 'backend'); + + const sections = [ + { + id: 'memory_mib', + // header: t('offer.memory_mib'), + content: (gpu: IGpu) => ( +
+ {round(convertMiBToGB(gpu.memory_mib))}GB + : + {renderRange(gpu.count)} +
+ ), + width: 50, + }, + { + id: 'price', + // header: t('offer.price'), + content: (gpu: IGpu) => ${renderRangeJSX(gpu.price) ?? '-'}, + width: 50, + }, + // { + // id: 'count', + // header: t('offer.count'), + // content: (gpu: IGpu) => renderRange(gpu.count) ?? '-', + // width: 50, + // }, + !groupByBackend && { + id: 'backends', + // header: t('offer.backend_plural'), + content: (gpu: IGpu) => gpu.backends?.join(', ') ?? '-', + width: 50, + }, + groupByBackend && { + id: 'backend', + // header: t('offer.backend'), + content: (gpu: IGpu) => gpu.backend ?? '-', + width: 50, + }, + // { + // id: 'region', + // header: t('offer.region'), + // content: (gpu) => gpu.region ?? gpu.regions?.join(', ') ?? '-', + // width: 50, + // }, + { + id: 'spot', + // header: t('offer.spot'), + content: (gpu: IGpu) => gpu.spot.join(', ') ?? '-', + width: 50, + }, + { + id: 'availability', + content: (gpu: IGpu) => { + const availabilityIssues = + gpu.availability.length > 0 && + gpu.availability.every((a) => a === 'not_available' || a === 'no_quota' || a === 'no_balance'); + + if (!availabilityIssues) { + return null; + } + + if (gpu.availability.length === 1) { + return {t(`offer.availability_${gpu.availability[0]}`)}; + } + + return ( + t(`offer.availability_${a}`)).join(', ')} + > + {t('offer.availability_not_available')} + + ); + }, + width: 50, + }, + ].filter(Boolean) as CardsProps.CardDefinition['sections']; + + return ( + <> + {!disabled && isError && ( + + {'data' in (error as object) && (error as { data?: { detail?: { msg?: string }[] } }).data?.detail?.[0]?.msg + ? (error as { data?: { detail?: { msg?: string }[] } }).data?.detail?.[0]?.msg + : t('common.server_error', { error: 'Unknown error' })} + + )} + + gpu.name, + sections, + }} + loading={!disabled && (isLoading || isFetching)} + loadingText={t('common.loading')} + stickyHeader={true} + filter={ + disabled ? undefined : ( +
+
+ `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
+ +
+ +
+
+ ) + } + /> + + ); +}; diff --git a/frontend/src/pages/Offers/List/styles.module.scss b/frontend/src/pages/Offers/List/styles.module.scss new file mode 100644 index 0000000000..dd0686150b --- /dev/null +++ b/frontend/src/pages/Offers/List/styles.module.scss @@ -0,0 +1,26 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.selectFilters { + display: flex; + flex-wrap: wrap; + gap: 0 20px; + + .filterField { + flex-shrink: 0; + width: 240px; + } + + .propertyFilter { + max-width: 640px; + flex-grow: 1; + min-width: 0; + } +} + +.greyText { + color: awsui.$color-text-status-inactive; +} + +.greenText { + color: awsui.$color-text-status-success; +} diff --git a/frontend/src/pages/Offers/ListPage.tsx b/frontend/src/pages/Offers/ListPage.tsx new file mode 100644 index 0000000000..17e27c96ed --- /dev/null +++ b/frontend/src/pages/Offers/ListPage.tsx @@ -0,0 +1,23 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; + +import { Header } from '../../components'; +import { OfferList } from './List'; + +export const ListPage: React.FC = () => { + const { t } = useTranslation(); + + useBreadcrumbs([ + { + text: t('offer.title'), + href: ROUTES.OFFERS.LIST, + }, + ]); + + return ( + {t('offer.title')}} /> + ); +}; diff --git a/frontend/src/pages/Offers/index.ts b/frontend/src/pages/Offers/index.ts new file mode 100644 index 0000000000..2f8cba53fb --- /dev/null +++ b/frontend/src/pages/Offers/index.ts @@ -0,0 +1 @@ +export { ListPage as OfferList } from './ListPage'; diff --git a/frontend/src/pages/Project/Add/index.tsx b/frontend/src/pages/Project/Add/index.tsx new file mode 100644 index 0000000000..23cbc4c006 --- /dev/null +++ b/frontend/src/pages/Project/Add/index.tsx @@ -0,0 +1,347 @@ +import React, { useState } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 
'react-router-dom'; +import { isNil } from 'lodash'; +import * as yup from 'yup'; +import { WizardProps } from '@cloudscape-design/components'; + +import { Container, FormInput, FormToggle, InfoLink, KeyValuePairs, SpaceBetween, Wizard } from 'components'; + +import { useBreadcrumbs, useConfirmationDialog, useHelpPanel, useNotifications } from 'hooks'; +import { isResponseServerError, isResponseServerFormFieldError } from 'libs'; +import { ROUTES } from 'routes'; +import { useApplyFleetMutation } from 'services/fleet'; +import { useCreateProjectMutation } from 'services/project'; + +import { FleetFormFields } from 'pages/Fleets/Add/FleetFormFields'; +import { + fleetFormDefaultValues, + getMaxInstancesValidator, + getMinInstancesValidator, + idleDurationValidator, +} from 'pages/Fleets/Add/FleetFormFields/constants'; + +import { DEFAULT_FLEET_INFO } from '../constants'; +import { useYupValidationResolver } from '../hooks/useYupValidationResolver'; + +import { IProjectForm } from '../Form/types'; +import { FieldPath } from 'react-hook-form/dist/types/path'; + +const requiredFieldError = 'This is required field'; +const namesFieldError = 'Only latin characters, dashes, underscores, and digits'; + +const fleetStepIndex = 1; + +const projectValidationSchema = yup.object({ + project_name: yup + .string() + .required(requiredFieldError) + .matches(/^[a-zA-Z0-9-_]+$/, namesFieldError), + is_public: yup.boolean(), + fleet: yup.object().shape({ + min_instances: yup.number().when('enable_default', { + is: true, + then: getMinInstancesValidator('max_instances'), + }), + max_instances: yup.number().when('enable_default', { + is: true, + then: getMaxInstancesValidator('min_instances'), + }), + idle_duration: yup.string().when('enable_default', { + is: true, + then: idleDurationValidator, + }), + spot_policy: yup.string().required(requiredFieldError), + }), +}); + +export const ProjectAdd: React.FC = () => { + const { t } = useTranslation(); + const navigate = useNavigate(); + 
const [pushNotification] = useNotifications(); + const [openConfirmationDialog] = useConfirmationDialog(); + const [openHelpPanel] = useHelpPanel(); + const [createProject, { isLoading }] = useCreateProjectMutation(); + const [applyFleet, { isLoading: isApplyingFleet }] = useApplyFleetMutation(); + const [activeStepIndex, setActiveStepIndex] = useState(0); + const resolver = useYupValidationResolver(projectValidationSchema); + + const loading = isLoading || isApplyingFleet; + + const formMethods = useForm({ + resolver, + defaultValues: { + is_public: false, + fleet: { + ...fleetFormDefaultValues, + enable_default: true, + }, + }, + }); + + const { handleSubmit, control, setError, clearErrors, trigger, watch, getValues } = formMethods; + const formValues = watch(); + + const getFormValuesForServer = (): IProjectCreateRequestParams => { + const { project_name, is_public } = getValues(); + + return { + project_name, + is_public, + }; + }; + + const getFormValuesForFleetApplying = (): IApplyFleetPlanRequestRequest => { + const { + fleet: { min_instances, max_instances, idle_duration, name, spot_policy }, + } = getValues(); + + return { + plan: { + spec: { + configuration: { + ...(name ? { name } : {}), + nodes: { + min: min_instances, + ...(max_instances ? { max: max_instances } : {}), + }, + ...(idle_duration ? 
{ idle_duration } : {}), + spot_policy, + }, + profile: {}, + }, + }, + force: false, + }; + }; + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: t('common.create_wit_text', { text: t('navigation.project') }), + href: ROUTES.PROJECT.ADD, + }, + ]); + + const validateNameAndType = async () => { + return await trigger(['project_name', 'is_public']); + }; + const validateFleet = async () => { + return await trigger(['fleet.enable_default', 'fleet.min_instances', 'fleet.max_instances', 'fleet.idle_duration']); + }; + + const emptyValidator = async () => Promise.resolve(true); + + const onNavigate = ({ + requestedStepIndex, + reason, + }: { + requestedStepIndex: number; + reason: WizardProps.NavigationReason; + }) => { + const stepValidators = [validateNameAndType, validateFleet, emptyValidator]; + + if (reason === 'next') { + stepValidators[activeStepIndex]?.().then((isValid) => { + if (isValid) { + if (activeStepIndex === fleetStepIndex && formValues?.['fleet']['min_instances'] > 0) { + openConfirmationDialog({ + title: 'Are sure want to set min instances above than 0?', + content: null, + onConfirm: () => setActiveStepIndex(requestedStepIndex), + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + } + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + }; + + const onNavigateHandler: WizardProps['onNavigate'] = ({ detail: { requestedStepIndex, reason } }) => { + onNavigate({ requestedStepIndex, reason }); + }; + + const onCancelHandler = () => { + navigate(ROUTES.PROJECT.LIST); + }; + + const onSubmitWizard = async () => { + const isValid = await trigger(); + + const { fleet } = getValues(); + + if (!isValid) { + return; + } + + clearErrors(); + + const request = createProject(getFormValuesForServer()).unwrap(); + + request + .then(async (data) => { + pushNotification({ + type: 'success', + content: t('projects.create.success_notification'), + }); + + if (fleet.enable_default) { + await 
applyFleet({ + projectName: data.project_name, + ...getFormValuesForFleetApplying(), + }).unwrap(); + } + + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(data.project_name)); + }) + .catch((error) => { + const errorRequestData = error?.data; + + if (isResponseServerError(errorRequestData)) { + errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + setError(error.loc.join('.') as FieldPath, { type: 'custom', message: error.msg }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error?.error ?? error }), + }); + } + }); + }; + + const onSubmit = () => { + if (activeStepIndex < 2) { + onNavigate({ requestedStepIndex: activeStepIndex + 1, reason: 'next' }); + } else { + onSubmitWizard().catch(console.log); + } + }; + + const getDefaultFleetSummary = () => { + const summaryFields: Array = [ + 'name', + 'min_instances', + 'max_instances', + 'idle_duration', + 'spot_policy', + ]; + + const result: string[] = []; + + summaryFields.forEach((fieldName) => { + if (!isNil(formValues?.fleet?.[fieldName])) { + result.push(`${t(`fleets.edit.${fieldName}`)}: ${formValues['fleet'][fieldName]}`); + } + }); + + return result.join(', '); + }; + + return ( +
+ `Step ${stepNumber}`, + navigationAriaLabel: 'Steps', + cancelButton: t('common.cancel'), + previousButton: t('common.previous'), + nextButton: t('common.next'), + optional: 'optional', + }} + onCancel={onCancelHandler} + submitButtonText={t('projects.wizard.submit')} + steps={[ + { + title: 'Name and public', + content: ( + + + + + + + + ), + }, + { + title: 'Fleets', + content: ( + + + {t('projects.edit.default_fleet')}} + constraintText={t('projects.edit.default_fleet_description')} + toggleInfo={ openHelpPanel(DEFAULT_FLEET_INFO)} />} + control={control} + name="fleet.enable_default" + /> + + {formValues['fleet']['enable_default'] && ( + + control={control} + disabledAllFields={loading} + fieldNamePrefix="fleet" + /> + )} + + + ), + }, + { + title: 'Summary', + content: ( + + + + ), + }, + ]} + /> + + ); +}; diff --git a/frontend/src/pages/Project/Backends/Add/index.tsx b/frontend/src/pages/Project/Backends/Add/index.tsx new file mode 100644 index 0000000000..611b76d6d6 --- /dev/null +++ b/frontend/src/pages/Project/Backends/Add/index.tsx @@ -0,0 +1,105 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Container, Header } from 'components'; + +import { useBreadcrumbs, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { + // useCreateBackendMutation, + useCreateBackendViaYamlMutation, +} from 'services/backend'; + +// import { BackendForm } from '../Form'; +// import { prepareBackendConfigForApi } from '../Form/helpers'; +import { YAMLForm } from '../YAMLForm'; + +export const BackendAdd: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? 
''; + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + + // const [createBackend, { isLoading }] = useCreateBackendMutation(); + const [createBackendViaYaml, { isLoading: isCreatingViaYaml }] = useCreateBackendViaYamlMutation(); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.settings'), + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + text: t('backend.add_backend'), + href: ROUTES.PROJECT.BACKEND.ADD.FORMAT(paramProjectName), + }, + ]); + + const onCancelHandler = () => { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + }; + + // const onSubmitHandler = (backend: TBackendConfig) => { + // const request = createBackend({ + // projectName: paramProjectName, + // config: prepareBackendConfigForApi(backend), + // }).unwrap(); + // + // request + // .then(() => { + // pushNotification({ + // type: 'success', + // content: t('backend.create.success_notification'), + // }); + // + // navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + // }) + // .catch((error: never) => { + // console.log(error); + // }); + // + // return request; + // }; + + const onSubmitYamlHandler = (backend: IBackendConfigYaml) => { + const request = createBackendViaYaml({ + projectName: paramProjectName, + backend, + }).unwrap(); + + request + .then(() => { + pushNotification({ + type: 'success', + content: t('backend.create.success_notification'), + }); + + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + }) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }; + + return ( + {t('backend.add_backend')}}> + {/**/} + + + ); +}; diff --git a/frontend/src/pages/Project/Backends/Edit/index.tsx 
b/frontend/src/pages/Project/Backends/Edit/index.tsx new file mode 100644 index 0000000000..10912b4bea --- /dev/null +++ b/frontend/src/pages/Project/Backends/Edit/index.tsx @@ -0,0 +1,151 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import { pick } from 'lodash'; + +import { Container, Header, Loader } from 'components'; + +import { useBreadcrumbs, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { + // useGetBackendConfigQuery, + useGetBackendYamlQuery, + // useUpdateBackendMutation, + useUpdateBackendViaYamlMutation, +} from 'services/backend'; + +// import { BackendForm } from '../Form'; +// import { prepareBackendConfigForApi } from '../Form/helpers'; +import { YAMLForm } from '../YAMLForm'; + +export const BackendEdit: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramBackendName = params.backend ?? 
''; + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + // const [updateProject, { isLoading: isBackendUpdating }] = useUpdateBackendMutation(); + const [updateBackendYamlConfig, { isLoading: isBackendYamlConfigUpdating }] = useUpdateBackendViaYamlMutation(); + + // const { data, isLoading } = useGetBackendConfigQuery({ projectName: paramProjectName, backendName: paramBackendName }); + + const { + data: backendYamlData, + isLoading: isLoadingYaml, + isFetching: isFetchingYaml, + } = useGetBackendYamlQuery( + { + projectName: paramProjectName, + backendName: paramBackendName, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.settings'), + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + text: t('backend.edit_backend'), + href: ROUTES.PROJECT.BACKEND.ADD.FORMAT(paramProjectName), + }, + ]); + + const onCancelHandler = () => { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + }; + + // const onSubmitHandler = async (backend: TBackendConfig): Promise => { + // const request = updateProject({ + // projectName: paramProjectName, + // config: prepareBackendConfigForApi(backend), + // }).unwrap(); + // + // request + // .then(() => { + // pushNotification({ + // type: 'success', + // content: t('backend.edit.success_notification'), + // }); + // + // navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + // }) + // .catch(console.log); + // + // return request; + // }; + + const onSubmitYaml = async (backend: IBackendConfigYaml): Promise => { + const request = updateBackendYamlConfig({ + projectName: paramProjectName, + backend, + }).unwrap(); + + request + .then(() => { + pushNotification({ + type: 'success', + content: t('backend.edit.success_notification'), + 
}); + }) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }; + + const onSubmitYamlHandler = async (backend: IBackendConfigYaml): Promise => { + const request = onSubmitYaml(backend); + request.then(() => navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName))); + return request; + }; + + const onApplyYamlHandler = async (backend: IBackendConfigYaml): Promise => onSubmitYaml(backend); + + if (isLoadingYaml || isFetchingYaml) + return ( + + + + ); + + return ( + {t('backend.edit_backend')}}> + {/*{data && (*/} + {/* */} + {/*)}*/} + + {backendYamlData && ( + + )} + + ); +}; diff --git a/frontend/src/pages/Project/Backends/Table/constants.tsx b/frontend/src/pages/Project/Backends/Table/constants.tsx new file mode 100644 index 0000000000..0d9fb99128 --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/constants.tsx @@ -0,0 +1,47 @@ +import React from 'react'; + +export const BACKENDS_HELP_SKY = { + header:

Backends

, + body: ( + <> +

+ To use dstack with cloud providers, you have to configure backends. +

+

Marketplace

+

+ By default, dstack Sky includes a preset of backends that let you access compute from the{' '} + dstack marketplace and pay through your dstack Sky user billing. +

+

Your own cloud accounts

+

+ You can also configure custom backends to use your own cloud providers, either instead of or in addition to the + default ones. +

+

+ See the{' '} + + documentation + {' '} + for the list of supported backends. +

+ + ), +}; + +export const BACKENDS_HELP_ENTERPRISE = { + header:

Backends

, + body: ( + <> +

+ To use dstack with cloud providers, you have to configure backends. +

+

+ See the{' '} + + documentation + {' '} + for the list of supported backends. +

+ + ), +}; diff --git a/frontend/src/pages/Project/Backends/Table/hooks/index.ts b/frontend/src/pages/Project/Backends/Table/hooks/index.ts new file mode 100644 index 0000000000..bc3180f26e --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/hooks/index.ts @@ -0,0 +1 @@ +export * from './useColumnsDefinitions'; diff --git a/frontend/src/pages/Project/Backends/Table/hooks/useColumnsDefinitions.tsx b/frontend/src/pages/Project/Backends/Table/hooks/useColumnsDefinitions.tsx new file mode 100644 index 0000000000..78261ebd4e --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/hooks/useColumnsDefinitions.tsx @@ -0,0 +1,63 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button } from 'components'; +import { ButtonWithConfirmation } from 'components/ButtonWithConfirmation'; + +import styles from '../styles.module.scss'; + +type hookArgs = { + loading?: boolean; + onDeleteClick?: (backend: IProjectBackend) => void; + onEditClick?: (backend: IProjectBackend) => void; +}; + +export const useColumnsDefinitions = ({ loading, onDeleteClick, onEditClick }: hookArgs) => { + const { t } = useTranslation(); + + const columns = useMemo(() => { + return [ + { + id: 'type', + header: t('projects.edit.backend_type'), + cell: (backend: IProjectBackend) => backend.config.type, + }, + + { + id: 'actions', + header: '', + + cell: (backend: IProjectBackend) => + backend.config.type !== 'dstack' && ( +
+
+ {onEditClick && ( +
+
+ ), + }, + ]; + }, [loading, onEditClick, onDeleteClick]); + + return { columns } as const; +}; diff --git a/frontend/src/pages/Project/Backends/Table/index.tsx b/frontend/src/pages/Project/Backends/Table/index.tsx new file mode 100644 index 0000000000..3ccfa9d9c5 --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/index.tsx @@ -0,0 +1,99 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ButtonWithConfirmation, Header, InfoLink, ListEmptyMessage, SpaceBetween, Table } from 'components'; + +import { useCollection, useHelpPanel } from 'hooks'; + +import { BACKENDS_HELP_ENTERPRISE, BACKENDS_HELP_SKY } from './constants'; +import { useColumnsDefinitions } from './hooks'; + +import { IProps } from './types'; +const INFO = process.env.UI_VERSION === 'enterprise' ? BACKENDS_HELP_ENTERPRISE : BACKENDS_HELP_SKY; + +export const BackendsTable: React.FC = ({ + backends, + editBackend, + deleteBackends, + onClickAddBackend, + isDisabledDelete, +}) => { + const { t } = useTranslation(); + const [openHelpPanel] = useHelpPanel(); + + const renderEmptyMessage = (): React.ReactNode => { + return ( + + {onClickAddBackend && } + + ); + }; + + const { items, collectionProps } = useCollection(backends ?? [], { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderEmptyMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const isDisabledDeleteSelected = !selectedItems?.length || isDisabledDelete; + + const deleteSelectedBackends = () => { + if (!selectedItems?.length || !deleteBackends) return; + + deleteBackends(selectedItems); + }; + + const { columns } = useColumnsDefinitions({ + ...(editBackend ? { onEditClick: (backend) => editBackend(backend) } : {}), + }); + + const renderCounter = () => { + if (!backends?.length) return ''; + + return `(${backends.length})`; + }; + + return ( +
openHelpPanel(INFO)} />} + actions={ + + {deleteBackends && ( + + {t('common.delete')} + + )} + + {onClickAddBackend && ( + + )} + + } + > + {t('backend.page_title_other')} + + } + /> + ); +}; diff --git a/frontend/src/pages/Project/Backends/Table/styles.module.scss b/frontend/src/pages/Project/Backends/Table/styles.module.scss new file mode 100644 index 0000000000..4c83d9f2c7 --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/styles.module.scss @@ -0,0 +1,16 @@ +.cell { + display: flex; + align-items: center; +} + +.contextMenu { + margin-left: auto; + padding-left: 20px; +} + +.ellipsisCell { + overflow: hidden; + white-space: nowrap; + text-overflow: ellipsis; + max-width: 450px; +} diff --git a/frontend/src/pages/Project/Backends/Table/types.ts b/frontend/src/pages/Project/Backends/Table/types.ts new file mode 100644 index 0000000000..5bb6d3877f --- /dev/null +++ b/frontend/src/pages/Project/Backends/Table/types.ts @@ -0,0 +1,7 @@ +export interface IProps { + backends: IProjectBackend[]; + onClickAddBackend?: () => void; + deleteBackends?: (backends: readonly IProjectBackend[] | IProjectBackend[]) => void; + editBackend?: (backend: IProjectBackend) => void; + isDisabledDelete?: boolean; +} diff --git a/frontend/src/pages/Project/Backends/YAMLForm/constants.tsx b/frontend/src/pages/Project/Backends/YAMLForm/constants.tsx new file mode 100644 index 0000000000..a139f137d5 --- /dev/null +++ b/frontend/src/pages/Project/Backends/YAMLForm/constants.tsx @@ -0,0 +1,78 @@ +import React from 'react'; + +export const CONFIG_YAML_HELP_SKY = { + header:

Backend config

, + body: ( + <> +

+ The backend config is defined in the YAML format. It specifies the backend's type and settings,{' '} + such as creds, regions, and so on. +

+

Marketplace

+

+ If you set creds's type to dstack, you'll get compute from{' '} + dstack's marketplace and will pay for it via your dstack Sky user billing. Example: +

+

+

+                    type: aws{'\n'}
+                    creds:{'\n'}
+                    {'  '}type: dstack{'\n'}
+                
+

+

+ You can see all supported backend types in the{' '} + + documentation + + .

+

Your own cloud account

+

+ If you want to use your own cloud account, configure creds and other settings according to the{' '} + + documentation + + . Example: +

+

+

+                    type: aws{'\n'}
+                    creds:{'\n'}
+                    {'  '}type: access_key{'\n'}
+                    {'  '}access_key: AIZKISCVKUK{'\n'}
+                    {'  '}secret_key: QSbmpqJIUBn1
+                
+

+ + ), +}; + +export const CONFIG_YAML_HELP_ENTERPRISE = { + header:

Backend config

, + body: ( + <> +

+ The backend config is defined in the YAML format. It specifies the backend's type and settings, + such as creds, regions, and so on. +

+

Example:

+

+

+                    type: aws{'\n'}
+                    creds:{'\n'}
+                    {'  '}type: access_key{'\n'}
+                    {'  '}access_key: AIZKISCVKUK{'\n'}
+                    {'  '}secret_key: QSbmpqJIUBn1
+                
+

+

+ Each backend type may support different properties. See the{' '} + + documentation + {' '} + for more examples.

+ + ), +}; diff --git a/frontend/src/pages/Project/Backends/YAMLForm/index.tsx b/frontend/src/pages/Project/Backends/YAMLForm/index.tsx new file mode 100644 index 0000000000..e5ebc10a70 --- /dev/null +++ b/frontend/src/pages/Project/Backends/YAMLForm/index.tsx @@ -0,0 +1,107 @@ +import React, { useState } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +import { Button, FormCodeEditor, FormUI, InfoLink, SpaceBetween } from 'components'; + +import { useHelpPanel, useNotifications } from 'hooks'; +import { isResponseServerError, isResponseServerFormFieldError } from 'libs'; + +import { CONFIG_YAML_HELP_ENTERPRISE, CONFIG_YAML_HELP_SKY } from './constants'; + +import { FieldPath } from 'react-hook-form/dist/types/path'; + +const INFO = process.env.UI_VERSION === 'enterprise' ? CONFIG_YAML_HELP_ENTERPRISE : CONFIG_YAML_HELP_SKY; + +export interface IProps { + initialValues?: IBackendConfigYaml; + loading?: boolean; + onCancel: () => void; + onApply?: (backend: IBackendConfigYaml) => Promise; + onSubmit: (backend: IBackendConfigYaml) => Promise; +} + +export const YAMLForm: React.FC = ({ + initialValues, + onCancel, + loading, + onSubmit: onSubmitProp, + onApply: onApplyProp, +}) => { + const { t } = useTranslation(); + const [openHelpPanel] = useHelpPanel(); + const [pushNotification] = useNotifications(); + const [isApplying, setIsApplying] = useState(false); + + const { handleSubmit, control, setError, clearErrors } = useForm({ + defaultValues: initialValues, + }); + + const onSubmit = (data: IBackendConfigYaml) => { + clearErrors(); + + const submitCallback = isApplying && onApplyProp ? 
onApplyProp : onSubmitProp; + + submitCallback(data) + .finally(() => setIsApplying(false)) + .catch((errorResponse) => { + const errorRequestData = errorResponse?.data; + + if (isResponseServerError(errorRequestData)) { + errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + setError(error.loc.join('.') as FieldPath, { + type: 'custom', + message: error.msg, + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: errorResponse?.error ?? errorResponse }), + }); + } + }); + }; + + return ( +
+ + + + {onApplyProp && ( + + )} + + + + } + > + openHelpPanel(INFO)} />} + control={control} + label={t('projects.edit.backend_config')} + description={t('projects.edit.backend_config_description')} + name="config_yaml" + language="yaml" + loading={loading} + editorContentHeight={600} + /> + + + ); +}; diff --git a/frontend/src/pages/Project/Backends/hooks/index.ts b/frontend/src/pages/Project/Backends/hooks/index.ts new file mode 100644 index 0000000000..368919a01d --- /dev/null +++ b/frontend/src/pages/Project/Backends/hooks/index.ts @@ -0,0 +1 @@ +export { useBackendsTable } from './useBackendsTable'; diff --git a/frontend/src/pages/Project/Backends/hooks/useBackendsTable.ts b/frontend/src/pages/Project/Backends/hooks/useBackendsTable.ts new file mode 100644 index 0000000000..c43b2d12dc --- /dev/null +++ b/frontend/src/pages/Project/Backends/hooks/useBackendsTable.ts @@ -0,0 +1,38 @@ +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { useDeleteProjectBackendMutation } from 'services/backend'; + +export const useBackendsTable = (projectName: IProject['project_name'], backends: IProject['backends']) => { + const { t } = useTranslation(); + const navigate = useNavigate(); + const [deleteBackendRequest, { isLoading: isDeleting }] = useDeleteProjectBackendMutation(); + const [pushNotification] = useNotifications(); + + const editBackend = (backend: IProjectBackend) => { + navigate(ROUTES.PROJECT.BACKEND.EDIT.FORMAT(projectName, backend.name)); + }; + + const deleteBackend = (backends: readonly IProjectBackend[] | IProjectBackend[]) => { + deleteBackendRequest({ + projectName, + backends_names: backends.map((backend) => backend.name), + }) + .unwrap() + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + 
}; + + const addBackend = () => { + navigate(ROUTES.PROJECT.BACKEND.ADD.FORMAT(projectName)); + }; + + return { data: backends, isDeleting, editBackend, deleteBackend, addBackend } as const; +}; diff --git a/frontend/src/pages/Project/Backends/index.tsx b/frontend/src/pages/Project/Backends/index.tsx new file mode 100644 index 0000000000..bb0c50edd6 --- /dev/null +++ b/frontend/src/pages/Project/Backends/index.tsx @@ -0,0 +1,7 @@ +import React from 'react'; +export { BackendAdd } from './Add'; +export { BackendEdit } from './Edit'; + +export const Backends: React.FC = () => { + return null; +}; diff --git a/frontend/src/pages/Project/CreateWizard/constants.ts b/frontend/src/pages/Project/CreateWizard/constants.ts new file mode 100644 index 0000000000..415fa8c204 --- /dev/null +++ b/frontend/src/pages/Project/CreateWizard/constants.ts @@ -0,0 +1,16 @@ +export const projectTypeOptions = [ + { + label: 'Bring your own cloud', + description: 'Use compute from your own cloud account(s) by providing your credentials.', + billing_notes: + "You pay for compute and storage usage directly to the configured cloud provider(s) through their billing. dstack won't bill or charge you.", + value: 'own_cloud', + }, + { + label: 'GPU marketplace', + description: 'Use compute from multiple cloud providers without needing your own cloud account(s).', + billing_notes: + 'You pay for compute and storage usage directly to dstack. 
You can top up your balance in your dstack user settings.', + value: 'gpu_marketplace', + }, +]; diff --git a/frontend/src/pages/Project/CreateWizard/index.tsx b/frontend/src/pages/Project/CreateWizard/index.tsx new file mode 100644 index 0000000000..d2ffa2aee9 --- /dev/null +++ b/frontend/src/pages/Project/CreateWizard/index.tsx @@ -0,0 +1,502 @@ +import React, { useEffect, useMemo, useState } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; +import { isNil } from 'lodash'; +import * as yup from 'yup'; +import { WizardProps } from '@cloudscape-design/components'; +import { TilesProps } from '@cloudscape-design/components/tiles'; + +import { + Alert, + Cards, + Container, + FormCards, + FormField, + FormInput, + FormTiles, + FormToggle, + InfoLink, + KeyValuePairs, + SpaceBetween, + Wizard, +} from 'components'; + +import { useBreadcrumbs, useConfirmationDialog, useHelpPanel, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { useGetBackendBaseTypesQuery, useGetBackendTypesQuery } from 'services/backend'; +import { useApplyFleetMutation } from 'services/fleet'; +import { useCreateWizardProjectMutation } from 'services/project'; + +import { FleetFormFields } from '../../Fleets/Add/FleetFormFields'; +import { + fleetFormDefaultValues, + getMaxInstancesValidator, + getMinInstancesValidator, + idleDurationValidator, +} from '../../Fleets/Add/FleetFormFields/constants'; +import { DEFAULT_FLEET_INFO } from '../constants'; +import { useYupValidationResolver } from '../hooks/useYupValidationResolver'; +import { projectTypeOptions } from './constants'; + +import { IProjectWizardForm } from './types'; + +const requiredFieldError = 'This is required field'; +const minOneLengthError = 'Need to choose one or more'; +const namesFieldError = 'Only latin characters, dashes, underscores, and digits'; + +const 
fleetStepIndex = 2; + +const projectValidationSchema = yup.object({ + project_name: yup + .string() + .required(requiredFieldError) + .matches(/^[a-zA-Z0-9-_]+$/, namesFieldError), + project_type: yup.string().required(requiredFieldError), + backends: yup.array().when('project_type', { + is: 'gpu_marketplace', + then: yup.array().min(1, minOneLengthError).required(requiredFieldError), + }), + fleet: yup.object().shape({ + min_instances: yup.number().when('enable_default', { + is: true, + then: getMinInstancesValidator('max_instances'), + }), + max_instances: yup.number().when('enable_default', { + is: true, + then: getMaxInstancesValidator('min_instances'), + }), + idle_duration: yup.string().when('enable_default', { + is: true, + then: idleDurationValidator, + }), + spot_policy: yup.string().required(requiredFieldError), + }), +}); + +export const CreateProjectWizard: React.FC = () => { + const { t } = useTranslation(); + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + const [activeStepIndex, setActiveStepIndex] = useState(0); + const [openHelpPanel] = useHelpPanel(); + const [createProject, { isLoading }] = useCreateWizardProjectMutation(); + const [applyFleet, { isLoading: isApplyingFleet }] = useApplyFleetMutation(); + const { data: backendBaseTypesData, isLoading: isBackendBaseTypesLoading } = useGetBackendBaseTypesQuery(); + const { data: backendTypesData, isLoading: isBackendTypesLoading } = useGetBackendTypesQuery(); + + const [openConfirmationDialog] = useConfirmationDialog(); + + const loading = isLoading || isApplyingFleet; + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: t('common.create_wit_text', { text: t('navigation.project') }), + href: ROUTES.PROJECT.ADD, + }, + ]); + + const backendBaseOptions = useMemo(() => { + if (!backendBaseTypesData) { + return []; + } + + return backendBaseTypesData.map((b: TProjectBackend) => ({ + label: b, + value: b, + })); + 
}, [backendBaseTypesData]); + + const backendOptions = useMemo(() => { + if (!backendTypesData) { + return []; + } + + return backendTypesData.map((b: TProjectBackend) => ({ + label: b, + value: b, + })); + }, [backendTypesData]); + + const resolver = useYupValidationResolver(projectValidationSchema); + + const formMethods = useForm({ + resolver, + defaultValues: { + project_type: 'own_cloud', + fleet: { + ...fleetFormDefaultValues, + enable_default: true, + }, + }, + }); + + const { handleSubmit, control, watch, trigger, formState, getValues, setValue, setError } = formMethods; + const formValues = watch(); + const selectedProjectTypeOption = projectTypeOptions.find(({ value }) => value === formValues['project_type']); + + const onCancelHandler = () => { + navigate(ROUTES.PROJECT.LIST); + }; + + const getFormValuesForServer = (): TCreateWizardProjectParams => { + const { project_name, backends, project_type } = getValues(); + + return { + project_name, + config: { + base_backends: project_type === 'gpu_marketplace' ? (backends ?? []) : [], + }, + }; + }; + + const getFormValuesForFleetApplying = (): IApplyFleetPlanRequestRequest => { + const { + fleet: { min_instances, max_instances, idle_duration, name }, + } = getValues(); + + return { + plan: { + spec: { + configuration: { + ...(name ? { name } : {}), + nodes: { + min: min_instances, + ...(max_instances ? { max: max_instances } : {}), + }, + ...(idle_duration ? { idle_duration } : {}), + }, + profile: {}, + }, + }, + force: false, + }; + }; + + const validateNameAndType = async () => { + try { + const yupValidationResult = await trigger(['project_type', 'project_name']); + + const serverValidationResult = await createProject({ + ...getFormValuesForServer(), + dry: true, + }) + .unwrap() + .then(() => true) + .catch((error) => { + const errorDetail = (error?.data?.detail ?? 
[]) as { msg: string; code: string }[]; + const projectExist = errorDetail.some(({ code }) => code === 'resource_exists'); + + if (projectExist) { + setError('project_name', { type: 'custom', message: 'Project with this name already exists' }); + } + + return false; + }); + + return yupValidationResult && serverValidationResult; + } catch (e) { + console.log(e); + return false; + } + }; + + const validateBackends = async () => { + if (formValues['project_type'] === 'gpu_marketplace') { + return await trigger(['backends']); + } + + return Promise.resolve(true); + }; + + const validateFleet = async () => { + return await trigger(['fleet.enable_default', 'fleet.min_instances', 'fleet.max_instances', 'fleet.idle_duration']); + }; + + const emptyValidator = async () => Promise.resolve(true); + + const onNavigate = ({ + requestedStepIndex, + reason, + }: { + requestedStepIndex: number; + reason: WizardProps.NavigationReason; + }) => { + const stepValidators = [validateNameAndType, validateBackends, validateFleet, emptyValidator]; + + if (reason === 'next') { + stepValidators[activeStepIndex]?.().then((isValid) => { + if (isValid) { + if (activeStepIndex === fleetStepIndex && formValues?.['fleet']['min_instances'] > 0) { + openConfirmationDialog({ + title: 'Are sure want to set min instances above than 0?', + content: null, + onConfirm: () => setActiveStepIndex(requestedStepIndex), + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + } + }); + } else { + setActiveStepIndex(requestedStepIndex); + } + }; + + const onNavigateHandler: WizardProps['onNavigate'] = ({ detail: { requestedStepIndex, reason } }) => { + onNavigate({ requestedStepIndex, reason }); + }; + + const onChangeProjectType = (backendType: string) => { + if (backendType === 'gpu_marketplace') { + setValue( + 'backends', + backendBaseOptions.map((b: { value: string }) => b.value), + ); + } else { + trigger(['backends']).catch(console.log); + } + }; + + const onChangeProjectTypeHandler: 
TilesProps['onChange'] = ({ detail: { value } }) => { + onChangeProjectType(value); + }; + + useEffect(() => { + if (backendBaseOptions?.length) { + onChangeProjectType(formValues.project_type); + } + }, [backendBaseOptions]); + + const onSubmitWizard = async () => { + const isValid = await trigger(); + + const { fleet } = getValues(); + + if (!isValid) { + return; + } + + const request = createProject(getFormValuesForServer()).unwrap(); + + request + .then(async (data) => { + if (fleet.enable_default) { + await applyFleet({ + projectName: data.project_name, + ...getFormValuesForFleetApplying(), + }).unwrap(); + } + + pushNotification({ + type: 'success', + content: t('projects.create.success_notification'), + }); + + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(data.project_name)); + }) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + }; + + const onSubmit = () => { + if (activeStepIndex < 3) { + onNavigate({ requestedStepIndex: activeStepIndex + 1, reason: 'next' }); + } else { + onSubmitWizard().catch(console.log); + } + }; + + const getDefaultFleetSummary = () => { + const summaryFields: Array = [ + 'name', + 'min_instances', + 'max_instances', + 'idle_duration', + 'spot_policy', + ]; + + const result: string[] = []; + + summaryFields.forEach((fieldName) => { + if (!isNil(formValues?.fleet?.[fieldName])) { + result.push(`${t(`fleets.edit.${fieldName}`)}: ${formValues['fleet'][fieldName]}`); + } + }); + + return result.join(', '); + }; + + return ( +
+ `Step ${stepNumber}`, + navigationAriaLabel: 'Steps', + cancelButton: t('common.cancel'), + previousButton: t('common.previous'), + nextButton: t('common.next'), + optional: 'optional', + }} + onCancel={onCancelHandler} + submitButtonText={t('projects.wizard.submit')} + steps={[ + { + title: 'Settings', + content: ( + + + + +
+ + + + + + {selectedProjectTypeOption?.billing_notes && ( + + {selectedProjectTypeOption.billing_notes} + + )} + +
+
+
+ ), + }, + { + title: 'Backends', + content: ( + + + +
+ + {formValues['project_type'] === 'gpu_marketplace' && ( + item.label, + }} + cardsPerRow={[{ cards: 1 }, { minWidth: 400, cards: 2 }, { minWidth: 800, cards: 3 }]} + /> + )} + + {formValues['project_type'] === 'own_cloud' && ( + item.label, + }} + cardsPerRow={[{ cards: 1 }, { minWidth: 400, cards: 2 }, { minWidth: 800, cards: 3 }]} + /> + )} +
+ ), + }, + { + title: 'Fleets', + content: ( + + + {t('projects.edit.default_fleet')}} + constraintText={t('projects.edit.default_fleet_description')} + toggleInfo={ openHelpPanel(DEFAULT_FLEET_INFO)} />} + control={control} + name="fleet.enable_default" + /> + + {formValues['fleet']['enable_default'] && ( + + control={control} + disabledAllFields={loading} + fieldNamePrefix="fleet" + /> + )} + + + ), + }, + { + title: 'Summary', + content: ( + + + + ), + }, + ]} + /> + + ); +}; diff --git a/frontend/src/pages/Project/CreateWizard/styles.module.scss b/frontend/src/pages/Project/CreateWizard/styles.module.scss new file mode 100644 index 0000000000..95a6f77a08 --- /dev/null +++ b/frontend/src/pages/Project/CreateWizard/styles.module.scss @@ -0,0 +1,7 @@ +.ownCloudInfo { + display: flex; + align-items: center; + justify-content: center; + padding-top: 40px; + padding-bottom: 40px; +} diff --git a/frontend/src/pages/Project/CreateWizard/types.ts b/frontend/src/pages/Project/CreateWizard/types.ts new file mode 100644 index 0000000000..62f63d725b --- /dev/null +++ b/frontend/src/pages/Project/CreateWizard/types.ts @@ -0,0 +1,9 @@ +import { FleetFormFields } from 'pages/Fleets/Add/FleetFormFields/type'; + +export interface IProjectWizardForm extends Pick { + project_type: 'gpu_marketplace' | 'own_cloud'; + backends: TBackendType[]; + fleet: FleetFormFields & { + enable_default?: boolean; + }; +} diff --git a/frontend/src/pages/Project/Details/Events/index.tsx b/frontend/src/pages/Project/Details/Events/index.tsx new file mode 100644 index 0000000000..f01186cecc --- /dev/null +++ b/frontend/src/pages/Project/Details/Events/index.tsx @@ -0,0 +1,64 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Button, Container, Header, Loader, SpaceBetween } from 'components'; + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetProjectQuery } 
from 'services/project'; + +import { EventList } from 'pages/Events/List'; + +export const Events: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const navigate = useNavigate(); + const { data, isLoading } = useGetProjectQuery({ name: paramProjectName }); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.events'), + href: ROUTES.PROJECT.DETAILS.EVENTS.FORMAT(paramProjectName), + }, + ]); + + const goToEventsPage = () => { + navigate(ROUTES.EVENTS.LIST + `?within_projects=${data?.project_id}`); + }; + + if (isLoading || !data) + return ( + + + + ); + + return ( + { + return ( +
+ + + } + /> + ); + }} + permanentFilters={{ within_projects: [data.project_id] }} + showFilters={false} + /> + ); +}; diff --git a/frontend/src/pages/Project/Details/Settings/constants.tsx b/frontend/src/pages/Project/Details/Settings/constants.tsx new file mode 100644 index 0000000000..195c684889 --- /dev/null +++ b/frontend/src/pages/Project/Details/Settings/constants.tsx @@ -0,0 +1,44 @@ +import React from 'react'; +import Link from '@cloudscape-design/components/link'; + +export const CLI_INFO = { + header:

CLI

, + body: ( + <> +

+ To use this project with your CLI, add it using the + + dstack project add + {' '} + command. +

+

+ To learn how to install the CLI, refer to the{' '} + + installation + {' '} + guide. +

+ + ), +}; + +export const TEMPLATES_REPO_INFO = { + header:

Templates

, + body: ( + <> +

+ Specify a project-level templates Git repository URL. Templates from this repo are shown on the Launch page in + Runs, and setting it enables the Launch button when templates are available. +

+

If set, project templates override global templates configured on the server.

+

+ See official examples in{' '} + + dstackai/dstack-templates + + . +

+ + ), +}; diff --git a/frontend/src/pages/Project/Details/Settings/index.tsx b/frontend/src/pages/Project/Details/Settings/index.tsx new file mode 100644 index 0000000000..8e61522bbd --- /dev/null +++ b/frontend/src/pages/Project/Details/Settings/index.tsx @@ -0,0 +1,654 @@ +import React, { useCallback, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useDispatch } from 'react-redux'; +import { useLocation, useNavigate, useParams } from 'react-router-dom'; +import { debounce } from 'lodash'; +import { ExpandableSection, Tabs } from '@cloudscape-design/components'; +import { FetchBaseQueryError } from '@reduxjs/toolkit/query'; + +import { + Box, + Button, + ButtonWithConfirmation, + Code, + ConfirmationDialog, + Container, + FormField, + Header, + Hotspot, + InfoLink, + InputCSD, + Loader, + Popover, + SelectCSD, + SpaceBetween, + StatusIndicator, +} from 'components'; +import { HotspotIds } from 'layouts/AppLayout/TutorialPanel/constants'; + +import { useBreadcrumbs, useHelpPanel, useNotifications } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; +import { riseRouterException } from 'libs'; +import { copyToClipboard } from 'libs'; +import { ROUTES } from 'routes'; +import { useGetProjectQuery, useUpdateProjectMembersMutation, useUpdateProjectMutation } from 'services/project'; +import { useGetRunsQuery } from 'services/run'; +import { templateApi } from 'services/templates'; +import { useGetUserDataQuery } from 'services/user'; + +import { useCheckAvailableProjectPermission } from 'pages/Project/hooks/useCheckAvailableProjectPermission'; +import { useConfigProjectCliCommand } from 'pages/Project/hooks/useConfigProjectCliComand'; +import { useDeleteProject } from 'pages/Project/hooks/useDeleteProject'; +import { ProjectMembers } from 'pages/Project/Members'; +import { getProjectRoleByUserName } from 'pages/Project/utils'; + +import { useBackendsTable 
} from '../../Backends/hooks'; +import { BackendsTable } from '../../Backends/Table'; +import { NoFleetProjectAlert } from '../../components/NoFleetProjectAlert'; +import { GatewaysTable } from '../../Gateways'; +import { useGatewaysTable } from '../../Gateways/hooks'; +import { ProjectSecrets } from '../../Secrets'; +import { TEMPLATES_REPO_INFO } from './constants'; + +import styles from './styles.module.scss'; + +type ApiErrorResponse = { detail?: string | { msg?: string } | Array<{ msg?: string }> }; + +const isFetchBaseQueryError = (error: unknown): error is FetchBaseQueryError => + typeof error === 'object' && error !== null && 'status' in error; + +export const ProjectSettings: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const navigate = useNavigate(); + const location = useLocation(); + const paramProjectName = params.projectName ?? ''; + const [isExpandedCliSection, setIsExpandedCliSection] = React.useState(false); + const [configCliCommand, copyCliCommand] = useConfigProjectCliCommand({ projectName: paramProjectName }); + + const { isAvailableDeletingPermission, isProjectManager, isProjectAdmin, isAvailableProjectManaging } = + useCheckAvailableProjectPermission(); + + const [pushNotification] = useNotifications(); + const [openHelpPanel] = useHelpPanel(); + const dispatch = useDispatch(); + const [updateProjectMembers] = useUpdateProjectMembersMutation(); + const [updateProject] = useUpdateProjectMutation(); + const { deleteProject, isDeleting } = useDeleteProject(); + const { data: currentUser } = useGetUserDataQuery({}); + + const projectNames = useMemo(() => [paramProjectName], [paramProjectName]); + + const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames }); + + const { data, isLoading, error } = useGetProjectQuery({ name: paramProjectName }); + + const { data: runsData } = useGetRunsQuery({ + project_name: paramProjectName, + limit: 1, + }); + + useEffect(() => { + 
setIsExpandedCliSection(!runsData || runsData.length === 0); + }, [runsData]); + + useEffect(() => { + if (error && 'status' in error && error.status === 404) { + riseRouterException(); + } + }, [error]); + + const currentUserRole = data ? getProjectRoleByUserName(data, currentUser?.username ?? '') : null; + const isProjectMember = currentUserRole !== null; + + const currentOwner = { + label: data?.owner.username, + value: data?.owner.username, + }; + + const visibilityOptions = [ + { label: t('projects.edit.visibility.private') || '', value: 'private' }, + { label: t('projects.edit.visibility.public') || '', value: 'public' }, + ]; + + const [selectedVisibility, setSelectedVisibility] = useState(data?.isPublic ? visibilityOptions[1] : visibilityOptions[0]); + const [templatesRepoValue, setTemplatesRepoValue] = useState(''); + const [templatesRepoError, setTemplatesRepoError] = useState(null); + const [isChangeTemplatesRepoVisible, setIsChangeTemplatesRepoVisible] = useState(false); + const [isResetTemplatesRepoVisible, setIsResetTemplatesRepoVisible] = useState(false); + const changeTemplatesRepoInputWrapperRef = React.useRef(null); + const dangerZoneRef = React.useRef(null); + + useEffect(() => { + setSelectedVisibility(data?.isPublic ? visibilityOptions[1] : visibilityOptions[0]); + }, [data]); + + useEffect(() => { + setTemplatesRepoValue(data?.templates_repo ?? ''); + }, [data?.templates_repo]); + + useEffect(() => { + if (!isChangeTemplatesRepoVisible) { + return; + } + const timer = setTimeout(() => { + changeTemplatesRepoInputWrapperRef.current?.querySelector('input')?.focus(); + }, 10); + return () => clearTimeout(timer); + }, [isChangeTemplatesRepoVisible]); + + const { + data: backendsData, + isDeleting: isDeletingBackend, + addBackend, + deleteBackend, + editBackend, + } = useBackendsTable(paramProjectName, data?.backends ?? 
[]); + + const { data: gatewaysData, isLoading: isLoadingGateways } = useGatewaysTable(paramProjectName); + + const isLoadingPage = isLoading || !data || isLoadingGateways; + + /* Scrolls to the danger zone when the URL hash requests it; the timer is cleared on cleanup, matching the focus effect earlier in this component. */ useEffect(() => { + if (location.hash !== '#danger-zone') { + return; + } + const timer = setTimeout(() => { + dangerZoneRef.current?.scrollIntoView({ behavior: 'smooth', block: 'start' }); + }, 0); + return () => clearTimeout(timer); + }, [location.hash, isLoadingPage]); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + ]); + + /* Sends the new member list for the current project and reports the outcome as a notification. */ const changeMembersHandler = (members: IProjectMember[]) => { + updateProjectMembers({ + project_name: paramProjectName, + members: members.map((m) => ({ project_role: m.project_role, username: m.user.username })), + }) + .unwrap() + .then(() => { + pushNotification({ + type: 'success', + content: t('projects.edit.update_members_success'), + }); + }) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + .catch((error: any) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error?.data?.detail?.msg }), + }); + }); + }; + + /* Debounced wrapper around changeMembersHandler; re-created when the project name changes so an update is never sent with a previous project's name. */ const debouncedMembersHandler = useCallback(debounce(changeMembersHandler, 1000), [paramProjectName]); + + /* Updates the project's public/private flag and reports the outcome as a notification. */ const changeVisibilityHandler = (is_public: boolean) => { + updateProject({ + project_name: paramProjectName, + is_public: is_public, + }) + .unwrap() + .then(() => { + pushNotification({ + type: 'success', + content: t('projects.edit.update_visibility_success'), + }); + }) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + .catch((error: any) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error?.data?.detail?.msg }), + }); + }); + }; + + /* Extracts a readable message from an API error whose detail may be a string, an object with msg, or an array of such objects. */ const getApiErrorMessage = (error: unknown): string => { + const detail = isFetchBaseQueryError(error) ? (error.data as ApiErrorResponse | undefined)?.detail : undefined; + if (Array.isArray(detail)) { + return detail[0]?.msg ?? 
t('common.server_error', { error: 'Unknown error' }); + } + if (typeof detail === 'string') { + return detail; + } + if (detail?.msg) { + return detail.msg; + } + return t('common.server_error', { error: 'Unknown error' }); + }; + + const updateTemplatesRepoHandler = async (): Promise => { + const templates_repo = templatesRepoValue.trim() === '' ? null : templatesRepoValue.trim(); + try { + await updateProject({ + project_name: paramProjectName, + templates_repo, + }).unwrap(); + dispatch(templateApi.util.invalidateTags(['Templates'])); + pushNotification({ + type: 'success', + content: t('projects.edit.update_templates_repo_success'), + }); + return true; + } catch (error: unknown) { + const errorMessage = getApiErrorMessage(error); + setTemplatesRepoError(errorMessage); + return false; + } + }; + + const openChangeTemplatesRepoDialog = () => { + setTemplatesRepoValue(data?.templates_repo ?? ''); + setTemplatesRepoError(null); + setIsChangeTemplatesRepoVisible(true); + }; + + const closeChangeTemplatesRepoDialog = () => { + setTemplatesRepoError(null); + setIsChangeTemplatesRepoVisible(false); + }; + + const openResetTemplatesRepoDialog = () => { + setIsResetTemplatesRepoVisible(true); + }; + + const closeResetTemplatesRepoDialog = () => { + setIsResetTemplatesRepoVisible(false); + }; + + const confirmChangeTemplatesRepo = async () => { + if (templatesRepoValue.trim() === '') { + setTemplatesRepoError(t('projects.edit.templates_repo_required')); + return; + } + const isUpdated = await updateTemplatesRepoHandler(); + if (isUpdated) { + closeChangeTemplatesRepoDialog(); + } + }; + + const confirmResetTemplatesRepo = () => { + setTemplatesRepoValue(''); + updateProject({ + project_name: paramProjectName, + reset_templates_repo: true, + }) + .unwrap() + .then(() => { + dispatch(templateApi.util.invalidateTags(['Templates'])); + pushNotification({ + type: 'success', + content: t('projects.edit.update_templates_repo_success'), + }); + }) + // eslint-disable-next-line 
@typescript-eslint/no-explicit-any + .catch((error: any) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error?.data?.detail?.msg }), + }); + }); + closeResetTemplatesRepoDialog(); + }; + + const isDisabledButtons = useMemo(() => { + return isDeleting || !data || !isAvailableDeletingPermission(data); + }, [data, isDeleting, isAvailableDeletingPermission]); + + const deleteProjectHandler = () => { + if (!data) return; + + deleteProject(data) + .then(() => navigate(ROUTES.PROJECT.LIST)) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + .catch((error: any) => { + console.error('Delete project failed:', error); + }); + }; + + const projectDontHasFleet = !projectHavingFleetMap?.[paramProjectName]; + + if (isLoadingPage) + return ( + + + + ); + + return ( + <> + {data && backendsData && gatewaysData && ( + + + + {isProjectMember && ( + setIsExpandedCliSection(detail.expanded)} + // headerInfo={ openHelpPanel(CLI_INFO)} />} + > + + To use dstack with this project, run the following command. + +
+ + {configCliCommand} + +
+ {t('common.copied')}} + > +
+
+
+ + + + + To use dstack, install the CLI on your local machine. + + +
+ uv tool install dstack -U + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + { + label: 'pip', + id: 'pip', + content: ( + <> +
+ pip install dstack -U + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + ]} + /> +
+
+
+
+ )} + + + + + + + + + +
+ {t('common.danger_zone')}
}> + +
+ {isAvailableProjectManaging && ( + <> + + {t('projects.edit.delete_this_project')} + + +
+ + {t('common.delete')} + +
+ + )} + + {isAvailableProjectManaging && ( + <> + + {t('projects.edit.project_visibility_settings')} + + +
+ + changeVisibilityHandler(selectedVisibility.value === 'public') + } + confirmTitle={t('projects.edit.update_visibility_confirm_title')} + confirmButtonLabel={t('projects.edit.change_visibility')} + confirmContent={ + + + {t('projects.edit.update_visibility_confirm_message')} + +
+ + setSelectedVisibility( + event.detail.selectedOption as { + label: string; + value: string; + }, + ) + } + expandToViewport={true} + filteringType="auto" + /> +
+
+ } + > + {t('projects.edit.change_visibility')} +
+
+ + )} + + + {t('projects.edit.transfer_ownership')} + + +
+
+ +
+
+ + {isAvailableProjectManaging && ( + <> +
+ + {t('projects.edit.override_project_templates')} + + openHelpPanel(TEMPLATES_REPO_INFO)} /> +
+ +
+ {data.templates_repo && ( + + )} + + + + +
+ + )} +
+
+ + + + )} + + + + {t('projects.edit.change_templates_repo_message')} + +
+ + { + setTemplatesRepoValue(detail.value); + if (templatesRepoError) { + setTemplatesRepoError(null); + } + }} + onKeyDown={({ detail }) => { + if (detail.key === 'Enter') { + void confirmChangeTemplatesRepo(); + } + }} + placeholder={t('projects.edit.templates_repo_placeholder')} + /> + +
+ + } + /> + + {t('projects.edit.reset_templates_repo_message')}} + /> + + ); +}; diff --git a/frontend/src/pages/Project/Details/Settings/styles.module.scss b/frontend/src/pages/Project/Details/Settings/styles.module.scss new file mode 100644 index 0000000000..e4c4e127c8 --- /dev/null +++ b/frontend/src/pages/Project/Details/Settings/styles.module.scss @@ -0,0 +1,44 @@ +.dangerSectionGrid { + display: grid; + gap: 20px 40px; + grid-template-columns: minmax(auto, 300px) 1fr; +} + +.dangerSectionField { + width: 300px; +} + +.templatesRepoRow { + display: flex; + align-items: center; + gap: 12px; +} + +.templatesRepoTitle { + display: inline-flex; + align-items: center; + gap: 8px; +} + +.templatesRepoInput { + width: 300px; + max-width: 100%; +} + +.templatesRepoActions { + flex-shrink: 0; +} + +.codeWrapper { + position: relative; + + .code { + padding: 16px 12px; + } + + .copy { + position: absolute; + top: 10px; + right: 8px; + } +} diff --git a/frontend/src/pages/Project/Details/index.tsx b/frontend/src/pages/Project/Details/index.tsx new file mode 100644 index 0000000000..f667319eb2 --- /dev/null +++ b/frontend/src/pages/Project/Details/index.tsx @@ -0,0 +1,49 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useMatch, useParams } from 'react-router-dom'; + +import { ContentLayout, DetailsHeader, Tabs } from 'components'; + +import { ROUTES } from 'routes'; + +import styles from './styles.module.scss'; + +export const ProjectDetails: React.FC = () => { + const params = useParams(); + const paramProjectName = params.projectName ?? 
''; + const { t } = useTranslation(); + + const matchSettings = useMatch(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + const matchEvents = useMatch(ROUTES.PROJECT.DETAILS.EVENTS.FORMAT(paramProjectName)); + + const tabs: { + label: string; + id: string; + href: string; + }[] = [ + { + label: t('projects.settings'), + id: 'settings', + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + label: t('projects.events'), + id: 'events', + href: ROUTES.PROJECT.DETAILS.EVENTS.FORMAT(paramProjectName), + }, + ].filter(Boolean); + + const showTabs = useMemo(() => { + return Boolean(matchSettings) || Boolean(matchEvents); + }, [matchSettings, matchEvents]); + + return ( +
+ }> + {showTabs && } + + + +
+ ); +}; diff --git a/frontend/src/pages/Project/Details/styles.module.scss b/frontend/src/pages/Project/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Project/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Project/Form/index.tsx b/frontend/src/pages/Project/Form/index.tsx new file mode 100644 index 0000000000..c95f7f98fc --- /dev/null +++ b/frontend/src/pages/Project/Form/index.tsx @@ -0,0 +1,105 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +import { Button, Container, FormCheckbox, FormInput, FormUI, Header, SpaceBetween } from 'components'; + +import { useNotifications } from 'hooks'; +import { isResponseServerError, isResponseServerFormFieldError } from 'libs'; + +import { IProps } from './types'; +import { FieldPath } from 'react-hook-form/dist/types/path'; + +export const ProjectForm: React.FC = ({ initialValues, onCancel, loading, onSubmit: onSubmitProp }) => { + const { t } = useTranslation(); + const [pushNotification] = useNotifications(); + + const formMethods = useForm({ + defaultValues: { + isPublic: false, + ...initialValues, + }, + }); + + const { handleSubmit, control, setError, clearErrors } = formMethods; + + const onSubmit = (data: IProject) => { + clearErrors(); + + // Transform frontend camelCase to backend snake_case + const backendData = { + project_name: data.project_name, + is_public: data.isPublic, + }; + + onSubmitProp(backendData as unknown as IProject).catch((errorResponse) => { + const errorRequestData = errorResponse?.data; + + if (isResponseServerError(errorRequestData)) { + 
/* Field-level errors are attached to the form field addressed by error.loc; every other error is shown as a notification. */ errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + setError(error.loc.join('.') as FieldPath, { type: 'custom', message: error.msg }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + /* NOTE(review): errorResponse may be a non-string object when it has no `error` field; the gateway forms pass getServerError(errorResponse) here instead. Confirm the rendered text. */ pushNotification({ + type: 'error', + content: t('common.server_error', { error: errorResponse?.error ?? errorResponse }), + }); + } + }); + }; + + return ( +
+ + + + + + } + > + + {t('projects.edit.general')}}> + + + + + + + + + + ); +}; diff --git a/frontend/src/pages/Project/Form/types.ts b/frontend/src/pages/Project/Form/types.ts new file mode 100644 index 0000000000..2f821d4bfc --- /dev/null +++ b/frontend/src/pages/Project/Form/types.ts @@ -0,0 +1,14 @@ +import { FleetFormFields } from 'pages/Fleets/Add/FleetFormFields/type'; + +export interface IProps { + initialValues?: Partial; + loading?: boolean; + onCancel: () => void; + onSubmit: (user: IProject) => Promise; +} + +export interface IProjectForm extends IProjectCreateRequestParams { + fleet: FleetFormFields & { + enable_default?: boolean; + }; +} diff --git a/frontend/src/pages/Project/Gateways/Add/index.tsx b/frontend/src/pages/Project/Gateways/Add/index.tsx new file mode 100644 index 0000000000..733853dde3 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Add/index.tsx @@ -0,0 +1,194 @@ +import React, { useEffect, useState } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import { get as _get } from 'lodash'; + +import { Button, Container, FormSelect, FormSelectOptions, FormUI, Header, SpaceBetween, Spinner } from 'components'; + +import { useBreadcrumbs, useNotifications } from 'hooks'; +import { getServerError, isResponseServerError, isResponseServerFormFieldError } from 'libs'; +import { ROUTES } from 'routes'; +import { useCreateProjectGatewayMutation } from 'services/gateway'; +import { useGetProjectQuery } from 'services/project'; + +import { FieldPath } from 'react-hook-form/dist/types/path'; + +import styles from './styles.module.scss'; + +const FIELD_NAMES: Record = { + BACKEND: 'backend_type', + REGION: 'region', +}; + +export const AddGateway: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? 
''; + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + const [pushPermanentNotification] = useNotifications({ temporary: false }); + const [regionOptions, setRegionOptions] = useState([]); + + const { data, isLoading: isLoadingBackends } = useGetProjectQuery({ name: paramProjectName }); + + const [createGateway, { isLoading: isCreating }] = useCreateProjectGatewayMutation(); + + const { handleSubmit, control, watch, setValue, setError } = useForm(); + + const backendFormValue = watch(FIELD_NAMES.BACKEND); + + const isDisabledFields = isCreating || isLoadingBackends; + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.settings'), + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + text: t('gateway.add_gateway'), + href: ROUTES.PROJECT.GATEWAY.ADD.FORMAT(paramProjectName), + }, + ]); + + /* Every project backend except 'dstack' can host a gateway. */ const backendOptions: FormSelectOptions = + data?.backends.filter((b) => b.name !== 'dstack').map((i) => ({ label: i.name, value: i.name })) ?? []; + + /* Rebuilds the region options whenever the selected backend changes and pre-selects the first region. The form value is set here, outside the state setter, so that no side effect runs inside a state updater. */ useEffect(() => { + const backend = data && backendFormValue ? data.backends.find((b) => b.name === backendFormValue) : undefined; + + if (!backend) { + setRegionOptions([]); + return; + } + + /* Region names are read from `regions`, falling back to `locations`. */ const regions = _get(backend.config, 'regions', _get(backend.config, 'locations', [])); + + if (regions?.[0]) { + setValue(FIELD_NAMES.REGION, regions[0]); + } + + setRegionOptions(regions?.map((region) => ({ label: region, value: region })) ?? 
[]); + }, [backendFormValue]); + + const onCancel = () => { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + }; + + /* Creates the gateway, then opens its edit page; field-level server errors are attached to the form, all others are shown as notifications. */ const onSubmit = (gateway: TCreateGatewayParams) => { + pushPermanentNotification({ + type: 'info', + content: t('gateway.create.creating_notification'), + }); + + createGateway({ + projectName: paramProjectName, + gateway, + }) + .unwrap() + .then((response) => { + pushNotification({ + type: 'success', + content: t('gateway.create.success_notification'), + }); + + navigate(ROUTES.PROJECT.GATEWAY.EDIT.FORMAT(paramProjectName, response.name)); + }) + .catch((errorResponse) => { + const errorRequestData = errorResponse?.data; + + if (isResponseServerError(errorRequestData)) { + errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + setError(error.loc.join('.') as FieldPath, { + type: 'custom', + message: error.msg, + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { + error: getServerError(errorResponse), + }), + }); + } + }); + }; + + const renderSpinner = () => { + if (isLoadingBackends) + return ( +
+ +
+ ); + }; + + return ( +
+ + + + + + } + > + + {t('gateway.add_gateway')}}> + + + + + + + + + + ); +}; diff --git a/frontend/src/pages/Project/Gateways/Add/styles.module.scss b/frontend/src/pages/Project/Gateways/Add/styles.module.scss new file mode 100644 index 0000000000..9972fad213 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Add/styles.module.scss @@ -0,0 +1,7 @@ +.fieldSpinner { + display: flex; + align-items: center; + justify-content: center; + height: 34px; + width: 34px; +} diff --git a/frontend/src/pages/Project/Gateways/Edit/constants.tsx b/frontend/src/pages/Project/Gateways/Edit/constants.tsx new file mode 100644 index 0000000000..aab9c2caee --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Edit/constants.tsx @@ -0,0 +1,18 @@ +import React from 'react'; + +export const WILDCARD_DOMAIN_HELP = { + header:

Wildcard domain

, + body: ( + <> +

+ Create a wildcard A record in your DNS provider pointing to the gateway's external IP address. Once created, + specify the corresponding wildcard domain name here. +

+ +

+ If you've configured a wildcard domain for the gateway, dstack enables HTTPS automatically and serves the + services at https://<run name>.<your domain name> +

+ + ), +}; diff --git a/frontend/src/pages/Project/Gateways/Edit/index.tsx b/frontend/src/pages/Project/Gateways/Edit/index.tsx new file mode 100644 index 0000000000..7d4ac6a372 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Edit/index.tsx @@ -0,0 +1,225 @@ +import React, { useEffect } from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { + Button, + Container, + FormCheckbox, + FormField, + FormInput, + FormUI, + Header, + InfoLink, + InputCSD, + SpaceBetween, + Spinner, +} from 'components'; + +import { useBreadcrumbs, useHelpPanel, useNotifications } from 'hooks'; +import { getServerError, isResponseServerError, isResponseServerFormFieldError } from 'libs'; +import { ROUTES } from 'routes'; +import { + useGetProjectGatewayQuery, + useSetDefaultProjectGatewayMutation, + useSetWildcardDomainOfGatewayMutation, +} from 'services/gateway'; + +import { WILDCARD_DOMAIN_HELP } from './constants'; + +import { FieldPath } from 'react-hook-form/dist/types/path'; + +import styles from './styles.module.scss'; + +const FIELD_NAMES: Record = { + WILDCARD_DOMAIN: 'wildcard_domain', + DEFAULT: 'default', +}; + +export const EditGateway: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramInstanceName = params.instance ?? 
''; + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + const [openHelpPanel] = useHelpPanel(); + + const { data, isLoading: isLoadingGateway } = useGetProjectGatewayQuery({ + projectName: paramProjectName, + instanceName: paramInstanceName, + }); + + const [setDefault, { isLoading: isSettingDefault }] = useSetDefaultProjectGatewayMutation(); + const [setWildcardDomainOfGateway, { isLoading: isUpdating }] = useSetWildcardDomainOfGatewayMutation(); + + const { handleSubmit, control, watch, setValue, setError } = useForm({ + defaultValues: { [FIELD_NAMES.DEFAULT]: false }, + }); + + const isDefault = watch(FIELD_NAMES.DEFAULT); + + /* Copies the loaded gateway's values into the form once they arrive. */ useEffect(() => { + if (data) { + setValue(FIELD_NAMES.DEFAULT, data.default); + setValue(FIELD_NAMES.WILDCARD_DOMAIN, data.wildcard_domain); + } + }, [data]); + + const isDisabledFields = isUpdating || isLoadingGateway; + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.settings'), + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + text: t('gateway.edit_gateway'), + href: ROUTES.PROJECT.GATEWAY.EDIT.FORMAT(paramProjectName, paramInstanceName), + }, + ]); + + const onCancel = () => { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + }; + + /* Marks this gateway as the project's default. A failed request is reported as an error notification instead of being left as an unhandled promise rejection. */ const onChangeDefault = () => { + setDefault({ + projectName: paramProjectName, + name: paramInstanceName, + }) + .unwrap() + .then(() => { + pushNotification({ + type: 'success', + content: t('gateway.update.success_notification'), + }); + }) + .catch((errorResponse) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(errorResponse) }), + }); + }); + }; + + /* Saves the wildcard domain of this gateway. */ const onSubmit = ({ wildcard_domain }: TUpdateGatewayParams) => { + setWildcardDomainOfGateway({ + projectName: paramProjectName, + name: paramInstanceName, + wildcard_domain, + }) + .unwrap() + .then(() => { + pushNotification({ + type: 'success', + content: t('gateway.update.success_notification'), + }); + }) + 
.catch((errorResponse) => { + const errorRequestData = errorResponse?.data; + + if (isResponseServerError(errorRequestData)) { + errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + setError(error.loc.join('.') as FieldPath, { + type: 'custom', + message: error.msg, + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { + error: getServerError(errorResponse), + }), + }); + } + }); + }; + + const renderSpinner = (force?: boolean) => { + if (isLoadingGateway || force) + return ( +
+ +
+ ); + }; + + return ( +
+ + + + } + > + + {t('gateway.edit_gateway')}}> + + + + + + + + + + + + + + + + openHelpPanel(WILDCARD_DOMAIN_HELP)} />} + label={t('gateway.edit.wildcard_domain')} + description={t('gateway.edit.wildcard_domain_description')} + placeholder={t('gateway.edit.wildcard_domain_placeholder')} + control={control} + name={FIELD_NAMES.WILDCARD_DOMAIN} + disabled={isDisabledFields} + rules={{ + pattern: { + value: /^\*\..+\..+/, + message: t('gateway.edit.validation.wildcard_domain_format', { + pattern: t('gateway.edit.wildcard_domain_placeholder'), + }), + }, + }} + secondaryControl={ + renderSpinner() ?? ( + + ) + } + /> + + + + + + ); +}; diff --git a/frontend/src/pages/Project/Gateways/Edit/styles.module.scss b/frontend/src/pages/Project/Gateways/Edit/styles.module.scss new file mode 100644 index 0000000000..9972fad213 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Edit/styles.module.scss @@ -0,0 +1,7 @@ +.fieldSpinner { + display: flex; + align-items: center; + justify-content: center; + height: 34px; + width: 34px; +} diff --git a/frontend/src/pages/Project/Gateways/Table/constants.tsx b/frontend/src/pages/Project/Gateways/Table/constants.tsx new file mode 100644 index 0000000000..53ec3b15ae --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/constants.tsx @@ -0,0 +1,17 @@ +import React from 'react'; + +export const GATEWAYS_INFO = { + header:

Gateways

, + body: ( + <> +

Gateways manage the ingress traffic for running services.

+

+ To learn more about gateways, see the{' '} + + documentation + + . +

+ + ), +}; diff --git a/frontend/src/pages/Project/Gateways/Table/hooks/index.ts b/frontend/src/pages/Project/Gateways/Table/hooks/index.ts new file mode 100644 index 0000000000..bc3180f26e --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/hooks/index.ts @@ -0,0 +1 @@ +export * from './useColumnsDefinitions'; diff --git a/frontend/src/pages/Project/Gateways/Table/hooks/useColumnsDefinitions.tsx b/frontend/src/pages/Project/Gateways/Table/hooks/useColumnsDefinitions.tsx new file mode 100644 index 0000000000..f63a77fa14 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/hooks/useColumnsDefinitions.tsx @@ -0,0 +1,98 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, Icon } from 'components'; +import { ButtonWithConfirmation } from 'components/ButtonWithConfirmation'; + +import styles from '../styles.module.scss'; + +type hookArgs = { + loading?: boolean; + projectName: string; + onDeleteClick?: (gateway: IGateway) => void; + onEditClick?: (gateway: IGateway) => void; +}; + +export const useColumnsDefinitions = ({ loading, projectName, onDeleteClick, onEditClick }: hookArgs) => { + const { t } = useTranslation(); + + const columns = useMemo(() => { + return [ + { + id: 'name', + header: t('gateway.edit.name'), + cell: (gateway: IGateway) => + gateway.project_name && gateway.project_name !== projectName + ? `${gateway.project_name}/${gateway.name}` + : gateway.name, + }, + + { + id: 'type', + header: t('gateway.edit.backend'), + cell: (gateway: IGateway) => + gateway.replicas.length > 0 ? gateway.replicas.map((r, i) =>
{r.backend}
) : null, + }, + + { + id: 'region', + header: t('gateway.edit.region'), + cell: (gateway: IGateway) => + gateway.replicas.length > 0 ? gateway.replicas.map((r, i) =>
{r.region}
) : null, + }, + + { + id: 'default', + header: t('gateway.edit.default'), + cell: (gateway: IGateway) => gateway.default && , + }, + + { + id: 'hostname', + header: t('gateway.edit.hostname'), + cell: (gateway: IGateway) => { + if (gateway.hostname) return gateway.hostname; + if (gateway.replicas.length > 0) return gateway.replicas.map((r, i) =>
{r.hostname}
); + return null; + }, + }, + + { + id: 'wildcard_domain', + header: t('gateway.edit.wildcard_domain'), + + cell: (gateway: IGateway) => ( +
+
{gateway.wildcard_domain}
+ +
+ {onEditClick && ( +
+
+ ), + }, + ]; + }, [loading, projectName, onEditClick, onDeleteClick]); + + return { columns } as const; +}; diff --git a/frontend/src/pages/Project/Gateways/Table/index.tsx b/frontend/src/pages/Project/Gateways/Table/index.tsx new file mode 100644 index 0000000000..16e852ee12 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/index.tsx @@ -0,0 +1,91 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ButtonWithConfirmation, Header, InfoLink, ListEmptyMessage, SpaceBetween, Table } from 'components'; + +import { useCollection, useHelpPanel } from 'hooks'; + +import { GATEWAYS_INFO } from './constants'; +import { useColumnsDefinitions } from './hooks'; + +import { IProps } from './types'; + +export const GatewaysTable: React.FC = ({ gateways, projectName, addItem, deleteItem, editItem, isDisabledDelete }) => { + const { t } = useTranslation(); + const [openHelpPanel] = useHelpPanel(); + + const renderEmptyMessage = (): React.ReactNode => { + return ( + + {addItem && } + + ); + }; + + const { items, collectionProps } = useCollection(gateways ?? [], { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderEmptyMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const isDisabledDeleteSelected = !selectedItems?.length || isDisabledDelete; + + const deleteSelectedGateways = () => { + if (!selectedItems?.length || !deleteItem) return; + + deleteItem(selectedItems); + }; + + const { columns } = useColumnsDefinitions({ + projectName, + ...(editItem ? { onEditClick: (gateway) => editItem(gateway) } : {}), + ...(deleteItem ? { onDeleteClick: (gateway) => deleteItem([gateway]) } : {}), + }); + + const renderCounter = () => { + if (!gateways?.length) return ''; + + return `(${gateways.length})`; + }; + + return ( +
openHelpPanel(GATEWAYS_INFO)} />} + actions={ + + {/* Disallow adding/editing gateways while custom backends are not supported */} + {deleteItem && ( + + {t('common.delete')} + + )} + + {addItem && } + + } + > + {t('gateway.page_title_other')} + + } + /> + ); +}; diff --git a/frontend/src/pages/Project/Gateways/Table/styles.module.scss b/frontend/src/pages/Project/Gateways/Table/styles.module.scss new file mode 100644 index 0000000000..4c83d9f2c7 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/styles.module.scss @@ -0,0 +1,16 @@ +.cell { + display: flex; + align-items: center; +} + +.contextMenu { + margin-left: auto; + padding-left: 20px; +} + +.ellipsisCell { + overflow: hidden; + white-space: nowrap; + text-overflow: ellipsis; + max-width: 450px; +} diff --git a/frontend/src/pages/Project/Gateways/Table/types.ts b/frontend/src/pages/Project/Gateways/Table/types.ts new file mode 100644 index 0000000000..9a7cdda312 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/Table/types.ts @@ -0,0 +1,8 @@ +export interface IProps { + gateways: IGateway[]; + projectName: string; + addItem?: () => void; + deleteItem?: (gateways: readonly IGateway[] | IGateway[]) => void; + editItem?: (gateways: IGateway) => void; + isDisabledDelete?: boolean; +} diff --git a/frontend/src/pages/Project/Gateways/hooks/index.ts b/frontend/src/pages/Project/Gateways/hooks/index.ts new file mode 100644 index 0000000000..d283ae669d --- /dev/null +++ b/frontend/src/pages/Project/Gateways/hooks/index.ts @@ -0,0 +1 @@ +export { useGatewaysTable } from './useGatewaysTable'; diff --git a/frontend/src/pages/Project/Gateways/hooks/useGatewaysTable.ts b/frontend/src/pages/Project/Gateways/hooks/useGatewaysTable.ts new file mode 100644 index 0000000000..efbf278e81 --- /dev/null +++ b/frontend/src/pages/Project/Gateways/hooks/useGatewaysTable.ts @@ -0,0 +1,29 @@ +import { useNavigate } from 'react-router-dom'; + +import { ROUTES } from 'routes'; +import { useDeleteProjectGatewayMutation, 
useGetProjectGatewaysQuery } from 'services/gateway';
+
+// Hook backing the gateways table: runs the project's gateways query and exposes
+// handlers that navigate to the add/edit routes or trigger the delete mutation.
+export const useGatewaysTable = (projectName: IProject['project_name']) => {
+    const navigate = useNavigate();
+    const gatewaysQuery = useGetProjectGatewaysQuery({ projectName });
+    const [requestDeletion, deletionState] = useDeleteProjectGatewayMutation();
+
+    // NOTE: editing and deletion are disabled as of 0.20.21.
+    // If enabling, ensure that imported gateways cannot be edited or deleted.
+    function editGateway({ name }: IGateway): void {
+        navigate(ROUTES.PROJECT.GATEWAY.EDIT.FORMAT(projectName, name));
+    }
+
+    function deleteGateway(gateways: readonly IGateway[] | IGateway[]): void {
+        requestDeletion({ projectName, names: gateways.map(({ name }) => name) });
+    }
+
+    function addGateway(): void {
+        navigate(ROUTES.PROJECT.GATEWAY.ADD.FORMAT(projectName));
+    }
+
+    const { data, isLoading } = gatewaysQuery;
+    return { data, isLoading, isDeleting: deletionState.isLoading, editGateway, deleteGateway, addGateway } as const;
+};
diff --git a/frontend/src/pages/Project/Gateways/index.tsx b/frontend/src/pages/Project/Gateways/index.tsx
new file mode 100644
index 0000000000..4be825212d
--- /dev/null
+++ b/frontend/src/pages/Project/Gateways/index.tsx
@@ -0,0 +1,3 @@
+export { GatewaysTable } from './Table';
+export { AddGateway } from './Add';
+export { EditGateway } from './Edit';
diff --git a/frontend/src/pages/Project/List/hooks/index.ts b/frontend/src/pages/Project/List/hooks/index.ts
new file mode 100644
index 0000000000..bc3180f26e
--- /dev/null
+++ b/frontend/src/pages/Project/List/hooks/index.ts
@@ -0,0 +1 @@
+export * from './useColumnsDefinitions';
diff --git a/frontend/src/pages/Project/List/hooks/useColumnsDefinitions.tsx b/frontend/src/pages/Project/List/hooks/useColumnsDefinitions.tsx
new file mode 100644
index 0000000000..54b85ce94c
--- /dev/null
+++ b/frontend/src/pages/Project/List/hooks/useColumnsDefinitions.tsx
@@ -0,0 +1,85 @@
+import React, { useMemo } from 'react';
+import { useTranslation } from 'react-i18next';
+
+// import { useNavigate } from 'react-router-dom'; +import { /*Button,*/ NavigateLink, StatusIndicator } from 'components'; + +// import { ButtonWithConfirmation } from 'components/ButtonWithConfirmation'; +import { ROUTES } from 'routes'; + +// import { useCheckAvailableProjectPermission } from '../../hooks/useCheckAvailableProjectPermission'; +import styles from '../styles.module.scss'; + +type hookArgs = { + loading?: boolean; + onDeleteClick?: (project: IProject) => void; +}; + +export const useColumnsDefinitions = ({ loading, onDeleteClick }: hookArgs) => { + const { t } = useTranslation(); + // const navigate = useNavigate(); + + // const { isAvailableDeletingPermission } = useCheckAvailableProjectPermission(); + // + // const goToSettings = (project: IProject) => { + // navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(project.project_name)); + // }; + + const columns = useMemo(() => { + return [ + { + id: 'project_name', + header: `${t('projects.edit.project_name')}`, + cell: (project: IProject) => ( + + {project.project_name} + + ), + }, + { + id: 'owner.username', + header: `${t('projects.edit.owner')}`, + cell: (project: IProject) => ( +
+ + {project.owner.username} + + + {/*
*/} + {/* goToSettings(project)}*/} + {/* variant="icon"*/} + {/* iconName="settings"*/} + {/* />*/} + + {/* {onDeleteClick && (*/} + {/* onDeleteClick(project)}*/} + {/* variant="icon"*/} + {/* iconName="remove"*/} + {/* confirmTitle={t('projects.edit.delete_project_confirm_title')}*/} + {/* confirmContent={t('projects.edit.delete_project_confirm_message')}*/} + {/* />*/} + {/* )}*/} + {/*
*/} +
+ ), + }, + { + id: 'visibility', + header: t('projects.edit.project_visibility'), + cell: (project: IProject) => ( + + {project.isPublic ? t('projects.edit.visibility.public') : t('projects.edit.visibility.private')} + + ), + }, + ]; + }, [loading, onDeleteClick]); + + return { columns } as const; +}; diff --git a/frontend/src/pages/Project/List/index.tsx b/frontend/src/pages/Project/List/index.tsx new file mode 100644 index 0000000000..eb896df91d --- /dev/null +++ b/frontend/src/pages/Project/List/index.tsx @@ -0,0 +1,159 @@ +import React, { useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { Button, ButtonWithConfirmation, Header, ListEmptyMessage, Loader, SpaceBetween, Table, TextFilter } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetProjectsQuery } from 'services/project'; + +import { useCheckAvailableProjectPermission } from '../hooks/useCheckAvailableProjectPermission'; +import { useDeleteProject } from '../hooks/useDeleteProject'; +import { useColumnsDefinitions } from './hooks'; + +export const ProjectList: React.FC = () => { + const { t } = useTranslation(); + + const { isAvailableDeletingPermission, isAvailableProjectManaging } = useCheckAvailableProjectPermission(); + const { deleteProject, deleteProjects, isDeleting } = useDeleteProject(); + const [filteringText, setFilteringText] = useState(''); + const [namePattern, setNamePattern] = useState(''); + const navigate = useNavigate(); + + const { data, isLoading, refreshList, isLoadingMore, totalCount } = useInfiniteScroll({ + useLazyQuery: useLazyGetProjectsQuery, + args: { name_pattern: namePattern, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastProject) => ({ + prev_created_at: lastProject.created_at, + prev_id: lastProject.project_id, + }), 
+ }); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + ]); + + const addProjectHandler = () => { + navigate(ROUTES.PROJECT.ADD); + }; + + const onClearFilter = () => { + setNamePattern(''); + setFilteringText(''); + }; + + const renderEmptyMessage = (): React.ReactNode => { + if (isLoading) { + return null; + } + + if (filteringText) { + return ( + + + + ); + } + + return ( + + {isAvailableProjectManaging && } + + ); + }; + + const { items, collectionProps } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const deleteSelectedProjects = () => { + if (!selectedItems?.length) return; + + deleteProjects([...selectedItems]).catch(console.log); + }; + + const isDisabledDeleteSelected = useMemo(() => { + if (!selectedItems?.length || isDeleting) return true; + + return !selectedItems.every(isAvailableDeletingPermission); + }, [selectedItems]); + + const { columns } = useColumnsDefinitions({ + loading: isLoading, + onDeleteClick: isAvailableProjectManaging ? deleteProject : undefined, + }); + + const renderCounter = () => { + if (typeof totalCount !== 'number') return ''; + + return `(${totalCount})`; + }; + + return ( + <> +
+ + {t('common.delete')} + + + + + , + ); + } + + // Add join/leave button if user is authenticated (available even in readonly mode) + if (userData?.username && project) { + if (!isMember) { + actions.unshift( + , + ); + } else { + // Check if user is the last admin - if so, don't show leave button + const adminCount = project.members.filter((member) => member.project_role === 'admin').length; + const isLastAdmin = currentUserRole === 'admin' && adminCount <= 1; + + if (!isLastAdmin) { + // Only show leave button if user is not the last admin + actions.unshift( + handleLeaveProject(project.project_name, userData.username!)} + disabled={isMemberActionLoading} + variant="danger-normal" + confirmTitle={t('projects.leave_confirm_title')} + confirmContent={t('projects.leave_confirm_message')} + confirmButtonLabel={t('projects.leave')} + > + {isMemberActionLoading ? t('common.loading') : t('projects.leave')} + , + ); + } + } + } + + return actions.length > 0 ? ( + + {actions} + + ) : undefined; + }; + + const COLUMN_DEFINITIONS = [ + { + id: 'name', + header: t('projects.edit.members.name'), + cell: (item: IProjectMember) => ( + + {item.user.username} + + ), + }, + { + id: 'global_role', + header: t('projects.edit.members.role'), + cell: (field: IProjectMember & { index: number }) => { + const isAvailableForAdmin = !readonly && (isAdmin || field.project_role !== 'admin'); + + return ( +
+
+ +
+ +
+ {isAvailableForAdmin && ( +
+
+ ); + }, + }, + ]; + + return ( + {})}> +
setSelectedItems(event.detail.selectedItems)} + columnDefinitions={COLUMN_DEFINITIONS} + items={items} + header={ +
+ {t('projects.edit.members.section_title')} +
+ } + filter={ + readonly ? undefined : ( + addMemberHandler(detail.value)} + optionsFilter={(options) => options.filter((o) => !fields.find((f) => f.user.username === o.value))} + /> + ) + } + pagination={} + /> + + ); +}; diff --git a/frontend/src/pages/Project/Members/styles.module.scss b/frontend/src/pages/Project/Members/styles.module.scss new file mode 100644 index 0000000000..b67dd1dd10 --- /dev/null +++ b/frontend/src/pages/Project/Members/styles.module.scss @@ -0,0 +1,13 @@ +.role { + display: flex; + align-items: center; + gap: 20px; +} +.roleFieldWrapper { + flex-grow: 1; + flex-basis: 0; + max-width: 200px; +} +.deleteMemberButtonWrapper { + margin-left: auto; +} diff --git a/frontend/src/pages/Project/Members/types.ts b/frontend/src/pages/Project/Members/types.ts new file mode 100644 index 0000000000..282791f850 --- /dev/null +++ b/frontend/src/pages/Project/Members/types.ts @@ -0,0 +1,11 @@ +export interface IProps { + members?: IProjectMember[]; + loading?: boolean; + onChange: (users: IProjectMember[]) => void; + readonly?: boolean; + isAdmin?: boolean; + project?: IProject; +} + +export type TProjectMemberWithIndex = IProjectMember & { index: number }; +export type TFormValues = { members: IProjectMember[] }; diff --git a/frontend/src/pages/Project/Secrets/Form/index.tsx b/frontend/src/pages/Project/Secrets/Form/index.tsx new file mode 100644 index 0000000000..3ccce97493 --- /dev/null +++ b/frontend/src/pages/Project/Secrets/Form/index.tsx @@ -0,0 +1,71 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +import { Box, Button, FormInput, FormTextarea, SpaceBetween } from 'components'; + +import { TFormValues } from '../types'; + +export type IFormProps = { + initialValues?: TFormValues; + onSubmit: (values: TFormValues) => void; + onCancel?: () => void; + loading?: boolean; +}; + +export const SecretForm: React.FC = ({ initialValues, onSubmit: onSubmitProp, loading, onCancel 
}) => { + const { t } = useTranslation(); + const { handleSubmit, control } = useForm({ + defaultValues: { + ...initialValues, + }, + }); + + const onSubmit = (values: TFormValues) => onSubmitProp(values); + + return ( +
+ + + + + + + + {onCancel && ( + + )} + + + + + + + ); +}; diff --git a/frontend/src/pages/Project/Secrets/index.tsx b/frontend/src/pages/Project/Secrets/index.tsx new file mode 100644 index 0000000000..febc739682 --- /dev/null +++ b/frontend/src/pages/Project/Secrets/index.tsx @@ -0,0 +1,218 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ButtonWithConfirmation, Header, ListEmptyMessage, Modal, Pagination, SpaceBetween, Table } from 'components'; + +import { useAppSelector, useCollection, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { + useDeleteSecretsMutation, + useGetAllSecretsQuery, + useLazyGetSecretQuery, + useUpdateSecretMutation, +} from 'services/secrets'; +import { GlobalUserRole } from 'types'; + +import { selectUserData } from 'App/slice'; + +import { getMemberCanManageSecrets } from '../utils'; +import { SecretForm } from './Form'; + +import { IProps, TFormValues } from './types'; + +import styles from './styles.module.scss'; + +export const ProjectSecrets: React.FC = ({ project, loading }) => { + const { t } = useTranslation(); + const userData = useAppSelector(selectUserData); + const userName = userData?.username ?? ''; + const [initialFormValues, setInitialFormValues] = useState(); + const projectName = project?.project_name ?? ''; + const [pushNotification] = useNotifications(); + + const hasPermissionForSecretsManaging = + userData?.global_role === GlobalUserRole.ADMIN || (project ? 
getMemberCanManageSecrets(project, userName) : false); + + const { data, isLoading, isFetching } = useGetAllSecretsQuery( + { project_name: projectName }, + { skip: !hasPermissionForSecretsManaging }, + ); + const [updateSecret, { isLoading: isUpdating }] = useUpdateSecretMutation(); + const [deleteSecret, { isLoading: isDeleting }] = useDeleteSecretsMutation(); + const [getSecret, { isLoading: isGettingSecrets }] = useLazyGetSecretQuery(); + + const { items, paginationProps, collectionProps } = useCollection(data ?? [], { + filtering: { + empty: hasPermissionForSecretsManaging ? ( + + ) : ( + + ), + }, + pagination: { pageSize: 10 }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const deleteSelectedSecrets = () => { + const names = selectedItems?.map((s) => s.name ?? ''); + + if (names?.length) { + deleteSecret({ project_name: projectName, names }); + } + }; + + const removeSecretByName = (name: IProjectSecret['name']) => { + deleteSecret({ project_name: projectName, names: [name] }); + }; + + const updateOrCreateSecret = ({ name, value }: TFormValues) => { + if (!name || !value) { + return; + } + + updateSecret({ project_name: projectName, name, value }) + .unwrap() + .then(() => setInitialFormValues(undefined)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + }; + + const editSecret = ({ name }: IProjectSecret) => { + getSecret({ project_name: projectName, name }) + .unwrap() + .then((secret) => setInitialFormValues(secret)); + }; + + const closeModal = () => setInitialFormValues(undefined); + + const isDisabledActions = loading || isLoading || isFetching || isDeleting || isGettingSecrets; + + const COLUMN_DEFINITIONS = [ + { + id: 'name', + header: t('projects.edit.secrets.name'), + cell: (secret: IProjectSecret) => secret.name, + }, + { + id: 'value', + header: t('projects.edit.secrets.value'), + cell: (secret: IProjectSecret) => { + 
return ( +
+
************************
+ +
+
+
+ ); + }, + }, + ]; + + const addSecretHandler = () => { + setInitialFormValues({}); + }; + + const renderActions = () => { + if (!hasPermissionForSecretsManaging) { + return null; + } + + const actions = [ + , + + + {t('common.delete')} + , + ]; + + return actions.length > 0 ? ( + + {actions} + + ) : undefined; + }; + + const isShowModal = !!initialFormValues; + + return ( + <> +
+ {t('projects.edit.secrets.section_title')} + + } + pagination={} + /> + + {hasPermissionForSecretsManaging && ( + + {isShowModal && ( + + )} + + )} + + ); +}; diff --git a/frontend/src/pages/Project/Secrets/styles.module.scss b/frontend/src/pages/Project/Secrets/styles.module.scss new file mode 100644 index 0000000000..5a82457878 --- /dev/null +++ b/frontend/src/pages/Project/Secrets/styles.module.scss @@ -0,0 +1,17 @@ +.value { + display: flex; + align-items: center; + gap: 20px; +} +.valueFieldWrapper { + flex-grow: 1; + flex-basis: 0; + max-width: 400px; +} +.buttonsWrapper { + min-width: 96px; + display: flex; + gap: 8px; + justify-content: flex-end; + margin-left: auto; +} diff --git a/frontend/src/pages/Project/Secrets/types.ts b/frontend/src/pages/Project/Secrets/types.ts new file mode 100644 index 0000000000..ee630b27af --- /dev/null +++ b/frontend/src/pages/Project/Secrets/types.ts @@ -0,0 +1,6 @@ +export interface IProps { + loading?: boolean; + project?: IProject; +} + +export type TFormValues = Partial; diff --git a/frontend/src/pages/Project/components/NoFleetProjectAlert/index.tsx b/frontend/src/pages/Project/components/NoFleetProjectAlert/index.tsx new file mode 100644 index 0000000000..9aefc77d88 --- /dev/null +++ b/frontend/src/pages/Project/components/NoFleetProjectAlert/index.tsx @@ -0,0 +1,54 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; +import cn from 'classnames'; + +import type { ButtonProps } from 'components'; +import { Alert, AlertProps, Button } from 'components'; + +import { useLocalStorageState } from 'hooks/useLocalStorageState'; +import { ROUTES } from 'routes'; + +import styles from './styles.module.scss'; + +type NoFleetProjectAlertProps = { + show?: boolean; + projectName: string; + className?: string; + dismissible?: boolean; +}; + +export const NoFleetProjectAlert: React.FC = ({ projectName, show, className, dismissible }) => { + const { t } = 
useTranslation(); + const navigate = useNavigate(); + const [dontShowAgain, setDontShowAgain] = useLocalStorageState(`noFleetProjectAlert-${projectName}`, false); + + const onCreateAFleet: ButtonProps['onClick'] = (event) => { + event.preventDefault(); + navigate(ROUTES.FLEETS.ADD.FORMAT(projectName)); + }; + + const onDismiss: AlertProps['onDismiss'] = () => setDontShowAgain(true); + + if (!show || dontShowAgain) { + return null; + } + + return ( +
+ + {t('fleets.no_alert.button_title')} + + } + > + The project {projectName} has no fleets. Create one before submitting a run. + +
+ ); +}; diff --git a/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss b/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss new file mode 100644 index 0000000000..c49d1793fb --- /dev/null +++ b/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss @@ -0,0 +1,10 @@ +.alertBox { + :global { + & [class^="awsui_alert"] { + & [class^="awsui_action-slot"] { + display: flex; + align-items: center; + } + } + } +} diff --git a/frontend/src/pages/Project/constants.tsx b/frontend/src/pages/Project/constants.tsx new file mode 100644 index 0000000000..151740116b --- /dev/null +++ b/frontend/src/pages/Project/constants.tsx @@ -0,0 +1,32 @@ +import React from 'react'; + +export const DEFAULT_FLEET_INFO = { + header:

Default fleet

, + body: ( + <> +

+ Fleets act both as pools of instances and as templates for how those instances are provisioned. When you submit + a dev environment, task, or service, dstack reuses idle instances or provisions new + ones based on the fleet configuration. +

+ +

+ If you set Min number of instances to 0, dstack will provision instances + only when you run a dev environment, task, or service. +

+ +

+ At least one fleet is required to run dev environments, tasks, or services. Create it here, or create it using + the dstack apply command via the CLI. +

+ +

+ To learn more about fleets, see the{' '} + + documentation + + . +

+ + ), +};
diff --git a/frontend/src/pages/Project/hooks/useCheckAvailableProjectPermission.ts b/frontend/src/pages/Project/hooks/useCheckAvailableProjectPermission.ts
new file mode 100644
index 0000000000..5c05fe5440
--- /dev/null
+++ b/frontend/src/pages/Project/hooks/useCheckAvailableProjectPermission.ts
@@ -0,0 +1,42 @@
+import { useAppSelector, usePermissionGuard } from 'hooks';
+import { UserPermission } from 'types';
+
+import { selectUserData } from 'App/slice';
+
+import { getProjectRoleByUserName } from '../utils';
+
+/**
+ * Exposes permission checks for the user read from the store via `selectUserData`,
+ * based on that user's global role and the role held inside a given project.
+ */
+export const useCheckAvailableProjectPermission = () => {
+    const currentUser = useAppSelector(selectUserData);
+    const userName = currentUser?.username ?? '';
+    const globalRole = currentUser?.global_role ?? '';
+
+    const [isAvailableProjectManaging] = usePermissionGuard({
+        allowedPermissions: [UserPermission.CAN_CREATE_PROJECTS],
+    });
+
+    // True when the user's role in the project is 'admin' or the global role is 'admin'.
+    const hasAdminRights = (project: IProject): boolean => {
+        const projectRole = getProjectRoleByUserName(project, userName);
+        return projectRole === 'admin' || globalRole === 'admin';
+    };
+
+    // Admin rights, or the 'manager' role inside the project.
+    const isProjectManager = (project: IProject): boolean => {
+        if (hasAdminRights(project)) return true;
+
+        return getProjectRoleByUserName(project, userName) === 'manager';
+    };
+
+    return {
+        // Deleting, adding and the plain admin check all apply the same rule.
+        isAvailableDeletingPermission: hasAdminRights,
+        isAvailableAddProjectPermission: hasAdminRights,
+        isProjectAdmin: hasAdminRights,
+        isProjectManager,
+        isAvailableProjectManaging,
+    } as const;
+};
diff --git a/frontend/src/pages/Project/hooks/useConfigProjectCliComand.ts b/frontend/src/pages/Project/hooks/useConfigProjectCliComand.ts
new file mode 100644
index 0000000000..d6bacb4408
--- /dev/null
+++ b/frontend/src/pages/Project/hooks/useConfigProjectCliComand.ts
@@ -0,0 +1,31 @@
+import { useAppSelector } from 'hooks';
+import { copyToClipboard } from 'libs';
+
+import { selectAuthToken } from 'App/slice';
+
+type Args = {
+    projectName: string;
+};
+
+/**
+ * Builds the `dstack project add` CLI command for the given project from
+ * `location.origin` and the auth token read from the store.
+ *
+ * @returns a tuple of the command text and a callback that copies it to the clipboard
+ */
+export const useConfigProjectCliCommand = ({ projectName }: Args) => {
+    const token = useAppSelector(selectAuthToken);
+
+    const cliCommand = [
+        'dstack project add',
+        `--url ${location.origin}`,
+        `--name ${projectName}`,
+        `--token ${token}`,
+    ].join(' \\\n ');
+
+    function copyCliCommand(): void {
+        copyToClipboard(cliCommand);
+    }
+
+    return [cliCommand, copyCliCommand] as const;
+};
diff --git a/frontend/src/pages/Project/hooks/useDeleteProject.ts b/frontend/src/pages/Project/hooks/useDeleteProject.ts new file mode 100644 index 0000000000..86f51ac482 --- /dev/null +++ b/frontend/src/pages/Project/hooks/useDeleteProject.ts @@ -0,0 +1,39 @@ +import { useTranslation } from 'react-i18next'; + +import { useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { useDeleteProjectsMutation } from 'services/project'; + +export const useDeleteProject = () => { + const { t } = useTranslation(); + const [deleteProjectsRequest, { isLoading: isDeleting }] = useDeleteProjectsMutation(); + const [pushNotification] = useNotifications(); + + const deleteProject = (project: IProject) => { + const request = deleteProjectsRequest([project.project_name]).unwrap(); + + request.catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }; + + const deleteProjects = (projects: IProject[]) => { + const request = deleteProjectsRequest(projects.map((project) => project.project_name)).unwrap(); + + request.catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }; + + return { isDeleting, deleteProject, 
deleteProjects } as const; +};
diff --git a/frontend/src/pages/Project/hooks/useProjectMemberActions.ts b/frontend/src/pages/Project/hooks/useProjectMemberActions.ts
new file mode 100644
index 0000000000..6005e313ad
--- /dev/null
+++ b/frontend/src/pages/Project/hooks/useProjectMemberActions.ts
@@ -0,0 +1,64 @@
+import { useTranslation } from 'react-i18next';
+
+import { useNotifications } from 'hooks';
+import { useAddProjectMemberMutation, useRemoveProjectMemberMutation } from 'services/project';
+
+/**
+ * Provides handlers that add a user to a project (with the 'user' role) or remove
+ * a user from it, reporting the outcome through notifications.
+ */
+export const useProjectMemberActions = () => {
+    const { t } = useTranslation();
+    const [pushNotification] = useNotifications();
+    const [addMember, addState] = useAddProjectMemberMutation();
+    const [removeMember, removeState] = useRemoveProjectMemberMutation();
+
+    // Pulls the message out of a failed response's `data.detail`, which may be
+    // an array ([{msg}]), a plain string, or an object ({msg}); otherwise `fallback`.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const pickDetailMessage = (error: any, fallback: string): string => {
+        const detail = error?.data?.detail;
+
+        if (!detail) return fallback;
+        if (Array.isArray(detail)) return detail[0]?.msg || fallback;
+        if (typeof detail === 'string') return detail;
+
+        return detail.msg ? detail.msg : fallback;
+    };
+
+    const handleJoinProject = async (projectName: string, username: string) => {
+        if (!(username && projectName)) return;
+
+        try {
+            await addMember({ project_name: projectName, username, project_role: 'user' }).unwrap();
+            pushNotification({ type: 'success', content: t('projects.join_success') });
+        } catch (error) {
+            console.error('Failed to join project:', error);
+            pushNotification({ type: 'error', content: t('projects.join_error') });
+        }
+    };
+
+    const handleLeaveProject = async (projectName: string, username: string, onLeaveSuccess?: () => void) => {
+        if (!(username && projectName)) return;
+
+        try {
+            await removeMember({ project_name: projectName, username }).unwrap();
+            pushNotification({ type: 'success', content: t('projects.leave_success') });
+
+            // Optionally call the success callback
+            onLeaveSuccess?.();
+        } catch (error) {
+            console.error('Failed to leave project:', error);
+
+            // Prefer the specific error message from the backend over the generic one
+            const content = pickDetailMessage(error, t('projects.leave_error'));
+            pushNotification({ type: 'error', content });
+        }
+    };
+
+    return {
+        handleJoinProject,
+        handleLeaveProject,
+        isMemberActionLoading: addState.isLoading || removeState.isLoading,
+    };
+};
diff --git a/frontend/src/pages/Project/hooks/useYupValidationResolver.ts b/frontend/src/pages/Project/hooks/useYupValidationResolver.ts new file mode 100644 index 0000000000..2cd694c63d --- /dev/null +++ b/frontend/src/pages/Project/hooks/useYupValidationResolver.ts @@ -0,0 +1,38 @@ +import { useCallback } from 'react'; +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-expect-error +export function useYupValidationResolver(validationSchema) { + return useCallback( + async (data: TData) => { + try { + const values = await validationSchema.validate(data, { + abortEarly: false, + }); + + return { + values, + errors: {}, + }; + } catch (errors) { + return { + values: {}, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + errors: errors.inner.reduce( + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + (allErrors, currentError) => ({ + ...allErrors, + [currentError.path]: { + type: currentError.type ?? 
'validation', + message: currentError.message, + }, + }), + {}, + ), + }; + } + }, + [validationSchema], + ); +} diff --git a/frontend/src/pages/Project/index.tsx b/frontend/src/pages/Project/index.tsx new file mode 100644 index 0000000000..503d4c2228 --- /dev/null +++ b/frontend/src/pages/Project/index.tsx @@ -0,0 +1,11 @@ +import React from 'react'; +export { ProjectList } from './List'; +export { ProjectDetails } from './Details'; +export { ProjectSettings } from './Details/Settings'; +export { Events as ProjectEvents } from './Details/Events'; +export { ProjectAdd } from './Add'; +export { CreateProjectWizard } from './CreateWizard'; + +export const Project: React.FC = () => { + return null; +}; diff --git a/frontend/src/pages/Project/utils.ts b/frontend/src/pages/Project/utils.ts new file mode 100644 index 0000000000..9b7879d7f6 --- /dev/null +++ b/frontend/src/pages/Project/utils.ts @@ -0,0 +1,11 @@ +export const getProjectRoleByUserName = ( + project: IProject, + userName: IProjectMember['user']['username'], +): TProjectRole | null => { + return project.members.find((m) => m.user.username === userName)?.project_role ?? null; +}; + +export const getMemberCanManageSecrets = (project: IProject, userName: IProjectMember['user']['username']): boolean => { + const member = project.members.find((m) => m.user.username === userName); + return member?.permissions?.can_manage_secrets ?? 
false; +}; diff --git a/frontend/src/pages/Runs/Details/Artifacts/index.tsx b/frontend/src/pages/Runs/Details/Artifacts/index.tsx new file mode 100644 index 0000000000..a3eb87e8ac --- /dev/null +++ b/frontend/src/pages/Runs/Details/Artifacts/index.tsx @@ -0,0 +1,235 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import classNames from 'classnames'; + +import { + BreadcrumbGroup, + BreadcrumbGroupProps, + Button, + Header, + Link, + LinkProps, + ListEmptyMessage, + Table, + TextFilter, +} from 'components'; + +import { useCollection } from 'hooks'; +import { formatBytes } from 'libs'; +import { useGetArtifactsQuery } from 'services/artifact'; + +import { IProps, ITableItem } from './types'; + +import styles from './styles.module.scss'; + +export const Artifacts: React.FC = ({ className, ...props }) => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = props.project_name ?? params.projectName ?? ''; + const paramRunName = props.run_name ?? params.runName ?? 
''; + const [globalPath, setGlobalPath] = useState([]); + const [selectedArtifactPath, setSelectedArtifactPath] = useState(''); + + const { data, isLoading, isFetching } = useGetArtifactsQuery({ + name: paramProjectName, + run_name: paramRunName, + prefix: selectedArtifactPath + globalPath.join('/'), + recursive: false, + }); + + const renderEmptyMessage = (): React.ReactNode => { + return ( + + ); + }; + + const renderNoMatchMessage = (onClearFilter: () => void): React.ReactNode => { + return ( + + + + ); + }; + + const formatListItems = (): ITableItem[] => { + let items: ITableItem[] = []; + + if (!data) return items; + + if (!selectedArtifactPath) { + items = data.map((a) => ({ + name: a.name.replace(/\/$/, ''), + path: a.path, + type: 'Folder', + size: null, + })); + } else { + // items = [ + // { + // name: '/', + // path: '..', + // type: 'Folder', + // size: null, + // }, + // ]; + + data.forEach((a) => { + const sortedFiles = [...a.files].sort((a, b) => { + if (a.filesize_in_bytes !== null && b.filesize_in_bytes === null) return 1; + if (a.filesize_in_bytes === null && b.filesize_in_bytes !== null) return -1; + + return 0; + }); + + sortedFiles.forEach((f) => { + let path = f.filepath; + + if (f.filesize_in_bytes !== null) { + const pathByArray = f.filepath.split('/'); + path = pathByArray[pathByArray.length - 1]; + } + + items.push({ + name: path.replace(/\/$/, ''), + path: path, + type: f.filesize_in_bytes === null ? 
'Folder' : 'File', + size: f.filesize_in_bytes, + }); + }); + }); + } + + return items; + }; + + const { items, actions, filterProps, filteredItemsCount, collectionProps } = useCollection(formatListItems(), { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(() => actions.setFiltering('')), + }, + selection: {}, + }); + + const getLinkClickHandle = + (path: string): LinkProps['onFollow'] => + (event) => { + event.preventDefault(); + + if (path === '..') { + if (globalPath.length) { + setGlobalPath((oldGlobalPath) => oldGlobalPath.slice(0, -1)); + } else { + setSelectedArtifactPath(''); + } + + return; + } + + if (!selectedArtifactPath) { + setSelectedArtifactPath(path); + return; + } + + setGlobalPath((oldGlobalPath) => [...oldGlobalPath, path]); + }; + + const COLUMN_DEFINITIONS = [ + { + id: 'name', + header: t('projects.artifact.name'), + cell: (item: ITableItem) => { + if (item.type === 'Folder') return {item.name}; + + return item.name; + }, + }, + { + id: 'type', + header: t('projects.artifact.type'), + cell: (item: ITableItem) => item.type, + }, + { + id: 'size', + header: t('projects.artifact.size'), + cell: (item: ITableItem) => (item.size ? formatBytes(item.size) : '-'), + }, + ]; + + const onFollowHandler: BreadcrumbGroupProps['onFollow'] = (event) => { + event.preventDefault(); + + if (event.detail.href === '/') { + setSelectedArtifactPath(''); + setGlobalPath([]); + } + + const path = event.detail.href.replace(new RegExp(`^${selectedArtifactPath}`), ''); + + setGlobalPath( + path + .split('/') + .filter(Boolean) + .map((i) => i + '/'), + ); + }; + + const getBreadcrumbs = (): BreadcrumbGroupProps['items'] => { + const crumbs = [...(selectedArtifactPath ? [selectedArtifactPath] : []), ...globalPath].reduce( + (result, item, index) => { + result.push({ + text: item.replace(/\/$/, ''), + href: index ? 
result[index - 1].href + item : item, + }); + + return result; + }, + [] as { text: string; href: string }[], + ); + + return [ + { + text: t('projects.artifact.list_page_title'), + href: '/', + }, + + ...crumbs, + ]; + }; + + return ( +
+
+
+ {t('common.objects_other')} +
+ + + + } + filter={ + + } + /> + + ); +}; diff --git a/frontend/src/pages/Runs/Details/Artifacts/styles.module.scss b/frontend/src/pages/Runs/Details/Artifacts/styles.module.scss new file mode 100644 index 0000000000..0d30e883ad --- /dev/null +++ b/frontend/src/pages/Runs/Details/Artifacts/styles.module.scss @@ -0,0 +1,3 @@ +.artifacts { + +} diff --git a/frontend/src/pages/Runs/Details/Artifacts/types.ts b/frontend/src/pages/Runs/Details/Artifacts/types.ts new file mode 100644 index 0000000000..34de1d8df2 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Artifacts/types.ts @@ -0,0 +1,10 @@ +export interface IProps extends Partial> { + className?: string; +} + +export interface ITableItem { + name: string; + path: string; + type: string; + size: number | null; +} diff --git a/frontend/src/pages/Runs/Details/Events/List/index.tsx b/frontend/src/pages/Runs/Details/Events/List/index.tsx new file mode 100644 index 0000000000..fb6610033e --- /dev/null +++ b/frontend/src/pages/Runs/Details/Events/List/index.tsx @@ -0,0 +1,61 @@ +import React from 'react'; +import { useListener } from 'react-bus'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../../constants'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramRunId = params.runId ?? 
''; + const navigate = useNavigate(); + + const { data, isLoading, isLoadingMore, refreshList } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, within_runs: [paramRunId] }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refreshList); + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?within_runs=${paramRunId}`); + }; + + const { columns } = useColumnsDefinitions(); + + return ( +
{t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/Details/Inspect/index.tsx b/frontend/src/pages/Runs/Details/Inspect/index.tsx new file mode 100644 index 0000000000..68c4b7bcbb --- /dev/null +++ b/frontend/src/pages/Runs/Details/Inspect/index.tsx @@ -0,0 +1,77 @@ +import React, { useEffect, useMemo } from 'react'; +import { useListener } from 'react-bus'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { CodeEditor, Container, Header, Loader } from 'components'; + +import { useGetRunQuery } from 'services/run'; + +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../constants'; + +interface AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const RunInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? 
''; + + const { + data: runData, + isLoading, + refetch, + } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refetch); + + const jsonContent = useMemo(() => { + if (!runData) return ''; + return JSON.stringify(runData, null, 2); + }, [runData]); + + // Set editor to read-only after it loads + useEffect(() => { + const timer = setTimeout(() => { + // Find the ace editor instance in the DOM + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('projects.run.inspect')}}> + { + // Prevent editing - onChange is required but we ignore changes + }} + /> + + ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/index.tsx new file mode 100644 index 0000000000..724168d07b --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/index.tsx @@ -0,0 +1,115 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { Box, ColumnLayout, Container, Header, Loader, StatusIndicator } from 'components'; + +import { getStatusIconType } from 'libs/run'; +import { useGetRunQuery } from 'services/run'; + +import { + getJobError, + getJobFinishedAt, + getJobListItemBackend, + getJobListItemInstance, + getJobListItemPrice, + getJobListItemRegion, + getJobListItemResources, + getJobListItemSpot, + getJobStatus, + getJobStatusMessage, + getJobSubmittedAt, + getJobTerminationReason, +} from '../../List/helpers'; + +import styles from './styles.module.scss'; + +export const JobDetails = () => { + const { t } = 
useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? ''; + + const { data: runData, isLoading: isLoadingRun } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + /* Job whose job_spec.job_name matches the jobName route param, or null while the run is not loaded / no job matches. paramJobName is a dependency because the lookup reads it: without it the memo keeps returning the previous job when only the route param changes. */ + const jobData = useMemo(() => { + if (!runData) return null; + + return runData.jobs.find((job) => job.job_spec.job_name === paramJobName) ?? null; + }, [runData, paramJobName]); + + if (isLoadingRun) + return ( + + + + ); + + if (!jobData) return null; + + return ( +
+ {t('common.general')}}> + +
+ {t('projects.run.submitted_at')} +
{getJobSubmittedAt(jobData)}
+
+ +
+ {t('projects.run.finished_at')} +
{getJobFinishedAt(jobData)}
+
+ +
+ {t('projects.run.status')} +
+ + {getJobStatusMessage(jobData)} + +
+
+ +
+ {t('projects.run.error')} +
{getJobError(jobData)}
+
+ +
+ {t('projects.run.backend')} +
{getJobListItemBackend(jobData)}
+
+ +
+ {t('projects.run.region')} +
{getJobListItemRegion(jobData)}
+
+ +
+ {t('projects.run.instance')} +
{getJobListItemInstance(jobData)}
+
+ +
+ {t('projects.run.resources')} +
{getJobListItemResources(jobData)}
+
+ +
+ {t('projects.run.spot')} +
{getJobListItemSpot(jobData)}
+
+ +
+ {t('projects.run.price')} +
{getJobListItemPrice(jobData)}
+
+
+
+
+ ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/styles.module.scss b/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/styles.module.scss new file mode 100644 index 0000000000..8fe7a8c4c4 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Details/JobDetails/styles.module.scss @@ -0,0 +1,22 @@ +.details { + height: calc(100vh - 272px); + display: flex; + flex-direction: column; + gap: 16px; + + & > [class^="awsui_layout"] { + height: 100%; + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} + +.logs { + flex-grow: 1; + min-height: 0; + max-height: calc(100vh - 380px); +} diff --git a/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx new file mode 100644 index 0000000000..ffdc2d460c --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx @@ -0,0 +1,117 @@ +import React, { useEffect } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useParams } from 'react-router-dom'; + +import { ContentLayout, DetailsHeader, Tabs } from 'components'; + +import { useBreadcrumbs } from 'hooks'; +import { riseRouterException } from 'libs'; +import { ROUTES } from 'routes'; +import { useGetRunQuery } from 'services/run'; + +import styles from './styles.module.scss'; + +enum CodeTab { + Details = 'details', + Metrics = 'metrics', + Logs = 'logs', + Events = 'Events', +} + +export const JobDetailsPage: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? 
''; + + const { data: runData, error: runError } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + useEffect(() => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + if (runError?.status === 404) { + riseRouterException(); + } + }, [runError]); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('projects.runs'), + href: ROUTES.RUNS.LIST, + }, + { + text: runData?.run_spec.run_name ?? '', + href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.FORMAT(paramProjectName, paramRunId), + }, + { + text: t('projects.run.jobs'), + href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.FORMAT(paramProjectName, paramRunId), + }, + { + text: paramJobName, + href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.JOBS.DETAILS.FORMAT(paramProjectName, paramRunId, paramJobName), + }, + ]); + + return ( +
+ }> + + + + +
+ ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Details/styles.module.scss b/frontend/src/pages/Runs/Details/Jobs/Details/styles.module.scss new file mode 100644 index 0000000000..f8ecb8694b --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Details/styles.module.scss @@ -0,0 +1,14 @@ +.page { + height: 100%; + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx new file mode 100644 index 0000000000..48adc56364 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx @@ -0,0 +1,78 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +import { ROUTES } from '../../../../../routes'; +import { useGetRunQuery } from '../../../../../services/run'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? 
''; + const navigate = useNavigate(); + + const { data: runData, isLoading: isLoadingRun } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + /* Id of the first submission of the job named by the jobName route param; undefined until the run is loaded or when no job matches. paramJobName is a dependency because the lookup reads it: without it the events query would keep targeting the previous job when only the route param changes. */ + const jobId = useMemo(() => { + if (!runData) return; + + return runData.jobs.find((job) => job.job_spec.job_name === paramJobName)?.job_submissions?.[0]?.id; + }, [runData, paramJobName]); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, target_jobs: jobId ? [jobId] : undefined }, + skip: !jobId, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?target_jobs=${jobId}`); + }; + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const { columns } = useColumnsDefinitions(); + + return ( +
+ {t('common.full_view')} + + } + > + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts b/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts new file mode 100644 index 0000000000..6fd1f30778 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts @@ -0,0 +1,92 @@ +import { format } from 'date-fns'; +import type { StatusIndicatorProps } from '@cloudscape-design/components/status-indicator'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { capitalize } from 'libs'; +import { formatBackend } from 'libs/fleet'; +import { formatResources } from 'libs/resources'; + +export const getJobListItemResources = (job: IJob) => { + const resources = job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.instance_type?.resources; + return resources ? formatResources(resources) : '-'; +}; + +export const getJobListItemSpot = (job: IJob) => { + return ( + job.job_submissions?.[ + job.job_submissions.length - 1 + ]?.job_provisioning_data?.instance_type?.resources?.spot?.toString() ?? '-' + ); +}; + +export const getJobListItemPrice = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.price + ? `$${job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.price}` + : null; +}; + +export const getJobListItemInstance = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.instance_type?.name; +}; + +export const getJobListItemRegion = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.region ?? 
'-'; +}; + +/* The accessors below read the latest (last) entry of job.job_submissions. The element access is followed by `?.` because an empty array yields index -1, i.e. undefined, and a plain property read on it would throw. */ + +/** Backend of the latest submission's provisioning data, passed through formatBackend. */ +export const getJobListItemBackend = (job: IJob) => { + return formatBackend(job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.backend); +}; + +/** Latest submission's submitted_at formatted with DATE_TIME_FORMAT; '' when it is missing. */ +export const getJobSubmittedAt = (job: IJob) => { + const submitted_at = job.job_submissions?.[job.job_submissions.length - 1]?.submitted_at; + return submitted_at ? format(new Date(submitted_at), DATE_TIME_FORMAT) : ''; +}; + +/** Latest submission's finished_at formatted with DATE_TIME_FORMAT; '' when it is missing. */ +export const getJobFinishedAt = (job: IJob) => { + const finished_at = job.job_submissions?.[job.job_submissions.length - 1]?.finished_at; + return finished_at ? format(new Date(finished_at), DATE_TIME_FORMAT) : ''; +}; + +/** Status of the latest submission; undefined when there is no submission. */ +export const getJobStatus = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.status; +}; + +/** Probes recorded on the latest submission; undefined when there is no submission. */ +export const getJobSubmissionProbes = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.probes; +}; + +/** Maps each probe of the latest submission to an indicator type: 'success' once its success_streak reaches the matching job_spec probe's ready_after, 'in-progress' after at least one success, otherwise 'not-started'. Returns [] unless the job status is 'running' and probes exist. */ +export const getJobProbesStatuses = (job: IJob): StatusIndicatorProps.Type[] => { + const status = getJobStatus(job); + const probes = getJobSubmissionProbes(job); + + if (!probes?.length || status !== 'running') { + return []; + } + + return probes.map((probe, index) => { + if (job.job_spec?.probes?.[index] && probe.success_streak >= job.job_spec.probes[index].ready_after) { + return 'success'; + } else if (probe.success_streak > 0) { + return 'in-progress'; + } + return 'not-started'; + }); +}; + +/** Termination reason of the latest submission, with a '-' placeholder when absent. */ +export const getJobTerminationReason = (job: IJob) => { + return job.job_submissions?.[job.job_submissions.length - 1]?.termination_reason ?? 
'-'; +}; + +/** Capitalized, human-readable status of the latest submission: its status_message when set, otherwise its status. Returns null when the job has no submission, instead of dereferencing undefined. */ +export const getJobStatusMessage = (job: IJob): string | null => { + const latest_submission = job.job_submissions?.[job.job_submissions.length - 1]; + if (!latest_submission) { + return null; + } + if (latest_submission.status_message) { + return capitalize(latest_submission.status_message); + } + return capitalize(latest_submission.status); +}; + +/** Error of the latest submission, or null when there is none. */ +export const getJobError = (job: IJob): string | null => { + return job.job_submissions?.[job.job_submissions.length - 1]?.error ?? null; +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/List/hooks.tsx b/frontend/src/pages/Runs/Details/Jobs/List/hooks.tsx new file mode 100644 index 0000000000..2d89cdcaef --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/List/hooks.tsx @@ -0,0 +1,122 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { NavigateLink, StatusIndicator } from 'components'; + +import { getStatusIconType } from 'libs/run'; +import { ROUTES } from 'routes'; + +import { + getJobError, + getJobFinishedAt, + getJobListItemBackend, + getJobListItemInstance, + getJobListItemPrice, + getJobListItemRegion, + getJobListItemResources, + getJobListItemSpot, + getJobProbesStatuses, + getJobStatus, + getJobStatusMessage, + getJobSubmittedAt, + getJobTerminationReason, +} from './helpers'; + +export const useColumnsDefinitions = ({ + projectName, + runId, + runPriority, +}: { + projectName: string; + runId: string; + runPriority?: number | null; +}) => { + const { t } = useTranslation(); + + const columns = [ + { + id: 'job_name', + header: t('projects.run.job_name'), + cell: (item: IJob) => ( + + {item.job_spec.job_name} + + ), + }, + { + id: 'submitted_at', + header: t('projects.run.submitted_at'), + cell: getJobSubmittedAt, + }, + { + id: 'finished_at', + header: t('projects.run.finished_at'), + cell: getJobFinishedAt, + }, + { + id: 'status', + header: t('projects.run.status'), + cell: (item: IJob) => { + const status = getJobStatus(item); + + return ( + + {getJobStatusMessage(item)} + + ); + 
}, + }, + { + id: 'probe', + header: t('projects.run.probe'), + cell: (item: IJob) => { + const statuses = getJobProbesStatuses(item); + return statuses.map((statusType, index) => ); + }, + }, + { + id: 'priority', + header: t('projects.run.priority'), + cell: () => runPriority, + }, + { + id: 'error', + header: t('projects.run.error'), + cell: (item: IJob) => getJobError(item), + }, + { + id: 'resources', + header: `${t('projects.run.resources')}`, + cell: getJobListItemResources, + }, + { + id: 'spot', + header: `${t('projects.run.spot')}`, + cell: getJobListItemSpot, + }, + { + id: 'price', + header: `${t('projects.run.price')}`, + cell: getJobListItemPrice, + }, + { + id: 'instance', + header: `${t('projects.run.instance')}`, + cell: getJobListItemInstance, + }, + { + id: 'region', + header: `${t('projects.run.region')}`, + cell: getJobListItemRegion, + }, + { + id: 'backend', + header: `${t('projects.run.backend')}`, + cell: getJobListItemBackend, + }, + ]; + + return { columns } as const; +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/List/index.tsx b/frontend/src/pages/Runs/Details/Jobs/List/index.tsx new file mode 100644 index 0000000000..6917d2b726 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/List/index.tsx @@ -0,0 +1,34 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Header, Pagination, Table } from 'components'; + +import { useCollection } from 'hooks'; + +import { useColumnsDefinitions } from './hooks'; + +export interface Props { + projectName: string; + runId: string; + runPriority?: number | null; + jobs: IRun['jobs']; +} + +export const JobList: React.FC = ({ jobs, projectName, runId, runPriority }) => { + const { t } = useTranslation(); + const { columns } = useColumnsDefinitions({ projectName, runId, runPriority }); + + const { items, collectionProps, paginationProps } = useCollection(jobs, { + pagination: { pageSize: 20 }, + }); + + return ( +
{t('projects.run.jobs')}} + pagination={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/consts.ts b/frontend/src/pages/Runs/Details/Jobs/Metrics/consts.ts new file mode 100644 index 0000000000..cb375c73c7 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/consts.ts @@ -0,0 +1,15 @@ +export const second = 1000; +export const minute = 60 * second; +export const hour = minute * 60; + +export const kByte = 1024; +export const MByte = kByte * 1024; +export const GByte = MByte * 1024; + +export const CPU_NUMS = 'cpus_detected_num'; +export const ALL_CPU_USAGE = 'cpu_usage_percent'; +export const MEMORY_WORKING_SET = 'memory_working_set_bytes'; +export const MEMORY_TOTAL = 'memory_total_bytes'; +export const EACH_GPU_USAGE_PREFIX = 'gpu_util_percent_gpu'; +export const EACH_GPU_MEMORY_USAGE_PREFIX = 'gpu_memory_usage_bytes_gpu'; +export const EACH_GPU_MEMORY_TOTAL = 'gpu_memory_total_bytes'; diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/helpers.ts b/frontend/src/pages/Runs/Details/Jobs/Metrics/helpers.ts new file mode 100644 index 0000000000..56e41cd5a6 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/helpers.ts @@ -0,0 +1,73 @@ +import { GByte, kByte, MByte } from './consts'; + +export const formatTime = (date: Date) => { + return date.toLocaleTimeString('en-US', { + hour: 'numeric', + minute: 'numeric', + hour12: !1, + }); +}; + +export const formatPercent = (percent: number) => `${percent} %`; + +export const bytesFormatter = (bytes: number, hasPostfix = true) => { + if (bytes >= GByte) { + return (bytes / GByte).toFixed(1) + (hasPostfix ? ' GB' : ''); + } + + if (bytes >= MByte) { + return (bytes / MByte).toFixed(1) + (hasPostfix ? ' MB' : ''); + } + + if (bytes >= kByte) { + return (bytes / kByte).toFixed(1) + (hasPostfix ? ' KB' : ''); + } + + return bytes + (hasPostfix ? 
' B' : ''); +}; + +type GetSeriesDataArgs = { + metricItem: IMetricsItem; + yValueFormater?: (value: IMetricsItem['values'][number], index: number) => IMetricsItem['values'][number]; +}; + +export const getSeriesData = ({ metricItem, yValueFormater = (value) => value }: GetSeriesDataArgs) => { + return metricItem.timestamps.map((time, index) => ({ + x: new Date(time), + y: yValueFormater(metricItem.values[index], index), + })); +}; + +type GetChartPropsArgs = Pick & { + renderTitle: (index: number) => string; + type?: string; + valueFormatter?: (value: number) => void; + metricItems: IMetricsItem[]; + customSeries?: unknown[]; + yDomain?: number[]; +}; + +export const getChartProps = ({ + metricItems, + renderTitle, + type = 'line', + valueFormatter, + yValueFormater, + customSeries = [], + yDomain = [], +}: GetChartPropsArgs) => { + const series = metricItems.map((metricItem, index) => ({ + title: renderTitle(index), + type, + valueFormatter, + data: getSeriesData({ metricItem, yValueFormater }), + })); + + const firstSeries = series?.[0]?.data; + + return { + series: [...series, ...customSeries], + xDomain: [firstSeries?.[0]?.x, firstSeries?.[firstSeries.length - 1]?.x], + yDomain, + }; +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx new file mode 100644 index 0000000000..db89033984 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx @@ -0,0 +1,158 @@ +import React, { useEffect, useMemo } from 'react'; +import { useListener } from 'react-bus'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { Box, ColumnLayout, Container, Header, LineChart } from 'components'; + +import { riseRouterException } from 'libs'; +import { useGetRunQuery } from 'services/run'; + +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../../constants'; +import { bytesFormatter, formatPercent, formatTime } from './helpers'; +import { 
useMetricsData } from './useMetricsData'; + +export const JobMetrics: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? ''; + + const { + data: runData, + isLoading: isLoadingRun, + error: runError, + } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + /* Job whose job_spec.job_name matches the jobName route param, or null while the run is not loaded / no job matches. paramJobName is a dependency because the lookup reads it: without it job_num below would stay on the previous job when only the route param changes. */ + const jobData = useMemo(() => { + if (!runData) return null; + + return runData.jobs.find((job) => job.job_spec.job_name === paramJobName) ?? null; + }, [runData, paramJobName]); + + const { + cpuChartProps, + memoryChartProps, + eachGPUChartProps, + eachGPUMemoryChartProps, + isLoading, + refetch: refetchMetrics, + } = useMetricsData({ + project_name: paramProjectName, + run_name: runData?.run_spec.run_name ?? '', + run_id: runData?.id ?? '', + job_num: jobData?.job_spec.job_num ?? 0, + limit: 1000, + }); + + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refetchMetrics); + + const statusType = isLoading || isLoadingRun ? 
'loading' : 'finished'; + + useEffect(() => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + if (runError?.status === 404) { + riseRouterException(); + } + }, [runError]); + + const defaultChartProps = { + height: 300, + xTitle: 'Time', + empty: ( + + No data available + + There is no data available + + + ), + noMatch: ( + + No matching data + + There is no matching data to display + + + ), + }; + + return ( + + {t('projects.run.metrics.cpu_utilization')}}> + + + + {t('projects.run.metrics.memory_used')}}> + + + + {t('projects.run.metrics.per_each_cpu_utilization')}}> + + + + {t('projects.run.metrics.per_each_memory_used')}}> + + + + ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts b/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts new file mode 100644 index 0000000000..70bc1d6e5a --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts @@ -0,0 +1,84 @@ +import { useMemo } from 'react'; + +import { useGetMetricsQuery } from 'services/run'; + +import { + ALL_CPU_USAGE, + CPU_NUMS, + EACH_GPU_MEMORY_TOTAL, + EACH_GPU_MEMORY_USAGE_PREFIX, + EACH_GPU_USAGE_PREFIX, + GByte, + MEMORY_TOTAL, + MEMORY_WORKING_SET, +} from './consts'; +import { bytesFormatter, getChartProps } from './helpers'; + +export const useMetricsData = (params: TJobMetricsRequestParams) => { + const { + data: metricsData, + isLoading, + refetch, + } = useGetMetricsQuery(params, { + skip: !params.run_name, + }); + + const cpuChartProps = useMemo(() => { + const metricItem = metricsData?.find((i) => i.name === ALL_CPU_USAGE); + const numsMetricItem = metricsData?.find((i) => i.name === CPU_NUMS); + + return getChartProps({ + metricItems: metricItem ? [metricItem] : [], + renderTitle: () => 'CPU utilization %', + yValueFormater: (value, index) => { + return parseFloat((value / (numsMetricItem?.values?.[index] ?? 
1)).toFixed(2)); + }, + yDomain: [0, 100], + }); + }, [metricsData]); + + const memoryChartProps = useMemo(() => { + const metricItem = metricsData?.find((i) => i.name === MEMORY_WORKING_SET); + const totalMetricItem = metricsData?.find((i) => i.name === MEMORY_TOTAL); + + const totalMemory = totalMetricItem?.values[0]; + + return getChartProps({ + metricItems: metricItem ? [metricItem] : [], + renderTitle: () => 'Memory used', + valueFormatter: bytesFormatter, + customSeries: totalMetricItem?.values?.length + ? [{ title: 'Memory total', type: 'threshold', valueFormatter: bytesFormatter, y: totalMemory }] + : undefined, + yDomain: [0, totalMemory ? totalMemory + GByte : 128 * GByte], + }); + }, [metricsData]); + + const eachGPUChartProps = useMemo(() => { + const metricItems = metricsData?.filter((i) => i.name.indexOf(EACH_GPU_USAGE_PREFIX) > -1) ?? []; + + return getChartProps({ + metricItems, + renderTitle: (index) => `GPU utilization % GPU${index}`, + yDomain: [0, 100], + }); + }, [metricsData]); + + const eachGPUMemoryChartProps = useMemo(() => { + const metricItems = metricsData?.filter((i) => i.name.indexOf(EACH_GPU_MEMORY_USAGE_PREFIX) > -1) ?? []; + const totalMetricItem = metricsData?.find((i) => i.name === EACH_GPU_MEMORY_TOTAL); + const totalMemory = totalMetricItem?.values[0]; + + return getChartProps({ + metricItems, + renderTitle: (index) => `Memory used GPU${index}`, + valueFormatter: bytesFormatter, + customSeries: totalMetricItem?.values?.length + ? [{ title: 'Memory total', type: 'threshold', valueFormatter: bytesFormatter, y: totalMemory }] + : undefined, + yDomain: [0, totalMemory ? 
totalMemory + GByte : 128 * GByte], + }); + }, [metricsData]); + + return { cpuChartProps, eachGPUChartProps, memoryChartProps, eachGPUMemoryChartProps, isLoading, refetch }; +}; diff --git a/frontend/src/pages/Runs/Details/Logs/components/LogRow/index.tsx b/frontend/src/pages/Runs/Details/Logs/components/LogRow/index.tsx new file mode 100644 index 0000000000..4393a493e5 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Logs/components/LogRow/index.tsx @@ -0,0 +1,57 @@ +import React, { useEffect, useRef, useState } from 'react'; +import cn from 'classnames'; + +import { Icon } from 'components'; + +import styles from '../../styles.module.scss'; + +export const LogRow: React.FC<{ + logItem: ILogItem; + isShowTimestamp?: boolean; +}> = ({ logItem, isShowTimestamp }) => { + const [collapsed, setCollapsed] = useState(true); + const [showChevron, setShowChevron] = useState(true); + const messageInnerRef = useRef(null); + + const toggleCollapsed = () => setCollapsed((val) => !val); + + useEffect(() => { + const observeTarget = messageInnerRef.current; + if (!observeTarget) return; + + const resizeObserver = new ResizeObserver((entries) => { + const entry = entries[0]; + if (entry) { + const { height } = entry.contentRect; + + setShowChevron(height > 32); + } + }); + + resizeObserver.observe(observeTarget); + + return () => { + resizeObserver.unobserve(observeTarget); + }; + }, []); + + return ( + + {isShowTimestamp && ( + + )} + + + ); +}; diff --git a/frontend/src/pages/Runs/Details/Logs/helpers.ts b/frontend/src/pages/Runs/Details/Logs/helpers.ts new file mode 100644 index 0000000000..94ee018676 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Logs/helpers.ts @@ -0,0 +1,24 @@ +export const getJobSubmissionId = (run?: IRun): string | undefined => { + if (!run) return; + + const lastJob = run.jobs[run.jobs.length - 1]; + + if (!lastJob) return; + + return lastJob.job_submissions[lastJob.job_submissions.length - 1]?.id; +}; + +export const decodeLogs = (logs: 
ILogItem[]): ILogItem[] => { + return logs.map((log: ILogItem) => { + let { message } = log; + + try { + message = atob(message); + // eslint-disable-next-line @typescript-eslint/no-unused-vars + } catch (e) { + return log; + } + + return { ...log, message }; + }); +}; diff --git a/frontend/src/pages/Runs/Details/Logs/index.tsx b/frontend/src/pages/Runs/Details/Logs/index.tsx new file mode 100644 index 0000000000..4d12520bba --- /dev/null +++ b/frontend/src/pages/Runs/Details/Logs/index.tsx @@ -0,0 +1,272 @@ +import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react'; +import { useListener } from 'react-bus'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import classNames from 'classnames'; + +import { Box, Button, Code, Container, Header, ListEmptyMessage, Loader, TextContent } from 'components'; + +import { useLocalStorageState } from 'hooks/useLocalStorageState'; +import { useLazyGetProjectLogsQuery } from 'services/project'; +import { useGetRunQuery } from 'services/run'; + +import { LogRow } from './components/LogRow'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../constants'; +import { decodeLogs, getJobSubmissionId } from './helpers'; + +import { IProps } from './types'; + +import styles from './styles.module.scss'; + +const LIMIT_LOG_ROWS = 100; +const LOADING_SCROLL_GAP = 300; + +export const Logs: React.FC = ({ className, projectName, runName, jobSubmissionId }) => { + const { t } = useTranslation(); + const codeRef = useRef(null); + const nextTokenRef = useRef(undefined); + const scrollPositionByBottom = useRef(0); + + const [logsData, setLogsData] = useState([]); + const [externalLink, setExternalLink] = useState(); + const [isLoading, setIsLoading] = useState(false); + const [getProjectLogs] = useLazyGetProjectLogsQuery(); + const [isEnabledDecoding, setIsEnabledDecoding] = useLocalStorageState('enable-encode-logs', false); + const [isShowTimestamp, 
setIsShowTimestamp] = useLocalStorageState('enable-showing-timestamp-logs', false); + + const logsForView = useMemo(() => { + if (isEnabledDecoding) { + return decodeLogs(logsData); + } + + return logsData; + }, [logsData, isEnabledDecoding]); + + const saveScrollPositionByBottom = () => { + if (!codeRef.current) return; + + const { clientHeight, scrollHeight, scrollTop } = codeRef.current; + scrollPositionByBottom.current = scrollHeight - clientHeight - scrollTop; + }; + + const restoreScrollPositionByBottom = () => { + if (!codeRef.current) return; + + const { clientHeight, scrollHeight } = codeRef.current; + codeRef.current.scrollTo(0, scrollHeight - clientHeight - scrollPositionByBottom.current); + }; + + const checkNeedMoreLoadingData = () => { + if (!codeRef.current) return; + + const { clientHeight, scrollHeight } = codeRef.current; + + if (scrollHeight - clientHeight <= LOADING_SCROLL_GAP) { + getNextLogItems(); + } + }; + + const getLogItems = (nextToken?: string) => { + setIsLoading(true); + + if (!jobSubmissionId) { + return; + } + + getProjectLogs({ + project_name: projectName, + run_name: runName, + descending: true, + job_submission_id: jobSubmissionId, + next_token: nextToken, + limit: LIMIT_LOG_ROWS, + }) + .unwrap() + .then((response) => { + saveScrollPositionByBottom(); + const reversed = response.logs.toReversed(); + + if (nextToken) { + setLogsData((old) => [...reversed, ...old]); + } else { + setLogsData(reversed); + setExternalLink(response.external_url); + } + + nextTokenRef.current = response.next_token; + setIsLoading(false); + }) + .catch(() => setIsLoading(false)); + }; + + const getNextLogItems = () => { + if (nextTokenRef.current) { + getLogItems(nextTokenRef.current); + } + }; + + const toggleDecodeLogs = () => { + saveScrollPositionByBottom(); + setIsEnabledDecoding(!isEnabledDecoding); + }; + + const toggleShowingTimestamp = () => { + setIsShowTimestamp(!isShowTimestamp); + }; + + useEffect(() => { + getLogItems(); + }, []); + + 
const refreshLogs = useCallback(() => getLogItems(), []); + + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refreshLogs); + + useLayoutEffect(() => { + if (logsForView.length && logsForView.length <= LIMIT_LOG_ROWS) { + scrollToBottom(); + } else { + restoreScrollPositionByBottom(); + } + + if (logsForView.length) checkNeedMoreLoadingData(); + }, [logsForView]); + + const onScroll = useCallback( + (event) => { + const element = event.target as HTMLDivElement; + + if (element.scrollTop <= LOADING_SCROLL_GAP && !isLoading) { + getNextLogItems(); + } + }, + [isLoading, logsForView], + ); + + useEffect(() => { + if (!codeRef.current) return; + + codeRef.current.addEventListener('scroll', onScroll); + + return () => { + if (codeRef.current) codeRef.current.removeEventListener('scroll', onScroll); + }; + }, [codeRef.current, onScroll]); + + const scrollToBottom = () => { + if (!codeRef.current) return; + + const { clientHeight, scrollHeight } = codeRef.current; + codeRef.current.scrollTo(0, scrollHeight - clientHeight); + }; + + return ( +
+ +
+
{t('projects.run.log')}
+
+ + {externalLink && ( +
+ + } + > + + {!isLoading && !logsForView.length && ( + + )} + + {!logsForView.length && } + + {Boolean(logsForView.length) && ( + +
+ + + {' '} + {new Date(logItem.timestamp).toISOString()} + +
+
+ {logItem.message} +
+
+
+ + {logsForView.map((log, i) => ( + + ))} + +
+ + )} + + +
+ ); +}; + +const getJobSubmissionIdFromJobData = (job?: IJob): string | undefined => { + if (!job) return; + + return job.job_submissions[job.job_submissions.length - 1]?.id; +}; + +export const JobLogs = () => { + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? ''; + + const { data: runData, isLoading: isLoadingRun } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + const jobData = useMemo(() => { + if (!runData) return null; + + return runData.jobs.find((job) => job.job_spec.job_name === paramJobName) ?? null; + }, [runData]); + + if (isLoadingRun) + return ( + + + + ); + + return ( + + ); +}; diff --git a/frontend/src/pages/Runs/Details/Logs/styles.module.scss b/frontend/src/pages/Runs/Details/Logs/styles.module.scss new file mode 100644 index 0000000000..541da47870 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Logs/styles.module.scss @@ -0,0 +1,137 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.headerContainer { + display: flex; + gap: 10px; + align-items: center; + padding-top: 4px; + + .headerTitle { + flex-shrink: 0; + margin-top: -4px; + } + + .switchers { + margin-left: auto; + display: flex; + gap: 10px; + + button { + width: 32px !important; + } + } +} + +.loader { + position: relative; + height: 20px; + background-color: rgba(awsui.$color-background-container-content, .8); + color: #6e6e6e; +} + +.mainLoader { + margin-top: auto; + margin-bottom: auto; + transform: translateY(-24px); +} + +.logsPage { + flex-grow: 1; + min-height: 0; + max-height: calc(100vh - 258px); +} + +.logs { + display: flex; + flex-direction: column; + & > [class^="awsui_root"] { + display: flex; + flex-direction: column; + flex-grow: 1; + min-height: 0; + + & > [class^="awsui_content-wrapper"] { + flex-grow: 1; + min-height: 0; + + & > [class^="awsui_header"] { + flex-shrink: 0; + } + + & > 
[class^="awsui_content"] { + display: flex; + flex-direction: column; + flex-grow: 1; + min-height: 0; + + & > [class^="awsui_content-inner"] { + display: flex; + flex-direction: column; + flex-grow: 1; + min-height: 0; + + & > [class^="awsui_text-content"] { + overflow: hidden; + position: relative; + display: flex; + flex-direction: column; + flex-grow: 1; + min-height: 20px; + } + } + + .terminal { + flex-grow: 1; + min-height: 0; + height: 0; + overflow-y: auto; + background-color: awsui.$color-background-layout-main; + + code { + color: awsui.$color-text-body-default !important; + } + + .logItem { + font-size: awsui.$font-size-body-s !important; + line-height: awsui.$line-height-body-s !important; + + .toggleCollapse { + position: relative; + top: -3px; + vertical-align: middle; + cursor: pointer; + + &.hidden { + opacity: 0; + pointer-events: none; + } + } + + .timestamp { + vertical-align: top; + padding-right: 16px; + white-space: nowrap; + } + + .messageCol { + vertical-align: top; + } + + .message { + overflow-y: hidden; + &.collapsed { + max-height: calc(awsui.$line-height-body-s * 2); + } + } + } + } + + .scroll { + overflow-y: auto; + flex-grow: 1; + min-height: 0; + } + } + } + } +} diff --git a/frontend/src/pages/Runs/Details/Logs/types.ts b/frontend/src/pages/Runs/Details/Logs/types.ts new file mode 100644 index 0000000000..5932896a52 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Logs/types.ts @@ -0,0 +1,6 @@ +export interface IProps { + projectName: string; + runName: string; + jobSubmissionId?: string | null; + className?: string; +} diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx new file mode 100644 index 0000000000..9e03c6b6c2 --- /dev/null +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx @@ -0,0 +1,305 @@ +import React, { FC } from 'react'; +import { 
useTranslation } from 'react-i18next'; + +import { Alert, Box, Button, Code, ExpandableSection, Popover, SpaceBetween, StatusIndicator, Tabs, Wizard } from 'components'; + +import { copyToClipboard } from 'libs'; + +import { useConfigProjectCliCommand } from 'pages/Project/hooks/useConfigProjectCliComand'; +import { getIDEDisplayName } from 'pages/Runs/Launch/constants'; + +import styles from './styles.module.scss'; + +const UvInstallCommand = 'uv tool install dstack -U'; +const PipInstallCommand = 'pip install dstack -U'; + +export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) => { + const { t } = useTranslation(); + const [isExpandedConnectSection, setIsExpandedConnectSection] = React.useState(true); + + const getAttachCommand = (runData: IRun) => { + const attachCommand = `dstack attach ${runData.run_spec.run_name} --logs`; + + const copyAttachCommand = () => { + copyToClipboard(attachCommand); + }; + + return [attachCommand, copyAttachCommand] as const; + }; + + const getSSHCommand = (runData: IRun) => { + const sshCommand = `ssh ${runData.run_spec.run_name}`; + + const copySSHCommand = () => { + copyToClipboard(sshCommand); + }; + + return [sshCommand, copySSHCommand] as const; + }; + + const [activeStepIndex, setActiveStepIndex] = React.useState(0); + const [attachCommand, copyAttachCommand] = getAttachCommand(run); + const [sshCommand, copySSHCommand] = getSSHCommand(run); + + const configuration = run.run_spec.configuration as TDevEnvironmentConfiguration; + const hasIDE = !!configuration.ide; + // The IDE deep link is built server-side, per IDE, in JobConnectionInfo.attached_ide_url + // (e.g. `zed://ssh/...` for Zed vs `...//vscode-remote/ssh-remote+...` for VS Code forks). + // It is set once the job is running and reachable via the SSH config alias created by + // `dstack attach`. The UI always talks to a same-version server, so no fallback is needed. + const openInIDEUrl = run.jobs[0]?.job_connection_info?.attached_ide_url ?? 
undefined; + const ideDisplayName = hasIDE ? getIDEDisplayName(configuration.ide!) : undefined; + + const [configCliCommand, copyCliCommand] = useConfigProjectCliCommand({ projectName: run.project_name }); + + return ( + setIsExpandedConnectSection(detail.expanded)} + > + {run.status === 'running' && ( + `Step ${stepNumber}`, + collapsedStepsLabel: (stepNumber, stepsCount) => `Step ${stepNumber} of ${stepsCount}`, + skipToButtonLabel: (step) => `Skip to ${step.title}`, + navigationAriaLabel: 'Steps', + previousButton: 'Previous', + nextButton: 'Next', + optional: 'required', + }} + onNavigate={({ detail }) => setActiveStepIndex(detail.requestedStepIndex)} + activeStepIndex={activeStepIndex} + onSubmit={() => setIsExpandedConnectSection(false)} + submitButtonText="Done" + allowSkipTo + steps={[ + { + title: 'Attach', + description: 'To access this run, first you need to attach to it.', + content: ( + +
+ {attachCommand} + +
+ {t('common.copied')}} + > +
+
+ + + + + To use dstack, install the CLI on your local machine. + + +
+ {UvInstallCommand} + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + { + label: 'pip', + id: 'pip', + content: ( + <> +
+ {PipInstallCommand} + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + ]} + /> + + And then configure the project. + +
+ {configCliCommand} + +
+ + {t('common.copied')} + + } + > +
+
+
+
+
+ ), + isOptional: true, + }, + hasIDE + ? { + title: 'Open', + description: `After the CLI is attached, you can open the dev environment in ${ideDisplayName}.`, + content: ( + + + + + + +
+ {sshCommand} + +
+ + {t('common.copied')} + + } + > +
+
+
+
+
+ ), + isOptional: true, + } + : { + title: 'Connect via SSH', + description: 'After the CLI is attached, you can connect to the dev environment via SSH.', + content: ( +
+ {sshCommand} + +
+ {t('common.copied')} + } + > +
+
+ ), + isOptional: true, + }, + ]} + /> + )} + + {run.status !== 'running' && ( + + + Waiting for the run to start. + + )} +
+ ); +}; diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/styles.module.scss b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/styles.module.scss new file mode 100644 index 0000000000..d01f54b264 --- /dev/null +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/styles.module.scss @@ -0,0 +1,13 @@ +.codeWrapper { + position: relative; + + .code { + padding: 16px 12px; + } + + .copy { + position: absolute; + top: 10px; + right: 8px; + } +} diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx new file mode 100644 index 0000000000..740e8c1b81 --- /dev/null +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx @@ -0,0 +1,97 @@ +import React, { FC } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Alert, Box, Button, ExpandableSection, Link, Popover, SpaceBetween, StatusIndicator, Wizard } from 'components'; + +import { copyToClipboard } from 'libs'; +import { getRunProbeStatuses } from 'libs/run'; + +import { getRunListItemServiceUrl } from '../../../List/helpers'; + +export const ConnectToServiceRun: FC<{ run: IRun }> = ({ run }) => { + const { t } = useTranslation(); + const [isExpandedEndpointSection, setIsExpandedEndpointSection] = React.useState(true); + const [activeStepIndex, setActiveStepIndex] = React.useState(0); + const serviceUrl = getRunListItemServiceUrl(run); + const probeStatuses = getRunProbeStatuses(run); + const hasProbes = probeStatuses.length > 0; + const allProbesReady = hasProbes && probeStatuses.every((s) => s === 'success'); + const serviceReady = run.status === 'running' && (!hasProbes || allProbesReady) && serviceUrl; + + return ( + setIsExpandedEndpointSection(detail.expanded)} + > + {run.status !== 'running' && ( + + + Waiting for the service to start. 
+ + )} + + {run.status === 'running' && !serviceReady && ( + + + Waiting for the service to become ready. + + )} + + {serviceReady && ( + `Step ${stepNumber}`, + collapsedStepsLabel: (stepNumber, stepsCount) => `Step ${stepNumber} of ${stepsCount}`, + skipToButtonLabel: (step) => `Skip to ${step.title}`, + navigationAriaLabel: 'Steps', + previousButton: 'Previous', + nextButton: 'Next', + optional: 'required', + }} + onNavigate={({ detail }) => setActiveStepIndex(detail.requestedStepIndex)} + activeStepIndex={activeStepIndex} + onSubmit={() => setIsExpandedEndpointSection(false)} + submitButtonText="Done" + allowSkipTo + steps={[ + { + title: 'Open', + description: 'Open the service endpoint.', + content: ( + + {t('common.copied')}} + > +
+ + + + + + To use dstack, install the CLI on your local machine. + + +
+ {UvInstallCommand} + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + { + label: 'pip', + id: 'pip', + content: ( + <> +
+ {PipInstallCommand} + +
+ + {t('common.copied')} + + } + > +
+
+ + ), + }, + ]} + /> + + And then configure the project. + +
+ {configCliCommand} + +
+ + {t('common.copied')} + + } + > +
+
+
+
+ + ), + isOptional: true, + }, + ...(mappedAppSpecs.length > 0 + ? [ + { + title: 'Open', + description: 'After the CLI is attached, use the forwarded localhost URLs.', + content: ( + + {mappedAppSpecs.map((spec) => { + const mappedPort = getMappedPort(spec)!; + const localUrl = `http://127.0.0.1:${mappedPort}`; + + return ( + + {t('common.copied')} + + } + > + + + + + {/**/} + + + + + ) : ( + t('runs.launch.wizard.template_placeholder') + ) + } + cardsPerRow={[{ cards: 1 }, { minWidth: 400, cards: 2 }, { minWidth: 800, cards: 3 }]} + onSelectionChange={onChangeTemplate} + /> + + + ), + }, + + { + title: 'Resources', + content: ( + + } + /> + ), + }, + + { + title: 'Settings', + content: , + }, + + { + title: 'Configuration', + content: ( + + openHelpPanel(CONFIGURATION_INFO)} />} + name={FORM_FIELD_NAMES.config_yaml} + language="yaml" + loading={loading} + editorContentHeight={600} + /> + + ), + }, + ]} + /> + + ); +}; diff --git a/frontend/src/pages/Runs/Launch/styles.module.scss b/frontend/src/pages/Runs/Launch/styles.module.scss new file mode 100644 index 0000000000..fac3b62f06 --- /dev/null +++ b/frontend/src/pages/Runs/Launch/styles.module.scss @@ -0,0 +1,18 @@ +@use '@cloudscape-design/design-tokens/index' as awsui; + +.noFleetAlert { + margin: 12px 0 0; +} + +.wizardForm { + & [class^="awsui_wizard"] { + & [class^="awsui_footer"] { + position: sticky; + bottom: 0; + background-color: awsui.$color-background-layout-main; + margin-block-start: 0 !important; + padding-top: awsui.$space-scaled-l; + z-index: 100; + } + } +} diff --git a/frontend/src/pages/Runs/Launch/types.ts b/frontend/src/pages/Runs/Launch/types.ts new file mode 100644 index 0000000000..e0a1986946 --- /dev/null +++ b/frontend/src/pages/Runs/Launch/types.ts @@ -0,0 +1,19 @@ +export interface IRunEnvironmentFormValues { + project: IProject['project_name']; + template: string[]; + gpu_enabled?: boolean; + offer?: IGpu; + name: string; + ide?: 'cursor' | 'vscode' | 
'windsurf' | 'zed'; + config_yaml: string; + image?: string; + python?: string; + repo_enabled?: boolean; + repo_url?: string; + repo_path?: string; + working_dir?: string; + password?: string; + password_copied?: boolean; +} + +export type IRunEnvironmentFormKeys = keyof Required; diff --git a/frontend/src/pages/Runs/List/Preferences/consts.ts b/frontend/src/pages/Runs/List/Preferences/consts.ts new file mode 100644 index 0000000000..1e95dfb706 --- /dev/null +++ b/frontend/src/pages/Runs/List/Preferences/consts.ts @@ -0,0 +1,27 @@ +import { CollectionPreferencesProps } from 'components'; + +export const DEFAULT_PREFERENCES: CollectionPreferencesProps.Preferences = { + pageSize: 30, + contentDisplay: [ + { id: 'run_name', visible: true }, + { id: 'resources', visible: true }, + { id: 'status', visible: true }, + { id: 'hub_user_name', visible: true }, + { id: 'submitted_at', visible: true }, + { id: 'finished_at', visible: true }, + { id: 'error', visible: true }, + { id: 'price', visible: true }, + { id: 'cost', visible: true }, + { id: 'spot', visible: true }, + { id: 'backend', visible: true }, + { id: 'region', visible: true }, + // hidden by default + { id: 'priority', visible: false }, + { id: 'project', visible: false }, + { id: 'repo', visible: false }, + { id: 'instance', visible: false }, + ], + wrapLines: false, + stripedRows: false, + contentDensity: 'comfortable', +}; diff --git a/frontend/src/pages/Runs/List/Preferences/index.tsx b/frontend/src/pages/Runs/List/Preferences/index.tsx new file mode 100644 index 0000000000..7f02fe1da5 --- /dev/null +++ b/frontend/src/pages/Runs/List/Preferences/index.tsx @@ -0,0 +1,42 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { CollectionPreferences } from 'components'; + +import { useRunListPreferences } from './useRunListPreferences'; + +export const Preferences: React.FC = () => { + const { t } = useTranslation(); + const [preferences, setPreferences] = 
useRunListPreferences(); + + return ( + setPreferences(detail)} + cancelLabel={t('common.cancel')} + confirmLabel={t('common.save')} + contentDisplayPreference={{ + title: t('common.select_visible_columns'), + options: [ + { id: 'run_name', label: t('projects.run.run_name'), alwaysVisible: true }, + { id: 'resources', label: t('projects.run.resources') }, + { id: 'spot', label: t('projects.run.spot') }, + { id: 'price', label: t('projects.run.price') }, + { id: 'submitted_at', label: t('projects.run.submitted_at') }, + { id: 'status', label: t('projects.run.status') }, + { id: 'error', label: t('projects.run.error') }, + { id: 'cost', label: t('projects.run.cost') }, + // hidden by default + { id: 'priority', label: t('projects.run.priority') }, + { id: 'finished_at', label: t('projects.run.finished_at') }, + { id: 'project', label: t('projects.run.project') }, + { id: 'hub_user_name', label: t('projects.run.hub_user_name') }, + { id: 'repo', label: t('projects.run.repo') }, + { id: 'instance', label: t('projects.run.instance') }, + { id: 'region', label: t('projects.run.region') }, + { id: 'backend', label: t('projects.run.backend') }, + ], + }} + /> + ); +}; diff --git a/frontend/src/pages/Runs/List/Preferences/useRunListPreferences.ts b/frontend/src/pages/Runs/List/Preferences/useRunListPreferences.ts new file mode 100644 index 0000000000..148eedbd82 --- /dev/null +++ b/frontend/src/pages/Runs/List/Preferences/useRunListPreferences.ts @@ -0,0 +1,14 @@ +import { CollectionPreferencesProps } from 'components'; + +import { useLocalStorageState } from 'hooks/useLocalStorageState'; + +import { DEFAULT_PREFERENCES } from './consts'; + +export const useRunListPreferences = () => { + const [preferences, setPreferences] = useLocalStorageState( + 'run-list-preferences', + DEFAULT_PREFERENCES, + ); + + return [preferences, setPreferences] as const; +}; diff --git a/frontend/src/pages/Runs/List/helpers.ts b/frontend/src/pages/Runs/List/helpers.ts new file mode 100644 index 
0000000000..28f29af288 --- /dev/null +++ b/frontend/src/pages/Runs/List/helpers.ts @@ -0,0 +1,111 @@ +import { groupBy as _groupBy } from 'lodash'; + +import { formatBackend } from 'libs/fleet'; +import { formatResources } from 'libs/resources'; + +import { getBaseUrl } from 'App/helpers'; + +import { finishedJobs, finishedRunStatuses } from '../constants'; +import { getJobStatus } from '../Details/Jobs/List/helpers'; + +export const getGroupedRunsByProjectAndRepoID = (runs: IRun[]) => { + return _groupBy(runs, ({ project_name }) => project_name); +}; + +export const getRunListItemResources = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + const resources = run.latest_job_submission?.job_provisioning_data?.instance_type?.resources; + return resources ? formatResources(resources) : '-'; +}; + +export const getRunListItemSpotLabelKey = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + if (run.latest_job_submission?.job_provisioning_data?.instance_type?.resources?.spot) { + return 'common.yes'; + } + + return 'common.no'; +}; + +export const getRunListItemSpot = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + return run.latest_job_submission?.job_provisioning_data?.instance_type?.resources?.spot?.toString() ?? '-'; +}; + +export const getRunListItemPrice = (run: IRun) => { + const unFinishedJobs = run.jobs.filter((job) => !finishedJobs.includes(getJobStatus(job))); + + if (run.jobs.length > 1) { + return `$${unFinishedJobs.reduce((acc, job) => { + const price = job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.price; + + if (price) acc += price; + + return acc; + }, 0)}`; + } + + return run.latest_job_submission?.job_provisioning_data?.price + ? 
`$${run.latest_job_submission?.job_provisioning_data?.price}` + : null; +}; + +export const getRunListItemInstance = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + return run.latest_job_submission?.job_provisioning_data?.instance_type?.name; +}; + +export const getRunListItemInstanceId = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + return run.latest_job_submission?.job_provisioning_data?.instance_id ?? '-'; +}; + +export const getRunListItemRegion = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + return run.latest_job_submission?.job_provisioning_data?.region ?? '-'; +}; + +export const getRunListItemBackend = (run: IRun) => { + if (run.jobs.length > 1) { + return '-'; + } + + return formatBackend(run.latest_job_submission?.job_provisioning_data?.backend); +}; + +export const getRunListItemServiceUrl = (run: IRun) => { + const url = run.service?.url; + if (!url) return null; + return url.startsWith('/') ? `${getBaseUrl()}${url}` : url; +}; + +export const getRunListItemSchedule = (run: IRun) => { + if (run.run_spec.configuration.type != 'task' || !run.run_spec.configuration.schedule) return null; + + return run.run_spec.configuration.schedule.cron.join(', '); +}; + +export const getRunListFinishedAt = (run: IRun) => { + if (!run.latest_job_submission || !run.latest_job_submission.finished_at || !finishedRunStatuses.includes(run.status)) { + return null; + } + return run.latest_job_submission.finished_at; +}; diff --git a/frontend/src/pages/Runs/List/hooks/index.ts b/frontend/src/pages/Runs/List/hooks/index.ts new file mode 100644 index 0000000000..00683e1f99 --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/index.ts @@ -0,0 +1,6 @@ +export * from './useColumnsDefinitions'; +export * from './useStopRuns'; +export * from './useDeleteRuns'; +export * from './useDisabledStatesForButtons'; +export * from './useEmptyMessages'; +export * from './useFilters'; diff --git 
a/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx b/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx new file mode 100644 index 0000000000..285c29ad9f --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx @@ -0,0 +1,149 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { format } from 'date-fns'; + +import { NavigateLink, StatusIndicator } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { + getRepoNameFromRun, + getRunError, + getRunPriority, + getRunStatusMessage, + getStatusIconColor, + getStatusIconType, +} from 'libs/run'; +import { ROUTES } from 'routes'; + +import { finishedRunStatuses } from 'pages/Runs/constants'; + +import { + getRunListFinishedAt, + getRunListItemBackend, + getRunListItemInstance, + getRunListItemPrice, + getRunListItemRegion, + getRunListItemResources, + getRunListItemSpotLabelKey, +} from '../helpers'; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns = [ + { + id: 'run_name', + header: t('projects.run.run_name'), + cell: (item: IRun) => { + return item.id !== null ? ( + + {item.run_spec.run_name} + + ) : ( + item.run_spec.run_name + ); + }, + }, + { + id: 'project', + header: `${t('projects.run.project')}`, + cell: (item: IRun) => ( + {item.project_name} + ), + }, + { + id: 'repo', + header: `${t('projects.run.repo')}`, + cell: (item: IRun) => getRepoNameFromRun(item), + }, + { + id: 'hub_user_name', + header: `${t('projects.run.hub_user_name')}`, + cell: (item: IRun) => {item.user}, + }, + { + id: 'submitted_at', + header: t('projects.run.submitted_at'), + cell: (item: IRun) => format(new Date(item.submitted_at), DATE_TIME_FORMAT), + }, + { + id: 'finished_at', + header: t('projects.run.finished_at'), + cell: (item: IRun) => { + const finishedAt = getRunListFinishedAt(item); + return finishedAt ? 
format(new Date(finishedAt), DATE_TIME_FORMAT) : '-'; + }, + }, + { + id: 'status', + header: t('projects.run.status'), + cell: (item: IRun) => { + const status = finishedRunStatuses.includes(item.status) + ? (item.latest_job_submission?.status ?? item.status) + : item.status; + const terminationReason = finishedRunStatuses.includes(item.status) + ? item.latest_job_submission?.termination_reason + : null; + const statusMessage = getRunStatusMessage(item); + + return ( + + {statusMessage} + + ); + }, + }, + { + id: 'error', + header: t('projects.run.error'), + cell: (item: IRun) => getRunError(item), + }, + { + id: 'priority', + header: t('projects.run.priority'), + cell: (item: IRun) => getRunPriority(item), + }, + { + id: 'cost', + header: `${t('projects.run.cost')}`, + cell: (item: IRun) => { + return `$${item.cost}`; + }, + }, + { + id: 'resources', + header: `${t('projects.run.resources')}`, + cell: getRunListItemResources, + }, + { + id: 'spot', + header: `${t('projects.run.spot')}`, + cell: (item: IRun) => t(getRunListItemSpotLabelKey(item)), + }, + { + id: 'price', + header: `${t('projects.run.price')}`, + cell: getRunListItemPrice, + }, + { + id: 'instance', + header: `${t('projects.run.instance')}`, + cell: getRunListItemInstance, + }, + { + id: 'region', + header: `${t('projects.run.region')}`, + cell: getRunListItemRegion, + }, + { + id: 'backend', + header: `${t('projects.run.backend')}`, + cell: getRunListItemBackend, + }, + ]; + + return { columns } as const; +}; diff --git a/frontend/src/pages/Runs/List/hooks/useDeleteRuns.ts b/frontend/src/pages/Runs/List/hooks/useDeleteRuns.ts new file mode 100644 index 0000000000..cb8cf4a778 --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useDeleteRuns.ts @@ -0,0 +1,40 @@ +import { useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { useDeleteRunsMutation } from 'services/run'; + +import { 
getGroupedRunsByProjectAndRepoID } from '../helpers'; + +export const useDeleteRuns = () => { + const { t } = useTranslation(); + const [deleteRun, { isLoading: isDeleting }] = useDeleteRunsMutation(); + const [pushNotification] = useNotifications(); + + const deleteRuns = useCallback((runs: IRun[]) => { + const groupedRuns = getGroupedRunsByProjectAndRepoID(runs); + + const request = Promise.all( + Object.keys(groupedRuns).map((key) => { + const runsGroup = groupedRuns[key]; + + return deleteRun({ + project_name: runsGroup[0].project_name, + runs_names: runsGroup.map((item) => item.run_spec.run_name), + }).unwrap(); + }), + ); + + request.catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }, []); + + return { deleteRuns, isDeleting } as const; +}; diff --git a/frontend/src/pages/Runs/List/hooks/useDisabledStatesForButtons.ts b/frontend/src/pages/Runs/List/hooks/useDisabledStatesForButtons.ts new file mode 100644 index 0000000000..38aa41611d --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useDisabledStatesForButtons.ts @@ -0,0 +1,33 @@ +import { useMemo } from 'react'; + +import { isAvailableAbortingForRun, isAvailableDeletingForRun, isAvailableStoppingForRun } from '../../utils'; + +type hookArgs = { + selectedRuns?: readonly IRun[]; + isStopping?: boolean; + isAborting?: boolean; + isDeleting?: boolean; +}; +export const useDisabledStatesForButtons = ({ selectedRuns, isStopping, isAborting, isDeleting }: hookArgs) => { + const isRunningOperation = Boolean(isStopping || isAborting || isDeleting); + + const isDisabledAbortButton = useMemo(() => { + return ( + !selectedRuns?.length || selectedRuns.some((item) => !isAvailableAbortingForRun(item.status)) || isRunningOperation + ); + }, [selectedRuns, isRunningOperation]); + + const isDisabledStopButton = useMemo(() => { + return ( + !selectedRuns?.length || selectedRuns.some((item) => 
!isAvailableStoppingForRun(item.status)) || isRunningOperation + ); + }, [selectedRuns, isRunningOperation]); + + const isDisabledDeleteButton = useMemo(() => { + return ( + !selectedRuns?.length || selectedRuns.some((item) => !isAvailableDeletingForRun(item.status)) || isRunningOperation + ); + }, [selectedRuns, isRunningOperation]); + + return { isDisabledAbortButton, isDisabledStopButton, isDisabledDeleteButton } as const; +}; diff --git a/frontend/src/pages/Runs/List/hooks/useEmptyMessages.tsx b/frontend/src/pages/Runs/List/hooks/useEmptyMessages.tsx new file mode 100644 index 0000000000..d495f03976 --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useEmptyMessages.tsx @@ -0,0 +1,54 @@ +import React, { useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ListEmptyMessage } from 'components'; + +import { QUICK_START_URL } from 'consts'; +import { goToUrl } from 'libs'; + +export const useEmptyMessages = ({ + clearFilter, + noData, + isDisabledClearFilter, +}: { + clearFilter?: () => void; + noData?: boolean; + isDisabledClearFilter?: boolean; +}) => { + const { t } = useTranslation(); + + const renderEmptyMessage = useCallback<() => React.ReactNode>(() => { + if (noData && isDisabledClearFilter) { + return ( + + + + ); + } + + return ( + + + + ); + }, [noData, isDisabledClearFilter, clearFilter]); /* noData is read by the callback, so it must be a dependency or the branch above can use a stale value */ + + const renderNoMatchMessage = useCallback<() => React.ReactNode>(() => { + return ( + + + + ); + }, [isDisabledClearFilter, clearFilter]); + + return { renderEmptyMessage, renderNoMatchMessage } as const; +}; diff --git a/frontend/src/pages/Runs/List/hooks/useFilters.ts b/frontend/src/pages/Runs/List/hooks/useFilters.ts new file mode 100644 index 0000000000..7aac60969b --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useFilters.ts @@ -0,0 +1,150 @@ +import { useMemo, useState } from 'react'; +import { useSearchParams } from 'react-router-dom'; +import { ToggleProps } from '@cloudscape-design/components'; + +import type 
{ PropertyFilterProps } from 'components'; + +import { useLocalStorageState } from 'hooks'; +import { + EMPTY_QUERY, + getTokenAwareNamePatternFilterRequestParams, + requestParamsToTokens, + tokensToRequestParams, + tokensToSearchParams, +} from 'libs/filters'; +import { useLazyGetProjectsQuery } from 'services/project'; +import { useLazyGetUserListQuery } from 'services/user'; + +type RequestParamsKeys = keyof Pick; + +const filterKeys: Record = { + PROJECT_NAME: 'project_name', + USER_NAME: 'username', +}; + +const limit = 100; + +export const useFilters = () => { + const [searchParams, setSearchParams] = useSearchParams(); + const [onlyActive, setOnlyActive] = useLocalStorageState('run-list-filter-only-active', true); + const [filteringOptions, setFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getUsers] = useLazyGetUserListQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => + requestParamsToTokens({ searchParams, filterKeys }), + ); + + const clearFilter = () => { + setSearchParams({}); + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const filteringProperties = useMemo( + () => [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + { + key: filterKeys.USER_NAME, + operators: ['='], + propertyLabel: 'User', + groupValuesLabel: 'User values', + }, + ], + [], + ); + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + const { tokens, operation } = detail; + + const filteredTokens = tokens.filter((token, tokenIndex) => { + return !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex); + }); + + setSearchParams(tokensToSearchParams(filteredTokens, onlyActive)); + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const onChangeOnlyActive: 
ToggleProps['onChange'] = ({ detail }) => { + setOnlyActive(detail.checked); + }; + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + }); + + return { + ...params, + only_active: onlyActive, + } as Partial; + }, [propertyFilterQuery, onlyActive]); + + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setFilteringOptions([]); + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects( + getTokenAwareNamePatternFilterRequestParams({ + filteringText, + limit, + propertyKey: filterKeys.PROJECT_NAME, + tokens: propertyFilterQuery.tokens, + }), + ) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setFilteringOptions); + } + + if (filteringProperty?.key === filterKeys.USER_NAME) { + await getUsers( + getTokenAwareNamePatternFilterRequestParams({ + filteringText, + limit, + propertyKey: filterKeys.USER_NAME, + tokens: propertyFilterQuery.tokens, + }), + ) + .unwrap() + .then(({ data }) => + data.map(({ username }) => ({ + propertyKey: filterKeys.USER_NAME, + value: username, + })), + ) + .then(setFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + onlyActive, + onChangeOnlyActive, + filteringStatusType, + handleLoadItems, + } as const; +}; diff --git a/frontend/src/pages/Runs/List/hooks/useStopRuns.ts b/frontend/src/pages/Runs/List/hooks/useStopRuns.ts new file mode 100644 index 0000000000..d65d800275 --- /dev/null +++ b/frontend/src/pages/Runs/List/hooks/useStopRuns.ts @@ -0,0 +1,50 @@ +import { useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useNotifications 
} from 'hooks'; +import { getServerError } from 'libs'; +import { useStopRunsMutation } from 'services/run'; + +import { getGroupedRunsByProjectAndRepoID } from '../helpers'; + +export const useStopRuns = (isAborting?: boolean) => { + const { t } = useTranslation(); + const [stopRun, { isLoading: isStopping }] = useStopRunsMutation(); + const [pushNotification] = useNotifications(); + + const stopRuns = useCallback( + (runs: IRun[]) => { + const groupedRuns = getGroupedRunsByProjectAndRepoID(runs); + + const request = Promise.all( + Object.keys(groupedRuns).map((key) => { + const runsGroup = groupedRuns[key]; + + return stopRun({ + project_name: runsGroup[0].project_name, + runs_names: runsGroup.map((item) => item.run_spec.run_name), + abort: !!isAborting, + }).unwrap(); + }), + ); + + request.catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + return request; + }, + [isAborting], + ); + + return { stopRuns, isStopping } as const; +}; + +export const useAbortRuns = () => { + const { stopRuns: abortRuns, isStopping: isAborting } = useStopRuns(true); + + return { abortRuns, isAborting } as const; +}; diff --git a/frontend/src/pages/Runs/List/index.tsx b/frontend/src/pages/Runs/List/index.tsx new file mode 100644 index 0000000000..631eb07c82 --- /dev/null +++ b/frontend/src/pages/Runs/List/index.tsx @@ -0,0 +1,207 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; +import { ROUTES } from 'routes'; +import { useLazyGetRunsQuery } from 'services/run'; + +import { NoFleetProjectAlert } 
from 'pages/Project/components/NoFleetProjectAlert'; + +import { useRunListPreferences } from './Preferences/useRunListPreferences'; +import { + useAbortRuns, + useColumnsDefinitions, + useDeleteRuns, + useDisabledStatesForButtons, + useEmptyMessages, + useFilters, + useStopRuns, +} from './hooks'; +import { Preferences } from './Preferences'; + +import styles from './styles.module.scss'; + +export const RunList: React.FC = () => { + const { t } = useTranslation(); + const [preferences] = useRunListPreferences(); + const navigate = useNavigate(); + + useBreadcrumbs([ + { + text: t('projects.runs'), + href: ROUTES.RUNS.LIST, + }, + ]); + + const { + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringRequestParams, + onlyActive, + onChangeOnlyActive, + filteringStatusType, + handleLoadItems, + } = useFilters(); + const projectHavingFleetMap = useCheckingForFleetsInProjects({}); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetRunsQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE, job_submissions_limit: 1 }, + getPaginationParams: (lastRun) => ({ prev_submitted_at: lastRun.submitted_at }), + }); + + const { stopRuns, isStopping } = useStopRuns(); + const { abortRuns, isAborting } = useAbortRuns(); + const { + // deleteRuns, + isDeleting, + } = useDeleteRuns(); + + const { columns } = useColumnsDefinitions(); + + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ + clearFilter, + noData: !data.length, + isDisabledClearFilter: Object.keys(filteringRequestParams).length <= 1 && !filteringRequestParams.only_active, + }); + + const { items, actions, collectionProps } = useCollection(data ?? 
[], { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const { + isDisabledAbortButton, + isDisabledStopButton, + // isDisabledDeleteButton + } = useDisabledStatesForButtons({ + selectedRuns: selectedItems, + isStopping, + isAborting, + isDeleting, + }); + + const abortClickHandle = () => { + if (!selectedItems?.length) return; + + abortRuns([...selectedItems]).then(() => actions.setSelectedItems([])).catch(() => undefined); /* the hook already shows the error notification; this only prevents an unhandled rejection */ + }; + + const stopClickHandle = () => { + if (!selectedItems?.length) return; + + stopRuns([...selectedItems]).then(() => actions.setSelectedItems([])).catch(() => undefined); /* the hook already shows the error notification; this only prevents an unhandled rejection */ + }; + + // const deleteClickHandle = () => { + // if (!selectedItems?.length) return; + // + // deleteRuns([...selectedItems]).catch(console.log); + // }; + + const launchHandle = () => { + navigate( + `${ROUTES.RUNS.CREATE_DEV_ENV}${ + filteringRequestParams.project_name ? `?project_name=${encodeURIComponent(filteringRequestParams.project_name)}` : '' /* encode so reserved characters cannot corrupt the query string */ + }`, + ); + }; + const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); + + return ( + } + header={ + <> + +
+ + + + + + + {/**/} + +
+ + } + filter={ +
+
+ `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
+ +
+ + {t('projects.run.active_only')} + +
+
+ } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/List/styles.module.scss b/frontend/src/pages/Runs/List/styles.module.scss new file mode 100644 index 0000000000..0598087317 --- /dev/null +++ b/frontend/src/pages/Runs/List/styles.module.scss @@ -0,0 +1,43 @@ +.noFleetAlert { + margin-bottom: 12px; +} + +.selectFilters { + display: flex; + flex-wrap: wrap; + gap: 0 20px; + + .propertyFilter { + max-width: 640px; + flex-grow: 1; + min-width: 0; + } + + .activeOnly { + display: flex; + padding-top: 7px; + } + +} + +.emptyMessage { + display: flex; + flex-direction: column; + align-items: center; + + .emptyMessageHelp { + margin-bottom: 10px; + } + + .cliCommand { + position: relative; + padding-right: 50px; + margin-top: 10px; + + [data-class="copy-button"] { + position: absolute !important; + top: 2px; + right: 4px; + } + } +} diff --git a/frontend/src/pages/Runs/constants.ts b/frontend/src/pages/Runs/constants.ts new file mode 100644 index 0000000000..c38a440c83 --- /dev/null +++ b/frontend/src/pages/Runs/constants.ts @@ -0,0 +1,8 @@ +export const runStatusForDeleting: TJobStatus[] = ['failed', 'aborted', 'done', 'terminated']; +export const inActiveRunStatuses: TJobStatus[] = ['failed', 'aborted', 'done', 'terminated']; +export const runStatusForStopping: TJobStatus[] = ['submitted', 'provisioning', 'pulling', 'pending', 'running']; +export const runStatusForAborting: TJobStatus[] = ['submitted', 'provisioning', 'pulling', 'pending', 'running']; +export const unfinishedRuns: TJobStatus[] = ['running', 'terminating', 'pending']; +export const finishedJobs: TJobStatus[] = ['terminated', 'aborted', 'failed', 'done']; +// TODO: Replace TJobStatus with TRunStatus and remove all consts above +export const finishedRunStatuses: TJobStatus[] = ['done', 'failed', 'terminated']; diff --git a/frontend/src/pages/Runs/index.ts b/frontend/src/pages/Runs/index.ts new file mode 100644 index 0000000000..ee5fe86c30 --- /dev/null +++ b/frontend/src/pages/Runs/index.ts 
@@ -0,0 +1,8 @@ +export { RunList } from './List'; +export { RunDetailsPage } from './Details'; +export { RunDetails } from './Details/RunDetails'; +export { JobMetrics } from './Details/Jobs/Metrics'; +export { EventsList } from './Details/Events/List'; +export { JobLogs } from './Details/Logs'; +export { Artifacts } from './Details/Artifacts'; +export { Launch } from './Launch'; diff --git a/frontend/src/pages/Runs/utils.ts b/frontend/src/pages/Runs/utils.ts new file mode 100644 index 0000000000..a81295a858 --- /dev/null +++ b/frontend/src/pages/Runs/utils.ts @@ -0,0 +1,21 @@ +import { inActiveRunStatuses, runStatusForAborting, runStatusForDeleting, runStatusForStopping } from './constants'; + +export const isAvailableDeletingForRun = (status: IRun['status']): boolean => { + return runStatusForDeleting.includes(status); +}; + +export const isAvailableStoppingForRun = (status: IRun['status']): boolean => { + return runStatusForStopping.includes(status); +}; + +export const runIsStopped = (status: IRun['status']): boolean => { + return inActiveRunStatuses.includes(status); +}; + +export const isAvailableAbortingForRun = (status: IRun['status']): boolean => { + return runStatusForAborting.includes(status); +}; + +export const getRunProvisioningData = (run: IRun): IJobProvisioningData | void => { + return run?.latest_job_submission?.job_provisioning_data ?? 
undefined; +}; diff --git a/frontend/src/pages/User/Add/index.tsx b/frontend/src/pages/User/Add/index.tsx new file mode 100644 index 0000000000..8f2d084d44 --- /dev/null +++ b/frontend/src/pages/User/Add/index.tsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { ContentLayout, Header } from 'components'; + +import { useBreadcrumbs, useNotifications } from 'hooks'; +import { ROUTES } from 'routes'; +import { useCreateUserMutation } from 'services/user'; + +import { UserForm } from '../Form'; + +export const UserAdd: React.FC = () => { + const { t } = useTranslation(); + const navigate = useNavigate(); + const [createUser, { isLoading }] = useCreateUserMutation(); + const [pushNotification] = useNotifications(); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: t('common.create'), + href: ROUTES.USER.ADD, + }, + ]); + + const onCancelHandler = () => { + navigate(ROUTES.USER.LIST); + }; + + const onSubmitHandler = async (userData: Omit) => { + try { + const data = await createUser(userData).unwrap(); + + pushNotification({ + type: 'success', + content: t('users.create.success_notification'), + }); + + navigate(ROUTES.USER.DETAILS.FORMAT(data.username)); + // eslint-disable-next-line @typescript-eslint/no-unused-vars + } catch (e) { + pushNotification({ + type: 'error', + content: t('users.create.error_notification'), + }); + } + }; + + return ( + {t('users.create.page_title')}}> + + + ); +}; diff --git a/frontend/src/pages/User/Details/Billing/PayForm/index.tsx b/frontend/src/pages/User/Details/Billing/PayForm/index.tsx new file mode 100644 index 0000000000..fc9a273c30 --- /dev/null +++ b/frontend/src/pages/User/Details/Billing/PayForm/index.tsx @@ -0,0 +1,64 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +import { Button, FormUI, 
Hotspot, SpaceBetween } from 'components'; + +import { HotspotIds } from '../../../../../layouts/AppLayout/TutorialPanel/constants'; +import { AmountField } from '../components/AmountField'; + +import { FormValues, IProps } from './types'; + +export const MINIMAL_AMOUNT = 5; + +export const PayForm: React.FC = ({ defaultValues, isLoading, onCancel, onSubmit: onSubmitProp }) => { + const { t } = useTranslation(); + + const { handleSubmit, control } = useForm({ + defaultValues, + }); + + const onSubmit = (values: FormValues) => { + onSubmitProp(values); + }; + + return ( +
+ + + + + + + + } + > + + + + + + ); +}; diff --git a/frontend/src/pages/User/Details/Billing/PayForm/types.ts b/frontend/src/pages/User/Details/Billing/PayForm/types.ts new file mode 100644 index 0000000000..103f7fcd8a --- /dev/null +++ b/frontend/src/pages/User/Details/Billing/PayForm/types.ts @@ -0,0 +1,10 @@ +export type FormValues = { + amount: number; +}; + +export interface IProps { + defaultValues?: Partial; + isLoading?: boolean; + onCancel?: () => void; + onSubmit: (values: FormValues) => void; +} diff --git a/frontend/src/pages/User/Details/Billing/components/AmountField/index.tsx b/frontend/src/pages/User/Details/Billing/components/AmountField/index.tsx new file mode 100644 index 0000000000..24fcc42591 --- /dev/null +++ b/frontend/src/pages/User/Details/Billing/components/AmountField/index.tsx @@ -0,0 +1,27 @@ +import React from 'react'; +import { FieldValues } from 'react-hook-form'; + +import { Box, FormInput } from 'components'; +import { FormInputProps } from 'components/form/Input/types'; + +import styles from './styles.module.scss'; + +export type Props = Omit, 'leftContent' | 'type'>; + +export const AmountField = (props: Props) => { + return ( +
+ + + $ + +
+ } + type="number" + /> + + ); +}; diff --git a/frontend/src/pages/User/Details/Billing/components/AmountField/styles.module.scss b/frontend/src/pages/User/Details/Billing/components/AmountField/styles.module.scss new file mode 100644 index 0000000000..4fbca4fe5a --- /dev/null +++ b/frontend/src/pages/User/Details/Billing/components/AmountField/styles.module.scss @@ -0,0 +1,12 @@ +.amountInput { + .prefix { + position: absolute; + z-index: 1; + transform: translate(8px, 6px); + pointer-events: none; + } + + input { + padding-left: 18px !important; + } +} diff --git a/frontend/src/pages/User/Details/Billing/index.tsx b/frontend/src/pages/User/Details/Billing/index.tsx new file mode 100644 index 0000000000..5e346169a6 --- /dev/null +++ b/frontend/src/pages/User/Details/Billing/index.tsx @@ -0,0 +1,162 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams, useSearchParams } from 'react-router-dom'; + +import { Box, Button, Header, Hotspot, Loader, Modal, SpaceBetween } from 'components'; +import { PermissionGuard } from 'components/PermissionGuard'; +import { HotspotIds } from 'layouts/AppLayout/TutorialPanel/constants'; + +import { useAppSelector, useBreadcrumbs, useNotifications } from 'hooks'; +import { centsToFormattedString, getServerError, goToUrl } from 'libs'; +import { ROUTES } from 'routes'; +import { + useGetUserBillingInfoQuery, + useUserBillingCheckoutSessionMutation, + // useUserBillingPortalSessionMutation, +} from 'services/user'; +import { GlobalUserRole } from 'types'; + +import { selectUserName } from 'App/slice'; + +import { CreditsHistory } from '../CreditsHistory'; +import { Payments } from '../Payments'; +import { MINIMAL_AMOUNT, PayForm } from './PayForm'; + +import { FormValues } from './PayForm/types'; + +export const Billing: React.FC = () => { + const { t } = useTranslation(); + const [pushNotification] = useNotifications(); + const [searchParams, setSearchParams] = 
useSearchParams(); + const [showPaymentModal, setShowPaymentModal] = useState(false); + const userName = useAppSelector(selectUserName) ?? ''; + const params = useParams(); + const paramUserName = params.userName ?? ''; + + const isCurrentUser = userName === paramUserName; + + const { data, isLoading } = useGetUserBillingInfoQuery({ username: paramUserName }); + const [billingCheckout, { isLoading: isLoadingBillingCheckout }] = useUserBillingCheckoutSessionMutation(); + // const [billingPortalSession, { isLoading: isLoadingBillingPortalSession }] = useUserBillingPortalSessionMutation(); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + ]); + + useEffect(() => { + if (searchParams.get('payment_status') === 'success') { + pushNotification({ + type: 'success', + content: t('billing.payment_success_message'), + }); + setSearchParams({}); + } + }, []); + + const onSubmitPayment = ({ amount }: FormValues) => { + billingCheckout({ + username: paramUserName, + // Because the server needs amount value as cents (rounded: 19.99 * 100 is 1998.9999999999998 in floating point) + amount: Math.round(amount * 100), + }) + .unwrap() + .then((data) => goToUrl(data.url)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }) + .finally(closeModal); + }; + + const makePaymentClick = () => { + setShowPaymentModal(true); + }; + + // const editPaymentMethod = () => { + // billingPortalSession({ + // username: paramUserName, + // }) + // .unwrap() + // .then((data) => goToUrl(data.url)) + // .catch((error) => { + // pushNotification({ + // type: 'error', + // content: t('common.server_error', { error: getServerError(error) }), + // }); + // }) + // .finally(closeModal); + // }; + + const closeModal = () => { + setShowPaymentModal(false); + }; + + return ( + +
+
{t('billing.balance')}
+ + {isLoading && } + + {data && ( + + {centsToFormattedString(data?.balance ?? 0, '$')} + + + {isCurrentUser && ( + + + + )} + + {/* {data?.is_payment_method_attached && ( + + )} */} + + + )} +
+ + {t('billing.billing_history')}} + /> + + + + + + {showPaymentModal && ( + + {data && ( + + )} + + )} +
+ ); +}; diff --git a/frontend/src/pages/User/Details/CreditsHistory/Add/index.tsx b/frontend/src/pages/User/Details/CreditsHistory/Add/index.tsx new file mode 100644 index 0000000000..d4d0f058c7 --- /dev/null +++ b/frontend/src/pages/User/Details/CreditsHistory/Add/index.tsx @@ -0,0 +1,145 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Button, Container, ContentLayout, FormInput, FormUI, Header, SpaceBetween } from 'components'; + +import { useBreadcrumbs, useNotifications } from 'hooks'; +import { getServerError, isResponseServerError, isResponseServerFormFieldError } from 'libs'; +import { getFieldErrorFromServerResponse } from 'libs/form'; +import { ROUTES } from 'routes'; +import { useAddUserPaymentMutation } from 'services/user'; + +import { AmountField } from '../../Billing/components/AmountField'; + +import { TFormValue } from './types'; +import { FieldPath } from 'react-hook-form/dist/types/path'; + +export const Add: React.FC = () => { + const { t } = useTranslation(); + const [pushNotification] = useNotifications(); + const navigate = useNavigate(); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + const [createPayment, { isLoading }] = useAddUserPaymentMutation(); + + const formMethods = useForm(); + + const { handleSubmit, control, setError, clearErrors } = formMethods; + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('billing.title'), + href: ROUTES.USER.BILLING.LIST.FORMAT(paramUserName), + }, + { + text: t('users.manual_payments.title'), + href: ROUTES.USER.BILLING.LIST.FORMAT(paramUserName), + }, + { + text: t('common.add'), + href: ROUTES.USER.BILLING.ADD_PAYMENT.FORMAT(paramUserName), + }, + ]); + + const onSubmit = ({ value, ...data }: TFormValue) => { + clearErrors(); + + createPayment({ + ...data, + // Convert to cents (rounded: 19.99 * 100 is 1998.9999999999998 in floating point) + value: Math.round(value * 100), + username: paramUserName, + }) + .unwrap() + .then(() => { + pushNotification({ + type: 'success', + content: t('users.manual_payments.create.success_notification'), + }); + + navigate(ROUTES.USER.DETAILS.FORMAT(paramUserName)); + }) + .catch((errorResponse) => { + const errorRequestData = errorResponse?.data; + + if (isResponseServerError(errorRequestData)) { + errorRequestData.detail.forEach((error) => { + if (isResponseServerFormFieldError(error)) { + const { fieldNamePath, message } = getFieldErrorFromServerResponse(error); + + setError(fieldNamePath as FieldPath, { type: 'custom', message }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: error.msg }), + }); + } + }); + } else { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(errorResponse) }), + }); + } + }); + }; + + const onCancel = () => { + navigate(ROUTES.USER.DETAILS.FORMAT(paramUserName)); + }; + + return ( + {t('users.manual_payments.add_payment')}}> +
+ + + + + + } + > + + + + + + + + + + + +
+ ); +}; diff --git a/frontend/src/pages/User/Details/CreditsHistory/Add/types.ts b/frontend/src/pages/User/Details/CreditsHistory/Add/types.ts new file mode 100644 index 0000000000..76f7523e4d --- /dev/null +++ b/frontend/src/pages/User/Details/CreditsHistory/Add/types.ts @@ -0,0 +1 @@ +export type TFormValue = Pick; diff --git a/frontend/src/pages/User/Details/CreditsHistory/constants.tsx b/frontend/src/pages/User/Details/CreditsHistory/constants.tsx new file mode 100644 index 0000000000..5efd3cf3a3 --- /dev/null +++ b/frontend/src/pages/User/Details/CreditsHistory/constants.tsx @@ -0,0 +1,9 @@ +import React from 'react'; +export const PAYMENTS_INFO = { + header:

Credits history

, + body: ( + <> +

Available for only the global admin role

+ + ), +}; diff --git a/frontend/src/pages/User/Details/CreditsHistory/index.tsx b/frontend/src/pages/User/Details/CreditsHistory/index.tsx new file mode 100644 index 0000000000..ee4923caa3 --- /dev/null +++ b/frontend/src/pages/User/Details/CreditsHistory/index.tsx @@ -0,0 +1,46 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { Button, Header, InfoLink } from 'components'; + +import { useHelpPanel } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetUserPaymentsQuery } from 'services/user'; + +import { Payments } from '../Payments'; +import { PAYMENTS_INFO } from './constants'; + +import { IProps } from './types'; + +export const CreditsHistory: React.FC = ({ username }) => { + const { t } = useTranslation(); + const navigate = useNavigate(); + const [openHelpPanel] = useHelpPanel(); + + const { data, isLoading } = useGetUserPaymentsQuery({ username }); + + const onAddClick = () => { + navigate(ROUTES.USER.BILLING.ADD_PAYMENT.FORMAT(username)); + }; + + return ( + {t('common.add')}} + tableHeaderContent={ +
+ {t('common.add')} + + } + info={ openHelpPanel(PAYMENTS_INFO)} />} + > + {t('users.manual_payments.title')} +
+ } + /> + ); +}; diff --git a/frontend/src/pages/User/Details/CreditsHistory/types.ts b/frontend/src/pages/User/Details/CreditsHistory/types.ts new file mode 100644 index 0000000000..d19720b3bc --- /dev/null +++ b/frontend/src/pages/User/Details/CreditsHistory/types.ts @@ -0,0 +1,3 @@ +export interface IProps { + username: string; +} diff --git a/frontend/src/pages/User/Details/Events/index.tsx b/frontend/src/pages/User/Details/Events/index.tsx new file mode 100644 index 0000000000..be5c174208 --- /dev/null +++ b/frontend/src/pages/User/Details/Events/index.tsx @@ -0,0 +1,73 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Button, Container, Header, Loader, SegmentedControl, SpaceBetween } from 'components'; + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetUserQuery } from 'services/user'; + +import { EventList } from 'pages/Events/List'; + +export const Events: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramUserName = params.userName ?? ''; + const navigate = useNavigate(); + const [filterParamName, setFilterParamName] = useState('actors'); + const { data, isLoading } = useGetUserQuery({ name: paramUserName }); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('users.events'), + href: ROUTES.USER.EVENTS.FORMAT(paramUserName), + }, + ]); + + const goToEventsPage = () => { + navigate(ROUTES.EVENTS.LIST + `?${filterParamName}=${data?.id}`); + }; + + if (isLoading || !data) + return ( + + + + ); + + return ( + { + return ( +
+ setFilterParamName(detail.selectedId as keyof TEventListFilters)} + options={[ + { text: 'Actor', id: 'actors' }, + { text: 'Target user', id: 'target_users' }, + ]} + /> + + + } + /> + ); + }} + permanentFilters={{ [filterParamName]: [data.id] }} + showFilters={false} + /> + ); +}; diff --git a/frontend/src/pages/User/Details/Payments/index.tsx b/frontend/src/pages/User/Details/Payments/index.tsx new file mode 100644 index 0000000000..c146782bd2 --- /dev/null +++ b/frontend/src/pages/User/Details/Payments/index.tsx @@ -0,0 +1,66 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { format } from 'date-fns'; + +import { ListEmptyMessage, Pagination, Table } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { useCollection } from 'hooks'; +import { centsToFormattedString } from 'libs'; + +import { IProps } from './types'; + +export const Payments: React.FC = ({ payments, emptyMessageContent, isLoading, tableHeaderContent }) => { + const { t } = useTranslation(); + + const renderEmptyMessage = (): React.ReactNode => { + return ( + + {emptyMessageContent} + + ); + }; + + const { items, collectionProps, paginationProps } = useCollection(payments, { + filtering: { + empty: renderEmptyMessage(), + }, + pagination: { pageSize: 20 }, + selection: {}, + }); + + const columns = [ + { + id: 'value', + header: `${t('users.manual_payments.edit.value')}`, + cell: (item: IPayment) => `${centsToFormattedString(item.value, '$')}`, + }, + { + id: 'created_at', + header: t('users.manual_payments.edit.created_at'), + cell: (item: IPayment) => format(new Date(item.created_at), DATE_TIME_FORMAT), + }, + { + id: 'description', + header: `${t('users.manual_payments.edit.description')}`, + cell: (item: IPayment) => item.description, + }, + ]; + + return ( +
} + /> + ); +}; diff --git a/frontend/src/pages/User/Details/Payments/types.ts b/frontend/src/pages/User/Details/Payments/types.ts new file mode 100644 index 0000000000..73160d0632 --- /dev/null +++ b/frontend/src/pages/User/Details/Payments/types.ts @@ -0,0 +1,8 @@ +import React from 'react'; + +export interface IProps { + payments: IPayment[]; + emptyMessageContent?: React.ReactNode; + isLoading?: boolean; + tableHeaderContent?: React.ReactNode; +} diff --git a/frontend/src/pages/User/Details/Projects/index.tsx b/frontend/src/pages/User/Details/Projects/index.tsx new file mode 100644 index 0000000000..3ce243d978 --- /dev/null +++ b/frontend/src/pages/User/Details/Projects/index.tsx @@ -0,0 +1,83 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { + ListEmptyMessage, + NavigateLink, + // Pagination, + Table, +} from 'components'; + +import { useBreadcrumbs, useCollection } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetProjectsQuery } from 'services/project'; + +export const UserProjectList: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + + const { isLoading, isFetching, data } = useGetProjectsQuery({ limit: 200 }); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('users.projects'), + href: ROUTES.USER.PROJECTS.FORMAT(paramUserName), + }, + ]); + + const renderEmptyMessage = (): React.ReactNode => { + return ; + }; + + const filteredData = useMemo(() => { + if (!data?.data) return []; + + // Projects owned by the route user, newest first; filter() returns a fresh array, so sort() never mutates the query result + return data.data + .filter((p) => p.owner.username === paramUserName) + .sort((a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime()); + }, [data, paramUserName]); + + const { + items, + collectionProps, + // paginationProps + } = useCollection(filteredData, { + filtering: { + empty: renderEmptyMessage(), + }, + // pagination: { pageSize: 20 }, + selection: {}, + }); + + const columns = [ + { + id: 'project_name', + header: `${t('projects.edit.project_name')}`, + cell: (project: IProject) => ( + {project.project_name} + ), + }, + ]; + + return ( +
} + /> + ); +}; diff --git a/frontend/src/pages/User/Details/PublicKeys/constants.tsx b/frontend/src/pages/User/Details/PublicKeys/constants.tsx new file mode 100644 index 0000000000..1b7025b092 --- /dev/null +++ b/frontend/src/pages/User/Details/PublicKeys/constants.tsx @@ -0,0 +1,28 @@ +import React from 'react'; + +export const SSH_KEYS_INFO = { + header:

SSH Keys

, + body: ( + <> +

+ These SSH keys are for direct SSH access to runs from your local client without running{' '} + dstack attach. +

+

+ If you use dstack attach (or attached dstack apply), dstack manages a + client SSH key and local SSH shortcut automatically. In that workflow, you usually don't need to upload + additional keys. +

+

+ Without dstack attach, {'ssh '} is not configured on your machine. Use the + full proxied SSH connection details from run details instead. This requires SSH proxy to be enabled on the + server. +

+

+ To authorize this direct path, upload your public key (for example, ~/.ssh/id_ed25519.pub), and + keep the matching private key on your client. Uploaded keys are additional and do not replace the system-managed + key used by dstack attach/dstack apply. +

+ + ), +}; diff --git a/frontend/src/pages/User/Details/PublicKeys/index.tsx b/frontend/src/pages/User/Details/PublicKeys/index.tsx new file mode 100644 index 0000000000..b6ce794d3f --- /dev/null +++ b/frontend/src/pages/User/Details/PublicKeys/index.tsx @@ -0,0 +1,220 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import CloudscapeInput from '@cloudscape-design/components/input'; +import CloudscapeTextarea from '@cloudscape-design/components/textarea'; + +import { Box, Button, ButtonWithConfirmation, FormField, Header, InfoLink, Modal, SpaceBetween, Table } from 'components'; + +import { useBreadcrumbs, useCollection, useHelpPanel, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { IPublicKey, useAddPublicKeyMutation, useDeletePublicKeysMutation, useListPublicKeysQuery } from 'services/publicKeys'; + +import { SSH_KEYS_INFO } from './constants'; + +export const PublicKeys: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + const [pushNotification] = useNotifications(); + const [openHelpPanel] = useHelpPanel(); + + const [showAddModal, setShowAddModal] = useState(false); + const [keyValue, setKeyValue] = useState(''); + const [keyName, setKeyName] = useState(''); + const [addError, setAddError] = useState(''); + + const { data: publicKeys = [], isLoading } = useListPublicKeysQuery(); + const [addPublicKey, { isLoading: isAdding }] = useAddPublicKeyMutation(); + const [deletePublicKeys, { isLoading: isDeleting }] = useDeletePublicKeysMutation(); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('users.public_keys.title'), + href: ROUTES.USER.PUBLIC_KEYS.FORMAT(paramUserName), + }, + ]); + + const { items, collectionProps } = useCollection(publicKeys, { + selection: {}, + }); + + const { selectedItems = [] } = collectionProps; + + const openAddModal = () => { + setKeyValue(''); + setKeyName(''); + setAddError(''); + setShowAddModal(true); + }; + + const closeAddModal = () => { + setShowAddModal(false); + }; + + const handleAdd = () => { + if (!keyValue.trim()) { + setAddError(t('users.public_keys.key_required')); + return; + } + + addPublicKey({ key: keyValue.trim(), name: keyName.trim() || undefined }) + .unwrap() + .then(() => { + setShowAddModal(false); + }) + .catch((error) => { + const detail = (error?.data?.detail ?? []) as { msg: string; code: string }[]; + const isKeyExists = detail.some(({ code }) => code === 'resource_exists'); + setAddError(isKeyExists ? 
t('users.public_keys.key_already_exists') : getServerError(error)); + }); + }; + + const handleDelete = () => { + deletePublicKeys(selectedItems.map((k) => k.id)) + .unwrap() + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + }; + + const formatDate = (iso: string) => { + return new Date(iso).toLocaleDateString(undefined, { + year: 'numeric', + month: 'short', + day: 'numeric', + }); + }; + + const columns = [ + { + id: 'name', + header: t('users.public_keys.name'), + cell: (item: IPublicKey) => item.name, + }, + { + id: 'fingerprint', + header: t('users.public_keys.fingerprint'), + cell: (item: IPublicKey) => ( + + {item.fingerprint} + + ), + }, + { + id: 'type', + header: t('users.public_keys.key_type'), + cell: (item: IPublicKey) => item.type, + }, + { + id: 'added_at', + header: t('users.public_keys.added'), + cell: (item: IPublicKey) => formatDate(item.added_at), + }, + ]; + + return ( + <> +
openHelpPanel(SSH_KEYS_INFO)} />} + actions={ + + + {t('common.delete')} + + + + + } + > + {t('users.public_keys.title')} + + } + empty={ + + {t('users.public_keys.empty_title')} + + {t('users.public_keys.empty_message')} + + + + } + /> + + + + + + + + } + > + + + setKeyName(detail.value)} + placeholder={t('users.public_keys.key_name_placeholder')} + /> + + + + { + setKeyValue(detail.value); + setAddError(''); + }} + placeholder="ssh-ed25519 AAAA..." + rows={5} + /> + + + + + ); +}; diff --git a/frontend/src/pages/User/Details/Settings/index.tsx b/frontend/src/pages/User/Details/Settings/index.tsx new file mode 100644 index 0000000000..fdefae4dc2 --- /dev/null +++ b/frontend/src/pages/User/Details/Settings/index.tsx @@ -0,0 +1,112 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Box, Button, ColumnLayout, Container, Header, Link, Loader, Popover, SpaceBetween, StatusIndicator } from 'components'; +import { PermissionGuard } from 'components/PermissionGuard'; + +import { useAppSelector, useBreadcrumbs, usePermissionGuard } from 'hooks'; +import { copyToClipboard } from 'libs'; +import { ROUTES } from 'routes'; +import { useDeleteUsersMutation, useGetUserQuery } from 'services/user'; +import { GlobalUserRole } from 'types'; + +import { selectUserData } from 'App/slice'; + +import styles from './styles.module.scss'; + +export const Settings: React.FC = () => { + const { t } = useTranslation(); + const userData = useAppSelector(selectUserData); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + const navigate = useNavigate(); + + const { isLoading, data } = useGetUserQuery({ name: paramUserName }, { skip: !params.userName }); + const [, { isLoading: isDeleting }] = useDeleteUsersMutation(); + + const [isAvailableDelete] = usePermissionGuard({ allowedGlobalRoles: [GlobalUserRole.ADMIN] }); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + ]); + + const editUserHandler = () => { + navigate(ROUTES.USER.EDIT.FORMAT(paramUserName)); + }; + + const isDisabledUserEditing = () => { + return isDeleting || (!isAvailableDelete && userData?.username !== paramUserName); + }; + + const onCopyToken = () => { + copyToClipboard(data?.creds.token ?? ''); + }; + + return ( + + {t('common.edit')} + + } + > + {t('users.account_settings')} + + } + > + {isLoading && } + + {data && ( + + + {/*
*/} + {/* {t('users.user_name')}*/} + {/*
{data.user_name}
*/} + {/*
*/} + +
+ {t('users.email')} +
{data.email ? {data.email} : '-'}
+
+ + +
+ {t('users.global_role')} +
{t(`roles.${data.global_role}`)}
+
+
+ +
+ {t('users.token')} + +
+ {t('users.token_copied')}} + > +
+
+
+
+ )} +
+ ); +}; diff --git a/frontend/src/pages/User/Details/Settings/styles.module.scss b/frontend/src/pages/User/Details/Settings/styles.module.scss new file mode 100644 index 0000000000..af62ebfb58 --- /dev/null +++ b/frontend/src/pages/User/Details/Settings/styles.module.scss @@ -0,0 +1,11 @@ +.token { + display: flex; + align-items: center; + height: 20px; + gap: 12px; + + button { + padding: 0 !important; + border: none !important; + } +} diff --git a/frontend/src/pages/User/Details/index.tsx b/frontend/src/pages/User/Details/index.tsx new file mode 100644 index 0000000000..ee878a442d --- /dev/null +++ b/frontend/src/pages/User/Details/index.tsx @@ -0,0 +1,118 @@ +import React, { useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useNavigate, useParams } from 'react-router-dom'; + +import { Box, ConfirmationDialog, ContentLayout, Tabs } from 'components'; +import { DetailsHeader } from 'components'; + +import { useNotifications /* usePermissionGuard*/ } from 'hooks'; +import { getServerError, riseRouterException } from 'libs'; +import { ROUTES } from 'routes'; +import { useDeleteUsersMutation, useGetUserQuery } from 'services/user'; + +// import { GlobalUserRole } from '../../../types'; +import { UserDetailsTabTypeEnum } from './types'; + +import styles from './styles.module.scss'; + +export { Settings as UserSettings } from './Settings'; +export { Billing as UserBilling } from './Billing'; +export { Events as UserEvents } from './Events'; +export { UserProjectList as UserProjects } from './Projects'; +export { PublicKeys as UserPublicKeys } from './PublicKeys'; + +export const UserDetails: React.FC = () => { + const { t } = useTranslation(); + const [showDeleteConfirm, setShowConfirmDelete] = useState(false); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + const navigate = useNavigate(); + const { error: userError } = useGetUserQuery({ name: paramUserName }); + const [deleteUsers /*, { isLoading: isDeleting }*/] = useDeleteUsersMutation(); + const [pushNotification] = useNotifications(); + // const [isAvailableDeleteUser] = usePermissionGuard({ allowedGlobalRoles: [GlobalUserRole.ADMIN] }); + + useEffect(() => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + if (userError?.status === 404) { + riseRouterException(); + } + }, [userError]); + + const toggleDeleteConfirm = () => { + setShowConfirmDelete((val) => !val); + }; + + const deleteUserHandler = () => { + deleteUsers([paramUserName]) + .unwrap() + .then(() => navigate(ROUTES.USER.LIST)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + + setShowConfirmDelete(false); + }; + + const tabs: { + label: string; + id: UserDetailsTabTypeEnum; + href: string; + }[] = [ + { + label: t('users.settings'), + id: UserDetailsTabTypeEnum.SETTINGS, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + label: t('users.public_keys.title'), + id: UserDetailsTabTypeEnum.PUBLIC_KEYS, + href: ROUTES.USER.PUBLIC_KEYS.FORMAT(paramUserName), + }, + { + label: t('users.projects'), + id: UserDetailsTabTypeEnum.PROJECTS, + href: ROUTES.USER.PROJECTS.FORMAT(paramUserName), + }, + { + label: t('users.events'), + id: UserDetailsTabTypeEnum.EVENTS, + href: ROUTES.USER.EVENTS.FORMAT(paramUserName), + }, + process.env.UI_VERSION === 'sky' && { + label: t('billing.title'), + id: UserDetailsTabTypeEnum.BILLING, + href: ROUTES.USER.BILLING.LIST.FORMAT(paramUserName), + }, + ].filter(Boolean); + + return ( +
+ + } + > + + + + + + {t('confirm_dialog.message')}} + onDiscard={toggleDeleteConfirm} + onConfirm={deleteUserHandler} + confirmButtonLabel={t('common.delete')} + /> +
+ ); +}; diff --git a/frontend/src/pages/User/Details/styles.module.scss b/frontend/src/pages/User/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/User/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/User/Details/types.ts b/frontend/src/pages/User/Details/types.ts new file mode 100644 index 0000000000..ea0f69431c --- /dev/null +++ b/frontend/src/pages/User/Details/types.ts @@ -0,0 +1,8 @@ +export enum UserDetailsTabTypeEnum { + SETTINGS = 'settings', + PROJECTS = 'projects', + EVENTS = 'events', + ACTIVITY = 'activity', + PUBLIC_KEYS = 'public-keys', + BILLING = 'billing', +} diff --git a/frontend/src/pages/User/Edit/index.tsx b/frontend/src/pages/User/Edit/index.tsx new file mode 100644 index 0000000000..d11b68a64e --- /dev/null +++ b/frontend/src/pages/User/Edit/index.tsx @@ -0,0 +1,139 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import { pick } from 'lodash'; + +import { Box, ConfirmationDialog, Container, ContentLayout, Header, Loader } from 'components'; + +import { useAppDispatch, useAppSelector, useBreadcrumbs, useNotifications } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetUserQuery, useRefreshTokenMutation, useUpdateUserMutation } from 'services/user'; + +import { selectUserData, setAuthData } from 'App/slice'; + +import { UserForm } from '../Form'; + +export const UserEdit: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const navigate = useNavigate(); + const userData = useAppSelector(selectUserData); + const userGlobalRole = userData?.global_role 
?? ''; + const paramUserName = params.userName ?? ''; + const dispatch = useAppDispatch(); + const [showRefreshConfirm, setShowRefreshConfirm] = useState(false); + const { isLoading, data } = useGetUserQuery({ name: paramUserName }, { skip: !paramUserName }); + const [updateUser, { isLoading: isUserUpdating }] = useUpdateUserMutation(); + const [refreshToken, { isLoading: isTokenRefreshing }] = useRefreshTokenMutation(); + + const toggleRefreshConfirm = () => { + setShowRefreshConfirm((val) => !val); + }; + + useBreadcrumbs([ + { + text: t('navigation.users'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('common.settings'), + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('common.edit'), + href: ROUTES.USER.EDIT.FORMAT(paramUserName), + }, + ]); + + const [pushNotification] = useNotifications(); + + const onRefreshTokenHandler = async () => { + setShowRefreshConfirm(false); + + try { + await refreshToken({ + username: paramUserName, + }) + .unwrap() + .then(({ creds: { token } }) => { + if (paramUserName === userData?.username) { + dispatch(setAuthData({ token })); + } + }); + + pushNotification({ + type: 'success', + content: t('users.edit.refresh_token_success_notification'), + }); + // eslint-disable-next-line @typescript-eslint/no-unused-vars + } catch (e) { + pushNotification({ + type: 'error', + content: t('users.edit.refresh_token_error_notification'), + }); + } + }; + + const onCancelHandler = () => { + navigate(ROUTES.USER.DETAILS.FORMAT(paramUserName)); + }; + + const onSubmitHandler = async (userData: Partial) => { + try { + const data = await updateUser({ + ...pick(userData, ['global_role', 'email', 'active']), + username: paramUserName, + }).unwrap(); + + pushNotification({ + type: 'success', + content: t('users.edit.success_notification'), + }); + + navigate(ROUTES.USER.DETAILS.FORMAT(data.username ?? 
paramUserName)); + // eslint-disable-next-line @typescript-eslint/no-unused-vars + } catch (e) { + pushNotification({ + type: 'error', + content: t('users.edit.error_notification'), + }); + } + }; + + return ( + <> + {paramUserName}}> + {isLoading && !data && ( + + + + )} + + {data && ( + + )} + + + {t('users.edit.refresh_token_confirm_message')}} + confirmButtonLabel={t('users.edit.refresh_token_button_label')} + visible={showRefreshConfirm} + onDiscard={toggleRefreshConfirm} + onConfirm={onRefreshTokenHandler} + /> + + ); +}; diff --git a/frontend/src/pages/User/Form/index.tsx b/frontend/src/pages/User/Form/index.tsx new file mode 100644 index 0000000000..abcc777e9f --- /dev/null +++ b/frontend/src/pages/User/Form/index.tsx @@ -0,0 +1,192 @@ +import React from 'react'; +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +import { + Box, + Button, + ColumnLayout, + Container, + FormField, + FormInput, + FormSelect, + FormUI, + Header, + Popover, + SpaceBetween, + StatusIndicator, +} from 'components'; + +import { copyToClipboard } from 'libs'; + +import { TActiveSelectOption, TRoleSelectOption } from './types'; + +export interface Props { + initialValues?: IUserWithCreds; + loading?: boolean; + onCancel: () => void; + onSubmit: (user: IUser) => void; + onRefreshToken?: () => void; + disabledEmailEndRoleFields?: boolean; + disabledRefreshToken?: boolean; +} + +export const UserForm: React.FC = ({ + initialValues, + onCancel, + loading, + onRefreshToken, + disabledEmailEndRoleFields, + disabledRefreshToken, + onSubmit: onSubmitProp, +}) => { + const { t } = useTranslation(); + const isEditing = !!initialValues; + + const { handleSubmit, control } = useForm({ + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + defaultValues: initialValues + ? { ...initialValues, active: initialValues.active ? 
'active' : 'inactive' } + : { + global_role: 'user', + active: 'active', + }, + }); + + const roleSelectOptions: TRoleSelectOption[] = [ + { label: t('roles.admin'), value: 'admin' }, + { label: t('roles.user'), value: 'user' }, + ]; + + const activeSelectOptions: TActiveSelectOption[] = [ + { label: t('users.activated'), value: 'active' }, + { label: t('users.deactivated'), value: 'inactive' }, + ]; + + const onCopyToken = () => { + copyToClipboard(initialValues?.creds.token ?? ''); + }; + + const onSubmit = (data: IUser) => { + onSubmitProp({ + ...data, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + active: data.active === 'active', + }); + }; + + const isDisabledEmailAndRoleField = () => { + return loading || disabledEmailEndRoleFields; + }; + + return ( +
+ + + + + + } + > + {t('users.account_settings')}}> + + + {!isEditing && ( + + )} + + + + + + + + + {initialValues && ( + + + {t('users.token_copied')}} + > + + + + )} + + + + + ); +}; diff --git a/frontend/src/pages/User/Form/types.ts b/frontend/src/pages/User/Form/types.ts new file mode 100644 index 0000000000..8e3005bf33 --- /dev/null +++ b/frontend/src/pages/User/Form/types.ts @@ -0,0 +1,2 @@ +export type TRoleSelectOption = { label: string; value: TProjectRole; disabled?: boolean }; +export type TActiveSelectOption = { label: string; value: 'active' | 'inactive'; disabled?: boolean }; diff --git a/frontend/src/pages/User/List/hooks.tsx b/frontend/src/pages/User/List/hooks.tsx new file mode 100644 index 0000000000..7f9f24549c --- /dev/null +++ b/frontend/src/pages/User/List/hooks.tsx @@ -0,0 +1,39 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { format } from 'date-fns'; + +import { Link, NavigateLink } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { ROUTES } from 'routes'; + +export const useColumnDefinitions = () => { + const { t } = useTranslation(); + + return useMemo(() => { + return [ + { + id: 'name', + header: t('users.user_name'), + cell: (item: IUser) => ( + {item.username} + ), + }, + { + id: 'email', + header: t('users.email'), + cell: (item: IUser) => (item.email ? 
{item.email} : '-'), + }, + { + id: 'global_role', + header: t('users.global_role'), + cell: (item: IUser) => t(`roles.${item.global_role}`), + }, + process.env.UI_VERSION === 'sky' && { + id: 'created_at', + header: t('users.created_at'), + cell: (item: IUser) => format(new Date(item.created_at), DATE_TIME_FORMAT), + }, + ].filter(Boolean); + }, [t]); // t is read by every header above, so rebuild the columns when it changes +}; diff --git a/frontend/src/pages/User/List/index.tsx b/frontend/src/pages/User/List/index.tsx new file mode 100644 index 0000000000..66d5e2e46f --- /dev/null +++ b/frontend/src/pages/User/List/index.tsx @@ -0,0 +1,191 @@ +import React, { useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { Box, Button, ConfirmationDialog, Header, ListEmptyMessage, Loader, SpaceBetween, Table, TextFilter } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useAppSelector, useBreadcrumbs, useCollection, useInfiniteScroll, useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { ROUTES } from 'routes'; +import { useDeleteUsersMutation, useLazyGetUserListQuery } from 'services/user'; + +import { selectUserData } from 'App/slice'; + +import { useColumnDefinitions } from './hooks'; + +export const UserList: React.FC = () => { + const { t } = useTranslation(); + const [showDeleteConfirm, setShowConfirmDelete] = useState(false); + const [filteringText, setFilteringText] = useState(''); + const [namePattern, setNamePattern] = useState(''); + const userData = useAppSelector(selectUserData); + const userGlobalRole = userData?.global_role ?? 
''; + const [deleteUsers, { isLoading: isDeleting }] = useDeleteUsersMutation(); + const navigate = useNavigate(); + const [pushNotification] = useNotifications(); + + const { data, isLoading, refreshList, isLoadingMore, totalCount } = useInfiniteScroll({ + useLazyQuery: useLazyGetUserListQuery, + args: { name_pattern: namePattern, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastUser) => ({ + prev_created_at: lastUser.created_at, + }), + }); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + ]); + + const columns = useColumnDefinitions(); + + const toggleDeleteConfirm = () => { + setShowConfirmDelete((val) => !val); + }; + + const addUserHandler = () => { + navigate(ROUTES.USER.ADD); + }; + + const onClearFilter = () => { + setNamePattern(''); + setFilteringText(''); + }; + + const renderEmptyMessage = (): React.ReactNode => { + if (isLoading) { + return null; + } + + if (filteringText) { + return ( + + + + ); + } + + return ( + + + + ); + }; + + const { items, actions, collectionProps } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + }, + selection: {}, + }); + + const deleteSelectedUserHandler = () => { + const { selectedItems } = collectionProps; + if (selectedItems?.length) { + deleteUsers(selectedItems.map((user) => user.username)) + .unwrap() + .then(() => actions.setSelectedItems([])) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + }); + } + setShowConfirmDelete(false); + }; + + const editSelectedUserHandler = () => { + const { selectedItems } = collectionProps; + + if (selectedItems?.length) navigate(ROUTES.USER.EDIT.FORMAT(selectedItems[0].username)); + }; + + const getIsTableItemDisabled = () => { + return isDeleting; + }; + + const isDisabledDelete = useMemo(() => { + return isDeleting || collectionProps.selectedItems?.length === 0 || userGlobalRole !== 'admin'; + }, 
[isDeleting, collectionProps.selectedItems, userGlobalRole]); // list every value the memo reads so the flag tracks an in-flight delete + + const isDisabledEdit = useMemo(() => { + return isDeleting || collectionProps.selectedItems?.length !== 1 || userGlobalRole !== 'admin'; + }, [isDeleting, collectionProps.selectedItems, userGlobalRole]); // same dependency set as isDisabledDelete + + const renderCounter = () => { + if (typeof totalCount !== 'number') return ''; + + return `(${totalCount})`; + }; + + return ( + <> +
+ + + + + + + + + ); + }; + + const renderNoMatchMessage = (): React.ReactNode => { + return ( + + + + ); + }; + + return { renderEmptyMessage, renderNoMatchMessage }; +}; + +export const useColumnsDefinitions = () => { + const { t } = useTranslation(); + + const columns = [ + { + id: 'name', + header: t('volume.name'), + cell: (item: IVolume) => item.name, + }, + { + id: 'project', + header: `${t('volume.project')}`, + cell: (item: IVolume) => ( + {item.project_name} + ), + }, + { + id: 'backend', + header: `${t('volume.backend')}`, + cell: (item: IVolume) => item.configuration?.backend ?? '-', + }, + { + id: 'region', + header: `${t('volume.region')}`, + cell: (item: IVolume) => item.configuration?.region ?? '-', + }, + + { + id: 'status', + header: t('volume.status'), + cell: (item: IVolume) => + item.deleted ? ( + {t(`volume.statuses.deleted`)} + ) : ( + + {t(`volume.statuses.${item.status}`)} + + ), + }, + { + id: 'created', + header: t('volume.created'), + cell: (item: IVolume) => format(new Date(item.created_at), DATE_TIME_FORMAT), + }, + { + id: 'finished', + header: t('volume.finished'), + cell: (item: IVolume) => getVolumeFinished(item), + }, + { + id: 'price', + header: `${t('volume.price')}`, + cell: (item: IVolume) => { + return item?.provisioning_data?.price ? `$${item.provisioning_data.price.toFixed(2)}` : '-'; + }, + }, + { + id: 'cost', + header: `${t('volume.cost')}`, + cell: (item: IVolume) => { + return item?.cost ? 
`$${item.cost.toFixed(2)}` : '-'; + }, + }, + ]; + + return { columns } as const; +}; + +type RequestParamsKeys = keyof Pick; + +const filterKeys: Record = { + PROJECT_NAME: 'project_name', +}; + +const limit = 100; + +export const useFilters = () => { + const [searchParams, setSearchParams] = useSearchParams(); + const [onlyActive, setOnlyActive] = useLocalStorageState('volume-list-filter-only-active', true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + + const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => + requestParamsToTokens({ searchParams, filterKeys }), + ); + + const clearFilter = () => { + setSearchParams({}); + setPropertyFilterQuery(EMPTY_QUERY); + }; + + const isDisabledClearFilter = !propertyFilterQuery.tokens.length && !onlyActive; + + const filteringOptions = useMemo(() => { + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); + + const filteringProperties = [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + ]; + + const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { + const { tokens, operation } = detail; + + const filteredTokens = tokens.filter((token, tokenIndex) => { + return !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex); + }); + + setSearchParams(tokensToSearchParams(filteredTokens, onlyActive)); + + setPropertyFilterQuery({ + operation, + tokens: filteredTokens, + }); + }; + + const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { + setOnlyActive(detail.checked); + }; + + const filteringRequestParams = useMemo(() => { + const params = tokensToRequestParams({ + tokens: propertyFilterQuery.tokens, + }); + + return { + ...params, + only_active: onlyActive, + } as Partial; + }, 
[propertyFilterQuery, onlyActive]); + + // Loads suggestion values for the property filter; only the project-name property triggers a lookup. + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + setFilteringStatusType('loading'); + + try { + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects(getNamePatternFilterRequestParams(filteringText, limit)) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + } finally { + // Always leave the 'loading' state, even when the request rejects. + setFilteringStatusType(undefined); + } + }; + + return { + filteringRequestParams, + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } as const; +}; + +export const useVolumesDelete = () => { + const { t } = useTranslation(); + const [deleteVolumesRequest] = useDeleteVolumesMutation(); + const [pushNotification] = useNotifications(); + const [isDeleting, setIsDeleting] = useState(() => false); + + const namesOfVolumesGroupByProjectName = (volumes: IVolume[]) => { + return volumes.reduce>((acc, volume) => { + if (acc[volume.project_name]) { + acc[volume.project_name].push(volume.name); + } else { + acc[volume.project_name] = [volume.name]; + } + + return acc; + }, {}); + }; + + const deleteVolumes = (volumes: IVolume[]) => { + if (!volumes.length) return Promise.reject('No volumes'); + + setIsDeleting(true); + + const groupedVolumes = namesOfVolumesGroupByProjectName(volumes); + + const requests = Object.keys(groupedVolumes).map((projectName) => { + return deleteVolumesRequest({ + project_name: projectName, + names: groupedVolumes[projectName], + }).unwrap(); + }); + + return Promise.all(requests) + .finally(() => setIsDeleting(false)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: 
getServerError(error) }), + }); + }); + }; + + return { isDeleting, deleteVolumes }; +}; + +const getVolumeFinished = (volume: IVolume): string => { + if (!volume.deleted_at && volume.status != 'failed') { + return '-'; + } + let finished = volume.last_processed_at; + if (volume.deleted_at) { + finished = volume.deleted_at; + } + return format(new Date(finished), DATE_TIME_FORMAT); +}; diff --git a/frontend/src/pages/Volumes/List/index.tsx b/frontend/src/pages/Volumes/List/index.tsx new file mode 100644 index 0000000000..d51b9f1d5e --- /dev/null +++ b/frontend/src/pages/Volumes/List/index.tsx @@ -0,0 +1,149 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; + +import { Button, ButtonWithConfirmation, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllVolumesQuery } from 'services/volume'; + +import { useColumnsDefinitions, useFilters, useVolumesDelete, useVolumesTableEmptyMessages } from './hooks'; + +import styles from './styles.module.scss'; + +export const VolumeList: React.FC = () => { + const { t } = useTranslation(); + const { + clearFilter, + propertyFilterQuery, + onChangePropertyFilter, + filteringOptions, + filteringProperties, + filteringRequestParams, + onlyActive, + onChangeOnlyActive, + isDisabledClearFilter, + filteringStatusType, + handleLoadItems, + } = useFilters(); + + const { isDeleting, deleteVolumes } = useVolumesDelete(); + + const { renderEmptyMessage, renderNoMatchMessage } = useVolumesTableEmptyMessages({ + clearFilter, + isDisabledClearFilter, + }); + + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllVolumesQuery, + args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, + + getPaginationParams: (lastFleet) => ({ + 
prev_created_at: lastFleet.created_at, + prev_id: lastFleet.id, + }), + }); + + const { columns } = useColumnsDefinitions(); + + useBreadcrumbs([ + { + text: t('volume.volumes'), + href: ROUTES.VOLUMES.LIST, + }, + ]); + + const { items, collectionProps, actions } = useCollection(data, { + filtering: { + empty: renderEmptyMessage(), + noMatch: renderNoMatchMessage(), + }, + selection: {}, + }); + + const { selectedItems } = collectionProps; + + const deleteSelectedVolumes = () => { + if (!selectedItems?.length) return; + + deleteVolumes([...selectedItems]) + .finally(() => { + refreshList(); + actions.setSelectedItems([]); + }) + .catch(console.log); + }; + + const isDisabledDeleteSelected = !selectedItems?.length || isDeleting; + + return ( +
+ + {t('common.delete')} + + + + +
  • + +
  • + + ` + : ""; + wrapper.innerHTML = ` + ${tabs} + ${hasExample ? ` +
    + ${getJsonEditorHtml("dstack-swagger-response-json-example")} +
    + ` : ""} + ${hasSchema ? ` +
    + ${getJsonEditorHtml("dstack-swagger-json-schema dstack-swagger-response-json-schema")} +
    + ` : ""} + `; + + wrapper + .querySelector(".dstack-swagger-response-example-tab") + ?.addEventListener("click", () => setResponseExampleMode(wrapper, "example")); + wrapper + .querySelector(".dstack-swagger-response-schema-tab") + ?.addEventListener("click", () => setResponseExampleMode(wrapper, "schema")); + return wrapper; + } + + function setResponseExampleMode(wrapper, mode) { + wrapper.dataset.dstackSwaggerResponseExampleMode = mode; + syncResponseExampleMode(wrapper); + } + + function syncResponseExampleMode(wrapper) { + const examplePanel = wrapper.querySelector(".dstack-swagger-response-example-panel"); + const schemaPanel = wrapper.querySelector(".dstack-swagger-response-schema-panel"); + const mode = wrapper.dataset.dstackSwaggerResponseExampleMode || + (examplePanel ? "example" : "schema"); + const isSchema = mode === "schema" && Boolean(schemaPanel); + + if (examplePanel) { + examplePanel.hidden = isSchema; + } + if (schemaPanel) { + schemaPanel.hidden = !isSchema; + } + wrapper.querySelectorAll(".tab li").forEach((item) => item.classList.remove("active")); + wrapper.querySelectorAll(".tablinks").forEach((button) => { + const selected = + (isSchema && button.classList.contains("dstack-swagger-response-schema-tab")) || + (!isSchema && button.classList.contains("dstack-swagger-response-example-tab")); + button.setAttribute("aria-selected", selected ? 
"true" : "false"); + button.closest("li")?.classList.toggle("active", selected); + }); + + } + + function getResponseExampleText(body, response, schema, spec) { + return ( + getResponseExampleTextFromSpec(response) || + getResponseExampleTextFromDom(body) || + getResponseExampleTextFromSchema(schema, spec) + ); + } + + function getResponseExampleTextFromSpec(response) { + const media = getResponseMedia(response); + if (!media) { + return ""; + } + if (media.example !== undefined) { + return stringifyJsonValue(media.example); + } + + const firstExample = Object.values(media.examples || {})[0]; + if (firstExample?.value !== undefined) { + return stringifyJsonValue(firstExample.value); + } + return ""; + } + + function getResponseMedia(response) { + return Object.values(response?.content || {})[0] || null; + } + + function getResponseExampleTextFromDom(body) { + const candidates = [ + ...body.querySelectorAll( + ".model-example .highlight-code pre, .model-example .highlight-code code, " + + ".highlight-code pre, .highlight-code code" + ), + ]; + for (const candidate of candidates) { + if (candidate.closest(".dstack-swagger-response-example")) { + continue; + } + + const text = normalizeJsonText(candidate.textContent || ""); + if (text && !/^(example value|schema)$/i.test(text)) { + return text; + } + } + return ""; + } + + function getResponseExampleTextFromSchema(schema, spec) { + if (!schema || isEmptySchema(schema)) { + return ""; + } + const example = buildSchemaExample(schema, spec); + return example === undefined ? 
"" : stringifyJsonValue(example); + } + + function buildSchemaExample(schema, spec, seenRefs = new Set(), depth = 0) { + if (!schema || typeof schema !== "object") { + return undefined; + } + if (depth > 8) { + return null; + } + + if (schema.example !== undefined) { + return schema.example; + } + if (schema.default !== undefined) { + return schema.default; + } + if (Array.isArray(schema.enum) && schema.enum.length > 0) { + return schema.enum[0]; + } + if (schema.const !== undefined) { + return schema.const; + } + + const refName = getRefName(schema); + if (refName) { + if (seenRefs.has(refName)) { + return {}; + } + seenRefs.add(refName); + return buildSchemaExample(resolveSchema(schema, spec), spec, seenRefs, depth + 1); + } + + if (Array.isArray(schema.allOf) && schema.allOf.length > 0) { + return mergeSchemaExamples( + schema.allOf.map((item) => buildSchemaExample(item, spec, seenRefs, depth + 1)) + ); + } + + const union = schema.oneOf || schema.anyOf; + if (Array.isArray(union) && union.length > 0) { + return buildSchemaExample(union[0], spec, seenRefs, depth + 1); + } + + const type = getSchemaExampleType(schema); + if (type === "array") { + return [buildSchemaExample(schema.items || {}, spec, seenRefs, depth + 1)]; + } + if (type === "object") { + return buildObjectSchemaExample(schema, spec, seenRefs, depth); + } + if (type === "integer" || type === "number") { + return typeof schema.minimum === "number" ? 
schema.minimum : 0; + } + if (type === "boolean") { + return false; + } + if (type === "null") { + return null; + } + return "string"; + } + + function mergeSchemaExamples(examples) { + const definedExamples = examples.filter((example) => example !== undefined); + if (definedExamples.every(isPlainObject)) { + return definedExamples.reduce((merged, example) => ({ ...merged, ...example }), {}); + } + return definedExamples[definedExamples.length - 1]; + } + + function buildObjectSchemaExample(schema, spec, seenRefs, depth) { + const properties = getSchemaProperties(schema, spec) || schema.properties; + if (properties && Object.keys(properties).length > 0) { + return Object.fromEntries( + Object.entries(properties).map(([name, propertySchema]) => [ + name, + buildSchemaExample(propertySchema, spec, seenRefs, depth + 1), + ]) + ); + } + if (schema.additionalProperties && typeof schema.additionalProperties === "object") { + return { + additionalProp1: buildSchemaExample( + schema.additionalProperties, + spec, + seenRefs, + depth + 1 + ), + }; + } + return {}; + } + + function getSchemaExampleType(schema) { + if (Array.isArray(schema.type)) { + return schema.type.find((type) => type !== "null") || "null"; + } + if (schema.type) { + return schema.type; + } + if (schema.properties || schema.additionalProperties) { + return "object"; + } + if (schema.items) { + return "array"; + } + return "string"; + } + + function isPlainObject(value) { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); + } + + function getResponseCode(row) { + return ( + row.dataset.code || + row.querySelector(":scope > .response-col_status")?.textContent || + "" + ).trim(); + } + + function isSuccessResponseCode(code) { + return /^2\d\d$/.test(code); + } + + function getResponseContainer(descriptionCell, isSuccess, isLiveResponse) { + const tagName = isLiveResponse ? 
"blockquote" : "details"; + let container = descriptionCell.querySelector( + ":scope > .dstack-swagger-response-container" + ); + if (!container) { + container = descriptionCell.querySelector( + ":scope > .dstack-swagger-response-admonition" + ); + } + if (container?.tagName.toLowerCase() === tagName) { + updateResponseContainerClass(container, isSuccess, isLiveResponse); + return container; + } + + const nextContainer = document.createElement(tagName); + updateResponseContainerClass(nextContainer, isSuccess, isLiveResponse); + if (container) { + while (container.firstChild) { + nextContainer.appendChild(container.firstChild); + } + container.replaceWith(nextContainer); + } else { + descriptionCell.prepend(nextContainer); + } + return nextContainer; + } + + function updateResponseContainerClass(container, isSuccess, isLiveResponse) { + container.className = isLiveResponse + ? "dstack-swagger-response-container dstack-swagger-response-section dstack-swagger-live-response" + : "dstack-swagger-response-container info dstack-swagger-response-admonition dstack-swagger-response-section"; + } + + function updateResponseContainerTitle(container, code, description, isSuccess, isLiveResponse) { + const tagName = isLiveResponse ? 
"h4" : "summary"; + let title = container.querySelector(":scope > .dstack-swagger-response-title"); + if (title?.tagName.toLowerCase() !== tagName) { + const nextTitle = document.createElement(tagName); + if (title) { + title.replaceWith(nextTitle); + } else { + container.prepend(nextTitle); + } + title = nextTitle; + } + title.className = "dstack-swagger-response-title"; + title.textContent = `${code || "default"} ${description || "Response"}`; + } + + function getResponseContainerBody(container) { + let body = container.querySelector(":scope > .dstack-swagger-response-body"); + if (!body) { + body = document.createElement("div"); + body.className = "dstack-swagger-response-body"; + container.appendChild(body); + } + const title = container.querySelector(":scope > .dstack-swagger-response-title"); + if (title && title.nextElementSibling !== body) { + title.after(body); + } + return body; + } + + function getResponseDescription(descriptionCell) { + return ( + descriptionCell.querySelector( + ".response-col_description__inner .renderedMarkdown, " + + ".response-col_description__inner" + )?.textContent || + "" + ).trim(); + } + + function setupRequestCurlExamples(root, spec) { + let scheduled = false; + const update = () => { + scheduled = false; + updateRequestCurlExamples(root, spec); + }; + const scheduleUpdate = () => { + if (scheduled) { + return; + } + scheduled = true; + window.requestAnimationFrame(update); + }; + + const observer = new MutationObserver(scheduleUpdate); + observer.observe(root, { + childList: true, + subtree: true, + }); + scheduleUpdate(); + } + + function updateRequestCurlExamples(root, spec) { + root.querySelectorAll(".opblock").forEach((opblock) => { + const operation = getOperationForOpblock(spec, opblock); + if (!operation) { + return; + } + const modelExample = opblock.querySelector( + ":scope .opblock-section-request-body .model-example" + ); + if (!modelExample) { + teardownEditRequestCurlExample(opblock); + if 
(!operation.requestBody) { + setupOperationCurlExample(opblock, operation, spec); + } + return; + } + teardownOperationCurlExample(opblock); + if (isRequestBodyEditing(modelExample)) { + setupRequestCurlExample(opblock, modelExample, operation, spec, true); + return; + } + setupRequestCurlExample(opblock, modelExample, operation, spec, false); + }); + } + + function isRequestBodyEditing(modelExample) { + return Boolean( + modelExample.querySelector("textarea, .body-param__text") || + [...modelExample.querySelectorAll(".tablinks")].some((button) => + /^\s*(edit value|request body)\s*$/i.test(button.textContent || "") + ) + ); + } + + function teardownRequestCurlExample(modelExample) { + const wrapper = getRequestCurlWrapper(modelExample); + wrapper?.remove(); + modelExample.classList.remove("dstack-swagger-request-model-hidden"); + } + + function setupRequestCurlExample(opblock, modelExample, operation, spec, isEditing) { + const schema = getRequestJsonSchema(operation, opblock); + if (!schema) { + teardownRequestCurlExample(modelExample); + teardownEditRequestCurlExample(opblock); + return; + } + + if (isEditing) { + teardownRequestCurlExample(modelExample); + teardownEditRequestCurlExample(opblock); + return; + } + + const wrapper = ensureRequestCurlWrapper(modelExample); + wrapper.classList.remove("dstack-swagger-request-example-editing"); + teardownEditRequestCurlExample(opblock); + const body = getRequestExampleBody(modelExample, operation, opblock); + const curl = buildCurlCommand(opblock, operation, spec, body); + renderRequestCurlTermy(wrapper, curl); + renderRequestJsonSchema(wrapper, schema, spec); + + if (!wrapper.dataset.dstackSwaggerRequestExampleMode) { + setRequestExampleMode(modelExample, "curl"); + } else { + syncRequestExampleMode(modelExample); + } + } + + function setupOperationCurlExample(opblock, operation, spec) { + const wrapper = ensureOperationCurlWrapper(opblock); + const curl = buildCurlCommand(opblock, operation, spec, ""); + 
renderRequestCurlTermy(wrapper, curl); + } + + function ensureOperationCurlWrapper(opblock) { + let wrapper = opblock.querySelector(":scope > .dstack-swagger-operation-curl-example"); + if (wrapper) { + return wrapper; + } + + wrapper = document.createElement("div"); + wrapper.className = "dstack-swagger-operation-curl-example"; + wrapper.innerHTML = ` +
    +
    +
    + `; + opblock.querySelector(":scope > .opblock-summary")?.after(wrapper); + return wrapper; + } + + function teardownOperationCurlExample(opblock) { + opblock.querySelector(":scope > .dstack-swagger-operation-curl-example")?.remove(); + } + + function getRequestCurlWrapper(modelExample) { + const previous = modelExample.previousElementSibling; + return previous?.classList.contains("dstack-swagger-request-example") + ? previous + : null; + } + + function ensureRequestCurlWrapper(modelExample) { + let wrapper = getRequestCurlWrapper(modelExample); + if (wrapper) { + return wrapper; + } + + wrapper = document.createElement("div"); + wrapper.className = "dstack-swagger-request-example"; + wrapper.innerHTML = ` +
      +
    • + +
    • +
    • + +
    • +
    +
    +
    +
    +
    +
    + + `; + wrapper + .querySelector(".dstack-swagger-request-curl-tab") + .addEventListener("click", () => setRequestExampleMode(modelExample, "curl")); + wrapper + .querySelector(".dstack-swagger-request-schema-tab") + .addEventListener("click", () => setRequestExampleMode(modelExample, "schema")); + modelExample.before(wrapper); + return wrapper; + } + + function renderRequestCurlTermy(wrapper, curl) { + const termy = wrapper.querySelector(".dstack-swagger-request-curl-termy"); + if (!termy || termy.dataset.dstackSwaggerCurl === curl) { + return; + } + + termy.dataset.termynalCopy = "true"; + termy.dataset.termynalInstant = "true"; + termy.dataset.termynalMaxHeight = "calc(var(--dstack-swagger-curl-max-height) - 90px)"; + termy.dstackTermynalCopyText = curl; + termy.innerHTML = '
    '; + const highlight = termy.querySelector(".highlight"); + highlight.dataset.termynalSingleInput = "true"; + termy.querySelector("code").textContent = `$ ${curl}`; + termy.dataset.dstackSwaggerCurl = curl; + if (typeof setupTermynal === "function") { + setupTermynal(termy); + } + } + + function renderRequestJsonSchema(wrapper, schema, spec) { + const pre = wrapper.querySelector(".dstack-swagger-request-json-schema"); + renderJsonSchemaPre(pre, schema, spec); + } + + function getJsonEditorHtml(className) { + return ` +
    +
    +
    + `; + } + + function renderJsonSchemaPre(pre, schema, spec) { + const schemaDocument = buildJsonSchemaDocument(schema, spec); + const schemaText = JSON.stringify(orderJsonSchemaKeys(schemaDocument), null, 2); + renderJsonEditorPre(pre, schemaText); + } + + function renderJsonEditorPre(editor, text) { + const code = editor?.querySelector(":scope > pre > code, :scope > code"); + if (!editor || !code) { + return; + } + + const jsonText = normalizeJsonText(text); + setupJsonEditorCopyButton(editor, jsonText); + if (code.dataset.dstackSwaggerJson === jsonText) { + return; + } + + code.innerHTML = highlightJson(jsonText); + code.dataset.dstackSwaggerJson = jsonText; + } + + function setupJsonEditorCopyButton(pre, text) { + let nav = pre.querySelector(":scope > .md-code__nav"); + if (!nav) { + nav = document.createElement("nav"); + nav.className = "md-code__nav"; + pre.insertBefore(nav, pre.firstChild); + } + + let button = nav.querySelector(":scope > .md-code__button[data-md-type='copy']"); + if (!button) { + button = document.createElement("button"); + button.className = "md-code__button"; + button.type = "button"; + button.title = "Copy to clipboard"; + button.dataset.mdType = "copy"; + nav.appendChild(button); + } + button.dataset.clipboardText = text; + } + + function normalizeJsonText(value) { + const text = String(value || "").trim(); + if (!text) { + return ""; + } + try { + return JSON.stringify(JSON.parse(text), null, 2); + } catch { + return text; + } + } + + function stringifyJsonValue(value) { + if (typeof value === "string") { + return JSON.stringify(value, null, 2); + } + return JSON.stringify(value, null, 2); + } + + function highlightJson(json) { + return json.replace( + /("(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(?:true|false|null)\b|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)/g, + (token) => { + const escaped = escapeHtml(token); + if (token.startsWith('"')) { + const className = token.endsWith(":") ? 
"nt" : "s2"; + return `${escaped}`; + } + if (/true|false|null/.test(token)) { + return `${escaped}`; + } + const className = /[.eE]/.test(token) ? "mf" : "mi"; + return `${escaped}`; + } + ); + } + + function escapeHtml(value) { + return String(value) + .replace(/&/g, "&") + .replace(//g, ">"); + } + + function setRequestExampleMode(modelExample, mode) { + const wrapper = getRequestCurlWrapper(modelExample); + if (!wrapper) { + return; + } + + wrapper.dataset.dstackSwaggerRequestExampleMode = mode; + syncRequestExampleMode(modelExample); + } + + function syncRequestExampleMode(modelExample) { + const wrapper = getRequestCurlWrapper(modelExample); + if (!wrapper) { + return; + } + + const mode = wrapper.dataset.dstackSwaggerRequestExampleMode || "curl"; + const isSchema = mode === "schema"; + const isEditing = wrapper.classList.contains("dstack-swagger-request-example-editing"); + wrapper.querySelector(".dstack-swagger-request-curl-panel").hidden = isSchema; + wrapper.querySelector(".dstack-swagger-request-schema-panel").hidden = !isSchema; + wrapper.querySelector(".dstack-swagger-request-curl-tab").closest("li").hidden = isEditing; + modelExample.classList.toggle("dstack-swagger-request-model-hidden", !isEditing); + + wrapper.querySelectorAll(".tab li").forEach((item) => item.classList.remove("active")); + wrapper.querySelectorAll(".tablinks").forEach((button) => { + const selected = + (isSchema && button.classList.contains("dstack-swagger-request-schema-tab")) || + (!isSchema && button.classList.contains("dstack-swagger-request-curl-tab")); + button.setAttribute("aria-selected", selected ? 
"true" : "false"); + button.closest("li")?.classList.toggle("active", selected); + }); + } + + function teardownEditRequestCurlExample(opblock) { + opblock.querySelector(":scope .dstack-swagger-edit-curl-example")?.remove(); + } + + function getRequestJsonSchema(operation, opblock) { + const mediaType = getRequestContentType(opblock, operation); + return operation?.requestBody?.content?.[mediaType]?.schema || + getContentSchema(operation?.requestBody?.content); + } + + function buildJsonSchemaDocument(schema, spec) { + const definitions = {}; + const seenDefinitions = new Set(); + const addDefinition = (name) => { + if (!name || seenDefinitions.has(name)) { + return Boolean(name); + } + const componentSchema = spec.components?.schemas?.[name]; + if (!componentSchema) { + return false; + } + seenDefinitions.add(name); + definitions[name] = transformOpenApiSchemaToJsonSchema( + componentSchema, + addDefinition, + spec + ); + return true; + }; + + const rootSchema = resolveSchema(schema, spec) || schema; + const schemaDocument = { + $schema: "https://json-schema.org/draft/2020-12/schema", + ...transformOpenApiSchemaToJsonSchema(rootSchema, addDefinition, spec), + }; + if (Object.keys(definitions).length > 0) { + schemaDocument.$defs = orderJsonSchemaDefinitions(definitions); + } + return schemaDocument; + } + + function orderJsonSchemaDefinitions(definitions) { + return Object.fromEntries( + Object.entries(definitions).map(([name, definition]) => [ + name, + orderJsonSchemaKeys(definition), + ]) + ); + } + + function orderJsonSchemaKeys(value) { + if (Array.isArray(value)) { + return value.map(orderJsonSchemaKeys); + } + if (!value || typeof value !== "object") { + return value; + } + + const keyOrder = [ + "$schema", + "$id", + "$ref", + "type", + "const", + "enum", + "required", + "properties", + "items", + "additionalProperties", + "oneOf", + "anyOf", + "allOf", + "not", + "format", + "minimum", + "maximum", + "exclusiveMinimum", + 
"exclusiveMaximum", + "minLength", + "maxLength", + "pattern", + "minItems", + "maxItems", + "uniqueItems", + "description", + "default", + "$defs", + ]; + const ordered = {}; + keyOrder.forEach((key) => { + if (Object.prototype.hasOwnProperty.call(value, key)) { + ordered[key] = orderJsonSchemaKeys(value[key]); + } + }); + Object.keys(value).forEach((key) => { + if (!Object.prototype.hasOwnProperty.call(ordered, key)) { + ordered[key] = orderJsonSchemaKeys(value[key]); + } + }); + return ordered; + } + + function transformOpenApiSchemaToJsonSchema(schema, addDefinition, spec) { + if (!schema || typeof schema !== "object") { + return schema; + } + if (Array.isArray(schema)) { + return schema.map((item) => + transformOpenApiSchemaToJsonSchema(item, addDefinition, spec) + ); + } + + const nullable = schema.nullable === true; + const transformed = {}; + Object.entries(schema).forEach(([key, value]) => { + if (key === "nullable" || isOpenApiOnlySchemaKeyword(key)) { + return; + } + + if (key === "$ref" && typeof value === "string") { + Object.assign(transformed, transformSchemaRef(value, addDefinition, spec)); + return; + } + + if (key === "exclusiveMinimum") { + transformExclusiveLimit(transformed, "minimum", "exclusiveMinimum", value, schema); + return; + } + if (key === "exclusiveMaximum") { + transformExclusiveLimit(transformed, "maximum", "exclusiveMaximum", value, schema); + return; + } + + transformed[key] = transformOpenApiSchemaToJsonSchema(value, addDefinition, spec); + }); + if (schema.exclusiveMinimum === true && schema.minimum !== undefined) { + delete transformed.minimum; + } + if (schema.exclusiveMaximum === true && schema.maximum !== undefined) { + delete transformed.maximum; + } + + return nullable ? 
addNullableType(transformed) : transformed; + } + + function isOpenApiOnlySchemaKeyword(key) { + return ["discriminator", "example", "externalDocs", "xml"].includes(key); + } + + function transformSchemaRef(ref, addDefinition, spec) { + const schemaName = getComponentSchemaRefName(ref); + if (!schemaName) { + return { $ref: ref }; + } + if (addDefinition(schemaName)) { + return { $ref: `#/$defs/${schemaName}` }; + } + + const componentSchema = spec.components?.schemas?.[schemaName]; + if (!componentSchema) { + return { $ref: ref }; + } + return { $ref: ref }; + } + + function getComponentSchemaRefName(ref) { + const prefix = "#/components/schemas/"; + return ref.startsWith(prefix) ? ref.slice(prefix.length) : null; + } + + function transformExclusiveLimit(target, limitKey, exclusiveKey, value, source) { + if (typeof value !== "boolean") { + target[exclusiveKey] = value; + return; + } + if (value && source[limitKey] !== undefined) { + target[exclusiveKey] = source[limitKey]; + delete target[limitKey]; + } + } + + function addNullableType(schema) { + if (typeof schema.type === "string") { + return { + ...schema, + type: schema.type === "null" ? "null" : [schema.type, "null"], + }; + } + if (Array.isArray(schema.type)) { + return { + ...schema, + type: schema.type.includes("null") ? 
schema.type : [...schema.type, "null"], + }; + } + return { + anyOf: [ + schema, + { + type: "null", + }, + ], + }; + } + + function getRequestExampleBody(modelExample, operation, opblock) { + const cached = modelExample.dataset.dstackSwaggerCurlBody; + const bodyFromDom = getRequestExampleBodyFromDom(modelExample); + if (bodyFromDom) { + modelExample.dataset.dstackSwaggerCurlBody = bodyFromDom; + return bodyFromDom; + } + if (cached) { + return cached; + } + + const mediaType = getRequestContentType(opblock, operation); + const bodyFromSpec = getRequestExampleBodyFromSpec(operation, mediaType); + if (bodyFromSpec) { + modelExample.dataset.dstackSwaggerCurlBody = bodyFromSpec; + } + return bodyFromSpec; + } + + function getRequestExampleBodyFromDom(modelExample) { + const candidates = [ + ...modelExample.querySelectorAll("textarea:not(.curl), .body-param__text"), + ...modelExample.querySelectorAll( + "[role='tabpanel'] pre, [role='tabpanel'] code, pre, code" + ), + ]; + for (const candidate of candidates) { + const text = (candidate.value || candidate.textContent || "").trim(); + if (looksLikeRequestBody(text)) { + return text; + } + } + return ""; + } + + function looksLikeRequestBody(text) { + if (!text || /^(schema|object)$/i.test(text)) { + return false; + } + if (/^\s*[{[]/.test(text)) { + return true; + } + return text.length > 0 && text.length < 10000; + } + + function getRequestExampleBodyFromSpec(operation, mediaType) { + const media = operation.requestBody?.content?.[mediaType]; + if (!media) { + return ""; + } + if (media.example !== undefined) { + return stringifyCurlBody(media.example); + } + + const firstExample = Object.values(media.examples || {})[0]; + if (firstExample?.value !== undefined) { + return stringifyCurlBody(firstExample.value); + } + return ""; + } + + function stringifyCurlBody(value) { + return typeof value === "string" ? 
value : JSON.stringify(value, null, 2); + } + + function buildCurlCommand(opblock, operation, spec, body) { + const method = (opblock.querySelector(".opblock-summary-method")?.textContent || "GET") + .trim() + .toUpperCase(); + const parameters = getOperationParameters(opblock, operation, spec); + const url = getCurlUrl(opblock, parameters); + const mediaType = getRequestContentType(opblock, operation); + const lines = [`curl -X ${method} '${escapeShellSingleQuoted(url)}'`]; + + if (hasSecurity(operation, spec)) { + lines.push("-H 'Authorization: Bearer {user token}'"); + } + addCurlParameterHeaders(lines, parameters); + lines.push("-H 'Accept: application/json'"); + if (mediaType) { + lines.push(`-H 'Content-Type: ${escapeShellSingleQuoted(mediaType)}'`); + } + if (body && method !== "GET" && method !== "HEAD") { + lines.push(formatCurlBodyArgument(body)); + } + return lines.join(` \\\n${CURL_CONTINUATION_INDENT}`); + } + + function formatCurlBodyArgument(body) { + const value = indentCurlBody(escapeShellSingleQuoted(String(body).trim())); + return `-d '${value}'`; + } + + function indentCurlBody(value) { + return value.replace(/\n/g, `\n${CURL_CONTINUATION_INDENT}`); + } + + function getCurlUrl(opblock, parameters) { + const url = getOperationUrl(opblock); + const query = parameters + .filter((parameter) => parameter.in === "query" && parameter.required) + .map((parameter) => { + return `${encodeURIComponent(parameter.name)}=${getParameterPlaceholder(parameter.name)}`; + }) + .join("&"); + if (!query) { + return url; + } + return `${url}${url.includes("?") ? 
"&" : "?"}${query}`; + } + + function addCurlParameterHeaders(lines, parameters) { + parameters + .filter((parameter) => parameter.in === "header" && parameter.required) + .forEach((parameter) => { + lines.push( + `-H '${escapeShellSingleQuoted(parameter.name)}: ${escapeShellSingleQuoted( + getParameterPlaceholder(parameter.name) + )}'` + ); + }); + + const cookies = parameters + .filter((parameter) => parameter.in === "cookie" && parameter.required) + .map((parameter) => { + return `${parameter.name}=${getParameterPlaceholder(parameter.name)}`; + }); + if (cookies.length > 0) { + lines.push(`-H 'Cookie: ${escapeShellSingleQuoted(cookies.join("; "))}'`); + } + } + + function getOperationParameters(opblock, operation, spec) { + const path = opblock.querySelector(".opblock-summary-path")?.dataset.path; + const pathItem = path ? spec.paths?.[path] : null; + const parameters = new Map(); + [...(pathItem?.parameters || []), ...(operation.parameters || [])].forEach((parameter) => { + const resolvedParameter = resolveParameter(parameter, spec); + if (!resolvedParameter?.name || !resolvedParameter?.in) { + return; + } + parameters.set(`${resolvedParameter.in}:${resolvedParameter.name}`, resolvedParameter); + }); + return [...parameters.values()]; + } + + function resolveParameter(parameter, spec) { + if (typeof parameter?.$ref !== "string") { + return parameter; + } + const name = parameter.$ref.split("/").pop(); + return spec.components?.parameters?.[name] || parameter; + } + + function getParameterPlaceholder(name) { + return `{${name}}`; + } + + function getRequestContentType(opblock, operation) { + return ( + opblock.querySelector(".dstack-swagger-content-type-proxy select")?.value || + opblock.querySelector(".opblock-section-request-body .content-type-wrapper select")?.value || + Object.keys(operation?.requestBody?.content || {})[0] || + "" + ); + } + + function hasSecurity(operation, spec) { + if (Array.isArray(operation.security)) { + return 
operation.security.length > 0; + } + return Array.isArray(spec.security) && spec.security.length > 0; + } + + function escapeShellSingleQuoted(value) { + return String(value).replace(/'/g, "'\\''"); + } + + function getOperationAnchors(root) { + const pageRoot = root.closest(".md-content__inner, .md-typeset, article") || document; + const anchors = new Map(); + pageRoot.querySelectorAll(".dstack-swagger-operation-anchor").forEach((anchor) => { + const key = getOperationKey(anchor.dataset.openapiMethod, anchor.dataset.openapiPath); + if (key) { + anchors.set(key, anchor); + } + }); + return anchors; + } + + function getOperationKeyForOpblock(opblock) { + return getOperationKey( + opblock.querySelector(".opblock-summary-method")?.textContent, + opblock.querySelector(".opblock-summary-path")?.dataset.path + ); + } + + function getOperationKey(method, path) { + const normalizedMethod = (method || "").trim().toLowerCase(); + const normalizedPath = (path || "").trim(); + return normalizedMethod && normalizedPath ? `${normalizedMethod} ${normalizedPath}` : null; + } + + function setupOperationTocScrolling(root) { + if (document.documentElement.dataset.dstackSwaggerTocScrolling === "true") { + return; + } + document.documentElement.dataset.dstackSwaggerTocScrolling = "true"; + + document.addEventListener("click", (event) => { + const target = event.target instanceof Element ? 
event.target : event.target.parentElement; + const link = target?.closest("a[href*='#']"); + const anchor = getOperationTitleForLink(link); + if (!anchor) { + return; + } + + event.preventDefault(); + event.stopPropagation(); + event.stopImmediatePropagation(); + window.history.pushState(null, "", `#${anchor.id}`); + scrollToOperationTitle(anchor, "smooth"); + }, true); + + scrollToCurrentOperationHash(root); + } + + function scrollToCurrentOperationHash(root) { + if (root.dataset.dstackSwaggerHashScrolled === "true" || !window.location.hash) { + return; + } + const anchor = document.getElementById(decodeHashId(window.location.hash.slice(1))); + if (!anchor?.classList.contains("dstack-swagger-operation-title") || !root.contains(anchor)) { + return; + } + + root.dataset.dstackSwaggerHashScrolled = "true"; + window.requestAnimationFrame(() => scrollToOperationTitle(anchor)); + } + + function getOperationTitleForLink(link) { + if (!link) { + return null; + } + + const url = new URL(link.getAttribute("href"), window.location.href); + if ( + url.origin !== window.location.origin || + url.pathname !== window.location.pathname || + url.search !== window.location.search || + !url.hash + ) { + return null; + } + + const anchor = document.getElementById(decodeHashId(url.hash.slice(1))); + return anchor?.classList.contains("dstack-swagger-operation-title") ? 
anchor : null; + } + + function scrollToOperationTitle(anchor, behavior = "auto") { + const style = window.getComputedStyle(anchor); + const scrollMargin = Number.parseFloat(style.scrollMarginTop) || 0; + const top = anchor.getBoundingClientRect().top + window.scrollY - scrollMargin; + window.scrollTo({ top, behavior }); + } + + function decodeHashId(hashId) { + try { + return decodeURIComponent(hashId); + } catch { + return hashId; + } + } + + function stripSchemaTitles(spec) { + const seen = new WeakSet(); + const strip = (schema) => { + if (!schema || typeof schema !== "object" || seen.has(schema)) { + return; + } + seen.add(schema); + + delete schema.title; + + Object.values(schema.properties || {}).forEach(strip); + ["items", "additionalProperties", "not"].forEach((key) => { + if (schema[key] && typeof schema[key] === "object") { + strip(schema[key]); + } + }); + ["allOf", "anyOf", "oneOf"].forEach((key) => { + if (Array.isArray(schema[key])) { + schema[key].forEach(strip); + } + }); + }; + + Object.values(spec.components?.schemas || {}).forEach(strip); + Object.values(spec.paths || {}).forEach((pathItem) => { + Object.values(pathItem || {}).forEach((operation) => { + (operation.parameters || []).forEach((parameter) => strip(parameter.schema)); + stripContentSchemas(operation.requestBody?.content, strip); + Object.values(operation.responses || {}).forEach((response) => { + stripContentSchemas(response.content, strip); + }); + }); + }); + } + + function stripContentSchemas(content, strip) { + Object.values(content || {}).forEach((mediaType) => { + strip(mediaType?.schema); + }); + } + + function setupSchemaNameBadges(root, spec) { + let scheduled = false; + const update = () => { + scheduled = false; + updateSchemaNameBadges(root, spec); + }; + const scheduleUpdate = () => { + if (scheduled) { + return; + } + scheduled = true; + window.requestAnimationFrame(update); + }; + + const observer = new MutationObserver(scheduleUpdate); + observer.observe(root, { + 
childList: true, + subtree: true, + }); + scheduleUpdate(); + } + + function setupModelPropertyLabels(root, spec) { + let scheduled = false; + const update = () => { + scheduled = false; + updateModelPropertyLabels(root, spec); + }; + const scheduleUpdate = () => { + if (scheduled) { + return; + } + scheduled = true; + window.requestAnimationFrame(update); + }; + + const observer = new MutationObserver(scheduleUpdate); + observer.observe(root, { + childList: true, + subtree: true, + }); + scheduleUpdate(); + } + + function updateSchemaNameBadges(root, spec) { + root.querySelectorAll(".opblock").forEach((opblock) => { + const operation = getOperationForOpblock(spec, opblock); + if (!operation) { + return; + } + + const schemas = getOperationSchemas(operation); + opblock.querySelectorAll(".model-example").forEach((modelExample, index) => { + const schema = schemas[index]; + if (!schema) { + return; + } + const rootArticle = modelExample.querySelector(".model-box > .json-schema-2020-12"); + if (rootArticle) { + updateSchemaArticle(rootArticle, schema, spec); + } + }); + }); + } + + function getOperationForOpblock(spec, opblock) { + const method = (opblock.querySelector(".opblock-summary-method")?.textContent || "") + .trim() + .toLowerCase(); + const path = opblock.querySelector(".opblock-summary-path")?.dataset.path; + if (!method || !path) { + return null; + } + return spec.paths?.[path]?.[method] || null; + } + + function getOperationSchemas(operation) { + const schemas = []; + const requestSchema = getContentSchema(operation.requestBody?.content); + if (requestSchema && !isEmptySchema(requestSchema)) { + schemas.push(requestSchema); + } + + Object.values(operation.responses || {}).forEach((response) => { + const responseSchema = getContentSchema(response.content); + if (responseSchema && !isEmptySchema(responseSchema)) { + schemas.push(responseSchema); + } + }); + return schemas; + } + + function getContentSchema(content) { + for (const mediaType of 
Object.values(content || {})) { + if (mediaType?.schema) { + return mediaType.schema; + } + } + return null; + } + + function isEmptySchema(schema) { + return Object.keys(schema || {}).length === 0; + } + + function updateModelPropertyLabels(root, spec) { + root.querySelectorAll(".opblock").forEach((opblock) => { + const operation = getOperationForOpblock(spec, opblock); + if (!operation) { + return; + } + + const schemas = getOperationSchemas(operation); + opblock.querySelectorAll(".model-example").forEach((modelExample, index) => { + const schema = schemas[index]; + if (!schema) { + return; + } + updateModelExample(modelExample, schema, spec); + }); + }); + + root.querySelectorAll(".model-example").forEach((modelExample) => { + modelExample + .querySelectorAll('.model-box-control[aria-expanded="true"]') + .forEach((button) => { + const schemaName = getModelSchemaName(button, spec); + const schema = schemaName ? spec.components?.schemas?.[schemaName] : null; + if (schema) { + updateModelWrapperPropertyLabels(button.parentElement, schema, spec); + } + }); + updateModelTitleLabels(modelExample, spec); + updateBareObjectControlTitles(modelExample); + }); + } + + function updateModelExample(modelExample, schema, spec) { + const modelBox = modelExample.querySelector(":scope .model-box"); + if (modelBox) { + updateModelControlTitle(modelBox.querySelector(":scope > .model-box-control"), schema, spec); + updateModelWrapperPropertyLabels(modelBox, schema, spec); + } + updateModelTitleLabels(modelExample, spec); + updateBareObjectControlTitles(modelExample); + } + + function updateModelWrapperPropertyLabels(wrapper, schema, spec) { + if (!wrapper) { + return; + } + const properties = getSchemaProperties(schema, spec); + if (!properties) { + return; + } + + wrapper + .querySelectorAll(":scope > .inner-object > table.model > tbody > tr.property-row") + .forEach((row) => { + const propertyName = getPropertyRowName(row); + const propertySchema = properties[propertyName]; + 
const label = getSimpleSchemaLabel(propertySchema, spec); + if (label) { + setPropertyRowLabel(row, label); + return; + } + + const valueCell = row.cells?.[1]; + updateModelControlTitle( + valueCell?.querySelector(".model-box-control"), + propertySchema, + spec + ); + updateModelWrapperPropertyLabels( + valueCell?.querySelector(".model-box"), + propertySchema, + spec + ); + }); + } + + function getModelTitle(button) { + return ( + button.querySelector(".model-title__text")?.textContent || + button.querySelector(".model-title")?.textContent || + "" + ).trim(); + } + + function getModelSchemaName(button, spec) { + if (!button) { + return null; + } + if (button.dataset.dstackSwaggerSchemaName) { + return button.dataset.dstackSwaggerSchemaName; + } + const title = getModelTitle(button); + const schemaName = getSchemaNameFromTitle(title, spec); + if (schemaName) { + button.dataset.dstackSwaggerSchemaName = schemaName; + } + return schemaName; + } + + function getSchemaNameFromTitle(title, spec) { + const normalized = (title || "").trim(); + if (!normalized) { + return null; + } + if (spec.components?.schemas?.[normalized]) { + return normalized; + } + + return null; + } + + function getPropertyRowName(row) { + const name = row.cells?.[0]?.textContent || ""; + return name.trim().replace(/\s*\*$/, ""); + } + + function setPropertyRowLabel(row, label) { + const valueCell = row.cells?.[1]; + if (!valueCell || valueCell.dataset.dstackSwaggerLabel === label) { + return; + } + valueCell.dataset.dstackSwaggerLabel = label; + valueCell.replaceChildren(); + + const labelNode = document.createElement("span"); + labelNode.className = "dstack-swagger-model-label"; + labelNode.textContent = label; + valueCell.appendChild(labelNode); + } + + function updateModelTitleLabels(container, spec) { + container.querySelectorAll(".model-box-control").forEach((button) => { + getModelSchemaName(button, spec); + const title = getModelTitle(button); + const schema = getSchemaForModelTitle(title, 
spec); + if (schema) { + updateModelControlTitle(button, schema, spec); + } + }); + } + + function updateBareObjectControlTitles(container) { + container.querySelectorAll(".model-box-control").forEach((button) => { + if ( + button.querySelector(".model-title__text") || + button.querySelector(":scope > .dstack-swagger-model-inline-title") || + !isBareObjectControl(button) + ) { + return; + } + insertModelControlTitle(button, "object"); + }); + } + + function isBareObjectControl(button) { + return Array.from(button.children).some((child) => { + const text = (child.textContent || "").trim().replace(/\s+/g, ""); + return text === "{...}" || text.startsWith("{..."); + }); + } + + function getSchemaForModelTitle(title, spec) { + const normalized = (title || "").trim(); + const schemaName = getSchemaNameFromTitle(normalized, spec); + if (schemaName) { + return spec.components?.schemas?.[schemaName] || null; + } + + const arrayMatch = normalized.match(/^array<(.+)>$/i); + if (arrayMatch) { + const itemName = arrayMatch[1].trim(); + if (spec.components?.schemas?.[itemName]) { + return { + type: "array", + items: { + $ref: `#/components/schemas/${itemName}`, + }, + }; + } + } + + return null; + } + + function updateModelControlTitle(button, schema, spec) { + const title = button?.querySelector(".model-title__text"); + const label = getModelTitleLabel(schema, spec); + if (!button || !label) { + return; + } + if (!title) { + insertModelControlTitle(button, label); + return; + } + if (title.dataset.dstackSwaggerTitle === label) { + return; + } + title.dataset.dstackSwaggerTitle = label; + title.textContent = label; + } + + function insertModelControlTitle(button, label) { + if (button.dataset.dstackSwaggerTitle === label) { + return; + } + + const existing = button.querySelector(":scope > .dstack-swagger-model-inline-title"); + if (existing) { + existing.textContent = label; + button.dataset.dstackSwaggerTitle = label; + return; + } + + const titleNode = 
document.createElement("span"); + titleNode.className = "dstack-swagger-model-inline-title"; + titleNode.textContent = label; + button.insertBefore(titleNode, getModelPlaceholderNode(button)); + button.dataset.dstackSwaggerTitle = label; + } + + function getModelPlaceholderNode(button) { + const directChildren = Array.from(button.children); + return ( + directChildren.find((child) => { + const text = (child.textContent || "").trim(); + return text === "{...}" || text.startsWith("{") || text === "[...]"; + }) || + directChildren.find((child) => child.classList.contains("model-toggle")) || + button.firstChild + ); + } + + function getSimpleSchemaLabel(schema, spec) { + if (!schema) { + return null; + } + + const refSchema = resolveSchema(schema, spec); + if (refSchema && refSchema !== schema) { + const label = getSimpleSchemaLabel(refSchema, spec); + if (label) { + return label; + } + } + + if (Array.isArray(schema.allOf) && schema.allOf.length === 1) { + return getSimpleSchemaLabel(schema.allOf[0], spec); + } + + const union = schema.anyOf || schema.oneOf; + if (Array.isArray(union) && union.length > 0) { + const labels = union.map((item) => getSimpleSchemaLabel(item, spec)); + if (labels.every(Boolean)) { + return labels.join(" | "); + } + return null; + } + + if (Array.isArray(schema.enum)) { + return formatEnumValues(schema.enum); + } + + if (schema.const !== undefined) { + return formatEnumValue(schema.const); + } + + if (schema.type === "array") { + const itemLabel = getSimpleSchemaLabel(schema.items || {}, spec); + return itemLabel ? `array<${itemLabel}>` : null; + } + + if (schema.type === "object") { + const additionalProperties = schema.additionalProperties; + if (additionalProperties && typeof additionalProperties === "object") { + const valueLabel = getSimpleSchemaLabel(additionalProperties, spec); + return valueLabel ? 
`object` : null; + } + return null; + } + + if (["string", "integer", "number", "boolean", "null"].includes(schema.type)) { + return schema.type; + } + + return null; + } + + function getModelTitleLabel(schema, spec) { + if (!schema) { + return null; + } + + const refSchema = resolveSchema(schema, spec); + if (refSchema && refSchema !== schema) { + return getModelTitleLabel(refSchema, spec); + } + + if (Array.isArray(schema.allOf) && schema.allOf.length === 1) { + return getModelTitleLabel(schema.allOf[0], spec); + } + + const union = schema.anyOf || schema.oneOf; + if (Array.isArray(union) && union.length > 0) { + const labels = uniqueLabels(union.map((item) => getModelTitleLabel(item, spec))); + return labels.length > 0 ? labels.join(" | ") : "oneOf"; + } + + const simpleLabel = getSimpleSchemaLabel(schema, spec); + if (simpleLabel) { + return simpleLabel; + } + + if (schema.type === "array") { + const itemLabel = getModelTitleLabel(schema.items || {}, spec); + return itemLabel ? `array<${itemLabel}>` : "array"; + } + + if (schema.type === "object" || schema.properties) { + return "object"; + } + + return null; + } + + function uniqueLabels(labels) { + const seen = new Set(); + return labels.filter((label) => { + if (!label || seen.has(label)) { + return false; + } + seen.add(label); + return true; + }); + } + + function formatEnumValue(value) { + if (value === null) { + return "null"; + } + if (value === undefined) { + return "undefined"; + } + return String(value); + } + + function formatEnumValues(values) { + return values.map(formatEnumValue).join(" | "); + } + + function updateSchemaArticle(article, schema, spec) { + const badge = article.querySelector( + ":scope > .json-schema-2020-12-head .json-schema-2020-12__attribute--primary" + ); + const label = getModelTitleLabel(schema, spec); + if (badge && label) { + const current = (badge.textContent || "").trim(); + const schemaName = getRefName(schema); + if (schemaName && current === schemaName) { + 
badge.textContent = label; + } + } + + const resolvedSchema = resolveSchema(schema, spec) || schema; + updateItemArticle(article, resolvedSchema, spec); + updatePropertyArticles(article, resolvedSchema, spec); + } + + function updateItemArticle(article, schema, spec) { + const itemArticle = article.querySelector( + ":scope > .json-schema-2020-12-body .json-schema-2020-12-keyword--items > article.json-schema-2020-12" + ); + if (itemArticle && schema?.items) { + updateSchemaArticle(itemArticle, schema.items, spec); + } + } + + function getSchemaProperties(schema, spec) { + const resolvedSchema = resolveSchema(schema, spec) || schema; + if (resolvedSchema?.properties) { + return resolvedSchema.properties; + } + + if (Array.isArray(resolvedSchema?.allOf)) { + return resolvedSchema.allOf.reduce((properties, item) => { + return { + ...properties, + ...(getSchemaProperties(item, spec) || {}), + }; + }, {}); + } + + return null; + } + + function updatePropertyArticles(article, schema, spec) { + const properties = resolveSchema(schema, spec)?.properties || schema?.properties; + if (!properties) { + return; + } + + article + .querySelectorAll( + ":scope > .json-schema-2020-12-body .json-schema-2020-12-keyword--properties > ul > li > article.json-schema-2020-12" + ) + .forEach((propertyArticle) => { + const propertyName = getDirectTitle(propertyArticle); + const propertySchema = properties[propertyName]; + if (propertySchema) { + updateSchemaArticle(propertyArticle, propertySchema, spec); + } + }); + } + + function getDirectTitle(article) { + return ( + article + .querySelector(":scope > .json-schema-2020-12-head .json-schema-2020-12__title") + ?.textContent || "" + ).trim(); + } + + function resolveSchema(schema, spec) { + const ref = getRef(schema); + if (!ref) { + return schema; + } + const name = ref.split("/").pop(); + return spec.components?.schemas?.[name] || schema; + } + + function getRefName(schema) { + const ref = getRef(schema); + return ref ? 
ref.split("/").pop() : null; + } + + function getRef(schema) { + if (typeof schema?.$ref === "string") { + return schema.$ref; + } + if (Array.isArray(schema?.allOf) && schema.allOf.length === 1) { + return getRef(schema.allOf[0]); + } + return null; + } + + if (document.readyState === "loading") { + window.addEventListener("DOMContentLoaded", initSwaggerReferences); + } else { + initSwaggerReferences(); + } +})(); diff --git a/mkdocs/assets/javascripts/termynal.js b/mkdocs/assets/javascripts/termynal.js new file mode 100644 index 0000000000..70f99eabc9 --- /dev/null +++ b/mkdocs/assets/javascripts/termynal.js @@ -0,0 +1,312 @@ +/** + * termynal.js + * A lightweight, modern and extensible animated terminal window, using + * async/await. + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */ + +'use strict'; + +/** Generate a terminal widget. */ +class Termynal { + /** + * Construct the widget's settings. + * @param {(string|Node)=} container - Query selector or container element. + * @param {Object=} options - Custom settings. + * @param {string} options.prefix - Prefix to use for data attributes. + * @param {number} options.startDelay - Delay before animation, in ms. + * @param {number} options.typeDelay - Delay between each typed character, in ms. + * @param {number} options.lineDelay - Delay between each line, in ms. + * @param {number} options.progressLength - Number of characters displayed as progress bar. + * @param {string} options.progressChar – Character to use for progress bar, defaults to █. + * @param {number} options.progressPercent - Max percent of progress. + * @param {string} options.cursor – Character to use for cursor, defaults to ▋. + * @param {Object[]} lineData - Dynamically loaded line data objects. + * @param {boolean} options.noInit - Don't initialise the animation. + */ + constructor(container = '#termynal', options = {}) { + this.container = (typeof container === 'string') ? 
document.querySelector(container) : container; + this.pfx = `data-${options.prefix || 'ty'}`; + this.originalStartDelay = this.startDelay = this.getOptionNumber( + options.startDelay, + `${this.pfx}-startDelay`, + 300 + ); + this.originalTypeDelay = this.typeDelay = this.getOptionNumber( + options.typeDelay, + `${this.pfx}-typeDelay`, + 60 + ); + this.originalLineDelay = this.lineDelay = this.getOptionNumber( + options.lineDelay, + `${this.pfx}-lineDelay`, + 1500 + ); + this.progressLength = options.progressLength + || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40; + this.progressChar = options.progressChar + || this.container.getAttribute(`${this.pfx}-progressChar`) || '█'; + this.progressPercent = options.progressPercent + || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100; + this.cursor = options.cursor + || this.container.getAttribute(`${this.pfx}-cursor`) || '▋'; + this.lineData = this.lineDataToElements(options.lineData || []); + this.lineContainer = null; + this.loadLines() + if (!options.noInit) this.init() + } + + getOptionNumber(value, attr, fallback) { + if (value !== undefined && value !== null) { + return value; + } + const attrValue = parseFloat(this.container.getAttribute(attr)); + return Number.isNaN(attrValue) ? fallback : attrValue; + } + + loadLines() { + // Load all the lines and create the container so that the size is fixed + // Otherwise it would be changing and the user viewport would be constantly + // moving as she/he scrolls + // Appends dynamically loaded lines to existing line elements. + this.lines = [...this.container.querySelectorAll(`[${this.pfx}]`)].concat(this.lineData); + const lineParent = this.container.classList.contains('dstack-termy-scrollable') ? 
+ this.getLineContainer() : this.container; + for (let line of this.lines) { + line.style.visibility = 'hidden' + lineParent.appendChild(line) + } + const restart = this.generateRestart() + restart.style.visibility = 'hidden' + this.container.appendChild(restart) + this.container.setAttribute('data-termynal', ''); + } + + /** + * Initialise the widget, get lines, clear container and start animation. + */ + init() { + /** + * Calculates width and height of Termynal container. + * If container is empty and lines are dynamically loaded, defaults to browser `auto` or CSS. + */ + const containerStyle = getComputedStyle(this.container); + this.container.style.width = containerStyle.width !== '0px' ? + containerStyle.width : undefined; + if (!this.container.classList.contains('dstack-termy-scrollable') && containerStyle.height !== '0px') { + this.container.style.minHeight = containerStyle.height; + } else { + this.container.style.minHeight = ''; + } + + this.container.setAttribute('data-termynal', ''); + this.container.innerHTML = ''; + this.lineContainer = null; + for (let line of this.lines) { + line.style.visibility = 'visible' + } + this.start(); + } + + /** + * Start the animation and rener the lines depending on their data attributes. 
+ */ + async start() { + await this._wait(this.startDelay); + + for (let line of this.lines) { + const type = line.getAttribute(this.pfx); + const delay = line.getAttribute(`${this.pfx}-delay`) || this.lineDelay; + + if (type == 'input') { + line.setAttribute(`${this.pfx}-cursor`, this.cursor); + await this.type(line); + await this._wait(delay); + } + + else if (type == 'progress') { + await this.progress(line); + await this._wait(delay); + } + + else { + this.appendLine(line); + await this._wait(delay); + } + + line.removeAttribute(`${this.pfx}-cursor`); + } + // Keep cursor visible if the last input has no output after it + let lastInputIdx = -1; + let hasOutputAfter = false; + for (let i = this.lines.length - 1; i >= 0; i--) { + if (this.lines[i].getAttribute(this.pfx) === 'input') { + lastInputIdx = i; + break; + } + if (this.lines[i].textContent.trim()) { + hasOutputAfter = true; + } + } + if (lastInputIdx >= 0 && !hasOutputAfter) { + this.lines[lastInputIdx].setAttribute(`${this.pfx}-cursor`, this.cursor); + } + this.addRestart() + this.lineDelay = this.originalLineDelay + this.typeDelay = this.originalTypeDelay + this.startDelay = this.originalStartDelay + } + + generateRestart() { + const restart = document.createElement('a') + restart.onclick = (e) => { + e.preventDefault() + this.container.innerHTML = '' + this.init() + } + restart.href = 'javascript:void(0)' + restart.setAttribute('data-terminal-control', '') + restart.innerHTML = "restart ↻" + return restart + } + + addRestart() { + const restart = this.generateRestart() + this.container.appendChild(restart) + } + + /** + * Animate a typed line. + * @param {Node} line - The line element to render. 
+ */ + async type(line) { + const chars = [...line.textContent]; + line.textContent = ''; + this.appendLine(line); + const delay = line.getAttribute(`${this.pfx}-typeDelay`) || this.typeDelay; + if (delay <= 0) { + line.textContent = chars.join(''); + return; + } + + for (let char of chars) { + await this._wait(delay); + line.textContent += char; + } + } + + /** + * Animate a progress bar. + * @param {Node} line - The line element to render. + */ + async progress(line) { + const progressLength = line.getAttribute(`${this.pfx}-progressLength`) + || this.progressLength; + const progressChar = line.getAttribute(`${this.pfx}-progressChar`) + || this.progressChar; + const chars = progressChar.repeat(progressLength); + const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) + || this.progressPercent; + const typeDelay = line.getAttribute(`${this.pfx}-typeDelay`) + || this.typeDelay; + line.textContent = ''; + this.appendLine(line); + if (typeDelay <= 0) { + line.textContent = this.getFullProgressText(progressLength, progressChar, progressPercent); + return; + } + + for (let i = 1; i < chars.length + 1; i++) { + await this._wait(typeDelay); + const percent = Math.round(i / chars.length * 100); + line.textContent = `${chars.slice(0, i)} ${percent}%`; + if (percent>progressPercent) { + break; + } + } + } + + appendLine(line) { + this.getLineContainer().appendChild(line); + } + + getLineContainer() { + if (!this.container.classList.contains('dstack-termy-scrollable')) { + return this.container; + } + if (!this.lineContainer) { + this.lineContainer = document.createElement('div'); + this.lineContainer.setAttribute('data-termynal-body', ''); + this.container.appendChild(this.lineContainer); + } + return this.lineContainer; + } + + getFullProgressText(progressLength, progressChar, progressPercent) { + const visibleLength = Math.ceil(progressLength * progressPercent / 100); + return `${progressChar.repeat(visibleLength)} ${progressPercent}%`; + } + + /** + * 
Helper function for animation delays, called with `await`. + * @param {number} time - Timeout, in ms. + */ + _wait(time) { + if (time <= 0) { + return Promise.resolve(); + } + return new Promise(resolve => setTimeout(resolve, time)); + } + + /** + * Converts line data objects into line elements. + * + * @param {Object[]} lineData - Dynamically loaded lines. + * @param {Object} line - Line data object. + * @returns {Element[]} - Array of line elements. + */ + lineDataToElements(lineData) { + return lineData.map(line => { + let div = document.createElement('div'); + div.innerHTML = `${line.value || ''}`; + + return div.firstElementChild; + }); + } + + /** + * Helper function for generating attributes string. + * + * @param {Object} line - Line data object. + * @returns {string} - String of attributes. + */ + _attributes(line) { + let attrs = ''; + for (let prop in line) { + // Custom add class + if (prop === 'class') { + attrs += ` class=${line[prop]} ` + continue + } + if (prop === 'type') { + attrs += `${this.pfx}="${line[prop]}" ` + } else if (prop !== 'value') { + attrs += `${this.pfx}-${prop}="${line[prop]}" ` + } + } + + return attrs; + } +} + +/** +* HTML API: If current script has container(s) specified, initialise Termynal. 
+*/ +if (document.currentScript.hasAttribute('data-termynal-container')) { + const containers = document.currentScript.getAttribute('data-termynal-container'); + containers.split('|') + .forEach(container => new Termynal(container)) +} diff --git a/mkdocs/assets/stylesheets/extra.css b/mkdocs/assets/stylesheets/extra.css new file mode 100644 index 0000000000..937369742a --- /dev/null +++ b/mkdocs/assets/stylesheets/extra.css @@ -0,0 +1,1956 @@ +html { + --dstack-code-font-size: 14px; + --dstack-code-line-height: 1.075rem; +} + +@media screen and (min-width: 76.1875em) { + .md-header { + backdrop-filter: blur(5px); + background-color: rgba(255, 255, 255, 0); + border-bottom: 0.5px dotted rgba(0, 0, 0, 1); + padding-bottom: 2px; + } + + [dir=ltr] .md-header__source { + margin-left: 0; + width: 10rem; + } + + .md-source__facts { + font-size: 0.6rem; + } + + .md-source__fact:before { + height: 0.75rem; + } + + .md-source { + font-size: 0.75rem; + } +} + +@media screen and (max-width: 76.1875em) { + .md-header { + background-color: rgb(255, 255, 255); + } +} + +#kapa-widget-container .mantine-Text-root { + font-size: 16.5px !important; + font-weight: 500; + color: rgba(0, 0, 0, 1.87) !important; + text-shadow: none !important; + /*letter-spacing: -0.5px;*/ + line-height: 1.2; +} + +#kapa-widget-container .mantine-Button-inner:focus-visible { + border: none !important; +} + +#kapa-widget-container .mantine-Button-root { + height: 3.5rem !important; + width: 3.5rem !important; + background: rgba(255,255,255, 0.5) !important; + box-shadow: none !important; + border: 0.5px solid black !important; + border-radius: 4px; +} + +#kapa-widget-container .mantine-Image-image { + width: 27.5px !important; + height: 27.5px !important;; +} + +#kapa-widget-container .mantine-Image-root { + width: 27.5px !important; +} + +.mantine-Modal-body .mantine-Input-input { + font-size: 18px !important; +} + +.mantine-Modal-body .mantine-Text-root, .mantine-Modal-body .mantine-List-root, 
.mantine-Modal-body .mantine-List-item, .mantine-Modal-body .mantine-Prism-code { + font-size: 17px !important; +} + +.md-copyright { + width: inherit; +} + +.md-copyright__highlight { + font-size: 0.75rem; + margin-top: -8px; +} + +.md-copyright__highlight a { + font-weight: 700; +} + +.md-footer { + /*border-top: 1px solid #E4E4E7;*/ +} +.md-typeset a.md-footer__link { + margin: 0.5rem 0 0.4rem; + border: 1px solid black; + color: black; + border-radius: 5px; + flex: 1; +} + +.md-footer__link:focus, .md-footer__link:hover { + opacity: 1; +} + +.md-typeset a.md-footer__link:hover .md-ellipsis { + opacity: 1; +} + +.md-footer__title { + font-size: 17px; +} + +.md-footer__link--prev .md-footer__title { + display: block; +} + +.md-footer__direction { + opacity: 1; + font-weight: 800; +} + +.md-footer__title { + margin-top: 0.2rem; + margin-bottom: 0.5rem; + padding: 0; +} + +.md-header[data-md-state=shadow] { + box-shadow: none; +} +.md-sidebar__scrollwrap { + margin-top: -25px; + overflow-y: hidden; + + -webkit-mask-image: linear-gradient(rgba(0,0,0,1) 85%, rgba(0,0,0,0)); + mask-image: linear-gradient(rgba(0,0,0,1) 85%, rgba(0,0,0,0)); + + padding-bottom: 40px; +} + +.md-sidebar__scrollwrap:hover { + overflow-y: scroll; +} + +*::-webkit-scrollbar-thumb { + background: linear-gradient(0deg, white 0%, white 4%, #e5e5e9 5%, #e5e5e9 93%, white 94%, white 100%); + border-radius: 2px; + padding-top: 200px; +} + +.md-sidebar__scrollwrap::-webkit-scrollbar-thumb:hover { + background-color: var(--md-default-fg-color--lighter) +} + +.md-typeset :is(.admonition,details):is(.info,.tip,.warning,.c) { + background-color: rgba(0,0,0,0.005); + border-color: rgba(0,0,0,0.6); +} + +.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title) { + color: var(--md-default-fg-color); +} + +.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title,summary):before, +.md-typeset :is(.info,.tip,.warning)>:is(.admonition-title,summary):after { + background-color: var(--md-default-fg-color); 
+} + +[dir=ltr] .md-typeset :is(details) { + border-style: solid; + border-color: rgba(0, 0, 0, 0.87); + border-width: 1px; + border-radius: 0; + box-shadow: none; + padding: .6rem .8rem; + background: none; +} + +[dir=ltr] .md-typeset :is(.admonition) { + border-style: solid; + border-color: rgba(0, 0, 0, 0.87); + border-width: 1px; + border-radius: 0; + box-shadow: none; + padding: .6rem .8rem; + /*background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05));*/ +} + +[dir=ltr] + .md-typeset + :is(.admonition, details) + > :is(.md-typeset__scrollwrap, p, h4, h3, .tabbed-set, ul, ol):not(.admonition-title, summary) { + padding-left: 32px; +} + +[dir=ltr] + .md-typeset + :is(.admonition, details) + > :is(pre, blockquote, .highlight, .termy, div[editor-title]) { + margin-left: 32px; +} + +.md-typeset iframe { + border-radius: 0; +} + +.md-typeset__scrollwrap { + margin-top: 0; + margin-bottom: 0; + margin-block-start: 1em; + margin-block-end: 1em; +} + +.md-typeset__table { + margin-bottom: 0; +} + +.admonition-title, details > summary { + border: none !important; + background-color: transparent !important; +} + +.md-typeset .admonition, .md-typeset details { + font-size: 1em; +} + +[dir=ltr] .md-typeset :is(.admonition-title,summary) { + font-size: 18px !important; + /*letter-spacing: -0.5px;*/ + /*font-weight: 800;*/ + font-weight: 700; + /*padding-left: 18px;*/ + padding-bottom: 0; + border: none; + border-radius: 0; +} + +.md-typeset .admonition.info:focus-within, .md-typeset details.info:focus-within { + box-shadow: none; +} + +.md-typeset :is(.info,.tip, .warning)>:is(.admonition-title) { + margin-bottom: -12px; +} + +.md-typeset :is(.info,.tip, .warning)>:is(summary) { + margin-bottom: 12px; + font-size: 1.1em !important; +} + +.md-typeset :is(.info,.tip, .warning)>:is(.admonition-title,summary):before { + /*display: none;*/ +} + +.md-header__title { + margin-left: 1px !important; + font-weight: 700; + 
font-size: 20px; + /*padding-top: 2px;*/ +} + +@media screen and (min-width: 76.1875em) { + .md-header__topic:first-child { + font-size: 24px; + top: 2px; + left: 1px; + /*font-family: Poppins, metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif;*/ + /*font-weight: 500;*/ + } + + .md-header__title { + flex-grow: unset; + } + + .md-header__topic { + position: relative; + } + + .md-header__title--active .md-header__topic { + opacity: 1; + transition: inherit; + transform: inherit; + pointer-events: auto; + } + + .md-header__title--active .md-header__topic+.md-header__topic { + opacity: 0; + } + + .md-header__topic+.md-header__topic { + display: none; + } +} + +@media screen and (max-width: 76.1875em) { + .md-nav--primary .md-nav__title .md-logo { + padding: 0.1rem 0.4rem; + } + .md-nav__title .md-nav__button.md-logo :-webkit-any(img,svg) { + max-width: 100px; + } + .md-nav--primary .md-nav__title { + display: block; + font-weight: 700; + /*display: none;*/ + height: 10px; + } + + .md-nav__source { + background-color: transparent; + } +} + +.md-nav--secondary:not(:has(ul)) { + display: none; +} + +.md-nav--secondary { + margin-top: 14px; +} + +.md-nav--secondary .md-nav__title { + background: transparent; + box-shadow: none; + padding: 6px 15px 5px; + font-size: 17.5px; + font-weight: 600; + color: rgba(0, 0, 0, 0.87); + position: relative; + margin-left: 8px; + margin-bottom: 4px; + /*letter-spacing: -0.5px;*/ +} + +@font-face { + font-family: 'Geist'; + font-style: normal; + font-weight: 100 900; + font-display: swap; + src: url(../fonts/Geist-Variable.woff2) format('woff2'); +} + +@font-face { + font-family: 'Geist Mono'; + font-style: normal; + font-weight: 100 900; + font-display: swap; + src: url(../fonts/GeistMono-Variable.woff2) format('woff2'); +} + +@font-face { + font-family: 'Geist Pixel Square'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: 
url(../fonts/GeistPixel-Square.woff2) format('woff2'); +} + +@font-face { + font-family: 'Geist Pixel Circle'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(../fonts/GeistPixel-Circle.woff2) format('woff2'); +} + +/* latin */ +@font-face { + font-family: 'Fira Mono'; + font-style: normal; + font-weight: 400; + src: local('Fira Mono Regular'), local('FiraMono-Regular'), url(https://fonts.gstatic.com/s/firamono/v6/N0bX2SlFPv1weGeLZDtgJv7Ss9XZYQ.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} + +/* latin */ +@font-face { + font-family: 'Fira Mono'; + font-style: normal; + font-weight: 500; + src: local('Fira Mono Medium'), local('FiraMono-Medium'), url(https://fonts.gstatic.com/s/firamono/v8/N0bS2SlFPv1weGeLZDto1d3HnvfUS5NBBA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} + +/* latin */ +@font-face { + font-family: 'Fira Mono'; + font-style: normal; + font-weight: 700; + src: local('Fira Mono Bold'), local('FiraMono-Bold'), url(https://fonts.gstatic.com/s/firamono/v6/N0bS2SlFPv1weGeLZDtondvHnvfUS5NBBA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} + +/*label.md-nav__link {*/ +/* font-weight: 500;*/ +/*}*/ + +.md-nav__link { + margin-top: 0; + color: black; +} + +.md-nav__link:hover { + color: var(--md-default-fg-color); +} + +.md-nav__item, .md-nav__link .md-typeset { + /*color: rgba(0,0,0,0.87);*/ + font-weight: 400; + font-size: 0.75rem; +} + +.md-nav__item .md-nav__link:hover:not(.md-nav__link--active) { + 
color: black; +} + +.md-typeset pre > code, .md-typeset code { + /*-webkit-font-smoothing: auto;*/ +} + +.md-sidebar.md-sidebar--secondary .md-typeset code { + color: rgba(0,0,0,0.87); +} + +.md-typeset :not(pre) :is(h1, h2, h3, h4, h5, h6) > code { + color: inherit; + background: inherit; + padding: 0; +} + +h4.doc-heading { + font-size: inherit; +} + +.md-typeset :not(pre, h1, h2, h3, h4, h5, h6) > code { + background-color: rgba(163, 68, 215, 0.05); + /*border: 1px solid #dce0e6;*/ + border-radius: 3px; + font-weight: 600; + color: var(--md-primary-fg-color); + text-align: center; + padding: 4px; + height: 16px; + margin: 0 4px; +} + +.md-typeset :is(h1, h2, h3, h4, h5, h6) > code { + background-color: inherit; + color: inherit; + /*padding: 0; + margin: 0;*/ +} +.md-typeset :is(h1, h2, h3, h4, h5, h6) > a > code { + font-size: inherit; + color: inherit; +} + + +.md-typeset :is(table) :not(pre, h1, h2, h3, h4, h5, h6) > code { + font-size: .85em; +} + +.md-typeset :not(pre, h1, h2, h3, h4, h5, h6) > code { + font-size: 0.65rem; +} + +.md-typeset :not(pre, h1, h2, h3, h4, h5, h6) > a code { + color: #ce00ff; +} + +.md-typeset pre > code { + background-color: rgb(21, 22, 29); + padding: 55px 15px 35px 30px; + border-radius: 3px; + font-size: var(--dstack-code-font-size); + line-height: var(--dstack-code-line-height); + + /*border-radius: 0;*/ + /*border-top: 1px solid #dce0e6;*/ + /*background-color: rgba(0,0,0,.87);*/ + /*padding: 15px 20px;*/ + /*font-size: .85em;*/ + font-family: var(--md-code-font-family) !important; + margin-top: 0; +} + +.md-typeset .highlight pre > code:before, .md-typeset div[editor-title] pre > code:before { + content: ''; + position: absolute; + top: 15px; + left: 15px; + display: inline-block; + width: 12px; + height: 12px; + border-radius: 50%; + /* A little hack to display the window buttons in one pseudo element. 
*/ + background: #d9515d; + /*-webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ + /* box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ + -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; + box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; +} + +.md-typeset :is(.highlight.dstack-scrollable-code, .dstack-editable-code) { + background-color: rgb(21, 22, 29); + box-sizing: border-box; + padding: 55px 15px 35px 30px; + position: relative; +} + +.md-typeset :is(.highlight.dstack-scrollable-code, .dstack-editable-code):before { + background: #d9515d; + border-radius: 50%; + content: ''; + display: inline-block; + height: 12px; + left: 15px; + position: absolute; + top: 15px; + width: 12px; + -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; + box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; +} + +.md-typeset .highlight.dstack-scrollable-code pre { + background: transparent; + margin: 0; + overflow: visible; +} + +.md-typeset .highlight.dstack-scrollable-code pre > code { + background: transparent; + box-sizing: border-box; + display: block; + line-height: var(--dstack-code-line-height); + max-height: var(--dstack-scrollable-code-max-height, none); + overflow: auto; + padding: 0; + scrollbar-color: #a2a2a2 rgba(255, 255, 255, 0.06); + scrollbar-width: thin; +} + +.md-typeset .dstack-editable-code textarea { + background: transparent; + border: 0; + box-sizing: border-box; + color: var(--md-code-fg-color); + display: block; + font-family: var(--md-code-font-family); + font-size: var(--dstack-code-font-size); + line-height: var(--dstack-code-line-height); + margin: 0; + max-height: var(--dstack-editable-code-max-height, none); + overflow: auto; + padding: 0; + scrollbar-color: #a2a2a2 rgba(255, 255, 255, 0.06); + scrollbar-width: thin; + width: 100%; +} + +.md-typeset .highlight.dstack-scrollable-code pre > code:before { + content: none; +} + +.md-typeset .highlight.dstack-scrollable-code pre > code::-webkit-scrollbar, +.md-typeset 
.dstack-editable-code textarea::-webkit-scrollbar { + height: 10px; + width: 10px; +} + +.md-typeset .highlight.dstack-scrollable-code pre > code::-webkit-scrollbar-track, +.md-typeset .dstack-editable-code textarea::-webkit-scrollbar-track { + background: rgba(255, 255, 255, 0.06); +} + +.md-typeset .highlight.dstack-scrollable-code pre > code::-webkit-scrollbar-thumb, +.md-typeset .dstack-editable-code textarea::-webkit-scrollbar-thumb { + background: #a2a2a2; + background-clip: content-box; + border: 2px solid transparent; + border-radius: 8px; +} + +.md-typeset .highlight.dstack-scrollable-code pre > code::-webkit-scrollbar-thumb:hover, +.md-typeset .dstack-editable-code textarea::-webkit-scrollbar-thumb:hover { + background: #e5e5e9; + background-clip: content-box; +} + +.md-typeset .highlight.dstack-scrollable-code > .md-code__nav { + background: transparent; + position: absolute; + right: 15px; + top: 9px; +} + +.md-typeset .highlight.dstack-scrollable-code > .md-code__nav .md-code__button { + color: #a2a2a2; + height: 24px; + padding: 0; + width: 24px; +} + +.md-typeset .highlight.dstack-scrollable-code > .md-code__nav .md-code__button:after { + height: 16px; + margin: 4px; + width: 16px; +} + +.md-typeset .highlight.dstack-scrollable-code:hover > .md-code__nav, +.md-typeset .highlight.dstack-scrollable-code > .md-code__nav:hover { + background: transparent; +} + +.md-typeset .highlight.dstack-scrollable-code:hover > .md-code__nav .md-code__button, +.md-typeset .highlight.dstack-scrollable-code > .md-code__nav:hover .md-code__button { + color: #eee; + opacity: 1; +} + +@media screen and (min-width: 60em) { + [data-md-color-primary=white] .md-search__form { + background-color: transparent; + border-radius: 0; + border: 0.5px solid rgba(0, 0, 0, 0.87); + height: 1.84rem; + } + + [data-md-color-primary=white] .md-search__form:hover { + background-color: transparent; + border: 0.5px solid black + } +} + +.md-search__input { + font-size: .7rem; +} + 
+.md-code__button:after { + width: 1em; + height: 1em; +} + +.md-code__nav { + top: 0.175em; + right: .15em; + background: none; +} + +.md-code__nav .md-code__button { + color: #a2a2a2; +} + +.md-code__nav:hover .md-code__button { + color: #eee; +} + +pre:hover .md-code__nav, +code .md-code__nav:hover { + background: transparent; +} + +code .md-code__nav:hover .md-code__button { + color: var(--md-accent-fg-color); +} + +.md-clipboard { + top: 0.65em; +} + +.md-clipboard:after { + color: var(--md-default-fg-color--lighter); +} + +.md-clipboard:hover:after { + color: var(--md-default-fg-color--light) +} + +.md-annotation:not([hidden]) { + line-height: 1.225; +} + +.md-annotation__index { + margin-left: 0; + margin-right: 0; +} + +.md-annotation__index:after { + background-color: rgba(0, 0, 0, 0.87); + transform: scale(0.9); +} + +@media screen and (max-width: 44.9375em) { + .md-typeset pre > code { + border: none; + } +} + +.footer__inner { + background-color: var(--md-footer-bg-color--dark); + margin-top: 0.7rem; +} + +.md-footer__inner:not([hidden]) { + gap: 20px !important; +} + +.md-footer__link .md-footer__link--next { + font-weight: 500; +} + +.md-typeset .admonition.note { + border-color: var(--md-primary-fg-color); +} + +.md-typeset { + line-height: 1.3rem; + font-size: 0.8rem; +} + +.md-typeset h1 { + margin: 0 0 0.75em; + font-size: 33px; +} + +/* Hide permalink anchor on top-level headings only */ +.md-typeset h1 > .headerlink { + display: none; +} + +.md-typeset h2 { + margin: 1.4em 0 0.64em; + padding-top: 0.2em; + font-size: 25px; +} + +.md-typeset :is(h2, h3, h4, h5, h6)[id] { + scroll-margin-top: 3rem; +} + +.md-typeset h1, .md-typeset h2 { + font-weight: 400; + /*letter-spacing: 0;*/ +} + +.md-typeset h1, .md-typeset h2, .md-typeset h3, .md-typeset h4, .md-typeset h5, .md-typeset h6 { + font-weight: 800; + /*letter-spacing: -1px;*/ + color: rgb(0, 0, 0); + text-transform: none; +} + +.md-typeset h4 { + font-size: 19.5px; +} + +.md-typeset h5 { + 
font-size: 17px; +} + +.md-typeset h6 { + font-size: 15px; +} + +.md-typeset h3 { + font-size: 21.5px; + margin-block-end: 0; + padding-bottom: 0.7em; + border-bottom: 1px solid rgba(243, 244, 246, 1); +} + +[data-md-color-scheme=slate][data-md-color-primary=black], [data-md-color-primary=white] { + --md-primary-fg-color: #0048ff; + --md-accent-fg-color: #ce00ff; + --md-typeset-a-color: #ce00ff; + --md-code-hl-function-color: #e3b4fb; + --md-code-hl-keyword-color: #e37cff; + --md-code-hl-string-color: #b4f9c6; + --md-code-fg-color: #eee; + /*--md-code-fg-color: rgba(0, 0, 0, 0.95);*/ + --md-mermaid-label-fg-color: rgba(0, 0, 0, 0.95); + --md-mermaid-edge-color: rgba(0, 0, 0, 0.95); + --md-code-hl-comment-color: #757585; + --md-code-hl-number-color: #d9548c; + --md-code-hl-operator-color: #5b6369; + --md-code-hl-punctuation-color: #5b6369; + --md-code-bg-color: #f7f7fb; + --md-code-hl-constant-color: var(--md-code-fg-color); + /*--md-primary-bg-color: white;*/ + --md-default-fg-color--light: rgba(0,0,0,.6); + --md-default-fg-color--lighter: rgb(159, 172, 190); + --md-default-fg-color--lightest: #f6f9fc; + --md-footer-fg-color--light: var(--md-default-fg-color); + --md-code-hl-color--light: rgba(197, 173, 255, 0.12); +} + +#__mermaid_0 .note { + stroke: var(--md-primary-fg-color) !important; +} + +.md-typeset .highlight :where(.l) { + /*color: #eee !important;*/ + /*color: var(--md-code-fg-color) !important;*/ +} + +.highlight .sd { + color: var(--md-code-hl-string-color); +} + +.highlight .na, .highlight .nv, .highlight .vc, .highlight .vg, .highlight .vi { + color: #c6c052; +} + +.md-typeset .highlight .md-clipboard:after { + display: none; +} + +.md-typeset .highlight .hll { + box-shadow: none; + /*box-shadow: 3px 0px 0px 0.1px var(--md-primary-fg-color) inset;*/ + margin: 0 -1.8em; + padding: 0 1.8em; +} + +body { + --md-text-font-family: Geist, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; + 
--md-code-font-family: 'Geist Mono', ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; +} + +.md-content { + /*border-left: 1px solid #E4E4E7;*/ +} + +.md-header__button.md-logo :where(img,svg) { + height: 1.25rem !important; +} + +.md-header__button.md-logo { + margin: 0 0 0 0.5rem; + padding: 0; +} + +.md-header__button.md-logo:hover { + opacity: 1; +} + +.md-main__inner { + /* DO NOT CHANGE: IT BREAKS SIDEBAR NAVIGATION */ + /*margin-top: 1.25em;*/ +} + +.md-sidebar__inner { + margin: 0 0 30px; +} + +@media screen and (min-width: 76.1875em) { + .md-sidebar--primary .md-nav__link, + .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link[for=__toc] { + display: none; + } + + .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link { + /*margin-left: 2px;*/ + } + + .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link--active { + border-left: 3px solid var(--md-typeset-a-color); + color: inherit; + border-image: linear-gradient(8deg, #0048ff, #ce00ff, #ce00ff, #ce00ff) 10; + margin-left: -2px; + font-size: 16.5px; + padding-left: 14px; + } + + .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link { + display: flex; + } + + /* Comment to siwtch to sections in sidebar */ + .md-sidebar__inner > .md-nav--primary > .md-nav__list:not(.md-post__meta) > .md-nav__item > .md-nav > .md-nav__list > .md-nav__item:not(.md-nav__item--section) { + display: none; + } + + .md-nav__item--section > .md-nav__link { + /*text-transform: uppercase;*/ + display: inline-block; + font-size: 17px; + font-weight: 600; + /* line-height: 1.4rem; */ + /*letter-spacing: -0.5px;*/ + position: relative; + left: -11px; + } + + .md-nav__item--section>.md-nav__link[for] { + color: rgba(0,0,0,0.87); + } + + .md-search__form > * { + z-index: 101; + } + + /* .md-search__form::before { + content: "⌘"; + color: white; + font-weight: 600; + position: absolute; + padding: 7px; + margin: 3.5px; + 
font-size: .65rem; + background-color: rgba(0,0,0,.87); + border-radius: 5px; + right: 37px; + z-index: 1; + width: 30px; + text-align: center; + } + + .md-search__form::after { + content: "K"; + color: white; + font-weight: 600; + position: absolute; + padding: 7px; + margin: 3.5px; + font-size: .65rem; + background-color: rgba(0,0,0,.87); + border-radius: 5px; + right: 1px; + top: 0; + width: 30px; + text-align: center; + } */ + + .md-nav--lifted > .md-nav__list > .md-nav__item > [for] { + display: none; + } +} + +.md-nav[data-md-level="2"] > .md-nav__list > .md-nav__item { + /*margin-left: -16px !important;*/ + border-left: 0.5px dotted rgba(0, 0, 0, 1); + /*background: red;*/ + margin-bottom: 0.5px; +} + +.md-nav[data-md-level="3"] > .md-nav__list > .md-nav__item:last-of-type { + margin-bottom: 6px; +} + +.md-sidebar--primary .md-nav__link, .md-sidebar--post .md-nav__link { + padding: 2px 15px 4px; + margin-top: 0; +} + +.md-nav.md-nav--secondary > .md-nav__list > .md-nav__item { + margin-left: 16px; + border-left: 0.5px dotted rgba(0, 0, 0, 0.4); + font-size: 16.5px; +} + +.md-sidebar--secondary .md-nav__link { + margin-left: -8px; + border-left: 2px solid transparent; + padding: 5px 15px 5px; + margin-right: 0 !important; +} + +.md-sidebar--secondary .md-nav__item { + padding-left: 0.35rem; +} + +.md-sidebar--secondary .md-nav__item .md-nav__link--passed { + color: inherit; +} + +.md-sidebar--secondary .md-nav__item .md-nav__link--active .md-typeset { + font-weight: 600; +} + +.md-sidebar--secondary .md-nav__item .md-nav__link--active { + border-left: 2.5px solid var(--md-typeset-a-color); + color: inherit; + border-image: linear-gradient(8deg, #0048ff, #ce00ff, #ce00ff, #ce00ff) 10; + margin-left: -9px; +} + +.md-nav__item .md-nav__link--active, .md-nav__item .md-nav__link--active:hover { + font-weight: 600; + color: inherit; +} + +.md-path__list { + margin-top: -7px; +} + +.md-path__link { + color: var(--md-default-fg-color); + font-size: 0.7rem; +} + 
+@media screen and (min-width: 76.1875em) { + .md-nav__item--section>.md-nav>.md-nav__list>li.md-nav__item:not(.md-nav__item--nested):not(.md-nav__item--active) { + margin-left: 10px; + /*border-left: 0.5px dotted rgba(0, 0, 0, 0.4);*/ + /*font-size: 16.5px;*/ + } + .md-sidebar--primary a.md-nav__link--active { + overflow: inherit; + } + .md-nav__item--section>.md-nav>.md-nav__list>.md-nav__item { + margin-left: 10px; + } + .md-nav__item--section { + margin: 0.6em 0; + } + /*MKDocs Insiders fix*/ + /*.md-sidebar--primary a.md-nav__link--active::before {*/ + /* content: "•";*/ + /* min-width: 1.1rem;*/ + /* font-size: 2rem;*/ + /* height: 0;*/ + /* display: flex;*/ + /* background-color: #000000;*/ + /* !*Add absolute positioning*!*/ + /* position: relative;*/ + /* top: -1.48rem;*/ + /* margin-left: -24px;*/ + /*}*/ + .md-sidebar--primary .md-nav__link--active { + font-weight: 600; + justify-content: start; + } +} + +.md-footer { + background-color: var(--md-default-bg-color); + color: inherit; + font-weight: 500; +} + +.md-footer-meta { + background-color: var(--md-default-bg-color); +} + +.md-typeset a { + /*letter-spacing: -0.5px;*/ +} + +html .md-footer-meta.md-typeset a:is(:focus,:hover) { + color: var(--md-primary-fg-color) !important; +} + +@media screen and (max-width: 76.1875em) { + .md-sidebar--primary .md-nav__link { + padding: inherit; + margin: 5px 5px; + } + + .md-sidebar--primary .md-nav__item .md-nav__link--active { + color: inherit; + background-color: inherit; + border-radius: inherit; + } + + .md-nav--primary .md-nav__title[for=__drawer] { + background-color: inherit; + } +} + +@media screen and (min-width: 76.25em) { + .md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary { + margin-bottom: 0; + } + + .md-nav--primary .md-nav__list { + padding-top: .15rem; + padding-bottom: .3rem; + } +} + +.md-typeset :where(ol, ul) { + list-style: none !important; +} + +[dir=ltr] .md-typeset ol li, [dir=ltr] .md-typeset ul li { + 
margin-left: 5px; + padding-left: 1rem; + position: relative; +} + +.md-typeset :where(ul) > li:before { + background-color: rgba(0,0,0,87); + border-radius: 50%; + content: ""; + height: 0.48em; + width: 0.48em; + left: 0.25em; + position: absolute; + top: 0.6875em; +} + +.md-typeset :where(ol) > li:before { + /*color: #6b7280;*/ + content: counter(list-item,var(--list-counter-style,decimal)) "."; + font-weight: 400; + left: 0; + position: absolute; +} + +.md-typeset :where(ol) { + margin-left: 0; +} + +.md-typeset :where(ul,ol) li :where(ul,ol) { + margin-top: 1.25em; + margin-bottom: 1.25em; +} + +@media screen and (min-width: 76.25em) { + [dir=ltr] .md-sidebar--primary:not([hidden]) ~ .md-content > .md-content__inner { + margin-left: 1.2rem; + } + + .md-content__inner:before { + display: none; + } +} + +.md-typeset .md-content__button { + color: var(--md-default-fg-color--light); +} + +/*.md-typeset p > img {*/ +/* border: 1px solid #E4E4E7;*/ +/*}*/ + +.md-typeset figure p img { + border: none; + display: inline-block; +} + +.md-typeset .grid.fit { + grid-template-columns: repeat(auto-fit,minmax(15rem,1fr)); +} + +.md-typeset .tabbed-labels>label:last-of-type { + margin-right: -2px; +} + +.md-typeset .tabbed-labels>label { + padding: 18px 18px 16px !important; + font-size: 16.5px !important; + line-height: 1.2 !important; + -webkit-font-smoothing: auto !important; + z-index: 1 !important; + background: none; + display: inline-block; + margin-right: -1px; +} + +.md-typeset .tabbed-labels--linked>label a { + /*MKDocs Insiders fix*/ + padding: initial; + font-weight: 700 !important; + color: rgba(0,0,0,0.83) !important; + min-width: 80px; + text-align: center; +} + +.md-typeset .tabbed-labels--linked>label>a code { + /*MKDocs Insiders fix*/ + background: initial; + font-weight: 700; + color: var(--md-typeset-color); +} + +.md-typeset .highlight :is(.nd,.ni,.nl,.nt), +.md-typeset .highlight :is(.k,.kd,.kn,.kp,.kr,.kt), +.md-typeset .highlight :is(.nc,.ne,.nf,.nn) 
{ + font-weight: 400; +} + +.md-typeset .tabbed-labels>label > code { + background-color: transparent; + /*letter-spacing: -0.25px;*/ +} + +.md-typeset .tabbed-set { + border-radius: 0; + display: block; +} + +.md-typeset .tabbed-block>.highlight, .md-typeset .tabbed-block>.termy { + margin-top: 20px; +} + +@media screen and (min-width: 76.1875em) { + .md-typeset .tabbed-block > .highlight:first-child > pre > code, .md-typeset .tabbed-block > pre:first-child > code { + border-radius: 3px; + } +} + +.md-typeset thead { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.05), rgb(0 114 255 / 0.5%), rgba(0, 42, 255, 0.025)); +} + +.md-typeset .tabbed-set>input:checked~.tabbed-labels code, +.md-typeset .tabbed-set>input:first-child:checked~.tabbed-labels>:first-child { + /*MKDocs Insiders fix*/ + /*font-weight: 500;*/ + /*color: var(--md-code-fg-color);*/ +} + +.js .md-typeset .tabbed-labels:before { +height: auto; + background: none; + z-index: 1; + padding: 5px; + border-radius: 0; + border: 0; + bottom: -0.7px; + top: -0.7px; + left: -0.7px; + right: -0.7px; + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)) !important; +} + +.md-typeset .tabbed-labels > label:after { + content: ""; + height: auto; + z-index: 1; + background: rgba(0,0,0, 0.25); + bottom: 0; + top: 0; + position: absolute; + width: 0.5px; + margin-left: -18.5px; + /* margin-top: 16px; + margin-bottom: 16px; */ +} + + +@media screen and (max-width: 44.984375em) { + [dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels:after { + padding-right: 0; + } +} + +.md-typeset .tabbed-labels { + box-shadow: none !important; + margin-bottom: -3px; + display: inline-block; + height: 100%; + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.025), rgb(0 114 255 / 0.25%), rgba(0, 42, 255, 0.0125)); + z-index: 1; + border-radius: 0; + border: 0.5px solid rgba(0,0,0, 0.5); + overflow: unset; +} + +.md-typeset table:not([class]) { + 
border-radius: 8px; + border: .5px solid rgba(0, 0, 0, .25); +} + +.md-typeset .grid { + grid-gap: 1.2rem; + display: grid; + grid-template-columns: repeat(auto-fill,minmax(15rem,1fr)); + margin: 1.6em 0; +} + +.md-typeset .grid.cards>:-webkit-any(ul,ol) { + display: contents; +} + +.md-typeset .grid.cards>:-webkit-any(ul,ol)>li strong, .md-typeset .grid>.card strong { + display: block; + color: var(--md-default-fg-color); + margin-bottom: 6px; + font-weight: 800; +} + +.md-typeset .grid.cards>:-webkit-any(ul,ol)>li a:hover strong, +.md-typeset .grid>.card a:hover strong { + color: var(--md-typeset-a-color); +} + +.md-typeset .grid.cards > :-webkit-any(ul,ol) > li a, +.md-typeset .grid.cards > :-webkit-any(ul,ol) > li span, +.md-typeset .grid > .card a { + color: var(--md-default-fg-color); + text-decoration: none; + display: block; + margin: 0; + padding: 1rem 1.4rem; + border-radius: 0; + border: rgba(0,0,0,0.6) 0.5px solid; +} + +.md-typeset .grid.cards>ol>li:focus-within,.md-typeset .grid.cards>ol>li:hover,.md-typeset .grid.cards>ul>li:focus-within,.md-typeset .grid.cards>ul>li:hover,.md-typeset .grid>.card:focus-within,.md-typeset .grid>.card:hover { + box-shadow: none; +} + +.md-typeset .grid.cards>:-webkit-any(ul,ol)>li, .md-typeset .grid>.card { + font-size: 95%; + line-height: 1.6; + padding: 0; +} + +.md-typeset .grid.cards>:-webkit-any(ul,ol)>li:before { + display: none; +} + +.md-header--shadow { + box-shadow: none; +} + +.md-tabs { + background-color: transparent; + flex-grow: 1; + width: initial; +} + +@media screen and (max-width: 76.25em) { + .md-typeset .tabbed-block p { + margin: 1em .8rem; + } + + .md-typeset .tabbed-block :is(h1, h2, h3, h4, h5, h6) { + margin-left: .8rem; + } + + .md-tabs { + display: none; + } + + [data-termynal]:before { + left: 25px; + } +} +@media screen and (min-width: 76.1875em) { + .md-tabs { + padding-left: 2.5rem; + display: none; + } + + .ready .md-tabs { + display: block; + } + + [data-md-color-primary=white] 
.md-tabs { + border-bottom: none; + } + + .md-tabs{ + overflow: visible; + contain: none; + } + + .md-tabs__item--active .md-tabs__link { + font-weight: 600; + } + + .md-tabs__item--active .md-tabs__link:after { + position: absolute; + background: -webkit-linear-gradient(100deg, #0048ff, #ce00ff); + content: ""; + width: 100%; + z-index: 1000; + height: 3px; + bottom: -5px; + } + + .md-tabs[hidden] .md-tabs__link { + opacity: inherit; + transform: inherit; + transition: inherit; + position: relative; + } + + .md-tabs[hidden] { + pointer-events: inherit; + } + + /*.md-nav__title { + display: none; + }*/ + + [dir=ltr] .md-tabs__list { + display: flex; + overflow: visible; + contain: none; + } + + .md-tabs__item { + display: flex; + height: 2.4rem; + padding-left: 0; + padding-right: 22px; + } + + .md-tabs__item:nth-child(1) { + display: none; + } + + + + .md-tabs__item:nth-child(6) { + margin-left: auto; + padding-right: 0.5rem; + } + + .md-tabs__item:nth-child(n+6) .md-tabs__link { + visibility: hidden; + width: 35px; + display: inline-block; + margin-top: 12px; + } + + .md-tabs__item:nth-child(n+6) .md-tabs__link:before { + width: 38px; + height: 38px; + margin-top: 4px; + visibility: visible; + } + + /* .twemoji.external { + position: relative; + top: 2.5px; + height: 18.5px; + margin-left: -3px; + } + + .tx-footer__section-link.external:after { + content: url('data:image/svg+xml,'); + line-height: 14px; + margin-left: 5px; + position: relative; + top: 1.5px; + margin-right: -7px; + } */ + + .md-tabs__item:nth-child(6) .md-tabs__link:before { + position: relative; + content: ''; + width: 34px; + height: 28px; + display: inline-block; + -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; + mask: url('data:image/svg+xml,') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + background-color: rgba(0,0,0,0.87); + /* background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); */ + margin-top: 1px; + } + + .md-tabs__link { + display: flex; + 
align-items: center; + } + + .md-tabs__link { + font-size: 0.88rem; + font-weight: 500; + color: rgba(0,0,0,0.87); + /*letter-spacing: -0.5px;*/ + margin-top: 1px; + line-height: 24px; + } + + .md-tabs__link:hover { + color: black; + } + + .md-tabs__link.md-tabs__link--active { + color: var(--md-accent-fg-color); + } + + .md-nav[aria-label="Community"] .md-nav__item:nth-child(1) { + display: flex; + } +} + +.md-source__repository--active { + font-weight: 700; +} + +.md-source__icon { + height: 2.6rem; +} + +.md-source { + font-size: 0.7rem; +} + +.md-source:hover { + opacity: 1; +} + +[dir=ltr] .md-source__icon+.md-source__repository { + margin-left: -2.6rem !important; +} + +.md-source__icon.md-icon svg { + height: 1.38rem; + width: 1.38rem; + fill: none; + margin-left: -0.1rem; + margin-top: 0.61rem; +} + +.md-source__facts:nth-child(2) { + display: none; +} + +.md-source__facts { + color: black; +} + +.md-social__link svg { + max-height: 1.2rem; +} + +@media screen and (min-width: 76.25em) { + .md-search { + margin-top: 2px; + } + + .md-search .md-search__inner { + /*padding-top: 0.58rem;*/ + margin-right: 8px; + } + + [data-md-toggle=search]:checked ~ .md-header .md-search__inner, .md-search__scrollwrap { + width: 30rem; + } + + [data-md-toggle=search]:checked~.md-header .md-search__form .md-search__input+.md-search__icon { + color: black; + } + + [data-md-toggle=search]:checked~.md-header .md-search__form { + border-radius: 5px 5px 0 0; + border: none; + border-bottom: 0.5px solid black; + } + + .md-search__scrollwrap { + border-radius: 0 0 5px 5px; + } + + [dir=ltr] .md-search__options { + right: 3.8rem; + } + + .md-search__options .md-icon svg { + width: 1rem; + height: 1rem; + } + + .md-search__options > * { + color: var(--md-default-fg-color--light); + } +} + +.md-search-result mark { + font-weight: 600; +} + +.md-search__input::placeholder { + color: inherit; +} + +.md-top { + font-weight: 500; + box-shadow: 0 0 0.2rem rgb(0 0 0 / 5%), 0 0.2rem 0.9rem 
rgb(0 0 0 / 10%); + color: var(--md-default-fg-color); + border-radius: 20px; +} + +.md-top:hover { + background-color: white; + color: var(--md-default-fg-color); +} + +@media screen and (max-width: 44.9375em) { + .md-typeset .tabbed-set pre > code, [data-termynal] { + border-radius: 0 !important; + } + + .md-typeset .termy { + margin-left: -0.8rem; + margin-right: -0.8rem; + } + + .md-typeset .tabbed-set .termy { + margin-left: 0; + margin-right: 0; + } + + .md-typeset blockquote { + margin-left: -0.8rem; + margin-right: -0.8rem; + border-radius: 0; + } + + .md-typeset .tabbed-set blockquote { + margin-left: 0; + margin-right: 0; + border-radius: 0; + } + + .md-typeset .tabbed-set { + margin: 0 -.8rem 0 -0.8rem; + overflow-x: clip; + } + + .md-typeset .tabbed-set > .tabbed-labels { + display: flex; + flex-wrap: nowrap; + width: 100%; + max-width: 100%; + overflow-x: auto; + overflow-y: hidden; + -webkit-overflow-scrolling: touch; + box-sizing: border-box; + } + + .md-typeset .tabbed-set > .tabbed-labels > label { + flex: 0 0 auto; + white-space: nowrap; + } + + .md-typeset .tabbed-set > .tabbed-content, + .md-typeset .tabbed-set > .tabbed-content > .tabbed-block { + max-width: 100%; + min-width: 0; + box-sizing: border-box; + } + + .md-typeset div[editor-title] code { + border-width: 0; + border-radius: 0 !important; + } + + .md-typeset div[editor-title] { + width: 100vw; + max-width: 100vw; + margin-left: calc(50% - 50vw); + margin-right: calc(50% - 50vw); + } + + .md-typeset div[editor-title] .highlight, + .md-typeset div[editor-title] pre, + .md-typeset div[editor-title] pre > code { + max-width: 100%; + min-width: 0; + box-sizing: border-box; + } + + .md-typeset div[editor-title] .highlight { + margin: 0; + } + + .md-typeset div[editor-title] pre { + overflow-x: auto; + -webkit-overflow-scrolling: touch; + } + + [dir=ltr] .md-typeset :is(.admonition,details) { + border-width: 0; + border-radius: 0 !important; + margin-left: -0.8rem; + margin-right: -0.8rem; + 
background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)) !important; + } + + [dir=ltr] .md-typeset details pre, + [dir=ltr] .md-typeset details blockquote, + [dir=ltr] .md-typeset details :is(.admonition,details, .termy) { + margin-left: 0 !important; + margin-right: 0 !important; + } + + [dir=ltr] .md-typeset details > :is(.md-typeset__scrollwrap,p,h4,h3,.tabbed-set,ul):not(.admonition-title,summary) { + padding-left: 0 !important; + padding-right: 0 !important; + } + + [dir=ltr] .md-typeset details :is(.md-typeset__scrollwrap,.highlight,.termy,div[editor-title],.tabbed-set,.md-typeset__table) { + width: calc(100% + 1.6rem); + max-width: calc(100% + 1.6rem); + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + box-sizing: border-box; + } + + [dir=ltr] .md-typeset details :is(pre,.md-typeset__scrollwrap,.tabbed-labels,.tabbed-content,.highlight,.termy,.md-typeset__table) { + max-width: 100%; + min-width: 0; + box-sizing: border-box; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + } + + [dir=ltr] .md-typeset details > .tabbed-set { + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + } + + [dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels { + margin-left: 0; + padding-left: 0; + } +} + +.md-banner { + background: black; + color: var(--md-default-bg-color); + margin-bottom: 1px; + font-weight: 500; +} + +.md-typeset.md-banner__inner { + margin: 0.15rem auto; + text-align: center; + font-size: 0.725rem; + font-weight: 300; + white-space: nowrap; +} + +.md-typeset.md-banner__inner strong { + font-weight: 600; +} + +.md-typeset.md-banner__inner a { + color: var(--md-default-bg-color); + /* border-bottom: 1.5px dotted; */ + /* font-weight: 500; */ + font-size: 0.75rem; +} + +.md-typeset.md-banner__inner .md-banner__button svg { + height: 20px; + max-width: 100%; + margin: 0.2rem 0; +} + +.md-typeset.md-banner__inner .md-banner__button { + color: 
var(--md-default-fg-color--lighter); +} + +.md-typeset.md-banner__inner .md-banner__button:hover { + color: var(--md-accent-fg-color); +} + +.md-typeset .footnote-backref { + vertical-align: inherit; +} + +.md-go-to-action.secondary.discord:before { + position: relative; + top: 5px; + content: url('data:image/svg+xml,'); + padding-right: 10px; +} + +.md-go-to-action.primary.discord:before { + position: relative; + top: 5px; + content: url('data:image/svg+xml,'); + padding-right: 10px; +} + +.md-go-to-action.github:before { + position: relative; + top: 5px; + content: url('data:image/svg+xml,'); + padding-right: 10px; +} + +[data-md-color-primary=white] .md-button { + border: 1.5px solid rgba(0,0,0,0.87); + line-height: 35px; + color: rgba(0,0,0,0.87); + margin-right: 5px; + background: transparent; + font-weight: 500 !important; + padding: 0.4em 1.5em; + font-size: 17px; + border-radius: 0; + white-space: nowrap; +} + +/*[data-md-color-primary=white] .md-button:hover{ + background: transparent; + border: 1.5px solid rgba(0,0,0,1); + color: black !important; +}*/ + +.md-post__title { + padding: 5px 15px 5px; +} + +.md-post .md-nav__title { + font-weight: 500; +} + +[data-md-color-primary=white] .md-button--primary { + color: white; + background: rgba(0,0,0,0.87); +} + +[data-md-color-primary=white] .md-button--github:before { + position: relative; + top: 5px; + content: url('data:image/svg+xml,'); + padding-right: 10px; +} + +/*[data-md-color-primary=white] .md-button:hover { + background: inherit; + color: inherit; + border-color: inherit; +}*/ + +/* +[data-md-color-primary=white] .md-button--primary:hover { + background: rgba(0,0,0,1); + color: white !important; +} +*/ + +[dir=ltr] .md-typeset blockquote { + /*border: 1px solid black;*/ + border: none; + color: var(--md-default-fg-color); + padding: 8px 25px; + border-radius: 0; + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); +} + 
+a.md-go-to-action.secondary { + color: rgba(0,0,0,0.87); + background: white; +} + +.md-post__content :is(h2, h3, h4, h5, h6) a { + color: rgba(0,0,0,0.87); +} + +div[editor-title] code .editor-title { + position: absolute; + color: #a2a2a2; + top: 10px; + left: 0; + width: 100%; + text-align: center; + font-family: var(--md-code-font-family) !important; +} + +.md-status:after, .md-status:hover:after{ + background: rgba(0,0,0,0.7); +} + +.md-ellipsis, .md-ellipsis .md-typeset { + white-space: normal; + /*font-size: 98%;*/ +} + +.md-header__topic .md-ellipsis { + white-space: nowrap; +} + +/*.md-blog-sidebar .md-nav__link--active { + display: none; +}*/ + +[data-ty].no-newline, .no-newline { + display: inline-block; +} + +[data-ty].newline, .newline { + display: block; +} + +img.border { + border: 0.25px rgba(0,0,0,0.2) solid; + border-radius: 7px; +} + +.md-typeset .reference-item:hover > .headerlink { + display: none; +} + +.md-typeset .reference-item { + display: list-item; + margin-left: .625em; + font-weight: inherit; + color: inherit; + font-size: inherit; + letter-spacing: inherit; +} + +.md-typeset .reference-item a code { + color: var(--md-typeset-a-color); + font-size: .85em; +} + +.md-typeset .reference-item code { + background-color: rgba(163, 68, 215, 0.05); + border-radius: 2px; + font-weight: 600; + color: var(--md-primary-fg-color); + text-align: center; + padding: 4px; + height: 16px; + margin: 0 4px; +} + +/* External link indicator */ +a[href^="http"]:not(:where( + /* skip if marked with external-skip */ + .external-skip, + /* exclude http:// dstack links */ + [href^="http://dstack.ai"], + /* exclude https://dstack.ai links */ + [href^="https://dstack.ai"], + /* exclude md-content__button links */ + .md-content__button, +)):after { + content: ''; + display: inline-block; + width: 18.5px; + height: 18.5px; + margin-left: 0.15em; + vertical-align: -0.2em; + 
background-color: currentColor; + mask-image: url('data:image/svg+xml,'); + mask-size: 100%; + mask-repeat: no-repeat; + mask-position: center; + -webkit-mask-image: url('data:image/svg+xml,'); + -webkit-mask-size: 100%; + -webkit-mask-repeat: no-repeat; + -webkit-mask-position: center; + text-decoration: none; +} + +/* Exclude links inside .md-social */ +.md-social a[href^="http"]:after { + display: none; +} diff --git a/mkdocs/assets/stylesheets/landing.css b/mkdocs/assets/stylesheets/landing.css new file mode 100644 index 0000000000..da311e0ba9 --- /dev/null +++ b/mkdocs/assets/stylesheets/landing.css @@ -0,0 +1,1289 @@ +.tx-landing { + margin: 0 .8rem; + color: var(--md-primary-bg-color) +} + +.tx-landing__hero_text { + display: flex; + flex-direction: column; + align-items: center; + text-align: center; +} + +.tx-landing__hero_text h1 { + margin-bottom: .6rem; + font-weight: 700; + font-size: 2.5em; + /*letter-spacing: -3px;*/ +} + +.tx-landing__hero_text h1 strong { + font-weight: 800; +} + +/*.tx-landing__hero_text h1*/ .gradient { + background: linear-gradient(90deg, #4631C8 -1.29%, #CD4AE2 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + padding-right: 4px; + margin-right: -4px; +} + +.tx-landing span.highlight { + display: inline-block; + border-radius: 0; + padding: 1px 5px; + border: 0.5px solid rgba(0, 0, 0, 0.33); +} + +.tx-landing .nowrap-token { + display: inline-block; + white-space: nowrap; +} + +/*.tx-landing__hero_text p { + font-size: 0.95rem; +}*/ + +.tx-landing__hero_buttons { + display: flex; + align-items: flex-start; + justify-content: center; + gap: 24px; +} + +.tx-container .tx-landing__hero_buttons .md-button { + margin-bottom: 0.2rem; + margin-right: 0; +} + +.tx-landing__hero_button_container { + text-align: center; +} + +.tx-landing__hero_button_placeholder { + color: #202128; + font-size: 0.67rem; + line-height: 24px; + text-align: center; + padding: 18px 5px 13px; +} + +@media screen and (max-width: 
76.1875em) { + .tx-landing h1 { + font-size: 1.9rem; + margin: 24px -24px; + } + + .tx-landing__hero_text { + max-width: 30rem; + margin-left: auto; + margin-right: auto; + } + + .tx-landing__hero_image { + margin-top: 1.5rem; + max-width: 26rem; + } + + .tx-landing__hero_code { + margin-top: 2.5rem; + max-width: 750px; + margin-left: auto; + margin-right: auto; + } + + .tx-landing__hero_buttons { + flex-direction: column; + align-items: center; + gap: 0; + } +} + +@media screen and (min-width: 76.1875em) { + .tx-container { + padding-bottom: 3vw; + } + + .tx-landing__hero { + margin-bottom: 2.5rem; + font-size: 1em; + line-height: 1.5; + } + + .tx-landing__hero_text { + margin-top: 5.1rem; + } + + .tx-landing__hero_text h1 { + font-size: 2.9rem; + max-width: 36rem; + line-height: 1.1; + } + + .tx-landing__hero_text p { + max-width: 30rem; + } + + .tx-landing__hero_image { + order: 1; + width: 30rem; + margin-top: 0.5rem; + margin-left: 3rem; + } + + .tx-landing__hero_code { + /*width: 100vw;*/ + position: relative; + left: 50%; + transform: translateX(-50%); + margin-top: 2.5rem; + padding-top: 4.5rem; + padding-bottom: 4.5rem; + /*border-top-left-radius: 2.5rem;*/ + /*border-top-right-radius: 2.5rem;*/ + border-radius: 2.5rem; + background-image: url("/https/github.com/assets/images/hero_code_background.png"); + background-size: cover; + background-position: center; + background-repeat: no-repeat; + } + + .tx-landing__hero_code > [data-termynal] { + max-width: 750px; + margin-left: auto; + margin-right: auto; + } +} + +.md-header__buttons .md-button-secondary.discord:before, +.md-typeset .md-button-secondary.discord:before { + position: relative; + top: 6px; + content: ''; + width: 24px; + height: 24px; + display: inline-block; + -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; + mask: url('data:image/svg+xml,') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + margin-right: 8px; + background: -webkit-linear-gradient(45deg, rgba(0, 
0, 0, 0.87), rgba(0, 0, 0, 0.87)); +} + +.md-header__buttons .md-button--primary.discord:before, +.md-typeset .md-button--primary.discord:before { + position: relative; + top: 6px; + content: ''; + width: 24px; + height: 24px; + display: inline-block; + -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; + mask: url('data:image/svg+xml,') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + margin-right: 8px; + background: white; +} + + +.md-typeset .md-button--primary.shell span { + font-family: var(--md-code-font-family) !important; + font-size: 16px; +} + +.md-header__buttons .md-button--primary.shell:before, +.md-typeset .md-button--primary.shell:before { + color: #e37cff; + position: relative; + top: 1px; + content: '$'; + display: inline-block; + margin-right: 10px; + font-size: 16px; + font-family: var(--md-code-font-family) !important; +} + +.md-header__buttons .md-button-secondary.github:before, +.md-typeset .md-button-secondary.github:before { + position: relative; + top: 5px; + content: ''; + width: 22px; + height: 22px; + display: inline-block; + -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; + mask: url('data:image/svg+xml,') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + margin-right: 8px; + background: black; +} + +.md-header__buttons .md-button--primary.github:before, +.md-typeset .md-button--primary.github:before { + position: relative; + top: 5px; + content: ''; + width: 22px; + height: 22px; + display: inline-block; + color: white; + -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; + mask: url('data:image/svg+xml,') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + margin-right: 8px; + background: white; +} + +.md-header__buttons { + margin-left: auto; +} + +.md-header__buttons .md-button, +.md-typeset .md-button { + margin-top: 0.6rem; + margin-bottom: 1.5rem; + font-size: 20px; + font-weight: 400 !important; + text-align: center; + border-radius: 0; + 
border-color: transparent; + margin-right: 5px; +} + +.md-header__buttons .md-button { + font-size: 17.5px; + /*letter-spacing: -0.5px;*/ +} + +.md-typeset .md-button { + min-width: 225px; + padding: 0.5em 2em; +} + +.md-typeset .md-button.small { + min-width: 150px; + padding: 0.3rem 0.5rem; + font-size: 18px; +} + +.md-typeset .md-button { + vertical-align: middle; +} + +.md-typeset.md-banner__inner .icon { + display: inline-block; + width: 12.5px; + height: 12.5px; + transition: opacity .2s ease,transform .2s ease; +} + +.md-typeset.md-banner__inner a:hover .icon { + transform: translateX(3px) +} + + +.md-typeset .md-button .icon, +.md-typeset .md-post__action .icon { + display: inline-block; + position: relative; + width: 15px; + height: 15px; + margin-left: 7px; + transition: opacity .2s ease,transform .2s ease; +} + +.md-typeset .md-button-secondary .icon { + color: black; +} + +.md-typeset .md-button--primary .icon { + color: white; +} + +.md-typeset .md-button-secondary:hover .icon, .md-typeset .md-button--primary:hover .icon { + /* color: #a91ffe; */ + transform: translateX(3px) +} + + +[data-md-color-primary=white] .md-header__buttons .md-button--primary, [data-md-color-primary=white] .md-header__buttons .md-button--primary:hover, +[data-md-color-primary=white] .md-typeset .md-button--primary, [data-md-color-primary=white] .md-typeset .md-button--primary:hover { + background: rgba(0, 0, 0, 0.87); + border-radius: 0; + font-weight: 400 !important; + /*margin-right: 10px;*/ +} + +.md-header__buttons .md-button--primary:hover:not(.sky), .md-typeset .md-button--primary:hover:not(.sky) { + background: black !important; + border-color: black !important; +} + +.md-header__buttons .md-button--primary, +.md-header__buttons .md-button-secondary { + font-weight: 400 !important; + white-space: nowrap; + padding: 0.44rem 0.8rem; +} + +.md-header__buttons .md-button-secondary:hover, .md-typeset .md-button-secondary:hover { + color: black !important; + border-color: 
black !important; +} + +.providers a.feature-cell/*:hover*/ h3:after { + content: url('data:image/svg+xml,'); + margin-left: 5px; + position: relative; + top: 3px; + margin-right: -7px; +} + +/* .md-button-secondary.external:after { + content: url('data:image/svg+xml,'); + line-height: 14px; + margin-left: 5px; + position: relative; + top: 3px; + margin-right: -7px; +} + +.md-button--primary.external:after, .md-button--primary.sky.external:after { + content: url('data:image/svg+xml,'); + line-height: 14px; + margin-left: 5px; + position: relative; + top: 2.5px; + margin-right: -7px; +} */ + +.md-header__buttons .md-button-secondary, +.md-typeset .md-button-secondary, +.md-header__buttons .md-button-secondary:hover, +.md-typeset .md-button-secondary:hover, +.md-header__buttons .md-button-secondary:focus, +.md-typeset .md-button-secondary:focus { + background: transparent; + color: rgba(0, 0, 0, 0.87); + border: 0.5px solid rgba(0, 0, 0, 0.87); + border-radius: 0; +} + +.md-header__buttons .md-button-secondary:hover, +.md-typeset .md-button-secondary:hover { + color: black; + border: 0.5px solid black; +} + +.md-header__buttons .md-button-secondary.borderless, +.md-header__buttons .md-button-secondary.borderless:hover { + border: none; + padding: 0 20px 0 0; + margin: 0; + font-weight: 500 !important; +} + +.md-header__buttons { + white-space: nowrap; + padding-top: 2px; +} + +.tx-landing__highlights { + margin-bottom: 5vw; + font-size: 17px; + line-height: 1.5; +} + +.tx-landing__highlights_text h2 { + font-size: 2em; + max-width: 600px; + font-weight: 700; + margin-top: 0; + margin-bottom: 1.8em; + /*letter-spacing: -1.5px;*/ + line-height: 1.3; +} + +.tx-landing__bottom_cta { + display: flex; + flex-direction: row; +} + +.tx-landing__bottom_cta_card { + padding: 10px 50px 20px; + border: 0.5px dotted black; + + &.sky { + border: 0; + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.05), rgba(0, 42, 255, 0.05), rgba(225, 101, 254, 0.08)); + } +} + 
+.tx-landing__bottom_cta_card h2 { + margin-top: 0.75em; + margin-bottom: 0; +} + +@media screen and (max-width: 76.1875em) { + .tx-landing__bottom_cta { + flex-direction: column; + gap: 35px; + } +} + +@media screen and (min-width: 76.1875em) { + .tx-landing__bottom_cta_card { + width: 50%; + + &.enterprise { + border-radius: 0; + border-top-right-radius: 0; + border-bottom-right-radius: 0 + } + + &.sky { + border-top-left-radius: 0; + border-bottom-left-radius: 0 + } + } + + .tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_panel { + display: flex; + column-gap: 30px; + flex-direction: column; + } + + .tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_button_panel { + flex: inherit; + } +} + +@media screen and (max-width: 76.1875em) { + .tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_button_panel { + flex: inherit; + } +} + +.tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_text { + max-width: 100%; + font-size: 0.95em; +} + +.tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_button_panel a { + margin: 0 0 15px; +} + +.tx-landing__bottom_cta_card .tx-landing__bottom_cta_card_button_subheader { + font-size: 0.9em; + margin-bottom: 10px; +} + +.tx-landing__highlights_cta { + margin-top: 3vw; +} + +.tx-landing__highlights_cta a { + display: inline-block; + font-size: 19px; + margin-top: 30px; + border: 1px solid; + padding: 10px 30px; +} + +.tx-landing__highlights_text h2 .gradient { + background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; +} + +.providers.tx-landing__highlights_grid { + grid-gap: 20px !important; +} + +.providers.tx-landing__highlights_grid .feature-cell h3 { + align-content: center; + font-size: 1em; + font-weight: 600; + padding-bottom: 0.05em; + line-height: 25px; +} + +.providers.tx-landing__highlights_grid .feature-cell { + row-gap: 22px; + padding: 25px 30px; + aspect-ratio: 1.05; + + @media screen and (min-width: 76.1875em) 
{ + &:nth-child(1) { + border-top-left-radius: 3px; + } + } +} + +.tx-landing__highlights_grid .feature-cell { + padding: 30px 40px; + border-radius: 0; + border-color: rgba(0, 0, 0, 0.75); + border-width: 0.5px; + border-style: dotted; + display: flex; + flex-direction: column; +} + +@media screen and (min-width: 76.1875em) { + .providers.tx-landing__highlights_grid .feature-cell { + border-radius: 0; + border-left: none; + border-bottom: none; + } + + .nvidia.providers.tx-landing__highlights_grid .feature-cell { + &:nth-child(1), &:nth-child(6), &:nth-child(11) { + border-left: 0.5px dotted rgba(0, 0, 0, 0.75); + } + + &:nth-child(n+7) { + border-bottom: 0.5px dotted rgba(0, 0, 0, 0.75); + } + + &:nth-child(5) { + border-top-right-radius: 3px; + } + + &:nth-child(5), &:nth-child(11) { + border-bottom-right-radius: 3px; + } + + &:nth-child(11) { + border-bottom-left-radius: 3px; + } + + &:nth-child(10) { + border-bottom-right-radius: 3px; + } + } +} + +:is(.amd).providers.tx-landing__highlights_grid .feature-cell { + &:nth-child(1) { + border-left: 0.5px dotted rgba(0, 0, 0, 0.75); + border-bottom-left-radius: 3px; + } + + border-bottom: 0.5px dotted rgba(0, 0, 0, 0.75); + + &:nth-child(3) { + border-top-right-radius: 3px; + border-bottom-right-radius: 3px; + } +} + +.providers.tx-landing__highlights_grid.other .feature-cell { + column-gap: 15px; + flex-direction: row; + padding: 15px 29px 15px; +} + +.providers.tx-landing__highlights_grid.other .feature-cell h3 { + font-size: 1em; +} + +@media screen and (min-width: 76.1875em) { + .tx-landing__highlights_grid { + grid-gap: 20px !important; + border: none; + + grid-template-columns: repeat(4, 1fr) !important; + } + + .providers.tx-landing__highlights_grid { + grid-gap: 0px !important; + border: none; + + grid-template-columns: repeat(5, 1fr) !important; + } + + .tx-landing__highlights_grid .feature-cell { + } +} + +.tx-landing__highlights_grid .feature-cell { + background: -webkit-linear-gradient(45deg, rgba(0, 
42, 255, 0.005), rgba(0, 42, 255, 0.005), rgba(225, 101, 254, 0.01)); +} + +/*.tx-landing__highlights_grid .feature-cell:hover { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.03), rgba(0, 42, 255, 0.03), rgba(225, 101, 254, 0.05)); +}*/ + +.tx-landing__highlights_grid .feature-cell strong { + font-weight: 500; +} + +.tx-landing__highlights_grid .feature-cell .feature-tags { + gap: 2px; + margin: 0 -5px; + margin-top: auto; + display: none; +} + +.tx-landing__highlights_grid .feature-cell .feature-tags .feature-tag { + display: flex; + align-items: center; + gap: 8px; + padding: 8px 20px; + font-size: 0.85em; + font-weight: 400; + line-height: 1.44; + color: black; + margin-top: 20px; + margin-right: 5px; + border-radius: 30px; + border-width: 0.5px; + border-style: solid; + white-space: nowrap; +} + +.tx-landing__highlights_grid > a, .tx-landing__highlights_grid > a:hover { + text-decoration: none; + color: inherit; +} + +.tx-landing__integrations_text { + color: #202128; + font-size: 0.65rem; + line-height: 24px; + text-align: center; + padding: 21px 5px 3px; +} + +.tx-landing__integrations_logos { + display: flex; + align-items: center; + justify-content: center; + gap: 14px; + padding: 8px 5px 3px; +} + +.tx-landing__integrations .logo-xlarge { + width: 41px; + margin-top: 2px; +} + +.tx-landing__integrations .logo-large { + width: 30px; +} + +.tx-landing__integrations .logo-medium { + width: 26px; +} + +.tx-landing__highlights_grid { + grid-gap: 2rem; + display: grid; + grid-template-columns: repeat(auto-fill, minmax(15rem, 1fr)); + margin-top: 40px; + margin-bottom: 30px; +} + +.tx-landing__highlights_grid .feature-icon svg { + padding: 12px; + background: rgba(125, 4, 233, 0.02); + color: #002aff; + width: 50px; + height: 50px; + border-radius: 25px; + text-align: center; + display: inline-flex; + vertical-align: text-top; + fill: currentColor; + margin-bottom: 15px; +} + +.tx-landing__highlights_grid h3 { + font-size: 1.15em; + font-weight: 
700; + border-bottom: none; + padding-bottom: 0.3em; + margin-top: 0; + line-height: 32px; +} + +/* .tx-landing__highlights_grid h3.external:after { + content: url('data:image/svg+xml,'); + margin-left: 2px; + position: relative; + top: 3px; + margin-right: -7px; +} */ + +.tx-landing__highlights_grid p { + font-size: 16px; + margin-top: 5px; + margin-bottom: 5px; + color: rgba(0, 0, 0, 0.87); +} + +.tx-landing__features { + margin-bottom: 5vw; +} + +.tx-landing__features_text h2 { + font-size: 1.7em; + max-width: 500px; + color: rgba(0, 0, 0, 0.87); + margin-bottom: 1.5em; +} + +.tx-landing__features_grid { + grid-gap: 1.2rem; + display: grid; + grid-template-columns: repeat(auto-fill, minmax(15rem, 1fr)); +} + +.tx-landing__trusted_by { + margin-top: 9vw; + /*margin-bottom: 5vw;*/ + font-size: 18px; + line-height: 1.5; +} + +.tx-landing__major_feature { + font-size: 1em; + margin-top: 4em; +} + +.tx-landing__trusted_by_text { + font-size: 0.95em; + max-width: 500px; +} + +.tx-landing__major_feature h2 { + font-weight: 700; +} + +.tx-landing__major_feature h3 { + padding-bottom: 0; + border-bottom: none; + background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + font-size: 1.5em; +} + +.tx-landing__major_feature img.border { + border: 0.25px rgba(0,0,0,0.2) solid; + border-radius: 7px; +} + +.tx-landing__major_feature h2 { + font-size: 1.7em; + max-width: 500px; + margin-top: 0; + margin-bottom: 1.5em; + background: black; + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + /*letter-spacing: -1.5px;*/ + line-height: 1.1; +} + +.tx-landing__major_feature { + margin-bottom: 7vw; +} + +.tx-landing__major_feature .section { + display: flex; + flex-direction: column; +} + +@media screen and (min-width: 76.1875em) { + .tx-landing__major_feature .section { + flex-direction: row; + } +} + +@media screen and (max-width: 76.1875em) { + .tx-landing__major_feature .section 
.block.large { + order: 2; + } + + .tx-landing__major_feature .section .block:not(.large) { + order: 1; + } +} + +.tx-landing__major_feature .block { + max-width: 800px; + width: 100%; +} + +@media screen and (min-width: 76.1875em) { + .tx-landing__major_feature .block.margin.right { + margin-right: 50px; + } + + .tx-landing__major_feature .block.margin.left { + margin-left: 50px; + } +} + +.tx-landing__major_feature .block.large { + width: 700px; + max-width: 100%; + flex: 0 0 auto; +} + +/*.tx-landing__trusted_by*/ a[data-terminal-control] { + color: #e37cff; +} + +/*.tx-landing__trusted_by*/ [data-ty="input"]:before, [data-ty-prompt]:before { + color: #e37cff; +} + +#get-started-code-snippet[data-termynal], +#get-started-claude-snippet[data-termynal] { + font-size: 14px !important; +} + +/*.tx-landing__trusted_by*/ [data-termynal] { + font-size: 14px; +} + +.tx-landing__trusted_by .termy { + max-width: 750px; + margin-bottom: 2rem; +} + +.tx-footer { + padding-top: 2.5rem; +} + +.md-footer__inner { + padding: 0; +} + +.tx-footer .md-main__inner { + border-width: 0; + /*border-top-width: 0.6px;*/ + border-image: linear-gradient(45deg, #0048ff, #ce00ff) 10; + border-style: solid; +} + +.tx-footer__section { + display: flex; + flex-direction: column; + gap: 0.5rem; + margin: 0 0.6rem; +} + +.tx-footer__section-title { + text-transform: uppercase; + font-size: 0.8rem; + font-weight: 700; + color: black; + /*letter-spacing: -0.5px;*/ + line-height: 24px; + margin-top: 0.6rem; + margin-bottom: 0.1rem; +} + +.tx-footer__logo:hover { + opacity: .7; +} + +.tx-footer__copyright { + margin-top: 0.6rem; + font-size: 18px; + line-height: 26px; + color: black; + font-weight: 500; +} + +.tx-footer__section-link { + font-size: 0.75rem; + line-height: 26px; + color: #151414; + transition: opacity .2s ease; +} + +.tx-footer__section-link:hover { + opacity: .7; +} + +@media screen and (max-width: 76.1875em) { + .tx-footer .md-main__inner { + flex-direction: column; + gap: 1.5rem; 
+ margin-left: .8rem; + margin-right: .8rem; + } + + .tx-footer__section { + margin-bottom: 1.5rem; + } + + .md-header__buttons { + padding: 4px 0; + } +} + +@media screen and (min-width: 76.1875em) { + .tx-footer { + padding-bottom: 3.5rem; + } + + .tx-footer__right-side { + margin-left: auto; + display: flex; + gap: 4.5rem; + } +} + +.tx-landing__plans { + margin-bottom: 2.75rem; +} + +.tx-landing__plans_text { + +} + +.tx-landing__major_feature .supported_clouds_block.block.large { + width: 600px +} + +.code-carousel__slides { + position: relative; + display: grid; +} + +.code-carousel__slide { + grid-area: 1 / 1; + display: none; +} + +.code-carousel__slide.active { + display: block; +} + +.code-carousel__slide > [data-termynal], +.code-carousel__slide[data-termynal] { + margin: 0 !important; +} + +.code-carousel .code-carousel__slide[editor-title] pre { + margin: 0; +} + +.code-carousel .code-carousel__slide[editor-title] pre > code, +.code-carousel .code-carousel__slide[data-termynal] { + font-size: 14px; + padding-top: 70px; +} + +@media screen and (max-width: 44.9375em) { + .tx-landing__major_feature .block.large { + width: 100%; + } + + .code-carousel { + width: 100vw; + max-width: 100vw; + margin-left: calc(50% - 50vw); + margin-right: calc(50% - 50vw); + min-width: 0; + } + + .code-carousel__slides, + .code-carousel__slide, + .code-carousel .code-carousel__slide[editor-title], + .code-carousel .code-carousel__slide[editor-title] .highlight, + .code-carousel .code-carousel__slide[editor-title] pre, + .code-carousel .code-carousel__slide[editor-title] pre > code, + .code-carousel .code-carousel__slide > [data-termynal], + .code-carousel .code-carousel__slide[data-termynal] { + max-width: 100%; + min-width: 0; + box-sizing: border-box; + } + + .code-carousel .code-carousel__slide[editor-title] .highlight { + margin: 0; + } + + .code-carousel .code-carousel__slide[editor-title] pre, + .code-carousel .code-carousel__slide > [data-termynal], + .code-carousel 
.code-carousel__slide[data-termynal] { + overflow-x: auto; + -webkit-overflow-scrolling: touch; + } + + .code-carousel .code-carousel__slide[editor-title] { + margin-left: 0 !important; + margin-right: 0 !important; + } +} + +.code-carousel__dots { + display: flex; + justify-content: center; + gap: 8px; + margin-top: 12px; +} + +.code-carousel__dot { + width: 10px; + height: 10px; + display: block; + flex: 0 0 auto; + border-radius: 50%; + background: transparent; + border: 0.5px solid #000; + box-sizing: border-box; + cursor: pointer; +} + +.code-carousel__dot.active { + border-color: #000; + background: #000; +} + +.code-carousel__dot:hover { + border-color: #000; +} + +.tx-landing [editor-title] pre { + visibility: hidden; +} + +.tx-landing [editor-title] .highlight pre { + visibility: visible; +} + +.supported_clouds_inline { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 12px; + margin-bottom: 1rem; +} + +.supported_clouds_inline a { + display: inline-flex; + align-items: center; +} + +.supported_clouds_inline a img { + filter: brightness(0); +} + +.supported_clouds { + display: grid; + flex-wrap: wrap; + grid-gap: 0; + margin-bottom: 1.3rem; + max-width: 550px; +} + +@media screen and (min-width: 44.984375em) { + .supported_clouds { + grid-template-columns: repeat(6, 1fr) !important; + row-gap: 20px; + } +} + +@media screen and (max-width: 44.984375em) { + .supported_clouds { + grid-template-columns: repeat(3, 1fr) !important; + } +} + +.supported_clouds_item { + display: flex; + /* align-items: center; */ + gap: 10px; + padding: 21px; + /* border-radius: 2px; */ + border: 0.5px solid black; + /* font-size: .85em; */ + color: #2A292D !important; + line-height: 1.44; + /* aspect-ratio: 1; */ + flex-direction: column; + font-weight: 300; + font-size: 85%; + + align-items: center; + justify-content: center; + + + &:hover { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.05), rgba(0, 42, 255, 0.05), rgba(225, 101, 254, 0.08)); + 
} +} + +.supported_clouds_item { + border-right: none; + border-bottom: none; +} + +@media screen and (min-width: 44.984375em) { + .supported_clouds_item { + border-right: none; + border-left: none; + + &:nth-child(1), &:nth-child(7) { + border-top-left-radius: 3px; + border-bottom-left-radius: 3px; + } + + &:nth-child(6), &:nth-child(12) { + border-top-right-radius: 3px; + border-bottom-right-radius: 3px; + } + + &:nth-child(1), &:nth-child(7) { + border-left: 0.5px solid black; + } + + &:nth-child(6), &:nth-child(12) { + border-right: 0.5px solid black; + } + + &:nth-child(n+0) { + border-bottom: 0.5px solid black; + } + } +} + +@media screen and (max-width: 44.984375em) { + .supported_clouds_item { + &:nth-child(3), &:nth-child(6), &:nth-child(9), &:nth-child(12) { + border-right: 0.5px solid black; + } + + &:nth-child(n+10) { + border-bottom: 0.5px solid black; + } + } +} + +.md-header__buttons .md-button--primary.sky, .md-header__buttons .md-button--primary.sky:hover, +.md-typeset .md-button--primary.sky, .md-typeset .md-button--primary.sky:hover { + background: -webkit-linear-gradient(45deg, #002aff, #002aff, #e165fe); + border-radius: 0; + border: 1px solid transparent; +} + +.highlighted { + font-weight: 500; + color: var(--md-typeset-a-color); +} + +.plans_card__link { + margin-top: 1.0rem; + margin-bottom: 1.9rem; + font-size: 1.0em; + line-height: 1.66; +} + +.plans_card__buttons .md-button { + margin: 0; +} + +@media screen and (max-width: 44.984375em) { + .md-header__buttons .md-button:not(.github):before, .md-header__buttons .md-button:not(.github):after { + display: none !important; + } + + .md-header__buttons .md-button-secondary.borderless, .md-header__buttons .md-button-secondary.borderless:hover { + padding-right: 10px; + } +} + +@media screen and (min-width: 76.1875em) { + .tx-landing__plans_text h2 { + font-size: 2.5rem; + } +} + +.tx-landing__quotes_grid { + grid-gap: 1.2rem; + display: grid; + grid-template-columns: repeat(auto-fill, 
minmax(15rem, 2fr)); + margin-bottom: 2.4em; +} + +.heart::after { + position: relative; + content: ''; + width: 45px; + height: 40px; + display: inline-block; + -webkit-mask: url('data:image/svg+xml, ') no-repeat 50% 50%; + mask: url('data:image/svg+xml, ') no-repeat 50% 50%; + -webkit-mask-size: cover; + mask-size: cover; + background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); + top: 4px; + margin: 0 3px; +} + +.tx-landing__quotes_grid .photo { + height: 80px; + float: left; + margin: 0 15px 15px 0; +} + +.tx-landing__quotes_grid .photo img { + width: auto; + height: 100%; + aspect-ratio: 1; + object-fit: cover; + border-radius: 50px; +} + +.tx-landing__quotes_grid h3 { + border-bottom: none; + font-size: 1em; + font-weight: 700; + margin: 0; + padding: 0; + margin-top: 13px; +} + +.tx-landing__quotes_grid h4 { + border-bottom: none; + font-size: 0.95em; + font-weight: 500; + margin: 0; + padding: 0; + color: rgba(0, 0, 0, 0.5); +} + +.tx-landing__quotes_grid p { + clear: both; + font-size: 0.9em; +} + +.tx-landing__quotes_grid .cell { + padding: 23px 23px 13px; + border-radius: 0; + border-color: rgba(0, 0, 0, 0.75); + border-width: 0.5px; + border-style: solid; + + &:nth-child(odd) { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.05), rgba(0, 42, 255, 0.05), rgba(225, 101, 254, 0.08)); + border: 0; + } +} + +.md-header__title { + font-family: 'Geist Pixel Square', var(--md-text-font-family); +} diff --git a/docs/assets/stylesheets/pricing.css b/mkdocs/assets/stylesheets/pricing.css similarity index 99% rename from docs/assets/stylesheets/pricing.css rename to mkdocs/assets/stylesheets/pricing.css index b9ae1afe30..93ba7484f0 100644 --- a/docs/assets/stylesheets/pricing.css +++ b/mkdocs/assets/stylesheets/pricing.css @@ -143,7 +143,7 @@ position: relative; padding-right: 40px; color: #2A292D; - font-size: 0.85rem; + font-size: 1rem; font-weight: 800; line-height: 1.33; cursor: pointer; @@ -232,4 +232,4 @@ .tx-container .tx-pricing-cta 
.md-button { min-width: 250px; -} \ No newline at end of file +} diff --git a/mkdocs/assets/stylesheets/swagger.css b/mkdocs/assets/stylesheets/swagger.css new file mode 100644 index 0000000000..c9aef08747 --- /dev/null +++ b/mkdocs/assets/stylesheets/swagger.css @@ -0,0 +1,1773 @@ +.dstack-swagger-ui { + --dstack-swagger-border-color: rgba(0, 0, 0, 0.87); + --dstack-swagger-code-bg-color: rgb(21, 22, 29); + --dstack-swagger-hairline-border: 0.5px solid black; + --dstack-swagger-anchor-offset: 4rem; + --dstack-swagger-operation-anchor-offset: 3rem; + --dstack-swagger-control-font-size: 0.7rem; + --dstack-swagger-control-height: 1.84rem; + --dstack-swagger-muted-bg-color: rgba(0, 0, 0, 0.005); + --dstack-swagger-primary-border: 1.5px solid var(--dstack-swagger-border-color); + --dstack-swagger-quote-bg: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); + --dstack-swagger-action-radius: 4px; + --dstack-swagger-curl-max-height: min(520px, 70vh); + --dstack-swagger-schema-max-height: min(520px, 70vh); + --dstack-swagger-schema-code-max-height: min(420px, calc(70vh - 100px)); + --dstack-swagger-tab-content-gap: 14px; +} + +.dstack-swagger-ui .swagger-ui, +.dstack-swagger-ui .swagger-ui :where(button, input, optgroup, select, textarea) { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); +} + +.dstack-swagger-ui .swagger-ui *, +.dstack-swagger-ui .swagger-ui *::before, +.dstack-swagger-ui .swagger-ui *::after { + box-shadow: none !important; + text-shadow: none !important; +} + +.dstack-swagger-ui .swagger-ui .information-container, +.dstack-swagger-ui .swagger-ui > .info { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .wrapper { + padding: 0 !important; +} + +.md-typeset .dstack-swagger-operation-anchor:not(.dstack-swagger-operation-title) { + font-size: 0; + height: 1px; + line-height: 0; + margin: 0 !important; + overflow: hidden; + padding: 0 !important; + 
pointer-events: none; + position: absolute; + scroll-margin-top: 0; + top: calc(-1 * var(--dstack-swagger-anchor-offset)); + visibility: hidden; + width: 1px; +} + +.md-typeset + .dstack-swagger-operation-anchor:not(.dstack-swagger-operation-title) + .headerlink { + display: none; +} + +.md-typeset .dstack-swagger-ui .dstack-swagger-operation-title { + scroll-margin-top: var(--dstack-swagger-operation-anchor-offset); +} + +.dstack-swagger-ui .swagger-ui .scheme-container { + background: transparent; + box-shadow: none; + margin: 0; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers-title, +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers label { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: 0.7rem; + font-weight: 400; + margin: 0 15px 0 0; +} + +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers-title { + display: none; +} + +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers label { + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers select, +.dstack-swagger-ui .swagger-ui .content-type-wrapper select.content-type { + background-color: transparent; + border: 1px solid rgba(0, 0, 0, 0.87); + border-radius: 0; + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: var(--dstack-swagger-control-font-size); + font-weight: 400; + height: var(--dstack-swagger-control-height); + margin: 0; + padding-bottom: 0; + padding-top: 0; + text-transform: none; +} + +.dstack-swagger-ui .swagger-ui .scheme-container .schemes-server-container .servers select:hover, +.dstack-swagger-ui .swagger-ui .content-type-wrapper select.content-type:hover { + border-color: black; +} + +.dstack-swagger-ui .swagger-ui .opblock, +.dstack-swagger-ui .swagger-ui .opblock.opblock-delete, +.dstack-swagger-ui 
.swagger-ui .opblock.opblock-deprecated, +.dstack-swagger-ui .swagger-ui .opblock.opblock-get, +.dstack-swagger-ui .swagger-ui .opblock.opblock-head, +.dstack-swagger-ui .swagger-ui .opblock.opblock-options, +.dstack-swagger-ui .swagger-ui .opblock.opblock-patch, +.dstack-swagger-ui .swagger-ui .opblock.opblock-post, +.dstack-swagger-ui .swagger-ui .opblock.opblock-put, +.dstack-swagger-ui .swagger-ui .opblock.opblock-query { + background: transparent; + border: 0; + border-radius: 0; + margin: 0 0 36px; + position: relative; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-summary { + background: transparent; + box-sizing: border-box; + cursor: default; + padding: 10px 0; +} + +.dstack-swagger-ui .swagger-ui .opblock.dstack-swagger-has-try-out > .opblock-summary { + padding-right: calc(var(--dstack-swagger-try-out-width, 94px) + 8px); +} + +.dstack-swagger-ui .swagger-ui .opblock.is-open > .opblock-summary { + border-bottom: 0; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-summary:hover { + background: transparent; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-summary .opblock-summary-control { + align-items: center; + background: transparent; + cursor: default; + display: flex; + gap: 8px; + min-width: 0; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-summary .opblock-control-arrow { + display: none; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-summary .authorization__btn { + display: none; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary .view-line-link.copy-to-clipboard { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path-description-wrapper { + align-items: center; + background: transparent; + border: 1px solid rgba(0, 0, 0, 0.87); + border-radius: 0; + color: var(--md-default-fg-color); + cursor: default !important; + display: block; + flex: 1 1 auto; + font-family: var(--md-text-font-family); + font-size: 
var(--dstack-swagger-control-font-size); + font-weight: 400; + gap: 0; + height: var(--dstack-swagger-control-height); + min-width: 0; + overflow: hidden; + padding: 0; + position: relative; + user-select: text; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-description { + display: none; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-summary-actions { + align-items: center; + display: flex; + flex: 0 0 auto; + gap: 8px; + margin-left: 8px; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-summary-actions .content-type-wrapper { + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-summary-actions .try-out__btn { + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-section-header.dstack-swagger-try-out-source-header { + display: block !important; + overflow: visible; +} + +.dstack-swagger-ui + .swagger-ui + .opblock + .opblock-section-header.dstack-swagger-try-out-source-header.dstack-swagger-empty-parameters { + height: 0; + min-height: 0; + padding: 0; +} + +.dstack-swagger-ui + .swagger-ui + .opblock + .opblock-section-header.dstack-swagger-try-out-source-header.dstack-swagger-empty-parameters + h4 { + display: none !important; +} + +.dstack-swagger-ui + .swagger-ui + .opblock + .opblock-section-header.dstack-swagger-try-out-source-header.dstack-swagger-empty-parameters + .tab-header { + display: none !important; +} + +.dstack-swagger-ui + .swagger-ui + .opblock + .opblock-section-header.dstack-swagger-try-out-source-header.dstack-swagger-empty-parameters + + .parameters-container.dstack-swagger-empty-parameters + + .opblock-section-request-body { + margin-top: 16px; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-section-header .try-out__btn.dstack-swagger-summary-try-out { + margin: 0; + position: absolute; + right: 0; + top: var(--dstack-swagger-try-out-top, 10px); + z-index: 5; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-header .dstack-swagger-original-control { + height: 1px !important; + 
margin: 0 !important; + opacity: 0 !important; + overflow: hidden !important; + padding: 0 !important; + pointer-events: none !important; + position: absolute !important; + width: 1px !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-delete .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-deprecated .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-get .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-head .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-options .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-patch .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-post .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-put .opblock-summary-method, +.dstack-swagger-ui .swagger-ui .opblock.opblock-query .opblock-summary-method { + background: transparent; + border: 1px solid rgba(0, 0, 0, 0.87); + border-radius: 0; + color: var(--md-default-fg-color); + display: inline-flex; + flex: 0 0 auto; + align-items: center; + justify-content: center; + font-family: var(--md-text-font-family); + font-size: var(--dstack-swagger-control-font-size); + font-weight: 400; + height: var(--dstack-swagger-control-height); + line-height: 1.2; + min-width: 72px; + padding: 0 0.65rem; + text-transform: none; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-operation-id, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated { + color: var(--md-default-fg-color); + cursor: default !important; + font-family: var(--md-text-font-family); + font-weight: 400; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated { + box-sizing: border-box; + display: block; + 
font-size: var(--dstack-swagger-control-font-size); + line-height: calc(var(--dstack-swagger-control-height) - 2px); + max-width: none; + min-width: 0; + overflow-x: auto; + overflow-y: hidden; + padding: 0 calc(var(--dstack-swagger-control-height) + 0.4rem) 0 0.65rem; + scrollbar-color: transparent transparent; + scrollbar-width: none; + text-overflow: clip; + white-space: nowrap; + word-break: normal; + overflow-wrap: normal; + -webkit-overflow-scrolling: touch; + user-select: text; + -webkit-user-select: text; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path *, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated * { + white-space: nowrap; + word-break: normal; + overflow-wrap: normal; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path::-webkit-scrollbar, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated::-webkit-scrollbar { + background: transparent; + height: 0; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path::-webkit-scrollbar-thumb, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated::-webkit-scrollbar-thumb, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path::-webkit-scrollbar-track, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-path__deprecated::-webkit-scrollbar-track { + background: transparent; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-operation-id { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url *, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url:hover, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url:hover * { + cursor: default !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url .opblock-summary-path, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url 
.opblock-summary-path *, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url .opblock-summary-path__deprecated, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-summary-url .opblock-summary-path__deprecated * { + cursor: text !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy { + align-items: center; + align-self: stretch; + background: transparent; + border: 0; + color: black; + cursor: pointer !important; + display: inline-flex; + height: 100%; + justify-content: center; + margin: 0; + padding: 0; + position: absolute; + right: 0; + top: 0; + width: var(--dstack-swagger-control-height); +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy *, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy:hover, +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy:hover * { + cursor: pointer !important; +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy::before { + background: var(--dstack-swagger-border-color); + color: var(--md-default-bg-color); + content: "Copied to clipboard"; + display: block; + font-family: var(--md-text-font-family); + font-size: 0.6rem; + font-weight: 400; + line-height: 1.2; + opacity: 0; + padding: 0.2rem 0.35rem; + pointer-events: none; + position: absolute; + right: 0; + top: -1.6rem; + transform: translateY(0.2rem); + transition: + opacity 0.12s ease, + transform 0.12s ease; + white-space: nowrap; + z-index: 4; +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy.dstack-swagger-url-copy-copied::before { + opacity: 1; + transform: translateY(0); +} + +.dstack-swagger-ui .swagger-ui .opblock .dstack-swagger-url-copy::after { + background-color: currentColor; + content: ""; + display: block; + height: 1.125em; + mask-image: var(--md-code-copy-icon, var(--md-clipboard-icon)); + mask-position: center; + mask-repeat: no-repeat; + mask-size: contain; + width: 1.125em; + -webkit-mask-image: var(--md-code-copy-icon, 
var(--md-clipboard-icon)); + -webkit-mask-position: center; + -webkit-mask-repeat: no-repeat; + -webkit-mask-size: contain; +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-description, +.dstack-swagger-ui .swagger-ui .opblock .opblock-section-header h4, +.dstack-swagger-ui .swagger-ui .opblock .opblock-section-header > label, +.dstack-swagger-ui .swagger-ui .parameters-col_description, +.dstack-swagger-ui .swagger-ui .parameters-col_name, +.dstack-swagger-ui .swagger-ui .response-col_description, +.dstack-swagger-ui .swagger-ui .response-col_status, +.dstack-swagger-ui .swagger-ui .responses-inner h4:not(.dstack-swagger-response-title), +.dstack-swagger-ui .swagger-ui .responses-inner h5, +.dstack-swagger-ui .swagger-ui table thead tr td, +.dstack-swagger-ui .swagger-ui table thead tr th { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); +} + +.dstack-swagger-ui .swagger-ui .opblock .opblock-section-header { + background: transparent; + border-bottom: 0; + border-top: 0; + min-height: 0; + padding: 0; +} + +.dstack-swagger-ui + .swagger-ui + .opblock + .opblock-section-header.dstack-swagger-empty-parameters:not(.dstack-swagger-try-out-source-header), +.dstack-swagger-ui .swagger-ui .opblock-section-request-body > .opblock-section-header.dstack-swagger-request-body-header-hidden, +.dstack-swagger-ui .swagger-ui .dstack-swagger-native-response-caption, +.dstack-swagger-ui .swagger-ui .responses-wrapper > .opblock-section-header, +.dstack-swagger-ui .swagger-ui .responses-inner > h4:not(.dstack-swagger-response-title), +.dstack-swagger-ui .swagger-ui .responses-inner > h5, +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses > .opblock-section-header, +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-inner > h4:not(.dstack-swagger-response-title), +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-inner > h5, +.dstack-swagger-ui 
.swagger-ui .responses-wrapper.dstack-swagger-responses .responses-table > thead, +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .responses-table + > tbody + > tr.response + > .response-col_status, +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .responses-table + > tbody + > tr.response + > .response-col_links, +.dstack-swagger-ui .swagger-ui .parameters-container.dstack-swagger-empty-parameters, +.dstack-swagger-ui .swagger-ui .execute-wrapper:empty { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .curl-command, +.dstack-swagger-ui .swagger-ui .request-url { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-table, +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-table > tbody { + display: block; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses { + margin-bottom: 12px; +} + +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-table { + margin: 0; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .responses-table + > tbody + > tr.response { + background: transparent; + border: 0; + display: block; + margin: 0 0 1em; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .responses-table + > tbody + > tr.response + > td { + border: 0; + box-sizing: border-box; + display: block; + max-width: none; + padding: 0; + width: 100%; +} + +.md-typeset + .dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-container:not(blockquote) { + margin: 0; +} + +.md-typeset + .dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-section + > h4.dstack-swagger-response-title { + color: rgb(0, 0, 0); + font-family: var(--md-text-font-family); + 
font-size: 19.5px; + font-weight: 800; + line-height: 26px; + margin: 1em 0; + padding: 0; + text-transform: none; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-body + > .response-controls { + padding-top: 0; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-source { + display: none !important; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .model-example + > .tab + + :is(.highlight-code, .model-box, [role="tabpanel"]) { + margin-top: 20px; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-example { + margin-top: 1em; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-example + > .tab { + margin-top: 0; +} + +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-example-panel, +.dstack-swagger-ui + .swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-schema-panel { + margin-top: 20px; +} + +[dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-container + :is(pre, .admonition, details, .termy) { + margin-left: 0 !important; +} + +[dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > summary.dstack-swagger-response-title { + display: block; + list-style: none; +} + +[dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > .dstack-swagger-response-body { + padding-left: 32px; +} + +[dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > summary.dstack-swagger-response-title::-webkit-details-marker { + 
display: none; +} + +[dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > summary.dstack-swagger-response-title::marker { + content: ""; +} + +@media screen and (max-width: 44.9375em) { + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition { + border-width: 0; + border-radius: 0 !important; + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > .dstack-swagger-response-body { + padding-left: 0; + padding-right: 0; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + :is(.admonition, details, blockquote) { + margin-left: 0 !important; + margin-right: 0 !important; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + :is( + .dstack-swagger-response-example, + .dstack-swagger-request-example, + .model-example, + .highlight, + .highlight-code, + .termy, + .dstack-swagger-json-editor, + .dstack-swagger-request-editor, + .model-box, + .md-typeset__scrollwrap, + .md-typeset__table + ) { + box-sizing: border-box; + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + max-width: calc(100% + 1.6rem); + width: calc(100% + 1.6rem); + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > .dstack-swagger-response-body + > .dstack-swagger-response-example { + margin-left: calc(50% - 50vw) !important; + margin-right: calc(50% - 50vw) !important; + max-width: 100vw; + width: 100vw; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + 
.dstack-swagger-response-admonition + > .dstack-swagger-response-body + > .dstack-swagger-response-example + > :is(.tab, [role="tabpanel"]) { + margin-left: 0 !important; + margin-right: 0 !important; + max-width: 100%; + width: 100%; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + > .dstack-swagger-response-body + > .dstack-swagger-response-example + > [role="tabpanel"] + > :is(.highlight, .highlight-code, .dstack-swagger-json-editor) { + margin-left: 0 !important; + margin-right: 0 !important; + max-width: 100%; + width: 100%; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .responses-wrapper.dstack-swagger-responses + .dstack-swagger-response-admonition + :is(pre, .highlight, .highlight-code, .termy, .md-typeset__table) { + box-sizing: border-box; + max-width: 100%; + min-width: 0; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .swagger-ui + :is(.dstack-swagger-response-example, .dstack-swagger-request-example, .model-example) { + box-sizing: border-box; + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + max-width: calc(100% + 1.6rem); + width: calc(100% + 1.6rem); + } + + [dir=ltr] .md-typeset .dstack-swagger-ui .swagger-ui :is(.tab, .tab-header) { + box-sizing: border-box; + display: flex; + flex-wrap: nowrap; + max-width: 100%; + overflow-x: auto; + overflow-y: hidden; + width: 100%; + -webkit-overflow-scrolling: touch; + } + + [dir=ltr] .md-typeset .dstack-swagger-ui .swagger-ui :is(.tab li, .tab-header .tab-item) { + flex: 0 0 auto; + white-space: nowrap; + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .swagger-ui + :is(.dstack-swagger-json-editor, .dstack-swagger-request-editor) { + border-radius: 0 !important; + box-sizing: border-box; + margin-left: -0.8rem !important; + margin-right: -0.8rem !important; + max-width: calc(100% + 1.6rem); + width: calc(100% + 
1.6rem); + } + + [dir=ltr] + .md-typeset + .dstack-swagger-ui + .swagger-ui + :is(.dstack-swagger-json-editor, .dstack-swagger-request-editor) + :is(pre, code, textarea) { + box-sizing: border-box; + max-width: 100%; + min-width: 0; + } +} + +.dstack-swagger-ui .swagger-ui .tab-header { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.025), rgb(0 114 255 / 0.25%), rgba(0, 42, 255, 0.0125)); + border: 0.5px solid rgba(0, 0, 0, 0.5); + border-radius: 0; + display: inline-flex; + height: 100%; + list-style: none !important; + margin: 16px 0 -3px; + overflow: visible; + padding: 0; + position: relative; + z-index: 1; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-header + .parameters-container { + margin-top: 28px; +} + +.dstack-swagger-ui .swagger-ui .parameters-container:not(.dstack-swagger-empty-parameters) { + margin-bottom: 28px; +} + +.dstack-swagger-ui .swagger-ui .tab-header::before { + content: none; + display: none; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item { + display: inline-block; + margin: 0 -1px 0 0; + padding: 0 !important; + position: relative; + z-index: 2; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item:last-child { + margin-right: -2px; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item + .tab-item::before { + background: rgba(0, 0, 0, 0.25); + bottom: 0; + content: ""; + display: block; + left: 0; + position: absolute; + top: 0; + width: 0.5px; + z-index: 5; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item.active::before, +.dstack-swagger-ui .swagger-ui .tab-header .tab-item.active + .tab-item::before { + display: block; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item h4 { + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item h4 span { + border: 1px solid transparent; + border-radius: 0; + color: rgba(0, 0, 0, 0.6); + display: block; + font-family: var(--md-text-font-family); + font-size: 16.5px !important; + font-weight: 700 !important; + line-height: 
1.2 !important; + min-width: 80px; + padding: 18px 18px 16px !important; + position: relative; + text-align: center; + transition: + background-color 0.25s cubic-bezier(0.4, 0, 0.2, 1), + color 0.25s cubic-bezier(0.4, 0, 0.2, 1); + z-index: 3; +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item.active h4 span { + background: var(--dstack-swagger-quote-bg); + border-color: black; + border-style: dotted; + color: var(--md-default-fg-color); +} + +.dstack-swagger-ui .swagger-ui .tab-header .tab-item h4 span::after, +.dstack-swagger-ui .swagger-ui .opblock .tab-header .tab-item.active h4 span::after { + content: none !important; + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .opblock-description-wrapper, +.dstack-swagger-ui .swagger-ui .opblock-external-docs-wrapper, +.dstack-swagger-ui .swagger-ui .opblock-title_normal, +.dstack-swagger-ui .swagger-ui .parameters-container, +.dstack-swagger-ui .swagger-ui .responses-inner { + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .responses-wrapper.dstack-swagger-responses .responses-inner { + padding-top: 16px; +} + +.dstack-swagger-ui .swagger-ui .opblock > .opblock-description-wrapper { + margin-bottom: 16px; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-source-operation-description { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .markdown, +.dstack-swagger-ui .swagger-ui .renderedMarkdown, +.dstack-swagger-ui .swagger-ui .opblock .opblock-summary-description, +.dstack-swagger-ui .swagger-ui .opblock-description-wrapper, +.dstack-swagger-ui .swagger-ui .opblock-external-docs-wrapper, +.dstack-swagger-ui .swagger-ui .parameters-col_description, +.dstack-swagger-ui .swagger-ui .response-col_description, +.dstack-swagger-ui .swagger-ui .response-col_links { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: 0.8rem; + font-weight: 400; + line-height: 1.3rem; +} + +.dstack-swagger-ui .swagger-ui .markdown p, +.dstack-swagger-ui 
.swagger-ui .renderedMarkdown p, +.dstack-swagger-ui .swagger-ui .opblock-description-wrapper p, +.dstack-swagger-ui .swagger-ui .opblock-external-docs-wrapper p, +.dstack-swagger-ui .swagger-ui .parameters-col_description p, +.dstack-swagger-ui .swagger-ui .response-col_description p { + color: inherit; + font-family: inherit; + font-size: inherit; + font-weight: inherit; + line-height: inherit; + margin: 1em 0; +} + +.dstack-swagger-ui .swagger-ui .markdown p:first-child, +.dstack-swagger-ui .swagger-ui .renderedMarkdown p:first-child, +.dstack-swagger-ui .swagger-ui .opblock-description-wrapper p:first-child, +.dstack-swagger-ui .swagger-ui .opblock-external-docs-wrapper p:first-child, +.dstack-swagger-ui .swagger-ui .parameters-col_description p:first-child, +.dstack-swagger-ui .swagger-ui .response-col_description p:first-child { + margin-top: 0; +} + +.dstack-swagger-ui .swagger-ui .markdown p:last-child, +.dstack-swagger-ui .swagger-ui .renderedMarkdown p:last-child, +.dstack-swagger-ui .swagger-ui .opblock-description-wrapper p:last-child, +.dstack-swagger-ui .swagger-ui .opblock-external-docs-wrapper p:last-child, +.dstack-swagger-ui .swagger-ui .parameters-col_description p:last-child, +.dstack-swagger-ui .swagger-ui .response-col_description p:last-child { + margin-bottom: 0; +} + +/* Table reset shared by the parameter and response tables. */ +.dstack-swagger-ui .swagger-ui .parameters, +.dstack-swagger-ui .swagger-ui .responses-table { + border-collapse: collapse; +} + +.dstack-swagger-ui .swagger-ui .table-container { + padding: 0 !important; +} + +/* Parameter table is laid out as a block; border-collapse comes from the shared reset above. */ +.dstack-swagger-ui .swagger-ui .parameters { + display: block; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .parameters > tbody { + display: grid; + gap: 14px; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .parameters > tbody > tr { + align-items: start; + column-gap: 16px; + display: grid; + grid-template-columns: minmax(180px, 32%) minmax(0, 1fr); +} + +.dstack-swagger-ui .swagger-ui .parameters > tbody > tr > td { + display:
block; + min-width: 0; + width: auto !important; +} + +.dstack-swagger-ui .swagger-ui .parameters > thead { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui table { + padding: 0 !important; +} + +.dstack-swagger-ui .swagger-ui .parameters thead tr th, +.dstack-swagger-ui .swagger-ui .parameters tbody tr td, +.dstack-swagger-ui .swagger-ui .responses-table > thead > tr > td, +.dstack-swagger-ui .swagger-ui .responses-table > tbody > tr.response > td { + border-bottom: 0; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .responses-table > tbody > tr.response:last-child > td { + border-bottom: 0; +} + +.dstack-swagger-ui .swagger-ui .parameters-col_name, +.dstack-swagger-ui .swagger-ui .parameters-col_description { + max-width: none; + padding-left: 0 !important; + padding-right: 0 !important; + vertical-align: top; +} + +.dstack-swagger-ui .swagger-ui .parameters-col_name { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: 0.8rem; + line-height: 1.3rem; +} + +.dstack-swagger-ui .swagger-ui .parameters-col_description { + margin-bottom: 0; + width: auto; +} + +.dstack-swagger-ui .swagger-ui .parameters-col_description .json-schema-form, +.dstack-swagger-ui .swagger-ui .parameters-col_description .json-schema-form-item { + margin: 0; + max-width: none; + width: 100%; +} + +.dstack-swagger-ui + .swagger-ui + .parameters-col_description + > :is(.markdown, .renderedMarkdown, p):not(:empty) + + :is(.json-schema-form, .json-schema-form-item, input, select, textarea) { + margin-top: 8px; +} + +.dstack-swagger-ui .swagger-ui .parameters-col_description .parameter__default { + display: none; +} + +.dstack-swagger-ui .swagger-ui .parameter__name, +.dstack-swagger-ui .swagger-ui .parameter__type, +.dstack-swagger-ui .swagger-ui .parameter__in, +.dstack-swagger-ui .swagger-ui .parameter__deprecated { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-weight: 400; + line-height: 
inherit; +} + +.dstack-swagger-ui .swagger-ui .parameter__name { + background: var(--md-code-bg-color); + color: var(--md-default-fg-color); + display: inline; + font-family: var(--md-code-font-family); + font-size: 0.7rem; + font-weight: 500; + padding: 0.05rem 0.22rem; +} + +.dstack-swagger-ui .swagger-ui .parameter__name.required span { + display: none; +} + +.dstack-swagger-ui .swagger-ui .parameter__name.required::after { + content: "*"; + color: var(--md-default-fg-color); + font-size: inherit; + font-weight: inherit; + padding: 0 0 0 0.05rem; + position: static; + vertical-align: baseline; +} + +.dstack-swagger-ui .swagger-ui .parameter__type, +.dstack-swagger-ui .swagger-ui .parameter__in, +.dstack-swagger-ui .swagger-ui .parameter__deprecated { + color: rgba(0, 0, 0, 0.6); + display: inline; + font-size: 0.7rem; + margin: 0 0 0 0.35rem; +} + +.dstack-swagger-ui .swagger-ui .parameter__type .prop-format, +.dstack-swagger-ui .swagger-ui .parameter__in .prop-format { + color: inherit; + font-family: inherit; + font-size: inherit; + font-weight: inherit; +} + +.dstack-swagger-ui .swagger-ui .btn, +.dstack-swagger-ui .swagger-ui select, +.dstack-swagger-ui .swagger-ui input[type=email], +.dstack-swagger-ui .swagger-ui input[type=file], +.dstack-swagger-ui .swagger-ui input[type=password], +.dstack-swagger-ui .swagger-ui input[type=search], +.dstack-swagger-ui .swagger-ui input[type=text], +.dstack-swagger-ui .swagger-ui textarea { + background-color: transparent; + border: var(--dstack-swagger-hairline-border); + border-radius: 0; + color: var(--md-default-fg-color); +} + +.dstack-swagger-ui .swagger-ui .parameters input[type=text], +.dstack-swagger-ui .swagger-ui .parameters input[type=number], +.dstack-swagger-ui .swagger-ui .parameters select, +.dstack-swagger-ui .swagger-ui .parameters textarea { + background-color: transparent; + border: 1px solid rgba(0, 0, 0, 0.87); + border-radius: 0; + box-sizing: border-box; + color: var(--md-default-fg-color); + 
cursor: default; + font-family: var(--md-text-font-family); + font-size: var(--dstack-swagger-control-font-size); + font-weight: 400; + height: var(--dstack-swagger-control-height); + line-height: calc(var(--dstack-swagger-control-height) - 2px); + margin: 0; + max-width: none; + min-height: var(--dstack-swagger-control-height); + padding: 0 0.65rem; + width: 100%; +} + +/* Stack parameter name and description in one column on narrow screens; breakpoint matches the 44.9375em mobile block used elsewhere in this stylesheet. */ +@media screen and (max-width: 44.9375em) { + .dstack-swagger-ui .swagger-ui .parameters > tbody > tr { + gap: 6px; + grid-template-columns: minmax(0, 1fr); + } +} + +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters input[type=text], +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters input[type=number], +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters textarea { + cursor: text; +} + +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters select { + cursor: pointer; +} + +.dstack-swagger-ui .swagger-ui .parameters textarea { + height: auto; + line-height: 1.3rem; + min-height: calc(var(--dstack-swagger-control-height) * 3); + padding-bottom: 0.45rem; + padding-top: 0.45rem; +} + +.dstack-swagger-ui .swagger-ui .parameters input::placeholder, +.dstack-swagger-ui .swagger-ui .parameters textarea::placeholder { + color: rgba(0, 0, 0, 0.45); + opacity: 1; +} + +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters input[type=text]:hover, +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters input[type=number]:hover, +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters select:hover, +.dstack-swagger-ui .swagger-ui .opblock.is-open .parameters textarea:hover { + border-color: black; +} + +/* Drop the focus ring and shadow for pointer-initiated focus. */ +.dstack-swagger-ui .swagger-ui :where(a, button, input, select, textarea, [role="button"], .btn, .model-box-control, .opblock-summary-control):focus { + box-shadow: none !important; + outline: none !important; +} + +/* Keep a visible indicator for keyboard focus (WCAG 2.4.7). Same specificity as the rule above, so source order makes this one win; the negative offset draws the ring inside the box so overflow-hidden ancestors do not clip it. */ +.dstack-swagger-ui .swagger-ui :where(a, button, input, select, textarea, [role="button"], .btn, .model-box-control, .opblock-summary-control):focus-visible { + box-shadow: none !important; + outline: 2px solid var(--dstack-swagger-border-color) !important; + outline-offset: -2px; +}
+ +.dstack-swagger-ui .swagger-ui .btn { + font-weight: 500; + padding: 0 0.85rem; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-hidden-reset { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .btn.authorize { + align-items: center; + background: var(--dstack-swagger-border-color); + border: var(--dstack-swagger-primary-border); + border-radius: 0; + color: var(--md-default-bg-color); + display: inline-flex; + font-size: var(--dstack-swagger-control-font-size); + font-weight: 500 !important; + gap: 8px; + height: var(--dstack-swagger-control-height); + line-height: 1.2; + padding: 0 0.85rem; + white-space: nowrap; +} + +.dstack-swagger-ui .swagger-ui .btn.authorize:hover { + background: black; + border-color: black; + color: var(--md-default-bg-color); +} + +.dstack-swagger-ui .swagger-ui .try-out__btn, +.dstack-swagger-ui .swagger-ui .btn.execute, +.dstack-swagger-ui .swagger-ui .dstack-swagger-clear-btn, +.dstack-swagger-ui .swagger-ui .btn.cancel { + align-items: center; + border: var(--dstack-swagger-primary-border); + border-radius: 0; + display: inline-flex; + font-size: var(--dstack-swagger-control-font-size); + font-weight: 500 !important; + gap: 8px; + height: var(--dstack-swagger-control-height); + justify-content: center; + line-height: 1.2; + min-width: 6rem; + padding: 0 0.85rem; + white-space: nowrap; + width: auto; +} + +.dstack-swagger-ui .swagger-ui .try-out__btn, +.dstack-swagger-ui .swagger-ui .dstack-swagger-clear-btn, +.dstack-swagger-ui .swagger-ui .btn.cancel { + background: transparent; + color: var(--md-default-fg-color); +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-clear-btn, +.dstack-swagger-ui .swagger-ui .try-out__btn.cancel, +.dstack-swagger-ui .swagger-ui .btn.cancel { + border-radius: var(--dstack-swagger-action-radius); + min-width: 0; +} + +.dstack-swagger-ui .swagger-ui .try-out__btn:hover, +.dstack-swagger-ui .swagger-ui .dstack-swagger-clear-btn:hover, +.dstack-swagger-ui .swagger-ui 
.btn.cancel:hover { + background: transparent; + border-color: black; + color: black; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-wrapper, +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row { + align-items: center; + display: flex; + gap: 8px; + margin-bottom: 16px !important; + margin-top: 16px !important; + padding: 0 !important; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-wrapper { + margin-top: 16px !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-wrapper > .dstack-swagger-execute-row { + margin-bottom: 0 !important; + margin-top: 0 !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row > .btn { + align-items: center; + background: transparent !important; + border: var(--dstack-swagger-primary-border) !important; + border-radius: var(--dstack-swagger-action-radius) !important; + box-sizing: border-box; + color: var(--md-default-fg-color) !important; + display: inline-flex; + font-size: var(--dstack-swagger-control-font-size); + font-weight: 500 !important; + height: var(--dstack-swagger-control-height) !important; + justify-content: center; + line-height: 1.2; + overflow: visible; + padding: 0 0.85rem !important; + text-overflow: clip; + white-space: nowrap; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row > .btn.execute { + flex: 1 1 auto !important; + min-width: 0 !important; + width: auto !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row > .btn:not(.execute) { + flex: 0 0 auto !important; + min-width: 5rem !important; + width: auto !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row > .btn:hover { + background: transparent !important; + border-color: black !important; + color: black !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-wrapper ~ .responses-wrapper, +.dstack-swagger-ui .swagger-ui .dstack-swagger-execute-row ~ .responses-wrapper { + margin-top: 0 
!important; +} + +.dstack-swagger-ui .swagger-ui .download-contents { + background: var(--dstack-swagger-border-color); + border: var(--dstack-swagger-primary-border); + border-radius: 0; + color: var(--md-default-bg-color); +} + +.dstack-swagger-ui .swagger-ui .btn.authorize span { + float: none; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .btn.authorize svg { + flex: 0 0 auto; + fill: currentColor; + height: 16px; + margin: 0; + order: -1; + width: 16px; +} + +.dstack-swagger-ui .swagger-ui .btn.authorize svg path { + fill: currentColor; +} + +.dstack-swagger-ui .swagger-ui .highlight-code > .microlight, +.dstack-swagger-ui .swagger-ui .opblock-body pre.microlight { + background: var(--dstack-swagger-code-bg-color) !important; + border: 0; + border-radius: 0; + color: var(--md-code-fg-color) !important; + font-family: var(--md-code-font-family); + font-size: var(--dstack-code-font-size); + font-weight: 400; + line-height: var(--dstack-code-line-height); + margin: 0; + overflow: auto; + padding: 65px 30px 35px 40px; + position: relative; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-request-editor { + --dstack-editable-code-max-height: var(--dstack-swagger-schema-code-max-height); + margin: 0; +} + +.dstack-swagger-ui + .swagger-ui + .model-example.dstack-swagger-edit-schema-active + > :not(.tab):not(.dstack-swagger-edit-request-schema) { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-edit-request-schema[hidden] { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .highlight-code > .microlight::before, +.dstack-swagger-ui .swagger-ui .opblock-body pre.microlight::before { + background: #d9515d; + border-radius: 50%; + content: ""; + display: inline-block; + height: 12px; + left: 15px; + position: absolute; + top: 15px; + width: 12px; + -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; + box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; +} + +.dstack-swagger-ui .swagger-ui 
textarea:not(.curl) { + background: transparent !important; + border: 0; + border-radius: 0; + box-sizing: border-box; + color: var(--md-code-fg-color) !important; + font-family: var(--md-code-font-family); + font-size: var(--dstack-code-font-size); + font-weight: 400; + line-height: var(--dstack-code-line-height); + margin: 0; + min-height: 220px; + padding: 65px 30px 35px 40px; + resize: vertical; + width: 100%; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-request-editor textarea:not(.curl) { + max-height: var(--dstack-editable-code-max-height); + min-height: 220px; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui textarea.curl { + background: transparent !important; + border: 1px solid rgba(0, 0, 0, 0.87); + border-radius: 0; + color: var(--md-default-fg-color) !important; + font-family: var(--md-text-font-family); + font-size: var(--dstack-swagger-control-font-size); + font-weight: 400; + line-height: calc(var(--dstack-swagger-control-height) - 2px); + min-height: var(--dstack-swagger-control-height); + padding: 0 0.65rem; + resize: none; + text-transform: none; +} + +.dstack-swagger-ui .swagger-ui .highlight-code > .microlight > code, +.dstack-swagger-ui .swagger-ui .opblock-body pre.microlight > code, +.md-typeset .dstack-swagger-ui .swagger-ui pre:not(.dstack-scrollable-code-pre) > code { + background: transparent !important; + color: inherit !important; + font-family: var(--md-code-font-family) !important; + padding: 0 !important; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-json-editor { + --dstack-scrollable-code-max-height: var(--dstack-swagger-schema-code-max-height); + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-json-editor > .md-code__nav { + background: transparent; + right: 15px; + top: 9px; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-json-editor .md-code__button { + background: transparent; + border: 0; + color: #a2a2a2; + height: 24px; + padding: 0; + width: 24px; +} + +.dstack-swagger-ui .swagger-ui 
.dstack-swagger-json-editor .md-code__button::after { + height: 16px; + margin: 4px; + width: 16px; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-json-editor .md-code__button:hover, +.dstack-swagger-ui .swagger-ui .dstack-swagger-json-editor .md-code__button:focus-visible { + color: #eee; + outline: none; +} + +.dstack-swagger-ui .swagger-ui .model-box, +.dstack-swagger-ui .swagger-ui .model-container, +.dstack-swagger-ui .swagger-ui .json-schema-2020-12, +.dstack-swagger-ui .swagger-ui .json-schema-2020-12 button, +.dstack-swagger-ui .swagger-ui section.models { + background: transparent; + border-radius: 0; +} + +.dstack-swagger-ui .swagger-ui .json-schema-2020-12 { + margin: 0 0 15px; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .json-schema-2020-12--embedded, +.dstack-swagger-ui .swagger-ui .model-box .json-schema-2020-12 { + border-left: 0; + margin: 0; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .json-schema-2020-12__constraint, +.dstack-swagger-ui .swagger-ui .json-schema-2020-12-keyword__value--warning, +.dstack-swagger-ui .swagger-ui .json-schema-2020-12-json-viewer__value--warning { + background: transparent; + border: 0; + border-radius: 0; + color: var(--md-default-fg-color); +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper, +.dstack-swagger-ui .swagger-ui .auth-container .errors { + background: var(--dstack-swagger-quote-bg); + border: 0; + border-radius: 0; + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: 0.8rem; + line-height: 1.3rem; + margin: 1em 0; + padding: 8px 25px; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper h4, +.dstack-swagger-ui .swagger-ui .auth-container .errors h4 { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: 0.8rem; + font-weight: 700; + line-height: 1.3rem; + margin: 0 0 0.45rem; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper small, +.dstack-swagger-ui .swagger-ui .auth-container .errors small, 
+.dstack-swagger-ui .swagger-ui .errors-wrapper span, +.dstack-swagger-ui .swagger-ui .auth-container .errors span { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: inherit; + line-height: inherit; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper ul, +.dstack-swagger-ui .swagger-ui .auth-container .errors ul { + margin: 0.45rem 0 0; + padding: 0; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper li, +.dstack-swagger-ui .swagger-ui .auth-container .errors li { + color: var(--md-default-fg-color); + font-family: var(--md-text-font-family); + font-size: inherit; + line-height: inherit; + margin: 0; + padding: 0; + position: static; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper li::before, +.dstack-swagger-ui .swagger-ui .auth-container .errors li::before { + content: none !important; + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper li + li, +.dstack-swagger-ui .swagger-ui .auth-container .errors li + li { + margin-top: 0.2rem; +} + +.dstack-swagger-ui .swagger-ui .errors-wrapper .errors__clear-btn { + display: none; +} + +.dstack-swagger-ui .swagger-ui .dialog-ux .modal-ux { + background: var(--md-default-bg-color); + border: 0; + border-radius: 0; +} + +.dstack-swagger-ui .swagger-ui .dialog-ux .modal-ux-header { + border-bottom: 0; +} + +.dstack-swagger-ui[data-openapi-tag] .swagger-ui .opblock-tag { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .model-hint { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .tab { + background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.025), rgb(0 114 255 / 0.25%), rgba(0, 42, 255, 0.0125)); + border: 0.5px solid rgba(0, 0, 0, 0.5); + border-radius: 0; + display: inline-flex; + height: 100%; + list-style: none !important; + margin: 16px 0 -3px; + overflow: unset; + padding: 0; + position: relative; + z-index: 1; +} + +.dstack-swagger-ui .swagger-ui .tab::before { + content: none; + display: none; +} + 
+.dstack-swagger-ui .swagger-ui .tab + [role="tabpanel"] { + margin-top: 16px; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .model-example { + margin-top: 8px; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .dstack-swagger-request-example { + margin-top: 8px; +} + +.dstack-swagger-ui .swagger-ui .opblock > .dstack-swagger-operation-curl-example { + margin-top: 20px; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-request-curl-termy > [data-termynal]::before { + -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; + box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930 !important; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .model-example > .tab { + margin: 0; +} + +.dstack-swagger-ui + .swagger-ui + .opblock-section-request-body + .model-example + > .tab + ~ :is( + .body-param, + .highlight-code, + .model-box, + .dstack-editable-code, + .dstack-swagger-edit-request-schema + ) { + margin-top: var(--dstack-swagger-tab-content-gap) !important; +} + +.dstack-swagger-ui + .swagger-ui + .opblock-section-request-body + .model-example + :is(.dstack-swagger-request-editor, .dstack-swagger-edit-request-schema) { + margin-top: var(--dstack-swagger-tab-content-gap) !important; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .dstack-swagger-request-example > .tab { + margin: 0; +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .dstack-swagger-request-curl-panel, +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .dstack-swagger-request-schema-panel { + margin-top: var(--dstack-swagger-tab-content-gap); +} + +.dstack-swagger-ui .swagger-ui .opblock-section-request-body .dstack-swagger-request-model-hidden { + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .tab li { + display: inline-block; + margin: 0 -1px 0 0; + padding: 0 !important; + position: relative; + z-index: 2; +} + +.dstack-swagger-ui .swagger-ui .tab li:last-of-type { + 
margin-right: -2px; +} + +.dstack-swagger-ui .swagger-ui .tab li:first-of-type::after { + content: none !important; + display: none !important; +} + +.dstack-swagger-ui .swagger-ui .tab li + li::before { + background: rgba(0, 0, 0, 0.25); + bottom: 0; + content: ""; + display: block; + height: auto; + left: 0; + margin: 0; + position: absolute; + top: 0; + width: 0.5px; + z-index: 5; +} + +.dstack-swagger-ui .swagger-ui .tab li.active::before, +.dstack-swagger-ui .swagger-ui .tab li.active + li::before { + display: block; +} + +.dstack-swagger-ui .swagger-ui .tab li button.tablinks { + border: 1px solid transparent; + border-radius: 0; + color: rgba(0, 0, 0, 0.6); + cursor: pointer; + display: block; + font-family: var(--md-text-font-family); + font-size: 16.5px !important; + font-weight: 700 !important; + line-height: 1.2 !important; + min-width: 80px; + padding: 18px 18px 16px !important; + position: relative; + text-align: center; + transition: + background-color 0.25s cubic-bezier(0.4, 0, 0.2, 1), + color 0.25s cubic-bezier(0.4, 0, 0.2, 1); + z-index: 4; +} + +.dstack-swagger-ui .swagger-ui .tab li button.tablinks:hover, +.dstack-swagger-ui .swagger-ui .tab li.active button.tablinks, +.dstack-swagger-ui .swagger-ui .tab li button.tablinks[aria-selected="true"] { + color: var(--md-default-fg-color); + outline: none; +} + +.dstack-swagger-ui .swagger-ui .tab li.active button.tablinks, +.dstack-swagger-ui .swagger-ui .tab li button.tablinks[aria-selected="true"] { + background: var(--dstack-swagger-quote-bg); + border-color: transparent; + border-style: dotted; + /* margin-right: 1px; */ +} + +.dstack-swagger-ui .swagger-ui .tab li button.tablinks:focus-visible { + color: var(--md-accent-fg-color); + outline: none; +} + +.md-typeset .dstack-swagger-ui .swagger-ui .tab > li:first-child::before { + content: none; + display: none; +} + +.dstack-swagger-ui .swagger-ui .dstack-swagger-model-label, +.dstack-swagger-ui .swagger-ui .dstack-swagger-model-inline-title { + 
color: var(--md-default-fg-color); + font-family: var(--md-code-font-family); + font-size: 12px; +} + +.dstack-swagger-ui .swagger-ui .markdown code, +.dstack-swagger-ui .swagger-ui .renderedMarkdown code, +.md-typeset .dstack-swagger-ui .swagger-ui pre:not(.dstack-scrollable-code-pre) > code { + font-size: 12px !important; +} diff --git a/mkdocs/assets/stylesheets/termynal.css b/mkdocs/assets/stylesheets/termynal.css new file mode 100644 index 0000000000..77a8eba359 --- /dev/null +++ b/mkdocs/assets/stylesheets/termynal.css @@ -0,0 +1,227 @@ +/** + * termynal.js + * + * @author Ines Montani + * @version 0.0.1 + * @license MIT + */ + +:root { + --color-bg: rgb(21, 22, 29); + --color-text: #eee; + --color-text-subtle: #a2a2a2; +} + +[data-termynal] span { + white-space: pre; +} + +.small > [data-termynal] { + font-size: var(--dstack-code-font-size); + line-height: 1.4; +} + +[data-termynal] { + margin-block-start: 1em; + margin-block-end: 1em; + overflow-x: scroll; + /*white-space: pre;*/ + /*width: 750px;*/ + max-width: 100%; + background: var(--color-bg); + color: var(--color-text); + /* font-size: 18px; */ + font-size: 14px; + line-height: var(--dstack-code-line-height); + /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ + font-family: var(--md-code-font-family) !important; + border-radius: 3px; + padding: 60px 12px 25px 25px; + /*padding: 75px 45px 35px;*/ + position: relative; + -webkit-box-sizing: border-box; + box-sizing: border-box; +} + +[data-termynal]:before { + content: ''; + position: absolute; + top: 15px; + left: 15px; + display: inline-block; + width: 12px; + height: 12px; + border-radius: 50%; + /* A little hack to display the window buttons in one pseudo element. 
*/ + background: #d9515d; + /*-webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ + /* box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;*/ + -webkit-box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; + box-shadow: 20px 0 0 #f4c025, 40px 0 0 #3ec930; +} + +[data-termynal]:after { + content: ''; + position: absolute; + color: var(--color-text-subtle); + top: 7px; + left: 0; + width: 100%; + text-align: center; +} + +[data-termynal].dstack-termy-scrollable { + overflow: visible; +} + +[data-termynal].dstack-termy-scrollable > [data-termynal-body] { + max-height: var(--dstack-termy-max-height); + overflow: auto; + scrollbar-color: #a2a2a2 rgba(255, 255, 255, 0.06); + scrollbar-width: thin; +} + +[data-termynal].dstack-termy-scrollable > [data-termynal-body]::-webkit-scrollbar { + height: 10px; + width: 10px; +} + +[data-termynal].dstack-termy-scrollable > [data-termynal-body]::-webkit-scrollbar-track { + background: rgba(255, 255, 255, 0.06); +} + +[data-termynal].dstack-termy-scrollable > [data-termynal-body]::-webkit-scrollbar-thumb { + background: #a2a2a2; + background-clip: content-box; + border: 2px solid transparent; + border-radius: 8px; +} + +[data-termynal].dstack-termy-scrollable > [data-termynal-body]::-webkit-scrollbar-thumb:hover { + background: #e5e5e9; + background-clip: content-box; +} + +[data-termynal].dstack-termy-has-copy > a[data-terminal-control] { + box-sizing: border-box; + padding-right: 32px; +} + +[data-termynal] > .dstack-termy-copy { + -webkit-appearance: none; + appearance: none; + background: transparent; + border: 0; + color: var(--color-text-subtle); + cursor: pointer; + height: 24px; + padding: 0; + position: absolute; + right: 15px; + top: 9px; + width: 24px; + z-index: 2; +} + +[data-termynal] > .dstack-termy-copy:before { + background: rgb(21, 22, 29); + border: 1px solid var(--color-text-subtle); + color: var(--color-text); + content: 'Copied to clipboard'; + font-family: var(--md-text-font-family); + font-size: 11px; + 
line-height: 1; + opacity: 0; + padding: 5px 7px; + pointer-events: none; + position: absolute; + right: calc(100% + 8px); + top: 50%; + transform: translateY(-50%); + transition: opacity 0.15s ease; + white-space: nowrap; +} + +[data-termynal] > .dstack-termy-copy.dstack-termy-copy-copied:before { + opacity: 1; +} + +[data-termynal] > .dstack-termy-copy:after { + background-color: currentColor; + content: ''; + display: block; + height: 16px; + margin: 4px; + mask-image: var(--md-code-copy-icon, var(--md-clipboard-icon)); + mask-position: center; + mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-image: var(--md-code-copy-icon, var(--md-clipboard-icon)); + -webkit-mask-position: center; + -webkit-mask-repeat: no-repeat; + -webkit-mask-size: contain; + width: 16px; +} + +[data-termynal] > .dstack-termy-copy:hover, +[data-termynal] > .dstack-termy-copy:focus-visible { + color: var(--color-text); + outline: none; +} + +a[data-terminal-control] { + color: #aebbff; + display: block; + margin-right: 0.5rem; + margin-top: 1rem; + text-align: right; +} + +[data-ty] { + display: block; + line-height: inherit; +} + +[data-ty]:before { + /* Set up defaults and ensure empty lines are displayed. 
*/ + content: ''; + display: inline-block; + vertical-align: middle; +} + +[data-ty="input"]:before, +[data-ty-prompt]:before { + margin-right: 0.75em; + color: var(--color-text-subtle); +} + +[data-ty="input"]:before { + content: '$'; +} + +[data-ty][data-ty-prompt]:before { + content: attr(data-ty-prompt); +} + +[data-ty-cursor]:after { + content: attr(data-ty-cursor); + font-family: monospace; + margin-left: 0.5em; + -webkit-animation: blink 1s infinite; + animation: blink 1s infinite; +} + + +/* Cursor animation */ + +@-webkit-keyframes blink { + 50% { + opacity: 0; + } +} + +@keyframes blink { + 50% { + opacity: 0; + } +} diff --git a/mkdocs/blog/index.md b/mkdocs/blog/index.md new file mode 100644 index 0000000000..56dbff40cd --- /dev/null +++ b/mkdocs/blog/index.md @@ -0,0 +1,7 @@ + + +# Blog diff --git a/mkdocs/blog/posts/0_20.md b/mkdocs/blog/posts/0_20.md new file mode 100644 index 0000000000..550cb60ad4 --- /dev/null +++ b/mkdocs/blog/posts/0_20.md @@ -0,0 +1,127 @@ +--- +title: "dstack 0.20 GA: Fleet-first UX and other important changes" +date: 2025-12-18 +description: "TBA" +slug: "0_20" +image: https://dstack.ai/static-assets/static-assets/images/dstack-0_20.png +categories: + - Changelog +links: + - Release notes: https://github.com/dstackai/dstack/releases/tag/0.20.0 + - Migration guide: https://dstack.ai/docs/guides/migration/#0_20 +--- + +# dstack 0.20 GA: Fleet-first UX and other important changes + +We’re releasing `dstack` 0.20.0, a major update that improves how teams orchestrate GPU workloads for development, training, and inference. Most `dstack` updates are incremental and backward compatible, but this version introduces a few major changes to how you work with `dstack`. + +In `dstack` 0.20.0, fleets are now a first-class concept, giving you more explicit control over how GPU capacity is provisioned and managed. 
We’ve also added *Events*, which record important system activity—such as scheduling decisions, run status changes, and resource lifecycle updates—so it’s easier to understand what’s happening without digging through server logs. + + + +This post goes through the changes in detail and explains how to upgrade and migrate your existing setup. + + + +## Fleets + +In earlier versions, submitting a run that didn’t match any existing fleet would cause `dstack` to automatically create one. While this reduced setup overhead, it also made capacity provisioning implicit and less predictable. + +With `dstack` 0.20.0, fleets must be created explicitly and treated as first-class resources. This shift makes capacity provisioning declarative, improving control over resource limits, instance lifecycles, and overall fleet behavior. + +For users who previously relied on auto-created fleets, similar behavior can be achieved by defining an elastic fleet, for example: + +
    + + ```yaml + type: fleet + # The name is optional; if not specified, it is generated randomly + name: default + + # Can be a range or a fixed number + # Allows provisioning up to 2 instances + nodes: 0..2 + + # Uncomment to ensure instances are inter-connected + #placement: cluster + + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h + + resources: + # Allows provisioning up to 8 GPUs + gpu: 0..8 + ``` + +
    + +If the `nodes` range starts above `0`, `dstack` provisions the initial capacity upfront and scales additional instances on demand, enabling more predictable capacity planning. + +When a run does not explicitly reference a fleet (via the [`fleets`](../../docs/reference/dstack.yml/dev-environment.md#fleets) property), `dstack` automatically selects one that satisfies the run’s requirements. + +## Events + +Previously, when `dstack` changed the state of a run or other resource, that information was written only to the server logs. This worked for admins, but it made it hard for users to understand what happened or why. + +Starting with version `0.20.0`, `dstack` exposes these events directly to users. + +Each resource now includes an `Events` tab in the UI, showing related events in real time: + + + +There is also a dedicated `Events` page that aggregates events across resources. You can filter by project, user, run, or job to quickly narrow down what you’re looking for: + + + +The same information is available through the CLI: + + + +This makes it easier to track state changes, debug issues, and review past actions without needing access to server logs. + +## Runs + +This release updates several defaults related to run configuration. The goal is to reduce implicit assumptions and make run configuration more convenient. + +### Working directory + +Previously, the `working_dir` property defaulted to `/workflow`. Now, the default working directory is always taken from the Docker image. + +The working directory in the default Docker images (if you don't specify `image`) is now set to `/dstack/run`. + +### Repo directory + +Previously, if you didn't specify a repo path, the repo was cloned to `/workflow`. Now, in that case the repo will be cloned to the working directory. + +
    + +```yaml +type: dev-environment +name: vscode + +repos: + # Clones the repo from the parent directory (`examples/..`) to the working directory + - .. + +ide: vscode +``` + +
    + +Also, now if the repo directory is not empty, the run will fail with an error. + +## Backward compatibility + +While the update introduces breaking changes, 0.19.* CLIs remain compatible with 0.20.* servers. + +> Note, the 0.20.* CLI only works with a 0.20.* server. + +!!! warning "Breaking changes" + This release introduces breaking changes that may affect existing setups. Before upgrading either the CLI or the server, review the [migration guide](https://dstack.ai/docs/guides/migration/#0_20). + +## What's next + +1. Follow the [Installation](../../docs/installation.md) guide +2. Try the [Quickstart](../../docs/quickstart.md) +3. Report issues on [GitHub](https://github.com/dstackai/dstack/issues) +4. Ask questions on [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/agentic-orchestration.md b/mkdocs/blog/posts/agentic-orchestration.md new file mode 100644 index 0000000000..71ad35e876 --- /dev/null +++ b/mkdocs/blog/posts/agentic-orchestration.md @@ -0,0 +1,291 @@ +--- +title: "Infrastructure orchestration is an agent skill" +date: 2026-03-11 +description: "Agentic engineering pulls compute discovery, provisioning, scheduling, and observability into the execution loop. Infrastructure orchestration is becoming an agent skill." +slug: agentic-orchestration +image: https://dstack.ai/static-assets/static-assets/images/agentic-orchestration.png +categories: + - Changelog +--- + +# Infrastructure orchestration is an agent skill + +Andrej Karpathy’s [autoresearch](https://github.com/karpathy/autoresearch) demo is a crisp example of “agentic engineering” in practice: a short Markdown spec (`program.md`) drives an automated research cycle that iterates many times on one GPU with minimal human involvement. This post extends that same idea one layer down. 
+ + + + + +Closing a research loop on one GPU is already useful. Closing the full engineering loop—training jobs, evaluations, deploying inference endpoints, running regressions, rolling forward/back—forces one additional requirement: infrastructure orchestration has to be something an agent can do reliably. + +## Before: orchestration lived outside the workload + +Most orchestration approaches treat “what to run” and “where it runs” as separate. + +Teams decide the placement context outside the workload: which cluster or region, which GPU class, which runtime image, which quota pool, which scheduling lane. The workload is then expressed in a way that assumes that context is already fixed. + +That separation is not “wrong.” It matches how humans operate: decisions about capacity and placement are made deliberately, reviewed, and changed on a human timescale. The orchestrator executes inside a box that humans chose. + +## After: provisioning and scheduling move into the loop + +Agentic engineering collapses the separation. + +When an agent is responsible for progress—not just for drafting code—compute choices affect how quickly it can iterate, what it can afford to try, and whether it can ship a result as a service. The orchestration decisions aren’t just “which cluster?” + +Training often wants one shape of resources (long-running, stable, sometimes multi-GPU or multi-node). + +Evaluation wants another (many small runs, often interruptible). Inference wants another (a long-lived service with predictable restarts, health checks, and a stable endpoint). If those shapes require switching tools and rewriting glue each time, the “agent does execution” idea breaks down at the infrastructure boundary. + +!!! info "Where orchestration becomes an agent skill" + Orchestration becomes an agent skill when agents can choose and operate compute as part of execution, instead of handing infrastructure decisions back to a human. 
+ +## What “agent skill” means here + +This isn’t about giving an agent raw cloud credentials and hoping for the best. “Agent skill” here means there is an interface and set of abstractions that are stable enough to teach, predictable enough to automate, and specific enough for GPU work. + +An agent needs to reason about GPU constraints as first-class inputs: memory and count, placement for multi-node jobs, preemptible vs stable capacity, and the difference between “run 100 short evals” and “keep an inference endpoint alive.” + +A true orchestration skill is one where the agent can answer, mechanically: what ran, where it ran, what resources it used, what state transitions happened, and what to do next. + +## What this does to platform teams + +The platform team shift is not “replace humans with agents.” It’s a change in what the platform optimizes for. + +Platforms are often designed around human workflows: manual approvals, bespoke runbooks, and implicit institutional knowledge. Agentic engineering needs a different center of gravity: an agent-native control plane that exposes explicit building blocks for GPU jobs and inference services, plus the constraints that keep cost and risk bounded. + +The old model treats orchestration as an internal service layer that humans operate on behalf of everyone else. In the emerging model, that ownership shifts. + +The platform team's job becomes enabling agent-driven orchestration and controlling it safely. That means defining the supported abstractions, access boundaries, budgets, quotas, and observability that let agents provision compute and operate workloads directly without turning the platform into an unbounded automation surface. + +## What this does to cloud and datacenter providers + +For cloud and datacenter providers, GPUs don’t become less important; the interface around them becomes decisive for agent-operated workflows. 
+ +Agents need capacity to be discoverable, provisionable, and observable through repeatable semantics. A provider can have excellent hardware and still be painful to use if the operational contract is “humans click around and tribal-knowledge it into working.” In an agent-driven workflow, anything that can’t be expressed cleanly in an orchestration interface becomes friction. + +That’s why multi-environment orchestration layers matter. They don’t only reduce vendor lock-in; they make capacity usable by automation, which is increasingly the consumer. + +Providers that still require provider-specific operating patterns remain harder to operationalize, even when the underlying hardware is strong. + +## What this looks like with dstack + +`dstack` is an open-source control plane for GPU provisioning and orchestration across GPU clouds and on-prem clusters, with a workflow model that explicitly targets development, training, and inference. + +The way to read `dstack` is as a CLI with a small set of abstractions that line up with the agent-skill requirements above. + +**Step 1: treat available compute as queryable state** + +`dstack` exposes “offers” as a way to query available hardware configurations from configured backends or on-prem clusters. That turns “where can I run this?” into something automation can ask and answer deterministically, instead of hard-coding instance types and regions. + +```shell +$ dstack offer --gpu H100:1.. --max-offers 3 + + # BACKEND REGION INSTANCE TYPE RESOURCES SPOT PRICE + 1 verda FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 2 runpod US-KS-2 NVIDIA H100 PCIe 16xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.39 + 3 nebius eu-north1 gpu-h100-sxm 16xCPU, 200GB, 1xH100 (80GB), 100.0GB (disk) no $2.95 + ... + Shown 3 of 99 offers +``` + +**Step 2: define capacity pools and provisioning bounds** + +Fleets are `dstack`’s way to make capacity explicit. 
A fleet can represent elastic capacity (scale from zero on demand) or a pre-provisioned pool (including SSH-managed on-prem hosts). It also supports operational patterns that matter for GPU efficiency, such as splitting a multi-GPU node into blocks so that many small jobs don’t waste a full 8-GPU box. The agent operates within declared capacity instead of interacting with provider infrastructure directly. + +```yaml +# fleet.dstack.yml +type: fleet +name: h100-fleet + +nodes: 0..2 +idle_duration: 1h + +resources: + gpu: H100:8 + +blocks: 4 +``` + +
    + +```shell +$ dstack apply -f fleet.dstack.yml +``` + +
    + +If the fleet is elastic (`nodes` set to a range), later runs can provision instances on demand. If it is pre-provisioned, the capacity is already present. + +
    + +```shell +$ dstack fleet + + NAME NODES GPU SPOT BACKEND PRICE STATUS CREATED + gpu-cluster 2..4 A100:80GB:8 auto aws $0..$32 active 2 hours ago + instance=0 A100:80GB:8 spot aws (us-ea…) $28.50 busy 2 hours ago + instance=1 A100:80GB:8 spot gcp (us-ce…) $26.80 busy 1 hour ago + on-prem 2 - - ssh - active 3 days ago + instance=0 A100:40GB:4 - ssh - busy 3 days ago + instance=1 A100:40GB:4 - ssh - idle 3 days ago + test-fleet 0..1 gpu:16GB on-demand * - active 10 min ago +``` + +
    + +**Step 3: run evaluation or training loops as tasks** + +Tasks are the batch form: training runs, eval runs, data processing. + +```yaml +# train.dstack.yml +type: task +name: train-qwen + +image: huggingface/trl-latest-gpu +working_dir: /workspace + +files: + - .:/workspace + +commands: + - pip install -r requirements.txt + - python train.py --model Qwen/Qwen2.5-7B-Instruct --output-dir /workspace/checkpoints + +max_duration: 2h +resources: + gpu: 24GB + shm_size: 16GB +``` + +Tasks can be distributed (`nodes` set to a number), in which case `dstack` handles cluster selection and job coordination across nodes. + +Once a task is running, the agent may attach to it and SSH inside the container to run commands interactively, or inspect runtime state before deciding what to do next. + +
    + +```shell +$ dstack attach train-qwen --logs +``` + +
    + +
    + +```shell +$ ssh train-qwen +``` + +
    + +**Step 4: run model inference as services** + +Services are the inference form: they turn a model into an endpoint that later steps in the loop can call, monitor, and scale as needed. + +```yaml +# serve.dstack.yml +type: service +name: qwen25-instruct + +image: lmsysorg/sglang:latest + +env: + - MODEL_ID=Qwen/Qwen2.5-32B-Instruct + +commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + +port: 8000 +model: Qwen/Qwen2.5-32B-Instruct +replicas: 1..4 +scaling: + metric: rps + target: 10 + +resources: + gpu: 80GB + disk: 200GB +``` + +Once the service is running, the endpoint can be called directly, including from another agent step: + +
    + +```shell +$ curl https://qwen25-instruct.example.com/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer ' \ + -d '{ + "model": "Qwen/Qwen2.5-32B-Instruct", + "messages": [{"role": "user", "content": "Hello"}] + }' +``` + +
    + +This matters because the agent does not just launch the service. It can treat the endpoint itself as part of the workflow: deploy it, call it, monitor it, and adjust it through the same orchestration layer. + +**Step 5: observe through events and metrics** + +`dstack` exposes structured lifecycle data through events and metrics, so the loop can inspect state transitions and resource usage directly instead of inferring everything from logs. + +
    + +```shell +$ dstack event --within-run train-qwen + + [2026-01-21 13:09:37] [run train-qwen] Run submitted. Status: SUBMITTED + [2026-01-21 13:09:57] [job train-qwen-0-0] Job status changed SUBMITTED -> PROVISIONING + [2026-01-21 13:11:49] [job train-qwen-0-0] Job status changed PULLING -> RUNNING +``` + +
    + +
    + +```shell +$ dstack metrics train-qwen + + NAME STATUS CPU MEMORY GPU + train-qwen running 92% 118GB/200GB gpu=0 mem=71GB/80GB util=97% +``` + +
    + +Taken together, these are the fine-grained primitives a fully autonomous agent needs: discover capacity, provision it, run the right workload type, inspect state, and decide what to do next without handing orchestration back to a human operator. + +## Skills + +Those primitives make orchestration operable by agents, but they do not encode all of the workload-specific know-how. Training recipes, inference tuning, eval patterns, and runtime trade-offs still need to live somewhere. + +`dstack` already ships an installable [SKILL.md](https://skills.sh/dstackai/dstack/dstack) so tools like Claude Code, Codex, Cursor, and others can learn how to operate `dstack` configs and CLI without guessing: + +
    + +```shell +$ npx skills add dstackai/dstack +``` + +
    + +Skills are the layer where that operational know-how can be packaged and reused. + +> The orchestrator provides the control surface. Skills provide the workload knowledge on top of it. + +## Why open source and the ecosystem matter here + +Once orchestration becomes the interface that agents use, ecosystem depth matters for both sides. + +Teams want a control plane they can inspect and extend because it sits in the path of cost, reliability, and security. Providers want their capacity to be usable through standard patterns instead of one-off glue. Open source accelerates both: more backends, more integrations, more operational recipes, and fewer bespoke adapters per provider or per team. + +`dstack` is MPL-2.0 licensed. That matters because agentic orchestration will not be built once inside a single vendor boundary; it will be assembled across GPU clouds, Kubernetes, on-prem infrastructure, and a growing ecosystem of specialized operational patterns. + +## What's next + +Agentic engineering is moving toward agents that own execution, not agents that merely assist humans during execution. If training jobs, evaluations, and inference services are part of execution, then GPU orchestration has to be part of what agents can operate directly. + +If you want to use `dstack` for these workflows or contribute to the surrounding ecosystem, issues and feedback are welcome in the [GitHub repo](https://github.com/dstackai/dstack). diff --git a/mkdocs/blog/posts/amd-mi300x-inference-benchmark.md b/mkdocs/blog/posts/amd-mi300x-inference-benchmark.md new file mode 100644 index 0000000000..18b8d343c6 --- /dev/null +++ b/mkdocs/blog/posts/amd-mi300x-inference-benchmark.md @@ -0,0 +1,224 @@ +--- +title: "Benchmarking Llama 3.1 405B on 8x AMD MI300X GPUs" +date: 2024-10-09 +description: "Exploring how the inference performance of Llama 3.1 405B varies on 8x AMD MI300X GPUs across vLLM and TGI backends in different use cases." 
+slug: amd-mi300x-inference-benchmark +image: https://dstack.ai/static-assets/static-assets/images/dstack-hotaisle-amd-mi300x-prompt-v5.png +categories: + - Benchmarks +--- + +# Benchmarking Llama 3.1 405B on 8x AMD MI300X GPUs + +At `dstack`, we've been adding support for AMD GPUs with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets), +so we saw this as a great chance to test our integration by benchmarking AMD GPUs. Our friends at +[Hot Aisle](https://hotaisle.xyz/), who build top-tier +bare metal compute for AMD GPUs, kindly provided the hardware for the benchmark. + + + + + +With access to a bare metal machine with 8x AMD MI300X GPUs from Hot Aisle, we decided to skip smaller models and went +with Llama 3.1 405B. To make the benchmark interesting, we tested how inference performance varied across different +backends (vLLM and TGI) and use cases (real-time vs batch inference, different context sizes, etc.). + +## Benchmark setup + +Here is the spec of the bare metal machine we got: + +- Intel® Xeon® Platinum 8470 2G, 52C/104T, 16GT/s, 105M Cache, Turbo, HT (350W) [x2] +- AMD MI300X GPU OAM 192GB 750W GPUs [x8] +- 64GB RDIMM, 4800MT/s Dual Rank [x32] + +??? info "Set up an SSH fleet" + + Hot Aisle provided us with SSH access to the machine. To make it accessible via `dstack`, + we created an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets) using the following configuration: + +
    + + ```yaml + type: fleet + name: hotaisle-fleet + + placement: any + + ssh_config: + user: hotaisle + identity_file: ~/.ssh/hotaisle_id_rsa + + hosts: + - hostname: ssh.hotaisle.cloud + port: 22013 + ``` + +
    + + After running `dstack apply -f hotaisle.dstack.yml`, we were ready to run dev environments, tasks, and services on + this fleet via `dstack`. + +??? info "vLLM" + + ``` + PyTorch version: 2.4.1+rocm6.1 + Is debug build: False + CUDA used to build PyTorch: N/A + ROCM used to build PyTorch: 6.1.40091-a8dbc0c19 + + OS: Ubuntu 22.04.4 LTS (x86_64) + GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + Clang version: 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.1.0 24103 7db7f5e49612030319346f900c08f474b1f9023a) + CMake version: version 3.26.4 + Libc version: glibc-2.35 + + Python version: 3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0] (64-bit runtime) + Python platform: Linux-6.8.0-45-generic-x86_64-with-glibc2.35 + Is CUDA available: True + CUDA runtime version: Could not collect + CUDA_MODULE_LOADING set to: LAZY + GPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-) + Nvidia driver version: Could not collect + cuDNN version: Could not collect + HIP runtime version: 6.1.40093 + MIOpen runtime version: 3.1.0 + Is XNNPACK available: True + + Versions of relevant libraries: + [pip3] mypy==1.4.1 + [pip3] mypy-extensions==1.0.0 + [pip3] numpy==1.26.4 + [pip3] pytorch-triton-rocm==3.0.0 + [pip3] pyzmq==24.0.1 + [pip3] torch==2.4.1+rocm6.1 + [pip3] torchaudio==2.4.1+rocm6.1 + [pip3] torchvision==0.16.1+fdea156 + [pip3] transformers==4.45.1 + [pip3] triton==3.0.0 + [conda] No relevant packages + ROCM Version: 6.1.40091-a8dbc0c19 + Neuron SDK Version: N/A + vLLM Version: 0.6.3.dev116+g151ef4ef + vLLM Build Flags: + CUDA Archs: Not Set; ROCm: Disabled; Neuron: Disabled + ``` + +??? info "TGI" + The `ghcr.io/huggingface/text-generation-inference:sha-11d7af7-rocm` Docker image was used. + +For conducting the tests, we've been using the [`benchmark_serving`](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py) provided by vLLM. 
+ +## Observations + +### Token/sec per batch size + +TGI consistently exceeds vLLM in token throughput across all batch sizes, with the performance difference growing larger +as the batch size increases. For batch sizes exceeding 64, the performance disparity becomes quite notable. + + + +The prompts maintain a constant sequence length of 80 tokens each. + +### TTFT per batch size + +TGI surpasses vLLM in Time to First Token for all batch sizes, except for batch sizes 2 and 32. + + + +The performance difference is considerable for larger batch sizes. + +### Token/sec per context size + +To evaluate performance with larger prompt sizes, we conducted tests using prompts of 10,000 tokens. + + + +### TTFT per context size + +In this case, TGI demonstrated an advantage over vLLM in both token throughput and time to first token (TTFT). + + + +### Token/sec and TTFT per RPS + +To assess the performance scalability of TGI and vLLM, we conducted tests by gradually increasing the Requests Per +Second (RPS) and the total Requests Sent (RS) while keeping the prompt size consistent at 1,000 tokens for all trials. + +In this experiment, we initiated requests beginning with 30 requests at 1 RPS, then increased to 60 requests at 2 RPS, +and continued this pattern up to 150 requests at 5 RPS. + + + +Ideally, we would expect all trials to complete within the same time frame. However, due to resource limitations and +increasing resource utilization, higher RPS does not lead to a proportional increase in throughput (tokens per second) +or maintain Time to First Token (TTFT). + + + +At 1 RPS, vLLM performs slightly better than TGI. However, between 2 and 4 RPS, TGI outperforms vLLM in both throughput and TTFT. + +> Notably, TGI begins to drop requests once it reaches 5 RPS. + +We repeated the test using a higher number of requests, ranging from 300 to 900. + + + +> At 900 requests with a rate of 3 requests per second (RPS), TGI dropped a majority of the requests. 
However, its +> performance improved notably when the number of requests was below 900. + + + +### VRAM consumption + +When considering VRAM consumption right after loading model weights, TGI allocates approximately 28% less VRAM compared +to vLLM. + + + +This difference may be related to how vLLM [pre-allocates GPU cache](https://docs.vllm.ai/en/latest/models/performance.html). + +## Conclusion + +1. For small sequence lengths, starting with a batch size of 64, TGI significantly outperforms vLLM in terms of throughput and TTFT. +2. For larger sequence lengths, TGI outperforms vLLM even more in both throughput and TTFT, with the difference increasing as the batch size grows. +3. At higher request rates, TGI continues to outperform vLLM, likely due to its superior ability to batch requests efficiently. + +!!! info "Limitation" + * In certain circumstances (e.g., at higher request rates), for unknown reasons, TGI dropped requests, making it + impossible to accurately track throughput and TTFT. + * With vLLM, we used the default backend configuration. With better tuning, we might have achieved improved performance. + +In general, the 8x AMD MI300X is a good fit for larger models and allows us to make the most of its VRAM, especially for +larger batches. + +If you’d like to support us in doing more benchmarks, please let us know. + +## What's next? + +While we wait for AMD to announce new GPUs and for data centers to offer them, we’re considering tests with NVIDIA GPUs +like the H100 and H200, as well as possibly Google TPU. + +> Also, the next step is to measure how the FP8 version of the model would perform on this hardware. + +### Source code + +The source code used for this benchmark can be found in our +[GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/inference). + +If you have questions, feedback, or want to help improve the benchmark, please reach out to our team. 
+ +## Thanks to our friends + +### Hot Aisle + +[Hot Aisle](https://hotaisle.xyz/) +is the primary sponsor of this benchmark, and we are sincerely grateful for their hardware and support. + +If you'd like to use top-tier bare metal compute with AMD GPUs, we recommend going +with Hot Aisle. Once you gain access to a cluster, it can be easily accessed via `dstack`'s [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets). + +### Runpod +If you’d like to use on-demand compute with AMD GPUs at affordable prices, you can configure `dstack` to +use [Runpod](https://runpod.io/). In +this case, `dstack` will be able to provision fleets automatically when you run dev environments, tasks, and +services. diff --git a/mkdocs/blog/posts/amd-on-runpod.md b/mkdocs/blog/posts/amd-on-runpod.md new file mode 100644 index 0000000000..0d5c60b4e9 --- /dev/null +++ b/mkdocs/blog/posts/amd-on-runpod.md @@ -0,0 +1,121 @@ +--- +title: Supporting AMD accelerators on Runpod +date: 2024-08-21 +description: "dstack, the open-source AI container orchestration platform, adds support for AMD accelerators, with Runpod as the first supported cloud provider." +slug: amd-on-runpod +categories: + - Changelog +--- + +# Supporting AMD accelerators on Runpod + +While `dstack` helps streamline the orchestration of containers for AI, its primary goal is to offer vendor independence +and portability, ensuring compatibility across different hardware and cloud providers. + +Inspired by the recent `MI300X` benchmarks, we are pleased to announce that Runpod is the first cloud provider to offer +AMD GPUs through `dstack`, with support for other cloud providers and on-prem servers to follow. + + + +## Specification + +For reference, below is a comparison of the `MI300X` and `H100 SXM` specs, incl. the prices offered by Runpod. 
+ +| | MI300X | H100 SXM | +|---------------------------------|-------------------------------------------|--------------| +| **On-demand pricing** | $3.99/hr | $3.99/hr | +| **VRAM** | 192 GB | 80 GB | +| **Memory bandwidth** | 5.3 TB/s | 3.4 TB/s | +| **FP16** | 2,610 TFLOPs | 1,979 TFLOPs | +| **FP8** | 5,220 TFLOPs | 3,958 TFLOPs | + +One of the main advantages of the `MI300X` is its VRAM. For example, with the `H100 SXM`, you wouldn't be able to fit the FP16 +version of Llama 3.1 405B into a single node with 8 GPUs—you'd have to use FP8 instead. However, with the `MI300X`, you +can fit FP16 into a single node with 8 GPUs, and for FP8, you'd only need 4 GPUs. + +With the [latest update](https://github.com/dstackai/dstack/releases/0.18.11rc1), +you can now specify an AMD GPU under `resources`. Below are a few examples. + +## Configuration + +=== "Service" + Here's an example of a [service](../../docs/concepts/services.md) that deploys + Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd). + +
    + + ```yaml + type: service + name: amd-service-tgi + + image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm + env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct + - TRUST_REMOTE_CODE=true + - ROCM_USE_FLASH_ATTN_V2_TRITON=true + commands: + - text-generation-launcher --port 8000 + port: 8000 + # Register the model + model: meta-llama/Meta-Llama-3.1-70B-Instruct + + # Uncomment to leverage spot instances + #spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` + +
    + +=== "Dev environment" + Here's an example of a [dev environment](../../docs/concepts/dev-environments.md) using + [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd)'s + Docker image: + + ```yaml + type: dev-environment + name: amd-dev-tgi + + image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm + env: + - HF_TOKEN + - ROCM_USE_FLASH_ATTN_V2_TRITON=true + ide: vscode + + # Uncomment to leverage spot instances + #spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` + +!!! info "Docker image" + Please note that if you want to use AMD, specifying `image` is currently required. This must be an image that includes + ROCm drivers. + +To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. + +Once the configuration is ready, run `dstack apply -f `, and `dstack` will automatically provision the +cloud resources and run the configuration. + +??? info "Control plane" + If you specify `model` when running a service, `dstack` will automatically register the model on + an OpenAI-compatible endpoint and allow you to use it for chat via the control plane UI. + + + +## What's next? + +1. The examples above demonstrate the use of +[TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd). +AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon. +2. Runpod is the first cloud provider where dstack supports AMD. More cloud providers will be supported soon as well. +3. Want to give Runpod and `dstack` a try? Make sure you've signed up for [Runpod](https://www.runpod.io/), + then [set up](../../docs/reference/server/config.yml.md#runpod) the `dstack server`. + +> Have questions or feedback? Join our [Discord](https://discord.gg/u8SmfwPpMd) +server. 
diff --git a/mkdocs/blog/posts/amd-on-tensorwave.md b/mkdocs/blog/posts/amd-on-tensorwave.md new file mode 100644 index 0000000000..82a7908805 --- /dev/null +++ b/mkdocs/blog/posts/amd-on-tensorwave.md @@ -0,0 +1,240 @@ +--- +title: Using SSH fleets with TensorWave's private AMD cloud +date: 2025-03-11 +description: "This tutorial walks you through how dstack can be used with TensorWave's private AMD cloud using SSH fleets." +slug: amd-on-tensorwave +image: https://dstack.ai/static-assets/static-assets/images/dstack-tensorwave-v2.png +# categories: + # - Case studies +--- + +# Using SSH fleets with TensorWave's private AMD cloud + +Since last month, when we introduced support for private clouds and data centers, it has become easier to use `dstack` +to orchestrate AI containers with any AI cloud vendor, whether they provide on-demand compute or reserved clusters. + +In this tutorial, we’ll walk you through how `dstack` can be used with +[TensorWave](https://tensorwave.com/) using +[SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). + + + + + +TensorWave is a cloud provider specializing in large-scale AMD GPU clusters for both +training and inference. + +Before following this tutorial, ensure you have access to a cluster. You’ll see the cluster and its nodes in your +TensorWave dashboard. + + + +## Creating a fleet + +??? info "Prerequisites" + Once `dstack` is [installed](https://dstack.ai/docs/installation), create a project folder. + +
    + + ```shell + $ mkdir tensorwave-demo && cd tensorwave-demo + ``` + +
    + +Now, define an SSH fleet configuration by listing the IP addresses of each node in the cluster, +along with the SSH user and SSH key configured for each host. + +
    + +```yaml +type: fleet +name: my-tensorwave-fleet + +placement: cluster + +ssh_config: + user: dstack + identity_file: ~/.ssh/id_rsa + hosts: + - hostname: 64.139.222.107 + blocks: auto + - hostname: 64.139.222.108 + blocks: auto +``` + +
    + +You can set `blocks` to `auto` if you want to run concurrent workloads on each instance. +Otherwise, you can omit this property. + +Once the configuration is ready, apply it using `dstack apply`: + +
    + +```shell +$ dstack apply -f fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE RESOURCES STATUS CREATED + my-tensorwave-fleet 0 8xMI300X (192GB) 0/8 busy 3 mins ago + 1 8xMI300X (192GB) 0/8 busy 3 mins ago + +``` + +
    + +`dstack` will automatically connect to each host, detect the hardware, install dependencies, and make them ready for +workloads. + +## Running workloads + +Once the fleet is created, you can use `dstack` to run workloads. + +### Dev environments + +A dev environment lets you access an instance through your desktop IDE. + +
    + +```yaml +type: dev-environment +name: vscode + +image: rocm/pytorch:rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0 +ide: vscode + +resources: + gpu: MI300X:8 +``` + +
    + +Apply the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f .dstack.yml + +Submit the run `vscode`? [y/n]: y + +Launching `vscode`... +---> 100% + +To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+vscode/workflow +``` + +
    + +Open the link to access the dev environment using your desktop IDE. + +### Tasks + +A task allows you to schedule a job or run a web app. Tasks can be distributed and support port forwarding. + +Below is a distributed training task configuration: + +
    + +```yaml +type: task +name: train-distrib + +nodes: 2 + +image: rocm/pytorch:rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0 +commands: + - pip install torch + - export NCCL_IB_GID_INDEX=3 + - export NCCL_NET_GDR_LEVEL=0 + - torchrun --nproc_per_node=8 --nnodes=2 --node_rank=$DSTACK_NODE_RANK --master_port=29600 --master_addr=$DSTACK_MASTER_NODE_IP test/tensorwave/multinode.py 5000 50 + +resources: + gpu: MI300X:8 +``` + +
    + +Run the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f train.dstack.yml + +Submit the run `train-distrib`? [y/n]: y + +Provisioning `train-distrib`... +---> 100% +``` + +
    + +`dstack` automatically runs the container on each node while passing +[system environment variables](../../docs/concepts/tasks.md#system-environment-variables) +which you can use with `torchrun`, `accelerate`, or other distributed frameworks. + +### Services + +A service allows you to deploy a model or any web app as a scalable and secure endpoint. + +Create the following configuration file inside the repo: + +
    + +```yaml +type: service +name: deepseek-r1-sglang + +image: rocm/sglang-staging:20250212 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1 + - HSA_NO_SCRATCH_RECLAIM=1 +commands: + - python3 -m sglang.launch_server --model-path $MODEL_ID --port 8000 --tp 8 --trust-remote-code +port: 8000 +model: deepseek-ai/DeepSeek-R1 + +resources: + gpu: mi300x:8 + +volumes: + - /root/.cache/huggingface:/root/.cache/huggingface +``` + +
    + +Run the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f deepseek.dstack.yml + +Submit the run `deepseek-r1-sglang`? [y/n]: y + +Provisioning `deepseek-r1-sglang`... +---> 100% + +Service is published at: + http://localhost:3000/proxy/services/main/deepseek-r1-sglang/ +Model deepseek-ai/DeepSeek-R1 is published at: + http://localhost:3000/proxy/models/main/ +``` + +
    + +## See it in action + +Want to see how it works? Check out the video below: + + + +!!! info "What's next?" + 1. See [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets) + 2. Read about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/amd-pd-disaggregation.md b/mkdocs/blog/posts/amd-pd-disaggregation.md new file mode 100644 index 0000000000..5e7cf7aa2a --- /dev/null +++ b/mkdocs/blog/posts/amd-pd-disaggregation.md @@ -0,0 +1,221 @@ +--- +title: "Deploying inference endpoints with PD disaggregation on AMD GPUs" +date: 2026-05-21 +description: "A walkthrough of deploying PD disaggregated inference on AMD GPUs with dstack and Shepherd Model Gateway (SMG), using SGLang workers and the Mooncake Transfer Engine." +slug: amd-pd-disaggregation +image: https://dstack.ai/static-assets/static-assets/images/amd-pd-disaggregation.png +categories: + - Changelog +--- + +# Deploying inference endpoints with PD disaggregation on AMD GPUs + +`dstack` is an open-source, AI-native orchestrator that works across clouds, Kubernetes clusters, on-prem fleets, hardware vendors, and frameworks. Alongside training, inference is one of the primary use cases `dstack` supports out of the box. + + + +`dstack` recently added native support for Prefill–Decode (PD) disaggregation. It works with [Shepherd Model Gateway](smg.md) (SMG) — a high-performance inference gateway evolved from the SGLang Router — on both NVIDIA and AMD, and with [NVIDIA Dynamo](https://docs.nvidia.com/dynamo/) on NVIDIA. This post walks through deploying it on AMD GPUs with SMG. + + + +## Why PD disaggregation + +PD disaggregation is useful when a single LLM deployment has two different bottlenecks: + +- **Prefill** processes the prompt. 
It is compute-bound, parallelizable, and has a direct impact on Time to First Token (TTFT). +- **Decode** generates tokens one by one. It is memory-bound, sequential, and has a direct impact on inter-token latency. + +When the same worker handles both phases, every replica has to serve both bottlenecks. With PD disaggregation, prefill and decode run as separate pools, and each pool can be sized and scaled independently. + +The tradeoff is operational: for every request, the KV cache produced by the prefill worker must be transferred to the decode worker before generation can continue. That transfer sits on the TTFT path, so the cluster needs a high-bandwidth, low-latency interconnect such as RDMA over InfiniBand or RoCE, rather than TCP over a conventional NIC. + +In this walkthrough, [SMG](https://lightseek.org/smg/) routes requests between SGLang workers. On AMD, the workers use the [Mooncake Transfer Engine](https://github.com/kvcache-ai/Mooncake) to transfer KV cache over RDMA/RoCE. In the configuration we tested, the RDMA fabric is exposed by Broadcom `bnxt_re` Ethernet devices. + +??? info "Prerequisites" + Running PD disaggregation on `dstack` requires first creating a [fleet](https://dstack.ai/docs/concepts/fleets/) with `placement: cluster`, so that prefill and decode workers share a high-bandwidth interconnect. This can be a [backend fleet](https://dstack.ai/docs/concepts/fleets/#backend-fleets_1) provisioned by `dstack` on a cloud or Kubernetes cluster, or an [SSH fleet](https://dstack.ai/docs/concepts/fleets/#ssh-fleets_1) registered against bare-metal or VM hosts you already manage. + +## Validating the interconnect + +To measure end-to-end bandwidth across nodes, run the [NCCL/RCCL tests example](../../docs/examples/clusters/nccl-rccl-tests.md). 
+ +For a quick check that the RDMA devices are visible on a particular host, run: + +
    + +```shell +$ ibv_devices +``` + +
    + +All eight `bnxt_re*` interfaces should be listed. Use `ibv_devinfo` to inspect port state and link details. If devices are missing or in an unexpected state, install or update the NIC driver and userspace RDMA library before proceeding. + +## Deploying the service + +To deploy an inference endpoint with PD disaggregation using `dstack`, define a [service](../../docs/concepts/services.md) with three replica groups: an SMG router, a pool of prefill workers, and a pool of decode workers. + +The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: + +
    + +```yaml +type: service +name: amd-sglang-pd-service + +image: rocm/sgl-dev:v0.5.10.post1-rocm720-mi30x-20260427 +privileged: true + +env: + - MODEL_ID=Qwen/Qwen2.5-72B-Instruct + - HF_TOKEN + - SGLANG_USE_AITER=0 + - SGLANG_ROCM_FUSED_DECODE_MLA=0 + - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600 + - SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600 + - RDMA_DEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 + - NCCL_IB_DISABLE=1 + +replicas: + - count: 1 + commands: + - pip install smg + - | + smg launch \ + --pd-disaggregation \ + --host 0.0.0.0 \ + --port 30000 + resources: + cpu: 4.. + router: + type: sglang + + - count: 1..2 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disaggregation-bootstrap-port 8998 \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + + - count: 1..4 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --decode-attention-backend triton \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + +port: 30000 +model: Qwen/Qwen2.5-72B-Instruct + +# Custom probe is required for PD disaggregation. 
+probes: + - type: http + url: /health + interval: 15s + +volumes: + - /usr/lib64/libibverbs/libbnxt_re-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so +``` + +
    + +`dstack` provisions each group, registers workers with the router, runs health probes, and autoscales prefill and decode pools independently against RPS. + +Worker replicas run on GPU and bind to the Broadcom RDMA devices. While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + +!!! info "RoCE library" + Mooncake uses the RDMA/RoCE interconnect for KV cache transfer. To use the RDMA/RoCE interconnect on Broadcom `bnxt_re` devices, Mooncake requires the Broadcom-specific userspace provider library `libbnxt_re-rdmav34.so` to be available inside the container at `/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so`. We make this library available by mounting the host provider library from `/usr/lib64/libibverbs/libbnxt_re-rdmav34.so`. + +Apply the configuration: + +
    + +```shell +$ HF_TOKEN=... +$ dstack apply -f amd-pd.dstack.yml +``` + +
    + +Once provisioning completes, `dstack` exposes the service through a single endpoint: + +
    + +```shell +$ curl http://localhost:3000/proxy/services/main/amd-sglang-pd-service/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer <dstack token>' \ + -d '{ + "model": "Qwen/Qwen2.5-72B-Instruct", + "messages": [ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming." + } + ] + }' +``` + +
    + +Requests are routed to SMG, which selects the prefill and decode workers for each request. The prefill worker processes the prompt, the decode worker continues generation, and Mooncake transfers the KV cache between them over RoCE. `dstack` registers and deregisters workers with SMG as replicas are added or removed, runs the `/health` probe on each replica, and scales each replica group independently. + +!!! info "Limitations" + - Currently, only one router replica per service is supported. + - The example uses the SGLang inference backend for prefill and decode workers. vLLM backend support is coming soon. + - Autoscaling supports the RPS metric. TTFT and ITL-based autoscaling support is coming soon. + +## Why this matters + +`dstack` provides a single, simple interface for orchestrating training and inference across hardware vendors, serving frameworks, routers, and infrastructure. It removes the need to assemble multiple fragmented tools on top of Kubernetes or build your own orchestration layer in-house. + +!!! info "Benchmarks" + Benchmarks for PD disaggregation on AMD are in progress and will be published in a follow-up. If you are running AMD GPUs and would like to contribute workloads or collaborate on benchmarking, please get in touch. + +Bug reports, feedback, and feature requests are welcome on the [issue tracker](https://github.com/dstackai/dstack/issues) and on [Discord](https://discord.gg/u8SmfwPpMd). + +> *Thanks to Matthew Bettinger at AMD for the collaboration, testing time, and feedback that shaped this integration.* + +## What's next? + +1. Read about [services](https://dstack.ai/docs/concepts/services/) and [fleets](https://dstack.ai/docs/concepts/fleets/) +2. Check the [NCCL/RCCL tests](https://dstack.ai/docs/examples/clusters/nccl-rccl-tests/) example +3. 
Review the [Shepherd Model Gateway](https://lightseek.org/smg/getting-started/) and [SGLang PD disaggregation](https://docs.sglang.ai/advanced_features/pd_disaggregation.html) documentation +4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/benchmark-amd-containers-and-partitions.md b/mkdocs/blog/posts/benchmark-amd-containers-and-partitions.md new file mode 100644 index 0000000000..8b945aaba3 --- /dev/null +++ b/mkdocs/blog/posts/benchmark-amd-containers-and-partitions.md @@ -0,0 +1,491 @@ +--- +title: "Benchmarking AMD GPUs: bare-metal, containers, partitions" +date: 2025-07-15 +description: "TBA" +slug: benchmark-amd-containers-and-partitions +image: https://dstack.ai/static-assets/static-assets/images/benchmark-amd-containers-and-partitions.png +categories: + - Benchmarks +--- + +# Benchmarking AMD GPUs: bare-metal, containers, partitions + +Our new benchmark explores two important areas for optimizing AI workloads on AMD GPUs: First, do containers introduce a performance penalty for network-intensive tasks compared to a bare-metal setup? Second, how does partitioning a powerful GPU like the MI300X affect its real-world performance for different types of AI workloads? + + + + + +This benchmark was supported by [Hot Aisle](https://hotaisle.xyz/), +a provider of AMD GPU bare-metal and VM infrastructure. + +## Benchmark 1: Bare-metal vs containers + +### Finding 1: No loss in interconnect bandwidth + +A common concern is that the abstraction layer of containers might slow down communication between GPUs on different nodes. To test this, we measured interconnect performance using two critical methods: high-level RCCL collectives (AllGather, AllReduce) essential for distributed AI, and low-level RDMA write tests for a raw measure of network bandwidth. 
+ +#### AllGather + +The `all_gather` operation is crucial for tasks like tensor-parallel inference, where results from multiple GPUs must be combined. Our tests showed that container performance almost perfectly matched bare-metal across message sizes from 8MB to 16GB. + + + +#### AllReduce + +Similarly, `all_reduce` is the backbone of distributed training, used for synchronizing gradients. Once again, the results were clear: containers performed just as well as bare-metal. + + + +Both bare-metal and container setups achieved nearly identical peak bus bandwidth (around 350 GB/s for 16GB messages), confirming that containerization does not hinder this fundamental collective operation. + +??? info "Variability" + Both setups showed some variability at smaller message sizes—typical behavior due to kernel launch latencies—but converged to stable, identical peak bandwidths for larger transfers. The fluctuations at smaller sizes are likely caused by non-deterministic factors such as CPU-induced pauses during GPU kernel launches, occasionally favoring one setup over the other. + +#### RDMA write + +To isolate the network from any framework overhead, we ran direct device-to-device RDMA write tests. This measures the raw data transfer speed between GPUs in different nodes. + + + +The results were definitive: bidirectional bandwidth was virtually identical in both bare-metal and container environments across all message sizes, from a tiny 2 bytes up to 8MB. + +#### Conclusion + +Our experiments consistently demonstrate that running multi-node AI workloads inside containers does not degrade interconnect performance. The performance of RCCL collectives and raw RDMA bandwidth on AMD GPUs is on par with a bare-metal configuration. This debunks the myth of a "container tax" and validates containers as a first-class choice for scalable AI infrastructure. 
+ +## Benchmark 2: Partition performance isolated vs mesh + +The AMD GPU can be [partitioned](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html) into smaller, independent units (e.g., NPS4 mode splits one GPU into four partitions). This promises better memory bandwidth utilization. Does this theoretical gain translate to better performance in practice? + +### Finding 1: Higher performance for isolated partitions + +First, we sought to reproduce and extend findings from the [official ROCm blog](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html). We benchmarked the memory bandwidth of a single partition (in CPX/NPS4 mode) against a full, unpartitioned GPU (in SPX/NPS1 mode). + + + +Our results confirmed that a single partition offers superior memory bandwidth. After aggregating the results to ensure an apples-to-apples comparison, we found the partitioned mode delivered consistently higher memory bandwidth across all message sizes, with especially large gains in the 32MB to 128MB range. + +### Finding 2: Worse performance for partition meshes + +Our benchmark showed that isolated partitions in CPX/NPS4 mode deliver strong memory bandwidth. But can these partitions work efficiently together in mesh scenarios? If performance drops when partitions communicate or share load, the GPU loses significant value for real-world workloads. + +#### Data-parallel inference + +We ran eight independent vLLM instances on eight partitions of a single MI300X and compared their combined throughput against one vLLM instance on a single unpartitioned GPU. The single GPU was significantly faster, and the performance gap widened as the request rate increased. The partitions were starved for memory, limiting their ability to handle the KV cache for a high volume of requests. 
+ + + +The degradation stems from increased memory pressure, as each partition has only a fraction of GPU memory, limiting its ability to handle larger workloads efficiently. + +#### Tensor-parallel inference + +We built a toy inference benchmark with PyTorch’s native distributed support to simulate Tensor Parallelism. A single GPU in SPX/NPS1 mode significantly outperformed the combined throughput of 8xCPX/NPS4 partitions. + + + +The gap stems from the overhead of collective operations like `all_gather`, which are needed to synchronize partial outputs across GPU partitions. + +#### Conclusion + +Although GPU partitioning provides a memory bandwidth boost in isolated microbenchmarks, this benefit does not carry over to practical inference scenarios. + +In reality, performance is limited by two factors: + +1. **Reduced memory**: Each partition has only a fraction of the GPU's total HBM, creating a bottleneck for memory-hungry tasks like storing KV caches. +2. **Communication overhead**: When partitions must work together, the cost of communication between them negates the performance gains. + +GPU partitioning is only practical if used dynamically—for instance, to run multiple small development jobs or lightweight models, and then "unfractioning" the GPU back to its full power for larger, more demanding workloads. + +#### Limitations + +1. **Reproducibility**: AMD’s original blog post on partitioning lacked detailed setup information, so we had to reconstruct the benchmarks independently. +2. **Network tuning**: These benchmarks were run on a default, out-of-the-box network configuration. Our results for RCCL (~339 GB/s) and RDMA (~726 Gbps) are slightly below the peak figures [reported by Dell](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/). 
This suggests that further performance could be unlocked with expert tuning of network topology, MTU size, and NCCL environment variables. + +## Benchmark setup + +### Hardware configuration + +Two nodes with below specifications: + +* Dell PowerEdge XE9680 (MI300X) +* CPU: 2 x Intel Xeon Platinum 8462Y+ +* RAM: 2.0 TiB +* GPU: 8 x AMD MI300X +* OS: Ubuntu 22.04.5 LTS +* ROCm: 6.4.1 +* AMD SMI: 25.4.2+aca1101 + +### Benchmark methodology + +The full, reproducible steps are available in our GitHub repository. Below is a summary of the approach. + +#### Creating a fleet + +We first defined a `dstack` [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets) to manage the two-node cluster. + +```yaml +type: fleet +name: hotaisle-fleet +placement: any +ssh_config: + user: hotaisle + identity_file: ~/.ssh/id_rsa + hosts: + - hostname: ssh.hotaisle.cloud + port: 22007 + - hostname: ssh.hotaisle.cloud + port: 22015 +``` + +#### Bare-metal + +**RCCL tests** + +1. Install OpenMPI: + +```shell +apt install libopenmpi-dev openmpi-bin +``` + +2. Clone the RCCL tests repository + +```shell +git clone https://github.com/ROCm/rccl-tests.git +``` + +3. Build RCCL tests + +```shell +cd rccl-tests +make MPI=1 MPI_HOME=$OPEN_MPI_HOME +``` + +4. Create a hostfile with node IPs + +```shell +cat > hostfile < + + + +Our findings reveal that for single-GPU LLM training and inference, both setups deliver comparable performance. The subtle differences we observed highlight how virtualization overhead can influence performance under specific conditions, but for most practical purposes, the performance is nearly identical. + +This benchmark was supported by [Hot Aisle](https://hotaisle.xyz/), +a provider of AMD GPU bare-metal and VM infrastructure. 
+ +## Benchmark 1: Inference + +### Finding 1: Identical performance at moderate concurrency levels and slightly worse otherwise + +**Throughput vs latency** + +Comparing throughput (tokens/second) against end-to-end latency across multiple concurrency levels is an effective way to measure an LLM inference system's scalability and responsiveness. This benchmark reveals how VM and bare-metal environments handle varying loads and pinpoints their throughput saturation points. + + + +At moderate concurrency levels (16–64), both bare-metal and VM deliver near-identical inference performance. At lower levels (4-16), bare-metal shows slightly better throughput, likely due to faster kernel launches and direct device access. At high concurrency (64–128), bare-metal maintains a slight edge in latency and throughput. At a concurrency of 256, throughput saturates for both, suggesting a bottleneck from KV cache pressure on GPU memory. + +## Benchmark 2: Training + +### Finding 1: Identical performance at large batches with only minor variations + +For training, we compare throughput (samples/second) and total runtime across increasing batch sizes. These metrics are crucial for evaluating cost and training efficiency. + +**Throughput** + +Bare metal performs slightly better at small batch sizes, but the VM consistently shows slightly better throughput and runtime at larger batch sizes (≥8). + + + +This may be because larger batches are compute-bound, making CPU-GPU synchronization less frequent. + + + +One plausible explanation for the VM's slight advantage here is that in the bare-metal setup, using only one of eight available GPUs may lead to minor interference from shared background services. + +### Finding 2: Identical convergence, GPU utilization, memory consumption + +Training/eval loss, GPU utilization, and VRAM usage are key indicators of training stability and system efficiency. Loss shows model convergence, while utilization and memory reflect hardware efficiency. 
+ + + +Both VM and bare-metal setups exhibited nearly identical training and evaluation loss curves, indicating consistent model convergence. GPU utilization remained high (~95–100%) and stable in both environments, with similar VRAM consumption. + + + +This demonstrates that from a model training and hardware utilization perspective, both setups are equally efficient. + +## Limitations + +**Multi-GPU** + +This initial benchmark deliberately focused on a single-GPU setup to establish a baseline. A more production-representative evaluation would compare multi-GPU VMs with multi-GPU bare-metal systems. In multi-GPU inference, bare-metal’s direct hardware access could offer an advantage. For distributed training, however, where all GPUs are fully engaged, the performance between VM and bare-metal would likely be even closer. + +Furthermore, it's important to note that the performance gap in virtualized setups can potentially be narrowed significantly with expert hypervisor tuning, such as CPU pinning and NUMA node alignment. + +**Multi-node** + +For distributed training, models are trained across multi-node clusters where control-plane operations rely on the CPU. This can impact interconnect bandwidth and overall performance. A future comparison is critical, as performance will heavily depend on the network virtualization technology used. + + For instance, testing setups that use SR-IOV (Single Root I/O Virtualization)—a technology designed to provide near-native network performance to VMs—would be essential for a complete picture. + +## Conclusion + +Our initial benchmark shows that performance differences between a VM and bare-metal are minimal. Both environments exhibit near-identical behavior aside from a few subtle variations. These findings suggest that VMs are a highly viable option for demanding GPU tasks, with only minor trade-offs under specific conditions, and that AMD GPUs deliver exceptional performance in both virtualized and bare-metal environments. 
+ +## Benchmark setup + +### Hardware configuration + +**VM** + +* CPU: Intel Xeon Platinum 8470: 13c @ 2 GHz +* RAM: 224 GiB +* NVMe: 13 TB +* GPUs: 1 x AMD MI300X + +**Bare-metal** + +* CPU: Intel Xeon Platinum 8470: 13c @ 2 GHz (`--cpuset-cpus="0-12"`) +* RAM: 224 GiB (`--memory="224g"`) +* GPUs: 1 x AMD MI300X + +### Benchmark methodology + +The steps to run benchmarks are identical for both setups, except that the docker run command for bare metal includes `--cpuset-cpus="0-12"` and `--memory="224g"` to match the VM's resources. + +#### Inference + +1. Run a `rocm/vllm` container: + +```shell +docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + rocm/vllm:latest /bin/bash +``` + +2. Start the vLLM server: + +```shell +vllm serve meta-llama/Llama-3.3-70B-Instruct --max-model-len 100000 +``` + +3. Start the benchmark + +```shell +isl=1024 +osl=1024 +MaxConcurrency="4 8 16 32 64 128 256" +RESULT_DIR="./results_concurrency_sweep" +mkdir -p $RESULT_DIR + +for concurrency in $MaxConcurrency; do + TIMESTAMP=$(date +%Y%m%d-%H%M%S) + FILENAME="llama3.3-70B-random-${concurrency}concurrency-${TIMESTAMP}.json" + + python3 /app/vllm/benchmarks/benchmark_serving.py \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --dataset-name random \ + --random-input-len $isl \ + --random-output-len $osl \ + --num-prompts $((10 * $concurrency)) \ + --max-concurrency $concurrency \ + --ignore-eos \ + --percentile-metrics ttft,tpot,e2el \ + --save-result \ + --result-dir "$RESULT_DIR" \ + --result-filename "$FILENAME" +done +``` + +#### Training + +1. Run the `rocm/dev-ubuntu-22.04:6.4-complete` container: + +```shell +docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + rocm/dev-ubuntu-22.04:6.4-complete /bin/bash +``` + +2. 
Install TRL: + +```shell +sudo apt-get update && sudo apt-get install -y git cmake && \ +pip install torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4 && \ +pip install transformers peft wandb && \ +git clone https://github.com/huggingface/trl && \ +cd trl && \ +pip install . +``` + +3. Run the benchmark + +```shell +python3 trl/scripts/sft.py \ + --model_name_or_path Qwen/Qwen2-0.5B \ + --dataset_name trl-lib/Capybara \ + --learning_rate 2.0e-4 \ + --num_train_epochs 1 \ + --packing \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 8 \ + --gradient_checkpointing \ + --eos_token '<|im_end|>' \ + --eval_strategy steps \ + --eval_steps 100 \ + --use_peft \ + --lora_r 32 \ + --lora_alpha 16 +``` + +## Source code + +All source code and findings are available in our [GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/single_gpu_vm_vs_bare-metal). + +## References + +* [vLLM V1 Meets AMD Instinct GPUs: A New Era for LLM Inference Performance](https://rocm.blogs.amd.com/software-tools-optimization/vllmv1-rocm-llm/README.html) + +## What's next? + +Our next steps are to benchmark VM vs. bare-metal performance in multi-GPU and multi-node setups, covering tensor-parallel inference and distributed training scenarios. + +## Acknowledgments + +#### Hot Aisle + +Big thanks to [Hot Aisle](https://hotaisle.xyz/) for providing the compute power behind these benchmarks. +If you’re looking for fast AMD GPU bare-metal or VM instances, they’re definitely worth checking out. 
diff --git a/mkdocs/blog/posts/benchmarking-pd-ratios.md b/mkdocs/blog/posts/benchmarking-pd-ratios.md new file mode 100644 index 0000000000..c303163e13 --- /dev/null +++ b/mkdocs/blog/posts/benchmarking-pd-ratios.md @@ -0,0 +1,141 @@ +--- +title: "Benchmarking Prefill–Decode ratios: fixed vs dynamic" +date: 2025-09-25 +description: "TBA" +slug: benchmarking-pd-ratios +image: https://dstack.ai/static-assets/static-assets/images/benchmarking-pd-ratios.png +categories: + - Benchmarks +--- + +# Benchmarking Prefill–Decode ratios: fixed vs dynamic + +This benchmark investigates whether the Prefill–Decode worker ratio needs to be managed dynamically at runtime, or if a fixed split can deliver the same performance with simpler orchestration. +We evaluate different ratios across workload profiles and concurrency levels to measure their impact on TTFT, ITL, and throughput, and to see whether fixing the ratio in advance is a practical alternative to dynamic adjustment. + + + + + +## Introduction + +### What is Prefill–Decode disaggregation? + +LLM inference has two distinct phases: prefill and decode. Prefill processes all prompt tokens in parallel and is compute-intensive. Decode generates tokens one by one, repeatedly accessing the KV-cache, making it memory- and bandwidth-intensive. DistServe ([Zhong et al., 2024](https://arxiv.org/pdf/2401.09670)) introduced prefill–decode disaggregation to separate these phases across dedicated workers, reducing interference and enabling hardware to be allocated more efficiently. + +### What is the prefill–decode ratio? + +The ratio of prefill to decode workers determines how much capacity is dedicated to each phase. DistServe showed that for a workload with ISL=512 and OSL=64, a 2:1 ratio met both TTFT and TPOT targets. But this example does not answer how the ratio should be chosen more generally, or whether it needs to change at runtime. + +!!! 
info "Reasoning model example" + In the DeepSeek deployment ([LMSYS, 2025](https://lmsys.org/blog/2025-05-05-large-scale-ep)), the ratio was 1:3. This decode-leaning split reflects reasoning workloads, where long outputs dominate. Allocating more workers to decode reduces inter-token latency and keeps responses streaming smoothly. + +### Dynamic ratio + +Dynamic approaches, such as NVIDIA’s [SLA-based](https://docs.nvidia.com/dynamo/latest/architecture/sla_planner.html) +and [Load-based](https://docs.nvidia.com/dynamo/latest/architecture/load_planner.html) planners, adjust the ratio at runtime according to SLO targets or load. However, they do this in conjunction with auto-scaling, which increases orchestration complexity. This raises the question: does the prefill–decode ratio really need to be dynamic, or can a fixed ratio be chosen ahead of time and still provide robust performance? + +## Benchmark purpose + +The aim of this benchmark is to test whether the prefill–decode ratio must be adjusted dynamically at runtime, or if a fixed split can perform just as well. + +If a fixed ratio works across workload profiles and concurrency levels, it would mean the ratio can be chosen ahead of time, simplifying orchestration by removing the need for runtime ratio management. + +We evaluate different ratios across workload types (prefill-heavy, decode-heavy, balanced) and concurrency levels to see how each affects TTFT, ITL, and throughput. + +## Methodology + +To test this, we benchmarked different fixed prefill–decode ratios under varying workload profiles and concurrency levels. The experiments were run on a single node with 8xH200 GPUs, using SGLang to serve the model. + +We compared three ratios—3:1, 2:2, and 1:3—at both low and high concurrency across three workload types: + +* **Prefill-heavy** (ISL > OSL) — e.g., summarization: long inputs, short outputs. 
+* **Decode-heavy** (ISL < OSL) — e.g., reasoning: short inputs, long chains of thought. +* **Balanced** (ISL ≈ OSL) — e.g., translation, paraphrasing. + +Lower concurrency highlights intrinsic trade-offs (prefill-leaning improves TTFT; decode-leaning improves ITL and throughput). Higher concurrency reveals the true bottleneck. In real deployments, success means meeting TTFT/ITL SLOs and sustaining throughput for cost efficiency, so we evaluate both. + +To evaluate performance, we measured TTFT, ITL, and throughput to capture both latency and efficiency. + +??? info "Why these metrics matter" + + * **TTFT** (Time to First Token) captures perceived responsiveness—crucial for interactive experiences (e.g., support bots, code assistants). + * **ITL** (inter-token latency) captures streaming smoothness—critical for long, reasoning-style outputs. + * **Throughput** (tokens/sec) reflects cost efficiency. Prefill-heavy tasks (e.g., summarization of long docs) stress prefill; reasoning tasks stress decode. Maintaining high throughput ensures the under-stressed phase doesn’t leave GPUs idle. + +If a fixed ratio consistently performs well across these metrics, it would indicate that the ratio can be chosen ahead of time, without requiring runtime adjustment. + +## Benchmark setup + +* **GPU**: NVIDIA 8xH200 (SXM5) +* **CPU**: Intel Xeon Platinum 8468 +* **Model**: `openai/gpt-oss-120b` +* **Backend**: SGLang + +For full steps and raw data, see the [GitHub repo](https://github.com/dstackai/benchmarks/tree/main/comparison/pd_ratio). + +## Finding 1: Prefill-heavy workloads + +At lower concurrency, 1:3 yields the best ITL and throughput but the worst TTFT. Ratios 3:1 and 2:2 improve TTFT because more prefill capacity clears prompts faster. However, with 3:1, a single decode worker becomes a chokepoint—queues build up, ITL rises, and overall throughput drops. + +At higher concurrency, 1:3 wins across all metrics. 
Because TTFT = prefill time + waiting at decode + time to first token, ample decode capacity trims the waiting component, improving TTFT even on prefill-heavy inputs. + +In practice, summarization rarely has tight TTFT SLOs—users expect some delay after uploading long documents. Throughput and ITL dominate cost and experience, making 1:3 the recommended split for prefill-heavy workloads at both low and high concurrency. + + + +> Metrics are normalized per chart: the best value for each metric is 100%; others are percentages of that maximum. Lower is better for ITL/TTFT; higher is better for Throughput. + + + +## Finding 2: Decode-heavy workloads + +As with prefill-heavy cases, at lower concurrency a 1:3 split delivers the best ITL and throughput, at the cost of higher TTFT. Ratios 3:1 and 2:2 improve TTFT but degrade streaming smoothness and throughput. + +At higher concurrency, 1:3 again leads across all metrics. + +For reasoning tasks, ITL is usually the tightest SLO—smooth, uninterrupted token streaming drives user experience. We recommend 1:3 for decode-heavy workloads at both low and high concurrency. + + + +> Metrics normalized as above. Lower is better for ITL/TTFT; higher is better for Throughput. + + + +## Finding 3: Balanced workloads + +At lower concurrency, 2:2 provides the most balanced profile: better TTFT than the other ratios, with only modest trade-offs in ITL and throughput versus 1:3. + +At higher concurrency, 1:3 regains the lead across metrics, while 2:2 sees TTFT degrade as decode pressure grows. + +Since 2:2 becomes limiting under load, 1:3 is the safer default for balanced workloads—2:2 can offer slightly lower TTFT at light load, but 1:3 scales better and sustains higher throughput. + + + +> Metrics normalized as above. Lower is better for ITL/TTFT; higher is better for Throughput. + + + +## Conclusion + +Across all workload profiles and concurrency levels, a fixed ratio delivered robust performance. 
+This suggests that while dynamic planners (e.g., SLA- and load-based) provide a flexible framework for worker allocation, in many cases a fixed ratio combined with standard autoscaling can achieve similar outcomes with simpler orchestration. + +A fixed ratio therefore serves as a practical baseline for Prefill–Decode disaggregation. Dynamic adjustment remains valuable when workloads are highly unpredictable, but when profiles are understood, setting the ratio in advance can reduce operational complexity without sacrificing performance. + +## Limitations + +1. This benchmark does not provide a method for determining the fixed ratio. +2. The benchmark evaluated only a limited set of ratios: 3:1, 2:2, and 1:3. +3. The benchmark does not directly validate whether dynamic ratio adjustment (e.g., NVIDIA’s planners) delivers better or worse performance compared with a fixed-ratio approach. +4. The benchmark considers only tensor parallelism; it does not cover data parallelism or other forms of model parallelism, so it does not show how they interact with PD disaggregation and affect latency/throughput trade-offs. + +Overall, more study is needed on how to find the optimal ratio and which factors it depends on, so that a simple and robust framework can be established, ideally without overcomplicating orchestration. 
+ +## References + +* [DistServe](https://arxiv.org/pdf/2401.09670) +* [DeepSeek deployment on 96 H100 GPUs](https://lmsys.org/blog/2025-05-05-large-scale-ep/) +* [Dynamo disaggregated serving](https://docs.nvidia.com/dynamo/latest/architecture/disagg_serving.html#) +* [SGLang PD disaggregation](https://docs.sglang.ai/advanced_features/pd_disaggregation.html) +* [vLLM disaggregated prefilling](https://docs.vllm.ai/en/v0.9.2/features/disagg_prefill.html) diff --git a/mkdocs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/mkdocs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md new file mode 100644 index 0000000000..fb43d7f3ea --- /dev/null +++ b/mkdocs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -0,0 +1,133 @@ +--- +title: "Beyond Kubernetes: 2024 recap and what's next for AI infra" +date: 2024-12-10 +description: "Reflecting on key milestones from 2024, and looking ahead to the next steps in simplifying AI infrastructure orchestration." +slug: beyond-kubernetes-2024-recap-and-whats-ahead +image: https://dstack.ai/static-assets/static-assets/images/beyond-kubernetes-2024-recap-and-whats-ahead.png +--- + +# Beyond Kubernetes: 2024 recap and what's ahead for AI infra + +At `dstack`, we aim to simplify the development, training, and deployment of AI models by offering an +alternative to the complex Kubernetes ecosystem. Our goal is to enable seamless AI infrastructure management across any +cloud or hardware vendor. + +As 2024 comes to a close, we reflect on the milestones we've achieved and look ahead to the next steps. 
+ + + +## Ecosystem + +While `dstack` integrates with leading cloud GPU providers, we aim to expand partnerships with more providers +sharing our vision of simplifying AI infrastructure orchestration with a lightweight, efficient alternative to Kubernetes. + +This year, we’re excited to welcome our first partners: [Lambda](https://lambdalabs.com/), +[Runpod](https://www.runpod.io/), +[CUDO Compute](https://www.cudocompute.com/), +and [Hot Aisle](https://hotaisle.xyz/). + +We’d also like to thank [Oracle](https://www.oracle.com/cloud/) +for their collaboration, ensuring seamless integration between `dstack` and OCI. + +> Special thanks to [Lambda](https://lambdalabs.com/) and +> [Hot Aisle](https://hotaisle.xyz/) for providing NVIDIA and AMD hardware, enabling us to conduct +> [benchmarks](/blog/category/benchmarks/), which +> are essential to advancing open-source inference and training stacks for all accelerator chips. + +## Community + +Thanks to your support, the project has +reached [1.6K stars on GitHub](https://github.com/dstackai/dstack), +reflecting the growing interest and trust in its mission. +Your issues, pull requests, as well as feedback on [Discord](https://discord.gg/u8SmfwPpMd), play a +critical role in the project's development. + +## Fleets + +A key milestone for `dstack` this year has been the introduction of [fleets](/docs/concepts/fleets/), +an abstraction that simplifies the management of clusters. + +### Cloud providers + +Unlike Kubernetes, where node groups are typically managed through auto-scaling policies, `dstack` offers a more +streamlined approach. With `dstack`, you simply define a fleet YAML file and run +`dstack apply`. 
This command automatically provisions clusters across any cloud provider. + +For quick deployments, you can skip defining a fleet altogether. When you run a dev environment, task, or service, +`dstack` creates a fleet automatically. + +### On-prem server + +Managing on-prem resources with `dstack`'s fleets is equally straightforward. If you have SSH access to a group of hosts, simply +list them in a YAML configuration file and run `dstack apply`. + +
    + +```yaml +type: fleet +# The name is optional, if not specified, generated randomly +name: my-fleet + +# Ensure instances are inter-connected +placement: cluster + +# The user, private SSH key, and hostnames of the on-prem servers +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 +``` + +
    + +This turns your on-prem cluster into a `dstack` fleet, ready to run dev environments, tasks, and services. + +### GPU blocks + +Currently, when `dstack` runs a job on an instance, the job uses all available GPUs on that instance. In Q1 2025, we will +introduce [GPU blocks](https://github.com/dstackai/dstack/issues/1780), +allowing the allocation of instance GPUs into discrete blocks that can be reused by concurrent jobs. + +This will enable more cost-efficient utilization of expensive instances. + +## Volumes + +Another key milestone for `dstack` this year has been the introduction of [volumes](/docs/concepts/volumes), addressing +a critical need in AI infrastructure—data storage. + +With `dstack`'s volumes, users can now leverage storage in both cloud and on-prem environments in a unified and +efficient manner. + +## Accelerators + +### NVIDIA + +NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../docs/examples/inference/nim.md) +for model deployment, and we continue to enhance support for the rest of NVIDIA's ecosystem. + +### AMD + +This year, we’re particularly proud of our newly added integration with AMD. + +`dstack` works seamlessly with any on-prem AMD clusters. For example, you can rent such servers through our partner +[Hot Aisle](https://hotaisle.xyz/). + +> Among cloud providers, [AMD](https://www.amd.com/en/products/accelerators/instinct.html) is supported only through Runpod. In Q1 2025, we plan to extend it to +> [Nscale](https://www.nscale.com/), +> [Hot Aisle](https://hotaisle.xyz/), and potentially other providers open to collaboration. 
+ +### Intel + +Our roadmap for Q1 2025 includes integration with +[Intel Gaudi](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), +among other accelerator chips. + +## Join the community + +If you're interested in simplifying AI infrastructure, both in the cloud and on-prem, consider getting involved as a +`dstack` user, open-source contributor, or ambassador. + +Finally, if you're a cloud, hardware, or software vendor, consider contributing to `dstack` and helping us drive it as +an open standard together. diff --git a/mkdocs/blog/posts/changelog-07-25.md b/mkdocs/blog/posts/changelog-07-25.md new file mode 100644 index 0000000000..e231ac6a37 --- /dev/null +++ b/mkdocs/blog/posts/changelog-07-25.md @@ -0,0 +1,204 @@ +--- +title: "Rolling deployment, Secrets, Files, Tenstorrent, and more" +date: 2025-07-10 +description: "TBA" +slug: changelog-07-25 +image: https://dstack.ai/static-assets/static-assets/images/changelog-07-25.png +categories: + - Changelog +--- + +# Rolling deployment, Secrets, Files, Tenstorrent, and more + +Thanks to feedback from the community, `dstack` continues to evolve. Here’s a look at what’s new. + +#### Rolling deployments + +Previously, updating running services could cause downtime. The latest release fixes this with [rolling deployments](../../docs/concepts/services.md#rolling-deployment). Replicas are now updated one by one, allowing uninterrupted traffic during redeployments. + +
    + +```shell +$ dstack apply -f .dstack.yml + +Active run my-service already exists. Detected changes that can be updated in-place: +- Repo state (branch, commit, or other) +- File archives +- Configuration properties: + - env + - files + +Update the run? [y/n]: y +⠋ Launching my-service... + + NAME BACKEND PRICE STATUS SUBMITTED + my-service deployment=1 running 11 mins ago + replica=0 deployment=0 aws (us-west-2) $0.0026 terminating 11 mins ago + replica=1 deployment=1 aws (us-west-2) $0.0026 running 1 min ago +``` + +
    + + + +#### Secrets + +Secrets let you centrally manage sensitive data like API keys and credentials. They’re scoped to a project, managed by project admins, and can be [securely referenced](../../docs/concepts/secrets.md) in run configurations. + +
    + +```yaml hl_lines="7" +type: task +name: train + +image: nvcr.io/nvidia/pytorch:25.05-py3 +registry_auth: + username: $oauthtoken + password: ${{ secrets.ngc_api_key }} + +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --nnodes=$DSTACK_NODES_NUM \ + multinode.py 50 10 + +resources: + gpu: H100:1..2 + shm_size: 24GB +``` + +
    + +#### Files + +By default, `dstack` mounts the repo directory (where you ran `dstack init`) to all runs. + +If the directory is large or you need files outside of it, use the new [files](../../docs/concepts/dev-environments.md#files) property to map specific local paths into the container. + +
    + +```yaml +type: task +name: trl-sft + +files: + - .:examples # Maps the directory containing `.dstack.yml` to `/workflow/examples` + - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +python: 3.12 + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +
    + +#### Tenstorrent + +`dstack` remains committed to supporting multiple GPU vendors—including NVIDIA, AMD, TPUs, and more recently, [Tenstorrent](https://tenstorrent.com/). The latest release improves Tenstorrent support by handling hosts with multiple N300 cards and adds Docker-in-Docker support. + + + +Huge thanks to the Tenstorrent community for testing these improvements! + +#### Docker in Docker + +Using Docker inside `dstack` run configurations is now even simpler. Just set `docker` to `true` to [enable the use of Docker CLI](../../docs/concepts/tasks.md#docker-in-docker) in your runs—allowing you to build images, run containers, use Docker Compose, and more. + +
    + +```yaml +type: task +name: docker-nvidia-smi + +docker: true + +commands: + - | + docker run --gpus all \ + nvidia/cuda:12.3.0-base-ubuntu22.04 \ + nvidia-smi + +resources: + gpu: H100:1 +``` + +
    + +#### AWS EFA + +EFA is a network interface for EC2 that enables low-latency, high-bandwidth communication between nodes—crucial for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when using supported instance types in fleets. Check out our [example](../../docs/examples/clusters/aws.md) + +#### Default Docker images + +If no `image` is specified, `dstack` uses a base Docker image that now comes pre-configured with `uv`, `python`, `pip`, essential CUDA drivers, InfiniBand, and NCCL tests (located at `/opt/nccl-tests/build`). + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 + +startup_order: workers-first +stop_criteria: master-done + +env: + - NCCL_DEBUG=INFO +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + +resources: + gpu: nvidia:1..8 + shm_size: 16GB +``` + +
    + +These images are optimized for common use cases and kept lightweight—ideal for everyday development, training, and inference. + +#### Server performance + +Server-side performance has been improved. With optimized handling and background processing, each server replica can now handle more runs. + +#### Google SSO + +Alongside the open-source version, `dstack` also offers [dstack Enterprise](https://github.com/dstackai/dstack-enterprise) — which adds dedicated support and extra integrations like Single Sign-On (SSO). The latest release introduces support for configuring your company’s Google account for authentication. + + + +If you’d like to learn more about `dstack` Enterprise, [let us know](https://calendly.com/dstackai/discovery-call). + +That’s all for now. + +!!! info "What's next?" + Give dstack a try, and share your feedback—whether it’s [GitHub](https://github.com/dstackai/dstack) issues, PRs, or questions on [Discord](https://discord.gg/u8SmfwPpMd). We’re eager to hear from you! diff --git a/mkdocs/blog/posts/cursor.md b/mkdocs/blog/posts/cursor.md new file mode 100644 index 0000000000..4e8e01fb49 --- /dev/null +++ b/mkdocs/blog/posts/cursor.md @@ -0,0 +1,86 @@ +--- +title: "Accessing dev environments with Cursor" +date: 2025-03-31 +description: "TBA" +slug: cursor +image: https://dstack.ai/static-assets/static-assets/images/dstack-cursor-v2.png +categories: + - Changelog +--- + +# Accessing dev environments with Cursor + +Dev environments enable seamless provisioning of remote instances with the necessary GPU resources, +automatic repository fetching, and streamlined access via SSH or a preferred desktop IDE. + +Previously, support was limited to VS Code. However, as developers rely on a variety of desktop IDEs, +we’ve expanded compatibility. 
With this update, dev environments now offer effortless access for users of +[Cursor](https://www.cursor.com/). + + + + + +To access a dev environment via Cursor, set the `ide` property in your configuration to `cursor`. + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +python: "3.11" +# Uncomment to use a custom Docker image +#image: dstackai/base:py3.13-0.7-cuda-12.1 + +ide: cursor + +# Use either spot or on-demand instances +#spot_policy: auto + +resources: + gpu: 24GB +``` + +
    + +Once you’ve configured the environment, invoke the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. +When the dev environment is ready, dstack will provide a URL that you can click to open the environment in your desktop +Cursor IDE. + +
    + +```shell +$ dstack apply -f examples/.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 runpod CA-MTL-1 9xCPU, 48GB, A5000:24GB yes $0.11 + 2 runpod EU-SE-1 9xCPU, 43GB, A5000:24GB yes $0.11 + 3 gcp us-west4 4xCPU, 16GB, L4:24GB yes $0.21 + +Submit the run vscode? [y/n]: y + +Launching `vscode`... +---> 100% + +To open in Cursor, use this link: + cursor://vscode-remote/ssh-remote+vscode/workflow +``` + +
    + +Clicking the provided URL will prompt your desktop Cursor IDE to automatically connect to the remote machine via the SSH +tunnel created by the `dstack apply` command, allowing you to securely work with your dev environment. + + + +Using Cursor over VS Code offers multiple benefits, particularly when it comes to integrated AI coding assistance and +enhanced developer experience. + +!!! info "What's next?" + 1. [Download](https://www.cursor.com/) and install Cursor + 2. Learn more about [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 2. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/digitalocean-and-amd-dev-cloud.md b/mkdocs/blog/posts/digitalocean-and-amd-dev-cloud.md new file mode 100644 index 0000000000..7893356f4c --- /dev/null +++ b/mkdocs/blog/posts/digitalocean-and-amd-dev-cloud.md @@ -0,0 +1,151 @@ +--- +title: Orchestrating GPUs on DigitalOcean and AMD Developer Cloud +date: 2025-09-04 +description: "TBA" +slug: digitalocean-and-amd-dev-cloud +image: https://dstack.ai/static-assets/static-assets/images/digitalocean-and-amd-dev-cloud.png +categories: + - Changelog +--- + +# Orchestrating GPUs on DigitalOcean and AMD Developer Cloud + +Orchestration automates provisioning, running jobs, and tearing them down. While Kubernetes and Slurm are powerful in their domains, they lack the lightweight, GPU-native focus modern teams need to move faster. + +`dstack` is built entirely around GPUs. 
Our latest update introduces native integration with [DigitalOcean](https://www.digitalocean.com/products/gradient/gpu-droplets) and +[AMD Developer Cloud](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html), enabling teams to provision cloud GPUs and run workloads more cost-efficiently. + + + + + +## About DigitalOcean + +DigitalOcean is one of the leading cloud platforms offering GPUs both as VMs and as bare-metal clusters equipped with NVIDIA and AMD GPUs. + +## About AMD Developer Cloud + +AMD Developer Cloud is a new cloud platform designed to make AMD GPUs easily accessible to developers, academics, open-source contributors, and AI innovators worldwide. + +## Why dstack + +`dstack` provides a high-level, AI-engineer-friendly interface where GPUs work out of the box—no K8S custom operators or low-level setup required. It’s use-case agnostic, equally suited for training, inference, benchmarking, and dev environments. + +With the new DigitalOcean and AMD Developer Cloud backends, you can now provision NVIDIA or AMD GPU VMs and run workloads with a single CLI command. + +## Getting started + +The best part about `dstack` is that it's very easy to get started. + +1. Create a project in DigitalOcean or AMD Developer Cloud +2. Get credits or approve a payment method +3. Create an API key + +Then, configure the backend in `~/.dstack/server/config.yml`: + +
    + +```yaml +projects: +- name: main + backends: + - type: amddevcloud + project_name: my-amd-project + creds: + type: api_key + api_key: ... +``` + +
    + +For DigitalOcean, set `type` to `digitalocean`. + +Install and start the `dstack` server: + +
    + +```shell +$ pip install "dstack[server]" +$ dstack server +``` + +
    + +For more details, see [Installation](../../docs/installation.md). + +Use the `dstack` CLI to +manage [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), +and [services](../../docs/concepts/services.md). + + + +The `digitalocean` and `amddevcloud` backends support NVIDIA and AMD GPU VMs, respectively, and allow you to run +[dev environments](../../docs/concepts/dev-environments.md) (interactive development), [tasks](../../docs/concepts/tasks.md) +(training, fine-tuning, or other batch jobs), and [services](../../docs/concepts/services.md) (inference). + +Here’s an example of a service configuration: + +
    + +```yaml +type: service +name: gpt-oss-120b + +model: openai/gpt-oss-120b + +env: + - HF_TOKEN + - MODEL=openai/gpt-oss-120b + # To enable AITER, set below to 1. Otherwise, set it to 0. + - VLLM_ROCM_USE_AITER=1 + # To enable AITER Triton unified attention + - VLLM_USE_AITER_UNIFIED_ATTENTION=1 + # below is required in order to enable AITER unified attention by disabling AITER MHA + - VLLM_ROCM_USE_AITER_MHA=0 +image: rocm/vllm-dev:open-mi300-08052025 +commands: + - | + vllm serve $MODEL \ + --tensor-parallel $DSTACK_GPUS_NUM \ + --no-enable-prefix-caching \ + --disable-log-requests \ + --compilation-config '{"full_cuda_graph": true}' +port: 8000 + +volumes: + # Cache downloaded models + - /root/.cache/huggingface:/root/.cache/huggingface + +resources: + gpu: MI300X:8 + shm_size: 32GB +``` + +
    + +As with any configuration, you can apply it via `dstack apply`. If needed, `dstack` will automatically provision new VMs and run the inference endpoint. + +
    + +```shell +$ dstack apply -f examples/models/gpt-oss/120b.dstack.yml + + # BACKEND RESOURCES PRICE + 1 amddevcloud (alt1) cpu=20 mem=240GB disk=720GB MI300X:192GB:8 $15.92 + + Submit the run? [y/n]: +``` + +
    + +> If you prefer to use bare-metal clusters with `dstack`, you can create an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets). +> This way, you’ll be able to run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks) efficiently across the cluster. + +!!! info "What's next?" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Learn more about [DigitalOcean](https://www.digitalocean.com/products/gradient/gpu-droplets) and + [AMD Developer Cloud](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html) + 3. Explore [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/docker-inside-containers.md b/mkdocs/blog/posts/docker-inside-containers.md new file mode 100644 index 0000000000..1a88f20b13 --- /dev/null +++ b/mkdocs/blog/posts/docker-inside-containers.md @@ -0,0 +1,109 @@ +--- +title: "Using Docker and Docker Compose inside GPU-enabled containers" +date: 2024-10-30 +description: "The latest release of dstack allows for the direct use of Docker and Docker Compose within run configurations." +image: https://dstack.ai/static-assets/static-assets/images/dstack-docker-inside-containers.png +slug: docker-inside-containers +--- + +# Using Docker and Docker Compose inside GPU-enabled containers + +To run containers with `dstack`, you can use your own Docker image (or the default one) without a need to interact +directly with Docker. However, some existing code may require direct use of Docker or Docker Compose. That's why, +in our latest release, we've added this option. + +
    + +```yaml +type: task +name: compose-task + +image: dstackai/dind +privileged: true + +commands: + - start-dockerd + - | + cat > compose.yaml <<'EOF' + services: + web: + image: python:3.11-slim + command: python -m http.server 9000 + ports: + - "9000:9000" + EOF + - docker compose up +ports: [9000] + +resources: + gpu: 16GB..24GB +``` + +
    + + + +## How it works + +To use Docker or Docker Compose with your `dstack` configuration, set `image` to `dstackai/dind`, `privileged` to +`true`, and add the `start-dockerd` command. After this command, you can use Docker or Docker Compose directly. + + +For dev environments, add `start-dockerd` as the first command +in the `init` property. + +??? info "Dev environment" +
    + + ```yaml + type: dev-environment + name: vscode-dind + + image: dstackai/dind + privileged: true + + ide: vscode + init: + - start-dockerd + + resources: + gpu: 16GB..24GB + ``` + +
    + +The `start-dockerd` script is part of the `dstackai/dind` image, a pre-built image by `dstack` that enables Docker to run +inside containers. + +With this setup, you don’t have to worry about configuration—both Docker and Docker Compose work out of the box and +support GPU usage. + +!!! info "Backends" + Note that the `privileged` option is only supported by VM-based backends. This does not include `runpod`, `vastai`, + and `kubernetes`. All other backends support it. + +## When to use it + +### docker compose + +One of the obvious use cases for this feature is when you need to use Docker Compose. +For example, the Hugging Face Chat UI requires a MongoDB database, so using Docker Compose to run it is +the easiest way: + + + +### docker build + +Another use case for this feature is when you need to build a custom Docker image using the `docker build` command. + +### docker run + +Last but not least, you can, of course, use the `docker run` command, for example, if your existing code requires it. + +## Feedback + +If you find something not working as intended, please be sure to report it to +our [bug tracker](https://github.com/dstackai/dstack/issues){:target="_blank"}. +Your feedback and feature requests are also very welcome on both +[Discord](https://discord.gg/u8SmfwPpMd) and the +[issue tracker](https://github.com/dstackai/dstack/issues). diff --git a/mkdocs/blog/posts/dstack-metrics.md b/mkdocs/blog/posts/dstack-metrics.md new file mode 100644 index 0000000000..d3bf4ffa67 --- /dev/null +++ b/mkdocs/blog/posts/dstack-metrics.md @@ -0,0 +1,69 @@ +--- +title: "Monitoring essential GPU metrics via CLI" +date: 2024-10-22 +description: "dstack introduces a new CLI command (and API) for monitoring container metrics, incl. GPU usage for NVIDIA, AMD, and other accelerators." 
+slug: dstack-metrics +image: https://dstack.ai/static-assets/static-assets/images/dstack-stats-v2.png +categories: + - Changelog +--- + +# Monitoring essential GPU metrics via CLI + +## How it works { style="display:none"} + +While it's possible to use third-party monitoring tools with `dstack`, it is often more convenient to debug your run and +track metrics out of the box. That's why, with the latest release, `dstack` introduced [`dstack stats`](../../docs/reference/cli/dstack/metrics.md), a new CLI (and API) +for monitoring container metrics, including GPU usage for `NVIDIA`, `AMD`, and other accelerators. + + + + + +> Note, the `dstack stats` command has been renamed to `dstack metrics`. The old name is still supported but deprecated. + +The command is similar to `kubectl top` (in terms of semantics) and `docker stats` (in terms of the CLI interface). The key +difference is that `dstack stats` includes GPU VRAM usage and GPU utilization percentage. + +> The feature works right away with `NVIDIA` and `AMD`, whether you're running a development environment, task, or service. +> `TPU` support is coming soon. + +Similar to `kubectl top`, if a run consists of multiple jobs (such as distributed training or an auto-scalable service), +`dstack stats` will display metrics per job. + +!!! info "HTTP API" + In addition to the `dstack stats` CLI commands, metrics can be obtained via the + [`/api/project/{project_name}/metrics/job/{run_name}`](../../docs/reference/http/metrics.md) HTTP endpoint. + +## Why monitor GPU usage + +Kubernetes and Docker don’t offer built-in support for GPU usage tracking. Since `dstack` is tailored for AI containers, we +consider native GPU monitoring essential. + +#### GPU usage + +Monitoring GPU memory usage in AI workloads helps prevent out-of-memory errors and provides a clearer picture of how +much memory is actually used or needed by the workload. 
+ +#### GPU utilization + +Monitoring GPU utilization is important for identifying under-utilization and ensuring that workloads are distributed +evenly across GPUs. + +## Roadmap + +Monitoring is a critical part of observability, and we have many more features on our roadmap: + +* Potentially adding more metrics, including disk usage, I/O, network, etc. +* Support for the TPU accelerator +* Displaying historical metrics within the control plane UI +* Tracking deployment metrics, including LLM-related metrics +* A simple way to export metrics to Prometheus + +## Feedback + +If you find something not working as intended, please be sure to report it to +our [bug tracker](https://github.com/dstackai/dstack/issues){:target="_blank"}. +Your feedback and feature requests are also very welcome on both +[Discord](https://discord.gg/u8SmfwPpMd) and the +[issue tracker](https://github.com/dstackai/dstack/issues). diff --git a/docs/blog/posts/dstack-sky-own-cloud-accounts.md b/mkdocs/blog/posts/dstack-sky-own-cloud-accounts.md similarity index 75% rename from docs/blog/posts/dstack-sky-own-cloud-accounts.md rename to mkdocs/blog/posts/dstack-sky-own-cloud-accounts.md index 7da979d32b..8fe8c9c4e4 100644 --- a/docs/blog/posts/dstack-sky-own-cloud-accounts.md +++ b/mkdocs/blog/posts/dstack-sky-own-cloud-accounts.md @@ -3,11 +3,13 @@ title: dstack Sky now supports your own cloud accounts date: 2024-06-11 description: "With today's release, dstack Sky supports both options: accessing the GPU marketplace and using your own cloud accounts." 
slug: dstack-sky-own-cloud-accounts +categories: + - Changelog --- -# dstack Sky now allows using your own cloud accounts +# dstack Sky now supports your own cloud accounts -[dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} +[dstack Sky](https://sky.dstack.ai) enables you to access GPUs from the global marketplace at the most competitive rates. However, sometimes you may want to use your own cloud accounts. With today's release, both options are now supported. @@ -23,18 +25,18 @@ To use your own cloud account, open the project settings and edit the correspond ![dstack-sky-banner.png](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-edit-backend-config.png){ width=650 } You can configure your cloud accounts for any of the supported providers, including AWS, GCP, Azure, TensorDock, Lambda, -CUDO, RunPod, and Vast.ai. +CUDO, Runpod, and Vast.ai. Additionally, you can disable certain backends if you do not plan to use them. Typically, if you prefer using your own cloud accounts, it's recommended that you use the -[open-source version :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/){:target="_blank"} of `dstack`. +[open-source version](https://github.com/dstackai/dstack/) of `dstack`. However, if you prefer not to host it yourself, now you can use `dstack Sky` with your own cloud accounts as well. > Seeking the cheapest on-demand and spot cloud GPUs? -> [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} has you covered! +> [dstack Sky](https://sky.dstack.ai) has you covered! Need help, have a question, or just want to stay updated? 
-[Join Discord](https://discord.gg/u8SmfwPpMd){ .md-button .md-button-secondary .discord .small .external } \ No newline at end of file +[Join Discord](https://discord.gg/u8SmfwPpMd){ .md-button .md-button-secondary .discord .small .external } diff --git a/docs/blog/posts/dstack-sky.md b/mkdocs/blog/posts/dstack-sky.md similarity index 87% rename from docs/blog/posts/dstack-sky.md rename to mkdocs/blog/posts/dstack-sky.md index 48466a291f..877befb044 100644 --- a/docs/blog/posts/dstack-sky.md +++ b/mkdocs/blog/posts/dstack-sky.md @@ -2,6 +2,8 @@ date: 2024-03-11 description: A managed service that enables you to get GPUs at competitive rates from a wide pool of providers. slug: dstack-sky +categories: + - Changelog --- # Introducing dstack Sky @@ -40,8 +42,8 @@ set up with `dstack Sky`.
    ```shell -$ dstack config --url https://sky.dstack.ai \ - --project my-awesome-project \ +$ dstack project add --name my-awesome-project \ + --url https://sky.dstack.ai \ --token ca1ee60b-7b3f-8943-9a25-6974c50efa75 ``` @@ -67,7 +69,7 @@ Continue? [y/n]:
    !!! info "Backends" - `dstack Sky` supports the same [backends](../../docs/installation/index.md) as the open-source version, except that you + `dstack Sky` supports the same [backends](../../docs/installation.md) as the open-source version, except that you don't need to set them up. By default, it uses all supported backends. You can use both on-demand and spot instances without needing to manage quotas, as they are automatically handled @@ -75,9 +77,9 @@ for you. With `dstack Sky` you can use all of `dstack`'s features, incl. [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and -[pools](../../docs/concepts/pools.md). +[fleets](../../docs/concepts/fleets.md). -To use services, the open-source version requires setting up a gateway with your own domain. +To publish services, the open-source version requires setting up a gateway with your own domain. `dstack Sky` comes with a pre-configured gateway.
    @@ -115,22 +117,18 @@ resources: gpu: 48GB..80GB # Enable OpenAI compatible endpoint -model: - type: chat - name: mixtral - format: openai +model: mixtral ```
    -If it has a `model` mapping, the model will be accessible -at `https://gateway..sky.dstack.ai` via the OpenAI compatible interface. +The service endpoint will be accessible at `https://..sky.dstack.ai` via the OpenAI compatible interface. ```python from openai import OpenAI client = OpenAI( - base_url="https://gateway..sky.dstack.ai", + base_url="https://..sky.dstack.ai/v1", api_key="" ) @@ -154,4 +152,4 @@ or even use them side by side. `dstack Sky` is live on [Product Hunt](https://www.producthunt.com/posts/dstack-sky). Support it by giving it your vote! -[Join Discord](https://discord.gg/u8SmfwPpMd){ .md-button .md-button-secondary .discord .small .external } \ No newline at end of file +[Join Discord](https://discord.gg/u8SmfwPpMd){ .md-button .md-button-secondary .discord .small .external } diff --git a/mkdocs/blog/posts/ea-gtc25.md b/mkdocs/blog/posts/ea-gtc25.md new file mode 100644 index 0000000000..499c5402cb --- /dev/null +++ b/mkdocs/blog/posts/ea-gtc25.md @@ -0,0 +1,88 @@ +--- +title: "Case study: how EA uses dstack to fast-track AI development" +date: 2025-05-22 +description: "TBA" +slug: ea-gtc25 +image: https://dstack.ai/static-assets/static-assets/images/dstack-ea-slide-2-background-min.png +categories: + - Case studies +links: + - NVIDIA GTC 2025: https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/ +--- + +# How EA uses dstack to fast-track AI development + +At NVIDIA GTC 2025, Electronic Arts [shared](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/) how they’re scaling AI development and managing infrastructure across teams. They highlighted using tools like `dstack` to provision GPUs quickly, flexibly, and cost-efficiently. This case study summarizes key insights from their talk. 
+ + + +EA has over 100 AI projects running, and the number keeps growing. There are many teams with AI needs—game dev, ML engineers, AI researchers, and platform teams—supported by a central tech team. Some need full MLOps support; others have in-house expertise but need flexible tooling and infrastructure. + + + +The central tech team ensures all teams have what they require, including tools, infrastructure, and expertise. + + + +As EA’s AI efforts grew, they faced major challenges: + +* **Tool fragmentation**: Teams used different tools and workflows, leading to duplicated effort and poor collaboration. +* **High GPU costs**: Spinning up GPUs could take days or weeks. To avoid delays, teams often left machines running idle, increasing costs. +* **Heavy engineering burden**: ML engineers spent time managing infrastructure—setting up clusters, configuring environments, and deploying models—instead of building AI. + +The typical AI workflow at EA includes: + +1. Development and training +2. Model storage and distribution +3. Serving and scaling + +Each stage comes with scaling challenges, from GPU compute provisioning efficiency to fragmented tooling and complex project setups. + + + +EA's centralized approach uses these core ML tools: + +* `dstack` – for provisioning compute for AI workloads at scale, covering everything related to ML development and training +* ML Artifactory – for managing artifacts at scale +* AXS (Kubernetes+) – for scalable inference and production serving + +EA uses `dstack` to streamline GPU provisioning and AI workflow orchestration. It's open-source, cloud-agnostic, automated, and integrates seamlessly with teams' existing dev workflows. + + + +> *Because our teams are fragmented, we want them to be able to run on any environment of their choosing... It has to work with all of these. That means a centralized, unified interface to talk to all of them.* +> +> *— Wah Loon Keng, Sr. 
AI Engineer, Electronic Arts* + +EA teams use `dstack` for three types of ML workloads: + +* [Dev environments](../../docs/concepts/dev-environments.md): spinning up GPU boxes pre-configured with a Git repo and ready to use via a desktop IDE such as VS Code, Cursor, etc. +* [Tasks](../../docs/concepts/tasks.md): seamless single-node or distributed training using open-source PyTorch libraries +* [Services](../../docs/concepts/services.md): running model endpoints and Streamlit-style apps for quick internal demos and prototyping + +Introducing `dstack` had a significant impact on EA’s ML teams. Before, getting access to GPU infrastructure could take days or even weeks. With dstack, teams can now spin up what they need in just minutes. This shift accelerated development by removing delays and freeing engineers to focus on building models. + +> *With dstack, what used to take weeks, provisioning GPUs, setting up environments, now takes minutes. It changed how fast teams at EA can move.* +> +> *— Wah Loon Keng, Sr. AI Engineer, Electronic Arts* + +Costs dropped by nearly a factor of three, largely due to dstack’s ability to automatically start and stop resources using spot and on-demand instances. + + + +Workflows became standardized, reproducible, and easier to trace—thanks to the use of version-controlled YAML configurations. Teams across different departments and cloud providers now follow the same setup and processes. + +> *`dstack` provisions compute on demand and automatically shuts it down when no longer needed. That alone saves you over three times in cost.* +> +> *— Wah Loon Keng, Sr. AI Engineer, Electronic Arts* + + + +By adopting tools that are cloud-agnostic and developer-friendly, EA has reduced friction—from provisioning GPUs to deploying models—and enabled teams to spend more time on actual ML work. + +*Huge thanks to Kris and Keng from EA’s central tech team for sharing these insights. 
For more details, including the recording and slides, check out the full talk on the [NVIDIA GTC website](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/).* + +!!! info "What's next?" + 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 2. Follow [Quickstart](../../docs/quickstart.md) + 3. Browse [Examples](../../docs/examples.md) diff --git a/mkdocs/blog/posts/gh200-on-lambda.md b/mkdocs/blog/posts/gh200-on-lambda.md new file mode 100644 index 0000000000..1a87dc90e1 --- /dev/null +++ b/mkdocs/blog/posts/gh200-on-lambda.md @@ -0,0 +1,87 @@ +--- +title: "Supporting ARM and NVIDIA GH200 on Lambda" +date: 2025-05-12 +description: "TBA" +slug: gh200-on-lambda +image: https://dstack.ai/static-assets/static-assets/images/dstack-arm--gh200-lambda-min.png +categories: + - Changelog +--- + +# Supporting ARM and NVIDIA GH200 on Lambda + +The latest update to `dstack` introduces support for NVIDIA GH200 instances on [Lambda](../../docs/concepts/backends.md#lambda) +and enables ARM-powered hosts, including GH200 and GB200, with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). + + + + + +## ARM support + +Previously, `dstack` only supported x86 architecture with both cloud providers as well as on-prem clusters. With the latest update, it’s now possible to use both cloud and SSH fleets with ARM-based CPUs too. To request ARM CPUs in a run or fleet configuration, specify the arm architecture in the `resources`.`cpu` property: + +```yaml +resources: + cpu: arm:4.. # 4 or more ARM cores +``` + +If the hosts in an SSH fleet have ARM CPUs, `dstack` will automatically detect both ARM-based CPUs as well as ARM-based GPU Superchips such as GH200 and enable their use. + +To see available offers with ARM CPUs, pass `--cpu arm` to the `dstack offer` command. 
+ +## About GH200 + +NVIDIA Grace is the first NVIDIA data center CPU, built on top of ARM specifically for AI workloads. The NVIDIA GH200 Superchip brings together a 72-core NVIDIA Grace CPU with an NVIDIA H100 GPU, connected with a high-bandwidth, memory-coherent NVIDIA NVLink-C2C interconnect. + +| CPU | GPU | CPU Memory | GPU Memory | NVLink-C2C | +| ------------- | ---- | ------------------------ | ------------------ | ---------- | +| Grace 72-core | H100 | 480GB LPDDR5X at 512GB/s | 96GB HBM3 at 4TB/s | 900GB/s | + +The GH200 Superchip’s NVLink-C2C interconnect (900 GB/s bidirectional, i.e. 450 GB/s in each direction) enables KV cache offloading to CPU memory. While prefill can leverage CPU memory for optimizations like prefix caching, generation benefits from the GH200’s higher memory bandwidth. + +## GH200 on Lambda + +[Lambda](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..) provides secure, user-friendly, reliable, and affordable cloud GPUs. Since the end of last year, Lambda has been offering on-demand GH200 instances through their public cloud. Furthermore, they offer these instances at the promotional price of $1.49 per hour until June 30th, 2025. + +With the latest `dstack` update, it’s now possible to use these instances with your Lambda account whether you’re running a dev environment, task, or service: + +
    + +```yaml +type: dev-environment +name: my-env +image: nvidia/cuda:12.8.1-base-ubuntu20.04 +ide: vscode + +resources: + gpu: GH200:1 +``` + +
    + +> Note, you have to use an ARM-based Docker image. + +To determine whether Lambda has GH200 on-demand instances available, run `dstack apply`: + +
    + +```shell +$ dstack apply -f .dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 lambda (us-east-3) cpu=arm:64 mem=464GB GH200:96GB:1 gpu_1x_gh200 $1.49 +``` + +
    + +!!! info "Retry policy" + Note, if GH200s are not available at the moment, you can specify the [retry policy](../../docs/concepts/dev-environments.md#retry-policy) in your run configuration so that `dstack` can run the configuration once the GPU becomes available. + +> If you have GH200 or GB200-powered hosts already provisioned via Lambda, another cloud provider, or on-prem, you can now use them with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). + +!!! info "What's next?" + 1. Sign up with [Lambda](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..) + 2. Set up the [Lambda](../../docs/concepts/backends.md#lambda) backend + 3. Follow [Quickstart](../../docs/quickstart.md) + 4. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/mkdocs/blog/posts/gpu-blocks-and-proxy-jump.md b/mkdocs/blog/posts/gpu-blocks-and-proxy-jump.md new file mode 100644 index 0000000000..61f28ea811 --- /dev/null +++ b/mkdocs/blog/posts/gpu-blocks-and-proxy-jump.md @@ -0,0 +1,181 @@ +--- +title: Introducing GPU blocks and proxy jump for SSH fleets +date: 2025-02-18 +description: "TBA" +slug: gpu-blocks-and-proxy-jump +image: https://dstack.ai/static-assets/static-assets/images/data-centers-and-private-clouds.png +categories: + - Changelog +--- + +# Introducing GPU blocks and proxy jump for SSH fleets + +Recent breakthroughs in open-source AI have made AI infrastructure accessible beyond public clouds, driving demand for +running AI workloads in on-premises data centers and private clouds. +This shift offers organizations both high-performant clusters and flexibility and control. 
+ +However, Kubernetes, while a popular choice for traditional deployments, is often too complex and low-level to address +the needs of AI teams. + +Originally, `dstack` was focused on public clouds. With the new release, `dstack` +extends support to data centers and private clouds, offering a simpler, AI-native solution that replaces Kubernetes and +Slurm. + + + + + +Private clouds offer the scalability and performance needed for large GPU clusters, while on-premises data centers +provide stronger security and privacy controls. + +In both cases, the focus isn’t just on seamless orchestration but also on maximizing infrastructure efficiency. This has +long been a strength of Kubernetes, which enables concurrent workload execution across provisioned nodes to minimize +resource waste. + +### GPU blocks + +The newest version of `dstack` introduces a feature called [GPU blocks](../../docs/concepts/fleets.md#ssh-blocks), bringing this level of efficiency to `dstack`. It +enables optimal hardware utilization by allowing concurrent workloads to run on the same hosts, using slices of the +available resources on each host. + +> For example, imagine you’ve reserved a cluster with multiple bare-metal nodes, each equipped with 8x MI300X GPUs from +[Hot Aisle](https://hotaisle.xyz/). + +With `dstack`, you can define your fleet configuration like this: + +
    + +```yaml +type: fleet +name: my-hotaisle-fleet + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/hotaisle_id_rsa + hosts: + - hostname: ssh.hotaisle.cloud + port: 22013 + blocks: auto + - hostname: ssh.hotaisle.cloud + port: 22014 + blocks: auto + +placement: cluster +``` + +
    + +When you run `dstack apply`, each host appears as an available fleet instance, showing `0/8` next to `busy`. By setting `blocks` +to `auto`, you automatically slice each host into 8 GPU blocks. + +
    + +```shell +$ dstack apply -f my-hotaisle-fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE RESOURCES STATUS CREATED + my-hotaisle-fleet 0 8xMI300X (192GB) 0/8 busy 3 mins ago + 1 8xMI300X (192GB) 0/8 busy 3 mins ago +``` + +
    + +For instance, you can run two workloads, each using 4 GPUs, and `dstack` will execute them concurrently on a single instance. + +As the fleet owner, you can set the `blocks` parameter to any number. If you set it to `2`, `dstack` will slice each +host into 2 blocks, each with 4 GPUs. This flexibility allows you to define the minimum block size, ensuring the most +efficient utilization of your resources. + +!!! info "Fractional GPU" + While we plan to eventually support fractions of a single GPU too, this is not the primary use case, as most modern AI + teams require full GPUs for their workloads. + +Regardless of whether you're using dstack with a data center or a private cloud, once a fleet is created, +you’re free to run [dev environments](../../docs/concepts/dev-environments.md), +[tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md) while maximizing the +cost-efficiency of GPU utilization by concurrent runs. + +## Proxy jump + +Private clouds typically provide access to GPU clusters via SSH through a login node. In these setups, only the login +node is internet-accessible, while cluster nodes can only be reached via SSH from the login node. This prevents creating +an SSH fleet by directly listing the cluster nodes' hostnames. + +The latest `dstack` release introduces the [`proxy_jump`](../../docs/concepts/fleets.md#proxy-jump) property in SSH fleet configurations, enabling creating fleets +through a login node. + +> For example, imagine you’ve reserved a 1-Click Cluster from +> [Lambda](https://lambdalabs.com/) with multiple nodes, each equipped with 8x H100 GPUs. + +With `dstack`, you can define your fleet configuration like this: + +
    + +```yaml +type: fleet +name: my-lambda-fleet + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/lambda_node_id_rsa + hosts: + - us-east-2-1cc-node-1 + - us-east-2-1cc-node-2 + - us-east-2-1cc-node-3 + - us-east-2-1cc-node-4 + proxy_jump: + hostname: 12.34.567.890 + user: ubuntu + identity_file: ~/.ssh/lambda_head_id_rsa + +placement: cluster +``` + +
    + +When you run `dstack apply`, `dstack` creates an SSH fleet and connects to the configured hosts through the login node +specified via `proxy_jump`. Fleet instances appear as normal instances, enabling you to run +[dev environments](../../docs/concepts/dev-environments.md), +[tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md) +just as you would without `proxy_jump`. + +
    + +```shell +$ dstack apply -f my-lambda-fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE RESOURCES STATUS CREATED + my-lambda-fleet 0 8xH100 (80GB) idle 3 mins ago + 1 8xH100 (80GB) idle 3 mins ago + 2 8xH100 (80GB) idle 3 mins ago + 3 8xH100 (80GB) idle 3 mins ago +``` + +
    + +The `dstack` CLI automatically handles SSH tunneling and port forwarding when running workloads. + +## What's next + +To sum it up, the latest release enables `dstack` to be used efficiently not only with public clouds but also with private +clouds and data centers. It natively supports NVIDIA, AMD, Intel Gaudi, and soon other upcoming chips. + +What’s also important is that `dstack` comes with a control plane that not only simplifies orchestration but also provides +a console for monitoring and managing workloads across projects (also known as tenants). + +As a container orchestrator, `dstack` remains a streamlined alternative to Kubernetes and Slurm for AI teams, focusing on +an AI-native experience, simplicity, and vendor-agnostic orchestration for both cloud and on-prem. + +!!! info "Roadmap" + We plan to further enhance `dstack`'s support for both cloud and on-premises setups. For more details on our roadmap, + refer to our [GitHub](https://github.com/dstackai/dstack/issues/2184). + +> Have questions? You're welcome to join +> our [Discord](https://discord.gg/u8SmfwPpMd) or talk +> directly to [our team](https://calendly.com/dstackai/discovery-call). diff --git a/mkdocs/blog/posts/gpu-health-checks.md b/mkdocs/blog/posts/gpu-health-checks.md new file mode 100644 index 0000000000..84746ed90f --- /dev/null +++ b/mkdocs/blog/posts/gpu-health-checks.md @@ -0,0 +1,73 @@ +--- +title: Introducing passive GPU health checks +date: 2025-08-12 +description: "TBA" +slug: gpu-helth-checks +image: https://dstack.ai/static-assets/static-assets/images/gpu-health-checks.png +categories: + - Changelog +--- + +# Introducing passive GPU health checks + +In large-scale training, a single bad GPU can derail progress. Sometimes the failure is obvious — jobs crash outright. 
Other times it’s subtle: correctable memory errors, intermittent instability, or thermal throttling that quietly drags down throughput. In big experiments, these issues can go unnoticed for hours or days, wasting compute and delaying results. + +`dstack` already supports GPU telemetry monitoring through NVIDIA DCGM [metrics](../../docs/concepts/metrics.md), covering utilization, memory, and temperature. This release extends that capability with passive hardware health checks powered by DCGM [background health checks](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#background-health-checks). With these, `dstack` continuously evaluates fleet GPUs for hardware reliability and displays their status before scheduling workloads. + + + + + +## Why this matters + +Multi-GPU and multi-node workloads are only as strong as their weakest component. GPU cloud providers increasingly rely on automated health checks to prevent degraded hardware from reaching customers. Problems can stem from ECC memory errors, faulty PCIe links, overheating, or other hardware-level issues. Some are fatal, others allow the GPU to run but at reduced performance or with higher failure risk. + +Passive checks like these run in the background. They continuously monitor hardware telemetry and system events, evaluating them against NVIDIA’s known failure patterns — all without pausing workloads. + +## How it works in dstack + +`dstack` automatically queries DCGM for each fleet instance and appends a health status: + +* An `idle` status means no issues have been detected. +* An `idle (warning)` status indicates a non-fatal issue, such as a correctable ECC error. The instance remains usable but should be monitored. +* An `idle (failure)` status points to a fatal issue, and the instance is automatically excluded from scheduling. + +
    + +```shell +$ dstack fleet + + FLEET INSTANCE BACKEND RESOURCES STATUS PRICE CREATED + my-fleet 0 aws (us-east-1) T4:16GB:1 idle $0.526 11 mins ago + 1 aws (us-east-1) T4:16GB:1 idle (warning) $0.526 11 mins ago + 2 aws (us-east-1) T4:16GB:1 idle (failure) $0.526 11 mins ago +``` + +
    + +A healthy instance is ready for workloads. A warning means you should monitor it closely. A failure removes it from scheduling entirely. + +## Passive vs active checks + +This release focuses on passive checks using DCGM background health checks. These run continuously and do not interrupt workloads. + +For active checks today, you can run [NCCL/RCCL tests](../../docs/examples/clusters/nccl-rccl-tests.md) as a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to verify GPU-to-GPU communication and bandwidth across a fleet. Active tests like these can reveal network or interconnect issues that passive monitoring might miss. More built-in support for active diagnostics is planned. + +## Supported backends + +Passive GPU health checks work on AWS (except with custom `os_images`), Azure (except A10 GPUs), GCP, OCI, and [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets) where DCGM is installed and configured for background checks. + +> Fleets created before version 0.19.22 need to be recreated to enable this feature. + +## Looking ahead + +This update is about visibility: giving engineers real-time insight into GPU health before jobs run. Next comes automation — policies to skip GPUs with warnings, and self-healing workflows that replace unhealthy instances without manual steps. + +If you have experience with GPU reliability or ideas for automated recovery, join the conversation on +[Discord](https://discord.gg/u8SmfwPpMd). + +!!! info "What's next?" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Explore the [fleets](../../docs/concepts/fleets.md#cluster-placement) guide + 3. Learn more about [metrics](../../docs/concepts/metrics.md) + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/graphsignal.md b/mkdocs/blog/posts/graphsignal.md new file mode 100644 index 0000000000..8ece703454 --- /dev/null +++ b/mkdocs/blog/posts/graphsignal.md @@ -0,0 +1,127 @@ +--- +title: "How Graphsignal uses dstack for inference benchmarking" +date: 2026-04-08 +description: "How Graphsignal uses dstack as a unified layer for GPU development, inference deployment, and benchmarking across on-prem systems and GPU clouds." +slug: graphsignal +image: https://dstack.ai/static-assets/static-assets/images/dstack-graphsignal.png +categories: + - Case studies +links: + - Graphsignal's autodebug blog: https://graphsignal.com/blog/autodebug-telemetry-driven-inference-optimization-loop/ +--- + +# How Graphsignal uses dstack for inference benchmarking + +In a recent engineering [blog post](https://graphsignal.com/blog/autodebug-telemetry-driven-inference-optimization-loop/), Graphsignal shared `autodebug`, an autonomous loop that deploys an inference service, benchmarks it, updates the deployment config, and redeploys it again. This case study looks at the team workflow behind that setup, and how `dstack` gives Graphsignal a common layer for GPU development, inference deployment, and benchmarking. + + + + + +[Graphsignal](https://graphsignal.com/) builds inference observability and AI debugging tooling for teams running production inference across models, engines, and GPUs. That puts the team close to the systems they measure and tune: inference servers, GPU infrastructure, deployment workflows, and benchmark loops. 
+ +To benchmark and optimize inference efficiently, the Graphsignal team combines: + +- on-prem GPU systems, including [NVIDIA DGX Spark](https://www.nvidia.com/en-us/products/workstations/dgx-spark/) devices managed through `dstack` +- cloud GPU capacity, including [Verda](https://verda.com/) as a supported `dstack` backend +- `dstack` as the common orchestration layer for GPU development and inference deployment + +For Graphsignal, the same operational model applies across on-prem systems and GPU clouds. The team can develop on GPU-backed environments, deploy inference services, and rerun benchmarks without switching orchestration models between environments. + +Many teams running inference need a workflow that: + +- works across different GPU environments +- supports both development and production +- does not require building and maintaining custom orchestration for every provider + +`dstack` gives the Graphsignal team a declarative way to provision GPU resources, deploy inference services, and iterate on deployment configs across environments without introducing a separate control plane for each provider. + +> *`dstack` gives us a unified layer for GPU development and inference across on-prem systems and GPU clouds. 
It is fine-grained enough for serious inference engineering, but simple enough that we do not have to build and maintain custom orchestration around every GPU environment we use.* +> +> *— **Dmitry Melikyan**, Founder at Graphsignal* + +The Graphsignal team primarily uses these `dstack` components: + +- [Dev environments](../../docs/concepts/dev-environments.md) — for GPU-backed development and experimentation +- [Services](../../docs/concepts/services.md) — for deploying inference endpoints and running benchmarkable workloads +- [Fleets](../../docs/concepts/fleets.md) — for spanning on-prem systems and cloud backends through one interface +- the `dstack` CLI — with `dstack apply` used directly in the deployment and benchmarking loop + +In practice, this gives the Graphsignal team a way to: + +- move from GPU development to production inference without changing orchestration layers +- turn a serving change into a fresh, versioned deployment +- run benchmarks on real hardware across on-prem and cloud environments +- keep the same workflow for development, deployment, and repeated optimization + +The examples below are representative `dstack` configurations that illustrate the workflow described above. They are included to show how the same control plane can span on-prem hosts and cloud backends, not as Graphsignal production configs. + +For on-prem systems such as DGX Spark devices, `dstack` can manage multiple hosts through a single SSH fleet definition. + +
    + +```yaml + +type: fleet +name: graphsignal-onprem + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - dgx-spark-1 + - dgx-spark-2 + - dgx-spark-3 +``` + +
    + +For cloud GPU, `dstack` supports Verda as a native backend. + +
    + +```yaml +projects: + - name: main + backends: + - type: verda + creds: + type: api_key + client_id: YOUR_CLIENT_ID + client_secret: YOUR_CLIENT_SECRET +``` + +
    + +For Graphsignal, `dstack` acts as a unified orchestration layer for GPU development and inference across on-prem systems and GPU clouds. It gives both developers and agents a fine-grained interface for editing configs, deploying services, and iterating on infrastructure without switching tools or rebuilding workflow around each environment. + +For agentic workflows, [`dstack` skills](https://skills.sh/dstackai/dstack/dstack) extend that same interface to tools such as Claude Code, Codex, and Cursor. + +
    + +```shell +$ npx skills add dstackai/dstack +``` + +
    + +Once installed, they let an agent work directly with `dstack` configs and CLI commands: create or edit a `*.dstack.yml`, apply the configuration, check run status, manage fleets, etc. + +Claude Code can use Graphsignal telemetry to decide what to change next, then use `dstack` to generate the updated service config and invoke the CLI on the team’s behalf. + + + +The point is not a single benchmark run, but a repeatable workflow in which deployment, measurement, and optimization stay inside the same system. + +> *Agentic engineering is changing not only how code gets written, but how compute gets orchestrated and how inference gets optimized. Once the deployment layer is programmable, agents can participate directly in benchmarking, redeployment, and performance tuning.* +> +> *— **Dmitry Melikyan**, Founder at Graphsignal* + +Instead of treating performance testing as a separate script, the team can run it as a loop: benchmark a live endpoint, inspect logs and telemetry for the same time window, identify bottlenecks, update the `dstack` service config, redeploy, and run the next iteration. + +*Huge thanks to Dmitry Melikyan and Bogdan Sulima at Graphsignal for feedback and collaboration. For more details, see Graphsignal’s engineering post on [autodebug](https://graphsignal.com/blog/autodebug-telemetry-driven-inference-optimization-loop/).* + +!!! info "What's next?" + 1. Follow the [`Installation`](../../docs/installation.md) and [`Quickstart`](../../docs/quickstart.md) guides + 2. Explore [`dev environments`](../../docs/concepts/dev-environments.md), [`tasks`](../../docs/concepts/tasks.md), [`services`](../../docs/concepts/services.md), and [`fleets`](../../docs/concepts/fleets.md) + 3. 
Use Graphsignal’s [`dstack` integration guide](https://graphsignal.com/docs/integrations/dstack/) to add profiling, tracing, and monitoring to a `dstack` inference service diff --git a/mkdocs/blog/posts/h100-mi300x-inference-benchmark.md b/mkdocs/blog/posts/h100-mi300x-inference-benchmark.md new file mode 100644 index 0000000000..2209393d11 --- /dev/null +++ b/mkdocs/blog/posts/h100-mi300x-inference-benchmark.md @@ -0,0 +1,206 @@ +--- +title: "Exploring inference memory saturation effect: H100 vs MI300x" +date: 2024-12-05 +description: "This benchmark explores how GPU memory saturation affects LLM inference performance and cost, comparing NVIDIA H100 and AMD MI300x." +slug: h100-mi300x-inference-benchmark +image: https://dstack.ai/static-assets/static-assets/images/h100-mi300x-inference-benchmark-v2.png +categories: + - Benchmarks +--- + +# Exploring inference memory saturation effect: H100 vs MI300x + +GPU memory plays a critical role in LLM inference, affecting both performance and cost. This benchmark evaluates memory +saturation’s impact on inference using NVIDIA's H100 and AMD's MI300x with Llama 3.1 405B FP8. + +We examine the effect of limited parallel computational resources on throughput and Time to First Token (TTFT). +Additionally, we compare deployment strategies: running two Llama 3.1 405B FP8 replicas on 4xMI300x versus a single +replica on 4xMI300x and 8xMI300x + +Finally, we extrapolate performance projections for upcoming GPUs like NVIDIA H200, B200, and AMD MI325x, MI350x. + + + +This benchmark is made possible through the generous support of our friends at +[Hot Aisle](https://hotaisle.xyz/) and +[Lambda](https://lambdalabs.com/), +who provided high-end hardware. + + + +## Benchmark setup + +1. 
AMD 8xMI300x + * 2x Intel Xeon Platinum 8470, 52C/104T, 16GT/s, 105M Cache (350W) + * 8x AMD MI300x GPU OAM, 192GB, 750W + * 32x 64GB RDIMM, 4800MT/s +2. NVIDIA 8xH100 SXM5 + * 2× Intel Xeon Platinum 8480+, 56C/112T, 16GT/s, 105M Cache (350W) + * 8× NVIDIA H100 SXM5 GPU, 80GB, 700W + * 32x 64GB DDR5 + +### Benchmark modes + +1. **Online inference**: Benchmarked across QPS 16, 32, and 1000 using + the [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. Execution used + vLLM’s [benchmark\_serving](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py). +2. **Offline inference**: Benchmarked with varying input/output lengths across different batch sizes, using vLLM’s [benchmark\_throughput.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_throughput.py). + +| | Input prompt lengths | Batch size | +|-----------------|----------------------|-------------------------| +| **Short/Small** | 4 to 1024 | | +| **Short/Large** | 128 | 256 | +| **Large/Large** | 32784 | 64 (MI300x) / 16 (H100) | + +## Observations + +### Cost per token + + + +As prompt and batch sizes grow, the NVIDIA H100 reaches memory limits, causing a sharp drop in cost-effectiveness. In +contrast, the 1 FP8 8xMI300x configuration is the most cost-efficient for large prompts. + +For large prompts, two parallel replicas running on 4xMI300x lose their cost advantage compared to a single replica on +8xMI300x. The latter offers 51% more memory for the KV cache, improving throughput and reducing cost per token. + + + +While 4xMI300x is a cost-effective alternative to 8xH100 for smaller load profiles, it underperforms in online serving. +8xH100 SXM5 processes 74% more requests per second and reduces TTFT by at least 50% at all QPS levels. 
+ + + +### Throughput + + + +With large prompts and batch sizes, two replicas on 4xMI300x GPUs hit memory saturation when total tokens (prompt +length x batch size) exceed the available memory for the KV cache. This forces the inference engine to compute KV +tensors on-the-fly or offload them to CPU memory, degrading throughput. + +In [Lambda](https://lambdalabs.com/blog/partner-spotlight-evaluating-nvidia-h200-gpus-for-ai-inference-with-baseten)’s +benchmark, an 8xH200 setup processed 3.4 times more tokens per second than an 8xH100. Extrapolating to our +setup, an 8xH200 would process around 2,186 tokens per second (3.4 × 643), though still lower than 8xMI300x. + +| | AMD MI300x | NVIDIA H200 | +|---------------------------|------------|-------------| +| **GPU Memory** | 192 GB | 141 GB | +| **Memory Type** | HBM3 | HBM3e | +| **Peak Memory Bandwidth** | 5.3TB/s | 4.8TB/s | +| **TFLOPS (FP8)** | 2610 | 1979 | + +#### Replicas on 4xMI300x + + + +Running two replicas on 4xMI300x delivers better throughput for small to medium prompts than a single replica on +8xMI300x. + + + +This boost comes from distributing the Llama 3.1 405B model across four GPUs, enabling parallel execution. For +small prompts, a single replica underutilizes the GPUs. Running two replicas doubles the batch size, improving GPU +utilization and efficiency. + +### Time To First Token + + + +The 4xMI300x setup provides 768 GB of memory (4 GPUs × 192 GB each), compared to 640 GB with 8xH100 (8 GPUs × 80 GB +each). However, at 1000 QPS, TTFT for 4xMI300x is over twice as long as for 8xH100. + +This difference occurs during the prefill stage, where KV tensors for input prompts are computed. Since tensors are +processed in parallel, the 8xH100 configuration distributes the load more effectively, reducing computation time. + +Despite offering more memory, 4xMI300x lacks the parallelism of 8xH100, leading to longer TTFT. 
+ +### Time to Serve 1 Request + + + +Processing a single large prompt request with 8xMI300x takes around 11.25 seconds. This latency is mainly due to +computational demands during the prefill phase, where KV tensors are computed. + +Optimizations like [automatic prefix caching](https://docs.vllm.ai/en/latest/automatic_prefix_caching/apc.html) +could help reduce this time, but are outside the scope of this benchmark. + +## Benchmark notes + +### Benchmark setup + +The script used in this benchmark was designed for large prompts in offline inference. A different script tailored for +online inference would provide more accurate insights. + +### Batch size + +We compared throughput at batch size 16 for 8xH100 and batch size 64 for 8xMI300x. The 8xH100 setup begins to struggle +with batch size 16 due to memory saturation, resulting in slower generation times. + +### Model checkpoints + +For AMD MI300x, we used [`amd/Llama-3.1-405B-Instruct-FP8-KV`](https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV) +to achieve optimal performance, relying on AMD for quantization. + +### vLLM configuration + +To maximize inference results on AMD MI300x, we adjusted specific arguments: + +
    + +```shell +$ VLLM_RPC_TIMEOUT=30000 VLLM_USE_TRITON_FLASH_ATTN=0 vllm serve \ + meta-llama/Llama-3.1-405B-FP8 -tp 8 \ + --max-seq-len-to-capture 16384 \ + --served-model-name meta-llama/Llama-3.1-405B-FP8 \ + --enable-chunked-prefill=False \ + --num-scheduler-step 15 \ + --max-num-seqs 1024 +``` + +
    + +Our benchmark focused on testing inference with tensor parallelism. Integrating tensor and pipeline parallelism could +provide additional insights. + +## On B200, MI325x, and MI350x + +The MI325x offers 64GB more HBM and 0.7TB/s higher bandwidth than MI300x. However, because it has the same FP8 TFLOPS, it +doesn't provide significant compute gains, positioning it against NVIDIA's H200. + +The NVIDIA B200 outperforms MI300x and MI325x with more TFLOPS and higher peak memory bandwidth, resulting in lower TTFT +by reducing compute time for KV tensors and memory transfer times during the decode stage. We expect the B200 to +challenge MI325x, as long as memory saturation is avoided. + +Notably, future GPUs from AMD and NVIDIA are expected to support FP4 and FP6, improving throughput, latency, and +cost-efficiency. + +| | AMD MI300x | AMD MI325x | AMD MI350x | NVIDIA B200 | +|---------------------------|------------|------------|---------------|---------------| +| **GPU Memory** | 192 GB | 256 GB | 288GB | 192 GB | +| **Memory Type** | HBM3 | HBM3e | | HBM3e | +| **Peak Memory Bandwidth** | 5.3TB/s | 6TB/s | | 8TB/s | +| **TFLOPS (FP8)** | 2610 | 2610 | | 4500 | +| **Low precision** | FP8 | FP8 | FP4, FP6, FP8 | FP4, FP6, FP8 | + +## Source code + +All the source code and findings to help you replicate the results are available in +[our GitHub repo](https://github.com/dstackai/benchmarks/tree/main/comparison/h100sxm5_vs_mi300x). + +## Thanks to our friends + +### Hot Aisle + +[Hot Aisle](https://hotaisle.xyz/) sponsored this benchmark by providing access to 8x MI300x hardware. We’re deeply grateful for their support. + +If you're looking for top-tier bare metal compute with AMD GPUs, we highly recommend Hot Aisle. With `dstack`, accessing +your cluster via SSH is seamless and straightforward. 
+ +### Lambda + +[Lambda](https://lambdalabs.com/) sponsored this benchmark with credits for on-demand 8x H100 instances. +We’re truly thankful for their support. + +For top-tier cloud compute with NVIDIA GPUs, Lambda is an excellent choice. Once set up, you can easily provision +compute, manage clusters, and orchestrate your AI workloads using `dstack`. diff --git a/mkdocs/blog/posts/h200-mi300x-deepskeek-benchmark.md b/mkdocs/blog/posts/h200-mi300x-deepskeek-benchmark.md new file mode 100644 index 0000000000..b8587d43f6 --- /dev/null +++ b/mkdocs/blog/posts/h200-mi300x-deepskeek-benchmark.md @@ -0,0 +1,231 @@ +--- +title: "DeepSeek R1 inference performance: MI300X vs. H200" +date: 2025-03-18 +description: "TBA" +slug: h200-mi300x-deepskeek-benchmark +image: https://dstack.ai/static-assets/static-assets/images/h200-mi300x-deepskeek-benchmark-v2.png +categories: + - Benchmarks +--- + +# DeepSeek R1 inference performance: MI300X vs. H200 + +DeepSeek-R1, with its innovative architecture combining Multi-head Latent Attention (MLA) and DeepSeekMoE, presents +unique challenges for inference workloads. As a reasoning-focused model, it generates intermediate chain-of-thought +outputs, placing significant demands on memory capacity and bandwidth. + +In this benchmark, we evaluate the performance of three inference backends—SGLang, vLLM, and TensorRT-LLM—on two hardware +configurations: 8x NVIDIA H200 and 8x AMD MI300X. Our goal is to compare throughput, latency, and overall efficiency to +determine the optimal backend and hardware pairing for DeepSeek-R1's demanding requirements. + + + +This benchmark was made possible through the generous support of our partners at +[Vultr](https://www.vultr.com/) and +[Lambda](https://lambdalabs.com/), +who provided access to the necessary hardware. + + + +## Benchmark setup + +### Hardware configurations + +1. 
AMD 8xMI300x + * 2x Intel Xeon Platinum 8468, 48C/96T, 16GT/s, 105M Cache (350W) + * 8x AMD MI300x GPU, 192GB, 750W + * 32x 64GB DDR5, 4800MT/s +2. NVIDIA 8xH200 SXM5 + * 2x Intel Xeon Platinum 8570, 56C/112T, 20GT/s, 300M Cache (350W) + * 8x NVIDIA H200 SXM5 GPU, 141GB, 700W + * 32x 64GB DDR5, 5600MT/s + +### Benchmark methodology + +**Online inference** + +We utilized SGLang's [`Deepseek-R1/bench_serving.py`](https://github.com/dstackai/benchmarks/tree/main/Deepseek-R1/bench_serving.py) +script, modified to incorporate TensorRT-LLM. + +Tests were conducted across multiple request concurrencies and output token lengths, with input token length fixed at 3200. + +| Request Concurrencies | Output Token Lengths | Prefix-Cached | +|------------------------|----------------------|----------------| +| 4,8,16,...,128 | 800 | No | +| 128 | 1600, 3200, 6400 | No | +| 128 | 800 | Yes | + +To test prefix caching ability, about 62.5% of each ~3200-token prompt (i.e., 2000 out of 3200 tokens) is a repeated prefix across multiple requests. + +**Offline inference** + +For offline inference, we used vLLM’s [`benchmark_throughput.py`](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_throughput.py), +modified for SGLang. TensorRT-LLM was tested using a custom +[`benchmark_throughput_trt.py`](https://github.com/dstackai/benchmarks/blob/deepseek-r1-benchmark/Deepseek-R1/benchmark_throughput_trt.py). +The benchmark examined performance across various batch sizes and output token lengths. + +| Batch Sizes | Output Token Lengths | +|--------------------|----------------------| +| 32,64,128,...,1024 | 800 | +| 256, 512, 1024 | 1600 | +| 256, 512, 1024 | 3200 | + +## Key observations + +### Throughput and End-to-End Latency + +**NVIDIA H200 performance** + +* TensorRT-LLM outperformed both vLLM and SGLang, achieving the highest online throughput of 4176 tokens/s on H200. 
+* At concurrencies below 128, vLLM led in online throughput and end-to-end latency. +* In offline scenarios, H200 achieved the highest overall throughput of 6311 tokens/s with SGLang. + + + +**AMD MI300X performance** + +* vLLM outperformed SGLang in both online and offline throughput and end-to-end latency. +* MI300X with vLLM achieved the highest overall throughput of 4574 tokens/s in online scenarios. +* At request concurrencies below 32, SGLang outperformed vLLM in online throughput and latency. + + + +While MI300X's larger memory capacity and higher bandwidth should theoretically enable higher throughput at larger batch +sizes, the results suggest that inference backends for MI300X may require further optimization to fully leverage its +architectural advantages. + +### Throughput and Latency vs. Output Token Length + +**NVIDIA H200 performance** + +* SGLang delivered slightly higher throughput and better latency as output token length increased in online scenarios. +* In offline scenarios, SGLang with H200 outperformed MI300X as output token length increased. + +=== "Throughput" + + +=== "Latency" + + +**AMD MI300X performance** + +vLLM maintained the lead in both online and offline scenarios as output token length increased. + +=== "Throughput" + + +=== "Latency" + + +### Time to First Token (TTFT) + +**NVIDIA H200 performance** + +TensorRT-LLM maintained the lowest and most consistent TTFT up to concurrency 64. + + + +**AMD MI300X performance** + +vLLM achieved the lowest TTFT at concurrency 128. Below 128, vLLM and SGLang had similar TTFT. + +TTFT, being compute-intensive, highlights H200's advantage, aligning with [SemiAnalysis’s MI300X vs. H200 TFLOPS benchmark](https://semianalysis.com/2024/12/22/mi300x-vs-h100-vs-h200-benchmark-part-1-training/). +However, at 128 concurrent requests, MI300X's memory capacity and bandwidth advantages become evident. 
+ +### Time Per Output Token (TPOT) + +**NVIDIA H200 performance** + +vLLM maintained the lowest TPOT across all request concurrencies. + + + +**AMD MI300X performance** + +SGLang delivered the lowest TPOT up to concurrency 32. Beyond that, vLLM took the lead. + +Given that TPOT is memory-bound, MI300X should have a stronger advantage with further optimizations. + +### TTFT vs. Output Token Length + +**NVIDIA H200 performance** + +* SGLang demonstrated stable TTFT across increasing output token lengths. +* vLLM and TensorRT-LLM showed significant increases in TTFT as output token length grew, likely due to KV cache memory pressure. + + + +**AMD MI300X performance** + +Both vLLM and SGLang demonstrated stable TTFT across increasing output token lengths, with vLLM maintaining lower TTFT. + + + +### TPOT vs. Output Token Length + +**NVIDIA H200 performance** + +SGLang and TensorRT-LLM demonstrated stable TPOT across increasing output token lengths. + + + +vLLM maintained the lowest TPOT up to 3200 tokens but showed a sudden increase at 6400 tokens, likely due to memory pressure. + +**AMD MI300X performance** + +Both SGLang and vLLM demonstrated stable TPOT across increasing output token lengths, with vLLM maintaining the lowest TPOT. + +### Prefix caching + +**NVIDIA H200 performance** + +vLLM outperformed SGLang in online throughput, TTFT, and end-to-end latency with prefix caching enabled. However, vLLM's +TPOT increased after prefix caching, which requires further investigation. + +=== "Throughput" + +=== "TTFT" + +=== "TPOT" + +=== "Latency" + + +## Limitations + +1. The offline benchmark results for TensorRT-LLM were obtained using the DeepSeek-R1 model engine built from the + [`deepseek` branch](https://github.com/NVIDIA/TensorRT-LLM/tree/deepseek). + However, the TensorRT-LLM team recommends using the TorchFlow-based approach for deployment. +2. The impact of dynamic batching on inference efficiency was not tested. +3. 
vLLM's prefix caching support for MI300X is a work in progress and can be tracked [here](https://github.com/ROCm/vllm/issues/457). +4. The inference backends are being optimized for the DeepSeek-R1 model. Given these continuous updates, the current + results reflect only the performance tested at the time of the benchmark. Overall, performance for all backends is + expected to improve as more optimizations are made by the backend teams. + +## Source code + +All source code and findings are available in +[our GitHub repo](https://github.com/dstackai/benchmarks/tree/deepseek-r1-benchmark/Deepseek-R1). + +## References + +* [Unlock DeepSeek-R1 Inference Performance on AMD Instinct MI300X GPU](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html) +* [Deploy DeepSeek-R1 671B on 8x NVIDIA H200 with SGLang](https://datacrunch.io/blog/deploy-deepseek-r1-on-8x-nvidia-h200) +* [vLLM Prefix Caching](https://docs.vllm.ai/en/latest/design/automatic_prefix_caching.html#design-automatic-prefix-caching) +* [SGLang Prefix Caching](https://lmsys.org/blog/2024-01-17-sglang/) + +## Acknowledgments + +### Vultr + +[Vultr](https://www.vultr.com/) provided access to 8x AMD MI300X GPUs. We are truly thankful for their support. + +If you're looking for top-tier bare metal compute with AMD GPUs, we highly recommend Vultr. With `dstack`, provisioning +and accessing compute is seamless and straightforward. + +### Lambda + +[Lambda](https://lambdalabs.com/) provided access to 8x +NVIDIA H200 GPUs. We are truly thankful for their support. + +Both Vultr and Lambda are natively supported and can be seamlessly integrated with `dstack`. 
diff --git a/mkdocs/blog/posts/hotaisle.md b/mkdocs/blog/posts/hotaisle.md new file mode 100644 index 0000000000..0d531710d5 --- /dev/null +++ b/mkdocs/blog/posts/hotaisle.md @@ -0,0 +1,114 @@ +--- +title: Supporting Hot Aisle AMD AI Developer Cloud +date: 2025-08-11 +description: "TBA" +slug: hotaisle +image: https://dstack.ai/static-assets/static-assets/images/dstack-hotaisle.png +categories: + - Changelog +--- + +# Supporting Hot Aisle AMD AI Developer Cloud + +As the ecosystem around AMD GPUs matures, developers are looking for easier ways to experiment with ROCm, benchmark new architectures, and run cost-effective workloads—without manual infrastructure setup. + +`dstack` is an open-source orchestrator designed for AI workloads, providing a lightweight, container-native alternative to Kubernetes and Slurm. + + + +Today, we’re excited to announce native integration with [Hot Aisle](https://www.hotaisle.io/), an AMD-only GPU neocloud offering VMs and clusters at highly competitive on-demand pricing. + + + +## About Hot Aisle + +Hot Aisle is a next-generation GPU cloud built around AMD’s flagship AI accelerators. + +Highlights: + +- AMD’s flagship AI-optimized accelerators +- On-demand pricing: $1.99/hour for 1-GPU VMs +- No commitment – start and stop when you want +- First AMD-only GPU backend in `dstack` + +While it has already been possible to use Hot Aisle’s 8-GPU MI300X bare-metal clusters via [`SSH fleets`](../../docs/concepts/fleets.md#ssh-fleets), this integration now enables automated provisioning of VMs—made possible by Hot Aisle’s newly added API for MI300X instances. + +## Why dstack + +`dstack` is a new open-source container orchestrator built specifically for GPU workloads. 
+It fills the gaps left by Kubernetes and Slurm when it comes to GPU provisioning and orchestration: + +- Unlike Kubernetes, `dstack` offers a high-level, AI-engineer-friendly interface, and GPUs work out of the box, with no need to wrangle custom operators, device plugins, or other low-level setup. +- Unlike Slurm, it’s use-case agnostic — equally suited for training, inference, benchmarking, or even setting up long-running dev environments. +- It works across clouds and on-prem without vendor lock-in. + +With the new Hot Aisle backend, you can automatically provision MI300X VMs for any workload — from experiments to production — with a single `dstack` CLI command. + +## Getting started + +Before configuring `dstack` to use Hot Aisle’s VMs, complete these steps: + +1. Create a project via `ssh admin.hotaisle.app` +2. Get credits or approve a payment method +3. Create an API key + +Then, configure the backend in `~/.dstack/server/config.yml`: + +
    + +```yaml +projects: +- name: main + backends: + - type: hotaisle + team_handle: hotaisle-team-handle + creds: + type: api_key + api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f +``` + +
    + +Install and start the `dstack` server: + +
    + +```shell +$ pip install "dstack[server]" +$ dstack server +``` + +
    + +For more details, see [Installation](../../docs/installation.md). + +Use the `dstack` CLI to +manage [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), +and [services](../../docs/concepts/services.md). + +
    + +```shell +$ dstack apply -f .dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 hotaisle (us-michigan-1) cpu=13 mem=224GB disk=12288GB MI300X:192GB:1 1x MI300X 13x Xeon Platinum 8470 $1.99 + 2 hotaisle (us-michigan-1) cpu=8 mem=224GB disk=12288GB MI300X:192GB:1 1x MI300X 8x Xeon Platinum 8470 $1.99 + + Submit the run? [y/n]: +``` + +
    + +Currently, `dstack` supports 1xGPU Hot Aisle VMs. Support for 8xGPU VMs will be added once Hot Aisle supports it. + +> If you prefer to use Hot Aisle’s bare-metal 8-GPU clusters with dstack, you can create an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets). +> This way, you’ll be able to run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks) efficiently across the cluster. + +!!! info "What's next?" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Learn more about [Hot Aisle](https://hotaisle.xyz/) + 3. Explore [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/images/dstack-diagram-stack-3.png b/mkdocs/blog/posts/images/dstack-diagram-stack-3.png similarity index 100% rename from docs/blog/posts/images/dstack-diagram-stack-3.png rename to mkdocs/blog/posts/images/dstack-diagram-stack-3.png diff --git a/docs/blog/posts/images/dstack-research-banner-2.png b/mkdocs/blog/posts/images/dstack-research-banner-2.png similarity index 100% rename from docs/blog/posts/images/dstack-research-banner-2.png rename to mkdocs/blog/posts/images/dstack-research-banner-2.png diff --git a/docs/blog/posts/images/dstack-sky-banner-4.png b/mkdocs/blog/posts/images/dstack-sky-banner-4.png similarity index 100% rename from docs/blog/posts/images/dstack-sky-banner-4.png rename to mkdocs/blog/posts/images/dstack-sky-banner-4.png diff --git a/mkdocs/blog/posts/inactivity-duration.md b/mkdocs/blog/posts/inactivity-duration.md new file mode 100644 index 0000000000..7c3d88eb58 --- /dev/null +++ b/mkdocs/blog/posts/inactivity-duration.md @@ -0,0 +1,75 @@ +--- +title: Auto-shutdown for inactive dev environments—no idle GPUs +date: 2025-02-19 +description: "dstack introduces a new feature 
that automatically detects and shuts down inactive dev environments, helping you avoid wasted GPU costs." +slug: inactivity-duration +image: https://dstack.ai/static-assets/static-assets/images/inactive-dev-environments-auto-shutdown.png +categories: + - Changelog +--- + +# Auto-shutdown for inactive dev environments—no idle GPUs + +Whether you’re using cloud or on-prem compute, you may want to test your code before launching a +training task or deploying a service. `dstack`’s [dev environments](../../docs/concepts/dev-environments.md) +make this easy by setting up a remote machine, cloning your repository, and configuring your IDE —all within +a container that has GPU access. + +One issue with dev environments is forgetting to stop them or closing your laptop, leaving the GPU idle and costly. With +our latest update, `dstack` now detects inactive environments and automatically shuts them down, saving you money. + + + + + +When defining a dev environment, you can now enable automatic shutdown by setting the +`inactivity_duration` property to specify how long `dstack` should wait before +automatically terminating an inactive environment. + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +python: "3.11" + +ide: vscode + +# Shut-down if inactive for 2 hours +inactivity_duration: 2h + +resources: + gpu: 1 +``` + +A dev environment is considered inactive when you close your desktop VS Code, exit any `ssh ` sessions, or +interrupt the `dstack apply` or `dstack attach` command. + +If you go offline without manually stopping anything, `dstack` will +automatically detect inactivity and shut down the environment within approximately three minutes. + +If you’ve configured `inactivity_duration`, you can check how long a dev environment has been inactive using: + +
    + +```shell +$ dstack ps --verbose + NAME BACKEND RESOURCES PRICE STATUS SUBMITTED + vscode cudo 2xCPU, 8GB, $0.0286 running 8 mins ago + 100.0GB (disk) (inactive for 2m 34s) +``` + +
    + +Reattaching to the environment with [`dstack attach`](../../docs/reference/cli/dstack/attach.md) +resets the inactivity timer within seconds. + +Overall, the new feature makes using dev environments both safer and more cost-effective. +This not only helps reduce unnecessary GPU costs, but also ensures more efficient reuse of +fleets by teams. + +!!! info "What's next?" + 1. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 2. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/instance-volumes.md b/mkdocs/blog/posts/instance-volumes.md new file mode 100644 index 0000000000..36ead69303 --- /dev/null +++ b/mkdocs/blog/posts/instance-volumes.md @@ -0,0 +1,87 @@ +--- +title: "Introducing instance volumes to persist data on instances" +date: 2024-11-05 +description: "To simplify caching across runs and the use of NFS, we introduce a new volume type that persists data on the instance." +image: https://dstack.ai/static-assets/static-assets/images/dstack-instance-volumes.png +slug: instance-volumes +categories: + - Changelog +--- + +# Introducing instance volumes to persist data on instances + +## How it works { style="display:none" } + +Until now, `dstack` supported data persistence only with network volumes, managed by clouds. +While convenient, sometimes you might want to use a simple cache on the instance or +mount an NFS share to your SSH fleet. To address this, we're now introducing instance volumes that work for both cases. + +
    + +```yaml +type: task +name: llama32-task + +env: + - HF_TOKEN + - MODEL_ID=meta-llama/Llama-3.2-3B-Instruct +commands: + - pip install vllm + - vllm serve $MODEL_ID --max-model-len 4096 +ports: [8000] + +volumes: + - /root/.dstack/cache:/root/.cache + +resources: + gpu: 16GB.. +``` + +
    + + + +> Instance volumes work with both [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets) +> and [cloud fleets](../../docs/concepts/fleets.md#backend-fleets), and it is possible to mount any folders on the instance, +> whether they are regular folders or NFS share mounts. + +The configuration above mounts `/root/.dstack/cache` on the instance to `/root/.cache` inside the container. + +## Caching data on fleet instances { #caching } + +If you use a folder on the instance that is not an NFS mount, instance volumes can only be used for caching purposes, as +their state is bound to a particular instance while it's up. + +Caching can be especially useful if you want to re-run the same configuration on the same fleet and avoid downloading +very large models, datasets, or dependencies with each run. + +## Using NFS with SSH and cloud fleets { #nfs } + +If you want to replicate the state across instances, you can mount an NFS share to the instance folder. + +With SSH fleets, it's easy to set up an NFS share, as you can do it when logging into your hosts via SSH. +If you'd like to mount NFS with your cloud fleets, you will need to use a custom AMI for that. + +Here's an example of a dev environment that mounts the `data` folder from an NFS share, which is mounted to +`/mnt/nfs-storage` on the instance, to the `/data` folder inside the container. + +
    + +```yaml +type: dev-environment +name: vscode-nfs + +ide: vscode + +volumes: + - /mnt/nfs-storage/data:/data +``` + +
    + +## Feedback + +If you find something not working as intended, please be sure to report it to +[GitHub issues](https://github.com/dstackai/dstack/issues){:target="_blank"}. +Your feedback and feature requests are also very welcome on our +[Discord](https://discord.gg/u8SmfwPpMd) server. diff --git a/mkdocs/blog/posts/intel-gaudi.md b/mkdocs/blog/posts/intel-gaudi.md new file mode 100644 index 0000000000..37b8c383b4 --- /dev/null +++ b/mkdocs/blog/posts/intel-gaudi.md @@ -0,0 +1,174 @@ +--- +title: Supporting Intel Gaudi AI accelerators with SSH fleets +date: 2025-02-21 +description: "dstack now supports Intel Gaudi accelerators with SSH fleets, simplifying container orchestration across private clouds and data centers." +slug: intel-gaudi +image: https://dstack.ai/static-assets/static-assets/images/dstack-intel-gaudi-and-intel-tiber-cloud.png-v2 +categories: + - Changelog +--- + +# Supporting Intel Gaudi AI accelerators with SSH fleets + +At `dstack`, our goal is to make AI container orchestration simpler and fully vendor-agnostic. That’s why we support not +just leading cloud providers and on-prem environments but also a wide range of accelerators. + +With our latest release, we’re adding support +for Intel Gaudi AI Accelerator and launching a new partnership with Intel. + + + + + +## About Intel Gaudi + +Intel Gaudi AI Accelerator is a series of accelerators built to handle AI tasks. Powered by Intel’s Habana architecture, Gaudi is +tailored for high-performance AI inference and training, offering high throughput and efficiency. It has a scalable +design with numerous cores and ample memory bandwidth, enabling better performance per watt. 
+ +Here's a brief spec for Gaudi 2 and Gaudi 3: + +| | **Gaudi 2** | **Gaudi 3** | +|----------------------|-------------|-------------| +| **MME Units** | 2 | 8 | +| **TPC Units** | 24 | 64 | +| **HBM Capacity** | 96 GB | 128 GB | +| **HBM Bandwidth** | 2.46 TB/s | 3.7 TB/s | +| **Networking** | 600 GB/s | 1200 GB/s | +| **FP8 Performance** | 865 TFLOPs | 1835 TFLOPs | +| **BF16 Performance** | 432 TFLOPs | 1835 TFLOPs | + +In the latest release, `dstack` now supports the orchestration of containers across on-prem +machines equipped with Intel Gaudi accelerators. + +## Create a fleet + +To manage container workloads on on-prem machines with Intel Gaudi accelerators, start by configuring an +[SSH fleet](../../docs/concepts/fleets.md#ssh-fleets). Here’s an example configuration for your fleet: + +
    + +```yaml +type: fleet +name: my-gaudi2-fleet +ssh_config: + hosts: + - hostname: 100.83.163.67 + user: sdp + identity_file: ~/.ssh/id_rsa + blocks: auto + - hostname: 100.83.163.68 + user: sdp + identity_file: ~/.ssh/id_rsa + blocks: auto + proxy_jump: + hostname: 146.152.186.135 + user: guest + identity_file: ~/.ssh/intel_id_rsa +``` + +
    + +To provision the fleet, run the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command: + +
    + +```shell +$ dstack apply -f examples/misc/fleets/gaudi.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE BACKEND GPU STATUS CREATED + my-gaudi2-fleet 0 ssh 152xCPU, 1007GB, 8xGaudi2 idle 3 mins ago + (96GB), 388.0GB (disk) + 1 ssh 152xCPU, 1007GB, 8xGaudi2 idle 3 mins ago + (96GB), 388.0GB (disk) +``` + +
    + +## Apply a configuration + +With your fleet provisioned, you can now run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). + +Below is an example of a task configuration for fine-tuning the [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) +model using [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana) +and [DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide) with +the [`lvwerra/stack-exchange-paired`](https://huggingface.co/datasets/lvwerra/stack-exchange-paired) dataset: + +
    + +```yaml +type: task +name: trl-train + +image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B + - WANDB_API_KEY + - WANDB_PROJECT +commands: + - pip install --upgrade-strategy eager optimum[habana] + - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + - git clone https://github.com/huggingface/optimum-habana.git + - cd optimum-habana/examples/trl + - pip install -r requirements.txt + - pip install wandb + - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size $DSTACK_GPUS_NUM --use_deepspeed sft.py + --model_name_or_path $MODEL_ID + --dataset_name "lvwerra/stack-exchange-paired" + --deepspeed ../language-modeling/llama2_ds_zero3_config.json + --output_dir="./sft" + --do_train + --max_steps=500 + --logging_steps=10 + --save_steps=100 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --gradient_accumulation_steps=2 + --learning_rate=1e-4 + --lr_scheduler_type="cosine" + --warmup_steps=100 + --weight_decay=0.05 + --optim="paged_adamw_32bit" + --lora_target_modules "q_proj" "v_proj" + --bf16 + --remove_unused_columns=False + --run_name="sft_deepseek_70" + --report_to="wandb" + --use_habana + --use_lazy_mode + +resources: + gpu: gaudi2:8 +``` + +
    + +Submit the task using the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command: + +
    + +```shell +$ dstack apply -f examples/single-node-training/trl/intel/.dstack.yml -R +``` + +
    + +`dstack` will automatically create containers according to the run configuration and execute them across the fleet. + +> Explore our [examples](../../docs/examples/accelerators/intel/index.md) to learn how to train and deploy large models on +> Intel Gaudi AI Accelerator. + +!!! info "Intel Tiber AI Cloud" + At `dstack`, we’re grateful to be part of the Intel Liftoff program, which allowed us to access Intel Gaudi AI + accelerators via [Intel Tiber AI Cloud](https://www.intel.com/content/www/us/en/developer/tools/tiber/ai-cloud.html). + You can sign up if you’d like to access Intel Gaudi AI accelerators via the cloud. + + Native integration with Intel Tiber AI Cloud is also coming soon to `dstack`. + +!!! info "What's next?" + 1. Refer to [Quickstart](../../docs/quickstart.md) + 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/kubernetes-beta.md b/mkdocs/blog/posts/kubernetes-beta.md new file mode 100644 index 0000000000..64fb6117c5 --- /dev/null +++ b/mkdocs/blog/posts/kubernetes-beta.md @@ -0,0 +1,315 @@ +--- +title: Orchestrating GPUs on Kubernetes clusters +date: 2025-10-08 +description: "TBA" +slug: kubernetes-beta +image: https://dstack.ai/static-assets/static-assets/images/dstack-kubernetes.png +categories: + - Changelog +--- + +# Orchestrating GPUs on Kubernetes clusters + +`dstack` gives teams a unified way to run and manage GPU-native containers across clouds and on-prem environments — without requiring Kubernetes. +At the same time, many organizations rely on Kubernetes as the foundation of their infrastructure. + +To support these users, `dstack` is releasing the beta of its native Kubernetes integration. 
+ + + + + +This update allows `dstack` to orchestrate dev environments, distributed training, and inference workloads directly on Kubernetes clusters — combining the best of both worlds: an ML-tailored interface for ML teams together with the full Kubernetes ecosystem. + +Read below to learn how to use `dstack` with Kubernetes clusters. + +## Creating a Kubernetes cluster + +A major advantage of Kubernetes is its portability. Whether you’re using managed Kubernetes on a GPU cloud or an on-prem cluster, you can connect it to `dstack` and use it to orchestrate your GPU workloads. + +!!! info "NVIDIA GPU Operator" + For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the + [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. + +### Nebius example + +If you're using [Nebius](https://nebius.com/), the process of creating a Kubernetes cluster is straightforward. + +Select the region of interest and click `Create cluster`. +Once the cluster is created, switch to `Applications` and install the `nvidia-device-plugin` application — this can be done in one click. + + + +Next, go to `Node groups` and click `Create node group`. Choose the GPU type and count, disk size, and other options. +If `dstack` doesn't run in the same network, enable public IPs so that `dstack` can access the nodes. + + + +## Setting up the backend + +Once the cluster is ready, you need to configure the `kubernetes` backend in the `dstack` server. +To do this, add the corresponding configuration to your `~/.dstack/server/config.yml` file: + +
    + +```yaml +projects: +- name: main + backends: + - type: kubernetes + kubeconfig: + filename: ~/.kube/config + proxy_jump: + hostname: 204.12.171.137 + port: 32000 +``` + +
    + +The configuration includes two main parts: the path to the kubeconfig file and the proxy-jump configuration. + +If your cluster is on Nebius, click `How to connect` in the console — it will guide you through setting up the kubeconfig file. + +!!! info "Proxy jump" + To allow `dstack` to forward SSH traffic, it needs one node to act as a proxy jump. + Choose any node in the cluster and specify its IP address and an accessible port in the backend configuration. + + Now that the backend is configured, go ahead and restart the `dstack server`. + +That’s it — you can now use all of `dstack`’s features, including [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). + +## Running a dev environment + +A dev environment lets you provision an instance and connect to it from your desktop IDE. + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +python: "3.11" + +# Uncomment to use a custom Docker image +#image: huggingface/trl-latest-gpu + +ide: vscode + +resources: + gpu: H200 +``` + +
    + +To run a dev environment, pass the configuration to [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f examples/.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00hwk32d0xemhxhvj $0 + 2 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00n24fb4q85yavc9z $0 + +Submit the run vscode? [y/n]: y + +Launching `vscode`... +---> 100% + +To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+vscode/workflow +``` + +
    + +Dev environments support many [different options](../../docs/concepts/dev-environments.md), including a custom Docker image, mounted repositories, idle timeout, min GPU utilization, and more. + +## Running distributed training + +Distributed training can be performed in `dstack` using [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). +The configuration is similar to a dev environment, except it runs across multiple nodes. + +### Creating a cluster fleet + +Before running a distributed task, create a fleet with `placement` set to `cluster`: + +
    + + ```yaml + type: fleet + # The name is optional; if not specified, one is generated automatically + name: my-k8s-fleet + + # For `kubernetes`, `min` should be set to `0` since it can't pre-provision VMs. + # Optionally, you can set the maximum number of nodes to limit scaling. + nodes: 0.. + + placement: cluster + + backends: [kubernetes] + + resources: + # Specify requirements to filter nodes + gpu: 1..8 + ``` + +
    + +Then, create the fleet using the `dstack apply` command: + +
    + +```shell +$ dstack apply -f examples/misc/fleets/.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED +``` + +
    + +Once the fleet is created, you can run distributed tasks on it. + +### NCCL tests example + +Below is an example of using distributed tasks to run NCCL tests. +It also demonstrates how to use mpirun with `dstack`: + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 + +# The `startup_order` and `stop_criteria` properties are required for `mpirun` +startup_order: workers-first +stop_criteria: master-done + +env: + - NCCL_DEBUG=INFO +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + +# The `kubernetes` backend requires it +privileged: true + +resources: + gpu: nvidia:1..8 + shm_size: 16GB +``` + +
    + +To run the configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f examples/clusters/nccl-tests/.dstack.yml --fleet my-k8s-fleet + +# BACKEND RESOURCES INSTANCE TYPE PRICE +1 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00hwk32d0xemhxhvj $0 +2 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00n24fb4q85yavc9z $0 + +Submit the run nccl-tests? [y/n]: y +``` + +
    + +### Distributed training example + +Below is a minimal example of a distributed training configuration: + +
    + +```yaml +type: task +name: train-distrib + +nodes: 2 + +python: 3.12 +env: + - NCCL_DEBUG=INFO +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - uv pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + multinode.py 50 10 + +resources: + gpu: 1..8 + shm_size: 16GB +``` + +
    + +To run the configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f examples/distributed-training/torchrun/.dstack.yml --fleet my-k8s-fleet + +# BACKEND RESOURCES INSTANCE TYPE PRICE +1 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00hwk32d0xemhxhvj $0 +2 kubernetes (-) cpu=127 mem=1574GB disk=871GB H200:141GB:8 computeinstance-u00n24fb4q85yavc9z $0 + +Submit the run train-distrib? [y/n]: y +``` + +
    + +For more examples, explore the [training](../../docs/examples.md#training) section in the docs. + +## FAQ + +### VM-based backends vs Kubernetes backend + +While the `kubernetes` backend is preferred if your team depends on the Kubernetes ecosystem, +the [VM-based](../../docs/concepts/backends.md#vm-based) backends leverage native integration with top GPU clouds (including Nebius and others) and may be a better choice if Kubernetes isn’t required. + +VM-based backends also offer more granular control over cluster provisioning. + +> Note that `dstack` doesn’t yet support Kubernetes clusters with auto-scaling enabled (coming soon), which can be another reason to use VM-based backends. + +### SSH fleets vs Kubernetes backend + +If you’re using on-prem servers and Kubernetes isn’t a requirement, [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets) may be simpler. +They provide a lightweight and flexible alternative. + +### AMD GPUs + +Support for AMD GPUs is coming soon — our team is actively working on it right now. + +!!! info "What's next" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Explore [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 3. Browse the [fleets](../../docs/concepts/fleets.md#cluster-placement) guide + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/metrics-ui.md b/mkdocs/blog/posts/metrics-ui.md new file mode 100644 index 0000000000..877ae9fca8 --- /dev/null +++ b/mkdocs/blog/posts/metrics-ui.md @@ -0,0 +1,58 @@ +--- +title: "Built-in UI for monitoring essential GPU metrics" +date: 2025-04-03 +description: "TBA" +slug: metrics-ui +image: https://dstack.ai/static-assets/static-assets/images/dstack-metrics-ui-v3-min.png +categories: + - Changelog +--- + +# Built-in UI for monitoring essential GPU metrics + +AI workloads generate vast amounts of metrics, making it essential to have efficient monitoring tools. While our recent +update introduced the ability to export available metrics to Prometheus for maximum flexibility, there are times when +users need to quickly access essential metrics without the need to switch to an external tool. + + + +Previously, we introduced a [CLI command](dstack-metrics.md) that allows users to view essential GPU metrics for both NVIDIA +and AMD hardware. Now, with this latest update, we’re excited to announce the addition of a built-in dashboard within +the `dstack` control plane. + + + +The new feature provides an easy-to-use interface for tracking the most essential GPU metrics +directly from the control plane, streamlining the real-time monitoring process without needing any additional tools. + + + +Additionally, we’ve renamed the CLI command previously known as `dstack stats` to `dstack metrics` for consistency. + +
    + +```shell +$ dstack metrics nccl-tests -w + NAME CPU MEMORY GPU + nccl-tests 81% 2754MB/1638400MB #0 100740MB/144384MB 100% Util + #1 100740MB/144384MB 100% Util + #2 100740MB/144384MB 99% Util + #3 100740MB/144384MB 99% Util + #4 100740MB/144384MB 99% Util + #5 100740MB/144384MB 99% Util + #6 100740MB/144384MB 99% Util + #7 100740MB/144384MB 100% Util +``` + +
    + +By default, both the control plane and CLI show metrics from the last hour, which is particularly useful for debugging +workloads. + +For persistent storage and long-term access to metrics, we still recommend setting up Prometheus to fetch +metrics from `dstack`. + +!!! info "What's next?" + 1. See [Metrics](../../docs/concepts/metrics.md) + 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/mpi.md b/mkdocs/blog/posts/mpi.md new file mode 100644 index 0000000000..02152aad3b --- /dev/null +++ b/mkdocs/blog/posts/mpi.md @@ -0,0 +1,104 @@ +--- +title: "Supporting MPI and NCCL/RCCL tests" +date: 2025-04-02 +description: "TBA" +slug: mpi +image: https://dstack.ai/static-assets/static-assets/images/dstack-mpi-v2.png +categories: + - Changelog +--- + +# Supporting MPI and NCCL/RCCL tests + +As AI models grow in complexity, efficient orchestration tools become increasingly important. +[Fleets](../../docs/concepts/fleets.md) introduced by `dstack` last year streamline +[task execution](../../docs/concepts/tasks.md) on both cloud and +on-prem clusters, whether it's pre-training, fine-tuning, or batch processing. + +The strength of `dstack` lies in its flexibility. Users can leverage distributed frameworks like +`torchrun`, `accelerate`, or others. `dstack` handles node provisioning, job execution, and automatically propagates +system environment variables—such as `DSTACK_NODE_RANK`, `DSTACK_MASTER_NODE_IP`, +`DSTACK_GPUS_PER_NODE` and [others](../../docs/concepts/tasks.md#system-environment-variables)—to containers. + + + +One use case `dstack` hasn’t supported until now is MPI, as it requires a scheduled environment or +direct SSH connections between containers. 
Since `mpirun` is essential for running NCCL/RCCL tests—crucial for large-scale +cluster usage—we’ve added support for it. + + + +Below is an example of a task that runs AllReduce test on 2 nodes, each with 4 GPUs (8 processes in total). + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 + +image: dstackai/efa +env: + - NCCL_DEBUG=INFO +commands: + - | + # We use FIFO for inter-node communication + FIFO=/tmp/dstack_job + if [ ${DSTACK_NODE_RANK} -eq 0 ]; then + cd /root/nccl-tests/build + # Generate hostfile for mpirun + : > hostfile + for ip in ${DSTACK_NODES_IPS}; do + echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> hostfile + done + MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' + # Wait for other nodes + while true; do + if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then + break + fi + echo 'Waiting for nodes...' + sleep 5 + done + # Run NCCL tests + ${MPIRUN} \ + -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + --mca pml ^cm \ + --mca btl tcp,self \ + --mca btl_tcp_if_exclude lo,docker0 \ + --bind-to none \ + ./all_reduce_perf -b 8 -e 8G -f 2 -g 1 + # Notify nodes the job is done + ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" + else + mkfifo ${FIFO} + # Wait for a message from the first node + cat ${FIFO} + fi + +resources: + gpu: nvidia:4:16GB + shm_size: 16GB + +``` + +
    + +The master node (`DSTACK_NODE_RANK=0`) generates a `hostfile` listing all node IPs and waits until all nodes are +reachable via MPI. Once confirmed, it launches the `/root/nccl-tests/build/all_reduce_perf` benchmark across all available GPUs in the cluster. + +Non-master nodes remain blocked until they receive a termination signal from the master node via a FIFO pipe. + +With this, you can now use such a task to run either NCCL or RCCL tests on both cloud and SSH fleets, +as well as use MPI for other tasks. + +> The `dstackai/efa` image used in the example comes with MPI and NCCL tests pre-installed. While it is optimized for +> [AWS EFA](https://aws.amazon.com/hpc/efa/), it can also +> be used with regular TCP/IP network adapters and InfiniBand. +> See the [source code](https://github.com/dstackai/dstack/blob/master/docker/efa) for the image. + +!!! info "What's next?" + 1. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 2. Check the [NCCL/RCCL tests](../../docs/examples/clusters/nccl-rccl-tests.md) example + 3. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/nebius-in-dstack-sky.md b/mkdocs/blog/posts/nebius-in-dstack-sky.md new file mode 100644 index 0000000000..1f911f98d3 --- /dev/null +++ b/mkdocs/blog/posts/nebius-in-dstack-sky.md @@ -0,0 +1,127 @@ +--- +title: Nebius joins dstack Sky GPU marketplace, with production-ready GPU clusters +date: 2025-09-18 +description: "TBA" +slug: nebius-in-dstack-sky +image: https://dstack.ai/static-assets/static-assets/images/dstack-sky-nebius.png +categories: + - Changelog +--- + +# Nebius in dstack Sky GPU marketplace, with production-ready GPU clusters + +`dstack` is an [open-source](https://github.com/dstackai/dstack) control plane for orchestrating GPU workloads. It can provision cloud VMs, run on top of Kubernetes, or manage on-prem clusters. If you don’t want to self-host, you can use [dstack Sky](https://sky.dstack.ai), the managed version of `dstack` that also provides access to cloud GPUs via its marketplace. + +With our latest release, we’re excited to announce that [Nebius](https://nebius.com/), a purpose-built AI cloud for large-scale training and inference, has joined the `dstack` Sky marketplace +to offer on-demand and spot GPUs, including clusters. + + + + +Last week we published the [state of cloud GPU](state-of-cloud-gpu-2025.md), a study of the GPU market. As noted there, Nebius is one of the few purpose-built AI clouds delivering performant and resilient GPUs at scale — available on-demand, as spot instances, and as full clusters. + +Nebius designs and operates its own GPU servers in energy-efficient data centers, giving full control over quality, performance tuning, and delivery timelines. 
Every cluster undergoes a three-stage validation — hardware burn-in, reference architecture checks, and long-haul stress tests — ensuring production-ready infrastructure with consistent performance for large-scale AI training. + +Since early this year, the open-source `dstack` has supported Nebius, making it easy to manage clusters and orchestrate compute cost-effectively. + +## About dstack Sky + +With this week's release, Nebius officially joins [dstack Sky](https://sky.dstack.ai). Nebius can now be used not only with your own account, but also directly via the GPU marketplace. + +The marketplace lets you access Nebius GPUs without having a Nebius account. You can pay through `dstack Sky`, and switch to your own Nebius account anytime with just a few clicks. + + + +While the open-source version of `dstack` has supported Nebius clusters from day one, +Nebius is the first provider to bring on-demand and spot GPU clusters to `dstack` Sky. + +With Nebius, `dstack` Sky users can orchestrate NVIDIA GPUs provisioned in hours, with optimized InfiniBand networking to minimize bottlenecks, non-virtualized GPUs for predictable throughput, and industry-leading MTBF/MTTR proven on multi-thousand-GPU clusters. + +## Getting started + +After you [sign up](https://sky.dstack.ai) with `dstack` Sky, +you’ll be prompted to create a project and choose between the GPU marketplace or your own cloud account: + + + +Once the project is created, install the `dstack` CLI: + +=== "uv" + +
    + + ```shell + $ uv tool install dstack -U + ``` + +
    + +=== "pip" + +
    + + ```shell + $ pip install dstack -U + ``` + +
    + +Now you can define [dev environments](../../docs/concepts/dev-environments.md), +[tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), +and [fleets](../../docs/concepts/fleets.md), then apply them with `dstack apply`. + +`dstack` provisions cloud VMs, sets up environments, orchestrates runs, and handles everything required for development, training, or deployment. + +To create a Nebius cluster, for example for distributed training, define the following fleet configuration: + +
    + +```yaml +type: fleet +name: my-cluster + +placement: cluster +nodes: 2 + +backends: [nebius] + +resources: + gpu: H100:8 +``` + +
    + +Then, create it via `dstack apply`: + +
    + +```shell +$ dstack apply -f my-cluster.dstack.yml +``` + +
    + +Once the fleet is ready, you can run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). +`dstack` automatically configures drivers, networking, and fast GPU-to-GPU interconnect. + +To learn more, see the [clusters](../../docs/examples/clusters/nebius.md) guide. + +With Nebius joining `dstack` Sky, users can now run on-demand and spot GPUs and clusters directly through the marketplace—gaining access to the same production-grade infrastructure Nebius customers use for frontier-scale training, without needing a separate Nebius account. + +> If you prefer to go self-hosted, you can always switch to the open-source version of `dstack`, bringing the same functionality. + +Our goal is to give teams maximum flexibility while removing the complexity of managing infrastructure. More updates are coming soon. + +!!! info "How does `dstack` compare to Kubernetes?" + `dstack` can run either on top of Kubernetes or directly on cloud VMs. + In both cases, you don’t need to manage Kubernetes yourself — `dstack` handles container and GPU orchestration, + providing a simple, multi-cloud interface for development, training, and inference. + +!!! info "What's next" + 1. Sign up with [dstack Sky](https://sky.dstack.ai) + 2. Check [Quickstart](../../docs/quickstart.md) + 3. Learn more about [Nebius](https://nebius.com/) + 4. Explore [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 5. 
Read the [clusters](../../docs/examples/clusters/nebius.md) guide diff --git a/mkdocs/blog/posts/nebius.md b/mkdocs/blog/posts/nebius.md new file mode 100644 index 0000000000..5fab310227 --- /dev/null +++ b/mkdocs/blog/posts/nebius.md @@ -0,0 +1,115 @@ +--- +title: Supporting GPU provisioning and orchestration on Nebius +date: 2025-04-11 +description: "TBA" +slug: nebius +image: https://dstack.ai/static-assets/static-assets/images/dstack-nebius-v2.png +categories: + - Changelog +--- + +# Supporting GPU provisioning and orchestration on Nebius + +As demand for GPU compute continues to scale, open-source tools tailored for AI workloads are becoming critical to +developer velocity and efficiency. +`dstack` is an open-source orchestrator purpose-built for AI infrastructure—offering a lightweight, container-native +alternative to Kubernetes and Slurm. + + + +Today, we’re announcing native integration with [Nebius](https://nebius.com/), +offering a streamlined developer experience for teams using GPUs for AI workloads. + + +## About Nebius + +Nebius provides cloud GPUs, +offering high-performance clusters at competitive prices. This pricing is achieved through custom-designed hardware, +partnerships with Original Design Manufacturers (ODMs), and infrastructure team expertise. + +Nebius offers various NVIDIA GPUs, including the L40S, H100, H200, GB200, NVL72, and B200 models, available on-demand +and through reserved instances. Their data centers are located across Europe, with planned expansions into the US. + +## Why dstack + +Kubernetes offers flexibility, but its complexity is often unnecessary—especially for use cases like interactive +development or multi-stage training. +Slurm is excellent for batch scheduling but lacks native support for dev environments, real-time inference, and +multi-user orchestration. 
+ +`dstack` fills the gap: a developer-friendly platform with native GPU support across dev environments, tasks, and +long-running services—without the operational overhead. + +## Getting started + +To use `dstack` with Nebius, configure your `nebius` backend: + +1. Log in to your [Nebius AI Cloud](https://console.eu.nebius.com/) account. +2. Navigate to `Access`, and select `Service Accounts`. +3. Create a new service account, assign it to the `editors` group, and upload an authorized key. + +Then, configure the backend via `~/.dstack/server/config.yml`: + +
    + +```yaml +projects: + - name: main + backends: + - type: nebius + creds: + type: service_account + service_account_id: serviceaccount-e002dwnbz81sbvg2bs + public_key_id: publickey-e00fciu5rkoteyzo69 + private_key_file: ~/path/to/key.pem +``` + +
    + +Now, proceed with installing and starting the `dstack` server: + +
    + +```shell +$ pip install "dstack[nebius]" +$ dstack server +``` + +
    + +For more details, refer to [Installation](../../docs/installation.md). + +Use the `dstack` CLI to +manage [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), +and [services](../../docs/concepts/services.md). + +
    + +```shell +$ dstack apply -f .dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 nebius eu-north1 8xCPU, 32GB, 1xL40S (48GB) no $1.5484 + 2 nebius eu-north1 16xCPU, 200GB, 1xH100 (80GB) no $2.95 + 3 nebius eu-north1 16xCPU, 200GB, 1xH200 (141GB) no $3.5 + ... + Shown 3 of 7 offers, $28 max + + Override the run? [y/n]: +``` + +
    + +The new `nebius` backend supports CPU and GPU instances, [fleets](../../docs/concepts/fleets.md), +[distributed tasks](../../docs/concepts/tasks.md#distributed-tasks), and more. + +> Support for [network volumes](../../docs/concepts/volumes.md#network-volumes) and accelerated cluster +interconnects is coming soon. + +!!! info "What's next?" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Sign up with [Nebius AI Cloud](https://console.eu.nebius.com/) + 3. Read about [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/nvidia-and-amd-on-vultr.md b/mkdocs/blog/posts/nvidia-and-amd-on-vultr.md new file mode 100644 index 0000000000..e3961b37a9 --- /dev/null +++ b/mkdocs/blog/posts/nvidia-and-amd-on-vultr.md @@ -0,0 +1,76 @@ +--- +title: Supporting NVIDIA and AMD accelerators on Vultr +date: 2025-02-17 +description: "Introducing integration with Vultr: The new integration allows Vultr customers to train and deploy models on both AMD and NVIDIA GPUs." +slug: nvidia-and-amd-on-vultr +image: https://dstack.ai/static-assets/static-assets/images/dstack-vultr.png +categories: + - Changelog +--- + +# Supporting NVIDIA and AMD accelerators on Vultr + +As demand for AI infrastructure grows, the need for efficient, vendor-neutral orchestration tools is becoming +increasingly important. +At `dstack`, we’re committed to redefining AI container orchestration by prioritizing an AI-native, open-source-first +approach. +Today, we’re excited to share a new integration and partnership +with [Vultr](https://www.vultr.com/). 
+ + + +This new integration enables Vultr customers to train and deploy models on both AMD +and NVIDIA GPUs with greater flexibility and efficiency–using `dstack`. + + + +## About Vultr + +[Vultr](https://www.vultr.com/) provides cloud GPUs across 32 regions, supporting both NVIDIA and AMD hardware with on-demand and reserved +capacity. Their offerings include AMD MI300X and NVIDIA GH200, H200, H100, A100, L40S, and A40, all available at +competitive [pricing](https://www.vultr.com/pricing/#cloud-gpu). + +## Why dstack + +Kubernetes wasn’t built for AI. It’s powerful, but it adds unnecessary complexity that slows down development, training, +and deployment. That’s where `dstack` comes in. + +`dstack` is an open-source orchestrator designed specifically for AI. Here’s a quick look at how it simplifies running dev +environments and services on Vultr: + + + +`dstack` runs on any cloud or on-prem setup, providing a simple way to manage dev environments, tasks, services, fleets, +and volumes—so you can focus on building instead of troubleshooting infrastructure. + +## Getting started + +To use `dstack` with your Vultr account, you need to [configure a `vultr` backend](../../docs/concepts/backends.md): + +Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: vultr + creds: + type: api_key + api_key: B57487240a466624b48de22865589 +``` + +
    + +For more details, refer to [Installation](../../docs/installation.md). + +!!! info "What's next?" + 1. Refer to [Quickstart](../../docs/quickstart.md) + 2. Sign up with [Vultr](https://www.vultr.com/) + 3. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/nvidia-dgx-spark.md b/mkdocs/blog/posts/nvidia-dgx-spark.md new file mode 100644 index 0000000000..60202c60c9 --- /dev/null +++ b/mkdocs/blog/posts/nvidia-dgx-spark.md @@ -0,0 +1,132 @@ +--- +title: "Orchestrating workloads on NVIDIA DGX Spark" +date: 2025-11-14 +description: "TBA" +slug: nvidia-dgx-spark +image: https://dstack.ai/static-assets/static-assets/images/nvidia-dgx-spark.png +# categories: +# - Benchmarks +--- + +# Orchestrating workloads on NVIDIA DGX Spark + +With support from [Graphsignal](https://x.com/GraphsignalAI/status/1986565583593197885), our team gained access to the new [NVIDIA DGX Spark](https://www.nvidia.com/en-us/products/workstations/dgx-spark/) and used it to validate how `dstack` operates on this hardware. This post walks through how to set it up with `dstack` and use it alongside existing on-prem clusters or GPU cloud environments to run workloads. + + + + + +If DGX Spark is new to you, here is a quick breakdown of the key specs. + +* Built on the NVIDIA GB10 Grace Blackwell Superchip with Arm CPUs. +* Capable of up to 1 petaflop of AI compute at FP4 precision, roughly comparable to RTX 5070 performance. +* Features 128GB of unified CPU and GPU memory enabled by the Grace Blackwell architecture. +* Ships with NVIDIA DGX OS (a tuned Ubuntu build) and NVIDIA Container Toolkit. 
+ +These characteristics make DGX Spark a fitting extension for local development and smaller-scale model training or inference, including workloads up to the GPT-OSS 120B range. + +## Creating an SSH fleet + +Because DGX Spark supports SSH and containers, integrating it with dstack is straightforward. Start by configuring an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets). The file needs the hosts and access credentials. + +
    + +```yaml +type: fleet +name: spark + +ssh_config: + user: devops + identity_file: ~/.ssh/id_rsa + hosts: + - spark-e3a4 +``` + +
    + +The `user` must have `sudo` privileges. + +Apply the configuration: + +
    + +```shell +$ dstack apply -f fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE GPU PRICE STATUS CREATED + spark 0 GB10:1 $0 idle 3 mins ago +``` + +
    + +Once active, the system detects hardware and marks the instance as `idle`. From here, you can run +[dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), +and [services](../../docs/concepts/services.md) on the DGX Spark fleet, the same way you would with other on-prem or cloud GPU backends. + +## Running a dev environment + +Example configuration: + +
    + +```yaml +type: dev-environment +name: cursor + +image: lmsysorg/sglang:spark + +ide: cursor + +resources: + gpu: GB10 + +volumes: + - /root/.cache/huggingface:/root/.cache/huggingface + +fleets: [spark] +``` + +
    + +We use an [instance volume](../../docs/concepts/volumes.md#instance-volumes) to keep model downloads cached across runs. The `lmsysorg/sglang:spark` image is tuned for inference on DGX Spark. Any Arm-compatible image with proper driver support will work if customization is needed. + +Run the environment: + +
    + +```shell +$ dstack apply -f .dstack.yml + + BACKEND GPU INSTANCE TYPE PRICE + ssh (remote) GB10:1 instance $0 idle + +Submit the run cursor? [y/n]: y + + # NAME BACKEND GPU PRICE STATUS SUBMITTED + 1 cursor ssh (remote) GB10:1 $0 running 12:24 + +Launching `cursor`... +---> 100% + +To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+cursor/workflow +``` + +
    + +## What's next? + +> Running workloads on DGX Spark with `dstack` works the same way as on any other [backend](../../docs/concepts/backends.md) (including GPU clouds): you can run [dev environments](../../docs/concepts/dev-environments.md) for interactive development, [tasks](../../docs/concepts/tasks.md) for fine-tuning, and [services](../../docs/concepts/services.md) for inference through the unified interface. + +1. Read the [NVIDIA DGX Spark in-depth review](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/) by the SGLang team. +2. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) +3. Follow [Quickstart](../../docs/quickstart.md) +4. Join [Discord](https://discord.gg/u8SmfwPpMd) + +!!! info "Acknowledgement" + Thanks to the [Graphsignal](https://graphsignal.com/) team for access to DGX Spark and for supporting testing and validation. Graphsignal provides inference observability tooling used to profile CUDA workloads during both training and inference. diff --git a/mkdocs/blog/posts/nvidia-dynamo.md b/mkdocs/blog/posts/nvidia-dynamo.md new file mode 100644 index 0000000000..1d2e63d66b --- /dev/null +++ b/mkdocs/blog/posts/nvidia-dynamo.md @@ -0,0 +1,211 @@ +--- +title: "Deploying NVIDIA Dynamo PD disaggregation with dstack" +date: 2026-06-10 +description: "Deploy NVIDIA Dynamo with Prefill-Decode disaggregation using dstack services." +slug: nvidia-dynamo +image: https://dstack.ai/static-assets/static-assets/images/nvidia-dynamo.png +categories: + - Changelog +--- + +# Deploying NVIDIA Dynamo PD disaggregation with dstack + +`dstack` is an open-source, AI-native orchestrator that works across clouds, Kubernetes clusters, on-prem fleets, hardware vendors, and frameworks. 
Alongside training, inference is one of the primary use cases `dstack` supports out of the box. + +With the latest update, `dstack` added native support for NVIDIA Dynamo with Prefill-Decode (PD) disaggregation, letting a service run a Dynamo router, prefill workers, and decode workers as separate replica groups. + + + + + +## About NVIDIA Dynamo + +[NVIDIA Dynamo](https://docs.nvidia.com/dynamo/getting-started/introduction) is an open-source, high-throughput, low-latency inference framework for serving generative AI workloads in distributed environments. It adds a system-level layer above inference engines such as SGLang, vLLM, and TensorRT-LLM, coordinating them across GPUs and nodes. + +Dynamo brings together disaggregated serving, intelligent routing, KV cache management, KV cache transfer, and automatic scaling to maximize throughput and minimize latency for LLM, reasoning, multimodal, and video generation workloads. + +!!! info "PD disaggregation" + Prefill-Decode disaggregation separates the two phases of LLM inference: prompt processing (prefill) and token generation (decode). Prefill is compute-bound and parallelizable. Decode is memory-bound and sequential. Running them as separate pools allows each phase to be sized and scaled independently. + +## PD disaggregation with dstack + +To deploy NVIDIA Dynamo with PD disaggregation, define a [service](../../docs/concepts/services.md) with three [replica groups](../../docs/concepts/services.md#replicas-and-scaling): + +- a Dynamo router +- prefill workers +- decode workers + +The router replica group declares `router: { type: dynamo }`. This tells `dstack` to route external traffic only to the router replica and to inject `DSTACK_ROUTER_INTERNAL_IP` into the worker replicas after the router is provisioned. + +This support was introduced in [`0.20.20`](https://github.com/dstackai/dstack/releases/tag/0.20.20). + +??? 
info "Prerequisites" + Running PD disaggregation on `dstack` requires a [fleet](../../docs/concepts/fleets.md) with [cluster placement](../../docs/concepts/fleets.md#cluster-placement), because prefill and decode workers need a fast interconnect for KV cache transfer. + + The prefill and decode replicas run on GPUs. The router replica can run on CPU, but it must run in the same cluster. + +## Deploying the service + +Here's a complete service configuration that deploys `zai-org/GLM-4.5-Air-FP8` with NVIDIA Dynamo, SGLang workers, and PD disaggregation on `dstack`: + +
    + +```yaml +type: service +name: dynamo-pd + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1 + docker: true + commands: + - apt-get update + - apt-get install -y python3-dev python3-venv + - python3 -m venv ~/dyn-venv + - source ~/dyn-venv/bin/activate + - pip install -U pip + - pip install "ai-dynamo[sglang]==1.1.1" + - git clone https://github.com/ai-dynamo/dynamo.git + # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. + - docker compose -f dynamo/dev/docker-compose.yml up -d + - | + python3 -m dynamo.frontend \ + --http-host 0.0.0.0 --http-port 8000 \ + --discovery-backend etcd --router-mode kv \ + --kv-cache-block-size 64 + resources: + cpu: 4 + router: + type: dynamo + + - count: 1..4 + scaling: + metric: rps + target: 3 + python: "3.12" + nvcc: true + commands: + # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica + # is provisioned. Compose the etcd/NATS endpoints from it. + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + # Set to enable /health endpoint required by dstack probes. + - export DYN_SYSTEM_PORT="8000" + # Wait until the router's etcd and NATS ports are actually accepting connections. 
+ - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode prefill --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + python: "3.12" + nvcc: true + commands: + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + - export DYN_SYSTEM_PORT="8000" + - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode decode --disaggregation-transfer-backend nixl + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +# Custom probe is required for PD disaggregation. +probes: + - type: http + url: /health + interval: 15s +``` + +
    + +The router replica group starts the Dynamo HTTP frontend and the NATS/etcd compose stack used by the workers. It declares `router: { type: dynamo }`, so `dstack` treats it as the service router. + +The prefill and decode replica groups use the router's internal IP to set `ETCD_ENDPOINTS` and `NATS_SERVER`, wait for those services to become reachable, then start `dynamo.sglang` in either `prefill` or `decode` mode. `DYN_SYSTEM_PORT=8000` exposes the `/health` endpoint required by the `dstack` [probe](../../docs/concepts/services.md#probes). + +In this setup, Dynamo uses etcd for worker discovery and NATS for worker and KV-cache events used by the router. NIXL handles KV cache transfer between prefill and decode workers. `dstack` handles provisioning, service exposure, health probes, and independent scaling of the prefill and decode replica groups. + +> With the `dynamo` router, `dstack` can run SGLang, vLLM, or TensorRT-LLM prefill and decode workers. + +Apply the configuration: + +
    + +```shell +$ HF_TOKEN=... +$ dstack apply -f dynamo-pd.dstack.yml +``` + +
    + +Once provisioning completes, `dstack` exposes a single OpenAI-compatible endpoint. Without a gateway, the endpoint is available through the server proxy: + +
    + +```shell +$ curl http://127.0.0.1:3000/proxy/services/main/dynamo-pd/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer ' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "zai-org/GLM-4.5-Air-FP8", + "messages": [ + { + "role": "user", + "content": "What is prefill-decode disaggregation?" + } + ], + "max_tokens": 1024 + }' +``` + +
    + +If a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint is available at `https://dynamo-pd./`. + +!!! info "Limitations" + - The router replica group must use `count: 1`. + - Services with a Dynamo router cannot configure `retry`, because workers cache the router's internal IP at provisioning time. + - In-place updates are blocked when they would replace the Dynamo router replica. If the router gets a new internal IP, already-running workers would still point to the old etcd and NATS endpoints. Stop the run and apply again for router-affecting changes. + - The `scaling` blocks use [`dstack` service autoscaling](../../docs/reference/dstack.yml/service.md#scaling), which currently scales replica groups based on `rps`. Support for scaling based on inference metrics such as TTFT and ITL is planned. + +## Why this matters + +Dynamo brings system-level inference optimizations such as disaggregated serving, KV-aware routing, KV cache transfer, and coordination across workers. `dstack` complements it with orchestration for provisioning compute, cluster placement, service exposure, health probes, and independent scaling of worker groups. + +With native Dynamo support, `dstack` streamlines high-throughput inference with leading open-source serving frameworks, while avoiding custom deployment glue. The same `dstack` orchestration layer can be used for training, inference, and development across GPU clouds, Kubernetes clusters, and on-prem fleets. + +## What's next? + +1. Read the [NVIDIA Dynamo example](../../docs/examples/inference/dynamo.md) +2. Read about [services](../../docs/concepts/services.md), [fleets](../../docs/concepts/fleets.md), and [gateways](../../docs/concepts/gateways.md) +3. 
Review the [NVIDIA Dynamo documentation](https://docs.nvidia.com/dynamo/getting-started/introduction) and [Dynamo GitHub repository](https://github.com/ai-dynamo/dynamo) +4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/pd-disaggregation.md b/mkdocs/blog/posts/pd-disaggregation.md new file mode 100644 index 0000000000..dd3f27c9e8 --- /dev/null +++ b/mkdocs/blog/posts/pd-disaggregation.md @@ -0,0 +1,143 @@ +--- +title: "Model inference with Prefill-Decode disaggregation" +date: 2026-02-19 +description: "TBA" +slug: pd-disaggregation +image: https://dstack.ai/static-assets/static-assets/images/dstack-pd-disaggregation.png +categories: + - Changelog +links: + - SGLang router integration: https://dstack.ai/blog/sglang-router/ +--- + +# Model inference with Prefill-Decode disaggregation + +While `dstack` started as a GPU-native orchestrator for development and training, over the last year it has increasingly brought inference to the forefront — making serving a first-class citizen. + + + +At the end of last year, we introduced [SGLang router](../posts/sglang-router.md) integration — bringing cache-aware routing to [services](../../docs/concepts/services.md). Today, building on that integration, we’re adding native Prefill–Decode (PD) disaggregation. + + + +Unlike many PD disaggregation setups tied to Kubernetes as the control plane, dstack does not depend on Kubernetes. It’s an open-source, GPU-native orchestrator that can provision GPUs directly in your cloud accounts or on bare-metal infrastructure — while also running on top of existing Kubernetes clusters if needed. + +For inference, `dstack` provides a [services](../../docs/concepts/services.md) abstraction. 
While remaining framework-agnostic, we integrate more deeply with leading open-source frameworks — [SGLang](https://github.com/sgl-project/sglang) being one of them for model inference. + +> If you’re new to Prefill–Decode disaggregation, see the official [SGLang docs](https://docs.sglang.io/advanced_features/pd_disaggregation.html). + +!!! note "Deprecation notice" + Configuring the SGLang router in a gateway is deprecated and will be disallowed in a future release. To run router and workers as separate replica groups, see [SGLang PD disaggregation (router as replica group)](../../docs/examples/inference/sglang.md#pd-disaggregation). + +## Services + +With `dstack` `0.20.10`, you can define a service with separate replica groups for Prefill and Decode workers and enable PD disaggregation directly in the `router` configuration. + +
    + +```yaml +type: service +name: glm45air + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +image: lmsysorg/sglang:latest + +replicas: + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + pd_disaggregation: true +``` + +
    + +Deploy it as usual: + +
    + +```shell +$ HF_TOKEN=... +$ dstack apply -f glm45air.dstack.yml +``` + +
    + +### Gateway + +Just like `dstack` relies on the SGLang router for cache-aware routing, Prefill–Decode disaggregation also requires a [gateway](../../docs/concepts/gateways.md#sglang) configured with the SGLang router. + +
    + +```yaml +type: gateway +name: inference-gateway + +backends: [kubernetes] +region: any + +domain: example.com + +router: + type: sglang + policy: cache_aware +``` + +
    + +## Limitations +* Because the SGLang router requires all workers to be on the same network, and `dstack` currently runs the router inside the gateway, the gateway and the service must be running in the same cluster. +* Autoscaling supports RPS as the metric for now; TTFT and ITL metrics are planned next. +* Prefill–Decode disaggregation is currently available with the SGLang backend (vLLM support is coming). + +With native support for inference and now Prefill–Decode disaggregation, `dstack` makes it easier to run high-throughput, low-latency model serving across GPU clouds, and Kubernetes or bare-metal clusters. + +## What's next? + +We’re working on PD disaggregation benchmarks and tuning guidance — coming soon. + +In the meantime: + +1. Read about [services](../../docs/concepts/services.md), [gateways](../../docs/concepts/gateways.md), and [fleets](../../docs/concepts/fleets.md) +2. Check out [Quickstart](../../docs/quickstart.md) +3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/probes.md b/mkdocs/blog/posts/probes.md new file mode 100644 index 0000000000..d3d85335aa --- /dev/null +++ b/mkdocs/blog/posts/probes.md @@ -0,0 +1,111 @@ +--- +title: Introducing service probes +date: 2025-08-14 +description: HTTP readiness probes for services, inspired by Kubernetes—safer rollouts and clear runtime visibility. +slug: probes +image: https://dstack.ai/static-assets/static-assets/images/dstack-service-probes.png +categories: + - Changelog +--- + +# Introducing service probes + +`dstack` services are long-running workloads—most often inference endpoints and sometimes web apps—that run continuously on GPU or CPU instances. They can scale across replicas and support rolling deployments. + +This release adds HTTP probes inspired by Kubernetes readiness probes. Probes periodically call an endpoint on each replica (for example, `/health`) to confirm it responds as expected. 
The result gives clear visibility into startup progress and, during rolling deployments, ensures traffic only shifts to a replacement replica after all configured probes have proven ready. + + + + + +When a service starts, replicas may need time to load models and initialize dependencies. Without probes, a replica is considered ready as soon as the container starts. With probes, readiness is based on real responses. + +Each probe sends an HTTP request to a configured endpoint at a set interval. A `2xx` response counts as success. + +## Configuration + +Probes can be set via the `probes` property in a service configuration: + +
    + +```yaml +type: service +name: llama31 + +python: 3.12 +env: + - HF_TOKEN +commands: + - uv pip install vllm + - | + vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --max-model-len 4096 \ + --tensor-parallel-size $DSTACK_GPUS_NUM +port: 8000 +model: meta-llama/Meta-Llama-3.1-8B-Instruct + +probes: + - type: http + url: /health + interval: 15s + +replicas: 2 + +resources: + gpu: 24GB..48GB +``` +
    + +In this example, `dstack` sends a GET `/health` request every 15 seconds to each replica. + +## Probe status + +
    + +```shell +$ dstack ps --verbose + + NAME BACKEND STATUS PROBES SUBMITTED + llama31 deployment=1 running 11 mins ago + replica=0 job=0 deployment=0 aws (us-west-2) running ✓ 11 mins ago + replica=1 job=0 deployment=1 aws (us-west-2) running × 1 min ago +``` + +
    + +In `dstack ps --verbose`, a replica shows `×` if the last probe failed, `~` while probes are succeeding but the [`ready_after`](../../docs/reference/dstack.yml/service.md#ready_after) threshold is not yet reached, and `✓` once the last `ready_after` checks have succeeded. Probes run for each replica while it is `running`. + +## Advanced configuration + +Probes support custom HTTP methods, headers (with environment variable interpolation), request bodies, timeouts, and multiple checks in sequence. For example: + +```yaml +env: + - PROBES_API_KEY +probes: + - type: http + method: post + url: /check-health + headers: + - name: X-API-Key + value: ${{ env.PROBES_API_KEY }} + - name: Content-Type + value: application/json + body: '{"level": 2}' + timeout: 20s +``` + +Note: request bodies are not allowed with `GET` or `HEAD` methods. + +## Rolling deployments + +During a rolling deployment, `dstack` starts a replacement replica, waits for it to be `running` and to pass its probes, then retires the old replica. This preserves availability while large models warm up. + +Probes give you visibility into the health of each replica. During rolling updates, they gate traffic so new replicas receive requests only after their checks pass. + +See [services](../../docs/concepts/services.md#probes) and the [reference](../../docs/reference/dstack.yml/service.md#probes) for all options. + +!!! info "What's next?" + 1. Check [Quickstart](../../docs/quickstart.md) + 2. Learn about [services](../../docs/concepts/services.md) + 3. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/prometheus.md b/mkdocs/blog/posts/prometheus.md new file mode 100644 index 0000000000..08aecb4cf5 --- /dev/null +++ b/mkdocs/blog/posts/prometheus.md @@ -0,0 +1,66 @@ +--- +title: "Exporting fleet and run metrics to Prometheus" +date: 2025-04-01 +description: "TBA" +slug: prometheus +image: https://dstack.ai/static-assets/static-assets/images/dstack-prometheus-v3.png +categories: + - Changelog +--- + +# Exporting GPU, cost, and other metrics to Prometheus + +## Why Prometheus { style="display:none" } + +Effective AI infrastructure management requires full visibility into compute performance and costs. AI researchers need +detailed insights into container- and GPU-level performance, while managers rely on cost metrics to track resource usage +across projects. + +While `dstack` provides key metrics through its UI and [`dstack metrics`](dstack-metrics.md) CLI, teams often need more granular data and prefer +using their own monitoring tools. To support this, we’ve introduced a new endpoint that allows real-time exporting all collected +metrics—covering fleets and runs—directly to Prometheus. + + + + + +## How to set it up + +To collect and export fleet and run metrics to Prometheus, set the +`DSTACK_ENABLE_PROMETHEUS_METRICS` environment variable. Once the server is running, configure Prometheus to pull +metrics from `/metrics`. + +Once Prometheus is set up, it will automatically pull metrics from the `dstack` server at the defined interval. + +With metrics now in Prometheus, you can use Grafana to create dashboards, whether to monitor all projects at once or +drill down into specific projects or users. 
+ + + +Overall, `dstack` collects three groups of metrics: + +| Group | Description | +|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Fleets** | Fleet metrics include details for each instance, such as running time, price, GPU name, and more. | +| **Runs** | Run metrics include run counters for each user in each project. | +| **Jobs** | A run consists of one or more jobs, each mapped to a container. Job metrics offer insights into execution time, cost, GPU model, NVIDIA DCGM telemetry, and more. | + +For a full list of available metrics and labels, check out [Metrics](../../docs/concepts/metrics.md). + +??? info "NVIDIA" + NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, + as well as for [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). + + To ensure NVIDIA DCGM metrics are collected from SSH fleets, ensure the `datacenter-gpu-manager-4-core`, + `datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts. + +??? info "AMD" + AMD device metrics are not yet collected for any backends. This support will be available soon. For now, AMD metrics are + only accessible through the UI and the [`dstack metrics`](dstack-metrics.md) CLI. + +!!! info "What's next?" + 1. See [Metrics](../../docs/concepts/metrics.md) + 1. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [fleets](../../docs/concepts/fleets.md) + 2. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/sglang-router.md b/mkdocs/blog/posts/sglang-router.md new file mode 100644 index 0000000000..f33fd7e400 --- /dev/null +++ b/mkdocs/blog/posts/sglang-router.md @@ -0,0 +1,173 @@ +--- +title: "SGLang router integration and disaggregated inference roadmap" +date: 2025-11-25 +description: "TBA" +slug: sglang-router +image: https://dstack.ai/static-assets/static-assets/images/dstack-sglang-router.png +categories: + - Changelog +--- + +# SGLang router integration and disaggregated inference roadmap + +[dstack](https://github.com/dstackai/dstack/) provides a streamlined way to handle GPU provisioning and workload orchestration across GPU clouds, Kubernetes clusters, or on-prem environments. Built for interoperability, dstack bridges diverse hardware and open-source tooling. + + + +As disaggregated, low-latency inference emerges, we aim to ensure this new stack runs natively on `dstack`. To move this forward, we’re introducing native integration between dstack and [SGLang’s Model Gateway](https://docs.sglang.ai/advanced_features/router.html) (formerly known as the SGLang Router). + + + +Although `dstack` can run on Kubernetes, it differs by offering higher-level abstractions that cover the core AI use cases: [dev environments](../../docs/concepts/dev-environments.md) for development, [tasks](../../docs/concepts/tasks.md) for training, and [services](../../docs/concepts/services.md) for inference. + +## Services + +Here’s an example of a service: + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen + + image: lmsysorg/sglang:latest + env: + - HF_TOKEN + - MODEL_ID=qwen/qwen2.5-0.5b-instruct + commands: + - | + python3 -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + port: 8000 + model: qwen/qwen2.5-0.5b-instruct + + resources: + gpu: 8GB..24GB:1 + ``` + +
    + +=== "AMD" +
    + + ```yaml + type: service + name: qwen + + image: lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x + env: + - HF_TOKEN + - MODEL_ID=qwen/qwen2.5-0.5b-instruct + commands: + - | + python3 -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + port: 8000 + model: qwen/qwen2.5-0.5b-instruct + + resources: + gpu: MI300X:1 + ``` + +
    + +This service can be deployed via the following command: + +
    + +```shell +$ HF_TOKEN=... +$ dstack apply -f qwen.dstack.yml +``` + +
    + +This deploys the service as an OpenAI-compatible endpoint and manages provisioning and replicas automatically. + +## Gateways + +If you'd like to enable auto-scaling, HTTPS, or use a custom domain, create a gateway: + +
    + + ```yaml + type: gateway + name: my-gateway + + backend: aws + region: eu-west-1 + + # Specify your custom domain + domain: example.com + ``` + +
    + +This gateway can be created via the following command: + +
    + +```shell +$ dstack apply -f gateway.dstack.yml +``` + +
    + +Once the gateway has a hostname, update your domain’s DNS settings by adding a record for `*.`. + +After that, if you configure [replicas and scaling](../../docs/concepts/services.md#replicas-and-scaling), the gateway will automatically scale the number of replicas and route traffic across them. + +### Router + +By default, the gateway uses its built-in load balancer to route traffic across replicas. With the latest release, you can instead delegate traffic routing to the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html) by setting the `router` property to `sglang`: + +
    + + ```yaml + type: gateway + name: my-gateway + + backend: aws + region: eu-west-1 + + # Specify your custom domain + domain: example.com + + router: + type: sglang + policy: cache_aware + ``` + +
    + +The `policy` property allows you to configure the routing policy: + +* `cache_aware` — Default policy; combines cache locality with load balancing, falling back to shortest queue. +* `power_of_two` — Samples two workers and picks the lighter one. +* `random` — Uniform random selection. +* `round_robin` — Cycles through workers in order. + +With this integration, K/V cache reuse across replicas becomes possible — a key step toward low-latency inference. It also sets the path for full disaggregated inference and native auto-scaling. And fundamentally, it reflects our commitment to collaborating with the open-source ecosystem instead of reinventing its core components. + +## Limitations and roadmap + +Looking ahead, this integration also shapes our roadmap. Over the coming releases, we plan to expand support in several key areas: + +* Enabling prefill and decode worker separation for full disaggregation (today, only standard workers are supported). +* Introducing auto-scaling based on TTFT (Time to First Token) and ITL (Inter-Token Latency), complementing the current requests-per-second scaling metric. +* Supporting multi-node replicas, enabling a single replica to span multiple nodes instead of being limited to one. +* Extending native support to more emerging inference stacks. + +## What's next? + +1. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), + and [gateways](../../docs/concepts/gateways.md) +2. Follow [Quickstart](../../docs/quickstart.md) +3. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/smg.md b/mkdocs/blog/posts/smg.md new file mode 100644 index 0000000000..94a3b173f8 --- /dev/null +++ b/mkdocs/blog/posts/smg.md @@ -0,0 +1,126 @@ +--- +title: "Deploying SGLang with PD disaggregation via Shepherd Model Gateway" +date: 2026-04-29 +description: "TBA" +slug: smg +image: https://dstack.ai/static-assets/static-assets/images/smg.png +categories: + - Changelog +--- + +# Deploying SGLang with PD disaggregation via Shepherd Model Gateway + +`dstack` is an open-source control plane that simplifies GPU orchestration for both training and inference — across cloud providers, hardware vendors, and frameworks. Over the past year, we've been steadily making inference a first-class citizen in dstack. + + + + + +## About SMG + +Today, we're taking the next step: native support for [Shepherd Model Gateway](https://lightseek.org/smg/) (SMG) — a high-performance inference gateway that has evolved from the SGLang Router into a standalone project under the [LightSeek Foundation](https://lightseek.org/). With the latest update, deploying SGLang with Prefill-Decode disaggregation on `dstack` becomes simpler and more flexible. + +Now a standalone project, SMG aims to support various serving backends — including SGLang, vLLM, and TensorRT-LLM. Written in Rust, it provides cache-aware routing, PD disaggregation, circuit breakers, rate limiting, and 40+ Prometheus metrics out of the box. + +!!! info "PD disaggregation" + Prefill-Decode disaggregation separates the two phases of LLM inference — prompt processing (prefill) and token generation (decode). Prefill is compute-bound and parallel; decode is memory-bound and sequential. Running them separately improves both Time to First Token (TTFT) and end-to-end latency. 
+ +Since 0.20.17, `dstack` supports deploying SGLang with PD disaggregation using Shepherd Model Gateway. To do it, define three replica groups: one for SMG, one for prefill workers, and one for decode workers. + +## How to use SMG with dstack + +Here's a complete service configuration that deploys `zai-org/GLM-4.5-Air-FP8` with PD disaggregation using SMG and SGLang on `dstack`: + +```yaml +type: service +name: prefill-decode +image: lmsysorg/sglang:v0.5.10.post1 +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 +replicas: + - count: 1 + # For now replica group with router must have count: 1 + commands: + - pip install smg + - | + smg launch \ + --host 0.0.0.0 \ + --port 8000 \ + --pd-disaggregation \ + --prefill-policy cache_aware + router: + type: sglang + resources: + cpu: 4 + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: H200 +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 +# Custom probe is required for PD disaggregation. +probes: + - type: http + url: /health + interval: 15s +``` + +The SMG replica group must define `router: sglang`. + +The configuration defines three replica groups. The first runs SMG as the router on a CPU node. The second and third run prefill and decode workers respectively, using [NIXL](https://github.com/ai-dynamo/nixl) for KV cache transfer between them. Prefill scales from 1 to 4 replicas and decode from 1 to 8, both based on requests per second. + +``` +$ HF_TOKEN=... 
+$ dstack apply -f prefill-decode.dstack.yml +``` + +Because `dstack` is not tied to any specific cloud or cluster manager, this same configuration works across any GPU cloud, any Kubernetes cluster, or any non-Kubernetes on-prem environment managed through `dstack` [fleets](../../docs/concepts/fleets.md). + +## What's coming next + +We're actively working on expanding the inference stack in `dstack`. Here's what's coming: + +- **gRPC** — enabling SMG's gRPC mode, which will also allow using vLLM with Shepherd Model Gateway for PD disaggregation. +- **NVIDIA Dynamo** — native support for NVIDIA's inference framework. +- **TTFT and ITL** — autoscaling based on Time to First Token and Inter-Token Latency, complementing the current RPS metric. +- **AMD** — validated configurations for running PD disaggregation on AMD Instinct GPUs. + +## Why vendor-agnostic? + +The inference stack is evolving fast — new serving engines, new routing strategies, new hardware. Teams shouldn't have to rebuild their orchestration every time a piece of the stack changes. `dstack` provides a stable, vendor-agnostic layer that lets you adopt the best tools for each job — whether that's SGLang or vLLM, NVIDIA or AMD, cloud or on-prem — without locking into any single vendor's platform. + +> Our commitment remains the same: simplify both training and inference across vendors through open-source. + +*Huge thanks to the SGLang community for collaboration and support. The gateway's evolution into a standalone project has been instrumental in making this integration possible.* + +!!! info "What's next?" + 1. Read about [services](https://dstack.ai/docs/concepts/services/), [gateways](https://dstack.ai/docs/concepts/gateways/), and [fleets](https://dstack.ai/docs/concepts/fleets/) + 2. Follow [Quickstart](https://dstack.ai/docs/quickstart/) + 3. 
Check out the [Shepherd Model Gateway](https://lightseek.org/smg/getting-started/) and [SGLang PD disaggregation](https://sgl-project.github.io/advanced_features/pd_disaggregation.html) documentation + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/blog/posts/state-of-cloud-gpu-2025.md b/mkdocs/blog/posts/state-of-cloud-gpu-2025.md new file mode 100644 index 0000000000..b9add79156 --- /dev/null +++ b/mkdocs/blog/posts/state-of-cloud-gpu-2025.md @@ -0,0 +1,146 @@ +--- +title: "The state of cloud GPUs in 2025: costs, performance, playbooks" +date: 2025-09-10 +description: "TBA" +slug: state-of-cloud-gpu-2025 +image: https://dstack.ai/static-assets/static-assets/images/cloud-gpu-providers.png +# categories: +# - Benchmarks +--- + +# The state of cloud GPUs in 2025: costs, performance, playbooks + +This is a practical map for teams renting GPUs — whether you’re a single project team fine-tuning models or a production-scale team managing thousand-GPU workloads. We’ll break down where providers fit, what actually drives performance, how pricing really works, and how to design a control plane that makes multi-cloud not just possible, but a competitive advantage. + + + +## A quick map of the market + +Two forces define the market: **Target scale** (from single nodes → racks → multi-rack pods) and **automation maturity** (manual VMs → basic Kubernetes → API-first orchestration). + + + +These axes split providers into distinct archetypes—each with different economics, fabrics, and operational realities. 
+ +### Categories at a glance + +| Category | Description | Examples | +| :---- | :---- | :---- | +| **Classical hyperscalers** | General-purpose clouds with GPU SKUs bolted on | AWS, Google Cloud, Azure, OCI | +| **Massive neoclouds** | GPU-first operators built around dense HGX or MI-series clusters | CoreWeave, Lambda, Nebius, Crusoe | +| **Rapidly-catching neoclouds** | Smaller GPU-first players building out aggressively | Runpod, DataCrunch, Voltage Park, TensorWave, Hot Aisle | +| **Cloud marketplaces** | Don’t own capacity; sell orchestration + unified API over multiple backends | NVIDIA DGX Cloud (Lepton), Modal, Lightning AI, dstack Sky | +| **DC aggregators** | Aggregate idle capacity from third-party datacenters, pricing via market dynamics | Vast.ai | + +> Massive neoclouds lead at extreme GPU scales. Hyperscalers may procure GPU capacity from these GPU-first operators for both training and inference. + +## Silicon reality check + +=== "NVIDIA" + **NVIDIA** remains the path of least resistance for most teams—CUDA and the NVIDIA Container Toolkit still lead in framework compatibility and tooling maturity. H100 is now table stakes and widely available across clouds, a reflection of billions in GPU capex flowing into the open market. GB200 takes it further with tightly coupled domains ideal for memory- and bandwidth-heavy prefill, while cheaper pools can handle lighter decode phases. + +=== "AMD" + **AMD** has now crossed the viability threshold with ROCm 6/7—native PyTorch wheels, ROCm containers, and upstream support in vLLM/SGLang mean OSS stacks work on “Day 0” if you standardize ROCm images. MI300X (192 GB) and MI350X (288 GB HBM3E) match or exceed NVIDIA on per-GPU memory and are increasingly listed by neoclouds. The new MI355X further pushes boundaries—designed for rack-scale AI, it packs massive HBM3E pools in high-density systems for ultra-large model throughput. 
+ +=== "TPU & Trainium" + **TPUs** and **Trainium** excel in tightly coupled training when you’re all-in on one provider, letting you amortize integration over years. The trade-offs—vendor lock-in, slower OSS support, and smaller ecosystems—make them viable mainly for multi-year, hyperscale workloads where efficiency outweighs migration cost. + +> **AMD** vs **NVIDIA** fit. MI300X matches H200 in capacity (192 GB vs 141 GB) but with more headroom for long-context prefill. MI325X (256 GB) is rolling out slowly, with many providers jumping to MI350X/MI355X (288 GB HBM3E). These top models exceed B200’s 192 GB, making them viable drop-ins where ROCm is ready; GB200/NVL still lead for ultra-low-latency collectives. + +## What you’re really buying + +The GPU SKU is only one piece. Real throughput depends on the system around it. Clusters are optional—until your workload forces them. + +| Dimension | Why it matters | Examples | +| :---- | :---- | :---- | +| **GPU memory** | Governs max batch size and KV-cache headroom, reducing parallelism overhead. | H100 (80 GB), H200 (~141 GB), B200 (~192 GB), MI300X (192 GB), MI325X (256 GB), MI350X/MI355X (288 GB). | +| **Fabric bandwidth** | Dictates all-reduce speed and MoE routing efficiency. Matters beyond a few nodes | 400 Gb/s – 3.2 Tb/s (e.g., 8×400 Gb/s NICs) | +| **Topology** | Low-diameter, uniform interconnect pods beat ad-hoc multi-rack for scale efficiency | HGX islands | +| **Local NVMe** | NVMe hides object-store latency for shards and checkpoints | Multi-TB local SSD per node is common on training SKUs | +| **Network volumes** | Removes “copy to every node” overhead | FSx for Lustre, Filestore, managed NFS; in HPC/neocloud setups, Vast and Weka are common. | +| **Orchestration** | Containers, placement, gang scheduling, autoscaling | K8s+Kueue, KubeRay, dstack, SLURM, vendor schedulers | + +## Pricing models – and what they hide + +Price tables don’t show availability risk. 
Commitments lower cost and increase the odds you get the hardware when you need it. + +| With commitments | No commitments | +| ----- | ----- | +| **Long-term (1–3 years)** Reserved or savings plans. 30–70% below on-demand. High capacity assurance, but utilization risk if needs shift. | **On-demand** Launch instantly—if quota allows. Highest $/hr. Limited availability for hot SKUs. | +| **Short-term (6–12 months)** Private offers, common with neoclouds. 20–60% off. Often includes hard capacity guarantees. | **Flex / queued** Starts when supply frees up. Cheaper than on-demand; runs capped in duration. | +| **Calendar capacity** Fixed-date bookings (AWS Capacity Blocks, GCP Calendar). Guarantees start time for planned runs. | **Spot / preemptible** 60–90% off. Eviction-prone; needs checkpointing/stateless design. | + +!!! info "Playbook" + Lock in calendar or reserved for steady base load or planned long runs. Keep urgent, interactive, and development/CI/CD work on on-demand. Push experiments and ephemeral runs to spot/flex. Always leave exit ramps to pivot to new SKUs. + +### Quotas, approvals, and the human factor + +Even listed SKUs may be gated. Hyperscalers and neoclouds enforce quotas and manual approvals—region by region—especially for new accounts on credits. If you can’t clear those gates, multi-cloud isn’t optional, it’s survival. + +### H100 pricing example + +Below is the price range for a single H100 SXM across providers. + + + +> Price is per GPU and excludes full CPU, disk amount and type, and network factors. 8xGPU multi-node setups with fast interconnects will cost more. + +For comparison, below is the price range for H100×GPU clusters across providers. + + + +> Most hyperscalers and neoclouds need short- or long-term contracts, though providers like Runpod, DataCrunch, and Nebius offer on-demand clusters. Larger capacity and longer commitments bring bigger discounts — Nebius offers up to 35% off for longer terms. 
+ +## New GPU generations – why they matter + +* **Memory and bandwidth scaling.** Higher HBM and faster interconnects expand batch size, context length, and per-node throughput. NVIDIA’s B300 and AMD’s MI355X push this further with massive HBM3E capacity and rack-scale fabrics, targeting ultra-large training runs. +* **Fabrics.** Each new generation often brings major interconnect upgrades — GB200 with NVLink5 (1.8 TB/s) and 800 Gb/s Infiniband, MI355X with PCIe Gen6 and NDR. These cut all-reduce and MoE latency, but only if the cloud deploys matching network infrastructure. Pairing new GPUs with legacy 400 Gb/s links can erase much of the gain. +* **Prefill vs decode.** Prefill (memory/bandwidth heavy) thrives on large HBM and tightly coupled GPUs like GB200 NVL72. Decode can run cheaper, on high-concurrency pools. Splitting them is a major cost lever. +* **Cascade.** Top-end SKUs arrive roughly every 18–24 months, with mid-cycle refreshes in between. Each launch pushes older SKUs down the price curve — locking in for years right before a release risks overpaying within months. + +!!! info "Prices" + H100 prices have dropped significantly in recent years due to new GPU generations and models like DeepSeek that require more memory. New generations include the H200 and B200. Only AWS has reduced H100 instance prices by 44%. H200 and later B200 prices are expected to follow the same trend. + + **AMD** MI300X pricing is also softening as MI350X/MI355X roll out, with some neoclouds undercutting H100/H200 on $/GPU-hr while offering more memory per GPU. + + +## Where provisioning is going + +The shift is from ad-hoc starts to time-bound allocations. + +Large runs are booked ahead; daily work rides elastic pools. Placement engines increasingly decide on region + provider + interconnect before SKU. 
The mindset moves from “more GPUs” to “higher sustained utilization.” + +## Control plane as the force multiplier + +A real multi-cloud control plane should: + +* **Be quota-aware and cost-aware** – place jobs where they’ll start fastest at the best $/SLO. +* **Maximize utilization** – keep GPUs busy with checkpointing, resumable pipelines, and efficient gang scheduling. +* **Enforce portability** – one container spec, CUDA+ROCm images, upstream framework compatibility, state in object storage. + +This turns capacity from individual silos into one fungible pool. + +## Final takeaways + +* **Price ≠ cost** — List price often explains <50% of total job cost on multi-node training; fabric and storage dominate at scale. +* **Match commitments to workload reality** — and leave room for next-gen hardware. +* **Multi-cloud isn’t backup, it’s strategy** – keep a warm secondary. +* **Watch AMD’s ramp-up** – the MI series is becoming production-ready, and MI355X availability is set to expand quickly as providers bring it online. +* **Control plane is leverage** – define once, run anywhere, at the cheapest viable pool. + +??? info "Scope & limitations of this report" + + - **Provider coverage.** The vendor set is a curated sample aligned with the dstack team’s view of the market. A limited group of community members and domain experts reviewed drafts. Corrections, reproducibility notes, and additional data points are welcome. + - **Methodology gaps.** We did not perform cross-vendor **price normalization** (CPU/RAM/NVMe/fabric adjustments, region effects, egress), controlled **microbenchmarks** (NCCL/all-reduce, MoE routing latency, KV-cache behavior, object store vs. parallel FS), or a full **orchestration capability matrix** (scheduler semantics, gang scheduling, quota APIs, preemption, multi-tenancy). 
+ - **Next steps.** We plan to publish price normalization, hardware/network microbenchmarks, and a scheduler capability matrix; preliminary harnesses are linked in the appendix. Contributors welcome. + + +> If you need a lighter, simpler orchestration and control-plane alternative to Kubernetes or Slurm, consider [dstack](https://github.com/dstackai/dstack/). +It’s open-source and self-hosted. + +??? info "dstack Sky" + If you want unified access to low-cost on-demand and spot GPUs across multiple clouds, try [dstack Sky](https://sky.dstack.ai/). + + + + You can use it with your own cloud accounts or through the cloud marketplace. diff --git a/mkdocs/blog/posts/toffee.md b/mkdocs/blog/posts/toffee.md new file mode 100644 index 0000000000..512218c1bb --- /dev/null +++ b/mkdocs/blog/posts/toffee.md @@ -0,0 +1,88 @@ +--- +title: "How Toffee streamlines inference and cuts GPU costs with dstack" +date: 2025-12-05 +description: "TBA" +slug: toffee +image: https://dstack.ai/static-assets/static-assets/images/dstack-toffee.png +categories: + - Case studies +links: + - Toffee's research blog: https://research.toffee.ai/blog/how-we-use-dstack-at-toffee +--- + +# How Toffee streamlines inference and cuts GPU costs with dstack + +In a recent engineering [blog post](https://research.toffee.ai/blog/how-we-use-dstack-at-toffee), Toffee shared how they use `dstack` to run large-language and image-generation models across multiple GPU clouds, while keeping their core backend on AWS. This case study summarizes key insights and highlights how `dstack` became the backbone of Toffee’s multi-cloud inference stack. + + + + + +[Toffee](https://toffee.ai) builds AI-powered experiences backed by LLMs and image-generation models. 
To serve these workloads efficiently, they combine: + +- **GPU neoclouds** such as [Runpod](https://www.runpod.io/) and [Vast.ai](https://vast.ai/) for flexible, cost-efficient GPU capacity +- **AWS** for core, non-AI services and backend infrastructure +- **dstack** as the orchestration layer that provisions GPU resources and exposes AI models via `dstack` [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) + +Most user-facing logic lives in AWS. The backend communicates with AI services through `dstack` gateways, each running on an EC2 instance inside Toffee’s AWS perimeter and exposed via Route 53 private hosted zones. `dstack`, in turn, manages GPU workloads on GPU clouds, abstracting away provider differences. + +Unlike the major hyperscalers (AWS, GCP, Azure), GPU neoclouds have historically offered more limited infrastructure-as-code (IaC) support, so teams often had to build their own tooling to provision and manage workloads at scale. + +Toffee ran LLM and image-generation workloads across several GPU providers, but: + +- Each provider had its own APIs and quirks +- Maintaining custom scripts and Terraform modules became increasingly painful as they scaled + +They needed **a unified orchestration layer** that: + +- Worked across their GPU providers +- Didn’t require Toffee to build and maintain its own orchestration platform + +`dstack` became the core of Toffee’s infrastructure by providing a declarative, cloud-agnostic way to provision GPUs and run services across multiple providers. + +> *Since we switched to `dstack`, we’ve cut the overhead of GPU-cloud orchestration by more than 50%. 
What used to take hours of custom Terraform + CLI scripting now deploys in minutes with a single declarative config — freeing us to focus on modelling, not infrastructure.* +> +> *— [Nikita Shupeyko](https://www.linkedin.com/in/nikita-shupeyko/), AI/ML & Cloud Infrastructure Architect at Toffee* + +Toffee primarily uses these `dstack` components: + +- [**Services**](../../docs/concepts/services.md) – to define and run inference endpoints for LLM and image-generation models, including replica counts and resource requirements +- [**Gateways**](../../docs/concepts/gateways.md) – EC2-based entry points inside AWS that expose `dstack` services to the Toffee backend as secure and auto-scalable model endpoints +- **Dashboard UI** – to manage active workloads, see where services are running, and track usage and cost across providers + +This architecture lets Toffee: + +- Deploy new AI services via declarative configs instead of hand-rolled scripts +- Switch between providers like GPU clouds without changing service code +- Keep all AI traffic flowing through their AWS network perimeter + +
    + +
    + +Beyond orchestration, Toffee relies on `dstack`’s UI as a central observability hub for their GPU workloads across GPU clouds. From the `dstack` UI, they can: + +- See all active runs with resource allocations, costs, and current status across providers +- Inspect service-level dashboards for each AI endpoint +- Drill down into replica-level metrics, incl. GPU and CPU utilization, memory consumption, and instance-level logs and configuration details. + + + +> *Thanks to dstack’s seamless integration with GPU neoclouds like Runpod and Vast.ai, we’ve been able to shift most workloads off hyperscalers — reducing our effective GPU spend by roughly 2–3× without changing a single line of model code.* +> +> *— [Nikita Shupeyko](https://www.linkedin.com/in/nikita-shupeyko/), AI/ML & Cloud Infrastructure Architect at Toffee* + +Before adopting `dstack`, there were serious drawbacks: + +- Significant **maintenance overhead** as they scaled to more services and providers +- Limited support for **zero-downtime deployments** and **autoscaling** +- Additional engineering effort required to build features that platforms like `dstack` already provided + +As Toffee’s user base and model footprint grew, investing further in home-grown orchestration stopped making sense. With `dstack` in place, Toffee’s model and product teams spend more time on experimentation and user experience, and less time firefighting and maintaining brittle tooling. + +*Huge thanks to Kamran and Nikita from Toffee’s team for sharing these insights. For more details, including the diagrams and some of the open-source code, check out the original blog post in Toffee's [research blog](https://research.toffee.ai/blog/how-we-use-dstack-at-toffee).* + +!!! info "What's next?" + 1. 
Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 2. Follow [Quickstart](../../docs/quickstart.md) + 3. Browse [Examples](../../docs/examples.md) diff --git a/mkdocs/blog/posts/tpu-on-gcp.md b/mkdocs/blog/posts/tpu-on-gcp.md new file mode 100644 index 0000000000..4a45af000b --- /dev/null +++ b/mkdocs/blog/posts/tpu-on-gcp.md @@ -0,0 +1,219 @@ +--- +title: Using TPUs for fine-tuning and deploying LLMs +date: 2024-09-10 +description: "Learn how to use TPUs with dstack for fine-tuning and deploying LLMs, leveraging open-source tools like Hugging Face’s Optimum TPU and vLLM." +slug: tpu-on-gcp +categories: + - Changelog +--- + +# Using TPUs for fine-tuning and deploying LLMs + +If you’re using or planning to use TPUs with Google Cloud, you can now do so via `dstack`. Just specify the TPU version and the number of cores +(separated by a dash), in the `gpu` property under `resources`. + +Read below to find out how to use TPUs with `dstack` for fine-tuning and deploying +LLMs, leveraging open-source tools like Hugging Face’s +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). + + + +Below is an example of a dev environment: + +
    + + ```yaml + type: dev-environment + name: vscode-tpu + + python: 3.11 + ide: vscode + + resources: + gpu: v2-8 + ``` + +
    + +If you've configured the `gcp` backend, `dstack` will automatically provision the dev environment with a TPU. + +> Currently, a maximum of 8 TPU cores can be specified, so the maximum supported values are `v2-8`, `v3-8`, `v4-8`, `v5litepod-8`, +> and `v5e-8`. Multi-host TPU support, allowing for larger numbers of cores, is coming soon. + +## Deployment + +You can use any serving framework, such as vLLM or TGI. Here's an example of a [service](https://dstack.ai/docs/services) that deploys +Llama 3.1 8B using +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://github.com/vllm-project/vllm). + +=== "Optimum TPU" + +
    + + ```yaml + type: service + name: llama31-service-optimum-tpu + + image: dstackai/optimum-tpu:llama31 + env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_TOTAL_TOKENS=4096 + - MAX_BATCH_PREFILL_TOKENS=4095 + commands: + - text-generation-launcher --port 8000 + port: 8000 + # Register the model + model: + format: tgi + type: chat + name: meta-llama/Meta-Llama-3.1-8B-Instruct + + # Uncomment to leverage spot instances + #spot_policy: auto + + resources: + gpu: v5litepod-4 + ``` +
    + + Once the [pull request](https://github.com/huggingface/optimum-tpu/pull/87) is merged, + the official Docker image can be used instead of `dstackai/optimum-tpu:llama31`. + +=== "vLLM" +
    + + ```yaml + type: service + name: llama31-service-vllm-tpu + + env: + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - HF_TOKEN + - DATE=20240828 + - TORCH_VERSION=2.5.0 + - VLLM_TARGET_DEVICE=tpu + - MAX_MODEL_LEN=4096 + commands: + - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install -r requirements-tpu.txt + - apt-get install -y libopenblas-base libopenmpi-dev libomp-dev + - python setup.py develop + - vllm serve $MODEL_ID + --tensor-parallel-size 4 + --max-model-len $MAX_MODEL_LEN + --port 8000 + port: 8000 + # Register the model + model: meta-llama/Meta-Llama-3.1-8B-Instruct + + # Uncomment to leverage spot instances + #spot_policy: auto + + resources: + gpu: v5litepod-4 + ``` +
    + +??? info "Control plane" + If you specify `model` when running a service, `dstack` will automatically register the model on + an OpenAI-compatible endpoint and allow you to use it for chat via the control plane UI. + + + +### Memory requirements + +Below are the approximate memory requirements for serving LLMs with their corresponding TPUs. + +| Model size | bfloat16 | TPU | int8 | TPU | +|------------|----------|--------------|-------|----------------| +| **8B** | 16GB | v5litepod-4 | 8GB | v5litepod-4 | +| **70B** | 140GB | v5litepod-16 | 70GB | v5litepod-16 | +| **405B** | 810GB | v5litepod-64 | 405GB | v5litepod-64 | + +Note, `v5litepod` is optimized for serving transformer-based models. Each core is equipped with 16GB of memory. + +### Supported frameworks + +| Framework | Quantization | Note | +|-----------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **TGI** | bfloat16 | To deploy with TGI, Optimum TPU must be used. | +| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request](https://github.com/vllm-project/vllm/pull/7005) for more details. | + +### Running a configuration + +Once the configuration is ready, run `dstack apply -f `, and `dstack` will automatically provision the +cloud resources and run the configuration. + +## Fine-tuning + +Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU](https://github.com/huggingface/optimum-tpu) +and the [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes) +dataset. + +
    + +```yaml +type: task +name: optimum-tpu-llama-train + +python: "3.11" + +env: + - HF_TOKEN +commands: + - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git + - mkdir -p optimum-tpu/examples/custom/ + - cp examples/single-node-training/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py + - cp examples/single-node-training/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml + - cd optimum-tpu + - pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install datasets evaluate + - pip install accelerate -U + - pip install peft + - python examples/custom/train.py examples/custom/config.yaml + + +resources: + gpu: v5litepod-8 +``` + +
    + +### Memory requirements + +Below are the approximate memory requirements for fine-tuning LLMs with their corresponding TPUs. + +| Model size | LoRA | TPU | +|------------|-------|--------------| +| **8B** | 16GB | v5litepod-8 | +| **70B** | 160GB | v5litepod-16 | +| **405B** | 950GB | v5litepod-64 | + +Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each core is equipped with 16GB of memory. + +### Supported frameworks + +| Framework | Quantization | Note | +|-----------------|--------------|---------------------------------------------------------------------------------------------------| +| **TRL** | bfloat16 | To fine-tune using TRL, Optimum TPU is recommended. TRL doesn't support Llama 3.1 out of the box. | +| **Pytorch XLA** | bfloat16 | | + +## What's next? + +1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), + [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and + [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). +2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](https://dstack.ai/docs/tasks), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). + +!!! info "Multi-host TPUs" + If you’d like to use `dstack` with more than eight TPU cores, upvote the corresponding + [issue](https://github.com/dstackai/dstack/issues/1337). diff --git a/mkdocs/blog/posts/volumes-on-runpod.md b/mkdocs/blog/posts/volumes-on-runpod.md new file mode 100644 index 0000000000..08f2e19126 --- /dev/null +++ b/mkdocs/blog/posts/volumes-on-runpod.md @@ -0,0 +1,139 @@ +--- +title: Using volumes to optimize cold starts on Runpod +date: 2024-08-13 +description: "Learn how to use volumes with dstack to optimize model inference cold start times on Runpod." 
+slug: volumes-on-runpod +categories: + - Changelog +--- + +# Using volumes to optimize cold starts on Runpod + +Deploying custom models in the cloud often faces the challenge of cold start times, including the time to provision a +new instance and download the model. This is especially relevant for services with autoscaling when new model replicas +need to be provisioned quickly. + +Let's explore how `dstack` optimizes this process using volumes, with an example of +deploying a model on Runpod. + + + +Suppose you want to deploy Llama 3.1 on Runpod as a [service](../../docs/concepts/services.md): + +
    + +```yaml +type: service +name: llama31-service-tgi + +replicas: 1..2 +scaling: + metric: rps + target: 30 + +image: ghcr.io/huggingface/text-generation-inference:latest +env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_INPUT_LENGTH=4000 + - MAX_TOTAL_TOKENS=4096 +commands: + - text-generation-launcher +port: 80 +# Register the model +model: meta-llama/Meta-Llama-3.1-8B-Instruct + +# Uncomment to leverage spot instances +#spot_policy: auto + +resources: + gpu: 24GB +``` + +
    + +When you run `dstack apply`, it creates a public endpoint with one service replica. `dstack` will then automatically scale +the service by adjusting the number of replicas based on traffic. + +When starting each replica, `text-generation-launcher` downloads the model to the `/data` folder. For Llama 3.1 8B, this +usually takes under a minute, but larger models may take longer. Repeated downloads can significantly affect +auto-scaling efficiency. + +Great news: Runpod supports network volumes, which we can use for caching models across multiple replicas. + +With `dstack`, you can create a Runpod volume using the following configuration: + +
    + +```yaml +type: volume +name: llama31-volume + +backend: runpod +region: EU-SE-1 + +# Required size +size: 100GB +``` + +
    + +Go ahead and create it via `dstack apply`: + +
    + +```shell +$ dstack apply -f runpod-volume.dstack.yml +``` + +
    + +Once the volume is created, attach it to your service by updating the configuration file and mapping the +volume name to the `/data` path. + +
    + +```yaml +type: service +name: llama31-service-tgi + +replicas: 1..2 +scaling: + metric: rps + target: 30 + +volumes: + - name: llama31-volume + path: /data + +image: ghcr.io/huggingface/text-generation-inference:latest +env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_INPUT_LENGTH=4000 + - MAX_TOTAL_TOKENS=4096 +commands: + - text-generation-launcher +port: 80 +# Register the model +model: meta-llama/Meta-Llama-3.1-8B-Instruct + +# Uncomment to leverage spot instances +#spot_policy: auto + +resources: + gpu: 24GB +``` + +
    + +In this case, `dstack` attaches the specified volume to each new replica. This ensures the model is downloaded only +once, reducing cold start time in proportion to the model size. + +A notable feature of Runpod is that volumes can be attached to multiple containers simultaneously. This capability is +particularly useful for auto-scalable services or distributed tasks. + +Using [volumes](../../docs/concepts/volumes.md) not only optimizes inference cold start times but also enhances the +efficiency of data and model checkpoint loading during training and fine-tuning. +Whether you're running [tasks](../../docs/concepts/tasks.md) or [dev environments](../../docs/concepts/dev-environments.md), leveraging +volumes can significantly streamline your workflow and improve overall performance. diff --git a/mkdocs/docs/concepts/backends.md b/mkdocs/docs/concepts/backends.md new file mode 100644 index 0000000000..33b904d9b9 --- /dev/null +++ b/mkdocs/docs/concepts/backends.md @@ -0,0 +1,1382 @@ +--- +title: Backends +description: Configuring cloud providers and Kubernetes clusters +--- + +# Backends + +Backends allow `dstack` to provision fleets across GPU clouds or Kubernetes clusters. + +`dstack` supports two types of backends: + + * [VM-based](#vm-based) – use `dstack`'s native integration with cloud providers to provision VMs, manage clusters, and orchestrate container-based runs. + * [Container-based](#container-based) – use either `dstack`'s native integration with cloud providers or Kubernetes to orchestrate container-based runs; provisioning in this case is delegated to the cloud provider or Kubernetes. + +!!! info "SSH fleets" + When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh-fleets) once the server is up. + +Backends can be configured via `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI. 
See the examples of backend configuration below. + +> If you update `~/.dstack/server/config.yml`, you have to restart the server. + +## VM-based + +VM-based backends allow `dstack` users to manage clusters and orchestrate container-based runs across a wide range of cloud providers. Under the hood, `dstack` uses native integrations with these providers to provision clusters on demand. + +Compared to [container-based](#container-based) backends, this approach offers finer-grained, simpler control over cluster provisioning and eliminates the dependency on a Kubernetes layer. + + + +### AWS + +There are two ways to configure AWS: using an access key or using the default credentials. + +=== "Default credentials" + + If you have default credentials set up (e.g. in `~/.aws/credentials`), configure the backend like this: + +
    + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + ``` + +
    + +=== "Access key" + + Create an access key by following [this guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-authentication-user.html#cli-authentication-user-get). + Once you've downloaded the `.csv` file with your IAM user's Access key ID and Secret access key, proceed to + configure the backend. + +
    + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: access_key + access_key: KKAAUKLIZ5EHKICAOASV + secret_key: pn158lMqSBJiySwpQ9ubwmI6VUU3/W2fdJdFwfgO + ``` + +
    + +??? info "Required permissions" + The following AWS policy permissions are sufficient for `dstack` to work: + + ``` + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:AttachVolume", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:CreatePlacementGroup", + "ec2:CancelSpotInstanceRequests", + "ec2:CreateSecurityGroup", + "ec2:CreateTags", + "ec2:CreateVolume", + "ec2:DeletePlacementGroup", + "ec2:DeleteVolume", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeCapacityReservations", + "ec2:DescribeImages", + "ec2:DescribeInstances", + "ec2:DescribeInstanceAttribute", + "ec2:DescribeInstanceTypes", + "ec2:DescribeRouteTables", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVpcs", + "ec2:DescribeVolumes", + "ec2:DetachVolume", + "ec2:RunInstances", + "ec2:TerminateInstances" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "servicequotas:ListServiceQuotas", + "servicequotas:GetServiceQuota" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:CreateTargetGroup", + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:AddTags", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:DeleteTargetGroup", + "elasticloadbalancing:DeleteListener", + "elasticloadbalancing:DeregisterTargets" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "acm:DescribeCertificate", + "acm:ListCertificates" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:GetInstanceProfile", + "iam:GetRole", + "iam:PassRole" + ], + "Resource": "*" + } + ] + } + ``` + + The `elasticloadbalancing:*` and `acm:*` permissions are only needed for provisioning gateways with ACM (AWS Certificate Manager) certificates.
+ + The `iam:*` permissions are only needed if you specify `iam_instance_profile` to assign to EC2 instances. + + The following additional permissions are required when running [multi-EFA instance types](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#network-cards) with `public_ips: true`: + + ``` + { + "Effect": "Allow", + "Action": [ + "ec2:AllocateAddress", + "ec2:AssociateAddress", + "ec2:DescribeAddresses", + "ec2:DisassociateAddress", + "ec2:ReleaseAddress" + ], + "Resource": "*" + } + ``` + + You can also limit permissions to specific resources in your account: + + ``` + { + "Version": "2012-10-17", + "Statement": [ + ... + { + "Effect": "Allow", + "Action": [ + "iam:GetInstanceProfile", + "iam:GetRole", + "iam:PassRole" + ], + "Resource": "arn:aws:iam::account-id:role/EC2-roles-for-XYZ-*" + } + ] + } + ``` + +??? info "VPC" + By default, `dstack` uses the default VPC. It's possible to customize it: + + === "vpc_name" + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + + vpc_name: my-vpc + ``` + + === "vpc_ids" + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + + default_vpcs: true + vpc_ids: + us-east-1: vpc-0a2b3c4d5e6f7g8h + us-east-2: vpc-9i8h7g6f5e4d3c2b + us-west-1: vpc-4d3c2b1a0f9e8d7 + ``` + + For the regions without configured `vpc_ids`, enable default VPCs by setting `default_vpcs` to `true`. + +??? info "Private subnets" + By default, `dstack` provisions instances with public IPs and permits inbound SSH traffic. + If you want `dstack` to use private subnets and provision instances without public IPs, set `public_ips` to `false`. + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + + public_ips: false + ``` + + Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. 
+ Additionally, private subnets must have outbound internet connectivity provided by NAT Gateway, Transit Gateway, or another mechanism. + +??? info "OS images" + By default, `dstack` uses its own [AMI](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) + optimized for `dstack`. + To use your own or other third-party images, set the `os_images` property: + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + + os_images: + cpu: + name: my-ami-for-cpu-instances + owner: self + user: dstack + nvidia: + name: 'Some ThirdParty CUDA image' + owner: 123456789012 + user: ubuntu + ``` + + Here, both `cpu` and `nvidia` properties are optional, but if the property is not set, you won't be able to use the corresponding instance types. + + The `name` is an AMI name. + The `owner` is either an AWS account ID (a 12-digit number) or a special value `self` indicating the current account. + The `user` specifies an OS user for instance provisioning. + + !!! info "Image requirements" + * SSH server listening on port 22 + * `user` with passwordless sudo access + * Docker is installed + * (For NVIDIA instances) NVIDIA/CUDA drivers and NVIDIA Container Toolkit are installed + * The firewall (`iptables`, `ufw`, etc.) must allow external traffic to port 22 and all traffic within the private subnet, and should forbid any other incoming external traffic. + +### Azure + +There are two ways to configure Azure: using a client secret or using the default credentials. + +=== "Default credentials" + + If you have default credentials set up, configure the backend like this: + +
    + + ```yaml + projects: + - name: main + backends: + - type: azure + subscription_id: 06c82ce3-28ff-4285-a146-c5e981a9d808 + tenant_id: f84a7584-88e4-4fd2-8e97-623f0a715ee1 + creds: + type: default + ``` + +
    + + If you don't know your `subscription_id` and `tenant_id`, use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + + ```shell + az account show --query "{subscription_id: id, tenant_id: tenantId}" + ``` + +=== "Client secret" + + A client secret can be created using the [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + + ```shell + SUBSCRIPTION_ID=... + az ad sp create-for-rbac \ + --name dstack-app \ + --role $DSTACK_ROLE \ + --scopes /subscriptions/$SUBSCRIPTION_ID \ + --query "{ tenant_id: tenant, client_id: appId, client_secret: password }" + ``` + + Once you have `tenant_id`, `client_id`, and `client_secret`, go ahead and configure the backend. + +
    + + ```yaml + projects: + - name: main + backends: + - type: azure + subscription_id: 06c82ce3-28ff-4285-a146-c5e981a9d808 + tenant_id: f84a7584-88e4-4fd2-8e97-623f0a715ee1 + creds: + type: client + client_id: acf3f73a-597b-46b6-98d9-748d75018ed0 + client_secret: 1Kb8Q~o3Q2hdEvrul9yaj5DJDFkuL3RG7lger2VQ + ``` + +
    + + If you don't know your `subscription_id`, use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + + ```shell + az account show --query "{subscription_id: id}" + ``` + +??? info "Required permissions" + The following Azure permissions are sufficient for `dstack` to work: + + ```json + { + "properties": { + "roleName": "dstack-role", + "description": "Minimal required permissions for using Azure with dstack", + "assignableScopes": [ + "/subscriptions/${YOUR_SUBSCRIPTION_ID}" + ], + "permissions": [ + { + "actions": [ + "Microsoft.Authorization/*/read", + "Microsoft.Compute/availabilitySets/*", + "Microsoft.Compute/locations/*", + "Microsoft.Compute/virtualMachines/*", + "Microsoft.Compute/virtualMachineScaleSets/*", + "Microsoft.Compute/cloudServices/*", + "Microsoft.Compute/disks/write", + "Microsoft.Compute/disks/read", + "Microsoft.Compute/disks/delete", + "Microsoft.ManagedIdentity/userAssignedIdentities/assign/action", + "Microsoft.ManagedIdentity/userAssignedIdentities/read", + "Microsoft.Network/networkSecurityGroups/*", + "Microsoft.Network/locations/*", + "Microsoft.Network/virtualNetworks/*", + "Microsoft.Network/networkInterfaces/*", + "Microsoft.Network/publicIPAddresses/*", + "Microsoft.Resources/subscriptions/resourceGroups/read", + "Microsoft.Resources/subscriptions/resourceGroups/write", + "Microsoft.Resources/subscriptions/read" + ], + "notActions": [], + "dataActions": [], + "notDataActions": [] + } + ] + } + } + ``` + + The `"Microsoft.Resources/subscriptions/resourceGroups/write"` permission is not required + if [`resource_group`](/docs/reference/server/config.yml/#azure) is specified. + +??? info "VPC" + By default, `dstack` creates new Azure networks and subnets for every configured region. 
+ It's possible to use custom networks by specifying `vpc_ids`: + + ```yaml + projects: + - name: main + backends: + - type: azure + creds: + type: default + regions: [westeurope] + vpc_ids: + westeurope: myNetworkResourceGroup/myNetworkName + ``` + + Alternatively, specify `subnet_ids` to target specific subnets: + + ```yaml + projects: + - name: main + backends: + - type: azure + creds: + type: default + regions: [westeurope] + subnet_ids: + westeurope: myNetworkResourceGroup/myNetworkName/mySubnetName + ``` + + +??? info "Private subnets" + By default, `dstack` provisions instances with public IPs and permits inbound SSH traffic. + If you want `dstack` to use private subnets and provision instances without public IPs, + specify custom networks using `vpc_ids` or `subnet_ids`, and set `public_ips` to `false`. + + ```yaml + projects: + - name: main + backends: + - type: azure + creds: + type: default + regions: [westeurope] + vpc_ids: + westeurope: myNetworkResourceGroup/myNetworkName + public_ips: false + ``` + + Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. + Additionally, private subnets must have outbound internet connectivity provided by [NAT Gateway or other mechanism](https://learn.microsoft.com/en-us/azure/nat-gateway/nat-overview). + +### GCP + +There are two ways to configure GCP: using a service account or using the default credentials. + +=== "Default credentials" + + Enable GCP application default credentials: + + ```shell + gcloud auth application-default login + ``` + + Then configure the backend like this: + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + project_id: gcp-project-id + creds: + type: default + ``` + +
    + +=== "Service account" + + To create a service account, follow [this guide](https://cloud.google.com/iam/docs/service-accounts-create). After setting up the service account [create a key](https://cloud.google.com/iam/docs/keys-create-delete) for it and download the corresponding JSON file. + + Then go ahead and configure the backend by specifying the downloaded file path. + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + project_id: my-gcp-project + creds: + type: service_account + filename: ~/.dstack/server/gcp-024ed630eab5.json + ``` + +
    + + ??? info "User interface" + If you are configuring the `gcp` backend on the [project settings page](projects.md#backends), + specify the contents of the JSON file in `data`: + +
    + + ```yaml + type: gcp + project_id: my-gcp-project + creds: + type: service_account + data: | + { + "type": "service_account", + "project_id": "my-gcp-project", + "private_key_id": "abcd1234efgh5678ijkl9012mnop3456qrst7890", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEv...rest_of_key...IDAQAB\n-----END PRIVATE KEY-----\n", + "client_email": "my-service-account@my-gcp-project.iam.gserviceaccount.com", + "client_id": "123456789012345678901", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-service-account%40my-gcp-project.iam.gserviceaccount.com", + "universe_domain": "googleapis.com" + } + ``` + +
    + +If you don't know your GCP project ID, use [Google Cloud CLI](https://cloud.google.com/sdk/docs/install-sdk): + +```shell +gcloud projects list --format="json(projectId)" +``` + +??? info "Required permissions" + The following GCP permissions are sufficient for `dstack` to work: + + ``` + compute.disks.create + compute.disks.delete + compute.disks.get + compute.disks.list + compute.disks.setLabels + compute.disks.use + compute.firewalls.create + compute.images.useReadOnly + compute.instances.attachDisk + compute.instances.create + compute.instances.delete + compute.instances.detachDisk + compute.instances.get + compute.instances.setLabels + compute.instances.setMetadata + compute.instances.setServiceAccount + compute.instances.setTags + compute.networks.get + compute.networks.updatePolicy + compute.projects.get + compute.regions.get + compute.regions.list + compute.reservations.list + compute.resourcePolicies.create + compute.resourcePolicies.delete + compute.routers.list + compute.subnetworks.list + compute.subnetworks.use + compute.subnetworks.useExternalIp + compute.zoneOperations.get + ``` + + If you plan to use TPUs, additional permissions are required: + + ``` + tpu.nodes.create + tpu.nodes.get + tpu.nodes.update + tpu.nodes.delete + tpu.operations.get + tpu.operations.list + ``` + + Also, the use of TPUs requires the `serviceAccountUser` role. + For TPU VMs, dstack will use the default service account. + + If you plan to use shared reservations, the `compute.reservations.list` + permission is required in the project that owns the reservations. + +??? info "Required APIs" + First, ensure the required APIs are enabled in your GCP `project_id`. + + ```shell + PROJECT_ID=... + gcloud config set project $PROJECT_ID + gcloud services enable cloudapis.googleapis.com + gcloud services enable compute.googleapis.com + ``` + +??? info "VPC" + + === "VPC" + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + project_id: gcp-project-id + creds: + type: default + + vpc_name: my-custom-vpc + ``` + +
    + + If you specify a non-default VPC, ensure it has a firewall rule + allowing all traffic within the VPC. This is needed for multi-node tasks to work. + The default VPC already permits traffic within the VPC. + + === "Shared VPC" + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + project_id: gcp-project-id + creds: + type: default + + vpc_name: my-custom-vpc + vpc_project_id: another-project-id + ``` + +
    + + When using a Shared VPC, ensure there is a firewall rule allowing `INGRESS` traffic on port `22`. + You can limit this rule to `dstack` instances using the `dstack-runner-instance` target tag. + + When using GCP gateways with a Shared VPC, also ensure there is a firewall rule allowing `INGRESS` traffic on ports `22`, `80`, `443`. + You can limit this rule to `dstack` gateway instances using the `dstack-gateway-instance` target tag. + + To use TPUs with a Shared VPC, you need to grant the TPU Service Account in your service project permissions + to manage resources in the host project by granting the "TPU Shared VPC Agent" (roles/tpu.xpnAgent) role + ([more in the GCP docs](https://cloud.google.com/tpu/docs/shared-vpc-networks#vpc-shared-vpc)). + +??? info "Private subnets" + By default, `dstack` provisions instances with public IPs and permits inbound SSH traffic. + If you want `dstack` to use private subnets and provision instances without public IPs, set `public_ips` to `false`. + + ```yaml + projects: + - name: main + backends: + - type: gcp + creds: + type: default + + public_ips: false + ``` + + Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. + Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. + +### Lambda + +Log into your [Lambda Cloud](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` +button to create a new API key. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: lambda + creds: + type: api_key + api_key: eersct_yrpiey-naaeedst-tk-_cb6ba38e1128464aea9bcc619e4ba2a5.iijPMi07obgt6TZ87v5qAEj61RVxhd0p +``` + +
    + +### Nebius + +Log into your [Nebius AI Cloud](https://console.eu.nebius.com/) account, navigate to Access, and select Service Accounts. Create a service account, add it to the editors group, and upload its authorized key. + +Then configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: nebius + creds: + type: service_account + service_account_id: serviceaccount-e00dhnv9ftgb3cqmej + public_key_id: publickey-e00ngaex668htswqy4 + private_key_file: ~/path/to/key.pem +``` + +
    + +??? info "Credentials file" + It's also possible to configure the `nebius` backend using a credentials file [generated](https://docs.nebius.com/iam/service-accounts/authorized-keys#create) by the `nebius` CLI: + +
    + + ```shell + $ nebius iam auth-public-key generate \ + --service-account-id \ + --output ~/.nebius/sa-credentials.json + ``` + +
    + + + ```yaml + projects: + - name: main + backends: + - type: nebius + creds: + type: service_account + filename: ~/.nebius/sa-credentials.json + ``` + +??? info "User interface" + If you are configuring the `nebius` backend on the [project settings page](projects.md#backends), + specify the contents of the private key file in `private_key_content`: + +
    + + ```yaml + type: nebius + creds: + type: service_account + service_account_id: serviceaccount-e00dhnv9ftgb3cqmej + public_key_id: publickey-e00ngaex668htswqy4 + private_key_content: | + -----BEGIN PRIVATE KEY----- + MIIJQQIBADANBgkqhkiG9w0BAQEFAASCCSswggknAgEAAoICAQChwQ5OOhy60N7m + cPx/9M0oRUyJdRRv2nCALbdU/wSDOo8o5N7sP63zCaxXPeKwLNEzneMd/U0gWSv2 + [...] + 8y1qYDPKQ8LR+DPCUmyhM2I8t6673Vz3GrtEjkLhgQo/KqOVb3yiBFVfkA5Jov5s + kO7y4T0ynsI8b6wlhCukQTLpIYJ5 + -----END PRIVATE KEY----- + ``` + +
    + +??? info "Projects" + If you have multiple projects per region, specify which ones to use, at most one per region. + +
    + + ```yaml + type: nebius + projects: + - project-e00jt6t095t1ahrg4re30 + - project-e01iahuh3cklave4ao1nv + creds: + type: service_account + service_account_id: serviceaccount-e00dhnv9ftgb3cqmej + public_key_id: publickey-e00ngaex668htswqy4 + private_key_file: ~/path/to/key.pem + ``` + +
    + + +### Crusoe + +Log into your [Crusoe](https://console.crusoecloud.com/) console and create an API key +under your account settings. Note your project ID from the project settings page. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: crusoe + project_id: your-project-id + creds: + type: access_key + access_key: your-access-key + secret_key: your-secret-key + regions: + - us-east1-a + - us-southcentral1-a +``` + +
    + +`regions` is optional. If not specified, all available Crusoe regions are used. + + + +### Verda (formerly DataCrunch) { #verda } + +Log into your [Verda](https://console.verda.com/signin) account, click Keys in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: verda + creds: + type: api_key + client_id: xfaHBqYEsArqhKWX-e52x3HH7w8T + client_secret: B5ZU5Qx9Nt8oGMlmMhNI3iglK8bjMhagTbylZy4WzncZe39995f7Vxh8 +``` + +
    + +### AMD Developer Cloud +Log into your [AMD Developer Cloud](https://amd.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: amddevcloud + project_name: my-amd-project + creds: + type: api_key + api_key: ... +``` + +
    + +??? info "Project" + If `project_name` is not set, the default project will be used. + +??? info "Required permissions" + The API key must have the following scopes assigned: + + * `account` - read + * `droplet` - create, read, update, delete, admin + * `project` - create, read, update, delete + * `regions` - read + * `sizes` - read + * `ssh_key` - create, read, update, delete + + +### Digital Ocean +Log into your [Digital Ocean](https://cloud.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: digitalocean + project_name: my-digital-ocean-project + creds: + type: api_key + api_key: ... +``` + +
    + +??? info "Project" + If `project_name` is not set, the default project will be used. + +??? info "Required permissions" + The API key must have the following scopes assigned: + + * `account` - read + * `droplet` - create, read, update, delete, admin + * `project` - create, read, update, delete + * `regions` - read + * `sizes` - read + * `ssh_key` - create, read, update, delete + +### Hot Aisle + +Log in to the SSH TUI as described in the [Hot Aisle Quick Start](https://hotaisle.xyz/quick-start/). +Create a new team and generate an API key for the member in the team. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: hotaisle + team_handle: hotaisle-team-handle + creds: + type: api_key + api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f +``` + +
    + +??? info "Required permissions" + The API key must have the following roles assigned: + + * **Owner role for the user** - Required for creating and managing SSH keys + * **Operator role for the team** - Required for managing virtual machines within the team + +??? info "Pricing" + `dstack` shows the hourly price for Hot Aisle instances. Some instances also require an upfront payment for a minimum reservation period, which is usually a few hours. You will be charged for the full minimum period even if you stop the instance early. + + See the Hot Aisle API for the minimum reservation period for each instance type: + +
    + + ```shell + $ curl -H "Authorization: Token $API_KEY" https://admin.hotaisle.app/api/teams/$TEAM_HANDLE/virtual_machines/available/ | jq ".[] | {gpus: .Specs.gpus, MinimumReservationMinutes}" + ``` + +
    + +### JarvisLabs + +Log into your [JarvisLabs](https://cloud.jarvislabs.ai/) account and create an API key. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: jarvislabs + creds: + type: api_key + api_key: ... +``` + +
    + +### CloudRift + +Log into your [CloudRift](https://console.cloudrift.ai/) console, click `API Keys` in the sidebar and click the button to create a new API key. + +Ensure you've created a project with CloudRift. + +Then proceed to configuring the backend. + +
    + +```yaml +projects: + - name: main + backends: + - type: cloudrift + creds: + type: api_key + api_key: rift_2prgY1d0laOrf2BblTwx2B2d1zcf1zIp4tZYpj5j88qmNgz38pxNlpX3vAo +``` + +
    + +### Vultr + +Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: vultr + creds: + type: api_key + api_key: B57487240a466624b48de22865589 +``` + +
    + +### OCI + +There are two ways to configure OCI: using client credentials or using the default credentials. + +=== "Default credentials" + If you have default credentials set up in `~/.oci/config`, configure the backend like this: + +
    + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: default + ``` + +
    + +=== "Client credentials" + + Log into the [OCI Console](https://cloud.oracle.com), go to `My profile`, + select `API keys`, and click `Add API key`. + + Once you add a key, you'll see the configuration file. Copy its values to configure the backend as follows: + +
    + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: client + user: ocid1.user.oc1..g5vlaeqfu47akmaafq665xsgmyaqjktyfxtacfxc4ftjxuca7aohnd2ev66m + tenancy: ocid1.tenancy.oc1..ajqsftvk4qarcfaak3ha4ycdsaahxmaita5frdwg3tqo2bcokpd3n7oizwai + region: eu-frankfurt-1 + fingerprint: 77:32:77:00:49:7c:cb:56:84:75:8e:77:96:7d:53:17 + key_file: ~/.oci/private_key.pem + ``` + +
    + + Make sure to include either the path to your private key via `key_file` or the contents of the key via `key_content`. + +??? info "Required permissions" + + This is an example of a restrictive policy for a group of `dstack` users: + + ``` + Allow group to read compartments in tenancy where target.compartment.name = '' + Allow group to read marketplace-community-listings in compartment + Allow group to manage app-catalog-listing in compartment + Allow group to manage instances in compartment + Allow group to manage compute-capacity-reports in compartment + Allow group to manage volumes in compartment + Allow group to manage volume-attachments in compartment + Allow group to manage virtual-network-family in compartment + ``` + + To use this policy, create a compartment for `dstack` and specify it in `~/.dstack/server/config.yml`. + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: default + compartment_id: ocid1.compartment.oc1..aaaaaaaa + ``` + +SSH fleets support the same features as [VM-based](#vm-based) backends. + +!!! info "What's next" + 1. See the [`~/.dstack/server/config.yml`](../reference/server/config.yml.md) reference + 2. Check [Projects](../concepts/projects.md) + +## Container-based + +Container-based backends allow `dstack` to orchestrate container-based runs either directly on cloud providers that support containers or on Kubernetes. +In this case, `dstack` delegates provisioning to the cloud provider or Kubernetes. + +Compared to [VM-based](#vm-based) backends, they offer less fine-grained control over provisioning but rely on the native logic of the underlying environment, whether that’s a cloud provider or Kubernetes. + + + +### Kubernetes + +Regardless of whether it’s on-prem Kubernetes or managed, `dstack` can orchestrate container-based runs across your clusters. 
A single `kubernetes` backend can manage one or many clusters — each cluster is selected via a kubeconfig [context](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/#context). + +The recommended way is to enable clusters explicitly via the `contexts` property: + +
    + +```yaml +projects: +- name: main + backends: + - type: kubernetes + + kubeconfig: + filename: ~/.kube/config + + contexts: + - name: gpu-cluster-a + - name: gpu-cluster-b +``` + +
    + +!!! info "Proxy jump" + To allow the `dstack` server and CLI to access runs via SSH, `dstack` uses a node in each cluster as a jump host to proxy SSH traffic into containers. No additional setup is required — `dstack` configures and manages the proxy automatically. + + By default, `dstack` autodetects the jump host: + + - `hostname` — picks the `ExternalIP` of the jump pod's node, or a random node `ExternalIP` from the cluster if the jump pod's node has none. If no node in the cluster has an `ExternalIP`, provisioning fails and you must set `hostname` explicitly. + - `port` — Kubernetes allocates a port from the cluster's NodePort range. + + Set `proxy_jump.hostname` and `proxy_jump.port` per context to override autodetection — useful when nodes lack `ExternalIP`s, or when you want a stable, firewall-friendly port: + + ```yaml + contexts: + - name: gpu-cluster-a + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + ``` + + Both fields are independent — you can set just one. + + The jump host can be a GPU node or a CPU-only node — it makes no difference. The only requirement is that both the `dstack` server and CLI can reach `hostname:port`. + +!!! info "Region and namespace" + Each enabled context becomes its own `dstack` region, named after the context. When creating a `dstack` [volume](volumes.md) or [gateway](gateways.md), the `region` field selects which cluster the resource is provisioned in. + + The namespace `dstack` uses for managed resources is taken from each kubeconfig context's `namespace` property, defaulting to `default` if not set: + + ```yaml + contexts: + - name: gpu-cluster-a + context: + cluster: gpu-cluster-a + user: kubernetes-admin + namespace: dstack + ``` + +??? info "User interface" + If you are configuring the `kubernetes` backend on the [project settings page](projects.md#backends), + specify the contents of the `kubeconfig` file in `data`: + +
    + + ```yaml + type: kubernetes + + kubeconfig: + data: | + apiVersion: v1 + kind: Config + + clusters: + - name: gpu-cluster-a + cluster: + server: https://gpu-cluster-a.internal.example.com:6443 + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t...LS0tLQo= + + users: + - name: kubernetes-admin + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t...LS0tLQo= + client-key-data: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0t...LS0tLQo= + + contexts: + - name: gpu-cluster-a + context: + cluster: gpu-cluster-a + user: kubernetes-admin + namespace: dstack + + contexts: + - name: gpu-cluster-a + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + ``` + +
    + +??? warning "Legacy configuration (without `contexts`)" + If `contexts` is not set, `dstack` falls back to using the kubeconfig's `current-context` as the only cluster, and the top-level `proxy_jump` and `namespace` properties apply: + +
    + + ```yaml + projects: + - name: main + backends: + - type: kubernetes + + kubeconfig: + filename: ~/.kube/config + + namespace: dstack + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + ``` + +
    + + This mode is not recommended and may be deprecated and removed in the future. It also has a namespace-handling quirk: the top-level `namespace` property **overrides** the kubeconfig context's namespace (defaulting to `default` if not set in the config), unlike the `contexts` mode where the kubeconfig is authoritative. A warning is logged when the two disagree. To prepare for a possible future change, set the same value in both your kubeconfig context and the backend config. + + With this configuration, the cluster's region is an empty string. When creating a `dstack` volume or gateway, set `region: ''` explicitly in the configuration. + + !!! warning "Migrating from legacy to `contexts`" + Switching an existing backend from the legacy mode to `contexts` is not transparent for already-provisioned resources: their region changes from an empty string to the context name, so `dstack` can no longer terminate them. Terminate all jobs, gateways, and volumes managed by the backend before changing the configuration. + +??? info "Required operators" + === "NVIDIA" + For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the + [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. + === "AMD" + For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the + [AMD GPU Operator](https://github.com/ROCm/gpu-operator) pre-installed. + + + +??? 
info "Required permissions" + The following Kubernetes permissions are sufficient for `dstack` to work: + + * Cluster-scoped resources: + ```yaml + --8<-- "snippets/kubernetes/dstack-backend-clusterrole.yaml" + ``` + * Namespaced resources (replace the `${NAMESPACE}` placeholder with an actual value): + ```yaml + --8<-- "snippets/kubernetes/dstack-backend-role.yaml" + ``` + + Ensure you've created a ClusterRoleBinding and RoleBinding to grant the roles to the user or the service account you're using. + +??? info "Resources and offers" + If you use ranges with [`resources`](../concepts/tasks.md#resources) (e.g. `gpu: 1..8` or `memory: 64GB..`) in fleet or run configurations, other backends collect and try all offers that satisfy the range. + + The `kubernetes` backend handles it differently. + + * For `gpu`, if you specify a range (e.g. `gpu: 4..8`), the `kubernetes` backend only provisions pods with the GPU count equal to the lower limit (`4`). The upper limit of the GPU range is always ignored. + * For other resources such as `cpu`, `memory`, and `disk`, the `kubernetes` backend passes the lower and upper limits of the range as Kubernetes [requests and limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits) respectively. If the upper limit is not set, the Kubernetes limit is also not set. + + Example: + +
    + + ```yaml + type: dev-environment + ide: vscode + + resources: + cpu: 32..64 + memory: 1024GB + disk: 100GB.. + gpu: nvidia:4..8 + ``` + +
    + + This translates to the following Kubernetes resource spec: + + | Resource | Request | Limit | + |---------------------|----------|-----------| + | `cpu` | `32` | `64` | + | `memory` | `1024Gi` | `1024Gi` | + | `ephemeral-storage` | `100Gi` | _not set_ | + | `nvidia.com/gpu` | `4` | `4` | + + This applies to offers shown in `dstack apply` (run plans), during provisioning, and in `dstack offer`. Unlike other backends, offers for the `kubernetes` backend always reflect the lower limit of the range. + +> To learn more, see the [Lambda](../examples/clusters/lambda/#kubernetes) and [Crusoe](../examples/clusters/crusoe/#kubernetes) examples. + +### Runpod + +Log into your [Runpod](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click +the button to create a Read & Write key. + +Then proceed to configuring the backend. + +
    + +```yaml +projects: + - name: main + backends: + - type: runpod + creds: + type: api_key + api_key: US9XTPDIV8AR42MMINY8TCKRB8S4E7LNRQ6CAUQ9 +``` + +
    + +??? info "Community Cloud" + By default, `dstack` considers instance offers only from the Secure Cloud. + To also include the + [Community Cloud](https://docs.runpod.io/references/faq/#secure-cloud-vs-community-cloud), + set `community_cloud: true` in the backend settings. + +
    + + ```yaml + projects: + - name: main + backends: + - type: runpod + creds: + type: api_key + api_key: US9XTPDIV8AR42MMINY8TCKRB8S4E7LNRQ6CAUQ9 + community_cloud: true + ``` + +
    + + You can tell Secure Cloud and Community Cloud apart by their regions. + Secure Cloud regions contain datacenter IDs such as `CA-MTL-3`. + Community Cloud regions contain country codes such as `CA`. + +
    + + ```shell + $ dstack apply -f .dstack.yml -b runpod + + # BACKEND REGION INSTANCE SPOT PRICE + 1 runpod CA NVIDIA A100 80GB PCIe yes $0.6 + 2 runpod CA-MTL-3 NVIDIA A100 80GB PCIe yes $0.82 + ``` + +
    + +### Vast.ai + +Log into your [Vast.ai](https://cloud.vast.ai/) account, click Account in the sidebar, and copy your +API Key. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: vastai + creds: + type: api_key + api_key: d75789f22f1908e0527c78a283b523dd73051c8c7d05456516fc91e9d4efd8c5 +``` + +
    + +??? info "Community Cloud" + By default, `dstack` includes both Server Cloud (datacenter) and Community Cloud offers. + To restrict offers to Server Cloud only, set `community_cloud: false` in the backend settings. + +
    + + ```yaml + projects: + - name: main + backends: + - type: vastai + creds: + type: api_key + api_key: d75789f22f1908e0527c78a283b523dd73051c8c7d05456516fc91e9d4efd8c5 + community_cloud: false + ``` + +
    + +Also, the `vastai` backend supports on-demand instances only. Spot instance support coming soon. diff --git a/mkdocs/docs/concepts/dev-environments.md b/mkdocs/docs/concepts/dev-environments.md new file mode 100644 index 0000000000..2e4bb73f11 --- /dev/null +++ b/mkdocs/docs/concepts/dev-environments.md @@ -0,0 +1,660 @@ +--- +title: Dev environments +description: Provisioning remote instances for cloud-based development +--- + +# Dev environments + +A dev environment lets you provision an instance and access it with your desktop IDE or SSH. + +??? info "Prerequisites" + Before running a dev environment, make sure you’ve [installed](../installation.md) the server and CLI, and created a [fleet](fleets.md). + +## Apply a configuration + +First, define a dev environment configuration as a YAML file. +The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` are both acceptable). + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +python: "3.11" +# Uncomment to use a custom Docker image +#image: huggingface/trl-latest-gpu + +# Comment if not required +ide: vscode + +# Uncomment to leverage spot instances +#spot_policy: auto + +resources: + gpu: 24GB +``` + +
    + +To run a dev environment, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f examples/.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 runpod CA-MTL-1 9xCPU, 48GB, A5000:24GB yes $0.11 + 2 runpod EU-SE-1 9xCPU, 43GB, A5000:24GB yes $0.11 + 3 gcp us-west4 4xCPU, 16GB, L4:24GB yes $0.214516 + +Submit the run vscode? [y/n]: y + +Launching `vscode`... +---> 100% + +To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+vscode/workflow + +To connect via SSH, use: `ssh vscode` +``` + +
    + +`dstack apply` automatically provisions an instance and sets up an IDE on it. + +The `ide` property supports `vscode`, `cursor`, `windsurf`, and `zed`. + +??? info "SSH-only" + The `ide` property is optional. If omitted, no IDE is pre-installed, but the dev environment + is still accessible via SSH: + +
    + + ```yaml + type: dev-environment + name: my-env + + python: "3.11" + + resources: + gpu: 24GB + ``` + +
    + +??? info "Windows" + On Windows, `dstack` works both natively and inside WSL. But, for dev environments, + it's recommended _not to use_ `dstack apply` _inside WSL_ due to a [VS Code issue](https://github.com/microsoft/vscode-remote-release/issues/937). + +To open the dev environment in your desktop IDE, use the link from the output +(such as `vscode://vscode-remote/ssh-remote+fast-moth-1/workflow`). + +![](../../assets/images/dstack-vscode-jupyter.png){ width=800 } + +??? info "SSH" + + Alternatively, while the CLI is attached to the run, you can connect to the dev environment via SSH: + +
    + + ```shell + $ ssh vscode + ``` + +
    + +## Configuration options + +### Initialization + +If you want to pre-configure the dev environment, specify the [`init`](../reference/dstack.yml/dev-environment.md#init) +property with a list of commands to run at startup: + +
    + +```yaml +type: dev-environment +name: vscode + +python: "3.11" +ide: vscode + +init: + - pip install wandb +``` + +
    + +### Resources + +When you specify a resource value like `cpu` or `memory`, +you can either use an exact value (e.g. `24GB`) or a +range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +ide: vscode + +resources: + # 16 or more x86_64 cores + cpu: 16.. + # 200GB or more RAM + memory: 200GB.. + # 4 GPUs from 40GB to 80GB + gpu: 40GB..80GB:4 + # Shared memory (required by multi-gpu) + shm_size: 16GB + # Disk size + disk: 500GB +``` + +
    + +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. + +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). + +If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. + + + +??? info "Shared memory" + If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure + `shm_size`, e.g. set it to `16GB`. + +> If you’re unsure which offers (hardware configurations) are available from the configured backends, use the +> [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. + +### Docker + +#### Default image + +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with + `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). + +Set the `python` property to pre-install a specific version of Python. + +
    + +```yaml +type: dev-environment +name: vscode + +python: 3.12 + +ide: vscode +``` + +
    + +#### NVCC + +By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/dev-environment.md#nvcc) property to true. + +
    + +```yaml +type: dev-environment +name: vscode + +python: 3.12 +nvcc: true + +ide: vscode +init: + - uv pip install flash_attn --no-build-isolation +``` + +
    + +#### Custom image + +If you want, you can specify your own Docker image via `image`. + +
    + +```yaml +type: dev-environment +name: vscode + +image: huggingface/trl-latest-gpu + +ide: vscode +``` + +
    + +#### Docker in Docker + +Set `docker` to `true` to enable the `docker` CLI in your dev environment, e.g., to run or build Docker images, or use Docker Compose. + +
    + +```yaml +type: dev-environment +name: vscode + +docker: true + +ide: vscode +init: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi +``` + +
    + +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. + +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/dev-environment.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry + +Use the [`registry_auth`](../reference/dstack.yml/dev-environment.md#registry_auth) property to provide credentials for a private Docker registry. + +
    + +```yaml +type: dev-environment +name: vscode + +env: + - NGC_API_KEY + +image: nvcr.io/nim/deepseek-ai/deepseek-r1-distill-llama-8b +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} + +ide: vscode +``` + +
    + +### Environment variables + +
    + +```yaml +type: dev-environment +name: vscode + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + +ide: vscode +``` + +
    + +If you don't assign a value to an environment variable (see `HF_TOKEN` above), +`dstack` will require the value to be passed via the CLI or set in the current process. + +??? info "System environment variables" + The following environment variables are available in any run by default: + + | Name | Description | + |-------------------------|--------------------------------------------------| + | `DSTACK_RUN_NAME` | The name of the run | + | `DSTACK_REPO_ID` | The ID of the repo | + | `DSTACK_GPUS_NUM` | The total number of GPUs in the run | + | `DSTACK_WORKING_DIR` | The working directory of the run | + | `DSTACK_REPO_DIR` | The directory where the repo is mounted (if any) | + +### Working directory + +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. + +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. + +The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). + + + +### Files + +Sometimes, when you run a dev environment, you may want to mount local files. This is possible via the [`files`](../reference/dstack.yml/task.md#_files) property. Each entry maps a local directory or file to a path inside the container. + +
    + +```yaml +type: dev-environment +name: vscode + +files: + - .:examples # Maps the directory with `.dstack.yml` to `<working dir>/examples` + - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +ide: vscode +``` + +
    + +If the local path is relative, it’s resolved relative to the configuration file. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). + +The container path is optional. If not specified, it will be automatically calculated: + +
    + +```yaml +type: dev-environment +name: vscode + +files: + - ../examples # Maps the parent directory of `.dstack.yml` to `<working dir>/../examples` + - ~/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +ide: vscode +``` + +
    + +??? info "File size" + Whether it's a file or folder, each entry is limited to 2MB. To avoid exceeding this limit, make sure to exclude unnecessary files + by listing them in `.gitignore` or `.dstackignore`. + The 2MB upload limit can be increased by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +### Repos + +Sometimes, you may want to clone an entire Git repo inside the container. + +Imagine you have a Git repo (cloned locally) containing an `examples` subdirectory with a `.dstack.yml` file: + +
    + +```yaml +type: dev-environment +name: vscode + +repos: + # Clones the repo from the parent directory (`examples/..`) to `<working dir>` + - .. + +ide: vscode +``` + +
    + +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. + +The local path can be either relative to the configuration file or absolute. + +??? info "Repo directory" + By default, `dstack` clones the repo to the [working directory](#working-directory). + + You can override the repo directory using either a relative or an absolute path: + +
    + + ```yaml + type: dev-environment + name: vscode + + repos: + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` + - ..:/my-repo + + ide: vscode + ``` + +
    + + > If the repo directory is relative, it is resolved against the [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: dev-environment + name: vscode + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + ide: vscode + ``` + + +??? info "Repo size" + The repo size is not limited. However, local changes are limited to 2MB. + To avoid exceeding this limit, exclude unnecessary files using `.gitignore` or `.dstackignore`. + You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +??? info "Repo URL" + Sometimes you may want to clone a Git repo within the container without cloning it locally. In this case, simply provide a URL in `repos`: + +
    + + ```yaml + type: dev-environment + name: vscode + + repos: + # Clone the repo to `<working dir>` + - https://github.com/dstackai/dstack + + ide: vscode + ``` + +
    + +??? info "Private repos" + If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from + `~/.ssh/config` or `~/.config/gh/hosts.yml`). + + > If you want to use custom credentials, make sure to pass them via [`dstack init`](../reference/cli/dstack/init.md) before submitting a run. + +Currently, you can configure up to one repo per run configuration. + +### Retry policy + +By default, if `dstack` can't find capacity or the instance is interrupted, the run will fail. + +If you'd like `dstack` to automatically retry, configure the +[retry](../reference/dstack.yml/dev-environment.md#retry) property accordingly: + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +ide: vscode + +retry: + # Retry on specific events + on_events: [no-capacity, error, interruption] + # Retry for up to 1 hour + duration: 1h +``` + +
    + +!!! info "Retry duration" + For the `no-capacity` event, the duration is counted as the age of the run; + for the `interruption` and `error` events, it is counted as the time passed since the last such event. + +### Inactivity duration + +Set [`inactivity_duration`](../reference/dstack.yml/dev-environment.md#inactivity_duration) +to automatically stop the dev environment after a configured period of inactivity. + +
    + +```yaml +type: dev-environment +name: vscode + +ide: vscode + +# Stop if inactive for 2 hours +inactivity_duration: 2h +``` + +
    + +The dev environment becomes inactive when you close the remote VS Code window, +close any `ssh <run name>` shells, and stop the `dstack apply` or `dstack attach` command. +If you go offline without stopping anything manually, the dev environment will also become inactive +within about 3 minutes. + +If `inactivity_duration` is configured for your dev environment, you can see how long +it has been inactive in `dstack ps --verbose` (or `-v`). + +
    + +```shell +$ dstack ps -v + NAME BACKEND RESOURCES PRICE STATUS SUBMITTED + vscode runpod 2xCPU, 8GB, $0.0286 running 8 mins ago + 100.0GB (disk) (inactive for 2m 34s) +``` + +
    + +If you reattach to the dev environment using [`dstack attach`](../reference/cli/dstack/attach.md), +the inactivity timer will be reset within a few seconds. + +??? info "In-place update" + As long as the configuration defines the `name` property, the value of `inactivity_duration` + can be changed for a running dev environment without a restart. + Just change the value in the configuration and run `dstack apply` again. + +
    + + ```shell + $ dstack apply -f .dstack.yml + + Detected configuration changes that can be updated in-place: ['inactivity_duration'] + Update the run? [y/n]: + ``` + +
    + +> `inactivity_duration` is not to be confused with [`idle_duration`](#idle-duration). +> The latter determines how soon the underlying cloud instance will be terminated +> _after_ the dev environment is stopped. + +### Utilization policy + +Sometimes it’s useful to track whether a dev environment is fully utilizing all GPUs. While you can check this with +[`dstack metrics`](../reference/cli/dstack/metrics.md), `dstack` also lets you set a policy to auto-terminate the run if any GPU is underutilized. + +Below is an example of a dev environment that auto-terminates if any GPU stays below 10% utilization for 1 hour. + +
    + +```yaml +type: dev-environment +name: my-dev + +python: 3.12 +ide: cursor + +resources: + gpu: H100:8 + +utilization_policy: + min_gpu_utilization: 10 + time_window: 1h +``` + +
    + +### Schedule + +Specify `schedule` to start a dev environment periodically at specific UTC times using the cron syntax: + +
    + +```yaml +type: dev-environment +ide: vscode +schedule: + cron: "0 8 * * mon-fri" # at 8:00 UTC from Monday through Friday +``` + +
    + +The `schedule` property can be combined with `max_duration` or `utilization_policy` to shutdown the dev environment automatically when it's not needed. + +??? info "Cron syntax" + `dstack` supports [POSIX cron syntax](https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07). One exception is that days of the week are started from Monday instead of Sunday so `0` corresponds to Monday. + + The month and day of week fields accept abbreviated English month and weekday names (`jan–dec` and `mon–sun`) respectively. + + A cron expression consists of five fields: + + ``` + ┌───────────── minute (0-59) + │ ┌───────────── hour (0-23) + │ │ ┌───────────── day of the month (1-31) + │ │ │ ┌───────────── month (1-12 or jan-dec) + │ │ │ │ ┌───────────── day of the week (0-6 or mon-sun) + │ │ │ │ │ + │ │ │ │ │ + │ │ │ │ │ + * * * * * + ``` + + The following operators can be used in any of the fields: + + | Operator | Description | Example | + |----------|-----------------------|-------------------------------------------------------------------------| + | `*` | Any value | `0 * * * *` runs every hour at minute 0 | + | `,` | Value list separator | `15,45 10 * * *` runs at 10:15 and 10:45 every day. | + | `-` | Range of values | `0 1-3 * * *` runs at 1:00, 2:00, and 3:00 every day. | + | `/` | Step values | `*/10 8-10 * * *` runs every 10 minutes during the hours 8:00 to 10:59. | + +### Spot policy + +By default, `dstack` uses on-demand instances. However, you can change that +via the [`spot_policy`](../reference/dstack.yml/dev-environment.md#spot_policy) property. It accepts `spot`, `on-demand`, and `auto`. + +--8<-- "docs/concepts/snippets/manage-fleets.ext" + +!!! info "Reference" + Dev environments support many more configuration options, + incl. 
[`backends`](../reference/dstack.yml/dev-environment.md#backends), + [`regions`](../reference/dstack.yml/dev-environment.md#regions), + [`max_price`](../reference/dstack.yml/dev-environment.md#max_price), and + [`max_duration`](../reference/dstack.yml/dev-environment.md#max_duration), + among [others](../reference/dstack.yml/dev-environment.md). + + +--8<-- "docs/concepts/snippets/manage-runs.ext" + +!!! info "What's next?" + 1. Read about [tasks](tasks.md) and [services](services.md) + 2. Learn how to manage [fleets](fleets.md) diff --git a/mkdocs/docs/concepts/events.md b/mkdocs/docs/concepts/events.md new file mode 100644 index 0000000000..d4057715a2 --- /dev/null +++ b/mkdocs/docs/concepts/events.md @@ -0,0 +1,75 @@ +--- +title: Events +description: Auditing resource state changes and operations +--- + +# Events + +Events provide a chronological record of notable state changes and operations affecting `dstack` resources. They are designed for auditing, debugging, and understanding the lifecycle of runs, jobs, fleets, and other resources. + +Each event includes the following fields: + +| Field | Description | +| --------- | ----------------------------------------------------------- | +| Timestamp | When the event occurred | +| Actor | The user or system that initiated the change, if applicable | +| Targets | The resources affected by the event | +| Message | A description of the change or additional event details | + +Events can be queried by targeting a specific resource or within a group of related resources. For example, you can query events targeting a particular job, or query events within a run, including the run itself and all of its jobs. + +Events are accessible through the UI, CLI, and API. + +## UI + +The UI allows you to query events either globally on the dedicated `Events` page or within a specific group on the page of a run, job, fleet, and other resources. 
+ +### Global page + +The global page shows events from all projects that the user has access to and allows filtering by many fields. + +![](https://dstack.ai/static-assets/static-assets/images/dstack-ui-events-global.png){ width=800 } + +This page allows you to query events targeting a specific resource or within a particular group. + +### Resource page + +The resource page shows events within that specific group. For example, if you open a run and switch to the `Events` tab, you will see all events about that run and its jobs. + +![](https://dstack.ai/static-assets/static-assets/images/dstack-ui-events-run.png){ width=800 } + +## CLI + +To query events via the CLI, use the `dstack event` command. This command provides several arguments that allow filtering by target and within scopes. + +Here is an example of querying all events within a particular run: + +
    + +```shell +$ dstack event --within-run cursor + +[2026-01-21 13:09:37] [👤admin] [run cursor] Run submitted. Status: SUBMITTED +[2026-01-21 13:09:37] [job cursor-0-0] Job created on run submission. Status: SUBMITTED +[2026-01-21 13:09:57] [job cursor-0-0] Job status changed SUBMITTED -> PROVISIONING +[2026-01-21 13:09:58] [job cursor-0-0, instance some-fleet-0] Instance created for job. Instance status: PROVISIONING +[2026-01-21 13:09:59] [run cursor] Run status changed SUBMITTED -> PROVISIONING +[2026-01-21 13:11:22] [job cursor-0-0] Job status changed PROVISIONING -> PULLING +[2026-01-21 13:11:49] [job cursor-0-0] Job status changed PULLING -> RUNNING +[2026-01-21 13:11:51] [run cursor] Run status changed PROVISIONING -> RUNNING +[2026-01-21 13:18:41] [👤admin] [run cursor] Run status changed RUNNING -> TERMINATING. Termination reason: STOPPED_BY_USER +[2026-01-21 13:18:48] [job cursor-0-0] Job status changed RUNNING -> TERMINATING. Termination reason: TERMINATED_BY_USER +[2026-01-21 13:19:05] [instance some-fleet-0, job cursor-0-0] Job unassigned from instance. Instance blocks: 0/1 busy +[2026-01-21 13:19:05] [job cursor-0-0] Job status changed TERMINATING -> TERMINATED +[2026-01-21 13:19:07] [run cursor] Run status changed TERMINATING -> TERMINATED +``` + +
    + +To see all supported arguments, check the [reference](../reference/cli/dstack/event.md). + +If you invoke the command without arguments, it will include all events targeting resources in the project. + +## TTL + +By default, `dstack` stores each event for 30 days and then deletes it. This can be overridden by server administrators using the `DSTACK_SERVER_EVENTS_TTL_SECONDS` environment variable. diff --git a/mkdocs/docs/concepts/exports.md b/mkdocs/docs/concepts/exports.md new file mode 100644 index 0000000000..39df234450 --- /dev/null +++ b/mkdocs/docs/concepts/exports.md @@ -0,0 +1,194 @@ +--- +title: Exports +description: Exporting resources across projects +--- + +# Exports + +Exports allow making resources from one project available to other projects. When a project exports a resource, +the specified importer projects can see and use it as if it were their own. + +!!! warning "Experimental" + Exports are an experimental feature. + Currently, [SSH fleets](fleets.md#ssh-fleets) and [gateways](gateways.md) can be exported. + +An export is created in the exporter project and specifies the resources to export and the +importer projects that will gain access to them. + +Once an export is created, the importer projects can see the exported resources in their resource lists and use them +for running tasks, dev environments, and services. Imported resources appear with a project prefix +(e.g., `team-a/my-fleet`) to distinguish them from the project's own resources. + +!!! info "Required project role" + The user creating or updating an export must have the project admin role on both the exporter project and + any importer project they add. Alternatively, a global admin can add any project as an importer. + +## Manage exports + +### Create exports + +Use the `dstack export create` command to create a new export. Specify the fleets to export +with `--fleet`, the gateways to export with `--gateway`, and the importer projects with `--importer`: + +
    + +```shell +$ dstack export create my-export --fleet my-fleet --gateway my-gateway --importer team-b + NAME FLEETS GATEWAYS IMPORTERS + my-export my-fleet my-gateway team-b + +``` + +
    + +`--fleet`, `--gateway`, and `--importer` can be specified multiple times: + +
    + +```shell +$ dstack export create shared-gpus --fleet gpu-fleet-1 --fleet gpu-fleet-2 --importer team-b --importer team-c + NAME FLEETS GATEWAYS IMPORTERS + shared-gpus gpu-fleet-1, gpu-fleet-2 - team-b, team-c + +``` + +
    + +### List exports + +Use `dstack export list` (or simply `dstack export`) to list all exports in the project: + +
    + +```shell +$ dstack export list + NAME FLEETS GATEWAYS IMPORTERS + my-export my-fleet my-gateway team-b + shared-gpus gpu-fleet-1, gpu-fleet-2 - team-b, team-c + +``` + +
    + +### Update exports + +Use the `dstack export update` command to add or remove fleets, gateways, and importers from an existing export: + +
    + +```shell +$ dstack export update my-export --add-fleet another-fleet --add-importer team-c + NAME FLEETS GATEWAYS IMPORTERS + my-export my-fleet, another-fleet my-gateway team-b, team-c + +``` + +
    + +To remove a fleet, gateway, or importer: + +
    + +```shell +$ dstack export update my-export --remove-importer team-b + NAME FLEETS GATEWAYS IMPORTERS + my-export my-fleet, another-fleet my-gateway team-c + +``` + +
    + +### Delete exports + +Use the `dstack export delete` command to delete an export. This revokes access for all importer projects: + +
    + +```shell +$ dstack export delete my-export +Delete the export my-export? [y/n]: y +Export my-export deleted +``` + +
    + +Use `-y` to skip the confirmation prompt. + +### Global exports + +Users with the global admin role can mark any export as a global export. Global exports are automatically imported into all projects, and their imports cannot be deleted. + +
    + +```shell +$ dstack export create global-export --gateway shared-gateway --global + NAME FLEETS GATEWAYS IMPORTERS + global-export - shared-gateway * + +``` + +Only promoting an export to global requires the global admin role. Regular project admins can add or remove resources, remove global status, or delete the export. + +
    + +## Access imported resources + +From the importer project's perspective, use `dstack import list` (or simply `dstack import`) to list all imports in the project — i.e., all exports from other projects that this project has been granted access to: + +
    + +```shell +$ dstack import list + NAME FLEETS GATEWAYS + team-a/my-export my-fleet, another-fleet my-gateway + +``` + +
    + +Imported fleets and gateways also appear in `dstack fleet list` and `dstack gateway list` in the `<project>/<name>` format: + +
    + +```shell +$ dstack fleet list + NAME NODES GPU SPOT BACKEND PRICE STATUS CREATED + my-local-fleet 1 - - ssh - active 3 days ago + team-a/my-fleet 2 A100:80GB:8 - ssh - active 1 week ago + team-a/another-fleet 1 H100:80GB:4 - ssh - active 2 days ago + +$ dstack gateway list + NAME BACKEND HOSTNAME DOMAIN DEFAULT STATUS + team-a/my-gateway aws (eu-west-1) 10.0.0.4 gtw.mycompany.example running + +``` + +
    + +Imported resources can be used for runs just like the project's own resources. + +
    + +```yaml +type: service +image: nginx +port: 80 + +gateway: team-a/my-gateway + +fleets: +- my-local-fleet +- team-a/my-fleet +``` + +
    + +!!! info "Tenant isolation" + Exported fleets share the same access model as regular fleets. See [Tenant isolation](../guides/tenant-isolation.md) for details. + +!!! info "What's next?" + 1. Check the [`dstack export` CLI reference](../reference/cli/dstack/export.md) + 1. Check the [`dstack import` CLI reference](../reference/cli/dstack/import.md) + 1. Learn how to manage [fleets](fleets.md) + 1. Learn how to manage [gateways](gateways.md) + 1. Read about [projects](projects.md) and project roles diff --git a/mkdocs/docs/concepts/fleets.md b/mkdocs/docs/concepts/fleets.md new file mode 100644 index 0000000000..b74bd60c08 --- /dev/null +++ b/mkdocs/docs/concepts/fleets.md @@ -0,0 +1,526 @@ +--- +title: Fleets +description: Managing pools of compute instances +--- + +# Fleets + +Before submitting runs, you must create a fleet. Fleets act as both pools of instances and templates for how those instances are provisioned. + +> `dstack` supports two fleet types: [backend fleets](#backend-fleet) (which are provisioned dynamically in the cloud or on Kubernetes), and [SSH fleets](#ssh-fleet) (which use existing on-prem servers). + +## Apply a configuration + +To create a fleet, define its configuration in a YAML file. The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `fleet.dstack.yml`), regardless of fleet type. + +=== "Backend fleets" + If you're using cloud providers or Kubernetes clusters and have configured the corresponding [backends](backends.md), create a backend fleet as follows: + +
    + + ```yaml + type: fleet + name: my-fleet + + # Allow provisioning of up to 2 instances + nodes: 0..2 + + # Uncomment to ensure instances are inter-connected + #placement: cluster + + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h + + resources: + # Allow provisioning of up to 8 GPUs + gpu: 0..8 + ``` + +
    + + Pass the fleet configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f fleet.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 gcp us-west4 2xCPU, 8GB, 100GB (disk) yes $0.010052 + 2 azure westeurope 2xCPU, 8GB, 100GB (disk) yes $0.0132 + 3 gcp europe-central2 2xCPU, 8GB, 100GB (disk) yes $0.013248 + + Create the fleet? [y/n]: y + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + my-fleet 0 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago + 1 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago + ``` + +
    + + If the `nodes` range starts with `0`, `dstack apply` creates only a template. Instances are provisioned only when you submit runs. + +=== "SSH fleets" + If you have a group of on-prem servers accessible via SSH, you can create an SSH fleet as follows: + +
    + + ```yaml + type: fleet + name: my-fleet + + # Uncomment if instances are interconnected + #placement: cluster + + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 + ``` + +
    + + Pass the fleet configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f fleet.dstack.yml + + Provisioning... + ---> 100% + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + my-fleet 0 ssh (remote) L4:24GB $0 idle 3 mins ago + 1 ssh (remote) L4:24GB $0 idle 3 mins ago + ``` + +
    + + `dstack apply` automatically connects to on-prem servers, installs the required dependencies, and adds them to the created fleet. + + ??? info "Host requirements" + 1. Hosts must be Linux-based and have Docker pre-installed. + + === "NVIDIA" + 2. Hosts with NVIDIA GPUs must also be pre-installed with CUDA 12.1 and + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + + === "AMD" + 2. Hosts with AMD GPUs must also be pre-installed with AMDGPU-DKMS kernel driver (e.g. via + [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/index.html) + or [AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html).) + + === "Tenstorrent" + 2. Hosts with Tenstorrent accelerators must be pre-installed with [Tenstorrent software](https://docs.tenstorrent.com/getting-started/README.html#software-installation). + This must include the drivers and HugePages. + + 3. The user specified must have passwordless `sudo` access. + + 4. The SSH server must be running and configured with `AllowTcpForwarding yes` in `/etc/ssh/sshd_config`. + + 5. The firewall must allow SSH and should forbid any other connections from external networks. + + 6. If `placement` is set to `cluster`, hosts must be able to communicate with each other. + +> Once the fleet is created, you can run [dev environments](dev-environments.md), [tasks](tasks.md), and [services](services.md). + +## Configuration options + +Backend fleets support [many options](../reference/dstack.yml/fleet.md); see some major configuration examples below. + +### Cluster placement + +Both [backend fleets](#backend-fleet) and [SSH fleets](#ssh-fleet) allow the `placement` property to be set to `cluster`. 
+ +This property ensures that instances are interconnected. This is required for running [distributed tasks](tasks.md#distributed-tasks). + +=== "Backend fleets" + Backend fleets allow you to provision interconnected clusters across supported backends. + +
    + + ```yaml + type: fleet + name: my-fleet + + nodes: 2 + placement: cluster + + resources: + gpu: H100:8 + ``` + +
    + + #### Backends + + Fast interconnect is supported on the `aws`, `gcp`, `nebius`, `crusoe`, and `kubernetes` backends. Some backends may require additional configuration. + + === "GCP" + On GCP, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. + Refer to the [GCP](../examples/clusters/gcp.md) examples for more details. + + === "Nebius" + On [Nebius](https://docs.nebius.com/compute/clusters/gpu), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. + + === "Crusoe" + On [Crusoe](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. + Refer to the [Crusoe](../examples/clusters/crusoe.md#vms) example for more details. + + === "Kubernetes" + If the Kubernetes cluster has interconnect configured, `dstack` can use it without additional setup. + See the [Lambda](../examples/clusters/lambda.md#kubernetes) or [Crusoe](../examples/clusters/crusoe.md#kubernetes) examples. + + + +=== "SSH fleets" + If the hosts in the SSH fleet have interconnect configured, you only need to set `placement` to `cluster`. + +
    + + ```yaml + type: fleet + name: my-fleet + + placement: cluster + + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 + ``` + +
    + + !!! info "Network" + By default, `dstack` automatically detects the network shared by the hosts. However, it's possible to configure it explicitly via the [`network`](../reference/dstack.yml/fleet.md#network) property. + + + +!!! info "Examples" + See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md). + +### Nodes + +The `nodes` property is supported only by backend fleets and specifies how many nodes `dstack` must or can provision. + +
    + +```yaml +type: fleet +name: my-fleet + +# Allow provisioning of up to 2 instances +nodes: 0..2 + +# Uncomment to ensure instances are inter-connected +#placement: cluster + +# Deprovision instances above the minimum if they remain idle +idle_duration: 1h + +resources: + # Allow provisioning of up to 8 GPUs + gpu: 0..8 +``` + +
    + +#### Pre-provisioning + +If the `nodes` range starts with `0`, `dstack apply` creates only a template, and instances are provisioned when you submit runs. + +To provision instances up front, set the `nodes` range to start above `0`. This pre-creates the initial number of instances; additional instances (if any) are provisioned on demand. + + +
    + + ```yaml + type: fleet + name: my-fleet + + nodes: 2..10 + + # Uncomment to ensure instances are inter-connected + #placement: cluster + + resources: + gpu: H100:8 + ``` + +
    + +Pre-provisioning is supported only for [VM-based backends](backends.md#vm-based). + +??? info "Target number" + To pre-provision more than the minimum number of instances, set the `target` parameter. + +
    + + ```yaml + type: fleet + name: my-fleet + + nodes: + min: 2 + max: 10 + target: 6 + + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h + ``` + +
    + + `dstack apply` pre-provisions up to `target` and scales back to `min` after `idle_duration`. + +### Resources + +Backend fleets allow you to specify the resource requirements for the instances to be provisioned. The `resources` property syntax is the same as for [run configurations](dev-environments.md#resources). + +### Spot policy + +Backend fleets allow you to specify a `spot_policy`. By default, it is set to `on-demand`. To use spot instances, set it to `auto` to allow both on-demand and spot instances, or to `spot` to allow only spot instances. + +
    + +```yaml +type: fleet +name: my-fleet + +nodes: 0..2 + +# Uncomment to ensure instances are inter-connected +#placement: cluster + +# Allows both on-demand and spot +spot_policy: auto + +idle_duration: 1h + +resources: + gpu: 0..8 +``` + +
    + +Note that run configurations must specify their own `spot_policy`, which is also set to `on-demand` by default. + +### Backends + +Backend fleets allow you to set `backends` to specify which backends are allowed to be used. + +### Idle duration + +By default, instances of a backend fleet stay `idle` for 3 days and can be reused within that time. +If an instance is not reused within this period, it is automatically terminated. + +To change the default idle duration, set +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) in the fleet configuration (e.g., `0s`, `1m`, or `off` for +unlimited). + +
    + +```yaml +type: fleet +name: my-fleet + +nodes: 2 + +# Terminate instances idle for more than 1 hour +idle_duration: 1h + +resources: + gpu: 24GB +``` + +
    + +### Blocks + +By default, a job uses the entire instance—e.g., all 8 GPUs. To allow multiple jobs on the same instance, set the `blocks` property to divide the instance. Each job can then use one or more blocks, up to the full instance. + +=== "Backend fleets" +
    + + ```yaml + type: fleet + name: my-fleet + + nodes: 0..2 + + resources: + gpu: H100:8 + + # Split into 4 blocks, each with 2 GPUs + blocks: 4 + ``` + +
    + +=== "SSH fleets" +
    + + ```yaml + type: fleet + name: my-fleet + + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - hostname: 3.255.177.51 + blocks: 4 + - hostname: 3.255.177.52 + # As many as possible, according to numbers of GPUs and CPUs + blocks: auto + - hostname: 3.255.177.53 + # Do not slice. This is the default value, may be omitted + blocks: 1 + ``` + +
    + +All resources (GPU, CPU, memory) are split evenly across blocks, while disk is shared. + +For example, with 8 GPUs, 128 CPUs, and 2TB RAM, setting `blocks` to `8` gives each block 1 GPU, 16 CPUs, and 256 GB RAM. + +Set `blocks` to `auto` to match the number of blocks to the number of GPUs. + +!!! info "Distributed tasks" + Distributed tasks require exclusive access to all host resources and therefore must use all blocks on each node. + +### SSH config + + + +#### Proxy jump + +If hosts are behind a head node (aka "login node"), configure [`proxy_jump`](../reference/dstack.yml/fleet.md#proxy_jump): + +
    + + ```yaml + type: fleet + name: my-fleet + + ssh_config: + user: ubuntu + identity_file: ~/.ssh/worker_node_key + hosts: + - 3.255.177.51 + - 3.255.177.52 + proxy_jump: + hostname: 3.255.177.50 + user: ubuntu + identity_file: ~/.ssh/head_node_key + ``` + +
    + +To be able to attach to runs, both explicitly with `dstack attach` and implicitly with `dstack apply`, you must either add the head node key (`~/.ssh/head_node_key`) to an SSH agent or configure a key path in `~/.ssh/config`: + +
    + + ``` + Host 3.255.177.50 + IdentityFile ~/.ssh/head_node_key + ``` + +
    + +where `Host` must match `ssh_config.proxy_jump.hostname` or `ssh_config.hosts[n].proxy_jump.hostname` if you configure head nodes on a per-worker basis. + +### Environment variables + +If needed, you can specify environment variables that will be automatically passed to any jobs running on this fleet. + +For example, these variables can be used to configure a proxy: + +```yaml +type: fleet +name: my-fleet + +env: + - HTTP_PROXY=http://proxy.example.com:80 + - HTTPS_PROXY=http://proxy.example.com:80 + - NO_PROXY=localhost,127.0.0.1 + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 +``` + +!!! info "Reference" + The fleet configuration file supports additional options, including [`instance_types`](../reference/dstack.yml/fleet.md#instance_types), [`max_price`](../reference/dstack.yml/fleet.md#max_price), and [`regions`](../reference/dstack.yml/fleet.md#regions), among others. For the complete list, see the [reference](../reference/dstack.yml/fleet.md). + +## Export fleets + +Fleets can be exported to other projects, allowing those projects to use the exported fleets +for running dev environments, tasks, and services. See [Exports](exports.md) for more details. + +## Manage fleets + +### List fleets + +The [`dstack fleet`](../reference/cli/dstack/fleet.md#dstack-fleet-list) command lists fleet instances and their status: + +
    + +```shell +$ dstack fleet + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + my-fleet 0 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago + 1 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago +``` + +
    + +### Delete fleets + +When a fleet isn't used by a run, you can delete it by passing the fleet configuration to `dstack delete`: + +
    + +```shell +$ dstack delete -f cluster.dstack.yaml +Delete the fleet my-gcp-fleet? [y/n]: y +Fleet my-gcp-fleet deleted +``` + +
    + +Alternatively, you can delete a fleet by passing the fleet name to `dstack fleet delete`. +To terminate and delete specific instances from a fleet, pass `-i INSTANCE_NUM`. + +### List offers + +To inspect offers available through a fleet, pass `--fleet` to `dstack offer`. + +
    + +```shell +$ dstack offer --gpu H100 --fleet my-fleet +``` + +
    + +Use `--group-by gpu,backend` to aggregate offers. + +!!! info "What's next?" + 1. Check [dev environments](dev-environments.md), [tasks](tasks.md), and + [services](services.md) + 2. Read the [Backends](backends.md) guide + 3. Learn how to [export fleets](exports.md) to other projects + 4. Explore the [`.dstack.yml` reference](../reference/dstack.yml/fleet.md) + 5. See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md) diff --git a/mkdocs/docs/concepts/gateways.md b/mkdocs/docs/concepts/gateways.md new file mode 100644 index 0000000000..b71a23d7b6 --- /dev/null +++ b/mkdocs/docs/concepts/gateways.md @@ -0,0 +1,262 @@ +--- +title: Gateways +description: Managing ingress traffic and endpoints for services +--- + +# Gateways + +Gateways manage ingress traffic for running [services](services.md), handle auto-scaling and rate limits, enable HTTPS, and allow you to configure a custom domain. + + + +## Apply a configuration + +First, define a gateway configuration as a YAML file in your project folder. +The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `gateway.dstack.yml` are both acceptable). + +
    + +```yaml +type: gateway +# A name of the gateway +name: example-gateway + +# Gateways are bound to a specific backend and region +backend: aws +region: eu-west-1 + +# This domain will be used to access the endpoint +domain: example.com +``` + +
    + +To create or update the gateway, simply call the [`dstack apply`](../reference/cli/dstack/apply.md) command: + +
    + +```shell +$ dstack apply -f gateway.dstack.yml +The example-gateway doesn't exist. Create it? [y/n]: y + +Provisioning... +---> 100% + + BACKEND REGION NAME HOSTNAME DOMAIN DEFAULT STATUS + aws eu-west-1 example-gateway example.com ✓ submitted +``` + +
    + +## Configuration options + +### Domain + +A gateway requires a `domain` to be specified in the configuration before creation. The domain is used to generate service endpoints (e.g. `<run name>.<gateway domain>`). + +Once the gateway is created and assigned a hostname, configure your DNS by adding a wildcard record for `*.<gateway domain>` (e.g. `*.example.com`). The record should point to the gateway's hostname and should be of type `A` if the hostname is an IP address (most cases), or of type `CNAME` if the hostname is another domain (some private gateways and Kubernetes). + +??? info "Project name interpolation" + You can use the `${{ run.project_name }}` variable to include the service’s project name in the domain name. This is especially useful when [exporting](exports.md) the gateway to multiple projects, as it ensures each importer receives a unique domain name. + + ```yaml + type: gateway + name: global-gateway + backend: aws + region: eu-west-1 + domain: ${{ run.project_name }}.mycompany.example + ``` + +### Backend + +You can create gateways with the `aws`, `azure`, `gcp`, or `kubernetes` backends, but that does not limit where services run. A gateway can use one backend while services run on any other backend supported by `dstack`, including backends where gateways themselves cannot be created. + +??? info "Kubernetes" + Gateways in the `kubernetes` backend require an external load balancer. Managed Kubernetes solutions usually include a load balancer. + For self-hosted Kubernetes, you must provide a load balancer yourself. + +### Router + +> In previous releases, `dstack` allowed configuring `router` on the gateway, which was required for PD disaggregation. Since 0.20.17, the `router` configuration has moved to [services](services.md#pd-disaggregation), and the gateway no longer needs a `router` configuration. + + + +### Certificate + +By default, when you run a service with a gateway, `dstack` provisions an SSL certificate via Let's Encrypt for the configured domain. 
This automatically enables HTTPS for the service endpoint. + +If you disable [public IP](#public-ip) (e.g. to make the gateway private) or if you simply don't need HTTPS, you can set `certificate` to `null`. + +> Note that, by default, services set [`https`](../reference/dstack.yml/service.md#https) to `true`, which requires a certificate. You can set `https` to `auto` to automatically detect whether the gateway supports HTTPS. + +??? info "Certificate types" + `dstack` supports the following certificate types: + + * `lets-encrypt` (default) — Automatic certificates via [Let's Encrypt](https://letsencrypt.org/). Requires a [public IP](#public-ip). + * `acm` — Certificates managed by [AWS Certificate Manager](https://aws.amazon.com/certificate-manager/). AWS-only. TLS is terminated at the load balancer, not at the gateway, and HTTP requests are redirected to HTTPS by the ALB. + Requires a VPC with at least two subnets in different availability zones to provision a load balancer. If `public_ip` is set to `false`, subnets must be private and have a route to a NAT gateway. + * `null` — No certificate. Services will use HTTP. + +### Public IP + +If you don't need a public IP for the gateway, you can set `public_ip` to `false` (the default is `true`), making the gateway private. + +Private gateways are currently supported in `aws` and `gcp` backends. + +
    + +```yaml +type: gateway +name: private-gateway + +backend: aws +region: eu-west-1 +domain: example.com + +public_ip: false +certificate: null +``` + +
    + +### Instance type + +By default, `dstack` provisions a small, low-cost instance for the gateway. If you expect to run high-traffic services, you can configure a larger instance type using the `instance_type` property. + +
    + +```yaml +type: gateway +name: example-gateway + +backend: aws +region: eu-west-1 + +instance_type: t3.large + +domain: example.com +``` + +
    + +### Replicas + +A gateway can have multiple replicas for improved availability. + +
    + +```yaml +type: gateway +name: example-gateway + +backend: aws +region: eu-west-1 + +domain: example.com + +certificate: null +replicas: 2 +``` + +
    + +To balance requests between gateway replicas, add DNS records for each replica or set up a load balancer outside of `dstack`. Replica hostnames are displayed in `dstack` CLI and UI. + +
    + +```shell +$ dstack gateway list + NAME BACKEND HOSTNAME DOMAIN DEFAULT STATUS + example-gateway example.com ✓ running + replica=0 aws (eu-west-1) 34.244.128.46 + replica=1 aws (eu-west-1) 18.201.201.174 +``` + +
    + +!!! warning "Experimental" + Replicated gateways are an experimental feature and currently have limitations: + + - Changing the number of replicas or redeploying replicas is not supported. + - HTTPS is not supported. Use an external load balancer for TLS termination. + - An unavailable gateway replica prevents any new services or service replicas from being added. + - All replicas are bound to the same backend and region. + - At most 3 replicas are allowed per gateway. + +!!! info "Reference" + For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). + +## Export gateways + +Gateways can be exported to other projects, allowing those projects to use the exported gateways +for running services. See [Exports](exports.md) for more details. + +## Manage gateways + +### List gateways + +The [`dstack gateway list`](../reference/cli/dstack/gateway.md#dstack-gateway-list) command lists existing gateways and their status. + +### Delete a gateway + +To delete a gateway, pass the gateway configuration to [`dstack delete`](../reference/cli/dstack/delete.md): + +
    + +```shell +$ dstack delete -f examples/inference/gateway.dstack.yml +``` + +
    + +Alternatively, you can delete a gateway by passing the gateway name to `dstack gateway delete`. + +[//]: # (TODO: Elaborate on default) + +[//]: # (TODO: ## Accessing endpoints) + +!!! info "What's next?" + 1. See [services](services.md) on how to run services diff --git a/mkdocs/docs/concepts/metrics.md b/mkdocs/docs/concepts/metrics.md new file mode 100644 index 0000000000..3bebef4c09 --- /dev/null +++ b/mkdocs/docs/concepts/metrics.md @@ -0,0 +1,207 @@ +--- +title: Metrics +description: Tracking and monitoring system metrics +--- + +# Metrics + +`dstack` automatically tracks essential metrics, which you can access via the CLI and UI. +You can also configure the `dstack` server to export metrics to Prometheus—this is required to access advanced metrics such as those from DCGM. + +## UI + +To access metrics via the UI, open the page of the corresponding run or job and switch to the `Metrics` tab: + +![](https://dstack.ai/static-assets/static-assets/images/dstack-newsletter-metrics.png){ width=800 } + +This tab displays key CPU, memory, and GPU metrics collected during the last hour of the run or job. + +## CLI + +As an alternative to the UI, you can track real-time essential metrics via the CLI. +The `dstack metrics` command displays the most recently tracked CPU, memory, and GPU metrics. + +
    + +```shell +dstack metrics gentle-mayfly-1 + + NAME STATUS CPU MEMORY GPU + gentle-mayfly-1 done 0% 16.27GB/2000GB gpu=0 mem=72.48GB/80GB util=0% + gpu=1 mem=64.99GB/80GB util=0% + gpu=2 mem=580MB/80GB util=0% + gpu=3 mem=4MB/80GB util=0% + gpu=4 mem=4MB/80GB util=0% + gpu=5 mem=4MB/80GB util=0% + gpu=6 mem=4MB/80GB util=0% + gpu=7 mem=292MB/80GB util=0% +``` + +
    + +## Prometheus + +To enable exporting metrics to Prometheus, set the +`DSTACK_ENABLE_PROMETHEUS_METRICS` environment variable and configure Prometheus to scrape metrics from +`<dstack server URL>/metrics`. + +In addition to the essential metrics available via the CLI and UI, `dstack` exports additional metrics to Prometheus, including data on fleets, runs, jobs, and DCGM metrics. + +??? info "NVIDIA DCGM" + NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, + as well as for [SSH fleets](../concepts/fleets.md#ssh-fleets). + + To ensure NVIDIA DCGM metrics are collected from SSH fleets, make sure the `datacenter-gpu-manager-4-core`, + `datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts. + +### Fleets + +Fleet metrics include metrics for each instance within a fleet. This includes information such as the instance's running +time, price, GPU name, and more. + +=== "Metrics" + | Name | Type | Description | Examples | + |------------------------------------------|-----------|-----------------------------------|--------------| + | `dstack_instance_duration_seconds_total` | *counter* | Total instance runtime in seconds | `1123763.22` | + | `dstack_instance_price_dollars_per_hour` | *gauge* | Instance price, USD/hour | `16.0` | + | `dstack_instance_gpu_count` | *gauge* | Instance GPU count | `4.0`, `0.0` | + +=== "Labels" + | Name | Type | Description | Examples | + |------------------------|-----------|:--------------|----------------------------------------| + | `dstack_project_name` | *string* | Project name | `main` | + | `dstack_fleet_name` | *string?* | Fleet name | `my-fleet` | + | `dstack_fleet_id` | *string?* | Fleet ID | `51e837bf-fae9-4a37-ac9c-85c005606c22` | + | `dstack_instance_name` | *string* | Instance name | `my-fleet-0` | + | `dstack_instance_id` | *string* | Instance ID | `8c28c52c-2f94-4a19-8c06-12f1dfee4dd2` | + | `dstack_instance_type` | *string?* | Instance type 
| `g4dn.xlarge` | + | `dstack_backend` | *string?* | Backend | `aws`, `runpod` | + | `dstack_gpu` | *string?* | GPU name | `H100` | + +### Runs + +Run metrics include run counters for each user in each project. + +=== "Metrics" + | Name | Type | Description | Examples | + |-------------------------------------|-----------|-------------------------------|----------| + | `dstack_run_count_total` | *counter* | The total number of runs | `537` | + | `dstack_run_count_terminated_total` | *counter* | The number of terminated runs | `118` | + | `dstack_run_count_failed_total` | *counter* | The number of failed runs | `27` | + | `dstack_run_count_done_total` | *counter* | The number of successful runs | `218` | + +=== "Labels" + + | Name | Type | Description | Examples | + |-----------------------|-----------|:--------------|-------------| + | `dstack_project_name` | *string* | Project name | `main` | + | `dstack_user_name` | *string* | User name | `alice` | + +### Jobs + +A run consists of one or more jobs, each mapped to an individual container. +For distributed workloads or auto-scalable services, a run spans multiple jobs. + +Job metrics provide detailed insights into each job within a run, including execution time, cost, GPU model, DCGM +telemetry, and more. 
+ +=== "Metrics" + + | Name | Type | Description | Examples | + |-------------------------------------------------|-----------|--------------------------------------------------------------------------------------------|----------------| + | `dstack_job_duration_seconds_total` | *counter* | Total job runtime in seconds | `520.37` | + | `dstack_job_price_dollars_per_hour` | *gauge* | Job instance price, USD/hour | `8.0` | + | `dstack_job_gpu_count` | *gauge* | Job GPU count | `2.0`, `0.0` | + | `dstack_job_cpu_count` | *gauge* | Job CPU count | `32.0` | + | `dstack_job_cpu_time_seconds_total` | *counter* | Total CPU time consumed by the job, seconds | `11.727975` | + | `dstack_job_memory_total_bytes` | *gauge* | Total memory allocated for the job, bytes | `4009754624.0` | + | `dstack_job_memory_usage_bytes` | *gauge* | Memory used by the job (including cache), bytes | `339017728.0` | + | `dstack_job_memory_working_set_bytes` | *gauge* | Memory used by the job (not including cache), bytes | `147251200.0` | + | `dstack_job_gpu_usage_ratio` | *gauge* | Job GPU usage, percent (as 0.0-1.0) | `0.93` | + | `dstack_job_gpu_memory_total_bytes` | *gauge* | Total GPU memory allocated for the job, bytes | `8589934592.0` | + | `dstack_job_gpu_memory_usage_bytes` | *gauge* | GPU memory used by the job, bytes | `1048576.0` | + | `DCGM_FI_DEV_GPU_UTIL` | *gauge* | GPU utilization (in %) | | + | `DCGM_FI_DEV_MEM_COPY_UTIL` | *gauge* | Memory utilization (in %) | | + | `DCGM_FI_DEV_ENC_UTIL` | *gauge* | Encoder utilization (in %) | | + | `DCGM_FI_DEV_DEC_UTIL` | *gauge* | Decoder utilization (in %) | | + | `DCGM_FI_DEV_FB_FREE` | *gauge* | Framebuffer memory free (in MiB) | | + | `DCGM_FI_DEV_FB_USED` | *gauge* | Framebuffer memory used (in MiB) | | + | `DCGM_FI_PROF_GR_ENGINE_ACTIVE` | *gauge* | The ratio of cycles during which a graphics engine or compute engine remains active | | + | `DCGM_FI_PROF_SM_ACTIVE` | *gauge* | The ratio of cycles an SM has at least 1 warp assigned | | + 
| `DCGM_FI_PROF_SM_OCCUPANCY` | *gauge* | The ratio of number of warps resident on an SM | | + | `DCGM_FI_PROF_PIPE_TENSOR_ACTIVE` | *gauge* | Ratio of cycles the tensor (HMMA) pipe is active | | + | `DCGM_FI_PROF_PIPE_FP64_ACTIVE` | *gauge* | Ratio of cycles the fp64 pipes are active | | + | `DCGM_FI_PROF_PIPE_FP32_ACTIVE` | *gauge* | Ratio of cycles the fp32 pipes are active | | + | `DCGM_FI_PROF_PIPE_FP16_ACTIVE` | *gauge* | Ratio of cycles the fp16 pipes are active | | + | `DCGM_FI_PROF_PIPE_INT_ACTIVE` | *gauge* | Ratio of cycles the integer pipe is active | | + | `DCGM_FI_PROF_DRAM_ACTIVE` | *gauge* | Ratio of cycles the device memory interface is active sending or receiving data | | + | `DCGM_FI_PROF_PCIE_TX_BYTES` | *counter* | The number of bytes of active PCIe tx (transmit) data including both header and payload | | + | `DCGM_FI_PROF_PCIE_RX_BYTES` | *counter* | The number of bytes of active PCIe rx (read) data including both header and payload | | + | `DCGM_FI_DEV_SM_CLOCK` | *gauge* | SM clock frequency (in MHz) | | + | `DCGM_FI_DEV_MEM_CLOCK` | *gauge* | Memory clock frequency (in MHz) | | + | `DCGM_FI_DEV_MEMORY_TEMP` | *gauge* | Memory temperature (in C) | | + | `DCGM_FI_DEV_GPU_TEMP` | *gauge* | GPU temperature (in C) | | + | `DCGM_FI_DEV_POWER_USAGE` | *gauge* | Power draw (in W) | | + | `DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION` | *counter* | Total energy consumption since boot (in mJ) | | + | `DCGM_FI_DEV_PCIE_REPLAY_COUNTER` | *counter* | Total number of PCIe retries | | + | `DCGM_FI_DEV_XID_ERRORS` | *gauge* | Value of the last XID error encountered | | + | `DCGM_FI_DEV_POWER_VIOLATION` | *counter* | Throttling duration due to power constraints (in us) | | + | `DCGM_FI_DEV_THERMAL_VIOLATION` | *counter* | Throttling duration due to thermal constraints (in us) | | + | `DCGM_FI_DEV_SYNC_BOOST_VIOLATION` | *counter* | Throttling duration due to sync-boost constraints (in us) | | + | `DCGM_FI_DEV_BOARD_LIMIT_VIOLATION` | *counter* | Throttling duration 
due to board limit constraints (in us) | | + | `DCGM_FI_DEV_LOW_UTIL_VIOLATION` | *counter* | Throttling duration due to low utilization (in us) | | + | `DCGM_FI_DEV_RELIABILITY_VIOLATION` | *counter* | Throttling duration due to reliability constraints (in us) | | + | `DCGM_FI_DEV_ECC_SBE_VOL_TOTAL` | *counter* | Total number of single-bit volatile ECC errors | | + | `DCGM_FI_DEV_ECC_DBE_VOL_TOTAL` | *counter* | Total number of double-bit volatile ECC errors | | + | `DCGM_FI_DEV_ECC_SBE_AGG_TOTAL` | *counter* | Total number of single-bit persistent ECC errors | | + | `DCGM_FI_DEV_ECC_DBE_AGG_TOTAL` | *counter* | Total number of double-bit persistent ECC errors | | + | `DCGM_FI_DEV_RETIRED_SBE` | *counter* | Total number of retired pages due to single-bit errors | | + | `DCGM_FI_DEV_RETIRED_DBE` | *counter* | Total number of retired pages due to double-bit errors | | + | `DCGM_FI_DEV_RETIRED_PENDING` | *counter* | Total number of pages pending retirement | | + | `DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS` | *counter* | Number of remapped rows for uncorrectable errors | | + | `DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS` | *counter* | Number of remapped rows for correctable errors | | + | `DCGM_FI_DEV_ROW_REMAP_FAILURE` | *gauge* | Whether remapping of rows has failed | | + | `DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL` | *counter* | Total number of NVLink flow-control CRC errors | | + | `DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL` | *counter* | Total number of NVLink data CRC errors | | + | `DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL` | *counter* | Total number of NVLink retries | | + | `DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL` | *counter* | Total number of NVLink recovery errors | | + | `DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL` | *counter* | Total number of NVLink bandwidth counters for all lanes | | + | `DCGM_FI_DEV_NVLINK_BANDWIDTH_L0` | *counter* | The number of bytes of active NVLink rx or tx data including both header and payload | | + | 
`DCGM_FI_PROF_NVLINK_RX_BYTES` | *counter* | The number of bytes of active NVLink rx (read) data including both header and payload | | + | `DCGM_FI_PROF_NVLINK_TX_BYTES` | *counter* | The number of bytes of active NVLink tx (transmit) data including both header and payload | | + +=== "Labels" + | Label | Type | Description | Examples | + |-----------------------|-----------|:-----------------------|----------------------------------------| + | `dstack_project_name` | *string* | Project name | `main` | + | `dstack_user_name` | *string* | User name | `alice` | + | `dstack_run_name` | *string* | Run name | `nccl-tests` | + | `dstack_run_id` | *string* | Run ID | `51e837bf-fae9-4a37-ac9c-85c005606c22` | + | `dstack_job_name` | *string* | Job name | `nccl-tests-0-0` | + | `dstack_job_id` | *string* | Job ID | `8c28c52c-2f94-4a19-8c06-12f1dfee4dd2` | + | `dstack_job_num` | *integer* | Job number | `0` | + | `dstack_replica_num` | *integer* | Replica number | `0` | + | `dstack_run_type` | *string* | Run configuration type | `task`, `dev-environment` | + | `dstack_backend` | *string* | Backend | `aws`, `runpod` | + | `dstack_gpu` | *string?* | GPU name | `H100` | + | `dstack_gpu_num`[^1] | *integer* | GPU number (0-based) | `0` | + + [^1]: For `dstack_gpu_*` metrics only. + +### Server health metrics + +These are operational metrics to monitor the health of the dstack server. For now, these only include HTTP metrics, but more will be added later. 
+ +=== "Metrics" + | Name | Type | Description | Examples | + |------------------------------------------|-----------|-----------------------------------|--------------| + | `dstack_server_requests_total` | *counter* | Total number of HTTP requests | `100.0` | + | `dstack_server_request_duration_seconds` | *histogram* | HTTP request duration in seconds | `1.0`| + +=== "Labels" + | Name | Type | Description | Examples | + |------------------------|-----------|:--------------|----------------------------------------| + | `method` | *string* | HTTP method | `POST` | + | `endpoint` | *string* | Endpoint path | `/api/project/main/repos/get` | + | `http_status` | *string* | HTTP status code | `200` | + | `project_name` | *string?* | Project name | `main` | diff --git a/mkdocs/docs/concepts/projects.md b/mkdocs/docs/concepts/projects.md new file mode 100644 index 0000000000..a1de7ffb96 --- /dev/null +++ b/mkdocs/docs/concepts/projects.md @@ -0,0 +1,69 @@ +--- +title: Projects +description: Organizing teams and isolating resources +--- + +# Projects + +Projects enable the isolation of different teams and their resources. Each project can configure its own backends and +control which users have access to it. + +> While project backends can be configured via [`~/.dstack/server/config.yml`](../reference/server/config.yml.md), +> use the UI to fully manage projects, users, and user permissions. + +## Project backends { #backends } + +In addition to [`~/.dstack/server/config.yml`](../reference/server/config.yml.md), +a global admin or a project admin can configure backends on the project settings page. + + + +## Global admins + +A user can be assigned or unassigned a global admin role on the user account settings page. This can only be done by +another global admin. + + + +The global admin role allows a user to manage all projects and users. + +## Project members + +A user can be added to a project and assigned or unassigned a project role on the project settings page. 
+ + + +### Project roles + +* **Admin** – The project admin role allows a user to manage the project's settings, + including backends, gateways, and members. +* **Manager** – The project manager role allows a user to manage project members. + Unlike admins, managers cannot configure backends and gateways. +* **User** – A user can manage project resources including runs, fleets, and volumes. + +## Project exports + +Projects can export resources such as fleets to other projects, allowing them to be used across team +boundaries. See [Exports](exports.md) for more details. + +## Authorization + +### User token + +Once created, a user is issued a token. This token can be found on the user account settings page. + + + +The token must be used for authentication when logging into the control plane UI +and when using the CLI or API. + +### Setting up the CLI + +You can configure multiple projects on the client and set the default project using the [`dstack project`](../reference/cli/dstack/project.md) CLI command. + +You can find the command on the project’s settings page: + + + +??? info "API" + In addition to the UI, managing projects, users, and user permissions can also be done via the [HTTP API](../reference/http/index.md). diff --git a/mkdocs/docs/concepts/secrets.md b/mkdocs/docs/concepts/secrets.md new file mode 100644 index 0000000000..0528e10c09 --- /dev/null +++ b/mkdocs/docs/concepts/secrets.md @@ -0,0 +1,130 @@ +--- +title: Secrets +description: Managing sensitive values and credentials +--- + +# Secrets + +Secrets allow centralized management of sensitive values such as API keys and credentials. They are project-scoped, managed by project admins, and can be referenced in run configurations to pass sensitive values to runs in a secure manner. + +!!! info "Secrets encryption" + By default, secrets are stored in plaintext in the DB. + Configure [server encryption](../guides/server-deployment.md#encryption) to store secrets encrypted. 
+ +## Manage secrets + +### Set + +Use the `dstack secret set` command to create a new secret: + +
    + +```shell +$ dstack secret set my_secret some_secret_value +OK +``` + +
    + +The same command can be used to update an existing secret: + +
    + +```shell +$ dstack secret set my_secret another_secret_value +OK +``` + +
    + +### List + +Use the `dstack secret list` command to list all secrets set in a project: + +
    + +```shell +$ dstack secret + NAME VALUE + hf_token ****** + my_secret ****** + +``` + +
    + +### Get + +The `dstack secret list` command does not show secret values. To see a secret value, use the `dstack secret get` command: + +
    + +```shell +$ dstack secret get my_secret + NAME VALUE + my_secret some_secret_value + +``` + +
    + +### Delete + +Secrets can be deleted using the `dstack secret delete` command: + +
    + +```shell +$ dstack secret delete my_secret +Delete the secret my_secret? [y/n]: y +OK +``` + +
    + +## Use secrets + +You can use the `${{ secrets.<secret_name> }}` syntax to reference secrets in run configurations. Currently, secrets interpolation is supported in `env` and `registry_auth` properties. + +### `env` + +Suppose you need to pass a sensitive environment variable to a run such as `HF_TOKEN`. You'd first create a secret holding the environment variable value: + +
    + +```shell +$ dstack secret set hf_token {hf_token_value} +OK +``` + +
    + +and then reference the secret in `env`: + +
    + +```yaml +type: service +env: + - HF_TOKEN=${{ secrets.hf_token }} +commands: + ... +``` + +
    + +### `registry_auth` + +If you need to pull a private Docker image, you can store registry credentials as secrets and reference them in `registry_auth`: + +
    + +```yaml +type: service +image: nvcr.io/nim/deepseek-ai/deepseek-r1-distill-llama-8b +registry_auth: + username: $oauthtoken + password: ${{ secrets.ngc_api_key }} +``` + +
    diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md new file mode 100644 index 0000000000..757546483b --- /dev/null +++ b/mkdocs/docs/concepts/services.md @@ -0,0 +1,1508 @@ +--- +title: Services +description: Deploying models and web apps as endpoints +--- + +# Services + +Services allow you to deploy models or web apps as secure and scalable endpoints. + +??? info "Prerequisites" + Before running a service, make sure you’ve [installed](../installation.md) the server and CLI, and created a [fleet](fleets.md). + +## Apply a configuration + +First, define a service configuration as a YAML file in your project folder. +The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` are both acceptable). + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --mem-fraction-static 0.8 \ + --context-length 262144 \ + --reasoning-parser qwen3 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + # Optional instance volume for model and runtime caches + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
    + +=== "AMD" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --mem-fraction-static 0.8 \ + --context-length 262144 \ + --reasoning-parser qwen3 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + # Optional instance volume for model and runtime caches + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
    + +The first startup on MI300X can take longer while SGLang compiles ROCm +kernels. + +To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f .dstack.yml + +Submit the run qwen36? [y/n]: y + +Provisioning... +---> 100% + +Service is published at: + http://localhost:3000/proxy/services/main/qwen36/ +Model Qwen/Qwen3.6-27B is published at: + http://localhost:3000/proxy/models/main/ +``` + +
    + +`dstack apply` automatically provisions instances and runs the service. + +If you do not have a [gateway](gateways.md) created, the service endpoint will be accessible at +`<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer <user token>' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming." + } + ] + }' +``` + +
    + +The request and response format depends on the serving framework used by the +service. Even for OpenAI-compatible endpoints, the format may vary slightly +across frameworks. + +If [authorization](#authorization) is not disabled, the service endpoint requires the `Authorization` header with `Bearer <dstack token>`. + +## Configuration options + + + +### Gateway + +Here are cases where a service may need a [gateway](gateways.md): + +* To use [auto-scaling](#replicas-and-scaling) or [rate limits](#rate-limits) +* To enable HTTPS for the endpoint and map it to your domain +* If your service requires WebSockets +* If your service cannot work with a [path prefix](#path-prefix) + + + +If you want `dstack` to explicitly validate that a gateway is used, you can set the [`gateway`](../reference/dstack.yml/service.md#gateway) property in the service configuration to `true`. In this case, `dstack` will raise an error during `dstack apply` if a default gateway is not created. + +You can also set the `gateway` property to the name of a specific gateway, if required. + +If you have a [gateway](gateways.md) created, the service endpoint will be accessible at `https://<run name>.<gateway domain>/`: + +
    + +```shell +$ curl https://llama31.example.com/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer <user token>' \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming." + } + ] + }' +``` + +
    + +### Replicas and scaling + +By default, `dstack` runs a single replica of the service. +You can configure the number of replicas as well as the auto-scaling rules. + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen36-service + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + # Optional instance volume for model and runtime caches + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + + replicas: 1..2 + scaling: + metric: rps + target: 1 + ``` + +
    + +=== "AMD" + +
    + + ```yaml + type: service + name: qwen36-service + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + # Optional instance volume for model and runtime caches + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + + replicas: 1..2 + scaling: + metric: rps + target: 1 + ``` + +
    + +The [`replicas`](../reference/dstack.yml/service.md#replicas) property can be a number or a range. + +The [`metric`](../reference/dstack.yml/service.md#metric) property of [`scaling`](../reference/dstack.yml/service.md#scaling) only supports the `rps` metric (requests per second). In this +case `dstack` adjusts the number of replicas (scales up or down) automatically based on the load. + +Setting the minimum number of replicas to `0` allows the service to scale down to zero when there are no requests. + +> The `scaling` property requires creating a [gateway](gateways.md). + + + +??? info "Replica groups" + A service can include multiple replica groups. Each group can define its own `commands`, `resources` requirements, and `scaling` rules. + +
    + + ```yaml + type: service + name: llama-8b-service + + image: lmsysorg/sglang:v0.5.10.post1 + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + + replicas: + - count: 1..2 + scaling: + metric: rps + target: 10 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + resources: + gpu: 48GB + + - count: 1..4 + scaling: + metric: rps + target: 5 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + resources: + gpu: 24GB + + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + ``` + +
    + + > Properties such as `regions`, `port`, `image`, `env` and some others cannot be configured per replica group. This support is coming soon. + +### PD disaggregation + + + +Since 0.20.17, `dstack` supports serving a model using Prefill-Decode disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers. + +`dstack` integrates with two routers for PD disaggregation: [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html) and [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo). + +#### NVIDIA + +Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: + +=== "SMG" + +
    + + ```yaml + type: service + name: prefill-decode + image: lmsysorg/sglang:v0.5.10.post1 + + env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + + replicas: + - count: 1 + # For now replica group with router must have count: 1 + commands: + - pip install smg + - | + smg launch \ + --host 0.0.0.0 \ + --port 8000 \ + --pd-disaggregation \ + --prefill-policy cache_aware + resources: + cpu: 4 + router: + type: sglang + + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --port 8000 + resources: + gpu: H200 + + port: 8000 + model: zai-org/GLM-4.5-Air-FP8 + + # Custom probe is required for PD disaggregation. + probes: + - type: http + url: /health + interval: 15s + ``` + +
    + + > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + +=== "Dynamo" + +
    + + ```yaml + type: service + name: dynamo-pd + + env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + + replicas: + - count: 1 + docker: true + commands: + - apt-get update + - apt-get install -y python3-dev python3-venv + - python3 -m venv ~/dyn-venv + - source ~/dyn-venv/bin/activate + - pip install -U pip + - pip install "ai-dynamo[sglang]==1.1.1" + - git clone https://github.com/ai-dynamo/dynamo.git + # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. + - docker compose -f dynamo/dev/docker-compose.yml up -d + - | + python3 -m dynamo.frontend \ + --http-host 0.0.0.0 --http-port 8000 \ + --discovery-backend etcd --router-mode kv \ + --kv-cache-block-size 64 + resources: + cpu: 4 + router: + type: dynamo + + - count: 1..4 + scaling: + metric: rps + target: 3 + python: "3.12" + nvcc: true + commands: + # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica + # is provisioned. Compose the etcd/NATS endpoints from it. + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + # Set to enable /health endpoint required by dstack probes. + - export DYN_SYSTEM_PORT="8000" + # Wait until the router's etcd and NATS ports are actually accepting connections. 
+ - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode prefill --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + python: "3.12" + nvcc: true + commands: + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + - export DYN_SYSTEM_PORT="8000" + - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode decode --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + port: 8000 + model: zai-org/GLM-4.5-Air-FP8 + + # Custom probe is required for PD disaggregation. + probes: + - type: http + url: /health + interval: 15s + ``` + +
    + + > With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers. + +#### AMD + +The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: + +
    + +```yaml +type: service +name: amd-sglang-pd-service + +image: rocm/sgl-dev:v0.5.10.post1-rocm720-mi30x-20260427 +privileged: true + +env: + - MODEL_ID=Qwen/Qwen2.5-72B-Instruct + - HF_TOKEN + - SGLANG_USE_AITER=0 + - SGLANG_ROCM_FUSED_DECODE_MLA=0 + - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600 + - SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600 + - RDMA_DEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 + - NCCL_IB_DISABLE=1 + +replicas: + - count: 1 + commands: + - pip install smg + - | + smg launch \ + --pd-disaggregation \ + --host 0.0.0.0 \ + --port 30000 + resources: + cpu: 4.. + router: + type: sglang + + - count: 1..2 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disaggregation-bootstrap-port 8998 \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + + - count: 1..4 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --decode-attention-backend triton \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + +port: 30000 +model: Qwen/Qwen2.5-72B-Instruct + +# Custom probe is required for PD disaggregation. 
+probes: + - type: http + url: /health + interval: 15s + +volumes: + - /usr/lib64/libibverbs/libbnxt_re-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so +``` + +
    + +!!! info "RoCE library" + Mooncake uses the RDMA/RoCE interconnect for KV Cache transfer. To use the RDMA/RoCE interconnect on Broadcom `bnxt_re` devices, Mooncake requires the Broadcom-specific userspace provider library `libbnxt_re-rdmav34.so` to be available inside the container at `/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so`. We make this library available by mounting the host provider library from `/usr/lib64/libibverbs/libbnxt_re-rdmav34.so`. + + + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + +### Authorization + +By default, the service enables authorization, meaning the service endpoint requires a `dstack` user token. +This can be disabled by setting `auth` to `false`. + +
    + +```yaml +type: service +name: http-server-service + +# Disable authorization +auth: false + +python: 3.12 + +commands: + - python3 -m http.server +port: 8000 +``` + +
    + +### Probes + +Configure one or more HTTP probes to periodically check the health of the service. + +
    + +```yaml +type: service +name: my-service +port: 80 +image: my-app:latest +probes: +- type: http + url: /health + interval: 15s +``` + +
    + +You can track probe statuses in `dstack ps --verbose`. + +
    + +```shell +$ dstack ps --verbose + + NAME BACKEND STATUS PROBES SUBMITTED + my-service deployment=1 running 11 mins ago + replica=0 job=0 deployment=0 aws (us-west-2) running ✓ 11 mins ago + replica=1 job=0 deployment=1 aws (us-west-2) running × 1 min ago +``` + +
    + +??? info "Status" + The following symbols are used for probe statuses: + + - `×` — the last probe execution failed. + - `~` — the last probe execution succeeded, but the [`ready_after`](../reference/dstack.yml/service.md#ready_after) threshold is not yet reached. + - `✓` — the last `ready_after` probe executions succeeded. + + If multiple probes are configured for the service, their statuses are displayed in the order in which the probes appear in the configuration. + +Probes are executed for each service replica while the replica is `running`. A probe execution is considered successful if the replica responds with a `2xx` status code. Probe statuses do not affect how `dstack` handles replicas, except during [rolling deployments](#rolling-deployment). + +??? info "HTTP request configuration" + You can configure the HTTP request method, headers, and other properties. To include secret values in probe requests, use environment variable interpolation, which is enabled for the `url`, `headers[i].value`, and `body` properties. + +
    + + ```yaml + type: service + name: my-service + port: 80 + image: my-app:latest + env: + - PROBES_API_KEY + probes: + - type: http + method: post + url: /check-health + headers: + - name: X-API-Key + value: ${{ env.PROBES_API_KEY }} + - name: Content-Type + value: application/json + body: '{"level": 2}' + timeout: 20s + ``` + +
    + +??? info "Model" + If you set the [`model`](#model) property but don't explicitly configure `probes`, + `dstack` automatically configures a default probe that tests the model using the `/v1/chat/completions` API. + To disable probes entirely when `model` is set, explicitly set `probes` to an empty list. + +See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options. + +### Path prefix { #path-prefix } + +If your `dstack` project doesn't have a [gateway](gateways.md), services are hosted with the +`/proxy/services/<project name>/<run name>/` path prefix in the URL. +When running web apps, you may need to set some app-specific settings +so that browser-side scripts and CSS work correctly with the path prefix. + +
    + +```yaml +type: service +name: dash +gateway: false + +auth: false +# Do not strip the path prefix +strip_prefix: false + +env: + # Configure Dash to work with a path prefix + # Replace `main` with your dstack project name + - DASH_ROUTES_PATHNAME_PREFIX=/proxy/services/main/dash/ + +commands: + - uv pip install dash + # Assuming the Dash app is in your repo at app.py + - python app.py + +port: 8050 +``` + +
    + +By default, `dstack` strips the prefix before forwarding requests to your service, +so to the service it appears as if the prefix isn't there. This allows some apps +to work out of the box. If your app doesn't expect the prefix to be stripped, +set [`strip_prefix`](../reference/dstack.yml/service.md#strip_prefix) to `false`. + +If your app cannot be configured to work with a path prefix, you can host it +on a dedicated domain name by setting up a [gateway](gateways.md). + +### Rate limits + +If you have a [gateway](gateways.md), you can configure rate limits for your service +using the [`rate_limits`](../reference/dstack.yml/service.md#rate_limits) property. + +
    + +```yaml +type: service +image: my-app:latest +port: 80 + +rate_limits: +# For /api/auth/* - 1 request per second, no bursts +- prefix: /api/auth/ + rps: 1 +# For other URLs - 4 requests per second + bursts of up to 9 requests +- rps: 4 + burst: 9 +``` + +
    + +The rps limit sets the max requests per second, tracked in milliseconds (e.g., `rps: 4` means 1 request every 250 ms). Use `burst` to allow short spikes while keeping the average within `rps`. + +Limits apply to the whole service (all replicas) and per client (by IP). Clients exceeding the limit get a 429 error. + +??? info "Partitioning key" + Instead of partitioning requests by client IP address, + you can choose to partition by the value of a header. + +
    + + ```yaml + type: service + image: my-app:latest + port: 80 + + rate_limits: + - rps: 4 + burst: 9 + # Apply to each user, as determined by the `Authorization` header + key: + type: header + header: Authorization + ``` + +
    + +### Model + +If the service runs a model with an OpenAI-compatible interface, you can set the [`model`](#model) property to make the model accessible through `dstack`'s chat UI on the `Models` page. +In this case, `dstack` will use the service's `/v1/chat/completions` endpoint. + +When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health. +To customize or disable this, set `probes` explicitly. + +### Resources + +If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a +range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). + +
    + +```yaml +type: service +name: llama31-service + +python: 3.12 +env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_MODEL_LEN=4096 +commands: + - uv pip install vllm + - | + vllm serve $MODEL_ID \ + --max-model-len $MAX_MODEL_LEN \ + --tensor-parallel-size $DSTACK_GPUS_NUM +port: 8000 + +resources: + # 16 or more x86_64 cores + cpu: 16.. + # 2 GPUs of 80GB + gpu: 80GB:2 + + # Minimum disk size + disk: 200GB +``` + +
    + +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. + +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). + +If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. + + + +??? info "Shared memory" + If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure + `shm_size`, e.g. set it to `16GB`. + +> If you’re unsure which offers (hardware configurations) are available from the configured backends, use the +> [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. + + +### Docker + +#### Default image + +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with + `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). + +Set the `python` property to pre-install a specific version of Python. + + + +
    + +```yaml +type: service +name: http-server-service + +python: 3.12 + +commands: + - python3 -m http.server +port: 8000 +``` + +
    + +#### NVCC + +By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/dev-environment.md#nvcc) property to true. + + + +
    + +```yaml +type: service +name: http-server-service + +python: 3.12 +nvcc: true + +commands: + - python3 -m http.server +port: 8000 +``` + +
    + +#### Custom image + +If you want, you can specify your own Docker image via `image`. + +
    + + ```yaml + type: service + name: http-server-service + + image: python + + commands: + - python3 -m http.server + port: 8000 + ``` + +
    + +#### Docker in Docker + +Set `docker` to `true` to enable the `docker` CLI in your service, e.g., to run Docker images or use Docker Compose. + +
    + +```yaml +type: service +name: compose-service + +auth: false + +docker: true + +commands: + - | + cat > compose.yaml <<'EOF' + services: + web: + image: python:3.11-slim + command: python -m http.server 9000 + ports: + - "9000:9000" + EOF + - docker compose up +port: 9000 +``` + +
    + +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. + +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/dev-environment.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry + +Use the [`registry_auth`](../reference/dstack.yml/dev-environment.md#registry_auth) property to provide credentials for a private Docker registry. + +```yaml +type: service +name: serve-distill-deepseek + +env: + - NGC_API_KEY + - NIM_MAX_MODEL_LEN=4096 + +image: nvcr.io/nim/deepseek-ai/deepseek-r1-distill-llama-8b +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} +port: 8000 + +model: deepseek-ai/deepseek-r1-distill-llama-8b + +resources: + gpu: H100:1 +``` + +### Environment variables + +
    + +```yaml +type: service +name: llama-2-7b-service + +python: 3.12 + +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB +``` + +
    + +> If you don't assign a value to an environment variable (see `HF_TOKEN` above), +`dstack` will require the value to be passed via the CLI or set in the current process. + +??? info "System environment variables" + The following environment variables are available in any run by default: + + | Name | Description | + |-------------------------|--------------------------------------------------| + | `DSTACK_RUN_NAME` | The name of the run | + | `DSTACK_REPO_ID` | The ID of the repo | + | `DSTACK_GPUS_NUM` | The total number of GPUs in the run | + | `DSTACK_WORKING_DIR` | The working directory of the run | + | `DSTACK_REPO_DIR` | The directory where the repo is mounted (if any) | + + + +### Working directory + +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. + +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. + +The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). + + + +### Files + +Sometimes, when you run a service, you may want to mount local files. This is possible via the [`files`](../reference/dstack.yml/task.md#_files) property. Each entry maps a local directory or file to a path inside the container. + + + +
    + +```yaml +type: service +name: llama-2-7b-service + +files: + - .:examples # Maps the directory with `.dstack.yml` to `examples` in the working directory + - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +python: 3.12 + +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB +``` + +
    + +If the local path is relative, it’s resolved relative to the configuration file. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). + +The container path is optional. If not specified, it will be automatically calculated: + + + +
    + +```yaml +type: service +name: llama-2-7b-service + +files: + - ../examples # Maps the parent directory of `.dstack.yml` to `/../examples` + - ~/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +python: 3.12 + +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB +``` + +
    + +??? info "File size" + Whether it's a file or folder, each entry is limited to 2MB. To avoid exceeding this limit, make sure to exclude unnecessary files + by listing them in `.gitignore` or `.dstackignore`. + The 2MB upload limit can be increased by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +### Repos + +Sometimes, you may want to clone an entire Git repo inside the container. + +Imagine you have a Git repo (cloned locally) containing an `examples` subdirectory with a `.dstack.yml` file: + + + +
    + +```yaml +type: service +name: llama-2-7b-service + +repos: + # Clones the repo from the parent directory (`examples/..`) to the working directory + - .. + +python: 3.12 + +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB +``` + +
    + +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. + +The local path can be either relative to the configuration file or absolute. + +??? info "Repo directory" + By default, `dstack` clones the repo to the [working directory](#working-directory). + + + + You can override the repo directory using either a relative or an absolute path: + +
    + + ```yaml + type: service + name: llama-2-7b-service + + repos: + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` + - ..:/my-repo + + python: 3.12 + + env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf + commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + port: 8000 + + resources: + gpu: 24GB + ``` + +
    + + > If the repo directory is relative, it is resolved against [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: service + name: llama-2-7b-service + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + python: 3.12 + + env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf + commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + port: 8000 + + resources: + gpu: 24GB + ``` + +??? info "Repo size" + The repo size is not limited. However, local changes are limited to 2MB. + To avoid exceeding this limit, exclude unnecessary files using `.gitignore` or `.dstackignore`. + You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +??? info "Repo URL" + Sometimes you may want to clone a Git repo within the container without cloning it locally. In this case, simply provide a URL in `repos`: + + + +
    + + ```yaml + type: service + name: llama-2-7b-service + + repos: + # Clones the repo to the working directory + - https://github.com/dstackai/dstack + + python: 3.12 + + env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf + commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + port: 8000 + + resources: + gpu: 24GB + ``` + +
    + +??? info "Private repos" + If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from + `~/.ssh/config` or `~/.config/gh/hosts.yml`). + + > If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). + +Currently, you can configure up to one repo per run configuration. + +### Retry policy + +By default, if `dstack` can't find capacity, or the service exits with an error, or the instance is interrupted, the run will fail. + +If you'd like `dstack` to automatically retry, configure the +[retry](../reference/dstack.yml/service.md#retry) property accordingly: + + +
    + +```yaml +type: service +image: my-app:latest +port: 80 + +retry: + on_events: [no-capacity, error, interruption] + # Retry for up to 1 hour + duration: 1h +``` + +
    + +If one replica of a multi-replica service fails with retry enabled, +`dstack` will resubmit only the failed replica while keeping active replicas running. + +!!! info "Retry duration" + The retry duration is measured as the run's age for `no-capacity` events, and as the time elapsed since the last `interruption` or `error` for `interruption` and `error` events. + +### Spot policy + +By default, `dstack` uses on-demand instances. However, you can change that +via the [`spot_policy`](../reference/dstack.yml/service.md#spot_policy) property. It accepts `spot`, `on-demand`, and `auto`. + +### Utilization policy + +Sometimes it’s useful to track whether a service is fully utilizing all GPUs. While you can check this with +[`dstack metrics`](../reference/cli/dstack/metrics.md), `dstack` also lets you set a policy to auto-terminate the run if any GPU is underutilized. + +Below is an example of a service that auto-terminates if any GPU stays below 10% utilization for 1 hour. + + + +
    + +```yaml +type: service +name: llama-2-7b-service + +python: 3.12 +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB + +utilization_policy: + min_gpu_utilization: 10 + time_window: 1h +``` + +
    + +### Schedule + +Specify `schedule` to start a service periodically at specific UTC times using the cron syntax: + +
    + +```yaml +type: service +name: llama-2-7b-service + +python: 3.12 +env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf +commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 +port: 8000 + +resources: + gpu: 24GB + +schedule: + cron: "0 8 * * mon-fri" # at 8:00 UTC from Monday through Friday +``` + +
    + +The `schedule` property can be combined with `max_duration` or `utilization_policy` to shut down the service automatically when it's not needed. + +??? info "Cron syntax" + `dstack` supports [POSIX cron syntax](https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07). One exception is that the days of the week start from Monday instead of Sunday, so `0` corresponds to Monday. + + The month and day of week fields accept abbreviated English month and weekday names (`jan–dec` and `mon–sun`) respectively. + + A cron expression consists of five fields: + + ``` + ┌───────────── minute (0-59) + │ ┌───────────── hour (0-23) + │ │ ┌───────────── day of the month (1-31) + │ │ │ ┌───────────── month (1-12 or jan-dec) + │ │ │ │ ┌───────────── day of the week (0-6 or mon-sun) + │ │ │ │ │ + │ │ │ │ │ + │ │ │ │ │ + * * * * * + ``` + + The following operators can be used in any of the fields: + + | Operator | Description | Example | + |----------|-----------------------|-------------------------------------------------------------------------| + | `*` | Any value | `0 * * * *` runs every hour at minute 0 | + | `,` | Value list separator | `15,45 10 * * *` runs at 10:15 and 10:45 every day. | + | `-` | Range of values | `0 1-3 * * *` runs at 1:00, 2:00, and 3:00 every day. | + | `/` | Step values | `*/10 8-10 * * *` runs every 10 minutes during the hours 8:00 to 10:59. | + +--8<-- "docs/concepts/snippets/manage-fleets.ext" + +!!! info "Reference" + Services support many more configuration options, + incl. [`backends`](../reference/dstack.yml/service.md#backends), + [`regions`](../reference/dstack.yml/service.md#regions), and + [`max_price`](../reference/dstack.yml/service.md#max_price), + among [others](../reference/dstack.yml/service.md). + +## Rolling deployment + +To deploy a new version of a service that is already `running`, use `dstack apply`. 
`dstack` will automatically detect changes and suggest a rolling deployment update. + +
    + +```shell +$ dstack apply -f my-service.dstack.yml + +Active run my-service already exists. Detected changes that can be updated in-place: +- Repo state (branch, commit, or other) +- File archives +- Configuration properties: + - env + - files + +Update the run? [y/n]: +``` + +
    + +If approved, `dstack` gradually updates the service replicas. To update a replica, `dstack` starts a new replica, waits for it to become `running` and for all of its [probes](#probes) to pass, then terminates the old replica. This process is repeated for each replica, one at a time. + +You can track the progress of rolling deployment in both `dstack apply` and `dstack ps`. +Older replicas have lower `deployment` numbers; newer ones have higher. + + + +```shell +$ dstack apply -f my-service.dstack.yml + +⠋ Launching my-service... + NAME BACKEND PRICE STATUS SUBMITTED + my-service deployment=1 running 11 mins ago + replica=0 job=0 deployment=0 aws (us-west-2) $0.0026 terminating 11 mins ago + replica=1 job=0 deployment=1 aws (us-west-2) $0.0026 running 1 min ago +``` + +The rolling deployment stops when all replicas are updated or when a new deployment is submitted. + +??? info "Supported properties" + + + Rolling deployment supports changes to the following properties: `port`, `probes`, `resources`, `volumes`, `docker`, `files`, `image`, `user`, `privileged`, `entrypoint`, `working_dir`, `python`, `nvcc`, `single_branch`, `env`, `shell`, `commands`, as well as changes to [repo](#repos) or [file](#files) contents. + + Changes to `replicas` and `scaling` can be applied without redeploying replicas. + + Changes to other properties require a full service restart. + + To trigger a rolling deployment when no properties have changed (e.g., after updating [secrets](secrets.md) or to restart all replicas), + make a minor config change, such as adding a dummy [environment variable](#environment-variables). + +--8<-- "docs/concepts/snippets/manage-runs.ext" + +!!! info "What's next?" + 1. Read about [dev environments](dev-environments.md) and [tasks](tasks.md) + 2. Learn how to manage [fleets](fleets.md) + 3. See how to set up [gateways](gateways.md) + 4. 
Check the [vLLM](../examples/inference/vllm.md) and + [NIM](../examples/inference/nim.md) examples diff --git a/mkdocs/docs/concepts/snippets/manage-fleets.ext b/mkdocs/docs/concepts/snippets/manage-fleets.ext new file mode 100644 index 0000000000..b30b4126a8 --- /dev/null +++ b/mkdocs/docs/concepts/snippets/manage-fleets.ext @@ -0,0 +1,10 @@ +### Idle duration + +If the run is submitted to a fleet with `nodes` set to a range and a new instance is provisioned, +the shorter of the fleet's and run's `idle_duration` is used. +If the run reuses an existing fleet instance, only the fleet's +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) applies. + +If an instance remains `idle`, it is automatically terminated after `idle_duration`. + +> Not applied for container-based backends (Kubernetes, Vast.ai, Runpod). diff --git a/mkdocs/docs/concepts/snippets/manage-runs.ext b/mkdocs/docs/concepts/snippets/manage-runs.ext new file mode 100644 index 0000000000..5fad4a62aa --- /dev/null +++ b/mkdocs/docs/concepts/snippets/manage-runs.ext @@ -0,0 +1,14 @@ +## Manage runs + +`dstack` provides several commands to manage runs: + +* [`dstack ps`](../reference/cli/dstack/ps.md) – Lists all running jobs and their statuses. + Use `--watch` (or `-w`) to monitor the live status of runs. +* [`dstack stop`](../reference/cli/dstack/stop.md) – Stops a run gracefully. +Pass `--abort` or `-x` to stop it immediately without waiting for a graceful shutdown. By default, a run + runs until you stop it or its lifetime exceeds the value of [`max_duration`](../reference/dstack.yml/dev-environment.md#max_duration). +* [`dstack attach`](../reference/cli/dstack/attach.md) – By default, `dstack apply` runs in attached mode, + establishing an SSH tunnel to the run, forwarding ports, and displaying real-time logs. + If you detach from a run, use this command to reattach. +* [`dstack logs`](../reference/cli/dstack/logs.md) – Displays run logs. 
+Pass `--diagnose` or `-d` to view diagnostic logs, which can help troubleshoot failed runs. diff --git a/mkdocs/docs/concepts/tasks.md b/mkdocs/docs/concepts/tasks.md new file mode 100644 index 0000000000..d3a3515b82 --- /dev/null +++ b/mkdocs/docs/concepts/tasks.md @@ -0,0 +1,882 @@ +--- +title: Tasks +description: Running commands for training and batch processing +--- + +# Tasks + +A task allows you to run arbitrary commands on one or more nodes. They are best suited for jobs like training or batch processing. + +??? info "Prerequisites" + Before running a task, make sure you’ve [installed](../installation.md) the server and CLI, and created a [fleet](fleets.md). + +## Apply a configuration + +First, define a task configuration as a YAML file. +The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` are both acceptable). + +[//]: # (TODO: Make tabs - single machine & distributed tasks & web app) + +
    + +```yaml +type: task +# The name is optional, if not specified, generated randomly +name: trl-sft + +python: 3.12 + +# Uncomment to use a custom Docker image +#image: huggingface/trl-latest-gpu + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + # One to two H100 GPUs + gpu: H100:1..2 + shm_size: 24GB +``` + +
    + +To run a task, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f .dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 runpod CA-MTL-1 18xCPU, 100GB, A5000:24GB:2 yes $0.22 + 2 runpod EU-SE-1 18xCPU, 100GB, A5000:24GB:2 yes $0.22 + 3 gcp us-west4 27xCPU, 150GB, A5000:24GB:3 yes $0.33 + +Submit the run trl-sft? [y/n]: y + +Launching `trl-sft`... +---> 100% + +{'loss': 1.4967, 'grad_norm': 1.2734375, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0} + 0% 1/24680 [00:13<95:34:17, 13.94s/it] + 6% 73/1300 [00:48<13:57, 1.47it/s] +``` + +
    + +`dstack apply` automatically provisions instances and runs the task. + +## Configuration options + +!!! info "No commands" + If `commands` are not specified, `dstack` runs `image`’s entrypoint (or fails if none is set). + +### Ports + +A task can configure ports. In this case, if the task is running an application on a port, `dstack apply` +will securely allow you to access this port from your local machine through port forwarding. + +
    + +```yaml +type: task +name: streamlit-hello + +python: 3.12 + +commands: + - uv pip install streamlit + - streamlit hello +ports: + - 8501 +``` + +
    + +When running it, `dstack apply` forwards port `8501` to `localhost:8501`, enabling secure access to the running +application. + +### Distributed tasks + +By default, a task runs on a single node. +However, you can run it on a cluster of nodes by specifying `nodes`. + +
    + +```yaml +type: task +name: train-distrib + +nodes: 2 + +python: 3.12 +env: + - NCCL_DEBUG=INFO +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - uv pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + multinode.py 50 10 + +resources: + gpu: 24GB:1..2 + shm_size: 24GB +``` + +
    + +!!! info "Cluster placement" + To submit a distributed task, you must create at least one fleet with a [cluster placement](fleets.md#cluster-placement). + + +Jobs on each node communicate using their private IP addresses. Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other [system environment variables](#system-environment-variables) for inter-node communication. + + + +`dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed framework. + +!!! info "Examples" + See the training examples for [TRL](../examples/training/trl.md#distributed-training), [Axolotl](../examples/training/axolotl.md#distributed-training), [Ray+RAGEN](../examples/training/ray-ragen.md), and [Miles](../examples/training/miles.md). + + See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md). + +??? info "Network interface" + Distributed frameworks usually detect the correct network interface automatically, + but sometimes you need to specify it explicitly. + + For example, with PyTorch and the NCCL backend, you may need + to add these commands to tell NCCL to use the private interface: + + ```yaml + commands: + - apt-get install -y iproute2 + - > + if [[ $DSTACK_NODE_RANK == 0 ]]; then + export NCCL_SOCKET_IFNAME=$(ip -4 -o addr show | fgrep $DSTACK_MASTER_NODE_IP | awk '{print $2}') + else + export NCCL_SOCKET_IFNAME=$(ip route get $DSTACK_MASTER_NODE_IP | sed -E 's/.*?dev (\S+) .*/\1/;t;d') + fi + # ... The rest of the commands + ``` + +??? info "SSH" + You can log in to any node from any node via SSH on port 10022 using the `~/.ssh/dstack_job` private key. + For convenience, `~/.ssh/config` is preconfigured with these options, so a simple `ssh <node_ip>` is enough. 
+ For a list of node IPs, check the `DSTACK_NODES_IPS` environment variable. + +### Resources + +When you specify a resource value like `cpu` or `memory`, +you can either use an exact value (e.g. `24GB`) or a +range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). + +
    + +```yaml +type: task +name: trl-sft + +python: 3.12 + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + # 16 or more x86_64 cores + cpu: 16.. + # 200GB or more RAM + memory: 200GB.. + # 4 GPUs from 40GB to 80GB + gpu: 40GB..80GB:4 + # Shared memory (required by multi-gpu) + shm_size: 24GB + # Disk size + disk: 500GB +``` + +
    + +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. + +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). + +If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. + + + +??? info "Shared memory" + If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure + `shm_size`, e.g. set it to `24GB`. + +> If you’re unsure which offers (hardware configurations) are available from the configured backends, use the +> [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. + + +### Docker + +#### Default image + +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with + `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). + +Set the `python` property to pre-install a specific version of Python. + +
    + +```yaml +type: task +name: train + +python: 3.12 + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1..2 + shm_size: 24GB +``` + +
    + +#### NVCC + +By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/dev-environment.md#nvcc) property to true. + +```yaml +type: task +name: train + +python: 3.12 +nvcc: true + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - uv pip install flash_attn --no-build-isolation + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --attn_implementation=flash_attention_2 \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +#### Custom image + +If you want, you can specify your own Docker image via `image`. + + + +
    + +```yaml +type: task +name: trl-sft + +image: huggingface/trl-latest-gpu + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +# if shell is not specified, `sh` is used for custom images +shell: bash + +commands: + - source activate trl + - | + trl sft --model_name_or_path $MODEL \ + --dataset_name $DATASET \ + --output_dir /output \ + --torch_dtype bfloat16 \ + --use_peft true + +resources: + gpu: H100:1 +``` + +
    + +#### Docker in Docker + +Set `docker` to `true` to enable the `docker` CLI in your task, e.g., to run or build Docker images, or use Docker Compose. + +
    + +```yaml +type: task +name: docker-nvidia-smi + +docker: true + +commands: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi + +resources: + gpu: 1 +``` + +
    + +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. + +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/dev-environment.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry + +Use the [`registry_auth`](../reference/dstack.yml/dev-environment.md#registry_auth) property to provide credentials for a private Docker registry. + +```yaml +type: task +name: train + +env: + - NGC_API_KEY + +image: nvcr.io/nvidia/pytorch:25.05-py3 +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} + +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --nnodes=$DSTACK_NODES_NUM \ + multinode.py 50 10 + +resources: + gpu: H100:1..2 + shm_size: 24GB +``` + +### Environment variables + +
    + +```yaml +type: task +name: trl-sft + +python: 3.12 + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +
    + +If you don't assign a value to an environment variable (see `HF_TOKEN` above), +`dstack` will require the value to be passed via the CLI or set in the current process. + + +??? info "System environment variables" + The following environment variables are available in any run by default: + + | Name | Description | + |-------------------------|------------------------------------------------------------------| + | `DSTACK_RUN_NAME` | The name of the run | + | `DSTACK_REPO_ID` | The ID of the repo | + | `DSTACK_GPUS_NUM` | The total number of GPUs in the run | + | `DSTACK_NODES_NUM` | The number of nodes in the run | + | `DSTACK_GPUS_PER_NODE` | The number of GPUs per node | + | `DSTACK_NODE_RANK` | The rank of the node | + | `DSTACK_MASTER_NODE_IP` | The internal IP address of the master node | + | `DSTACK_NODES_IPS` | The list of internal IP addresses of all nodes delimited by "\n" | + | `DSTACK_MPI_HOSTFILE` | The path to a pre-populated MPI hostfile | + | `DSTACK_WORKING_DIR` | The working directory of the run | + | `DSTACK_REPO_DIR` | The directory where the repo is mounted (if any) | + +### Working directory + +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. + +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. + +The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). + + + +### Files + +Sometimes, when you run a task, you may want to mount local files. This is possible via the [`files`](../reference/dstack.yml/task.md#_files) property. Each entry maps a local directory or file to a path inside the container. + +
    + +```yaml +type: task +name: trl-sft + +files: + - .:examples # Maps the directory with `.dstack.yml` to `examples` in the working directory + - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` + +python: 3.12 + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +
    + +If the local path is relative, it’s resolved relative to the configuration file. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). + +The container path is optional. If not specified, it will be automatically calculated: + + + +
    + +```yaml +type: task +name: trl-sft + +files: + - ../examples # Maps the parent directory of `.dstack.yml` to `/../examples` + - ~/.cache/huggingface/token # Maps `~/.cache/huggingface/token` to `/root/.cache/huggingface/token` + +python: 3.12 + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +
    + +??? info "File size" + Whether it's a file or folder, each entry is limited to 2MB. To avoid exceeding this limit, make sure to exclude unnecessary files + by listing them in `.gitignore` or `.dstackignore`. + The 2MB upload limit can be increased by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +### Repos + +Sometimes, you may want to clone an entire Git repo inside the container. + +Imagine you have a Git repo (cloned locally) containing an `examples` subdirectory with a `.dstack.yml` file: + + + +
    + +```yaml +type: task +name: trl-sft + +repos: + # Clones the repo from the parent directory (`examples/..`) to the working directory + - .. + +python: 3.12 + +env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +
    + +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. + +The local path can be either relative to the configuration file or absolute. + +??? info "Repo directory" + By default, `dstack` clones the repo to the [working directory](#working-directory). + + You can override the repo directory using either a relative or an absolute path: + +
    + + ```yaml + type: task + name: trl-sft + + repos: + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` + - ..:/my-repo + + python: 3.12 + + env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + + commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + + resources: + gpu: H100:1 + ``` + +
    + + > If the repo directory is relative, it is resolved against [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: task + name: trl-sft + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + python: 3.12 + + env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + + commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + + resources: + gpu: H100:1 + ``` + +??? info "Repo size" + The repo size is not limited. However, local changes are limited to 2MB. + To avoid exceeding this limit, exclude unnecessary files using `.gitignore` or `.dstackignore`. + You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. + +??? info "Repo URL" + Sometimes you may want to clone a Git repo within the container without cloning it locally. In this case, simply provide a URL in `repos`: + + + +
    + + ```yaml + type: task + name: trl-sft + + repos: + # Clones the repo to `<working dir>` + - https://github.com/dstackai/dstack + + python: 3.12 + + env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + + commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + + resources: + gpu: H100:1 + ``` + +
    + +??? info "Private repos" + If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from + `~/.ssh/config` or `~/.config/gh/hosts.yml`). + + > If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). + +Currently, you can configure up to one repo per run configuration. + +### Retry policy + +By default, if `dstack` can't find capacity, or the task exits with an error, or the instance is interrupted, +the run will fail. + +If you'd like `dstack` to automatically retry, configure the +[retry](../reference/dstack.yml/task.md#retry) property accordingly: + + + +
    + +```yaml +type: task +name: train + +python: 3.12 + +commands: + - uv pip install -r fine-tuning/qlora/requirements.txt + - python fine-tuning/qlora/train.py + +retry: + on_events: [no-capacity, error, interruption] + # Retry for up to 1 hour + duration: 1h +``` + +
    + +If one job of a multi-node task fails with retry enabled, +`dstack` will stop all the jobs and resubmit the run. + +!!! info "Retry duration" + The retry duration is calculated as the run age for the `no-capacity` event, and as the time passed since the last `interruption` or `error` for the `interruption` and `error` events, respectively. + +### Priority + +By default, submitted runs are scheduled in the order they were submitted. +When compute resources are limited, you may want to prioritize some runs over others. +This can be done by specifying the [`priority`](../reference/dstack.yml/task.md) property in the run configuration: + + + +
    + +```yaml +type: task +name: train + +python: 3.12 + +commands: + - uv pip install -r fine-tuning/qlora/requirements.txt + - python fine-tuning/qlora/train.py + +priority: 50 +``` + +
    + +`dstack` tries to provision runs with higher priority first. +Note that if a high priority run cannot be scheduled, +it does not block other runs with lower priority from being scheduled. + +### Utilization policy + +Sometimes it’s useful to track whether a task is fully utilizing all GPUs. While you can check this with +[`dstack metrics`](../reference/cli/dstack/metrics.md), `dstack` also lets you set a policy to auto-terminate the run if any GPU is underutilized. + +Below is an example of a task that auto-terminates if any GPU stays below 10% utilization for 1 hour. + + + +
    + +```yaml +type: task +name: train + +python: 3.12 +commands: + - uv pip install -r fine-tuning/qlora/requirements.txt + - python fine-tuning/qlora/train.py + +resources: + gpu: H100:8 + +utilization_policy: + min_gpu_utilization: 10 + time_window: 1h +``` + +
    + +### Schedule + +Specify `schedule` to start a task periodically at specific UTC times using the cron syntax: + +
    + +```yaml +type: task +name: train + +python: 3.12 +commands: + - uv pip install -r fine-tuning/qlora/requirements.txt + - python fine-tuning/qlora/train.py + +resources: + gpu: H100:8 + +schedule: + cron: "15 23 * * *" # everyday at 23:15 UTC +``` + +
    + +??? info "Cron syntax" + `dstack` supports [POSIX cron syntax](https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07). One exception is that days of the week are started from Monday instead of Sunday so `0` corresponds to Monday. + + The month and day of week fields accept abbreviated English month and weekday names (`jan–dec` and `mon–sun`) respectively. + + A cron expression consists of five fields: + + ``` + ┌───────────── minute (0-59) + │ ┌───────────── hour (0-23) + │ │ ┌───────────── day of the month (1-31) + │ │ │ ┌───────────── month (1-12 or jan-dec) + │ │ │ │ ┌───────────── day of the week (0-6 or mon-sun) + │ │ │ │ │ + │ │ │ │ │ + │ │ │ │ │ + * * * * * + ``` + + The following operators can be used in any of the fields: + + | Operator | Description | Example | + |----------|-----------------------|-------------------------------------------------------------------------| + | `*` | Any value | `0 * * * *` runs every hour at minute 0 | + | `,` | Value list separator | `15,45 10 * * *` runs at 10:15 and 10:45 every day. | + | `-` | Range of values | `0 1-3 * * *` runs at 1:00, 2:00, and 3:00 every day. | + | `/` | Step values | `*/10 8-10 * * *` runs every 10 minutes during the hours 8:00 to 10:59. | + +### Spot policy + +By default, `dstack` uses on-demand instances. However, you can change that +via the [`spot_policy`](../reference/dstack.yml/task.md#spot_policy) property. It accepts `spot`, `on-demand`, and `auto`. + +--8<-- "docs/concepts/snippets/manage-fleets.ext" + +!!! info "Reference" + Tasks support many more configuration options, + incl. [`backends`](../reference/dstack.yml/task.md#backends), + [`regions`](../reference/dstack.yml/task.md#regions), + [`max_price`](../reference/dstack.yml/task.md#max_price), and + [`max_duration`](../reference/dstack.yml/task.md#max_duration), + among [others](../reference/dstack.yml/task.md). + +--8<-- "docs/concepts/snippets/manage-runs.ext" + +!!! 
info "What's next?" + 1. Read about [dev environments](dev-environments.md) and [services](services.md) + 2. Learn how to manage [fleets](fleets.md) + 3. Check the [Axolotl](../examples/training/axolotl.md) example diff --git a/mkdocs/docs/concepts/volumes.md b/mkdocs/docs/concepts/volumes.md new file mode 100644 index 0000000000..bb338fbeeb --- /dev/null +++ b/mkdocs/docs/concepts/volumes.md @@ -0,0 +1,356 @@ +--- +title: Volumes +description: Managing persistent data storage +--- + +# Volumes + +Volumes enable data persistence between runs of dev environments, tasks, and services. + +`dstack` supports two kinds of volumes: + +* [Network volumes](#network-volumes) — provisioned via backends and mounted to specific container directories. + Ideal for persistent storage. +* [Instance volumes](#instance-volumes) — bind directories on the host instance to container directories. +Useful as a cache for cloud fleets or for persistent storage with SSH fleets. + +## Network volumes + +> Network volumes are currently supported for the `aws`, `gcp`, `runpod`, and `kubernetes` backends. + +### Apply a configuration + +First, define a volume configuration as a YAML file in your project folder. +The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `volume.dstack.yml` are both acceptable). + +
    + +```yaml +type: volume +# A name of the volume +name: my-volume + +# Volumes are bound to a specific backend and region +backend: aws +region: eu-central-1 + +# Required size +size: 100GB +``` + +
    + +If you use this configuration, `dstack` will create a new volume based on the specified options. + +??? info "Kubernetes" + With the `kubernetes` backend, you don't have to specify `region`, but you can optionally specify `storage_class_name` and/or `access_modes`: + +
    + + ```yaml + type: volume + backend: kubernetes + name: my-volume + + size: 100GB + ``` + +
    + + This automatically creates a `PersistentVolumeClaim` and associates it with the volume. + + If you don't specify `storage_class_name`, the decision is delegated to the `DefaultStorageClass` admission controller, if enabled. + + If you don't specify `access_modes`, it defaults to `[ReadWriteOnce]`. To attach volumes to multiple runs at the same time, set it to `[ReadWriteMany]` or `[ReadWriteMany, ReadOnlyMany]`. + + +To create, update, or register the volume, pass the volume configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f volume.dstack.yml +Volume my-volume does not exist yet. Create the volume? [y/n]: y + + NAME BACKEND REGION STATUS CREATED + my-volume aws eu-central-1 submitted now + +``` + +
    + + +Once created, the volume can be attached to dev environments, tasks, and services. + +> When creating a new network volume, `dstack` automatically creates an `ext4` filesystem on it. + +#### Register existing volumes + +If you prefer not to create a new volume but to reuse an existing one (e.g., created manually), you can +specify its ID via [`volume_id`](../reference/dstack.yml/volume.md#volume_id). In this case, `dstack` will register the specified volume so that you can use it with dev environments, tasks, and services. + +
    + +```yaml +type: volume +# The name of the volume +name: my-volume + +# Volumes are bound to a specific backend and region +backend: aws +region: eu-central-1 + +# The ID of the volume in AWS +volume_id: vol1235 +``` + +
    + +If you register an existing volume, you must ensure the volume already has a filesystem. + +??? info "Kubernetes" + + With the `kubernetes` backend, to reuse an existing `PersistentVolumeClaim`, specify its name in `claim_name`: + +
    + + ```yaml + type: volume + backend: kubernetes + name: my-volume + + claim_name: existing-pvc + ``` + +
    + +For all volume configuration options, refer to the [reference](../reference/dstack.yml/volume.md). + +### Attach a volume { #attach-network-volume } + +Dev environments, tasks, and services let you attach any number of network volumes. +To attach a network volume, simply specify its name using the `volumes` property +and specify where to mount its contents: + +
    + +```yaml +type: dev-environment +# A name of the dev environment +name: vscode-vol + +ide: vscode + +# Map the name of the volume to any path +volumes: + - name: my-volume + path: /volume_data + +# You can also use the short syntax in the `name:path` form +# volumes: +# - my-volume:/volume_data +``` + +
    + +Once you run this configuration, the contents of the volume will be attached to `/volume_data` inside the dev environment, +and its contents will persist across runs. + +??? info "Multiple regions or backends" + If you're unsure in advance which region or backend you'd like to use (or which is available), + you can specify multiple volumes for the same path. + +
    + + ```yaml + volumes: + - name: [my-aws-eu-west-1-volume, my-aws-us-east-1-volume] + path: /volume_data + ``` + +
    + + `dstack` will attach one of the volumes based on the region and backend of the run. + + +??? info "Distributed tasks" + When using single-attach volumes such as AWS EBS with distributed tasks, + you can attach different volumes to different nodes using `dstack` variable interpolation: + +
    + + ```yaml + type: task + nodes: 8 + commands: + - ... + volumes: + - name: data-volume-${{ dstack.node_rank }} + path: /volume_data + ``` + +
    + + This way, every node will use its own volume. + + Tip: To create volumes for all nodes using one volume configuration, specify the volume name with `-n`: + + ```shell + $ for i in {0..7}; do dstack apply -f vol.dstack.yml -n data-volume-$i -y; done + ``` + +### Detach a volume { #detach-network-volume } + +`dstack` automatically detaches volumes from instances when a run stops. + +!!! info "Force detach" + In some clouds, such as AWS, a volume may get stuck in the detaching state. + To fix this, you can abort the run, and `dstack` will force detach the volume. + `dstack` will also force detach the stuck volume automatically after `stop_duration`. + + Note that force detaching a volume is a last-resort measure and may corrupt the file system. + Contact your cloud support if you experience volumes getting stuck in the detaching state. + +### Manage volumes { #manage-network-volumes } + +#### List volumes + +The [`dstack volume list`](../reference/cli/dstack/volume.md#dstack-volume-list) command lists created and registered volumes: + +
    + +```shell +$ dstack volume list +NAME BACKEND REGION STATUS CREATED + my-volume aws eu-central-1 active 3 weeks ago +``` + +
    + +#### Delete volumes + +When the volume isn't attached to any active dev environment, task, or service, +you can delete it by passing the volume configuration to `dstack delete`: + +
    + +```shell +$ dstack delete -f vol.dstack.yaml +``` + +
    + +Alternatively, you can delete a volume by passing the volume name to `dstack volume delete`. + +If the volume was created using `dstack`, it will be physically destroyed along with the data. +If you've registered an existing volume, it will be de-registered from `dstack` but will keep the data. + +### FAQs + +??? info "Can I use network volumes across backends?" + + Since volumes are backed by cloud network disks, you can only use them within the same cloud. If you need to access + data across different backends, you should either use object storage or replicate the data across multiple volumes. + +??? info "Can I use network volumes across regions?" + + Typically, network volumes are associated with specific regions, so you can't use them in other regions. Often, + volumes are also linked to availability zones, but some providers support volumes that can be used across different + availability zones within the same region. + + If you don't want to limit a run to one particular region, you can create different volumes for different regions + and specify them for the same mount point as [documented above](#attach-network-volume). + +??? info "Can I attach network volumes to multiple runs or instances?" + You can mount a volume in multiple runs. This feature is currently supported only by the `runpod` backend. + +## Instance volumes + +Instance volumes allow mapping any directory on the instance where the run is executed to any path inside the container. +This means that the data in instance volumes is persisted only if the run is executed on the same instance. + +> Instance volumes are currently supported for all backends except `runpod` and `vastai`, and can also be used with [SSH fleets](fleets.md#ssh-fleets). + +### Attach a volume + +A run can configure any number of instance volumes. To attach an instance volume, +specify the `instance_path` and `path` in the `volumes` property: + +
    + +```yaml +type: dev-environment +# A name of the dev environment +name: vscode-vol + +ide: vscode + +# Map the instance path to any container path +volumes: + - instance_path: /mnt/volume + path: /volume_data + +# You can also use the short syntax in the `instance_path:path` form +# volumes: +# - /mnt/volume:/volume_data +``` + +
    + +Since persistence isn't guaranteed (instances may be interrupted or runs may occur on different instances), use instance +volumes only for caching or with directories manually mounted to network storage. + +??? info "Optional volumes" + If the volume is not critical for your workload, you can mark it as `optional`. + +
    + + ```yaml + type: task + + volumes: + - instance_path: /dstack-cache + path: /root/.cache/ + optional: true + ``` + + Configurations with optional volumes can run in any backend, but the volume is only mounted + if the selected backend supports it. + +
    + +### Use instance volumes for caching + +For example, if a run regularly installs packages with `pip install`, +you can mount the `/root/.cache/pip` folder inside the container to a folder on the instance for +reuse. + +
    + +```yaml +type: task + +volumes: + - /dstack-cache/pip:/root/.cache/pip +``` + +
    + +### Use instance volumes with SSH fleets + +If you control the instances (e.g. they are on-prem servers configured via [SSH fleets](fleets.md#ssh-fleets)), +you can mount network storage (e.g., NFS or SMB) and use the mount points as instance volumes. + +For example, if you mount a network storage to `/mnt/nfs-storage` on all hosts of your SSH fleet, +you can map this directory via instance volumes and be sure the data is persisted. + +
    + +```yaml +type: task + +volumes: + - /mnt/nfs-storage:/storage +``` + +
    diff --git a/mkdocs/docs/examples.md b/mkdocs/docs/examples.md new file mode 100644 index 0000000000..ad8bf48398 --- /dev/null +++ b/mkdocs/docs/examples.md @@ -0,0 +1,233 @@ +--- +title: Examples +description: Collection of examples for models, training, inference, and clusters +#template: examples.html +hide: +# - navigation + - toc +# - footer +--- + + + +## Training + + + + +## Clusters + + + +## Inference + + + +## Models + + + +## Accelerators + + diff --git a/mkdocs/docs/examples/accelerators/amd.md b/mkdocs/docs/examples/accelerators/amd.md new file mode 100644 index 0000000000..f94af3d02f --- /dev/null +++ b/mkdocs/docs/examples/accelerators/amd.md @@ -0,0 +1,256 @@ +--- +title: AMD +description: Running dev environments, tasks, and services on AMD GPUs +--- + +# AMD + +`dstack` natively supports AMD GPUs. This page covers the basics of setting up +fleets, running inference, training, and dev environments on AMD GPUs. + +## Fleets + +`dstack` supports native cloud provisioning, and can also work with existing +Kubernetes clusters or vanilla bare-metal hosts. + +=== "Clouds" + + `dstack` supports native provisioning of VMs with AMD GPUs across a number + of clouds, including + [AMD Developer Cloud](../../concepts/backends.md#amd-developer-cloud) and + [Hot Aisle](../../concepts/backends.md#hot-aisle). More cloud support is + coming soon. + + To provision compute in these clouds, configure the corresponding + [backend](../../concepts/backends.md) and create a + [backend fleet](../../concepts/fleets.md). + +=== "Kubernetes" + + To use `dstack` with existing Kubernetes cluster(s), configure the + [`kubernetes` backend](../../concepts/backends.md#kubernetes) and point it + to your kubeconfig file. Then create a + [backend fleet](../../concepts/fleets.md). + +=== "SSH fleets" + + If you'd like `dstack` to use a cluster or machine that is already + provisioned and that you have access to, create an + [SSH fleet](../../concepts/fleets.md). + +!!! 
info "Cluster placement" + For multi-node workloads, the fleet must + [set](../../concepts/fleets.md#cluster-placement) `placement` to `cluster`. + For Kubernetes and SSH fleets, the network must be properly configured. + + To test whether the cluster is properly configured, run the + [RCCL tests via a distributed task](../clusters/nccl-rccl-tests.md). + +Once a fleet is created, you can run dev environments, tasks, and services. + +## Inference + +Here are examples of a [service](../../concepts/services.md) that deploys +`Qwen/Qwen3.6-27B` on AMD MI300X GPUs using +[SGLang](https://github.com/sgl-project/sglang) and +[vLLM](https://docs.vllm.ai/en/latest/). + +=== "SGLang" + +
    + + ```yaml + type: service + name: qwen36-sglang-amd + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4.. + ``` + +
    + + !!! info "PD disaggregation" + To run SGLang with prefill and decode workers on an interconnected + cluster of AMD GPU instances, see the + [SGLang PD disaggregation](../inference/sglang.md#pd-disaggregation) + example. + + For multi-node PD disaggregation, the fleet must use [cluster placement](../../concepts/fleets.md#cluster-placement). + +=== "vLLM" + +
    + + ```yaml + type: service + name: qwen36-vllm-amd + + image: vllm/vllm-openai-rocm:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4.. + ``` + +
    + +Use the [`dstack apply`](../../reference/cli/dstack/apply.md) command to apply +any configuration, including services, tasks, dev environments, and fleets. + +
    + +```shell +$ dstack apply -f service.dstack.yml +``` + +
    + +## Training + +Below is a [task](../../concepts/tasks.md) that fine-tunes a small language +model using the official +[Transformers causal language modeling example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) +on AMD GPUs. + +
    + +```yaml +type: task +name: amd-qwen3-train + +image: rocm/pytorch:latest + +commands: + - git clone --depth 1 https://github.com/huggingface/transformers.git + - pip install -e ./transformers -r transformers/examples/pytorch/language-modeling/requirements.txt + - | + torchrun --standalone --nproc-per-node $DSTACK_GPUS_PER_NODE \ + transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path Qwen/Qwen3-0.6B-Base \ + --dataset_name Salesforce/wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --max_steps 10 \ + --block_size 512 \ + --learning_rate 2e-5 \ + --bf16 \ + --logging_steps 1 \ + --output_dir /tmp/qwen3-clm + +resources: + gpu: MI300X:4.. + disk: 100GB.. +``` + +
    + +!!! info "Distributed tasks" + To run training across multiple nodes, use + [distributed tasks](../../concepts/tasks.md#distributed-tasks). Distributed + tasks may run on a cluster; in that case, the fleet must use + [cluster placement](../../concepts/fleets.md#cluster-placement). + +## Dev environments + +Here's an example of a [dev environment](../../concepts/dev-environments.md) +that can be accessed via your desktop IDE. + +
    + +```yaml +type: dev-environment +name: amd-vscode + +image: rocm/dev-ubuntu-24.04 + +ide: vscode + +resources: + gpu: MI300X:1 +``` + +
    + +## Docker image + +> If you'd like a run to use AMD GPUs, make sure to specify `image`. + +The image's ROCm runtime must be compatible with the AMD GPUs the run will use. +The image should also include the packages your workload needs. + +## Metrics + +Run and job [metrics](../../concepts/metrics.md) include CPU, memory, and GPU +usage. They are available in the UI and via the CLI: + +
    + +```shell +$ dstack metrics <run name> +``` + +
    + +> AMD GPU metrics require `amd-smi` to be available in the run image. If it +> isn't present, GPU metrics may be unavailable. + +## What's next? + +1. Browse the dedicated [SGLang](../inference/sglang.md) + and [vLLM](../inference/vllm.md) examples, plus the + [Qwen 3.6](../models/qwen36.md) model page. +2. For multi-node inference, see + [SGLang PD disaggregation](../inference/sglang.md#pd-disaggregation). +3. For cluster validation, run + [NCCL/RCCL tests](../clusters/nccl-rccl-tests.md). +4. Check [dev environments](../../concepts/dev-environments.md), + [tasks](../../concepts/tasks.md), [services](../../concepts/services.md), + [fleets](../../concepts/fleets.md), and + [backends](../../concepts/backends.md). diff --git a/src/tests/_internal/server/background/tasks/__init__.py b/mkdocs/docs/examples/accelerators/intel/index.md similarity index 100% rename from src/tests/_internal/server/background/tasks/__init__.py rename to mkdocs/docs/examples/accelerators/intel/index.md diff --git a/mkdocs/docs/examples/accelerators/tenstorrent.md b/mkdocs/docs/examples/accelerators/tenstorrent.md new file mode 100644 index 0000000000..3561b8c242 --- /dev/null +++ b/mkdocs/docs/examples/accelerators/tenstorrent.md @@ -0,0 +1,319 @@ +--- +title: Tenstorrent +description: Running inference, training, and dev environments on Tenstorrent Wormhole and Blackhole accelerators +--- + +# Tenstorrent + +`dstack` supports running inference, training, and dev environments on +Tenstorrent Wormhole and Blackhole accelerators, including +[PCIe cards](https://tenstorrent.com/en/hardware/cards) and systems such as +[TT-LoudBox](https://tenstorrent.com/en/hardware/tt-loudbox), +[TT-QuietBox and TT-QuietBox 2](https://tenstorrent.com/en/hardware/tt-quietbox), +and [Tenstorrent Galaxy](https://tenstorrent.com/hardware/galaxy). 
+ +## Fleets + +Currently, Tenstorrent accelerators are supported via +[SSH fleets](../../concepts/fleets.md#ssh-fleets). + +=== "SSH fleets" + + To configure an SSH fleet, create a fleet configuration and list hostnames of the hosts along with the private key and username. + +
    + + ```yaml + type: fleet + name: tt-fleet + + ssh_config: + user: root + identity_file: ~/.ssh/id_rsa + hosts: + - 192.168.2.108 + ``` + +
    + + ??? info "Host requirements" + Before creating the fleet, make sure each host: + + - Has Docker installed. + - Has [Tenstorrent software](https://docs.tenstorrent.com/getting-started/README.html#software-installation) + installed, including the drivers and HugePages. + - Can be accessed by the configured SSH user with passwordless `sudo`. + - Runs an SSH server with `AllowTcpForwarding yes`. + - Allows SSH through the firewall; other external + inbound traffic should be blocked. + + If `placement` is set to `cluster`, the hosts must also be able to communicate with each other. + + To apply the fleet configuration, run: + +
    + + ```bash + $ dstack apply -f tt-fleet.dstack.yml + + FLEET RESOURCES PRICE STATUS CREATED + tt-fleet cpu=64 mem=566.1GB disk=749.6GB gpu=tt-galaxy-wh:12GB:32 $0 idle 18 sec ago + ``` + +
    + +## Inference + +Below is a [service](../../concepts/services.md) that deploys +[`gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b) on a +Tenstorrent Galaxy system using +[Tenstorrent Inference Server](https://github.com/tenstorrent/tt-inference-server). + +
    + +```yaml +type: service +name: gpt-oss-120b + +image: ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.12.0-805f43d-a45c614 + +env: + - HF_TOKEN + +commands: + - | + ulimit -n 65535 + /home/container_app_user/tt-metal/python_env/bin/python /home/container_app_user/app/src/run_vllm_api_server.py \ + --model gpt-oss-120b \ + --tt-device galaxy + +port: 8000 + +model: openai/gpt-oss-120b + +volumes: + # Cache model weights and TT runtime artifacts on the host. + - /mnt/data/gpt-oss-120b/cache_root:/home/container_app_user/cache_root + - /mnt/data/gpt-oss-120b/dot-cache:/home/container_app_user/.cache + +resources: + shm_size: 32GB + gpu: tt-galaxy-wh:32 +``` + +
    + +Go ahead and run the configuration using `dstack apply`: + +
    + +```bash +$ export HF_TOKEN= +$ dstack apply -f service.dstack.yml +``` + +
    + +Once the service is up, it will be available via the service endpoint +at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +$ curl http://127.0.0.1:3000/proxy/services/main/gpt-oss-120b/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "What is 17 + 25? Answer with just the number." + } + ], + "max_tokens": 128 + }' +``` + +
    + +The response includes both the final answer and the model's reasoning fields +(`reasoning` and `reasoning_content`). + +Additionally, the model is available via `dstack`'s control plane UI: + +![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-model-ui.png){ width=800 } + +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint +is available at `https://<run name>.<gateway domain>/`. + +## Training + +Below is a minimal [task](../../concepts/tasks.md) that runs a TT-XLA training +smoke test. + +
    + +```yaml +type: task +name: tt-xla-train + +image: ghcr.io/tenstorrent/tt-xla-slim:latest + +commands: + - | + python - <<'PY' + import jax + import jax.numpy as jnp + + devices = jax.devices("tt") + print("TT devices:", devices) + if not devices: + raise SystemExit("No Tenstorrent devices found by JAX") + + with jax.default_device(devices[0]): + params = { + "w": jnp.ones((32, 32), dtype=jnp.bfloat16), + "b": jnp.zeros((32,), dtype=jnp.bfloat16), + } + x = jnp.ones((32, 32), dtype=jnp.bfloat16) + y = jnp.zeros((32, 32), dtype=jnp.bfloat16) + + def loss_fn(params, x, y): + pred = x @ params["w"] + params["b"] + err = (pred - y).astype(jnp.float32) + return jnp.mean(err * err) + + @jax.jit + def train_step(params, x, y): + loss, grads = jax.value_and_grad(loss_fn)(params, x, y) + next_params = jax.tree_util.tree_map( + lambda p, g: p - jnp.asarray(0.01, dtype=p.dtype) * g.astype(p.dtype), + params, + grads, + ) + return next_params, loss + + for step in range(3): + params, loss = train_step(params, x, y) + loss.block_until_ready() + print(f"step={step} loss={float(jax.device_get(loss)):.6f}") + + print("tiny training smoke test passed") + PY + +resources: + gpu: tt-galaxy-wh:32 +``` + +
    + +For a single Wormhole PCIe card, use `gpu: n150:1`. + +??? info "Files and repos" + For longer commands, put the Python code in `train.py` next to + `tt-task.dstack.yml` and upload it with `files`: + +
    + + ```yaml + type: task + name: tt-xla-train + + image: ghcr.io/tenstorrent/tt-xla-slim:latest + + files: + - train.py + + commands: + - python train.py + + resources: + gpu: tt-galaxy-wh:32 + ``` + +
    + + If the script is part of a Git repository, use `repos` instead: + +
    + + ```yaml + working_dir: /workspace + + repos: + - .:/workspace + + commands: + - python train.py + ``` + +
    + + For more details, refer to [Files](../../concepts/tasks.md#files) and + [Repos](../../concepts/tasks.md#repos). + +## Dev environments + +Below is an example [dev environment](../../concepts/dev-environments.md) +configuration. It can be used to provision a dev environment that can be +accessed via your desktop IDE. + +
    + +```yaml +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode + +image: dstackai/tt-smi:latest + +# Can be `vscode` or `cursor` +ide: vscode + +resources: + gpu: tt-galaxy-wh:32 +``` + +
    + +If you run it via `dstack apply`, it will output the URL to access it via your desktop IDE. + +![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-vscode.png){ width=800 } + +## GPU specification + +`resources.gpu` uses the usual `name:count` format. For Tenstorrent, `count` +is the number of devices reported from the TT-SMI topology. On Galaxy systems, +this corresponds to chips. On PCIe systems, this is usually the card count, but +dual-chip cards can also be reported as per-chip devices. + +```yaml +resources: + gpu: tt-galaxy-wh:32 # Galaxy Wormhole, 32 chips + # gpu: tt-galaxy-bh:32 # Galaxy Blackhole, 32 chips + # gpu: n300:4 # TT-LoudBox or TT-QuietBox Wormhole, 4 n300 cards + # gpu: p150:4 # TT-QuietBox Blackhole, 4 p150 cards + # gpu: p300:64GB:2 # TT-QuietBox 2 Blackhole, 2 p300 cards + # gpu: p300:32GB:4 # TT-QuietBox 2 Blackhole, if exposed per chip +``` + +Use `tt:` only when the workload can run on any Tenstorrent device type. +Use a model name when placement depends on the hardware family: `n150` or +`n300` for Wormhole PCIe cards, `tt-galaxy-wh` for Galaxy Wormhole, `p100a`, +`p150`, or `p300` for Blackhole PCIe cards, and `tt-galaxy-bh` for Galaxy +Blackhole. + +## What's next? + +1. Check [Services](../../concepts/services.md), + [Tasks](../../concepts/tasks.md), [Dev environments](../../concepts/dev-environments.md), + and [SSH fleets](../../concepts/fleets.md#ssh-fleets). +2. Browse [Tenstorrent Inference Server](https://github.com/tenstorrent/tt-inference-server), + [TT-XLA](https://github.com/tenstorrent/tt-xla), and + [TT-Metalium](https://github.com/tenstorrent/tt-metal). + +??? info "Feedback" + Found a bug, or want to request a feature? 
File it in the [issue tracker](https://github.com/dstackai/dstack/issues), + or share via [Discord](https://discord.gg/u8SmfwPpMd). diff --git a/mkdocs/docs/examples/accelerators/tpu.md b/mkdocs/docs/examples/accelerators/tpu.md new file mode 100644 index 0000000000..8c4d1584bb --- /dev/null +++ b/mkdocs/docs/examples/accelerators/tpu.md @@ -0,0 +1,193 @@ +--- +title: TPU +description: Deploying and fine-tuning models on Google Cloud TPUs using Optimum TPU and vLLM +--- + +# TPU + +If you've configured the `gcp` backend in `dstack`, you can run dev environments, tasks, and services on [TPUs](https://cloud.google.com/tpu/docs/intro-to-tpu). +Choose a TPU instance by specifying the TPU version and the number of cores (e.g. `v5litepod-8`) in the `gpu` property under `resources`, +or request TPUs by specifying `tpu` as `vendor` ([see examples](../../guides/protips.md#gpu)). + +Below are a few examples on using TPUs for deployment and fine-tuning. + +!!! info "Multi-host TPUs" + Currently, `dstack` supports only single-host TPUs, which means that + the maximum supported number of cores is `8` (e.g. `v2-8`, `v3-8`, `v5litepod-8`, `v5p-8`, `v6e-8`). + Multi-host TPU support is on the roadmap. + +!!! info "TPU storage" + By default, each TPU VM contains a 100GB boot disk and its size cannot be changed. + If you need more storage, attach additional disks using [Volumes](../../concepts/volumes.md). + +## Deployment + +Many serving frameworks including vLLM and TGI have TPU support. +Here's an example of a [service](../../concepts/services.md) that deploys Llama 3.1 8B using +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://github.com/vllm-project/vllm). + +=== "Optimum TPU" + +
    + + ```yaml + type: service + name: llama31-service-optimum-tpu + + image: dstackai/optimum-tpu:llama31 + env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_TOTAL_TOKENS=4096 + - MAX_BATCH_PREFILL_TOKENS=4095 + commands: + - text-generation-launcher --port 8000 + port: 8000 + # Register the model + model: meta-llama/Meta-Llama-3.1-8B-Instruct + + resources: + gpu: v5litepod-4 + ``` +
    + + Note that for Optimum TPU `MAX_INPUT_TOKEN` is set to 4095 by default. We must also set `MAX_BATCH_PREFILL_TOKENS` to 4095. + + ??? info "Docker image" + The official Docker image `huggingface/optimum-tpu:latest` doesn’t support Llama 3.1-8B. + We’ve created a custom image with the fix: `dstackai/optimum-tpu:llama31`. + Once the [pull request](https://github.com/huggingface/optimum-tpu/pull/92) is merged, + the official Docker image can be used. + +=== "vLLM" +
    + + ```yaml + type: service + name: llama31-service-vllm-tpu + + env: + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - HF_TOKEN + - DATE=20240828 + - TORCH_VERSION=2.5.0 + - VLLM_TARGET_DEVICE=tpu + - MAX_MODEL_LEN=4096 + commands: + - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install -r requirements-tpu.txt + - apt-get install -y libopenblas-base libopenmpi-dev libomp-dev + - python setup.py develop + - vllm serve $MODEL_ID + --tensor-parallel-size 4 + --max-model-len $MAX_MODEL_LEN + --port 8000 + port: 8000 + # Register the model + model: meta-llama/Meta-Llama-3.1-8B-Instruct + + # Uncomment to leverage spot instances + #spot_policy: auto + + resources: + gpu: v5litepod-4 + ``` +
    + + Note, when using Llama 3.1 8B with a `v5litepod` which has 16GB memory per core, we must limit the context size to 4096 tokens to fit the memory. + +### Memory requirements + +Below are the approximate memory requirements for serving LLMs with the minimal required TPU configuration: + +| Model size | bfloat16 | TPU | int8 | TPU | +|------------|----------|--------------|-------|----------------| +| **8B** | 16GB | v5litepod-4 | 8GB | v5litepod-4 | +| **70B** | 140GB | v5litepod-16 | 70GB | v5litepod-16 | +| **405B** | 810GB | v5litepod-64 | 405GB | v5litepod-64 | + +Note, `v5litepod` is optimized for serving transformer-based models. Each core is equipped with 16GB of memory. + +### Supported frameworks + +| Framework | Quantization | Note | +|-----------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **TGI** | bfloat16 | To deploy with TGI, Optimum TPU must be used. | +| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request](https://github.com/vllm-project/vllm/pull/7005) for more details. | + +### Running a configuration + +Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the +cloud resources and run the configuration. + +## Fine-tuning with Optimum TPU + +Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU](https://github.com/huggingface/optimum-tpu) +and the [`Abirate/english_quotes`](https://huggingface.co/datasets/Abirate/english_quotes) +dataset. + +
    + +```yaml +type: task +name: optimum-tpu-llama-train + +python: "3.11" +env: + - HF_TOKEN +files: + - train.py + - config.yaml +commands: + - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git + - mkdir -p optimum-tpu/examples/custom/ + - cp train.py optimum-tpu/examples/custom/train.py + - cp config.yaml optimum-tpu/examples/custom/config.yaml + - cd optimum-tpu + - pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install datasets evaluate + - pip install accelerate -U + - pip install peft + - python examples/custom/train.py examples/custom/config.yaml + +resources: + gpu: v5litepod-8 +``` + +
    + +[//]: # (### Fine-Tuning with TRL) +[//]: # (Use the example `examples/single-node-training/optimum-tpu/gemma/train.dstack.yml` to Finetune `Gemma-2B` model using `trl` with `dstack` and `optimum-tpu`. ) + +### Memory requirements + +Below are the approximate memory requirements for fine-tuning LLMs with the minimal required TPU configuration: + +| Model size | LoRA | TPU | +|------------|-------|--------------| +| **8B** | 16GB | v5litepod-8 | +| **70B** | 160GB | v5litepod-16 | +| **405B** | 950GB | v5litepod-64 | + +Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each core is equipped with 16GB of memory. + +### Supported frameworks + +| Framework | Quantization | Note | +|-----------------|--------------|---------------------------------------------------------------------------------------------------| +| **TRL** | bfloat16 | To fine-tune using TRL, Optimum TPU is recommended. TRL doesn't support Llama 3.1 out of the box. | +| **Pytorch XLA** | bfloat16 | | + +## What's next? + +1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), + [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and + [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). +2. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md). 
diff --git a/mkdocs/docs/examples/clusters/aws.md b/mkdocs/docs/examples/clusters/aws.md new file mode 100644 index 0000000000..b35e59d6c7 --- /dev/null +++ b/mkdocs/docs/examples/clusters/aws.md @@ -0,0 +1,196 @@ +--- +title: AWS +description: High-performance distributed training on AWS using Elastic Fabric Adapter (EFA) +--- + +# AWS + +In this guide, we'll walk through how to run high-performance distributed training on AWS using [Amazon Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/) with `dstack`. + +## Overview + +EFA is a network interface for Amazon EC2 that enables low-latency, high-bandwidth inter-node communication — essential for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when you create fleets with supported instance types. + +## Prerequisite + +Before you start, make sure the `aws` backend is properly configured. + +
    + +```yaml +projects: +- name: main + backends: + - type: aws + creds: + type: default + regions: ["us-west-2"] +``` + +
    + +!!! info "VPC" + If you use a custom VPC, verify that it permits all internal traffic between nodes for EFA to function properly. + +## Create a fleet + +Once your backend is ready, define a fleet configuration. + +
    + + ```yaml + type: fleet + name: efa-fleet + + nodes: 2 + placement: cluster + + resources: + gpu: H100:8 + ``` + +
    + +Provision the fleet with `dstack apply`: + +
    + +```shell +$ dstack apply -f efa-fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE BACKEND INSTANCE TYPE GPU PRICE STATUS CREATED + efa-fleet 0 aws (us-west-2) p5.48xlarge H100:8:80GB $98.32 idle 3 mins ago + 1 aws (us-west-2) p5.48xlarge H100:8:80GB $98.32 idle 3 mins ago +``` + +
    + +??? info "Instance types" + `dstack` selects suitable instances automatically, but not + [all types support EFA](https://aws.amazon.com/hpc/efa/). + To enforce EFA, you can specify `instance_types` explicitly: + + ```yaml + type: fleet + name: efa-fleet + + nodes: 2 + placement: cluster + + resources: + gpu: L4 + + instance_types: ["g6.8xlarge"] # If not specified, g6.xlarge is used (won't have EFA) + ``` + +## Run NCCL tests + +To confirm that EFA is working, run NCCL tests: + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 + +startup_order: workers-first +stop_criteria: master-done + +env: + - NCCL_DEBUG=INFO +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + +resources: + gpu: 1..8 + shm_size: 16GB +``` + +
    + +Run it with `dstack apply`: + +
    + +```shell +$ dstack apply -f nccl-tests.dstack.yml + +Provisioning... +---> 100% +``` + +
    + +!!! info "Docker image" + You can use your own container by setting `image`. If omitted, `dstack` uses its default image with drivers, NCCL tests, and tools pre-installed. + +## Run distributed training + +Here’s an example using `torchrun` for a simple multi-node PyTorch job: + +
    + +```yaml +type: task +name: train-distrib + +nodes: 2 + +python: 3.12 +env: + - NCCL_DEBUG=INFO +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - uv pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + multinode.py 50 10 + +resources: + gpu: 1..8 + shm_size: 16GB +``` + +
    + +Provision and launch it via `dstack apply`. + +
    + +```shell +$ dstack apply -f train-distrib.dstack.yml + +Provisioning... +---> 100% +``` + +
    + +Instead of setting `python`, you can specify your own Docker image using `image`. Make sure that the image is properly configured for EFA. + +!!! info "What's next" + 1. Learn more about [distributed tasks](../../concepts/tasks.md#distributed-tasks) and [cluster placement](../../concepts/fleets.md#cluster-placement) + 2. Check [dev environments](../../concepts/dev-environments.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) diff --git a/mkdocs/docs/examples/clusters/crusoe.md b/mkdocs/docs/examples/clusters/crusoe.md new file mode 100644 index 0000000000..28901a8e3c --- /dev/null +++ b/mkdocs/docs/examples/clusters/crusoe.md @@ -0,0 +1,280 @@ +--- +title: Crusoe +description: Using Crusoe clusters with InfiniBand support via VMs or Kubernetes +--- + +# Crusoe + +`dstack` allows using Crusoe clusters with fast interconnect via two ways: + +* [VMs](#vms) – If you configure a `crusoe` backend in `dstack` by providing your Crusoe credentials, `dstack` lets you fully provision and use clusters through `dstack`. +* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Crusoe and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. + +## VMs + +Since `dstack` offers a VM-based backend that natively integrates with Crusoe, you only need to provide your Crusoe credentials to `dstack`, and it will allow you to fully provision and use clusters on Crusoe through `dstack`. + +### Configure a backend + +Log into your [Crusoe](https://console.crusoecloud.com/) console, create an API key under your account settings, and note your project ID. + +
    + +```yaml +projects: +- name: main + backends: + - type: crusoe + project_id: your-project-id + creds: + type: access_key + access_key: your-access-key + secret_key: your-secret-key +``` + +
    + +### Create a fleet + +Once the backend is configured, you can create a fleet: + +
    + +```yaml +type: fleet +name: crusoe-fleet + +nodes: 2 +placement: cluster + +backends: [crusoe] + +resources: + gpu: A100:80GB:8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f crusoe-fleet.dstack.yml +``` + +
    + +This will automatically create an IB partition and provision instances with InfiniBand networking. + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +> If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. + +## Kubernetes + +### Create a cluster + +1. Go to `Networking` → `Firewall Rules`, click `Create Firewall Rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. +2. Go to `Orchestration` and click `Create Cluster`. Make sure to enable the `NVIDIA GPU Operator` add-on. +3. Go to the cluster, and click `Create Node Pool`. Select the right type of the instance, and `Desired Number of Nodes`. +4. Wait until nodes are provisioned. + +> Even if you enable `autoscaling`, `dstack` can use only the nodes that are already provisioned. + +### Configure the backend + +Follow the standard instructions for setting up a [`kubernetes`](../../concepts/backends.md#kubernetes) backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: kubernetes + kubeconfig: + filename: + proxy_jump: + port: 30022 +``` + +
    + +### Create a fleet + +Once the Crusoe Managed Kubernetes cluster and the `dstack` server are running, you can create a fleet: + +
    + +```yaml +type: fleet +name: crusoe-fleet + +placement: cluster +nodes: 0.. + +backends: [kubernetes] + +resources: + # Specify requirements to filter nodes + gpu: 8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f crusoe-fleet.dstack.yml +``` + +
    + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +## NCCL tests + +Use a [distributed task](../../concepts/tasks.md#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. + +=== "VMs" + + With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](../../concepts/volumes.md#instance-volumes). + +
    + + ```yaml + type: task + name: nccl-tests + + nodes: 2 + startup_order: workers-first + stop_criteria: master-done + + volumes: + - /opt/hpcx:/opt/hpcx + - /etc/crusoe/nccl_topo:/etc/crusoe/nccl_topo + + commands: + - . /opt/hpcx/hpcx-init.sh + - hpcx_load + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + -mca btl tcp,self \ + -mca coll_hcoll_enable 0 \ + -x PATH \ + -x LD_LIBRARY_PATH \ + -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ + -x NCCL_SOCKET_NTHREADS=4 \ + -x NCCL_NSOCKS_PERTHREAD=8 \ + -x NCCL_TOPO_FILE=/etc/crusoe/nccl_topo/a100-80gb-sxm-ib-cloud-hypervisor.xml \ + -x NCCL_IB_MERGE_VFS=0 \ + -x NCCL_IB_HCA=^mlx5_0:1 \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 + else + sleep infinity + fi + + backends: [crusoe] + + resources: + gpu: A100:80GB:8 + shm_size: 16GB + ``` + +
    + + > Update `NCCL_TOPO_FILE` to match your instance type. Topology files for all supported types are available at `/etc/crusoe/nccl_topo/` on the host. + +=== "Kubernetes" + + If you're running on Crusoe Managed Kubernetes, make sure to install HPC-X and provide an up-to-date topology file. + +
    + + ```yaml + type: task + name: nccl-tests + + nodes: 2 + startup_order: workers-first + stop_criteria: master-done + + commands: + # Install NCCL topology files + - curl -sSL https://gist.github.com/un-def/48df8eea222fa9547ad4441986eb15af/archive/df51d56285c5396a0e82bb42f4f970e7bb0a9b65.tar.gz -o nccl_topo.tar.gz + - mkdir -p /etc/crusoe/nccl_topo + - tar -C /etc/crusoe/nccl_topo -xf nccl_topo.tar.gz --strip-components=1 + # Install and initialize HPC-X + - curl -sSL https://content.mellanox.com/hpc/hpc-x/v2.21.3/hpcx-v2.21.3-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz -o hpcx.tar.bz + - mkdir -p /opt/hpcx + - tar -C /opt/hpcx -xf hpcx.tar.bz --strip-components=1 --checkpoint=10000 + - . /opt/hpcx/hpcx-init.sh + - hpcx_load + # Run NCCL Tests + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + -mca btl tcp,self \ + -mca coll_hcoll_enable 0 \ + -x PATH \ + -x LD_LIBRARY_PATH \ + -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ + -x NCCL_SOCKET_NTHREADS=4 \ + -x NCCL_NSOCKS_PERTHREAD=8 \ + -x NCCL_TOPO_FILE=/etc/crusoe/nccl_topo/a100-80gb-sxm-ib-cloud-hypervisor.xml \ + -x NCCL_IB_MERGE_VFS=0 \ + -x NCCL_IB_AR_THRESHOLD=0 \ + -x NCCL_IB_PCI_RELAXED_ORDERING=1 \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=2 \ + -x NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 \ + -x UCX_NET_DEVICES=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 + else + sleep infinity + fi + + # Required for IB + privileged: true + + resources: + gpu: A100:8 + shm_size: 16GB + ``` + +
    + + > The task above downloads an A100 topology file from a Gist. The most reliable way to obtain the latest topology is to copy it from a Crusoe-provisioned VM (see [VMs](#vms)). + + ??? info "Privileged" + When running on Crusoe Managed Kubernetes, set `privileged` to `true` to ensure access to InfiniBand. + +Pass the configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f crusoe-nccl-tests.dstack.yml +``` + +
    + +## What's next + +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Check out [backends](../../concepts/backends.md#crusoe-cloud) and [fleets](../../concepts/fleets.md#cloud-fleets) +3. Check the docs on [Crusoe's networking](https://docs.crusoecloud.com/networking/infiniband/) and ["Crusoe Managed" Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) diff --git a/mkdocs/docs/examples/clusters/gcp.md b/mkdocs/docs/examples/clusters/gcp.md new file mode 100644 index 0000000000..eb9ddef0c2 --- /dev/null +++ b/mkdocs/docs/examples/clusters/gcp.md @@ -0,0 +1,582 @@ +--- +title: GCP +description: Creating and using GPU clusters on GCP with GPUDirect-TCPX and RoCE support +--- + +# GCP + +This example shows how to create and use clusters on GCP. + +`dstack` supports the following instance types: + +| Instance type | GPU | Maximum bandwidth | Fabric | +| ------------- | ------ | ----------------- | ---------------------------------------------------------------------------------------------------------------- | +| **A3 Edge** | H100:8 | 0.8 Tbps | [GPUDirect-TCPX](https://cloud.google.com/compute/docs/gpus/gpudirect) | +| **A3 High** | H100:8 | 1 Tbps | [GPUDirect-TCPX](https://cloud.google.com/compute/docs/gpus/gpudirect) | +| **A3 Mega** | H100:8 | 1.8 Tbps | [GPUDirect-TCPXO](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) | +| **A4** | B200:8 | 3.2 Tbps | RoCE | + +## Configure the backend + +Despite hiding most of the complexity, `dstack` still requires instance-specific backend configuration: + +=== "A4" + A4 requires one `extra_vpcs` for inter-node traffic (regular VPC, one subnet) and one `roce_vpcs` for GPU-to-GPU communication (RoCE profile, eight subnets). + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + # Specify your GCP project ID + project_id: + + extra_vpcs: [dstack-gvnic-net-1] + roce_vpcs: [dstack-mrdma] + + # Specify the regions you intend to use + regions: [us-west2] + + creds: + type: default + ``` + +
    + +

    Create extra and RoCE VPCs

    + + See GCP's [RoCE network setup guide](https://cloud.google.com/ai-hypercomputer/docs/create/create-vm#setup-network) for the commands to create + VPCs and firewall rules. + + Ensure VPCs allow internal traffic between nodes for MPI/NCCL to function. + +=== "A3 Mega" + A3 Mega requires at least 8 `extra_vpcs` for data NICs. + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + # Specify your GCP project ID + project_id: + + extra_vpcs: + - dstack-gpu-data-net-1 + - dstack-gpu-data-net-2 + - dstack-gpu-data-net-3 + - dstack-gpu-data-net-4 + - dstack-gpu-data-net-5 + - dstack-gpu-data-net-6 + - dstack-gpu-data-net-7 + - dstack-gpu-data-net-8 + + # Specify the regions you intend to use + regions: [europe-west4] + + creds: + type: default + ``` + +
    + +

    Create extra VPCs

    + + Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: + + ```shell + # Specify the region where you intend to deploy the cluster + REGION="europe-west4" + + for N in $(seq 1 8); do + gcloud compute networks create dstack-gpu-data-net-$N \ + --subnet-mode=custom \ + --mtu=8244 + + gcloud compute networks subnets create dstack-gpu-data-sub-$N \ + --network=dstack-gpu-data-net-$N \ + --region=$REGION \ + --range=192.168.$N.0/24 + + gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ + --network=dstack-gpu-data-net-$N \ + --action=ALLOW \ + --rules=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=192.168.0.0/16 + done + ``` + +=== "A3 High/Edge" + A3 Edge/High require at least 4 `extra_vpcs` for data NICs and a `vm_service_account` authorized to pull GPUDirect Docker images. + +
    + + ```yaml + projects: + - name: main + backends: + - type: gcp + # Specify your GCP project ID + project_id: + + extra_vpcs: + - dstack-gpu-data-net-1 + - dstack-gpu-data-net-2 + - dstack-gpu-data-net-3 + - dstack-gpu-data-net-4 + + # Specify the regions you intend to use + regions: [europe-west4] + + # Specify your GCP project ID + vm_service_account: a3cluster-sa@$.iam.gserviceaccount.com + + creds: + type: default + ``` + +
    + +

    Create extra VPCs

    + + Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: + + ```shell + # Specify the region where you intend to deploy the cluster + REGION="europe-west4" + + for N in $(seq 1 4); do + gcloud compute networks create dstack-gpu-data-net-$N \ + --subnet-mode=custom \ + --mtu=8244 + + gcloud compute networks subnets create dstack-gpu-data-sub-$N \ + --network=dstack-gpu-data-net-$N \ + --region=$REGION \ + --range=192.168.$N.0/24 + + gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ + --network=dstack-gpu-data-net-$N \ + --action=ALLOW \ + --rules=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=192.168.0.0/16 + done + ``` + +

    Create a service account

    + + Create a VM service account that allows VMs to access the `pkg.dev` registry: + + ```shell + PROJECT_ID=$(gcloud config get-value project) + + gcloud iam service-accounts create a3cluster-sa \ + --display-name "Service Account for pulling GCR images" + + gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:a3cluster-sa@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role="roles/artifactregistry.reader" + ``` + +!!! info "Default VPC" + If you set a non-default `vpc_name` in the backend configuration, ensure it allows all inter-node traffic. This is required for MPI and NCCL. The default VPC already allows this. + +## Create a fleet + +Once you've configured the `gcp` backend, create the fleet configuration: + +=== "A4" + +
    + + ```yaml + type: fleet + name: a4-fleet + + placement: cluster + # Can be a range or a fixed number + nodes: 2 + + # Specify the zone where you have configured the RoCE VPC + availability_zones: [us-west2-c] + + backends: [gcp] + + # Uncomment to allow spot instances + #spot_policy: auto + + resources: + gpu: B200:8 + ``` + +
    + + Then apply it with `dstack apply`: + +
    + + ```shell + $ dstack apply -f a4-fleet.dstack.yml + + Provisioning... + ---> 100% + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + a4-fleet 0 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago + 1 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago + ``` + +
    + +=== "A3 Mega" + +
    + + ```yaml + type: fleet + name: a3mega-fleet + + placement: cluster + # Can be a range or a fixed number + nodes: 2 + + instance_types: + - a3-megagpu-8g + + # Uncomment to allow spot instances + #spot_policy: auto + ``` +
    + + Pass the configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f a3mega-fleet.dstack.yml + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + a3mega-fleet 1 gcp (europe-west4) H100:80GB:8 $22.1525 (spot) idle 9 mins ago + a3mega-fleet 2 gcp (europe-west4) H100:80GB:8 $64.2718 idle 9 mins ago + + Create the fleet? [y/n]: y + + Provisioning... + ---> 100% + ``` + +
    + +=== "A3 High/Edge" + +
    + + ```yaml + type: fleet + name: a3high-fleet + + placement: cluster + nodes: 2 + + instance_types: + - a3-highgpu-8g + + # Uncomment to allow spot instances + #spot_policy: auto + ``` + +
    + + Pass the configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f a3high-fleet.dstack.yml + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + a3high-fleet 1 gcp (europe-west4) H100:80GB:8 $20.5688 (spot) idle 9 mins ago + a3high-fleet 2 gcp (europe-west4) H100:80GB:8 $58.5419 idle 9 mins ago + + Create the fleet? [y/n]: y + + Provisioning... + ---> 100% + ``` + +
    + +Once the fleet is created, you can run distributed tasks, in addition to dev environments, services, and regular tasks. + +## Run tasks + +### NCCL tests + +Use a distributed task that runs NCCL tests to validate cluster network bandwidth. + +=== "A4" + Pass the configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f nccl-tests.dstack.yml + + Provisioning... + ---> 100% + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 + size count type redop root time algbw busbw wrong time algbw busbw wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 2097152 float sum -1 156.9 53.47 100.25 0 167.6 50.06 93.86 0 + 16777216 4194304 float sum -1 196.3 85.49 160.29 0 206.2 81.37 152.57 0 + 33554432 8388608 float sum -1 258.5 129.82 243.42 0 261.8 128.18 240.33 0 + 67108864 16777216 float sum -1 369.4 181.69 340.67 0 371.2 180.79 338.98 0 + 134217728 33554432 float sum -1 638.5 210.22 394.17 0 587.2 228.57 428.56 0 + 268435456 67108864 float sum -1 940.3 285.49 535.29 0 950.7 282.36 529.43 0 + 536870912 134217728 float sum -1 1695.2 316.70 593.81 0 1666.9 322.08 603.89 0 + 1073741824 268435456 float sum -1 3229.9 332.44 623.33 0 3201.8 335.35 628.78 0 + 2147483648 536870912 float sum -1 6107.7 351.61 659.26 0 6157.1 348.78 653.97 0 + 4294967296 1073741824 float sum -1 11952 359.36 673.79 0 11942 359.65 674.34 0 + 8589934592 2147483648 float sum -1 23563 364.55 683.52 0 23702 362.42 679.54 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 165.789 + ``` + +
    + +=== "A3 Mega" + +
    + + ```yaml + type: task + name: nccl-tests + nodes: 2 + image: nvcr.io/nvidia/pytorch:24.04-py3 + entrypoint: "bash -c" # Need to use bash instead of default dash for nccl-env-profile.sh + commands: + - | + # Setup TCPXO NCCL env variables + NCCL_LIB_DIR="/var/lib/tcpxo/lib64" + source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices" + export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" + # Build NCCL Tests + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 CC=mpicc CXX=mpicxx make -j + cd build + # We use FIFO for inter-node communication + FIFO=/tmp/dstack_job + if [ ${DSTACK_NODE_RANK} -eq 0 ]; then + sleep 10 + echo "${DSTACK_NODES_IPS}" > hostfile + MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' + # Wait for other nodes + while true; do + if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then + break + fi + echo 'Waiting for nodes...' + sleep 5 + done + # Run NCCL Tests + ${MPIRUN} \ + -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ + $(env | awk -F= '{print "-x", $1}' | xargs) \ + ./all_gather_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 200 -c 0; + # Notify nodes the job is done + ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" + else + mkfifo ${FIFO} + # Wait for a message from the first node + cat ${FIFO} + fi + spot_policy: auto + resources: + shm_size: 16GB + ``` + +
    + + Pass the configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f nccl-tests.dstack.yml + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 131072 float none -1 166.6 50.34 47.19 N/A 164.1 51.11 47.92 N/A + 16777216 262144 float none -1 204.6 82.01 76.89 N/A 203.8 82.30 77.16 N/A + 33554432 524288 float none -1 284.0 118.17 110.78 N/A 281.7 119.12 111.67 N/A + 67108864 1048576 float none -1 447.4 150.00 140.62 N/A 443.5 151.31 141.86 N/A + 134217728 2097152 float none -1 808.3 166.05 155.67 N/A 801.9 167.38 156.92 N/A + 268435456 4194304 float none -1 1522.1 176.36 165.34 N/A 1518.7 176.76 165.71 N/A + 536870912 8388608 float none -1 2892.3 185.62 174.02 N/A 2894.4 185.49 173.89 N/A + 1073741824 16777216 float none -1 5532.7 194.07 181.94 N/A 5530.7 194.14 182.01 N/A + 2147483648 33554432 float none -1 10863 197.69 185.34 N/A 10837 198.17 185.78 N/A + 4294967296 67108864 float none -1 21481 199.94 187.45 N/A 21466 200.08 187.58 N/A + 8589934592 134217728 float none -1 42713 201.11 188.54 N/A 42701 201.16 188.59 N/A + Out of bounds values : 0 OK + Avg bus bandwidth : 146.948 + ``` + +
    + +=== "A3 High/Edge" + +
    + + ```yaml + type: task + name: nccl-tests + nodes: 2 + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx + commands: + - | + export NCCL_DEBUG=INFO + export LD_LIBRARY_PATH=/usr/local/tcpx/lib64:$LD_LIBRARY_PATH + # We use FIFO for inter-node communication + FIFO=/tmp/dstack_job + if [ ${DSTACK_NODE_RANK} -eq 0 ]; then + mkdir -p /scripts/hostfiles2 + : > /scripts/hostfiles2/hostfile8 + for ip in ${DSTACK_NODES_IPS}; do + echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> /scripts/hostfiles2/hostfile8 + done + MPIRUN='mpirun --allow-run-as-root --hostfile /scripts/hostfiles2/hostfile8' + # Wait for other nodes + while true; do + if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then + break + fi + echo 'Waiting for nodes...' + sleep 5 + done + # Run NCCL Tests + NCCL_GPUDIRECTTCPX_FORCE_ACK=0 /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 8M 8GB 2 + # Notify nodes the job is done + ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" + else + mkfifo ${FIFO} + # Wait for a message from the first node + cat ${FIFO} + fi + spot_policy: auto + resources: + shm_size: 16GB + ``` + +
    + + Pass the configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f nccl-tests.dstack.yml + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 131072 float none -1 784.9 10.69 10.02 0 775.9 10.81 10.14 0 + 16777216 262144 float none -1 1010.3 16.61 15.57 0 999.3 16.79 15.74 0 + 33554432 524288 float none -1 1161.6 28.89 27.08 0 1152.9 29.10 27.28 0 + 67108864 1048576 float none -1 1432.6 46.84 43.92 0 1437.8 46.67 43.76 0 + 134217728 2097152 float none -1 2516.9 53.33 49.99 0 2491.7 53.87 50.50 0 + 268435456 4194304 float none -1 5066.8 52.98 49.67 0 5131.4 52.31 49.04 0 + 536870912 8388608 float none -1 10028 53.54 50.19 0 10149 52.90 49.60 0 + 1073741824 16777216 float none -1 20431 52.55 49.27 0 20214 53.12 49.80 0 + 2147483648 33554432 float none -1 40254 53.35 50.01 0 39923 53.79 50.43 0 + 4294967296 67108864 float none -1 80896 53.09 49.77 0 78875 54.45 51.05 0 + 8589934592 134217728 float none -1 160505 53.52 50.17 0 160117 53.65 50.29 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 40.6043 + ``` + +
    + +### Distributed training + +=== "A4" + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A4 instances. + +=== "A3 Mega" + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPXO, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + + ```shell + # ... + + commands: + - | + NCCL_LIB_DIR="/var/lib/tcpxo/lib64" + source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices" + export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" + + # ... + ``` + +=== "A3 High/Edge" + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + + ```shell + # ...
+ + commands: + - | + export NCCL_DEBUG=INFO + NCCL_LIB_DIR="/usr/local/tcpx/lib64" + export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" + export NCCL_SOCKET_IFNAME=eth0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export NCCL_BUFFSIZE=4194304 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000 + export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx" + + # ... + ``` + +In addition to distributed training, you can of course run regular tasks, dev environments, and services. + +## What's next + +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) +3. 
Check GCP's docs on using [A4](https://docs.cloud.google.com/compute/docs/gpus/create-gpu-vm-a3u-a4) and [A3 Mega/High/Edge](https://docs.cloud.google.com/compute/docs/gpus/gpudirect) instances diff --git a/mkdocs/docs/examples/clusters/lambda.md b/mkdocs/docs/examples/clusters/lambda.md new file mode 100644 index 0000000000..1ebe35ce76 --- /dev/null +++ b/mkdocs/docs/examples/clusters/lambda.md @@ -0,0 +1,218 @@ +--- +title: Lambda +description: Setting up Lambda clusters using Kubernetes or 1-Click Clusters with fast interconnect +--- + +# Lambda + +`dstack` allows you to use Lambda clusters with fast interconnect in two ways: + +* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Lambda and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. +* [1-Click Clusters](#1-click-clusters) – If you create a 1CC cluster on Lambda and create an SSH fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. + +## Kubernetes + +### Prerequisites + +1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) on accessing MK8s. +2. Go to `Firewall` → `Edit rules`, click `Add rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. + +### Configure the backend + +Follow the standard instructions for setting up a [Kubernetes](../../concepts/backends.md#kubernetes) backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: kubernetes + kubeconfig: + filename: <kubeconfig path> + proxy_jump: + port: 30022 +``` + +
    + +### Create a fleet + +Once the Kubernetes cluster and the `dstack` server are running, you can create a fleet: + +
    + +```yaml +type: fleet +name: lambda-fleet + +placement: cluster +nodes: 0.. + +backends: [kubernetes] + +resources: + # Specify requirements to filter nodes + gpu: 1..8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f lambda-fleet.dstack.yml +``` + +
    + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +## 1-Click Clusters + +Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](../../concepts/backends.md#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](../../concepts/fleets.md). + +### Prerequisites + +1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/) on working with 1-Click Clusters. + +### Create a fleet + +Follow the standard instructions for setting up an [SSH fleet](../../concepts/fleets.md#ssh-fleets): + +
    + +```yaml +type: fleet +name: lambda-fleet + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - worker-gpu-8x-b200-rplfm-ll9nr + - worker-gpu-8x-b200-rplfm-qrcs9 + proxy_jump: + hostname: 192.222.55.54 + user: ubuntu + identity_file: ~/.ssh/id_rsa + +placement: cluster +``` + +
    + +> Under `proxy_jump`, we specify the hostname of the head node along with the private SSH key. + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f lambda-fleet.dstack.yml +``` + +
    + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +## Run tasks + +To run tasks on a cluster, you must use [distributed tasks](../../concepts/tasks.md#distributed-tasks). + +### Run NCCL tests + +To validate cluster network bandwidth, use the following task: + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 +startup_order: workers-first +stop_criteria: master-done + +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + -x NCCL_IB_HCA=^mlx5_0 \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 + else + sleep infinity + fi + +# Uncomment if the `kubernetes` backend requires it for `/dev/infiniband` access +#privileged: true + +resources: + gpu: nvidia:B200:8 + shm_size: 16GB +``` + +
    + +Pass the configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f lambda-nccl-tests.dstack.yml + +Provisioning... +---> 100% + + nccl-tests version 2.17.6 nccl-headers=22602 nccl-library=22602 + Collective test starting: all_reduce_perf + + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 36.50 0.00 0.00 0 36.16 0.00 0.00 0 + 16 4 float sum -1 35.55 0.00 0.00 0 35.49 0.00 0.00 0 + 32 8 float sum -1 35.49 0.00 0.00 0 36.28 0.00 0.00 0 + 64 16 float sum -1 35.85 0.00 0.00 0 35.54 0.00 0.00 0 + 128 32 float sum -1 37.36 0.00 0.01 0 36.82 0.00 0.01 0 + 256 64 float sum -1 37.38 0.01 0.01 0 37.80 0.01 0.01 0 + 512 128 float sum -1 51.05 0.01 0.02 0 37.17 0.01 0.03 0 + 1024 256 float sum -1 45.33 0.02 0.04 0 37.98 0.03 0.05 0 + 2048 512 float sum -1 38.67 0.05 0.10 0 38.30 0.05 0.10 0 + 4096 1024 float sum -1 40.08 0.10 0.19 0 39.18 0.10 0.20 0 + 8192 2048 float sum -1 42.13 0.19 0.36 0 41.47 0.20 0.37 0 + 16384 4096 float sum -1 43.66 0.38 0.70 0 41.94 0.39 0.73 0 + 32768 8192 float sum -1 45.42 0.72 1.35 0 43.29 0.76 1.42 0 + 65536 16384 float sum -1 44.59 1.47 2.76 0 43.90 1.49 2.80 0 + 131072 32768 float sum -1 47.44 2.76 5.18 0 46.79 2.80 5.25 0 + 262144 65536 float sum -1 66.68 3.93 7.37 0 65.36 4.01 7.52 0 + 524288 131072 float sum -1 240.71 2.18 4.08 0 125.73 4.17 7.82 0 + 1048576 262144 float sum -1 115.58 9.07 17.01 0 115.48 9.08 17.03 0 + 2097152 524288 float sum -1 114.44 18.33 34.36 0 114.27 18.35 34.41 0 + 4194304 1048576 float sum -1 118.25 35.47 66.50 0 117.11 35.82 67.15 0 + 8388608 2097152 float sum -1 141.39 59.33 111.24 0 134.95 62.16 116.55 0 + 16777216 4194304 float sum -1 186.86 89.78 168.34 0 184.39 90.99 170.60 0 + 33554432 8388608 float sum -1 255.79 131.18 245.96 0 253.88 132.16 247.81 0 + 67108864 16777216 float sum -1 350.41 191.52 359.09 0 350.71 191.35 358.79 0 + 134217728 33554432 float sum -1 596.75 224.92 421.72 0 595.37 225.44 422.69 0 + 268435456 67108864 float sum -1 
934.67 287.20 538.50 0 931.37 288.22 540.41 0 + 536870912 134217728 float sum -1 1625.63 330.25 619.23 0 1687.31 318.18 596.59 0 + 1073741824 268435456 float sum -1 2972.25 361.26 677.35 0 2971.33 361.37 677.56 0 + 2147483648 536870912 float sum -1 5784.75 371.23 696.06 0 5728.40 374.88 702.91 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 137.179 +``` + +
    + +## What's next + +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Read about the [Kubernetes backend](../../concepts/backends.md#kubernetes) and [cluster placement](../../concepts/fleets.md#cluster-placement) +3. Check Lambda's docs on [Kubernetes](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) and [1CC](https://docs.lambda.ai/public-cloud/1-click-clusters/) diff --git a/mkdocs/docs/examples/clusters/nccl-rccl-tests.md b/mkdocs/docs/examples/clusters/nccl-rccl-tests.md new file mode 100644 index 0000000000..775e2c8b88 --- /dev/null +++ b/mkdocs/docs/examples/clusters/nccl-rccl-tests.md @@ -0,0 +1,143 @@ +--- +title: NCCL/RCCL tests +description: Running NCCL and RCCL tests to validate cluster network bandwidth +--- + +# NCCL/RCCL tests + +This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](../../concepts/tasks.md#distributed-tasks). + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). + +## Running as a task + +Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPUs (8 processes in total). + +=== "NCCL tests" + +
    + + ```yaml + type: task + name: nccl-tests + + nodes: 2 + + startup_order: workers-first + stop_criteria: master-done + + env: + - NCCL_DEBUG=INFO + commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + + # Uncomment if the `kubernetes` backend requires it for `/dev/infiniband` access + #privileged: true + + resources: + gpu: nvidia:1..8 + shm_size: 16GB + ``` + +
    + + !!! info "Default image" + If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with + `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). + +=== "RCCL tests" + +
    + + ```yaml + type: task + name: rccl-tests + + nodes: 2 + startup_order: workers-first + stop_criteria: master-done + + # Mount the system libraries folder from the host + volumes: + - /usr/local/lib:/mnt/lib + + image: rocm/dev-ubuntu-22.04:6.4-complete + env: + - NCCL_DEBUG=INFO + - OPEN_MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi + commands: + # Setup MPI and build RCCL tests + - apt-get install -y git libopenmpi-dev openmpi-bin + - git clone https://github.com/ROCm/rccl-tests.git + - cd rccl-tests + - make MPI=1 MPI_HOME=$OPEN_MPI_HOME + + # Preload the RoCE driver library from the host (for Broadcom driver compatibility) + - export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so + + # Run RCCL tests via MPI + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --mca btl_tcp_if_include ens41np0 \ + -x LD_PRELOAD \ + -x NCCL_IB_HCA=mlx5_0/1,bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_DISABLE=0 \ + ./build/all_reduce_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 20 -c 0; + else + sleep infinity + fi + + resources: + gpu: MI300X:8 + ``` + +
    + + !!! info "RoCE library" + RCCL tests use the RDMA/RoCE interconnect for internode communication. To use the RDMA/RoCE interconnect on Broadcom `bnxt_re` devices, RCCL requires the Broadcom-specific userspace provider library `libbnxt_re-rdmav34.so` to be available inside the container at `/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so`. We make this library available by mounting it from the host and using `LD_PRELOAD` when running MPI. + + Alternatively, you can avoid `LD_PRELOAD` and directly mount `/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so` if you use a custom image with OpenMPI pre-installed. + +!!! info "Privileged" + In some cases, the backend (e.g., `kubernetes`) may require `privileged: true` to access the high-speed interconnect (e.g., InfiniBand). + +### Apply a configuration + +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f nccl-tests.dstack.yml + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 aws us-east-1 g4dn.12xlarge 48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk) no $3.912 + 2 aws us-west-2 g4dn.12xlarge 48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk) no $3.912 + 3 aws us-east-2 g4dn.12xlarge 48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk) no $3.912 + +Submit the run nccl-tests? [y/n]: y +``` + +
    + +## What's next? + +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md). diff --git a/mkdocs/docs/examples/clusters/nebius.md b/mkdocs/docs/examples/clusters/nebius.md new file mode 100644 index 0000000000..20b1a47555 --- /dev/null +++ b/mkdocs/docs/examples/clusters/nebius.md @@ -0,0 +1,257 @@ +--- +title: Nebius +description: Using Nebius clusters with InfiniBand support via VMs or Kubernetes +--- + +# Nebius + +`dstack` allows you to use Nebius clusters with fast interconnects in two ways: + +* [VMs](#vms) – If you configure a `nebius` backend in `dstack` by providing your Nebius credentials, `dstack` lets you fully provision and use clusters through `dstack`. +* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Nebius and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. + +## VMs + +Since `dstack` offers a VM-based backend that natively integrates with Nebius, you only need to provide your Nebius credentials to `dstack`, and it will allow you to fully provision and use clusters on Nebius through `dstack`. + +### Configure a backend + +You can configure the `nebius` backend using a credentials file [generated](https://docs.nebius.com/iam/service-accounts/authorized-keys#create) by the `nebius` CLI: + +
    + +```shell +$ nebius iam auth-public-key generate \ + --service-account-id <service account ID> \ + --output ~/.nebius/sa-credentials.json +``` + +
    + +
    + +```yaml +projects: +- name: main + backends: + - type: nebius + creds: + type: service_account + filename: ~/.nebius/sa-credentials.json +``` + +
    + +### Create a fleet + +Once the backend is configured, you can create a fleet: + +
    + +```yaml +type: fleet +name: nebius-fleet + +nodes: 2 +placement: cluster + +backends: [nebius] + +resources: + gpu: H100:8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f nebius-fleet.dstack.yml +``` + +
    + +This will automatically create a Nebius cluster and provision instances. + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +> If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. + +## Kubernetes + +If, for some reason, you’d like to use dstack with Nebius’s managed Kubernetes service, you can point `dstack` to the cluster’s kubeconfig file, and `dstack` will allow you to fully use this cluster through `dstack`. + +### Create a cluster + +1. Go to `Compute` → `Kubernetes` and click `Create cluster`. Make sure to enable `Public endpoint`. +2. Go to `Node groups` and click `Create node group`. Make sure to enable `Assign public IPv4 addresses` and `Install NVIDIA GPU drivers and other components`. Select the appropriate instance type, specify the `Number of nodes`, and set `Node storage` to at least `120 GiB`. Make sure to click `Create` under `GPU cluster` if you plan to use a fast interconnect. +3. Go to `Applications`, find `NVIDIA Device Plugin`, and click `Deploy`. +4. Wait until the nodes are provisioned. + +> Even if you enable `autoscaling`, `dstack` can use only the nodes that are already provisioned. To provision instances on demand, use [VMs](#vms) (see above). + +#### Configure the kubeconfig file + +1. Click `How to connect` and copy the `nebius` CLI command that configures the `kubeconfig` file. +2. Install the `nebius` CLI and run the command: + +
    + +```shell +$ nebius mk8s cluster get-credentials --id <cluster id> --external +``` + +
    + +### Configure a backend + +Follow the standard instructions for setting up a [`kubernetes`](../../concepts/backends.md#kubernetes) backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: kubernetes + kubeconfig: + filename: <kubeconfig path> +``` + +
    + +### Create a fleet + +Once the cluster and the `dstack` server are running, you can create a fleet: + +
    + +```yaml +type: fleet +name: nebius-fleet + +placement: cluster +nodes: 0.. + +backends: [kubernetes] + +resources: + # Specify requirements to filter nodes + gpu: 8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f nebius-fleet.dstack.yml +``` + +
    + +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). + +## NCCL tests + +Use a [distributed task](../../concepts/tasks.md#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth. + +
    + +```yaml +type: task +name: nccl-tests + +nodes: 2 +startup_order: workers-first +stop_criteria: master-done + +env: + - NCCL_DEBUG=INFO +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + +# Required for `/dev/infiniband` access +privileged: true + +resources: + gpu: 8 + shm_size: 16GB +``` + +
    + +Pass the configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f nebius-nccl-tests.dstack.yml + +Provisioning... +---> 100% + +nccl-tests provisioning completed (running) + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 45.72 0.00 0.00 0 29.78 0.00 0.00 0 + 16 4 float sum -1 29.92 0.00 0.00 0 29.42 0.00 0.00 0 + 32 8 float sum -1 30.10 0.00 0.00 0 29.75 0.00 0.00 0 + 64 16 float sum -1 34.48 0.00 0.00 0 29.36 0.00 0.00 0 + 128 32 float sum -1 30.38 0.00 0.01 0 29.67 0.00 0.01 0 + 256 64 float sum -1 30.48 0.01 0.02 0 29.97 0.01 0.02 0 + 512 128 float sum -1 30.45 0.02 0.03 0 30.85 0.02 0.03 0 + 1024 256 float sum -1 31.36 0.03 0.06 0 31.29 0.03 0.06 0 + 2048 512 float sum -1 32.27 0.06 0.12 0 32.26 0.06 0.12 0 + 4096 1024 float sum -1 36.04 0.11 0.21 0 43.17 0.09 0.18 0 + 8192 2048 float sum -1 37.24 0.22 0.41 0 35.54 0.23 0.43 0 + 16384 4096 float sum -1 37.22 0.44 0.83 0 34.55 0.47 0.89 0 + 32768 8192 float sum -1 43.82 0.75 1.40 0 35.64 0.92 1.72 0 + 65536 16384 float sum -1 37.85 1.73 3.25 0 37.55 1.75 3.27 0 + 131072 32768 float sum -1 43.10 3.04 5.70 0 53.08 2.47 4.63 0 + 262144 65536 float sum -1 58.59 4.47 8.39 0 63.33 4.14 7.76 0 + 524288 131072 float sum -1 97.88 5.36 10.04 0 83.91 6.25 11.72 0 + 1048576 262144 float sum -1 87.08 12.04 22.58 0 77.82 13.47 25.26 0 + 2097152 524288 float sum -1 99.06 21.17 39.69 0 97.67 21.47 40.26 0 + 4194304 1048576 float sum -1 110.14 38.08 71.40 0 114.66 36.58 68.59 0 + 8388608 2097152 float sum -1 154.48 54.30 101.82 0 156.03 53.76 100.80 0 + 16777216 4194304 float sum -1 210.33 79.77 149.56 0 200.98 83.48 156.52 0 + 33554432 8388608 float sum -1 274.23 122.36 229.43 0 276.45 121.38 227.58 0 + 67108864 16777216 float sum -1 472.43 142.05 266.35 0 480.00 139.81 262.14 0 + 134217728 33554432 float sum -1 759.58 176.70 331.31 0 756.21 177.49 332.79 0 + 268435456 67108864 float sum -1 1305.66 205.59 385.49 0 1303.37 205.95 
386.16 0 + 536870912 134217728 float sum -1 2379.38 225.63 423.06 0 2373.42 226.20 424.13 0 + 1073741824 268435456 float sum -1 4511.97 237.98 446.21 0 4513.82 237.88 446.02 0 + 2147483648 536870912 float sum -1 8776.26 244.69 458.80 0 8760.42 245.13 459.63 0 + 4294967296 1073741824 float sum -1 17407.8 246.73 462.61 0 17302.2 248.23 465.44 0 + 8589934592 2147483648 float sum -1 34448.4 249.36 467.54 0 34381.0 249.85 468.46 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 125.499 + + Collective test concluded: all_reduce_perf +``` + +
    + +## What's next + +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Check out [backends](../../concepts/backends.md) and [fleets](../../concepts/fleets.md) +3. Read Nebius' docs on [networking for VMs](https://docs.nebius.com/compute/clusters/gpu) and the [managed Kubernetes service](https://docs.nebius.com/kubernetes). diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md new file mode 100644 index 0000000000..32a9a1e6e2 --- /dev/null +++ b/mkdocs/docs/examples/inference/dynamo.md @@ -0,0 +1,166 @@ +--- +title: NVIDIA Dynamo +description: Deploying zai-org/GLM-4.5-Air-FP8 using NVIDIA Dynamo with Prefill-Decode disaggregation. +--- + +# Dynamo + +This example shows how to deploy `zai-org/GLM-4.5-Air-FP8` using +[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) and `dstack`. + + +## Apply a configuration + +Here's an example of a service that deploys `zai-org/GLM-4.5-Air-FP8` using +Dynamo with PD disaggregation. + +
    + +```yaml +type: service +name: dynamo-pd + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1 + docker: true + commands: + - apt-get update + - apt-get install -y python3-dev python3-venv + - python3 -m venv ~/dyn-venv + - source ~/dyn-venv/bin/activate + - pip install -U pip + - pip install "ai-dynamo[sglang]==1.1.1" + - git clone https://github.com/ai-dynamo/dynamo.git + # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. + - docker compose -f dynamo/dev/docker-compose.yml up -d + - | + python3 -m dynamo.frontend \ + --http-host 0.0.0.0 --http-port 8000 \ + --discovery-backend etcd --router-mode kv \ + --kv-cache-block-size 64 + resources: + cpu: 4 + router: + type: dynamo + + - count: 1..4 + scaling: + metric: rps + target: 3 + python: "3.12" + nvcc: true + commands: + # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica + # is provisioned. Compose the etcd/NATS endpoints from it. + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + # Set to enable /health endpoint required by dstack probes. + - export DYN_SYSTEM_PORT="8000" + # Wait until the router's etcd and NATS ports are actually accepting connections. 
+ - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode prefill --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + python: "3.12" + nvcc: true + commands: + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + - export DYN_SYSTEM_PORT="8000" + - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install "ai-dynamo[sglang]==1.1.1" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode decode --disaggregation-transfer-backend nixl + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +# Custom probe is required for PD disaggregation. +probes: + - type: http + url: /health + interval: 15s +``` + +
    + +> With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers. + +Save the configuration as `service.dstack.yml`, then use the +[`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f service.dstack.yml +``` + +
    + +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/dynamo-pd/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "zai-org/GLM-4.5-Air-FP8", + "messages": [ + { + "role": "user", + "content": "What is prefill-decode disaggregation?" + } + ], + "max_tokens": 1024 + }' +``` + +
    + +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://dynamo-pd.<gateway domain>/`. + +## Configuration options + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + +## What's next? + +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) +2. Browse the [NVIDIA Dynamo GitHub repository](https://github.com/ai-dynamo/dynamo) and the [SGLang](./sglang.md) example diff --git a/mkdocs/docs/examples/inference/nim.md b/mkdocs/docs/examples/inference/nim.md new file mode 100644 index 0000000000..3ec4c8b43c --- /dev/null +++ b/mkdocs/docs/examples/inference/nim.md @@ -0,0 +1,99 @@ +--- +title: NVIDIA NIM +description: Deploying Nemotron-3-Super-120B-A12B using NVIDIA NIM +--- + +# NVIDIA NIM + +This example shows how to deploy Nemotron-3-Super-120B-A12B using [NVIDIA NIM](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) and `dstack`. + +??? info "Prerequisites" + Once `dstack` is [installed](../../installation.md), clone the repo with examples. + +
    + + ```shell + $ git clone https://github.com/dstackai/dstack + $ cd dstack + ``` + +
    + +## Deployment + +Here's an example of a service that deploys Nemotron-3-Super-120B-A12B using NIM. + +
    + +```yaml +type: service +name: nemotron120 + +image: nvcr.io/nim/nvidia/nemotron-3-super-120b-a12b:1.8.0 +env: + - NGC_API_KEY +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} +port: 8000 +model: nvidia/nemotron-3-super-120b-a12b +volumes: + - instance_path: /root/.cache/nim + path: /opt/nim/.cache + optional: true + +resources: + cpu: x86:96.. + memory: 512GB.. + shm_size: 16GB + disk: 500GB.. + gpu: H100:80GB:8 +``` +
    + +### Running a configuration + +Save the configuration above as `nemotron120.dstack.yml`, then use the +[`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ NGC_API_KEY=... +$ dstack apply -f nemotron120.dstack.yml +``` +
    + +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +$ curl http://127.0.0.1:3000/proxy/services/main/nemotron120/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "nvidia/nemotron-3-super-120b-a12b", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is Deep Learning?" + } + ], + "max_tokens": 128 + }' +``` + +
    + +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint will be available at `https://nemotron120./`. + +## What's next? + +1. Check [services](../../concepts/services.md) +2. Browse the [Nemotron-3-Super-120B-A12B model page](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b) diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md new file mode 100644 index 0000000000..1ea9e6e065 --- /dev/null +++ b/mkdocs/docs/examples/inference/sglang.md @@ -0,0 +1,333 @@ +--- +title: SGLang +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs +--- + +# SGLang + +This example shows how to deploy `Qwen/Qwen3.6-27B` using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +> For a `DeepSeek-V4-Pro` deployment on `B200:8`, see the +[DeepSeek V4](../models/deepseek-v4.md) model page. + +## Apply a configuration + +Here's an example of a service that deploys +`Qwen/Qwen3.6-27B` using SGLang. + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` +
    + +=== "AMD" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` +
    + +The AMD example keeps the deployment close to the upstream Qwen and SGLang +guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the +standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. + +Save one of the configurations above as `service.dstack.yml`, then use the +[`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f service.dstack.yml +``` + +
    + +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." + } + ], + "separate_reasoning": true, + "max_tokens": 1024 + }' +``` +
    + +Qwen3.6 uses thinking mode by default. To disable thinking, pass +`"chat_template_kwargs": {"enable_thinking": false}` in the request body. To +enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. + +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. + +## Configuration options + +### PD disaggregation + +To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/pd_disaggregation.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers, and one for decode workers. + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: prefill-decode + image: lmsysorg/sglang:v0.5.10.post1 + + env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + + replicas: + - count: 1 + # For now replica group with router must have count: 1 + commands: + - pip install smg + - | + smg launch \ + --host 0.0.0.0 \ + --port 8000 \ + --pd-disaggregation \ + --prefill-policy cache_aware + resources: + cpu: 4 + router: + type: sglang + + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: H200 + + port: 8000 + model: zai-org/GLM-4.5-Air-FP8 + + # Custom probe is required for PD disaggregation. + probes: + - type: http + url: /health + interval: 15s + ``` + +
    + + > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + +=== "AMD" + + The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: + +
    + + ```yaml + type: service + name: amd-sglang-pd-service + + image: rocm/sgl-dev:v0.5.10.post1-rocm720-mi30x-20260427 + privileged: true + + env: + - MODEL_ID=Qwen/Qwen2.5-72B-Instruct + - HF_TOKEN + - SGLANG_USE_AITER=0 + - SGLANG_ROCM_FUSED_DECODE_MLA=0 + - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600 + - SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600 + - RDMA_DEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 + - NCCL_IB_DISABLE=1 + + replicas: + - count: 1 + commands: + - pip install smg + - | + smg launch \ + --pd-disaggregation \ + --host 0.0.0.0 \ + --port 30000 + resources: + cpu: 4.. + router: + type: sglang + + - count: 1..2 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disaggregation-bootstrap-port 8998 \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + + - count: 1..4 + scaling: + metric: rps + target: 300 + commands: + - | + python3 -m sglang.launch_server \ + --model $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --trust-remote-code \ + --disaggregation-ib-device $RDMA_DEVICES \ + --disable-radix-cache \ + --disable-cuda-graph \ + --disable-overlap-schedule \ + --decode-attention-backend triton \ + --mem-fraction-static 0.8 \ + --max-running-requests 1024 + resources: + gpu: MI300X:8 + cpu: 96.. + memory: 512GB.. + + port: 30000 + model: Qwen/Qwen2.5-72B-Instruct + + # Custom probe is required for PD disaggregation. 
+ probes: + - type: http + url: /health + interval: 15s + + volumes: + - /usr/lib64/libibverbs/libbnxt_re-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so + ``` + +
    + + !!! info "RoCE library" + Mooncake uses the RDMA/RoCE interconnect for KV Cache transfer. To use the RDMA/RoCE interconnect on Broadcom `bnxt_re` devices, Mooncake requires the Broadcom-specific userspace provider library `libbnxt_re-rdmav34.so` to be available inside the container at `/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so`. We make this library available by mounting the host provider library from `/usr/lib64/libibverbs/libbnxt_re-rdmav34.so`. + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + +## What's next? + +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) +2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/mkdocs/docs/examples/inference/trtllm.md b/mkdocs/docs/examples/inference/trtllm.md new file mode 100644 index 0000000000..ac1b8b33b4 --- /dev/null +++ b/mkdocs/docs/examples/inference/trtllm.md @@ -0,0 +1,99 @@ +--- +title: TensorRT-LLM +description: Deploying Qwen3-235B-A22B-FP8 using NVIDIA TensorRT-LLM on NVIDIA GPUs +--- + +# TensorRT-LLM + +This example shows how to deploy `nvidia/Qwen3-235B-A22B-FP8` using +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and `dstack`. + +## Apply a configuration + +Here's an example of a service that deploys +`nvidia/Qwen3-235B-A22B-FP8` using TensorRT-LLM. + +
    + +```yaml +type: service +name: qwen235 + +image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc11 + +env: + - HF_HUB_ENABLE_HF_TRANSFER=1 + +commands: + - pip install hf_transfer + - | + trtllm-serve serve nvidia/Qwen3-235B-A22B-FP8 \ + --host 0.0.0.0 \ + --port 8000 \ + --backend pytorch \ + --tp_size $DSTACK_GPUS_NUM \ + --max_batch_size 32 \ + --max_num_tokens 4096 \ + --kv_cache_free_gpu_memory_fraction 0.75 + +port: 8000 +model: nvidia/Qwen3-235B-A22B-FP8 + +volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + +resources: + cpu: 96.. + memory: 512GB.. + shm_size: 32GB + disk: 1000GB.. + gpu: H100:8 +``` +
    + +Apply it with [`dstack apply`](../../reference/cli/dstack/apply.md): + +
    + +```shell +$ dstack apply -f service.dstack.yml +``` + +
    + +## Access the endpoint + +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +$ curl http://127.0.0.1:3000/proxy/services/main/qwen235/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "nvidia/Qwen3-235B-A22B-FP8", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost?" + } + ], + "chat_template_kwargs": {"enable_thinking": true}, + "max_tokens": 1024, + "temperature": 0.0 + }' +``` + +
    + +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint will be available at `https://qwen235./`. + +## What's next? + +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) +2. Browse the [TensorRT-LLM deployment guides](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html) and the [Qwen3 deployment guide](https://nvidia.github.io/TensorRT-LLM/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html) +3. See the [`trtllm-serve` reference](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve/trtllm-serve.html) diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md new file mode 100644 index 0000000000..dd6909ba62 --- /dev/null +++ b/mkdocs/docs/examples/inference/vllm.md @@ -0,0 +1,130 @@ +--- +title: vLLM +description: Deploying Qwen3.6-27B using vLLM on NVIDIA and AMD GPUs +--- + +# vLLM + +This example shows how to deploy `Qwen/Qwen3.6-27B` using +[vLLM](https://docs.vllm.ai/en/latest/) and `dstack`. + +## Apply a configuration + +Here's an example of a service that deploys +`Qwen/Qwen3.6-27B` using vLLM. + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen36 + + image: vllm/vllm-openai:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
    + +=== "AMD" + +
    + + ```yaml + type: service + name: qwen36 + + image: vllm/vllm-openai-rocm:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
    + +Qwen3.6-27B is a multimodal model. For text-only workloads, add +`--language-model-only` to free more memory for the KV cache. To enable tool +calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. + +Save one of the configurations above as `service.dstack.yml`, then use the +[`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ dstack apply -f service.dstack.yml +``` + +
    + +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost?" + } + ], + "max_tokens": 1024 + }' +``` + +
    + +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. + +## What's next? + +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) +2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](../inference/sglang.md) example diff --git a/src/tests/_internal/server/background/tasks/test_process_terminating_jobs.py b/mkdocs/docs/examples/llms/deepseek/index.md similarity index 100% rename from src/tests/_internal/server/background/tasks/test_process_terminating_jobs.py rename to mkdocs/docs/examples/llms/deepseek/index.md diff --git a/mkdocs/docs/examples/llms/llama/index.md b/mkdocs/docs/examples/llms/llama/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mkdocs/docs/examples/misc/docker-compose/index.md b/mkdocs/docs/examples/misc/docker-compose/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mkdocs/docs/examples/models/deepseek-v4.md b/mkdocs/docs/examples/models/deepseek-v4.md new file mode 100644 index 0000000000..1d13b8d6ae --- /dev/null +++ b/mkdocs/docs/examples/models/deepseek-v4.md @@ -0,0 +1,154 @@ +--- +title: DeepSeek V4 +description: Deploying DeepSeek-V4-Pro using SGLang on NVIDIA B200:8 +--- + +# DeepSeek V4 + +This example shows how to deploy `deepseek-ai/DeepSeek-V4-Pro` as a +[service](../../concepts/services.md) using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +## Apply a configuration + +Save the following configuration as `deepseek-v4.dstack.yml`. + +
    + +```yaml +type: service +name: deepseek-v4 + +image: lmsysorg/sglang:deepseek-v4-blackwell + +env: + - HF_TOKEN + - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +commands: + - | + sglang serve \ + --trust-remote-code \ + --model-path deepseek-ai/DeepSeek-V4-Pro \ + --tp 8 \ + --dp 8 \ + --enable-dp-attention \ + --moe-a2a-backend deepep \ + --mem-fraction-static 0.82 \ + --cuda-graph-max-bs 64 \ + --max-running-requests 256 \ + --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \ + --tool-call-parser deepseekv4 \ + --reasoning-parser deepseek-v4 \ + --host 0.0.0.0 \ + --port 30000 + +port: 30000 +model: deepseek-ai/DeepSeek-V4-Pro + +volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + +resources: + gpu: B200:8 + shm_size: 32GB + disk: 2TB.. +``` + +
    + +This configuration uses the single-node Blackwell `DeepSeek-V4-Pro` recipe +shape for `8 x NVIDIA B200`. + +Export your Hugging Face token and apply the configuration with +[`dstack apply`](../../reference/cli/dstack/apply.md). + +
    + +```shell +$ export HF_TOKEN= +$ dstack apply -f deepseek-v4.dstack.yml +``` + +
    + +If no gateway is created, the service endpoint will be available at +`<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "What is 15% of 240? Reply with just the number." + } + ], + "temperature": 0, + "max_tokens": 32 + }' +``` + +
    + +## Reasoning mode + +To separate the model's reasoning into `reasoning_content`, keep +`--reasoning-parser deepseek-v4` in the server command and send +`chat_template_kwargs` in the request body. + +For raw HTTP requests, `chat_template_kwargs` and `separate_reasoning` must be +top-level JSON fields. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "Solve step by step: If 3x + 5 = 20, what is x?" + } + ], + "temperature": 0, + "max_tokens": 256, + "chat_template_kwargs": { + "thinking": true + }, + "separate_reasoning": true + }' +``` + +
    + +This returns both: + +- `reasoning_content`: a separate reasoning trace +- `content`: the final user-visible answer + +## Deployment notes + +- The first startup can take several minutes while the model loads and SGLang + finishes initialization. +- The optional `/root/.cache` instance volume helps reuse the model cache on + backends that support instance volumes. + +## What's next? + +1. Read the [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) +2. Read the [DeepSeek-V4 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4) +3. Browse the dedicated [SGLang](../inference/sglang.md) and [vLLM](../inference/vllm.md) examples diff --git a/mkdocs/docs/examples/models/qwen36.md b/mkdocs/docs/examples/models/qwen36.md new file mode 100644 index 0000000000..e2b2fa84db --- /dev/null +++ b/mkdocs/docs/examples/models/qwen36.md @@ -0,0 +1,168 @@ +--- +title: Qwen 3.6 +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs +--- + +# Qwen 3.6 + +This example shows how to deploy `Qwen/Qwen3.6-27B` as a +[service](../../concepts/services.md) using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +## Apply a configuration + +Save one of the following configurations as `qwen36.dstack.yml`. + +=== "NVIDIA" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
    + +=== "AMD" + +
    + + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
    + +The NVIDIA and AMD configurations above use pinned SGLang images and the same +straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. + +Apply the configuration with +[`dstack apply`](../../reference/cli/dstack/apply.md). + +
    + +```shell +$ dstack apply -f qwen36.dstack.yml +``` + +
    + +If no gateway is created, the service endpoint will be available at +`/proxy/services///`. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." + } + ], + "max_tokens": 1024 + }' +``` + +
    + +## Thinking mode + +Qwen3.6 uses thinking mode by default. With SGLang, the reasoning stream is +returned separately as `reasoning_content`. + +To disable thinking, pass `chat_template_kwargs` in the request body. + +
    + +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "Summarize the benefits of container images in one sentence." + } + ], + "max_tokens": 256, + "chat_template_kwargs": { + "enable_thinking": false + } + }' +``` + +
    + +## What's next? + +1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B) +2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) +3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) +4. Browse the dedicated [SGLang](../inference/sglang.md) + and [vLLM](../inference/vllm.md) examples +5. Check the [AMD](../accelerators/amd.md) example for + more AMD deployment and training configurations diff --git a/mkdocs/docs/examples/models/wan22/index.md b/mkdocs/docs/examples/models/wan22/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mkdocs/docs/examples/training/axolotl.md b/mkdocs/docs/examples/training/axolotl.md new file mode 100644 index 0000000000..5d59e5802b --- /dev/null +++ b/mkdocs/docs/examples/training/axolotl.md @@ -0,0 +1,181 @@ +--- +title: Axolotl +description: Fine-tuning Llama models with Axolotl — single-node SFT with FSDP and QLoRA, or distributed across multiple nodes +--- + +# Axolotl + +This example shows how to use [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) with `dstack` to fine-tune Llama models — on a single node with SFT, FSDP, and QLoRA, or distributed across multiple nodes. + +## Single-node training + +This section walks through fine-tuning 4-bit quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA. + +### Define a configuration + +Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer configuration from a [`scout-qlora-flexattn-fsdp2.yaml`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml) file. The configuration uses 4-bit axolotl quantized version of `meta-llama/Llama-4-Scout-17B-16E`, requiring only ~43GB VRAM/GPU with 4K context length. 
+ +Below is a task configuration that does fine-tuning. + +
    + +```yaml +type: task +# The name is optional, if not specified, generated randomly +name: axolotl-nvidia-llama-scout-train + +# Using Axolotl's official Docker image +image: axolotlai/axolotl:main-latest + +# Required environment variables +env: + - HF_TOKEN + - WANDB_API_KEY + - WANDB_PROJECT + - HUB_MODEL_ID +# Commands of the task +commands: + - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml + - | + axolotl train scout-qlora-flexattn-fsdp2.yaml \ + --wandb-project $WANDB_PROJECT \ + --wandb-name $DSTACK_RUN_NAME \ + --hub-model-id $HUB_MODEL_ID + +resources: + # Four GPUs (required by FSDP) + gpu: H100:4 + # Shared memory size for inter-process communication + shm_size: 64GB + disk: 500GB.. +``` + +
    + +The task uses Axolotl's Docker image, where Axolotl is already pre-installed. + +### Run the configuration + +Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the +cloud resources and run the configuration. + +
    + +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ WANDB_PROJECT=... +$ HUB_MODEL_ID=... +$ dstack apply -f train.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 + 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 + 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 + +Submit the run axolotl-nvidia-llama-scout-train? [y/n]: + +Provisioning... +---> 100% +``` + +
    + +## Distributed training + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). + +This section walks through running distributed fine-tuning of `Llama-3.1-70B` with QLoRA and FSDP across multiple nodes. + +### Define a configuration + +Once the fleet is created, define a distributed task configuration. Here's an example of a distributed `QLoRA` task using `FSDP`. + +
    + +```yaml +type: task +name: axolotl-multi-node-qlora-llama3-70b + +nodes: 2 + +image: nvcr.io/nvidia/pytorch:25.01-py3 + +env: + - HF_TOKEN + - WANDB_API_KEY + - WANDB_PROJECT + - HUB_MODEL_ID + - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + - NCCL_DEBUG=INFO + - ACCELERATE_LOG_LEVEL=info + +commands: + # Replacing the default Torch and FlashAttention in the NGC container with Axolotl-compatible versions. + # The preinstalled versions are incompatible with Axolotl. + - pip uninstall -y torch flash-attn + - pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/test/cu124 + - pip install --no-build-isolation axolotl[flash-attn,deepspeed] + - wget https://raw.githubusercontent.com/huggingface/trl/main/examples/accelerate_configs/fsdp1.yaml + - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/qlora-fsdp-70b.yaml + # Axolotl includes hf-xet version 1.1.0, which fails during downloads. Replacing it with the latest version (1.1.2). + - pip uninstall -y hf-xet + - pip install hf-xet --no-cache-dir + - | + accelerate launch \ + --config_file=fsdp1.yaml \ + -m axolotl.cli.train qlora-fsdp-70b.yaml \ + --hub-model-id $HUB_MODEL_ID \ + --output-dir /checkpoints/qlora-llama3-70b \ + --wandb-project $WANDB_PROJECT \ + --wandb-name $DSTACK_RUN_NAME \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + --machine_rank=$DSTACK_NODE_RANK \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM + +resources: + gpu: 80GB:8 + shm_size: 128GB + +volumes: + - /checkpoints:/checkpoints +``` + +
    + +!!! info "Docker image" + We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. + +### Run the configuration + +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ WANDB_PROJECT=... +$ HUB_MODEL_ID=... +$ dstack apply -f train-distrib.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + +Submit the run axolotl-multi-node-qlora-llama3-70b? [y/n]: y + +Provisioning... +---> 100% +``` + +
    + +## What's next? + +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) diff --git a/mkdocs/docs/examples/training/miles.md b/mkdocs/docs/examples/training/miles.md new file mode 100644 index 0000000000..59451d150b --- /dev/null +++ b/mkdocs/docs/examples/training/miles.md @@ -0,0 +1,283 @@ +--- +title: Miles +description: RL post-training Qwen2.5-32B with Miles, SGLang, Megatron-LM, and Ray across two 8xH100 nodes +--- + +# Miles + +This example shows how to use `dstack` and [Miles](https://github.com/radixark/miles) +for reinforcement learning (RL) post-training of a 32B language model with +[GRPO](https://arxiv.org/abs/2402.03300) across a multi-node cluster. +Miles integrates [SGLang](https://github.com/sgl-project/sglang) for +high-throughput rollouts, [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) +for training, and [Ray](https://docs.ray.io/en/latest/) to coordinate the +trainer and rollout actors across nodes. + +Here we fine-tune `Qwen/Qwen2.5-32B-Instruct` on the +[GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset. + +!!! info "Prerequisites" + Multi-node tasks require a [fleet](../../concepts/fleets.md) with + `placement` set to [`cluster`](../../concepts/fleets.md#cluster-placement). + +## Run a Ray cluster + +### Define a configuration + +The [task](../../concepts/tasks.md) below starts Ray on two nodes and prepares +each node by downloading the model and dataset, then converting the checkpoint +to Megatron's `torch_dist` format. + +
    + +```yaml +type: task +name: miles-qwen32b-h100 +nodes: 2 +image: radixark/miles:sglang-miles-v0.5.12 +env: + - WANDB_API_KEY + - PYTHONPATH=/root/Megatron-LM + - NCCL_DEBUG=INFO + - MODEL_ID=Qwen/Qwen2.5-32B-Instruct +commands: + # 1. Download the model and dataset. + - pip install -U "huggingface_hub[cli]" + - hf download "$MODEL_ID" --local-dir "/root/$(basename "$MODEL_ID")" + - hf download --repo-type dataset openai/gsm8k --local-dir /root/gsm8k + # 2. Convert the Hugging Face checkpoint to Megatron torch_dist. + - | + MODEL_NAME="$(basename "$MODEL_ID")" + cd /root/miles && python tools/convert_hf_to_torch_dist.py \ + --swiglu \ + --num-layers 64 \ + --hidden-size 5120 \ + --ffn-hidden-size 27648 \ + --num-attention-heads 40 \ + --use-rotary-position-embeddings \ + --disable-bias-linear \ + --add-qkv-bias \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --rotary-base 1000000 \ + --group-query-attention \ + --num-query-groups 8 \ + --vocab-size 152064 \ + --untie-embeddings-and-output-weights \ + --hf-checkpoint "/root/$MODEL_NAME" \ + --save "/root/${MODEL_NAME}_torch_dist" + # 3. Start Ray. + - | + if [ $DSTACK_NODE_RANK = 0 ]; then + ray start --head --port=6379 + else + ray start --address=$DSTACK_MASTER_NODE_IP:6379 + fi +ports: + - 8265 +resources: + gpu: H100:8 + shm_size: 32GB + disk: 1000GB.. +volumes: + - /checkpoints:/checkpoints +``` + +
    + +### Run the configuration + +Run the task with [`dstack apply`](../../reference/cli/dstack/apply.md). By +default, `dstack apply` forwards the Ray dashboard port to `localhost:8265`. + +
    + +```shell +$ export WANDB_API_KEY=... +$ dstack apply -f miles-qwen32b-h100.dstack.yml +``` + +
    + +While `dstack apply` is attached, you can submit Ray jobs through +`localhost:8265`. If you detach or run from another machine, use +[`dstack attach`](../../reference/cli/dstack/attach.md) to re-attach and make +the dashboard port accessible on `localhost`. + +> To run on a single node, remove `nodes` or set it to `1`, then submit the job +> with `NUM_NODES=1`. In this case, `placement: cluster` is not required. + +## Submit Ray jobs + +Install `ray` locally before submitting jobs: + +
    + +```shell +$ pip install ray +``` + +
    + +The submit script below runs the Miles training job on the Ray cluster. The +model is sharded across all 8 GPUs per node with tensor parallelism, and SGLang +uses the same 8 GPUs per node for rollout. + +
    + +```bash +#!/bin/bash +set -euo pipefail + +export RAY_ADDRESS=http://localhost:8265 + +: "${NUM_NODES:?NUM_NODES is not set}" +: "${GPUS_PER_NODE:?GPUS_PER_NODE is not set}" + +MODEL_ID="Qwen/Qwen2.5-32B-Instruct" +MODEL_NAME="$(basename "$MODEL_ID")" +HF_CHECKPOINT="/root/$MODEL_NAME" +REF_LOAD="/root/${MODEL_NAME}_torch_dist" +PROMPT_DATA="/root/gsm8k/main/train-00000-of-00001.parquet" +EVAL_PROMPT_DATA="/root/gsm8k/main/test-00000-of-00001.parquet" +INPUT_KEY="question" +LABEL_KEY="answer" +EVAL_DATASET_NAME="gsm8k" +CHECKPOINT_DIR="/checkpoints/${MODEL_NAME}-${EVAL_DATASET_NAME}" +SAVE_INTERVAL=10 +WANDB_PROJECT="dstack-miles-RL" +WANDB_GROUP="${MODEL_NAME}-gsm8k-${NUM_NODES}node-${GPUS_PER_NODE}gpu" +WANDB_NAME="rollout-$(date +%Y%m%d-%H%M%S)" +ROLLOUT_GPUS_PER_ENGINE=8 + +CMD='cd /root/miles && python3 train.py \ + --actor-num-nodes '"$NUM_NODES"' \ + --actor-num-gpus-per-node '"$GPUS_PER_NODE"' \ + --num-gpus-per-node '"$GPUS_PER_NODE"' \ + --rollout-num-gpus-per-engine '"$ROLLOUT_GPUS_PER_ENGINE"' \ + --sglang-server-concurrency 128 \ + --colocate \ + --calculate-per-token-loss \ + --use-miles-router \ + --swiglu \ + --num-layers 64 \ + --hidden-size 5120 \ + --ffn-hidden-size 27648 \ + --num-attention-heads 40 \ + --use-rotary-position-embeddings \ + --disable-bias-linear \ + --add-qkv-bias \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --rotary-base 1000000 \ + --group-query-attention \ + --num-query-groups 8 \ + --vocab-size 152064 \ + --untie-embeddings-and-output-weights \ + --hf-checkpoint '"$HF_CHECKPOINT"' \ + --ref-load '"$REF_LOAD"' \ + --prompt-data '"$PROMPT_DATA"' \ + --input-key '"$INPUT_KEY"' \ + --label-key '"$LABEL_KEY"' \ + --apply-chat-template \ + --rollout-shuffle \ + --rm-type math \ + --num-rollout 20 \ + --rollout-batch-size 8 \ + --n-samples-per-prompt 8 \ + --rollout-max-response-len 512 \ + --rollout-temperature 1 \ + --global-batch-size 64 \ + --eval-interval 5 \ + --eval-prompt-data 
'"$EVAL_DATASET_NAME"' '"$EVAL_PROMPT_DATA"' \ + --n-samples-per-eval-prompt 1 \ + --eval-max-response-len 512 \ + --eval-top-k 1 \ + --tensor-model-parallel-size 8 \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --expert-model-parallel-size 1 \ + --expert-tensor-parallel-size 1 \ + --use-dynamic-batch-size \ + --max-tokens-per-gpu 9216 \ + --advantage-estimator grpo \ + --use-kl-loss \ + --kl-loss-coef 0.00 \ + --kl-loss-type low_var_kl \ + --kl-coef 0.00 \ + --entropy-coef 0.00 \ + --eps-clip 0.2 \ + --eps-clip-high 0.28 \ + --optimizer adam \ + --lr 1e-6 \ + --lr-decay-style constant \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --sglang-mem-fraction-static 0.7 \ + --use-wandb \ + --wandb-host https://wandb.ai/ \ + --wandb-project '"$WANDB_PROJECT"' \ + --wandb-group '"$WANDB_GROUP"' \ + --wandb-exp-name '"$WANDB_NAME"' \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --accumulate-allreduce-grads-in-fp32 \ + --attention-softmax-in-fp32 \ + --attention-backend flash \ + --save '"$CHECKPOINT_DIR"' \ + --save-interval '"$SAVE_INTERVAL"'' + +# GLOO_SOCKET_IFNAME=eth0 is required for multi-node Gloo process group init. +# Without it, Gloo resolves to a loopback address (127.0.1.1) instead of the +# inter-node interface, causing `init_gloo_group()` to timeout. +RUNTIME_ENV_JSON=$(cat < + +Submit the job with the same cluster shape as the task: + +
    + +```shell +$ NUM_NODES=2 GPUS_PER_NODE=8 bash submit-miles-train.sh +``` + +
    + +!!! info "Training parameters" + 1. `--tensor-model-parallel-size 8` shards the 32B model across all 8 GPUs + per node. + 2. `--rollout-num-gpus-per-engine 8` starts SGLang with TP-8 on each node. + 3. `--sglang-server-concurrency` sets how many requests SGLang processes + concurrently. + 4. `--max-tokens-per-gpu 9216` sets the per-GPU token budget. Lower this if + Megatron OOMs during training. + 5. `--sglang-mem-fraction-static 0.7` sets the SGLang KV cache memory + fraction. Lower this if Megatron OOMs at startup. + +Using Ray via `dstack` gives you access to the Ray ecosystem while benefiting +from `dstack`'s provisioning capabilities. + +!!! info "What's next" + 1. Read about [distributed tasks](../../concepts/tasks.md#distributed-tasks) + and [fleets](../../concepts/fleets.md) + 2. See the [SGLang inference](../inference/sglang.md) example + 3. Browse Miles' [examples](https://github.com/radixark/miles/tree/main/examples) diff --git a/mkdocs/docs/examples/training/ray-ragen.md b/mkdocs/docs/examples/training/ray-ragen.md new file mode 100644 index 0000000000..73e8749e83 --- /dev/null +++ b/mkdocs/docs/examples/training/ray-ragen.md @@ -0,0 +1,134 @@ +--- +title: Ray + RAGEN +description: Multi-node agent fine-tuning using RAGEN with Ray and verl for reinforcement learning +--- + +# Ray + RAGEN + +This example shows how to use `dstack` and [RAGEN](https://github.com/RAGEN-AI/RAGEN) +to fine-tune an agent on multiple nodes. + +Under the hood, `RAGEN` uses [verl](https://github.com/volcengine/verl) for Reinforcement Learning and [Ray](https://docs.ray.io/en/latest/) for distributed training. + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)).
+ +## Run a Ray cluster + +If you want to use Ray with `dstack`, you have to first run a Ray cluster. + +The task below runs a Ray cluster on an existing fleet: + +
    + +```yaml +type: task +name: ray-cluster + +nodes: 2 + +env: +- WANDB_API_KEY +image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.2 +commands: + - wget -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + - bash miniconda.sh -b -p /workflow/miniconda + - eval "$(/workflow/miniconda/bin/conda shell.bash hook)" + - git clone https://github.com/RAGEN-AI/RAGEN.git + - cd RAGEN + - bash scripts/setup_ragen.sh + - conda activate ragen + - cd verl + - pip install --no-deps -e . + - pip install hf_transfer hf_xet + - pip uninstall -y ray + - pip install -U "ray[default]" + - | + if [ $DSTACK_NODE_RANK = 0 ]; then + ray start --head --port=6379; + else + ray start --address=$DSTACK_MASTER_NODE_IP:6379 + fi + +# Expose Ray dashboard port +ports: + - 8265 + +resources: + gpu: 80GB:8 + shm_size: 128GB + +# Save checkpoints on the instance +volumes: + - /checkpoints:/checkpoints +``` + +
    + +We are using verl's Docker image for vLLM with FSDP. See [Installation](https://verl.readthedocs.io/en/latest/start/install.html) for more. + +The `RAGEN` setup script `scripts/setup_ragen.sh` isolates dependencies within a Conda environment. + +Note that the Ray setup in the RAGEN environment is missing the dashboard, so we reinstall it using `ray[default]`. + +Now, if you run this task via `dstack apply`, it will automatically forward the Ray dashboard port to `localhost:8265`. + +
    + +```shell +$ dstack apply -f ray-cluster.dstack.yml +``` + +
    + +As long as `dstack apply` is attached, you can use `localhost:8265` to submit Ray jobs for execution. +If `dstack apply` is detached, you can use `dstack attach` to re-attach. + +## Submit Ray jobs + +Before you can submit Ray jobs, make sure to install `ray` locally: + +
    + +```shell +$ pip install ray +``` + +
    + +Now you can submit the training job to the Ray cluster which is available at `localhost:8265`: + +
    + +```shell +$ RAY_ADDRESS=http://localhost:8265 +$ ray job submit \ + -- bash -c "\ + export PYTHONPATH=/workflow/RAGEN; \ + cd /workflow/RAGEN; \ + /workflow/miniconda/envs/ragen/bin/python train.py \ + --config-name base \ + system.CUDA_VISIBLE_DEVICES=[0,1,2,3,4,5,6,7] \ + model_path=Qwen/Qwen2.5-7B-Instruct \ + trainer.experiment_name=agent-fine-tuning-Qwen2.5-7B \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=2 \ + micro_batch_size_per_gpu=2 \ + trainer.default_local_dir=/checkpoints \ + trainer.save_freq=50 \ + actor_rollout_ref.rollout.tp_size_check=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4" +``` + +
    + +!!! info "Training parameters" + 1. `actor_rollout_ref.rollout.tensor_model_parallel_size=4`, because `Qwen/Qwen2.5-7B-Instruct` has 28 attention heads and the number of attention heads should be divisible by `tensor_model_parallel_size` + 2. `actor_rollout_ref.rollout.tp_size_check=False`, because if it is `True`, `tensor_model_parallel_size` should be equal to `trainer.n_gpus_per_node` + 3. `micro_batch_size_per_gpu=2`, to keep the RAGEN-paper's `rollout_filter_ratio` and `es_manager` settings as they are for world size `16` + +Using Ray via `dstack` is a powerful way to get access to the rich Ray ecosystem while benefiting from `dstack`'s provisioning capabilities. + +!!! info "What's next" + 1. Read about [distributed tasks](../../concepts/tasks.md#distributed-tasks), [fleets](../../concepts/fleets.md), and [cluster placement](../../concepts/fleets.md#cluster-placement) + 2. Browse Ray's [docs](https://docs.ray.io/en/latest/train/examples.html) for other examples. diff --git a/mkdocs/docs/examples/training/trl.md b/mkdocs/docs/examples/training/trl.md new file mode 100644 index 0000000000..e75a4d89b2 --- /dev/null +++ b/mkdocs/docs/examples/training/trl.md @@ -0,0 +1,268 @@ +--- +title: TRL +description: Fine-tuning Llama with TRL — single-node SFT with QLoRA, or distributed across multiple nodes with FSDP and DeepSpeed +--- + +# TRL + +This example walks you through how to use [TRL](https://github.com/huggingface/trl) with `dstack` to fine-tune `Llama-3.1-8B` — on a single node with SFT and QLoRA, or distributed across multiple nodes with [Accelerate](https://github.com/huggingface/accelerate) and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). + +## Single-node training + +### Define a configuration + +Below is a task configuration that does fine-tuning. + +
    + +```yaml +type: task +name: trl-train + +python: 3.12 +# Ensure nvcc is installed (req. for Flash Attention) +nvcc: true + +env: + - HF_TOKEN + - WANDB_API_KEY + - HUB_MODEL_ID +commands: + # Pin torch==2.6.0 to avoid building Flash Attention from source. + # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0. + - uv pip install torch==2.6.0 + - uv pip install transformers bitsandbytes peft wandb + - uv pip install flash_attn --no-build-isolation + - git clone https://github.com/huggingface/trl + - cd trl + - uv pip install . + - | + accelerate launch \ + --config_file=examples/accelerate_configs/multi_gpu.yaml \ + --num_processes $DSTACK_GPUS_PER_NODE \ + trl/scripts/sft.py \ + --model_name meta-llama/Meta-Llama-3.1-8B \ + --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ + --dataset_text_field="text" \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --learning_rate 2e-4 \ + --report_to wandb \ + --bf16 \ + --max_seq_length 1024 \ + --lora_r 16 \ + --lora_alpha 32 \ + --lora_target_modules q_proj k_proj v_proj o_proj \ + --load_in_4bit \ + --use_peft \ + --attn_implementation "flash_attention_2" \ + --logging_steps=10 \ + --output_dir models/llama31 \ + --hub_model_id peterschmidt85/FineLlama-3.1-8B + +resources: + gpu: + # 24GB or more VRAM + memory: 24GB.. + # One or more GPU + count: 1.. + # Shared memory (for multi-gpu) + shm_size: 24GB +``` + +
    + +Change the `resources` property to specify more GPUs. + +??? info "DeepSpeed" + For more memory-efficient use of multiple GPUs, consider using DeepSpeed and ZeRO Stage 3. + + To do this, use the `examples/accelerate_configs/deepspeed_zero3.yaml` configuration file instead of + `examples/accelerate_configs/multi_gpu.yaml`. + +### Run the configuration + +Once the configuration is ready, run `dstack apply -f <your configuration file>`, and `dstack` will automatically provision the +cloud resources and run the configuration. + +
    + +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ HUB_MODEL_ID=... +$ dstack apply -f train.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 + 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 + 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 + +Submit the run trl-train? [y/n]: + +Provisioning... +---> 100% +``` + +
    + +## Distributed training + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). + +### Define a configuration + +Once the fleet is created, define a distributed task configuration. Here's an example using either FSDP or DeepSpeed ZeRO-3. + +=== "FSDP" + +
    + + ```yaml + type: task + name: trl-train-fsdp-distrib + + nodes: 2 + + image: nvcr.io/nvidia/pytorch:25.01-py3 + + env: + - HF_TOKEN + - ACCELERATE_LOG_LEVEL=info + - WANDB_API_KEY + - MODEL_ID=meta-llama/Llama-3.1-8B + - HUB_MODEL_ID + + commands: + - pip install transformers bitsandbytes peft wandb + - git clone https://github.com/huggingface/trl + - cd trl + - pip install . + - | + accelerate launch \ + --config_file=examples/accelerate_configs/fsdp1.yaml \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + --machine_rank=$DSTACK_NODE_RANK \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + trl/scripts/sft.py \ + --model_name $MODEL_ID \ + --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ + --dataset_text_field="text" \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --learning_rate 2e-4 \ + --report_to wandb \ + --bf16 \ + --max_seq_length 1024 \ + --attn_implementation flash_attention_2 \ + --logging_steps=10 \ + --output_dir /checkpoints/llama31-ft \ + --hub_model_id $HUB_MODEL_ID \ + --torch_dtype bfloat16 + + resources: + gpu: 80GB:8 + shm_size: 128GB + + volumes: + - /checkpoints:/checkpoints + ``` + +
    + +=== "DeepSpeed ZeRO-3" + +
    + + ```yaml + type: task + name: trl-train-deepspeed-distrib + + nodes: 2 + + image: nvcr.io/nvidia/pytorch:25.01-py3 + + env: + - HF_TOKEN + - WANDB_API_KEY + - HUB_MODEL_ID + - MODEL_ID=meta-llama/Llama-3.1-8B + - ACCELERATE_LOG_LEVEL=info + + commands: + - pip install transformers bitsandbytes peft wandb deepspeed + - git clone https://github.com/huggingface/trl + - cd trl + - pip install . + - | + accelerate launch \ + --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + --machine_rank=$DSTACK_NODE_RANK \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + trl/scripts/sft.py \ + --model_name $MODEL_ID \ + --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ + --dataset_text_field="text" \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --learning_rate 2e-4 \ + --report_to wandb \ + --bf16 \ + --max_seq_length 1024 \ + --attn_implementation flash_attention_2 \ + --logging_steps=10 \ + --output_dir /checkpoints/llama31-ft \ + --hub_model_id $HUB_MODEL_ID \ + --torch_dtype bfloat16 + + resources: + gpu: 80GB:8 + shm_size: 128GB + + volumes: + - /checkpoints:/checkpoints + ``` + +
    + +!!! info "Docker image" + We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. + +### Run the configuration + +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
    + +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ HUB_MODEL_ID=... +$ dstack apply -f train-distrib.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + +Submit the run trl-train-fsdp-distrib? [y/n]: y + +Provisioning... +---> 100% +``` + +
    + +## What's next? + +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) diff --git a/mkdocs/docs/guides/cli-api.md b/mkdocs/docs/guides/cli-api.md new file mode 100644 index 0000000000..b451dc281b --- /dev/null +++ b/mkdocs/docs/guides/cli-api.md @@ -0,0 +1,479 @@ +--- +title: CLI & API +description: How to use the dstack CLI and HTTP API +--- + +# CLI & API + +!!! info "Prerequisites" + Ensure the [server](../installation.md#server) is up and running. To use `dstack` with AI agents, install [skills](../installation.md#skills). + +The primary way to use `dstack` is the CLI. It can be used to manage +[fleets](../concepts/fleets.md), [dev environments](../concepts/dev-environments.md), +[tasks](../concepts/tasks.md), [services](../concepts/services.md), +[volumes](../concepts/volumes.md), and [gateways](../concepts/gateways.md), view logs, +and inspect [events](../concepts/events.md). Use the HTTP API for functionality not +available in the CLI or for integrations that need to call the server directly. + +## CLI + +> See [installation](../installation.md#cli) on how to install the CLI. + +### Configuration + +The CLI requires a [project](../concepts/projects.md) configuration with the project name, server URL, and user token in `~/.dstack/config.yml`. + +
    + +```yaml +projects: + - name: main + url: http://127.0.0.1:3000 + token: + default: true + - name: octocat + url: https://sky.dstack.ai + token: +``` + +
    + +Use [`dstack project`](../reference/cli/dstack/project.md) to list, +[add](../installation.md#configure-the-project), delete, and set the default +project configurations. To run a command against a non-default project, pass +`--project NAME`, or set `DSTACK_PROJECT` in the current shell. + +??? info "Projects" + [Projects](../concepts/projects.md) enable the isolation of different teams and their resources. Users can be added to projects and assigned roles. Each user has a user token for authentication. + +### Manage fleets + +Before submitting runs, you must create at least one +[fleet](../concepts/fleets.md). Fleets act as both pools of instances and +templates for how those instances are provisioned. + +Use [`dstack fleet`](../reference/cli/dstack/fleet.md#dstack-fleet-list) to +list existing fleets, their configurations, and instances (if any): + +
    + +```shell +$ dstack fleet +``` + +
    + +??? info "Offers" + Offers are available instance configurations that match resource + requirements. + +
    + + ```shell + $ dstack offer --gpu H100 --max-offers 10 + ``` + +
    + + If no fleet is specified, + [`dstack offer`](../reference/cli/dstack/offer.md) shows offers from all + configured backends. + + Use `--fleet NAME` to restrict offers to a fleet. Listing offers does not + create capacity. + +Define a fleet configuration in a YAML file. The filename must end with +`.dstack.yml`, for example `fleet.dstack.yml`: + +
    + +```yaml +type: fleet +name: default + +nodes: 0..1 +idle_duration: 1h + +resources: + gpu: 0 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f fleet.dstack.yml +``` + +
    + +If the `nodes` range starts with `0`, `dstack` creates a fleet template. +Instances are provisioned when matching runs are submitted. + +### Submit runs + +To submit a run, define a +[dev environment](../concepts/dev-environments.md), +[task](../concepts/tasks.md), or [service](../concepts/services.md) +configuration. The example below submits a task. + +
    + +```yaml +type: task +name: hello + +commands: + - echo hello world +``` + +
    + +Submit the run: + +
    + +```shell +$ dstack apply -f .dstack.yml +``` + +
    + +!!! info "Plan and confirmation" + `dstack apply` shows the plan and asks for confirmation before submitting + the run. To only see the plan, answer `n` at the prompt: + +
    + + ```shell + $ echo "n" | dstack apply -f .dstack.yml + ``` + +
    + + Use `-y` to skip confirmation. + +!!! info "Attached by default" + For run configurations, `dstack apply` automatically attaches after + submitting the run. This configures SSH access, forwards declared ports, and + streams logs. See [Attach to runs](#attach-to-runs). + + Use `-d` to submit in detached mode. + +### Attach to runs + +If the run was submitted with `-d`, or if you need to attach to another job in +a multi-job run, use `dstack attach`: + +
    + +```shell +$ dstack attach <run name> +``` + +
    + +!!! info "SSH" + During `dstack apply` in attached mode and during + `dstack attach <run name>`, the CLI downloads the current user's built-in + private SSH key if needed and stores it under `~/.dstack/ssh/`. + + While attached, the CLI updates `~/.dstack/ssh/config` and ensures this file + is included from `~/.ssh/config`. The file contains a `Host <run name>` + alias used by `ssh <run name>` to SSH into the run container: + +
    + + ```ssh-config + Host + HostName + Port + User + IdentityFile + IdentitiesOnly yes + ``` + +
    + + > For [VM-based backends](../concepts/backends.md#vm-based) and + > [SSH fleets](../concepts/fleets.md), `dstack` may add an additional alias + > `<run name>-host` and use it as a proxy jump for `ssh <run name>`. + + While attached, connect to the run with: + +
    + + ```shell + $ ssh <run name> + ``` + +
    + +Use `--job JOB_NUMBER` with `dstack attach` to attach to another job. Ports +declared in the run configuration are forwarded while attached. + +??? info "User SSH keys" + The server stores a built-in SSH key pair for each user. + + Users can add custom public SSH keys via the UI or the + [users](../reference/http/users.md) API. To use a custom private key for a + particular run, pass `--ssh-identity` to `dstack apply` or `dstack attach`. + +### Browse logs + +When `dstack apply` is attached, it streams logs for job `0` automatically. +Use [`dstack logs`](../reference/cli/dstack/logs.md) to view logs in detached +mode, or to view logs for a specific job: + +
    + +```shell +$ dstack logs <run name> +``` + +
    + +Use `--job JOB_NUMBER` to select a job and `--since` to filter by time. + +??? info "Attached logs" + Use `--logs` with `dstack attach` to stream logs while attaching: + +
    + + ```shell + $ dstack attach <run name> --logs + ``` + +
    + +### Commands + +Other common CLI commands include [`dstack ps`](../reference/cli/dstack/ps.md), +[`dstack stop`](../reference/cli/dstack/stop.md), and +[`dstack event`](../reference/cli/dstack/event.md). + +!!! info "Verbose and JSON modes" + Use `-v` for more details where supported. For automation, use `--json`, + e.g. `dstack ps --json`, `dstack run get --json`, or + `dstack fleet get --json`. + +## API + +The `dstack` API is represented by the HTTP API. Use it for functionality not +available in the CLI or for integrations that need to call the server directly. + + + +### Authenticate + +The HTTP API requires the `Authorization` header for user authentication: + +```text +Authorization: Bearer <user token> +``` + +### Manage fleets + +The [fleets](../reference/http/fleets.md) API can list existing fleets, their +configurations, and instances (if any): + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/fleets/list" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{"include_imported": true}' +``` + +
    + +??? info "Offers" + To check available offers via the HTTP API, call + [`/runs/get_plan`](../reference/http/runs.md) with the same lightweight + task specification used by `dstack offer`: + +
    + + ```shell + $ curl "<server URL>/api/project/<project name>/runs/get_plan" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{ + "run_spec": { + "configuration": { + "type": "task", + "commands": [":"], + "image": "scratch", + "user": "root", + "resources": { + "gpu": 0 + } + } + }, + "max_offers": 5 + }' + ``` + +
    + + If `fleets` is not set in the run configuration, offers are returned from + all configured backends. Use `"fleets": ["default"]` to restrict offers to + a fleet. + + To group offers by GPU and other fields, use the + [gpus](../reference/http/gpus.md) API. + +Creating fleets uses `/fleets/get_plan` followed by `/fleets/apply`: + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/fleets/get_plan" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{ + "spec": { + "configuration": { + "type": "fleet", + "name": "cpu-fleet", + "nodes": "0..1", + "idle_duration": "1h", + "resources": { + "gpu": 0 + } + }, + "profile": {} + } + }' +``` + +
    + +Then apply the fleet plan: + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/fleets/apply" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{ + "plan": { + "spec": { + "configuration": { + "type": "fleet", + "name": "cpu-fleet", + "nodes": "0..1", + "idle_duration": "1h", + "resources": { + "gpu": 0 + } + }, + "profile": {} + } + }, + "force": false + }' +``` + +
    + +### Submit runs + +Use the [runs](../reference/http/runs.md) API to submit +[dev environments](../concepts/dev-environments.md), [tasks](../concepts/tasks.md), +and [services](../concepts/services.md). The example below submits a task: + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/runs/apply" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{ + "plan": { + "run_spec": { + "run_name": "hello-api", + "configuration": { + "type": "task", + "commands": ["echo hello world"] + } + } + }, + "force": false + }' +``` + +
    + +Set `run_name` if a stable run name is needed. Otherwise, the server can +generate a run name. + +Poll `/runs/get` to check the run status: + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/runs/get" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{"run_name": "hello-api"}' +``` + +
    + +### Poll logs + +Use the [logs](../reference/http/logs.md) API to poll logs. Get +`job_submission_id` from `/runs/get`, e.g. from `latest_job_submission.id`. + +
    + +```shell +$ curl "<server URL>/api/project/<project name>/logs/poll" \ + -X POST \ + -H "Authorization: Bearer <user token>" \ + -H 'Content-Type: application/json' \ + -d '{ + "run_name": "hello-api", + "job_submission_id": "<job submission id>", + "limit": 100 + }' +``` + +
    + +Use `next_token` from the response to continue polling. + +## Reference + +For complete details on specific CLI commands and HTTP APIs, see the +[`dstack server`](../reference/cli/dstack/server.md) and +[server](../reference/http/server.md) references. + +!!! info "OpenAPI" + For complete information on the HTTP API, or to generate native clients, + refer to [openapi.json](../reference/http/openapi.json). + +!!! info "What's next?" + 1. Follow the [installation guide](../installation.md) + 2. Read about [projects](../concepts/projects.md) + 3. Check [fleets](../concepts/fleets.md), + [dev environments](../concepts/dev-environments.md), + [tasks](../concepts/tasks.md), and [services](../concepts/services.md) diff --git a/mkdocs/docs/guides/migration/slurm.md b/mkdocs/docs/guides/migration/slurm.md new file mode 100644 index 0000000000..2791075e8d --- /dev/null +++ b/mkdocs/docs/guides/migration/slurm.md @@ -0,0 +1,1850 @@ +--- +title: Migrate from Slurm +description: This guide compares Slurm and dstack, and shows how to orchestrate equivalent GPU-based workloads using dstack. +--- + +# Migrate from Slurm + +Both Slurm and `dstack` are open-source workload orchestration systems designed to manage compute resources and schedule jobs. This guide compares Slurm and `dstack`, maps features between the two systems, and shows their `dstack` equivalents. + +!!! tip "Slurm vs dstack" + Slurm is a battle-tested system with decades of production use in HPC environments. `dstack` is designed for modern ML/AI workloads with cloud-native provisioning and container-first architecture. Slurm is better suited for traditional HPC centers with static clusters; `dstack` is better suited for cloud-native ML teams working with cloud GPUs. Both systems can handle distributed training and batch workloads. 
+ +| | Slurm | dstack | +|---|-------|--------| +| **Provisioning** | Pre-configured static clusters; cloud requires third-party integrations with potential limitations | Native integration with top GPU clouds; automatically provisions clusters on demand | +| **Containers** | Optional via plugins | Built around containers from the ground up | +| **Use cases** | Batch job scheduling and distributed training | Interactive development, distributed training, and production inference services | +| **Personas** | HPC centers, academic institutions, research labs | ML engineering teams, AI startups, cloud-native organizations | + +While `dstack` is designed to be use-case agnostic and supports both development and production-grade inference, this guide focuses specifically on training workloads. + +## Architecture + +Both Slurm and `dstack` follow a client-server architecture with a control plane and a compute plane running on cluster instances. + +| | Slurm | dstack | +|---|---------------|-------------------| +| **Control plane** | `slurmctld` (controller) | `dstack-server` | +| **State persistence** | `slurmdbd` (database) | `dstack-server` (SQLite/PostgreSQL) | +| **API** | `slurmrestd` (REST API) | `dstack-server` (HTTP API) | +| **Compute plane** | `slurmd` (compute agent) | `dstack-shim` (on VMs/hosts) and/or `dstack-runner` (inside containers) | +| **Client** | CLI from login nodes | CLI from anywhere | +| **High availability** | Active-passive failover (typically 2 controller nodes) | Horizontal scaling with multiple server replicas (requires PostgreSQL) | + +## Job configuration and submission + +Both Slurm and `dstack` allow defining jobs as files and submitting them via CLI. + +### Slurm + +Slurm uses shell scripts with `#SBATCH` directives embedded in the script: + +
    + +```bash +#!/bin/bash +#SBATCH --job-name=train-model +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=2:00:00 +#SBATCH --partition=gpu +#SBATCH --output=train-%j.out +#SBATCH --error=train-%j.err + +export HF_TOKEN +export LEARNING_RATE=0.001 + +module load python/3.9 +srun python train.py --batch-size=64 +``` + +
    + +Submit the job from a login node (with environment variables that override script defaults): + +
    + +```shell +$ sbatch --export=ALL,LEARNING_RATE=0.002 train.sh + Submitted batch job 12346 +``` + +
    + +### dstack + +`dstack` uses declarative YAML configuration files: + +
    + +```yaml +type: task +name: train-model + +python: 3.9 +repos: + - . + +env: + - HF_TOKEN + - LEARNING_RATE=0.001 + +commands: + - python train.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + cpu: 8 + shm_size: 8GB + +max_duration: 2h +``` + +
    + +Submit the job from anywhere (laptop, CI/CD) via the CLI. `dstack apply` allows overriding various options and runs in attached mode by default, streaming job output in real-time: + +
    + +```shell +$ dstack apply -f .dstack.yml --env LEARNING_RATE=0.002 + + # BACKEND REGION RESOURCES SPOT PRICE + 1 aws us-east-1 4xCPU, 16GB, T4:1 yes $0.10 + +Submit the run train-model? [y/n]: y + +Launching `train-model`... +---> 100% +``` + +
    + +### Configuration comparison + +| | Slurm | dstack | +|---|-------|--------| +| **File type** | Shell script with `#SBATCH` directives | YAML configuration file (`.dstack.yml`) | +| **GPU** | `--gres=gpu:N` or `--gres=gpu:type:N` | `gpu: A100:80GB:4` or `gpu: 40GB..80GB:2..8` (supports ranges) | +| **Memory** | `--mem=M` (per node) or `--mem-per-cpu=M` | `memory: 200GB..` (range, per node, minimum requirement) | +| **CPU** | `--cpus-per-task=C` or `--ntasks` | `cpu: 32` (per node) | +| **Shared memory** | Configured on host | `shm_size: 24GB` (explicit) | +| **Duration** | `--time=2:00:00` | `max_duration: 2h` (both enforce walltime) | +| **Cluster** | `--partition=gpu` | `fleets: [gpu]` (see Partitions and fleets below) | +| **Output** | `--output=train-%j.out` (writes files) | `dstack logs` or UI (streams via API) | +| **Working directory** | `--chdir=/path/to/dir` or defaults to submission directory | `working_dir: /path/to/dir` (defaults to image's working directory, typically `/dstack/run`) | +| **Environment variables** | `export VAR` or `--export=ALL,VAR=value` | `env: - VAR` or `--env VAR=value` | +| **Node exclusivity** | `--exclusive` (entire node) | Automatic if `blocks` is not used or job uses all blocks; required for distributed tasks (`nodes` > 1) | + +> For multi-node examples, see [Distributed training](#distributed-training) below. + +## Containers + +### Slurm + +By default, Slurm runs jobs on compute nodes using the host OS with cgroups for resource isolation and full access to the host filesystem. Container execution is optional via plugins but require explicit filesystem mounts. + +=== "Singularity/Apptainer" + + Container image must exist on shared filesystem. 
Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +=== "Pyxis with Enroot" + + Pyxis plugin pulls images from Docker registry. Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +=== "Enroot" + + Pulls images from registry. Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=docker://pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +### dstack + +`dstack` always uses container. If `image` is not specified, `dstack` uses a base Docker image with `uv`, `python`, essential CUDA drivers, and other dependencies. You can also specify your own Docker image: + +=== "Public registry" + + ```yaml + type: task + name: train-with-image + + image: pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime + + repos: + - . + + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + ``` + +=== "Private registry" + + ```yaml + type: task + name: train-ngc + + image: nvcr.io/nvidia/pytorch:24.01-py3 + + registry_auth: + username: $oauthtoken + password: ${{ secrets.nvidia_ngc_api_key }} + + repos: + - . 
+ + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + ``` + +`dstack` can automatically upload files via `repos` or `files`, or mount filesystems via `volumes`. See [Filesystems and data access](#filesystems-and-data-access) below. + +## Distributed training + +Both Slurm and `dstack` schedule distributed workloads over clusters with fast interconnect, automatically propagating environment variables required by distributed frameworks (PyTorch DDP, DeepSpeed, FSDP, etc.). + +### Slurm + +Slurm explicitly controls both `nodes` and processes/tasks. + +=== "PyTorch DDP" + + ```bash + #!/bin/bash + #SBATCH --job-name=distributed-train + #SBATCH --nodes=4 + #SBATCH --ntasks-per-node=1 # One task per node + #SBATCH --gres=gpu:8 # 8 GPUs per node + #SBATCH --mem=200G + #SBATCH --time=24:00:00 + #SBATCH --partition=gpu + + # Set up distributed training environment + MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) + MASTER_PORT=12345 + + export MASTER_ADDR MASTER_PORT + + # Launch training with torchrun (torch.distributed.launch is deprecated) + srun torchrun \ + --nnodes="$SLURM_JOB_NUM_NODES" \ + --nproc_per_node=8 \ + --node_rank="$SLURM_NODEID" \ + --rdzv_backend=c10d \ + --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \ + train.py \ + --model llama-7b \ + --batch-size=32 \ + --epochs=10 + ``` + + +=== "MPI" + + ```bash + #!/bin/bash + #SBATCH --nodes=2 + #SBATCH --ntasks=16 + #SBATCH --gres=gpu:8 + #SBATCH --mem=200G + #SBATCH --time=24:00:00 + + export MASTER_ADDR=$(scontrol show hostnames $SLURM_NODELIST | head -n1) + export MASTER_PORT=12345 + + # Convert SLURM_JOB_NODELIST to hostfile format + HOSTFILE=$(mktemp) + scontrol show hostnames $SLURM_JOB_NODELIST | awk -v slots=$SLURM_NTASKS_PER_NODE '{print $0" slots="slots}' > $HOSTFILE + + # MPI with NCCL tests or custom MPI application + mpirun \ + --allow-run-as-root \ + --hostfile $HOSTFILE \ + -n $SLURM_NTASKS \ + --bind-to none \ + 
/opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + + rm -f $HOSTFILE + ``` + +### dstack + +`dstack` only specifies `nodes`. A run with multiple nodes creates multiple jobs (one per node), each running in a container on a particular instance. Inside the job container, processes are determined by the user's `commands`. + +=== "PyTorch DDP" + + ```yaml + type: task + name: distributed-train-pytorch + + nodes: 4 + + python: 3.12 + repos: + - . + + env: + - NCCL_DEBUG=INFO + - NCCL_IB_DISABLE=0 + - NCCL_SOCKET_IFNAME=eth0 + + commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --model llama-7b \ + --batch-size=32 \ + --epochs=10 + + resources: + gpu: A100:80GB:8 + memory: 200GB.. + shm_size: 24GB + + max_duration: 24h + ``` + +=== "MPI" + + For MPI workloads that require specific job startup and termination behavior, `dstack` provides `startup_order` and `stop_criteria` properties. The master node (rank 0) runs the MPI command, while worker nodes wait for the master to complete. + + ```yaml + type: task + name: nccl-tests + + nodes: 2 + startup_order: workers-first + stop_criteria: master-done + + env: + - NCCL_DEBUG=INFO + + commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + + resources: + gpu: nvidia:1..8 + shm_size: 16GB + ``` + + If `startup_order` and `stop_criteria` are not configured (as in the PyTorch DDP example above), the master worker starts first and waits until all workers terminate. For MPI workloads, we need to change this. 
+ +#### Nodes and processes comparison + +| | Slurm | dstack | +|---|-------|--------| +| **Nodes** | `--nodes=4` | `nodes: 4` | +| **Processes/tasks** | `--ntasks=8` or `--ntasks-per-node=2` (controls process distribution) | Determined by `commands` (relies on frameworks like `torchrun`, `accelerate`, `mpirun`, etc.) | + +**Environment variables comparison:** + +| Slurm | dstack | Purpose | +|-------|--------|---------| +| `SLURM_NODELIST` | `DSTACK_NODES_IPS` | Newline-delimited list of node IPs | +| `SLURM_NODEID` | `DSTACK_NODE_RANK` | Node rank (0-based) | +| `SLURM_PROCID` | N/A | Process rank (0-based, across all processes) | +| `SLURM_NTASKS` | `DSTACK_GPUS_NUM` | Total number of processes/GPUs | +| `SLURM_NTASKS_PER_NODE` | `DSTACK_GPUS_PER_NODE` | Number of processes/GPUs per node | +| `SLURM_JOB_NUM_NODES` | `DSTACK_NODES_NUM` | Number of nodes | +| Manual master address | `DSTACK_MASTER_NODE_IP` | Master node IP (automatically set) | +| N/A | `DSTACK_MPI_HOSTFILE` | Pre-populated MPI hostfile | + +!!! info "Fleets" + Distributed tasks may run only on a fleet with `placement: cluster` configured. Refer to [Partitions and fleets](#partitions-and-fleets) for configuration details. + +## Queueing and scheduling + +Both systems support core scheduling features and efficient resource utilization. 
+ +| | Slurm | dstack | +|---------|-------|--------| +| **Prioritization** | Multi-factor system (fairshare, age, QOS); influenced via `--qos` or `--partition` flags | Set via `priority` (0-100); plus FIFO within the same priority | +| **Queueing** | Automatic via `sbatch`; managed through partitions | Set `on_events` to `[no-capacity]` under `retry` configuration | +| **Usage quotas** | Set via `sacctmgr` command per user/account/QOS | Not supported | +| **Backfill scheduling** | Enabled via `SchedulerType=sched/backfill` in `slurm.conf` | Not supported | +| **Preemption** | Configured via `PreemptType` in `slurm.conf` (QOS or partition-based) | Not supported | +| **Topology-aware scheduling** | Configured via `topology.conf` (InfiniBand switches, interconnects) | Not supported | + +### Slurm + +Slurm may use a multi-factor priority system, and limit usage across accounts, users, and runs. + +#### QOS + +Quality of Service (QOS) provides a static priority boost. Administrators create QOS levels and assign them to users as defaults: + +
    + +```shell +$ sacctmgr add qos high_priority Priority=1000 +$ sacctmgr modify qos high_priority set MaxWall=200:00:00 MaxTRES=gres/gpu=8 +``` + +
    + +Users can override the default QOS when submitting jobs via CLI (`sbatch --qos=high_priority`) or in the job script: + +
    + +```bash +#!/bin/bash +#SBATCH --qos=high_priority +``` + +
    + +#### Accounts and usage quotas + +Usage quotas limit resource consumption and can be set per user, account, or QOS: + +
    + +```shell +$ sacctmgr add account research +$ sacctmgr modify user user1 set account=research +$ sacctmgr modify user user1 set MaxWall=100:00:00 MaxTRES=gres/gpu=4 +$ sacctmgr modify account research set MaxWall=1000:00:00 MaxTRES=gres/gpu=16 +``` + +
    + +#### Monitoring commands + +Slurm provides several CLI commands to check queue status, job details, and quota usage: + +=== "Queue status" + + Use `squeue` to check queue status. Jobs are listed in scheduling order by priority: + +
    + + ```shell + $ squeue -u $USER + JOBID PARTITION NAME USER ST TIME NODES REASON + 12345 gpu training user1 PD 0:00 2 Priority + ``` + +
    + +=== "Job details" + + Use `scontrol show job` to show detailed information about a specific job: + +
    + + ```shell + $ scontrol show job 12345 + JobId=12345 JobName=training + UserId=user1(1001) GroupId=users(100) + Priority=4294 Reason=Priority (Resources) + ``` + +
    + +=== "Quota usage" + + The `sacct` command can show quota consumption per user, account, or QOS depending on the format options: + +
    + + ```shell + $ sacct -S 2024-01-01 -E 2024-01-31 --format=User,Account,TotalCPU,TotalTRES + User Account TotalCPU TotalTRES + user1 research 100:00:00 gres/gpu=50 + ``` + +
    + +#### Topology-aware scheduling + +Slurm detects network topology (InfiniBand switches, interconnects) and optimizes multi-node job placement to minimize latency. Configured in `topology.conf`, referenced from `slurm.conf`: + +
    + +```bash +SwitchName=switch1 Nodes=node[01-10] +SwitchName=switch2 Nodes=node[11-20] +``` + +
    + +When scheduling multi-node jobs, Slurm prioritizes nodes connected to the same switch to minimize network latency. + +### dstack + +`dstack` doesn't have the concept of accounts or QOS, and doesn't support usage quotas yet. + +#### Priority and retry policy + +However, `dstack` supports prioritization (integer, no multi-factor or pre-emption) and queueing jobs. + +
    + +```yaml +type: task +name: train-with-retry + +python: 3.12 +repos: + - . + +commands: + - python train.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + +# Priority: 0-100 (FIFO within same level; default: 0) +priority: 50 + +retry: + on_events: [no-capacity] # Retry until idle instances are available (enables queueing similar to Slurm) + duration: 48h # Maximum retry time (run age for no-capacity, time since last event for error/interruption) + +max_duration: 2h +``` + +
    + +By default, the `retry` policy is not set, which means the run fails immediately if no capacity is available. + +#### Scheduled runs + +Unlike Slurm, `dstack` supports scheduled runs using the `schedule` property with cron syntax, allowing tasks to start periodically at specific UTC times. + +
    + +```yaml +type: task +name: task-with-cron + +python: 3.12 +repos: + - . + +commands: + - python task.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + +schedule: + cron: "15 23 * * *" # every day at 23:15 UTC +``` + +
    + +#### Monitoring commands + +=== "Queue status" + The `dstack ps` command displays runs and jobs sorted by priority, reflecting the order in which they will be scheduled. + +
    + + ```shell + $ dstack ps + NAME BACKEND RESOURCES PRICE STATUS SUBMITTED + training-job aws H100:1 (spot) $4.50 provisioning 2 mins ago + ``` + +
    + +#### Topology-aware scheduling + +Topology-aware scheduling is not supported in `dstack`. While backend provisioning may respect network topology (e.g., cloud providers may provision instances with optimal inter-node connectivity), `dstack` task scheduling does not leverage topology-aware placement. + +## Partitions and fleets + +Partitions in Slurm and fleets in `dstack` both organize compute nodes for job scheduling. The key difference is that `dstack` fleets natively support dynamic cloud provisioning, whereas Slurm partitions organize pre-configured static nodes. + +| | Slurm | dstack | +|---|-------|--------| +| **Provisioning** | Static nodes only | Supports both static clusters (SSH fleets) and dynamic provisioning via backends (cloud or Kubernetes) | +| **Overlap** | Nodes can belong to multiple partitions | Each instance belongs to exactly one fleet | +| **Accounts and projects** | Multiple accounts can use the same partition; used for quotas and resource accounting | Each fleet belongs to one project | + +### Slurm + +Slurm partitions are logical groupings of static nodes defined in `slurm.conf`. Nodes can belong to multiple partitions: + +
    + +```bash +PartitionName=gpu Nodes=gpu-node[01-10] Default=NO MaxTime=24:00:00 +PartitionName=cpu Nodes=cpu-node[01-50] Default=YES MaxTime=72:00:00 +PartitionName=debug Nodes=gpu-node[01-10] Default=NO MaxTime=1:00:00 +``` + +
    + +Submit to a specific partition: + +
    + +```shell +$ sbatch --partition=gpu train.sh + Submitted batch job 12346 +``` + +
    + +### dstack + +`dstack` fleets are pools of instances (VMs or containers) that serve as both the organization unit and the provisioning template. + +`dstack` supports two types of fleets: + +| Fleet type | Description | +|------------|-------------| +| **Backend fleets** | Dynamically provisioned via configured backends (cloud or Kubernetes). Specify `resources` and `nodes` range; `dstack apply` provisions matching instances/clusters automatically. | +| **SSH fleets** | Use existing on-premises servers/clusters via `ssh_config`. `dstack apply` connects via SSH, installs dependencies. | + +=== "Backend fleets" + +
    + + ```yaml + type: fleet + name: gpu-fleet + + nodes: 0..8 + + resources: + gpu: A100:80GB:8 + + # Optional: Enables inter-node connectivity; required for distributed tasks + placement: cluster + + # Optional: Splits GPUs into blocks for multi-tenant sharing; + # allows sharing the instance across up to 8 workloads + blocks: 8 + + backends: [aws] + + # Spot instances for cost savings + spot_policy: auto + ``` + +
    + +=== "SSH fleets" + +
    + + ```yaml + type: fleet + name: on-prem-gpu-fleet + + # Optional: Enables inter-node connectivity; required for distributed tasks + placement: cluster + + # Optional: Allows sharing the instance across up to 8 workloads + blocks: 8 + + ssh_config: + user: dstack + identity_file: ~/.ssh/id_rsa + hosts: + - gpu-node01.example.com + - gpu-node02.example.com + + # Optional: Only required if hosts are behind a login node (bastion host) + proxy_jump: + hostname: login-node.example.com + user: dstack + identity_file: ~/.ssh/login_node_key + ``` + +
    + +Tasks with multiple nodes require a fleet with `placement: cluster` configured, otherwise they cannot run. + +Submit to a specific fleet: + +
    + +```shell +$ dstack apply -f train.dstack.yml --fleet gpu-fleet + BACKEND REGION RESOURCES SPOT PRICE + 1 aws us-east-1 4xCPU, 16GB, T4:1 yes $0.10 + Submit the run train-model? [y/n]: y + Launching `train-model`... + ---> 100% +``` + +
    + +Create or update a fleet: + +
    + +```shell +$ dstack apply -f fleet.dstack.yml + Provisioning... + ---> 100% +``` + +
    + +List fleets: + +
    + +```shell +$ dstack fleet + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + gpu-fleet 0 aws (us-east-1) A100:80GB (spot) $0.50 idle 3 mins ago +``` + +
    + +## Filesystems and data access + +Both Slurm and `dstack` allow workloads to access filesystems (including shared filesystems) and copy files. + +| | Slurm | dstack | +|---|-------|--------| +| **Host filesystem access** | Full access by default (native processes); mounting required only for containers | Always uses containers; requires explicit mounting via `volumes` (instance or network) | +| **Shared filesystems** | Assumes global namespace (NFS, Lustre, GPFS); same path exists on all nodes | Supported via SSH fleets with instance volumes (pre-mounted network storage); network volumes for backend fleets (limited support for shared filesystems) | +| **Instance disk size** | Fixed by cluster administrator | Configurable via `disk` property in `resources` (tasks) or fleet configuration; supports ranges (e.g., `disk: 500GB` or `disk: 200GB..1TB`) | +| **Local/temporary storage** | `$SLURM_TMPDIR` (auto-cleaned on job completion) | Container filesystem (auto-cleaned on job completion; except instance volumes or network volumes) | +| **File transfer** | `sbcast` for broadcasting files to allocated nodes | `repos` and `files` properties; `rsync`/`scp` via SSH (when attached) | + +### Slurm + +Slurm assumes a shared filesystem (NFS, Lustre, GPFS) with a global namespace. The same path exists on all nodes, and `$SLURM_TMPDIR` provides local scratch space that is automatically cleaned. + +=== "Native processes" + +
    + + ```bash + #!/bin/bash + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --time=24:00:00 + + # Global namespace - same path on all nodes + # Dataset accessible at same path on all nodes + DATASET_PATH=/shared/datasets/imagenet + + # Local scratch (faster I/O, auto-cleaned) + # Copy dataset to local SSD for faster access + cp -r $DATASET_PATH $SLURM_TMPDIR/dataset + + # Training with local dataset + python train.py \ + --data=$SLURM_TMPDIR/dataset \ + --checkpoint-dir=/shared/checkpoints \ + --epochs=100 + + # $SLURM_TMPDIR automatically cleaned when job ends + # Checkpoints saved to shared filesystem persist + ``` + +
    + +=== "Containers" + + When using containers, shared filesystems must be explicitly mounted via bind mounts: + +
    + + ```bash + #!/bin/bash + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --time=24:00:00 + + # Shared filesystem mounted at /datasets and /checkpoints + DATASET_PATH=/datasets/imagenet + + # Local scratch accessible via $SLURM_TMPDIR (host storage mounted into container) + # Copy dataset to local scratch, then train + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + cp -r $DATASET_PATH $SLURM_TMPDIR/dataset + + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py \ + --data=$SLURM_TMPDIR/dataset \ + --checkpoint-dir=/checkpoints \ + --epochs=100 + + # $SLURM_TMPDIR automatically cleaned when job ends + # Checkpoints saved to mounted shared filesystem persist + ``` + +
    + +#### File broadcasting (sbcast) + +Slurm provides `sbcast` to distribute files efficiently using its internal network topology, avoiding filesystem contention: + +
    + +```bash +#!/bin/bash +#SBATCH --nodes=4 +#SBATCH --ntasks=32 + +# Broadcast file to all allocated nodes +srun --ntasks=1 --nodes=1 sbcast /shared/data/input.txt /tmp/input.txt + +# Use broadcasted file on all nodes +srun python train.py --input=/tmp/input.txt +``` + +
    + +### dstack + +`dstack` supports both accessing filesystems (including shared filesystems) and uploading/downloading code/data from the client. + +#### Instance volumes + +Instance volumes mount host directories into containers. With distributed tasks, the host can use a shared filesystem (NFS, Lustre, GPFS) to share data across jobs within the same task: + +
    + +```yaml +type: task +name: distributed-train + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Host directory (can be on shared filesystem) mounted into container + - /mnt/shared/datasets:/data + - /mnt/shared/checkpoints:/checkpoints + +commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --data=/data \ + --checkpoint-dir=/checkpoints + +resources: + gpu: A100:80GB:8 + memory: 200GB +``` + +
    + +#### Network volumes + +Network volumes are persistent cloud storage (AWS EBS, GCP persistent disks, Runpod volumes). + +Single-node task: + +
    + +```yaml +type: task +name: train-model + +python: 3.9 +repos: + - . + +volumes: + - name: imagenet-dataset + path: /data + +commands: + - python train.py --data=/data --batch-size=64 + +resources: + gpu: 1 + memory: 32GB +``` + +
    + +Network volumes cannot be used with distributed tasks (no multi-attach support), except where multi-attach is supported (Runpod) or via volume interpolation. + +For distributed tasks, use interpolation to attach different volumes to each node. + +
    + +```yaml +type: task +name: distributed-train + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Each node gets its own volume + - name: dataset-${{ dstack.node_rank }} + path: /data + +commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --data=/data + +resources: + gpu: A100:80GB:8 + memory: 200GB +``` + +
    + +Volume name interpolation is not the same as a shared filesystem—each node has its own separate volume. `dstack` currently has limited support for shared filesystems when using backend fleets. + +#### Repos and files + +The `repos` and `files` properties allow uploading code or data into the container. + +=== "Repos" + + The `repos` property clones Git repositories into the container. `dstack` clones the repo on the instance, applies local changes, and mounts it into the container. This is useful for code that needs to be version-controlled and synced. + +
    + + ```yaml + type: task + name: train-model + + python: 3.9 + + repos: + - . # Clone current directory repo + + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + cpu: 8 + ``` + +
    + +=== "Files" + + The `files` property mounts local files or directories into the container. Each entry maps a local path to a container path. + +
    + + ```yaml + type: task + name: train-model + + python: 3.9 + + files: + - ../configs:~/configs + - ~/.ssh/id_rsa:~/ssh/id_rsa + + commands: + - python train.py --config ~/configs/model.yaml --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + cpu: 8 + ``` + +
    + + Files are uploaded to the instance and mounted into the container, but are not persisted across runs (2MB limit per file, configurable). + +#### SSH file transfer + +While attached to a run, you can transfer files via `rsync` or `scp` using the run name alias: + +=== "rsync" + +
    + + ```shell + $ rsync -avz ./data/ <run name>:/path/inside/container/data/ + ``` + +
    + +=== "scp" + +
    + + ```shell + $ scp large-dataset.h5 <run name>:/path/inside/container/ + ``` + +
    + +> Uploading code/data from/to the client is not recommended as transfer speed greatly depends on network bandwidth between the CLI and the instance. + +## Interactive development + +Both Slurm and `dstack` allow allocating resources for interactive development. + +| | Slurm | dstack | +|---|-------|--------| +| **Configuration** | Uses the `salloc` command to allocate resources with a time limit; resources are automatically released when time expires | Uses `type: dev-environment` configurations as a first-class citizen; provisions compute and runs until explicitly stopped (optional inactivity-based termination) | +| **IDE access** | Requires SSH access to allocated nodes | Native access using desktop IDEs (VS Code, Cursor, Windsurf, etc.) or SSH | +| **SSH access** | SSH to allocated nodes (host OS) using `SLURM_NODELIST` or `srun --pty` | SSH automatically configured; access via run name alias (inside container) | + +### Slurm + +Slurm uses `salloc` to allocate resources with a time limit. `salloc` returns a shell on the login node with environment variables set; use `srun` or SSH to access compute nodes. After the time limit expires, resources are automatically released: + +
    + +```shell +$ salloc --nodes=1 --gres=gpu:1 --time=4:00:00 + salloc: Granted job allocation 12346 + +$ srun --pty bash + [user@compute-node-01 ~]$ python train.py --epochs=1 + Training epoch 1... + [user@compute-node-01 ~]$ exit + exit + +$ exit + exit + salloc: Relinquishing job allocation 12346 +``` + +
    + +Alternatively, SSH directly to allocated nodes using hostnames from `SLURM_NODELIST`: + +
    + +```shell +$ ssh $SLURM_NODELIST + [user@compute-node-01 ~]$ +``` + +
    + +### dstack + +`dstack` uses the `dev-environment` configuration type, which automatically provisions an instance and runs until explicitly stopped, with optional inactivity-based termination. Access is provided via native desktop IDEs (VS Code, Cursor, Windsurf, etc.) or SSH: + +
    + +```yaml +type: dev-environment +name: ml-dev + +python: 3.12 +ide: vscode + +resources: + gpu: A100:80GB:1 + memory: 200GB + +# Optional: Maximum runtime duration (stops after this time) +max_duration: 8h + +# Optional: Auto-stop after period of inactivity (no SSH/IDE connections) +inactivity_duration: 2h + +# Optional: Auto-stop if GPU utilization is below threshold +utilization_policy: + min_gpu_utilization: 10 # Percentage + time_window: 1h +``` + +
    + +Start the dev environment: + +
    + +```shell +$ dstack apply -f dev.dstack.yml + BACKEND REGION RESOURCES SPOT PRICE + 1 runpod CA-MTL-1 9xCPU, 48GB, A5000:24GB yes $0.11 + Submit the run ml-dev? [y/n]: y + Launching `ml-dev`... + ---> 100% + To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+ml-dev/workflow +``` + +
    + +#### Port forwarding + +`dstack` tasks support exposing `ports` for running interactive applications like Jupyter notebooks or Streamlit apps: + +=== "Jupyter" + +
    + + ```yaml + type: task + name: jupyter + + python: 3.12 + + commands: + - pip install jupyterlab + - jupyter lab --allow-root + + ports: + - 8888 + + resources: + gpu: 1 + memory: 32GB + ``` + +
    + +=== "Streamlit" + +
    + + ```yaml + type: task + name: streamlit-app + + python: 3.12 + + commands: + - pip install streamlit + - streamlit hello + + ports: + - 8501 + + resources: + gpu: 1 + memory: 32GB + ``` + +
    + +While `dstack apply` is attached, ports are automatically forwarded to `localhost` (e.g., `http://localhost:8888` for Jupyter, `http://localhost:8501` for Streamlit). + +## Job arrays + +### Slurm job arrays + +Slurm provides native job arrays (`--array=1-100`) that create multiple job tasks from a single submission. Job arrays can be specified via CLI argument or in the job script. + +
    + +```shell +$ sbatch --array=1-100 train.sh + Submitted batch job 1001 +``` + +
    + +Each task can use the `$SLURM_ARRAY_TASK_ID` environment variable within the job script to determine its configuration. Output files can use `%A` for the job ID and `%a` for the task ID in `#SBATCH --output` and `--error` directives. + +### dstack + +`dstack` does not support native job arrays. Submit multiple runs programmatically via CLI or API. Pass a custom environment variable (e.g., `TASK_ID`) to identify each run: + +
    + +```shell +$ for i in {1..100}; do + dstack apply -f train.dstack.yml \ + --name "train-array-task-${i}" \ + --env TASK_ID=${i} \ + --detach + done +``` + +
    + + +## Environment variables and secrets + +Both Slurm and `dstack` handle sensitive data (API keys, tokens, passwords) for ML workloads. Slurm uses environment variables or files, while `dstack` provides encrypted secrets management in addition to environment variables. + +### Slurm + +Slurm uses OS-level authentication. Jobs run with the user's UID/GID and inherit the environment from the login node. No built-in secrets management; users manage credentials in their environment or shared files. + +Set environment variables in the shell before submitting (requires `--export=ALL`): + +
    + +```shell +$ export HF_TOKEN=$(cat ~/.hf_token) +$ sbatch --export=ALL train.sh + Submitted batch job 12346 +``` + +
    + +### dstack + +In addition to environment variables (`env`), `dstack` provides a secrets management system with encryption. Secrets are referenced in configuration using `${{ secrets.name }}` syntax. + +Set secrets: + +
    + +```shell +$ dstack secret set huggingface_token +$ dstack secret set wandb_api_key +``` + +
    + +Use secrets in configuration: + +
    + +```yaml +type: task +name: train-with-secrets + +python: 3.12 +repos: + - . + +env: + - HF_TOKEN=${{ secrets.huggingface_token }} + - WANDB_API_KEY=${{ secrets.wandb_api_key }} + +commands: + - pip install huggingface_hub + - huggingface-cli download meta-llama/Llama-2-7b-hf + - wandb login + - python train.py + +resources: + gpu: A100:80GB:8 +``` + +
    + +## Authentication + +### Slurm + +Slurm uses OS-level authentication. Users authenticate via SSH to login nodes using their Unix accounts. Jobs run with the user's UID/GID, ensuring user isolation—users cannot access other users' files or processes. Slurm enforces file permissions based on Unix UID/GID and association limits (MaxJobs, MaxSubmitJobs) configured per user or account. + +### dstack + +`dstack` uses token-based authentication. Users are registered within projects on the server, and each user is issued a token. This token is used for authentication with all CLI and API commands. Access is controlled at the project level with user roles: + +| Role | Permissions | +|------|-------------| +| **Admin** | Can manage project settings, including backends, gateways, and members | +| **Manager** | Can manage project members but cannot configure backends and gateways | +| **User** | Can manage project resources including runs, fleets, and volumes | + +`dstack` manages SSH keys on the server for secure access to runs and instances. User SSH keys are automatically generated and used when attaching to runs via `dstack attach` or `dstack apply`. Project SSH keys are used by the server to establish SSH connections to provisioned instances. + +!!! note "Multi-tenancy isolation" + `dstack` currently does not offer full isolation for multi-tenancy. Users may access global resources within the host. 
+ +## Monitoring and observability + +Both systems provide tools to monitor job/run status, cluster/node status, resource metrics, and logs: + +| | Slurm | dstack | +|---|-------|--------| +| **Job/run status** | `squeue` lists jobs in queue | `dstack ps` lists active runs | +| **Cluster/node status** | `sinfo` shows node availability | `dstack fleet` lists instances | +| **CPU/memory metrics** | `sstat` for running jobs | `dstack metrics` for real-time metrics | +| **GPU metrics** | Requires SSH to nodes, `nvidia-smi` per node | Automatic collection via `nvidia-smi`/`amd-smi`, `dstack metrics` | +| **Job history** | `sacct` for completed jobs | `dstack ps -n NUM` shows run history | +| **Logs** | Written to files (`--output`, `--error`) | Streamed via API, `dstack logs` | + +### Slurm + +Slurm provides command-line tools for monitoring cluster state, jobs, and history. + +Check node status: + +
    + +```shell +$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + gpu up 1-00:00:00 10 idle gpu-node[01-10] +``` + +
    + +Check job queue: + +
    + +```shell +$ squeue -u $USER + JOBID PARTITION NAME USER ST TIME NODES + 12345 gpu training user1 R 2:30 2 +``` + +
    + +Check job details: + +
    + +```shell +$ scontrol show job 12345 + JobId=12345 JobName=training + UserId=user1(1001) GroupId=users(100) + NumNodes=2 NumCPUs=64 NumTasks=32 + Gres=gpu:8(IDX:0,1,2,3,4,5,6,7) +``` + +
    + +Check resource usage for running jobs (`sstat` only works for running jobs): + +
    + +```shell +$ sstat --job=12345 --format=JobID,MaxRSS,MaxVMSize,CPUUtil + JobID MaxRSS MaxVMSize CPUUtil + 12345.0 2048M 4096M 95.2% +``` + +
    + +Check GPU usage (requires SSH to node): + +
    + +```shell +$ srun --jobid=12345 --pty nvidia-smi + GPU 0: 95% utilization, 72GB/80GB memory +``` + +
    + +Check job history for completed jobs: + +
    + +```shell +$ sacct --job=12345 --format=JobID,Elapsed,MaxRSS,State,ExitCode + JobID Elapsed MaxRSS State ExitCode + 12345 2:30:00 2048M COMPLETED 0:0 +``` + +
    + +View logs (written to files via `--output` and `--error` flags; typically in the submission directory on a shared filesystem): + +
    + +```shell +$ cat slurm-12345.out + Training started... + Epoch 1/10: loss=0.5 +``` + +
    + +If logs are on compute nodes, find the node from `scontrol show job`, then access via `srun --jobid` (running jobs) or SSH (completed jobs): + +
    + +```shell +$ srun --jobid=12345 --nodelist=gpu-node01 --pty bash +$ cat slurm-12345.out +``` + +
    + +### dstack + +`dstack` automatically collects essential metrics (CPU, memory, GPU utilization) using vendor utilities (`nvidia-smi`, `amd-smi`, etc.) and provides real-time monitoring via CLI. + +List runs: + +
    + +```shell +$ dstack ps + NAME BACKEND GPU PRICE STATUS SUBMITTED + training-job aws H100:1 (spot) $4.50 running 5 mins ago +``` + +
    + +List fleets and instances (shows GPU health status): + +
    + +```shell +$ dstack fleet + FLEET INSTANCE BACKEND RESOURCES STATUS PRICE CREATED + my-fleet 0 aws (us-east-1) T4:16GB:1 idle $0.526 11 mins ago + 1 aws (us-east-1) T4:16GB:1 idle (warning) $0.526 11 mins ago +``` + +
    + +Check real-time metrics: + +
    + +```shell +$ dstack metrics training-job + NAME STATUS CPU MEMORY GPU + training-job running 45% 16.27GB/200GB gpu=0 mem=72.48GB/80GB util=95% +``` + +
    + +Stream logs (stored centrally using external storage services like CloudWatch Logs or GCP Logging, accessible via CLI and UI): + +
    + +```shell +$ dstack logs training-job + Training started... + Epoch 1/10: loss=0.5 +``` + +
    + +#### Prometheus integration + +`dstack` exports additional metrics to Prometheus: + +| Metric type | Description | +|-------------|-------------| +| **Fleet metrics** | Instance duration, price, GPU count | +| **Run metrics** | Run counters (total, terminated, failed, done) | +| **Job metrics** | Execution time, cost, CPU/memory/GPU usage | +| **DCGM telemetry** | Temperature, ECC errors, PCIe replay counters, NVLink errors | +| **Server health** | HTTP request metrics | + +To enable Prometheus export, set the `DSTACK_ENABLE_PROMETHEUS_METRICS` environment variable and configure Prometheus to scrape metrics from `/metrics`. + +> GPU health monitoring is covered in the [GPU health monitoring](#gpu-health-monitoring) section below. + +## Fault tolerance, checkpointing, and retry + +Both systems support fault tolerance for long-running training jobs that may be interrupted by hardware failures, spot instance terminations, or other issues: + +| | Slurm | dstack | +|---|-------|--------| +| **Retry** | `--requeue` flag requeues jobs on node failure (hardware crash) or preemption, not application failures (software crashes); all nodes requeued together (all-or-nothing) | `retry` property with `on_events` (`error`, `interruption`) and `duration`; all jobs stopped and run resubmitted if any job fails (all-or-nothing) | +| **Graceful stop** | Grace period with `SIGTERM` before `SIGKILL`; `--signal` sends signal before time limit (e.g., `--signal=B:USR1@300`) | Not supported | +| **Checkpointing** | Application-based; save to shared filesystem | Application-based; save to persistent volumes | +| **Instance health** | `HealthCheckProgram` in `slurm.conf` runs custom scripts (DCGM/RVS); non-zero exit drains node (excludes from new scheduling, running jobs continue) | Automatic GPU health monitoring via DCGM; unhealthy instances excluded from scheduling | + +### Slurm + +Slurm handles three types of failures: system failures (hardware crash), application failures 
(software crash), and preemption. + +Enable automatic requeue on node failure (not application failures). For distributed jobs, if one node fails, the entire job is requeued (all-or-nothing): + +
    + +```bash +#!/bin/bash +#SBATCH --job-name=train-with-checkpoint +#SBATCH --nodes=4 +#SBATCH --gres=gpu:8 +#SBATCH --time=48:00:00 +#SBATCH --requeue # Requeue on node failure only + +srun python train.py +``` + +
    + +Preempted jobs receive `SIGTERM` during a grace period before `SIGKILL` and are typically requeued automatically. Use `--signal` to send a custom signal before the time limit expires: + +
    + +```bash +#!/bin/bash +#SBATCH --job-name=train-with-checkpoint +#SBATCH --nodes=4 +#SBATCH --gres=gpu:8 +#SBATCH --time=48:00:00 +#SBATCH --signal=B:USR1@300 # Send USR1 5 minutes before time limit + +trap 'python save_checkpoint.py --checkpoint-dir=/shared/checkpoints' USR1 + +if [ -f /shared/checkpoints/latest.pt ]; then + RESUME_FLAG="--resume /shared/checkpoints/latest.pt" +fi + +srun python train.py \ + --checkpoint-dir=/shared/checkpoints \ + $RESUME_FLAG +``` + +
    + +Checkpoints are saved to a shared filesystem. Applications must implement checkpointing logic. + +Custom health checks are configured via `HealthCheckProgram` in `slurm.conf`: + +
    + +```bash +HealthCheckProgram=/shared/scripts/gpu_health_check.sh +``` + +
    + +The health check script should exit with non-zero code to drain the node: + +
    + +```bash +#!/bin/bash +dcgmi diag -r 1 +if [ $? -ne 0 ]; then + exit 1 # Non-zero exit drains node +fi +``` + +
    + +Drained nodes are excluded from new scheduling, but running jobs continue until completion. + +### dstack + +`dstack` handles three types of failures: provisioning failures (`no-capacity`), job failures (`error`), and interruptions (`interruption`). The `error` event is triggered by application failures (non-zero exit code) and instance unreachable issues. The `interruption` event is triggered by spot instance terminations and network/hardware issues. + +By default, runs fail immediately. Enable retry via the `retry` property to handle these events: + +
    + +```yaml +type: task +name: train-with-checkpoint-retry + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Use instance volumes (host directories) or network volumes (cloud-managed persistent storage) + - name: checkpoint-volume + path: /checkpoints + +commands: + - | + if [ -f /checkpoints/latest.pt ]; then + RESUME_FLAG="--resume /checkpoints/latest.pt" + fi + python train.py \ + --checkpoint-dir=/checkpoints \ + $RESUME_FLAG + +resources: + gpu: A100:80GB:8 + memory: 200GB + +spot_policy: auto + +retry: + on_events: [error, interruption] + duration: 48h +``` + +
    + +For distributed tasks, if any job fails and retry is enabled, all jobs are stopped and the run is resubmitted (all-or-nothing). + +Unlike Slurm, `dstack` does not support graceful shutdown signals. Applications must implement proactive checkpointing (periodic saves) and check for existing checkpoints on startup to resume after retries. + +## GPU health monitoring + +Both systems monitor GPU health to prevent degraded hardware from affecting workloads: + +| | Slurm | dstack | +|---|-------|--------| +| **Health checks** | Custom scripts (DCGM/RVS) via `HealthCheckProgram` in `slurm.conf`; typically active diagnostics (`dcgmi diag`) or passive health watches | Automatic DCGM health watches (passive, continuous monitoring) | +| **Failure handling** | Non-zero exit drains node (excludes from new scheduling, running jobs continue); status: DRAIN/DRAINED | Unhealthy instances excluded from scheduling; status shown in `dstack fleet`: `idle` (healthy), `idle (warning)`, `idle (failure)` | + +### Slurm + +Configure custom health check scripts via `HealthCheckProgram` in `slurm.conf`. Scripts typically use DCGM diagnostics (`dcgmi diag`) for NVIDIA GPUs or RVS for AMD GPUs: + +
    + +```bash +HealthCheckProgram=/shared/scripts/gpu_health_check.sh +``` + +
    + +
    + +```bash +#!/bin/bash +dcgmi diag -r 1 # DCGM diagnostic for NVIDIA GPUs +if [ $? -ne 0 ]; then + exit 1 # Non-zero exit drains node +fi +``` + +
    + +Drained nodes are excluded from new scheduling, but running jobs continue until completion. + +### dstack + +`dstack` automatically monitors GPU health using DCGM background health checks on instances with NVIDIA GPUs. Supported on cloud backends where DCGM is pre-installed automatically (or comes with users' `os_images`) and SSH fleets where DCGM packages (`datacenter-gpu-manager-4-core`, `datacenter-gpu-manager-4-proprietary`, `datacenter-gpu-manager-exporter`) are installed on hosts. + +> AMD GPU health monitoring is not supported yet. + +Health status is displayed in `dstack fleet`: + +
    + +```shell +$ dstack fleet + FLEET INSTANCE BACKEND RESOURCES STATUS PRICE CREATED + my-fleet 0 aws (us-east-1) T4:16GB:1 idle $0.526 11 mins ago + 1 aws (us-east-1) T4:16GB:1 idle (warning) $0.526 11 mins ago + 2 aws (us-east-1) T4:16GB:1 idle (failure) $0.526 11 mins ago +``` + +
    + +Health status: + +| Status | Description | +|--------|-------------| +| `idle` | Healthy, no issues detected | +| `idle (warning)` | Non-fatal issues (e.g., correctable ECC errors); instance still usable | +| `idle (failure)` | Fatal issues (uncorrectable ECC, PCIe failures); instance excluded from scheduling | + +GPU health metrics are also exported to Prometheus (see [Prometheus integration](#prometheus-integration)). + +## Job dependencies + +Job dependencies enable chaining tasks together, ensuring that downstream jobs only run after upstream jobs complete. + +### Slurm dependencies + +Slurm provides native dependency support via `--dependency` flags. Dependencies are managed by Slurm: + +| Dependency type | Description | +|----------------|-------------| +| **`afterok`** | Runs only if the dependency job finishes with Exit Code 0 (success) | +| **`afterany`** | Runs regardless of success or failure (useful for cleanup jobs) | +| **`aftercorr`** | For array jobs, allows corresponding tasks to start as soon as the matching task in the dependency array completes (e.g., Task 1 of Array B starts when Task 1 of Array A finishes, without waiting for the entire Array A) | +| **`singleton`** | Based on job name and user (not job IDs), ensures only one job with the same name runs at a time for that user (useful for serializing access to shared resources) | + +Submit a job that depends on another job completing successfully: + +
    + +```shell +$ JOB_TRAIN=$(sbatch train.sh | awk '{print $4}') + Submitted batch job 1001 + +$ sbatch --dependency=afterok:$JOB_TRAIN evaluate.sh + Submitted batch job 1002 +``` + +
    + +Submit a job with singleton dependency (only one job with this name runs at a time): + +
    + +```shell +$ sbatch --job-name=ModelTraining --dependency=singleton train.sh + Submitted batch job 1004 +``` + +
    + +### dstack { #dstack-workflow-orchestration } + +`dstack` does not support native job dependencies. Use external workflow orchestration tools (Airflow, Prefect, etc.) to implement dependencies. + +=== "Prefect" + + ```python + from prefect import flow, task + import subprocess + + @task + def train_model(): + """Submit training job and wait for completion""" + subprocess.run( + ["dstack", "apply", "-f", "train.dstack.yml", "--name", "train-run"], + check=True # Raises exception if training fails + ) + return "train-run" + + @task + def evaluate_model(run_name): + """Submit evaluation job after training succeeds""" + subprocess.run( + ["dstack", "apply", "-f", "evaluate.dstack.yml", "--name", f"eval-{run_name}"], + check=True + ) + + @flow + def ml_pipeline(): + train_run = train_model() + evaluate_model(train_run) + ``` + +=== "Airflow" + + ```python + from airflow.decorators import dag, task + from datetime import datetime + import subprocess + + @dag(schedule=None, start_date=datetime(2024, 1, 1), catchup=False) + def ml_training_pipeline(): + @task + def train(**context): + """Submit training job and wait for completion""" + run_name = f"train-{context['ds']}" + subprocess.run( + ["dstack", "apply", "-f", "train.dstack.yml", "--name", run_name], + check=True # Raises exception if training fails + ) + return run_name + + @task + def evaluate(run_name, **context): + """Submit evaluation job after training succeeds""" + eval_name = f"eval-{run_name}" + subprocess.run( + ["dstack", "apply", "-f", "evaluate.dstack.yml", "--name", eval_name], + check=True + ) + + # Define task dependencies - train() completes before evaluate() starts + train_run = train() + evaluate(train_run) + + ml_training_pipeline() + ``` + +## Heterogeneous jobs + +Heterogeneous jobs (het jobs) allow a single job to request different resource configurations for different components (e.g., GPU nodes for training, high-memory CPU nodes for preprocessing). 
This is an edge case used for coordinated multi-component workflows. + +### Slurm + +Slurm supports heterogeneous jobs via `#SBATCH hetjob` and `--het-group` flags. Each component can specify different resources: + +```bash +#!/bin/bash +#SBATCH --job-name=ml-pipeline +#SBATCH hetjob +#SBATCH --het-group=0 --nodes=2 --gres=gpu:8 --mem=200G +#SBATCH --het-group=1 --nodes=1 --mem=500G --partition=highmem + +# Use SLURM_JOB_COMPONENT_ID to identify the component +if [ "$SLURM_JOB_COMPONENT_ID" -eq 0 ]; then + srun python train.py +elif [ "$SLURM_JOB_COMPONENT_ID" -eq 1 ]; then + srun python preprocess.py +fi +``` + +### dstack + +`dstack` does not support heterogeneous jobs natively. Use separate runs with [workflow orchestration tools (Prefect, Airflow)](#dstack-workflow-orchestration) or submit multiple runs programmatically to coordinate components with different resource requirements. + +## What's next? + +1. Check out [Quickstart](../../quickstart.md) +2. Read about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md) +3. Browse the [examples](../../examples.md) diff --git a/mkdocs/docs/guides/protips.md b/mkdocs/docs/guides/protips.md new file mode 100644 index 0000000000..d3c1ae5c2a --- /dev/null +++ b/mkdocs/docs/guides/protips.md @@ -0,0 +1,582 @@ +--- +title: Protips +description: Tips and tricks for using dstack efficiently +--- + +# Protips + +Below are tips and tricks to use `dstack` more efficiently. + +## Dev environments + +Before running a task or service, it's recommended that you first start with a dev environment. Dev environments +allow you to run commands interactively. + +Once the commands work, go ahead and run them as a task or a service. + +??? info "Notebooks" + **VS Code** + + When you access a dev environment using your desktop VS Code, it allows you to work with Jupyter notebooks via its + pre-configured and easy-to-use extension. 
+ + **JupyterLab** + + If you prefer to use JupyterLab, you can run it as a task: + + ```yaml + type: task + + commands: + - pip install jupyterlab + - jupyter lab --allow-root + + ports: + - 8888 + + ``` + +## Tasks + +Tasks can be used not only for batch jobs but also for web applications. + +
    + +```yaml +type: task +name: streamlit-task + +python: 3.12 + +commands: + - uv pip install streamlit + - streamlit hello +ports: + - 8501 + +``` + +
    + +While you run a task, `dstack apply` forwards the remote ports to `localhost`. + +
    + +```shell +$ dstack apply -f app.dstack.yml + + Welcome to Streamlit. Check out our demo in your browser. + + Local URL: http://localhost:8501 +``` + +
    + +This allows you to access the remote `8501` port on `localhost:8501` while the CLI is attached. + +??? info "Port mapping" + If you want to override the local port, use the `--port` option: + +
    + + ```shell + $ dstack apply -f app.dstack.yml --port 3000:8501 + ``` + +
    + + This will forward the remote `8501` port to `localhost:3000`. + +!!! info "Tasks vs. services" + [Services](../concepts/services.md) provide external access, `https`, replicas with autoscaling, OpenAI-compatible endpoint + and other service features. If you don't need them, you can use [tasks](../concepts/tasks.md) for running apps. + +## Utilization policy + +If you want your run to automatically terminate if any of the GPUs are underutilized, you can specify `utilization_policy`. + +Below is an example of a dev environment that auto-terminates if any GPU stays below 10% utilization for 1 hour. + +
    + +```yaml +type: dev-environment +name: my-dev + +python: 3.12 +ide: cursor + +resources: + gpu: H100:8 + +utilization_policy: + min_gpu_utilization: 10 + time_window: 1h +``` + +
    + +## Docker in Docker + +Set `docker` to `true` to enable the `docker` CLI in your dev environment, e.g., to run or build Docker images, or use Docker Compose. + +=== "Dev environment" +
    + + ```yaml + type: dev-environment + name: vscode + + docker: true + + ide: vscode + init: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi + ``` + +
    + +=== "Task" +
    + + ```yaml + type: task + name: docker-nvidia-smi + + docker: true + + commands: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi + + resources: + gpu: 1 + ``` + +
    + +??? info "Volumes" + + To persist Docker data between runs (e.g. images, containers, volumes, etc), create a `dstack` [volume](../concepts/volumes.md) + and attach it in your run configuration. + + === "Network volumes" + + ```yaml + type: dev-environment + name: vscode + + docker: true + ide: vscode + + volumes: + - name: docker-volume + path: /var/lib/docker + ``` + + === "Instance volumes" + + ```yaml + type: dev-environment + name: vscode + + docker: true + ide: vscode + + volumes: + - name: /docker-volume + path: /var/lib/docker + optional: true + ``` + +## Fleets + +### Creation policy + +By default, when you run `dstack apply` with a dev environment, task, or service, +if no `idle` instances from the available fleets meet the requirements, `dstack` provisions a new instance using configured backends. + +To ensure `dstack apply` doesn't provision a new instance but reuses an existing one, +pass `-R` (or `--reuse`) to `dstack apply`. + +
    + +```shell +$ dstack apply -R -f examples/.dstack.yml +``` + +
    + +Or, set [`creation_policy`](../reference/dstack.yml/dev-environment.md#creation_policy) to `reuse` in the run configuration. + +### Idle duration + +If the run is submitted to a fleet with `nodes` set to a range and a new instance is provisioned, the shorter of the fleet's and run's `idle_duration` is used. +If the run reuses an existing fleet instance, only the fleet's +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) applies. + +If an instance remains `idle`, it is automatically terminated after `idle_duration`. + +> Not applied for container-based backends (Kubernetes, Vast.ai, Runpod). + +## Volumes + +To persist data across runs, it is recommended to use volumes. +`dstack` supports two types of volumes: [network](../concepts/volumes.md#network-volumes) +(for persisting data even if the instance is interrupted) +and [instance](../concepts/volumes.md#instance-volumes) (useful for persisting cached data across runs while the instance remains active). + +> If you use [SSH fleets](../concepts/fleets.md#ssh-fleets), you can mount network storage (e.g., NFS or SMB) to the hosts and access it in runs via instance volumes. + +## Environment variables + +If a configuration requires an environment variable that you don't want to hardcode in the YAML, you can define it +without assigning a value: + +
    + +```yaml +type: dev-environment +name: vscode + +python: 3.12 + +env: + - HF_TOKEN +ide: vscode +``` + +
    + +Then, you can pass the environment variable either via the shell: + +
    + +```shell +$ export HF_TOKEN=... +$ dstack apply -f .dstack.yml +``` + +
    + +Or via the `-e` option of the `dstack apply` command: + +
    + +```shell +$ dstack apply -e HF_TOKEN=... -f .dstack.yml +``` + +
    + +??? info ".envrc" + A better way to configure environment variables not hardcoded in YAML is by specifying them in a `.envrc` file: + +
    + + ```shell + export HF_TOKEN=... + ``` + +
    + + If you install [`direnv`](https://direnv.net/), + it will automatically apply the environment variables from the `.envrc` file to the `dstack apply` command. + + Remember to add `.envrc` to `.gitignore` to avoid committing it to the repo. + +[//]: # (## Profiles) +[//]: # () +[//]: # (If you don't want to specify the same parameters for each configuration, you can define them once via [profiles](../reference/profiles.yml.md)) +[//]: # (and reuse them across configurations.) +[//]: # () +[//]: # (This can be handy, for example, for configuring parameters such as `max_duration`, `max_price`, `termination_idle_time`,) +[//]: # (`regions`, etc.) +[//]: # () +[//]: # (Set `default` to `true` in your profile, and it will be applied automatically to any run.) + +## Retry policy + +By default, if `dstack` can't find available capacity, the run will fail. + +If you'd like `dstack` to automatically retry, configure the +[retry](../reference/dstack.yml/task.md#retry) property accordingly: + + + +
    + +```yaml +type: task +name: train + +python: 3.12 + +commands: + - uv pip install -r fine-tuning/qlora/requirements.txt + - python fine-tuning/qlora/train.py + +retry: + on_events: [no-capacity] + # Retry for up to 1 hour + duration: 1h +``` + +
    + +## Profiles + +Sometimes, you may want to reuse parameters across runs or set defaults so you don’t have to repeat them in every configuration. You can do this by defining a profile. + +??? info ".dstack/profiles.yml" + A profile file can be created either globally in `~/.dstack/profiles.yml` or locally in `.dstack/profiles.yml`: + + ```yaml + profiles: + - name: my-profile + # If set to true, this profile will be applied automatically + default: true + + # The spot policy can be "spot", "on-demand", or "auto" + spot_policy: auto + # Limit the maximum price of the instance per hour + max_price: 1.5 + # Stop any run if it runs longer than this duration + max_duration: 1d + # Use only these backends + backends: [azure, lambda] + ``` + + Check [`.dstack/profiles.yml`](../reference/profiles.yml.md) to see what properties can be defined there. + +A profile can be set as `default` to apply automatically to any run, or specified with `--profile NAME` in `dstack apply`. + +## Projects + +If you're using multiple `dstack` projects (e.g., from different `dstack` servers), +you can switch between them using the [`dstack project`](../reference/cli/dstack/project.md) command. + +??? info ".envrc" + Alternatively, you can install [`direnv`](https://direnv.net/) + to automatically apply environment variables from the `.envrc` file in your project directory. + +
    + + ```shell + export DSTACK_PROJECT=main + ``` + +
    + + Now, `dstack` will always use this project within this directory. + + Remember to add `.envrc` to `.gitignore` to avoid committing it to the repo. + +## Attached mode + +By default, `dstack apply` runs in attached mode. +This means it streams the logs as they come in and, in the case of a task, forwards its ports to `localhost`. + +To run in detached mode, use `-d` with `dstack apply`. + +> If you detached the CLI, you can always re-attach to a run via [`dstack attach`](../reference/cli/dstack/attach.md). + +## GPU specification + +`dstack` natively supports NVIDIA GPUs, AMD GPUs, TPUs, and Tenstorrent +devices. + +The `gpu` property within [`resources`](../reference/dstack.yml/dev-environment.md#resources) (or the `--gpu` option with [`dstack apply`](../reference/cli/dstack/apply.md) or +[`dstack offer`](../reference/cli/dstack/offer.md)) +allows specifying not only memory size but also GPU vendor, names, their memory, and quantity. + +The general format is: `VENDOR:NAMES:MEMORY:QUANTITY`, where `NAMES` is a comma-separated list of GPU names, and `MEMORY` and `QUANTITY` are single values or ranges. + +Each component is optional. + + + +Ranges can be: + +* **Closed** (e.g. `24GB..80GB` or `1..8`) +* **Open** (e.g. `24GB..` or `1..`) +* **Single values** (e.g. `1` or `24GB`). + +Examples: + +- `1` (any GPU) +- `amd:2` (two AMD GPUs) +- `A100` (A100) +- `24GB..` (any GPU starting from 24GB) +- `24GB..40GB:2` (two GPUs between 24GB and 40GB) +- `A10G,A100` (either A10G or A100) +- `A100:80GB` (one A100 of 80GB) +- `A100:2` (two A100) +- `MI300X:4` (four MI300X) +- `A100:40GB:2` (two A100 40GB) +- `tpu:v2-8` (`v2` TPU with 8 cores) +- `tt:32` (32 Tenstorrent devices) +- `tt-galaxy-wh:32` (32 Galaxy Wormhole chips) +- `tt-galaxy-bh:32` (32 Galaxy Blackhole chips) +- `p150:8` (eight Tenstorrent Blackhole P150 devices) + +The GPU vendor is indicated by one of the following case-insensitive values: + +- `nvidia` (NVIDIA GPUs) +- `amd` (AMD GPUs) +- `tpu` (TPUs) +- `tt` (Tenstorrent devices) + +??? 
info "AMD" + Currently, when an AMD GPU is specified, either by name or by vendor, the `image` property must be specified as well. + +??? info "TPU" + Currently, you can't specify other than 8 TPU cores. This means only single host workloads are supported. + Support for multiple hosts is coming soon. + +## Offers + +If you're not sure which offers (hardware configurations) are available with the configured backends, use the +[`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command. + +
    + +```shell +$ dstack offer --gpu H100 --max-offers 10 +Getting offers... +---> 100% + + # BACKEND REGION INSTANCE TYPE RESOURCES SPOT PRICE + 1 verda FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 2 verda FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 3 verda FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 4 verda ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 5 runpod US-KS-2 NVIDIA H100 PCIe 16xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.39 + 6 runpod CA NVIDIA H100 80GB HBM3 24xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.69 + 7 nebius eu-north1 gpu-h100-sxm 16xCPU, 200GB, 1xH100 (80GB), 100.0GB (disk) no $2.95 + 8 runpod AP-JP-1 NVIDIA H100 80GB HBM3 20xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + 9 runpod CA-MTL-1 NVIDIA H100 80GB HBM3 28xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + 10 runpod CA-MTL-2 NVIDIA H100 80GB HBM3 26xCPU, 125GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + ... + Shown 10 of 99 offers, $127.816 max +``` + +
    + +By default, `dstack offer` ignores fleet configurations and shows all available offers that match the request. +To inspect offers available through a specific fleet, pass `--fleet NAME`. + +??? info "Grouping offers" + Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`. + +
    + + ```shell + dstack offer --gpu b200 --group-by gpu,backend,region + Project main + User admin + Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1.. + Spot policy auto + Max price - + Reservation - + Group by gpu, backend, region + + # GPU SPOT $/GPU BACKEND REGION + 1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1 + 2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2 + 3 B200:180GB:8 on-demand 4.99 lambda us-east-1 + 4 B200:180GB:8 on-demand 5.5 nebius us-central1 + ``` + +
    + + When using `--group-by`, `gpu` must always be included. + The `region` value can only be used together with `backend`. + +The `offer` command allows you to filter and group offers with various [advanced options](../reference/cli/dstack/offer.md#usage). + + +## Metrics + +`dstack` tracks essential metrics accessible via the CLI and UI. To access advanced metrics like DCGM, configure the server to export metrics to Prometheus. See [Metrics](../concepts/metrics.md) for details. + +## Pull progress + +When using a VM-based backend or an SSH instance, you can see the Docker image pull progress in the CLI while the job is in the `pulling` state. + +
    + +```shell +$ dstack apply -y + +Launching orange-yak-1... + NAME BACKEND STATUS SUBMITTED + orange-yak-1 aws (eu-west-3) pulling 1.19/3.44/4.76GB 3 mins ago +``` + +
    + +The three slash-separated indicators represent: + +- Image data already extracted. +- Image data already downloaded. +- Total target image size. + +The exact total size is not always known initially, in which case a lower estimate is displayed with the `≥` sign. All values exclude any image layers already cached on the host prior to the pull. + +## Service quotas + +If you're using your own AWS, GCP, Azure, or OCI accounts, before you can use GPUs or spot instances, you have to request the +corresponding service quotas for each type of instance in each region. + +??? info "AWS" + Check this [guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) on EC2 service quotas. + The relevant service quotas include: + + - `Running On-Demand P instances` (on-demand V100, A100 80GB x8) + - `All P4, P3 and P2 Spot Instance Requests` (spot V100, A100 80GB x8) + - `Running On-Demand G and VT instances` (on-demand T4, A10G, L4) + - `All G and VT Spot Instance Requests` (spot T4, A10G, L4) + - `Running Dedicated p5 Hosts` (on-demand H100) + - `All P5 Spot Instance Requests` (spot H100) + +??? info "GCP" + Check this [guide](https://cloud.google.com/compute/resource-usage) on Compute Engine service quotas. + The relevant service quotas include: + + - `NVIDIA V100 GPUs` (on-demand V100) + - `Preemptible V100 GPUs` (spot V100) + - `NVIDIA T4 GPUs` (on-demand T4) + - `Preemptible T4 GPUs` (spot T4) + - `NVIDIA L4 GPUs` (on-demand L4) + - `Preemptible L4 GPUs` (spot L4) + - `NVIDIA A100 GPUs` (on-demand A100) + - `Preemptible A100 GPUs` (spot A100) + - `NVIDIA A100 80GB GPUs` (on-demand A100 80GB) + - `Preemptible A100 80GB GPUs` (spot A100 80GB) + - `NVIDIA H100 GPUs` (on-demand H100) + - `Preemptible H100 GPUs` (spot H100) + +??? info "Azure" + Check this [guide](https://learn.microsoft.com/en-us/azure/quotas/quickstart-increase-quota-portal) on Azure service quotas. 
+ The relevant service quotas include: + + - `Total Regional Spot vCPUs` (any spot instances) + - `Standard NCASv3_T4 Family vCPUs` (on-demand T4) + - `Standard NVADSA10v5 Family vCPUs` (on-demand A10) + - `Standard NCADS_A100_v4 Family vCPUs` (on-demand A100 80GB) + - `Standard NDASv4_A100 Family vCPUs` (on-demand A100 40GB x8) + - `Standard NDAMSv4_A100Family vCPUs` (on-demand A100 80GB x8) + - `Standard NCadsH100v5 Family vCPUs` (on-demand H100) + - `Standard NDSH100v5 Family vCPUs` (on-demand H100 x8) + +??? info "OCI" + Check this [guide ](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/servicelimits.htm#Requesti) on requesting OCI service limits increase. + The relevant service category is compute. The relevant resources include: + + - `GPUs for GPU.A10 based VM and BM instances` (on-demand A10) + - `GPUs for GPU2 based VM and BM instances` (on-demand P100) + - `GPUs for GPU3 based VM and BM instances` (on-demand V100) + +Note, for AWS, GCP, and Azure, service quota values are measured with the number of CPUs rather than GPUs. + +[//]: # (TODO: Mention spot policy) diff --git a/mkdocs/docs/guides/server-deployment.md b/mkdocs/docs/guides/server-deployment.md new file mode 100644 index 0000000000..f368b62d5a --- /dev/null +++ b/mkdocs/docs/guides/server-deployment.md @@ -0,0 +1,537 @@ +--- +title: Server Deployment +description: Deploying the dstack server +--- + +The `dstack` server can run on your laptop or any environment with access to the cloud and on-prem clusters you plan to use. + +??? info "Hardware requirements" + The minimum hardware requirements for running the server are 1 CPU and 1GB of RAM. The recommended RAM is + "8MB × number of active instances". For example, a server with 1000 active instances should have 8GB of RAM. + You can set the `DSTACK_SERVER_SSH_POOL_DISABLED` env var to minimize RAM usage at the expense of slower processing. 
+ +=== "pip" + > The server can be set up via `pip` on Linux, macOS, and Windows (via WSL 2). It requires Git and OpenSSH. + +
    + + ```shell + $ pip install "dstack[all]" -U + $ dstack server + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +=== "uv" + + > The server can be set up via `uv` on Linux, macOS, and Windows (via WSL 2). It requires Git and OpenSSH. + +
    + + ```shell + $ uv tool install 'dstack[all]' -U + $ dstack server + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +=== "Docker" + > For production deployments, it's recommended to use `dstackai/dstack` Docker image. + +
    + + ```shell + $ docker run -p 3000:3000 \ + -v $HOME/.dstack/server/:/root/.dstack/server \ + dstackai/dstack + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +??? info "AWS CloudFormation" + If you'd like to deploy the server to a private AWS VPC, you can use + our CloudFormation [template](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml). + + First, ensure you've set up a private VPC with public and private subnets. + + ![](https://dstack.ai/static-assets/static-assets/images/dstack-aws-private-vpc-example-v2.png) + + Create a stack using the template, and specify the VPC and private subnets. + Once the stack is created, go to `Outputs` for the server URL and admin token. + + To access the server URL, ensure you're connected to the VPC, e.g. via a VPN client. + + > If you'd like to adjust anything, the source code of the template can be found at + [`examples/server-deployment/cloudformation/template.yaml`](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml). + +## Backend configuration + +To use `dstack` with cloud providers, configure [backends](../concepts/backends.md) +via the `~/.dstack/server/config.yml` file. +The server loads this file on startup. + +Alternatively, you can configure backends on the [project settings page](../concepts/projects.md#backends) via the UI. + +> For using `dstack` with on-prem servers, no backend configuration is required. +> Use [SSH fleets](../concepts/fleets.md#ssh-fleets) instead. + +## State persistence + +The `dstack` server can store its internal state in SQLite or Postgres. +By default, it stores the state locally in `~/.dstack/server` using SQLite. +With SQLite, you can run at most one server replica. +Postgres has no such limitation and is recommended for production deployment. + +??? 
info "Replicate SQLite to cloud storage" + You can configure automatic replication of your SQLite state to a cloud object storage using Litestream. + This allows persisting the server state across re-deployments when using SQLite. + + To enable Litestream replication, set the following environment variable: + + - `LITESTREAM_REPLICA_URL` - The URL of the cloud object storage. + Examples: `s3://BUCKET/PATH`, `gcs://BUCKET/PATH`, `abs://STORAGEACCOUNT@CONTAINER/PATH`, etc. + + You also need to configure cloud storage credentials. + + **AWS S3** + + To persist state into an AWS S3 bucket, provide the following environment variables: + + - `AWS_ACCESS_KEY_ID` - The AWS access key ID + - `AWS_SECRET_ACCESS_KEY` - The AWS secret access key + + **GCP Storage** + + To persist state into a GCP Storage bucket, provide one of the following environment variables: + + - `GOOGLE_APPLICATION_CREDENTIALS` - The path to the GCP service account key JSON file + - `GOOGLE_APPLICATION_CREDENTIALS_JSON` - The GCP service account key JSON + + **Azure Blob Storage** + + To persist state into an Azure Blob Storage container, provide the following environment variable: + + - `LITESTREAM_AZURE_ACCOUNT_KEY` - The Azure storage account key + + More [details](https://litestream.io/guides/) on options for configuring replication. + +### PostgreSQL + +To store the server state in Postgres, set the `DSTACK_DATABASE_URL` environment variable: + +```shell +$ DSTACK_DATABASE_URL=postgresql+asyncpg://user:password@db-host:5432/dstack dstack server +``` + +The minimum requirements for the DB instance are 2 CPUs, 2GB of RAM, and at least 50 `max_connections` per server replica +or a configured connection pooler to handle that many connections. +If you're using a smaller DB instance, you may need to set lower `DSTACK_DB_POOL_SIZE` and `DSTACK_DB_MAX_OVERFLOW`, e.g. +`DSTACK_DB_POOL_SIZE=10` and `DSTACK_DB_MAX_OVERFLOW=0`. + +??? 
info "Migrate from SQLite to PostgreSQL" + You can migrate the existing state from SQLite to PostgreSQL using `pgloader`: + + 1. Create a new PostgreSQL database + 2. Clone the `dstack` repo and [install](https://github.com/dstackai/dstack/blob/master/contributing/DEVELOPMENT.md) `dstack` from source. + Ensure you've checked out the tag that corresponds to your server version (e.g. `git checkout 0.18.10`). + 3. Apply database migrations to the new database: + ```bash + cd src/dstack/_internal/server/ + export DSTACK_DATABASE_URL="postgresql+asyncpg://..." + alembic upgrade head + ``` + 4. Install [pgloader :material-arrow-top-right-thin:{.external }](https://github.com/dimitri/pgloader) + 5. Pass the path to the `~/.dstack/server/data/sqlite.db` file to `SOURCE_PATH` and + set `TARGET_PATH` with the URL of the PostgreSQL database. Example: + ```bash + cd scripts/ + export SOURCE_PATH=sqlite:///Users/me/.dstack/server/data/sqlite.db + export TARGET_PATH=postgresql://postgres:postgres@localhost:5432/postgres + pgloader sqlite_to_psql.load + ``` + The `pgloader` script will migrate the SQLite data to PostgreSQL. It may emit warnings that are safe to ignore. + + If you encounter errors, please [submit an issue](https://github.com/dstackai/dstack/issues/new/choose). + +> For a local setup running PostgreSQL and the [SSH proxy](#ssh-proxy) together, see the example +> [`docker-compose.yml`](https://github.com/dstackai/dstack/blob/master/docker/server/docker-compose.yml). + +## Logs storage + +By default, `dstack` stores workload logs locally in `~/.dstack/server/projects//logs`. +For multi-replica server deployments, it's required to store logs externally. +`dstack` supports storing logs using AWS CloudWatch, GCP Logging, or Fluent-bit with Elasticsearch / Opensearch. 
+ +### AWS CloudWatch + +To store logs in AWS CloudWatch, set the `DSTACK_SERVER_CLOUDWATCH_LOG_GROUP` and +the `DSTACK_SERVER_CLOUDWATCH_LOG_REGION` environment variables. + +The log group must be created beforehand. `dstack` won't try to create it. + +??? info "Required permissions" + + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "DstackLogStorageAllow", + "Effect": "Allow", + "Action": [ + "logs:DescribeLogStreams", + "logs:CreateLogStream", + "logs:GetLogEvents", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:::log-group:", + "arn:aws:logs:::log-group::*" + ] + } + ] + } + ``` + +### GCP Logging + +To store logs using GCP Logging, set the `DSTACK_SERVER_GCP_LOGGING_PROJECT` environment variable. + +??? info "Required permissions" + Ensure you've configured Application Default Credentials with the following permissions: + + ``` + logging.logEntries.create + logging.logEntries.list + ``` + +??? info "Logs management" + `dstack` writes all the logs to the `projects/[PROJECT]/logs/dstack-run-logs` log name. + If you want to set up a custom retention policy for `dstack` logs, create a new bucket and configure a sink: + +
    + + ```shell + $ gcloud logging buckets create dstack-bucket \ + --location=global \ + --description="Bucket for storing dstack run logs" \ + --retention-days=10 + $ gcloud logging sinks create dstack-sink \ + logging.googleapis.com/projects/[PROJECT]/locations/global/buckets/dstack-bucket \ + --log-filter='logName = "projects/[PROJECT]/logs/dstack-run-logs"' + ``` + +
    + +### Fluent-bit + +To store logs using Fluent-bit, set the `DSTACK_SERVER_FLUENTBIT_HOST` environment variable. +Fluent-bit supports two modes depending on how you want to access logs. + +=== "Full mode" + + Logs are shipped to Fluent-bit and can be read back through the `dstack` UI and CLI via Elasticsearch or OpenSearch. + Use this mode when you want a complete integration with log viewing in `dstack`: + + ```shell + $ DSTACK_SERVER_FLUENTBIT_HOST=fluentbit.example.com \ + DSTACK_SERVER_ELASTICSEARCH_HOST=https://elasticsearch.example.com:9200 \ + dstack server + ``` + +=== "Ship-only mode" + + Logs are forwarded to Fluent-bit but cannot be read through `dstack`. + The dstack UI/CLI will show empty logs. Use this mode when: + + - You have an existing logging infrastructure (Kibana, Grafana, Datadog, etc.) + - You only need to forward logs without reading them back through `dstack` + - You want to reduce operational complexity by not running Elasticsearch/OpenSearch + + ```shell + $ DSTACK_SERVER_FLUENTBIT_HOST=fluentbit.example.com \ + dstack server + ``` + +??? info "Additional configuration" + The following optional environment variables can be used to customize the Fluent-bit integration: + + **Fluent-bit settings:** + + - `DSTACK_SERVER_FLUENTBIT_PORT` – The Fluent-bit port. Defaults to `24224`. + - `DSTACK_SERVER_FLUENTBIT_PROTOCOL` – The protocol to use: `forward` or `http`. Defaults to `forward`. + - `DSTACK_SERVER_FLUENTBIT_TAG_PREFIX` – The tag prefix for logs. Defaults to `dstack`. + + **Elasticsearch/OpenSearch settings (for full mode only):** + + - `DSTACK_SERVER_ELASTICSEARCH_HOST` – The Elasticsearch/OpenSearch host for reading logs. If not set, runs in ship-only mode. + - `DSTACK_SERVER_ELASTICSEARCH_INDEX` – The Elasticsearch/OpenSearch index pattern. Defaults to `dstack-logs`. + - `DSTACK_SERVER_ELASTICSEARCH_API_KEY` – The Elasticsearch/OpenSearch API key for authentication. + +??? 
info "Fluent-bit configuration" + Configure Fluent-bit to receive logs and forward them to Elasticsearch or OpenSearch. Example configuration: + + ```ini + [INPUT] + Name forward + Listen 0.0.0.0 + Port 24224 + + [OUTPUT] + Name es + Match dstack.* + Host elasticsearch.example.com + Port 9200 + Index dstack-logs + Suppress_Type_Name On + ``` + +??? info "Required dependencies" + To use Fluent-bit log storage, install the `fluentbit` extras: + + ```shell + $ pip install "dstack[all]" -U + # or + $ pip install "dstack[fluentbit]" -U + ``` + +## File storage + +When using [files](../concepts/dev-environments.md#files) or [repos](../concepts/dev-environments.md#repos), `dstack` uploads local files and diffs to the server so that you can have access to them within runs. By default, the files are stored in the DB and each upload is limited to 2MB. You can configure an object storage to be used for uploads and increase the default limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable + +### S3 + +To use S3 for storing uploaded files, set the `DSTACK_SERVER_S3_BUCKET` and `DSTACK_SERVER_S3_BUCKET_REGION` environment variables. +The bucket must be created beforehand. `dstack` won't try to create it. + +??? info "Required permissions" + + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::", + "arn:aws:s3:::/*" + ] + } + ] + } + ``` + +### GCS + +To use GCS for storing uploaded files, set the `DSTACK_SERVER_GCS_BUCKET` environment variable. +The bucket must be created beforehand. `dstack` won't try to create it. + +??? 
info "Required permissions" + Ensure you've configured Application Default Credentials with the following permissions: + + ``` + storage.buckets.get + storage.buckets.list + storage.objects.get + storage.objects.list + storage.objects.create + storage.objects.delete + storage.objects.update + ``` + +## SSH proxy + +To connect to a run over SSH, `dstack` establishes a connection to the job's container, routed through the job's host and, for [SSH fleets](../concepts/fleets.md#ssh-fleets) with a head node, through that head node. + +[`dstack-sshproxy`](https://github.com/dstackai/sshproxy) is an optional service that you deploy alongside the `dstack` server. When it's enabled, `dstack attach` connects to the proxy instead of to the job's host (and the head node if the SSH fleet has one). + +This lets you: + +- Restrict users to the job's container. Without the proxy, an attached user can SSH into the host, not just the container. +- Reach runs on SSH fleets with a head node without giving users the head node's SSH key. +- Let users connect to runs without `dstack attach`. This requires uploading their public SSH key(s) to the `dstack` server. + + + +### Deployment + +To deploy the SSH proxy, follow its [deployment guide](https://github.com/dstackai/sshproxy/blob/main/DEPLOYMENT.md). Then connect the `dstack` server to it by setting the following environment variables: + +* `DSTACK_SSHPROXY_API_TOKEN` – the token used to authenticate requests to the SSH proxy. It must match the token the SSH proxy is deployed with. +* `DSTACK_SERVER_SSHPROXY_ADDRESS` – the address where users reach the SSH proxy, in the `HOSTNAME[:PORT]` form (`PORT` defaults to 22). + + + +> For a local setup running [PostgreSQL](#postgresql) and the SSH proxy together, see the example +> [`docker-compose.yml`](https://github.com/dstackai/dstack/blob/master/docker/server/docker-compose.yml). 
+ +## Encryption + +By default, `dstack` stores data in plaintext. To enforce encryption, you +specify one or more encryption keys. + +`dstack` currently supports AES and identity (plaintext) encryption keys. +Support for external providers like HashiCorp Vault and AWS KMS is planned. + +=== "AES" + The `aes` encryption key encrypts data using [AES-256](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) in GCM mode. + To configure the `aes` encryption, generate a random 32-byte key: + +
    + + ```shell + $ head -c 32 /dev/urandom | base64 + + opmx+r5xGJNVZeErnR0+n+ElF9ajzde37uggELxL + ``` + +
    + + And specify it as `secret`: + + ```yaml + # ... + + encryption: + keys: + - type: aes + name: key1 + secret: opmx+r5xGJNVZeErnR0+n+ElF9ajzde37uggELxL + ``` + +=== "Identity" + The `identity` encryption performs no encryption and stores data in plaintext. + You can specify an `identity` encryption key explicitly if you want to decrypt the data: + +
    + + ```yaml + # ... + + encryption: + keys: + - type: identity + - type: aes + name: key1 + secret: opmx+r5xGJNVZeErnR0+n+ElF9ajzde37uggELxL + ``` + +
    + + With this configuration, the `aes` key will still be used to decrypt the old data, + but new writes will store the data in plaintext. + +??? info "Key rotation" + If multiple keys are specified, the first is used for encryption, and all are tried for decryption. This enables key + rotation by specifying a new encryption key. + +
    + + ```yaml + # ... + + encryption: + keys: + - type: aes + name: key2 + secret: cR2r1JmkPyL6edBQeHKz6ZBjCfS2oWk87Gc2G3wHVoA= + + - type: aes + name: key1 + secret: E5yzN6V3XvBq/f085ISWFCdgnOGED0kuFaAkASlmmO4= + ``` + +
    + + Old keys may be deleted once all existing records have been updated to re-encrypt sensitive data. + Encrypted values are prefixed with key names, allowing DB admins to identify the keys used for encryption. + +## Default permissions + +By default, all users can create and manage their own projects. You can set `allow_non_admins_create_projects` +to `false` under `default_permissions` so that only global admins can create and manage projects: + +
    + +```yaml +# ... + +default_permissions: + allow_non_admins_create_projects: false +``` + +
    + +## Backward compatibility + +`dstack` follows the `{major}.{minor}.{patch}` versioning scheme. +Backward compatibility is maintained based on these principles: + +* The server backward compatibility is maintained on a best-effort basis across minor and patch releases. Specific features can be removed, but their removal is preceded by deprecation warnings for several minor releases. This means you can use older client versions with newer server versions. +* The client backward compatibility is maintained across patch releases. A new minor release indicates that the release breaks client backward compatibility. This means you don't need to update the server when you update the client to a new patch release. Still, upgrading a client to a new minor version requires upgrading the server too. + +## Server limits + +A single `dstack` server replica can support at least: + +* 1000 active instances +* 1000 active runs +* 1000 active jobs. + +If you hit server performance limits, try scaling up server instances and/or configuring Postgres with multiple server replicas. +Also, please [submit a GitHub issue](https://github.com/dstackai/dstack/issues) describing your setup – we strive to improve `dstack` scalability and efficiency. + +## Server upgrades + +When upgrading the `dstack` server, follow these guidelines to ensure a smooth transition and minimize downtime. + +### Before upgrading + +1. **Check the changelog**: Review the [release notes](https://github.com/dstackai/dstack/releases) for breaking changes, new features, and migration notes. +2. **Review backward compatibility**: Understand the [backward compatibility](#backward-compatibility) policy. +3. **Back up your data**: Ensure you always create a backup before upgrading. + +### Best practices + +- **Test in staging**: Always test upgrades in a non-production environment first. 
+- **Monitor logs**: Watch server logs during and after the upgrade for any errors or warnings. +- **Keep backups**: Retain backups for at least a few days after a successful upgrade. + +### Troubleshooting + +**Deadlock when upgrading a multi-replica PostgreSQL deployment** + +If a deployment is stuck due to a deadlock when applying DB migrations, try scaling server replicas to 1 and retry the deployment multiple times. Some releases may not support rolling deployments, which is always noted in the release notes. If you think there is a bug, please [file an issue](https://github.com/dstackai/dstack/issues). + +## FAQs + +??? info "Can I run multiple replicas of dstack server?" + + Yes, you can if you configure `dstack` to use [PostgreSQL](#postgresql) and an external log storage + such as [AWS CloudWatch](#aws-cloudwatch), [GCP Logging](#gcp-logging), or [Fluent-bit](#fluent-bit). + +??? info "Does dstack server support blue-green or rolling deployments?" + + Yes, it does if you configure `dstack` to use [PostgreSQL](#postgresql) and an external log storage + such as [AWS CloudWatch](#aws-cloudwatch), [GCP Logging](#gcp-logging), or [Fluent-bit](#fluent-bit). diff --git a/mkdocs/docs/guides/tenant-isolation.md b/mkdocs/docs/guides/tenant-isolation.md new file mode 100644 index 0000000000..10d200eb53 --- /dev/null +++ b/mkdocs/docs/guides/tenant-isolation.md @@ -0,0 +1,77 @@ +--- +title: Tenant isolation +description: Restricting access to hosts managed by dstack +--- + +# Tenant isolation + +`dstack` assumes mutual trust between users of the same project. While users' jobs run in Docker containers, users and their containers may have broad access to the underlying hosts. This guide explains how to restrict access to the host when stronger boundaries are required. + +!!! 
info "Isolation guarantees" + The methods described in this guide should be treated as best-effort hardening measures, not as a guarantee of isolation between users, as isolation ultimately depends on the underlying hardware and software. For the strongest isolation, place users in separate `dstack` projects and avoid sharing hardware between them. + + +## Host SSH access + +While attached to a run, users can SSH directly into the host machine — not just the container — using: + +```shell +ssh -host +``` + +This gives unrestricted access to the underlying instance, bypassing container boundaries. + +If desired, host SSH access can be disabled server-wide by configuring the [SSH proxy](server-deployment.md#ssh-proxy) and setting the following environment variable when starting the `dstack` server: + +```shell +DSTACK_SERVER_SSHPROXY_ENFORCED=1 +``` + +With this setting, all users' SSH connections go through the SSH proxy, which only allows connections into the container and not into the host. + +## Privileged mode + +Running a container in privileged mode gives it full access to the host kernel, making container escape straightforward. `dstack` supports requesting privileged mode through several configuration properties: + +| Property | Applies to | +|---|---| +| `privileged: true` | Tasks, dev environments, services | +| `docker: true` | Tasks, dev environments, services | +| `replicas[i].privileged: true` | Services with replica groups | +| `replicas[i].docker: true` | Services with replica groups | + +To block runs that request privileged mode, write a [REST plugin](../reference/plugins/rest/index.md) or a [Python plugin](../reference/plugins/python/index.md) with an apply policy. + +## Instance volumes + +[Instance volumes](../concepts/volumes.md#instance-volumes) mount a path from the host filesystem directly into the container. 
A user with access to this feature can mount arbitrary host paths — including sensitive directories such as `/etc`, `/proc`, or `/var`. + +You can disallow instance volumes or restrict access to certain paths by writing a [REST plugin](../reference/plugins/rest/index.md) or a [Python plugin](../reference/plugins/python/index.md). + +## Host network access + +By default, most `dstack` jobs run in host networking mode. This allows them to listen on any host network interface and communicate with other jobs over the internal network, which facilitates workloads such as [distributed tasks](../concepts/tasks.md#distributed-tasks) or [services with routers](../concepts/services.md#pd-disaggregation). + +However, exposing the host network to the job also exposes internal `dstack` APIs used to manage containers and SSH authorized keys on the host. If this is not acceptable, bridge networking should be used, which isolates the job from the host network. Bridge networking, however, breaks workloads that do need inter-job communication. + +The `DSTACK_SERVER_JOB_NETWORK_MODE` environment variable controls which jobs get host vs. bridge networking: + +| Value | Name | Behavior | +|---|---|---| +| `1` | `HOST_FOR_MULTINODE_ONLY` | Host for distributed tasks, bridge otherwise | +| `2` | `HOST_WHEN_POSSIBLE` | Host whenever the job occupies a full instance (default) | +| `3` | `FORCED_BRIDGE` | Always bridge, including distributed tasks | + +### No distributed tasks + +If you don't need distributed tasks or other runs with inter-job communication, you can set `DSTACK_SERVER_JOB_NETWORK_MODE=3` when starting the server: + +```shell +DSTACK_SERVER_JOB_NETWORK_MODE=3 +``` + +This forces bridge networking for all jobs on the server without exception, preventing access to internal `dstack` APIs, as well as communication between jobs. 
+ +### Allow distributed tasks in selected projects + +If you want distributed tasks or other runs with inter-job communication to be available in some projects but not others, use `DSTACK_SERVER_JOB_NETWORK_MODE=1` instead. With this mode, single-node jobs get bridge networking, while distributed tasks still run with host networking. Distributed tasks can then be selectively blocked per project or user by writing a [REST plugin](../reference/plugins/rest/index.md) or a [Python plugin](../reference/plugins/python/index.md). diff --git a/mkdocs/docs/guides/troubleshooting.md b/mkdocs/docs/guides/troubleshooting.md new file mode 100644 index 0000000000..f3746a54d4 --- /dev/null +++ b/mkdocs/docs/guides/troubleshooting.md @@ -0,0 +1,227 @@ +--- +title: Troubleshooting +description: Common issues and how to resolve them +--- + +# Troubleshooting + +## Reporting issues + +When you encounter a problem, please report it as +a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). + +If you have a question or need help, feel free to ask it in our [Discord server](https://discord.gg/u8SmfwPpMd). + +> When bringing up issues, always include the steps to reproduce. + +### Steps to reproduce + +Make sure to provide clear, detailed steps to reproduce the issue. +Include server logs, CLI outputs, and configuration samples. Avoid using screenshots for logs or errors—use text instead. + +#### Server logs + +To get more detailed server logs, set the `DSTACK_SERVER_LOG_LEVEL` +environment variable to `DEBUG`. By default, it is set to `INFO`. + +#### CLI logs + +CLI logs are located in `~/.dstack/logs/cli`, and the default log level is `DEBUG`. + +> See these examples for well-reported issues: [this](https://github.com/dstackai/dstack/issues/1640) +and [this](https://github.com/dstackai/dstack/issues/1551). 
+ +## Typical issues + +### No fleets { #no-fleets } +[//]: # (NOTE: This section is referenced in the CLI. Do not change its URL.) + +If you run `dstack apply` and see the `No fleets` status, it can mean two things: + +=== "The project has no fleets" + In this case, ensure you've created one before submitting runs. This can be either a [backend fleet](../concepts/fleets.md#backend-fleets) (if you are using cloud or Kubernetes) or an [SSH fleet](../concepts/fleets.md#ssh-fleets) (if you're using on-prem clusters). + + !!! info "Backend fleets" + Note that creating a [backend fleet](../concepts/fleets.md#backend-fleets) doesn't necessarily require provisioning instances upfront. If you set `nodes` to a range, `dstack` will be able to provision instances as required. See [backend fleets](../concepts/fleets.md#backend-fleets) for examples. + +=== "No matching fleet found" + This means fleets exist, but the run requirements do not match the configuration of any fleet. Review your fleets, and ensure that both the run and fleet configurations are correct. + +### No offers { #no-offers } +[//]: # (NOTE: This section is referenced in the CLI. Do not change its URL.) + +If you run `dstack apply` and don't see any instance offers, it means that +`dstack` could not find instances that match the requirements in your configuration. +Below are some of the reasons why this might happen. + +Feel free to use `dstack offer` to inspect available offers: + +```shell +# All matching offers, ignoring fleet configurations +$ dstack offer --gpu H100 + +# Offers available through a specific fleet +$ dstack offer --gpu H100 --fleet my-fleet +``` + +#### Cause 1: No backends + +If you are not using [SSH fleets](../concepts/fleets.md#ssh-fleets), make sure you have configured at least one [backend](../concepts/backends.md). + +If you have configured a backend but still cannot use it, check the output of `dstack server` for backend configuration errors. 
+ +> You can find a list of successfully configured backends on the [project settings page](../concepts/projects.md#backends) in the UI. + +#### Cause 2: Requirements mismatch + +When you apply a configuration, `dstack` tries to find instances that match the +[`resources`](../reference/dstack.yml/task.md#resources), +[`backends`](../reference/dstack.yml/task.md#backends), +[`regions`](../reference/dstack.yml/task.md#regions), +[`availability_zones`](../reference/dstack.yml/task.md#availability_zones), +[`instance_types`](../reference/dstack.yml/task.md#instance_types), +[`spot_policy`](../reference/dstack.yml/task.md#spot_policy), +and [`max_price`](../reference/dstack.yml/task.md#max_price) +properties from the configuration. + +`dstack` will only select instances that meet all the requirements. +Make sure your configuration doesn't set any conflicting requirements, such as +`regions` that don't exist in the specified `backends`, or `instance_types` that +don't match the specified `resources`. + +#### Cause 3: Too specific resources + +If you set a resource requirement to an exact value, `dstack` will only select instances +that have exactly that amount of resources. For example, `cpu: 5` and `memory: 10GB` will only +match instances that have exactly 5 CPUs and exactly 10GB of memory. + +Typically, you will want to set resource ranges to match more instances. +For example, `cpu: 4..8` and `memory: 10GB..` will match instances with 4 to 8 CPUs +and at least 10GB of memory. + +#### Cause 4: Default resources + +By default, `dstack` uses these resource requirements: +`cpu: 2..`, `memory: 8GB..`, `disk: 100GB..`. +If you want to use smaller instances, override the `cpu`, `memory`, or `disk` +properties in your configuration. + +#### Cause 5: GPU requirements + +By default, `dstack` only selects instances with no GPUs or a single NVIDIA GPU. +If you want to use non-NVIDIA GPUs or multi-GPU instances, set the `gpu` property +in your configuration. 
+ +Examples: `gpu: amd` (one AMD GPU), `gpu: A10:4..8` (4 to 8 A10 GPUs), +`gpu: tt:32` (32 Tenstorrent devices). + +> If you don't specify the number of GPUs, `dstack` will only select single-GPU instances. + +#### Cause 6: Network volumes + +If your run configuration uses [network volumes](../concepts/volumes.md#network-volumes), +`dstack` will only select instances from the same backend and region as the volumes. +For AWS, the availability zone of the volume and the instance should also match. + +#### Cause 7: Feature support + +Some `dstack` features are not supported by all backends. If your configuration uses +one of these features, `dstack` will only select offers from the backends that support it. + +- [Backend fleets](../concepts/fleets.md#backend-fleets) configurations, + [Instance volumes](../concepts/volumes.md#instance-volumes), + and [Privileged containers](../reference/dstack.yml/dev-environment.md#privileged) + are supported by all backends except `runpod`, `vastai`, and `kubernetes`. +- [Clusters](../concepts/fleets.md#cluster-placement) + and [distributed tasks](../concepts/tasks.md#distributed-tasks) + are only supported by the `aws`, `azure`, `gcp`, `nebius`, `oci`, and `vultr` backends, + as well as SSH fleets. +- [Reservations](../reference/dstack.yml/fleet.md#reservation) + are only supported by the `aws` and `gcp` backends. + +#### Cause 8: dstack Sky balance + +If you are using +[dstack Sky](https://sky.dstack.ai), +you will not see marketplace offers until you top up your balance. +Alternatively, you can configure your own cloud accounts +on the [project settings page](../concepts/projects.md#backends) +or use [SSH fleets](../concepts/fleets.md#ssh-fleets). + +### Provisioning fails + +In certain cases, running `dstack apply` may show instance offers, +but then produce the following output: + +```shell +wet-mangust-1 provisioning completed (failed) +All provisioning attempts failed. 
This is likely due to cloud providers not having enough capacity. Check CLI and server logs for more details. +``` + +#### Cause 1: Insufficient service quotas + +If some runs fail to provision, it may be due to an insufficient service quota. For cloud providers like AWS, GCP, +Azure, and OCI, you often need to request an increased [service quota](protips.md#service-quotas) before you can use +specific instances. + +### Run starts but fails + +There could be several reasons for a run failing after successful provisioning. + +!!! info "Termination reason" + To find out why a run terminated, use `--verbose` (or `-v`) with `dstack ps`. + This will show the run's status and any failure reasons. + +!!! info "Diagnostic logs" + You can get more information on why a run fails with diagnostic logs. + Pass `--diagnose` (or `-d`) to `dstack logs` and you'll see logs of the run executor. + +#### Cause 1: Spot interruption + +If a run fails after provisioning with the termination reason `INTERRUPTED_BY_NO_CAPACITY`, it is likely that the run +was using spot instances and was interrupted. To address this, you can either set the +[`spot_policy`](../reference/dstack.yml/task.md#spot_policy) to `on-demand` or specify the +[`retry`](../reference/dstack.yml/task.md#retry) property. + +[//]: # (#### Other) +[//]: # (TODO: Explain how to get the shim logs) + +### Services fail to start + +#### Cause 1: Gateway misconfiguration + +If all services fail to start with a specific gateway, make sure a +[correct DNS record](../concepts/gateways.md#update-dns-records) +pointing to the gateway's hostname is configured. + +### Service endpoint doesn't work + +#### Cause 1: Bad Authorization + +If the service endpoint returns a 403 error, it is likely because the [`Authorization`](../concepts/services.md#access-the-endpoint) +header with the correct `dstack` token was not provided. 
+ +[//]: # (#### Other) +[//]: # (TODO: Explain how to get the gateway logs) + +### Cannot access dev environment or task ports + +#### Cause 1: Detached from run + +When running a dev environment or task with configured ports, `dstack apply` +automatically forwards remote ports to `localhost` via SSH for easy and secure access. +If you interrupt the command, the port forwarding will be disconnected. To reattach, use `dstack attach <run name>`. + +## 0.20.* { #0_20 } + +### CLI compatibility + +- CLI versions `0.19.*` and earlier remain backward compatible with the `0.20.*` `dstack` server. +- CLI versions `0.20.*` are not compatible with server versions prior to `0.20.*`. + +> Do not upgrade the CLI to `0.20.*` until the server has been upgraded. + +### Fleets + +* Prior to `0.20`, `dstack` automatically provisioned a fleet if one did not exist at run time. + Beginning with `0.20`, `dstack` will only use existing fleets. + +> Create fleets before submitting runs. To enable on-demand instance provisioning, configure `nodes` as a range in the [backend fleet](../concepts/fleets.md#backend-fleets) configuration. + +### Working directory + +- Previously, when `working_dir` was not specified, `dstack` defaulted to `/workflow`. As of `0.20`, `dstack` uses the working directory defined in the Docker image. If the image does not define a working directory, `dstack` falls back to `/`. +- The default image introduced in `0.20` uses `/dstack/run` as its default working directory. + +> To override the directory defined in the Docker image, specify [`working_dir`](../concepts/dev-environments.md#working-directory) explicitly. + +### Repo directory + +- Previously, if no [repo directory](../concepts/dev-environments.md#repos) was specified, `dstack` cloned the repository into `/workflow`. With `0.20`, the working directory becomes the default repo directory. +- In earlier versions, cloning was skipped if the repo directory was non-empty. 
Starting with `0.20`, this results in a `runner error` unless `if_exists` is set to `skip` in the repo configuration. + +> Ensure repo directories are empty, or explicitly set `if_exists` to `skip`. + +### Deprecated feature removal + +The following deprecated commands have been removed in **0.20**: + +- `dstack config` +- `dstack stats` +- `dstack gateway create` + +Use the corresponding replacements: + +- `dstack project` +- `dstack metrics` +- `dstack apply` + +> For more details on the changes, see the [release notes](https://github.com/dstackai/dstack/releases). diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md new file mode 100644 index 0000000000..9bf00e5abe --- /dev/null +++ b/mkdocs/docs/index.md @@ -0,0 +1,46 @@ +--- +title: What is dstack? +description: Introduction to dstack and how it works +--- + +# What is dstack? + +`dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters. + +It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks. + +!!! info "Accelerators" + `dstack` supports `NVIDIA`, `AMD`, `TPU`, and `Tenstorrent` accelerators out of the box. + +## How does it work? + + + +### Set up the server + +> Before using `dstack`, ensure you've [installed](installation.md) the server, or signed up for [dstack Sky](https://sky.dstack.ai). + +### Define configurations + +`dstack` supports the following configurations: + +* [Fleets](concepts/fleets.md) — for managing cloud and on-prem clusters +* [Dev environments](concepts/dev-environments.md) — for interactive development using a desktop IDE +* [Tasks](concepts/tasks.md) — for scheduling jobs, incl. 
distributed ones (or running web apps) +* [Services](concepts/services.md) — for deploying models (or web apps) +* [Volumes](concepts/volumes.md) — for managing network volumes (to persist data) + +Configuration can be defined as YAML files within your repo. + +### Apply configurations + +Apply the configuration either via the `dstack apply` CLI command (or through a programmatic API.) + +`dstack` automatically manages infrastructure provisioning and job scheduling, while also handling auto-scaling, +port-forwarding, ingress, and more. + +!!! info "Where do I start?" + 1. Proceed to [installation](installation.md) + 2. See [quickstart](quickstart.md) + 3. Browse [examples](/examples) + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/docs/installation.md b/mkdocs/docs/installation.md new file mode 100644 index 0000000000..9203f1879b --- /dev/null +++ b/mkdocs/docs/installation.md @@ -0,0 +1,227 @@ +--- +title: Installation +description: How to install the dstack server and CLI +--- + +# Installation + + + +## Launch the server { #server } + +The server can run on your laptop or any environment with access to the cloud and on-prem clusters you plan to use. + +=== "uv" + + > The server can be set up via `uv` on Linux, macOS, and Windows (via WSL 2). + > It requires Git and OpenSSH. + +
    + + ```shell + $ uv tool install "dstack[all]" -U + $ dstack server + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +=== "pip" + + > The server can be set up via `pip` on Linux, macOS, and Windows (via WSL 2). + > It requires Git and OpenSSH. + +
    + + ```shell + $ pip install "dstack[all]" -U + $ dstack server + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +=== "Docker" + +
    + + ```shell + $ docker run -p 3000:3000 \ + -v $HOME/.dstack/server/:/root/.dstack/server \ + dstackai/dstack + + Applying ~/.dstack/server/config.yml... + + The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da" + The server is running at http://127.0.0.1:3000/ + ``` + +
    + +For more details on server deployment options, see the [Server deployment](guides/server-deployment.md) guide. + +!!! info "Configure backends" + + To orchestrate compute across GPU clouds or Kubernetes clusters, you need to configure [backends](concepts/backends.md). + +## Install the CLI { #cli } + +Once the server is up, you can access it via the `dstack` CLI. + +> The CLI can be used on Linux, macOS, and Windows. It requires Git and OpenSSH. + +=== "uv" + +
    + + ```shell + $ uv tool install dstack -U + ``` + +
    + +=== "pip" + +
    + + ```shell + $ pip install dstack -U + ``` + +
    + +??? info "Windows" + To use the CLI on Windows, ensure you've installed Git and OpenSSH via + [Git for Windows](https://git-scm.com/download/win). + + When installing it, ensure you've checked + `Git from the command line and also from 3-rd party software` + (or `Use Git and optional Unix tools from the Command Prompt`), and + `Use bundled OpenSSH`. + +??? info "Shell autocompletion" + + `dstack` supports shell autocompletion for `bash` and `zsh`. + + === "bash" + + First, validate if completion scripts load correctly in your current shell session: + +
    + + ```shell + $ eval "$(dstack completion bash)" + ``` + +
    + + If completions work as expected and you would like them to persist across shell sessions, add the completion script to your shell profile using these commands: + +
    + + ```shell + $ mkdir -p ~/.dstack + $ dstack completion bash > ~/.dstack/completion.sh + $ echo 'source ~/.dstack/completion.sh' >> ~/.bashrc + ``` + +
    + + === "zsh" + + First, validate if completion scripts load correctly in your current shell session: + +
    + + ```shell + $ eval "$(dstack completion zsh)" + ``` + +
    + + If completions work as expected and you would like them to persist across shell sessions, you can install them via Oh My Zsh using these commands: + +
    + + ```shell + $ mkdir -p ~/.oh-my-zsh/completions + $ dstack completion zsh > ~/.oh-my-zsh/completions/_dstack + ``` + +
    + + And if you don't use Oh My Zsh: + +
    + + ```shell + $ mkdir -p ~/.dstack + $ dstack completion zsh > ~/.dstack/completion.sh + $ echo 'source ~/.dstack/completion.sh' >> ~/.zshrc + ``` + +
    + + > If you get an error similar to `2: command not found: compdef`, then add the following line to the beginning of your `~/.zshrc` file: + > `autoload -Uz compinit && compinit`. + +### Configure the project + +When the server is started, by default, it creates the `main` project and the `admin` user. + +To point the CLI to the `dstack` server, configure it +with the server address, user token, and project name: + +
    + +```shell +$ dstack project add \ + --name main \ + --url http://127.0.0.1:3000 \ + --token bbae0f28-d3dd-4820-bf61-8f4bb40815da + +Configuration is updated at ~/.dstack/config.yml +``` + +
    + +This configuration is stored in `~/.dstack/config.yml`. + +Later, you can create additional projects and users. + +### Use CLI or API + +Once the project is configured, you can use the `dstack` CLI or API. + +> See the [CLI & API](guides/cli-api.md) guide for using the CLI and HTTP API. + +## Install agent skills { #skills } + +If you'd like to use `dstack` with AI agents like Claude, Codex, and Cursor, +install [`dstack` skills](https://skills.sh/dstackai/dstack/dstack) to help +them use the CLI and edit configuration files. + +
    + +```shell +$ npx skills add dstackai/dstack +``` + +
    + +We're actively improving Skills and would love your feedback in [GitHub issues](https://github.com/dstackai/dstack/issues). + +!!! info "What's next?" + 1. See [Backends](concepts/backends.md) + 2. Follow [Quickstart](quickstart.md) + 3. Check the [Server deployment](guides/server-deployment.md) guide diff --git a/mkdocs/docs/quickstart.md b/mkdocs/docs/quickstart.md new file mode 100644 index 0000000000..f5de84e3ee --- /dev/null +++ b/mkdocs/docs/quickstart.md @@ -0,0 +1,281 @@ +--- +title: Quickstart +description: Quick guide to creating fleets and submitting runs +--- + +# Quickstart + +!!! info "Prerequisites" + Ensure the server and CLI are [installed](installation.md). To use `dstack` with AI agents, install [skills](installation.md#skills). + +## Create a fleet + +> Before submitting runs, you must create a [fleet](concepts/fleets.md). + +=== "Backend fleet" + If you're using cloud providers or Kubernetes clusters and have configured the corresponding [backends](concepts/backends.md), create a fleet as follows: + +
    + + ```yaml + type: fleet + name: default + + # Allow provisioning of up to 2 instances + nodes: 0..2 + + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h + + resources: + # Allow provisioning of up to 8 GPUs + gpu: 0..8 + ``` + +
    + + Pass the fleet configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f fleet.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 gcp us-west4 2xCPU, 8GB, 100GB (disk) yes $0.010052 + 2 azure westeurope 2xCPU, 8GB, 100GB (disk) yes $0.0132 + 3 gcp europe-central2 2xCPU, 8GB, 100GB (disk) yes $0.013248 + + Create the fleet? [y/n]: y + + FLEET INSTANCE BACKEND RESOURCES PRICE STATUS CREATED + default - - - - - 10:36 + ``` + +
    + + If the `nodes` range starts with `0`, `dstack apply` creates only a template. Instances are provisioned only when you submit runs. + + If the fleet needs to be a cluster, the [placement](concepts/fleets.md#cluster-placement) property must be set to `cluster`. + +=== "SSH fleet" + If you have a group of on-prem servers accessible via SSH, you can create an SSH fleet as follows: + +
    + + ```yaml + type: fleet + name: my-fleet + + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 + ``` + +
    + + Pass the fleet configuration to `dstack apply`: + +
    + + ```shell + $ dstack apply -f fleet.dstack.yml + + Provisioning... + ---> 100% + + FLEET INSTANCE GPU PRICE STATUS CREATED + my-fleet 0 L4:24GB (spot) $0 idle 3 mins ago + 1 L4:24GB (spot) $0 idle 3 mins ago + ``` + +
    + + > Hosts must have Docker and GPU drivers installed and meet the other [requirements](concepts/fleets.md#ssh-fleets). + + If the fleet needs to be a cluster, the [placement](concepts/fleets.md#ssh-placement) property must be set to `cluster`. + +## Submit your first run + +`dstack` supports three types of run configurations. + +=== "Dev environment" + A [dev environment](concepts/dev-environments.md) lets you provision an instance and access it with your desktop IDE. + + Create the following run configuration: + +
    + + ```yaml + type: dev-environment + name: vscode + + # If `image` is not specified, dstack uses its default image + python: "3.11" + #image: dstackai/base:py3.13-0.7-cuda-12.1 + + ide: vscode + + # Uncomment to request resources + #resources: + # gpu: 24GB + ``` + +
    + + Apply the configuration via `dstack apply`: + +
    + + ```shell + $ dstack apply -f .dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 gcp us-west4 2xCPU, 8GB, 100GB (disk) yes $0.010052 + 2 azure westeurope 2xCPU, 8GB, 100GB (disk) yes $0.0132 + 3 gcp europe-central2 2xCPU, 8GB, 100GB (disk) yes $0.013248 + + Submit the run vscode? [y/n]: y + + Launching `vscode`... + ---> 100% + + To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+vscode/workflow + ``` + +
    + + Open the link to access the dev environment using your desktop IDE. Alternatively, you can access it via `ssh <run name>`. + +=== "Task" + A [task](concepts/tasks.md) allows you to schedule a job or run a web app. Tasks can be distributed and can forward ports. + + Create the following run configuration: + +
    + + ```yaml + type: task + name: streamlit + + # If `image` is not specified, dstack uses its default image + python: "3.11" + #image: dstackai/base:py3.13-0.7-cuda-12.1 + + # Commands of the task + commands: + - pip install streamlit + - streamlit hello + # Ports to forward + ports: + - 8501 + + # Uncomment to request resources + #resources: + # gpu: 24GB + ``` + +
    + + By default, tasks run on a single instance. To run a distributed task, specify + [`nodes`](concepts/tasks.md#distributed-tasks), and `dstack` will run it on a cluster. + + Run the configuration via `dstack apply`: + +
    + + ```shell + $ dstack apply -f task.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 gcp us-west4 2xCPU, 8GB, 100GB (disk) yes $0.010052 + 2 azure westeurope 2xCPU, 8GB, 100GB (disk) yes $0.0132 + 3 gcp europe-central2 2xCPU, 8GB, 100GB (disk) yes $0.013248 + + Submit the run streamlit? [y/n]: y + + Provisioning `streamlit`... + ---> 100% + + Welcome to Streamlit. Check out our demo in your browser. + + Local URL: http://localhost:8501 + ``` + +
    + + If you specified `ports`, they will be automatically forwarded to `localhost` for convenient access. + +=== "Service" + A [service](concepts/services.md) allows you to deploy a model or any web app as an endpoint. + + Create the following run configuration: + +
    + + ```yaml + type: service + name: qwen36-service + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --reasoning-parser qwen3 + # Expose the SGLang server port + port: 30000 + + # Specify a name if it's an OpenAI-compatible model + model: Qwen/Qwen3.6-27B + + # Required resources + resources: + gpu: H100 + ``` + +
    + + Run the configuration via `dstack apply`: + +
    + + ```shell + $ dstack apply -f service.dstack.yml + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 nebius eu-north1 gpu-h100-sxm 16xCPU, 250GB, 1xH100 (80GB) no $2.95 + 2 runpod US-CA-2 NVIDIA H100 80GB HBM3 64xCPU, 1004GB, 1xH100 (80GB) no $2.99 + + Submit the run qwen36-service? [y/n]: y + + Provisioning `qwen36-service`... + ---> 100% + + Service is published at: + http://localhost:3000/proxy/services/main/qwen36-service/ + Model Qwen/Qwen3.6-27B is published at: + http://localhost:3000/proxy/models/main/ + ``` + +
    + + > To enable auto-scaling, rate limits, or use a custom domain with HTTPS, set up a [gateway](concepts/gateways.md) before running the service. + +`dstack apply` automatically provisions instances with created fleets and runs the workload according to the configuration. + +## Troubleshooting + +Something not working? See the [troubleshooting](guides/troubleshooting.md) guide. + +!!! info "What's next?" + 1. Read about [backends](concepts/backends.md), [dev environments](concepts/dev-environments.md), [tasks](concepts/tasks.md), [services](concepts/services.md), and [fleets](concepts/fleets.md) + 2. Browse [examples](examples.md) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/mkdocs/docs/reference/api/python/index.md b/mkdocs/docs/reference/api/python/index.md new file mode 100644 index 0000000000..96e39467e3 --- /dev/null +++ b/mkdocs/docs/reference/api/python/index.md @@ -0,0 +1,230 @@ +# Python API + +The Python API enables running tasks, services, and managing runs programmatically. + +## Usage example + +Below is a quick example of submitting a task for running and displaying its logs. + +```python +import sys + +from dstack.api import Task, GPU, Client, Resources + +client = Client.from_config() + +task = Task( + name="my-awesome-run", # If not specified, a random name is assigned + image="ghcr.io/huggingface/text-generation-inference:latest", + env={"MODEL_ID": "TheBloke/Llama-2-13B-chat-GPTQ"}, + commands=[ + "text-generation-launcher --trust-remote-code --quantize gptq", + ], + ports=["80"], + resources=Resources(gpu=GPU(memory="24GB")), +) + +run = client.runs.apply_configuration( + configuration=task, + repo=None, # Specify to mount additional files +) + +run.attach() + +try: + for log in run.logs(): + sys.stdout.buffer.write(log) + sys.stdout.buffer.flush() +except KeyboardInterrupt: + run.stop(abort=True) +finally: + run.detach() +``` + +!!! info "NOTE:" + 1. 
The `configuration` argument in the `apply_configuration` method can be either `dstack.api.Task`, `dstack.api.Service`, or `dstack.api.DevEnvironment`. + 2. When you create `dstack.api.Task`, `dstack.api.Service`, or `dstack.api.DevEnvironment`, you can specify the `image` argument. If `image` isn't specified, the default image will be used. For a private Docker registry, ensure you also pass the `registry_auth` argument. + 3. The `repo` argument in the `apply_configuration` method allows the mounting of a remote repo or a + programmatically created repo. In this case, the `commands` argument can refer to the files within this repo. + 4. The `attach` method waits for the run to start and, for `dstack.api.Task` sets up an SSH tunnel and forwards + configured `ports` to `localhost`. + +## `dstack.api` { #dstack.api data-toc-label="dstack.api" } + +### `dstack.api.Client` { #dstack.api.Client data-toc-label="Client" } + +::: dstack.api.Client + options: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.RunCollection` { #dstack.api.Client.runs data-toc-label="RunCollection" } + +::: dstack.api.RunCollection + options: + show_bases: false + show_symbol_type_heading: true + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.RepoCollection` { #dstack.api.Client.repos data-toc-label="RepoCollection" } + +::: dstack.api.RepoCollection + options: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +[//]: # (### `dstack.api.BackendCollection` { #dstack.api.Client.backends data-toc-label="BackendCollection" }) + +[//]: # (::: dstack.api.BackendCollection) +[//]: # ( options:) +[//]: # ( show_bases: false) +[//]: # ( show_root_heading: false) +[//]: # ( show_root_toc_entry: false) +[//]: # ( heading_level: 4) + +### `dstack.api.Task` { #dstack.api.Task data-toc-label="Task" } + +#SCHEMA# dstack.api.Task + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + 
item_id_mapping: + registry_auth: dstack.api.RegistryAuth + resources: dstack.api.Resources + +### `dstack.api.Service` { #dstack.api.Service data-toc-label="Service" } + +#SCHEMA# dstack.api.Service + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + scaling: dstack.api.Scaling + registry_auth: dstack.api.RegistryAuth + resources: dstack.api.Resources + +### `dstack.api.DevEnvironment` { #dstack.api.DevEnvironment data-toc-label="DevEnvironment" } + +#SCHEMA# dstack.api.DevEnvironment + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + registry_auth: dstack.api.RegistryAuth + resources: dstack.api.Resources + +### `dstack.api.Run` { #dstack.api.Run data-toc-label="Run" } + +::: dstack.api.Run + options: + show_bases: false + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.Resources` { #dstack.api.Resources data-toc-label="Resources" } + +#SCHEMA# dstack.api.Resources + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + cpu: dstack.api.CPU + gpu: dstack.api.GPU + memory: dstack.api.Memory + Range: dstack.api.Range + +### `dstack.api.CPU` { #dstack.api.CPU data-toc-label="CPU" } + +#SCHEMA# dstack.api.CPU + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + Range: dstack.api.Range + +### `dstack.api.GPU` { #dstack.api.GPU data-toc-label="GPU" } + +#SCHEMA# dstack.api.GPU + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + memory: dstack.api.Memory + Range: dstack.api.Range + +### `dstack.api.Disk` { #dstack.api.Disk data-toc-label="Disk" } + +#SCHEMA# dstack.api.Disk + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + item_id_mapping: + memory: dstack.api.Memory + Range: dstack.api.Range + +### 
`dstack.api.RemoteRepo` { #dstack.api.RemoteRepo data-toc-label="RemoteRepo" } + +::: dstack.api.RemoteRepo + options: + show_bases: false + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.VirtualRepo` { #dstack.api.VirtualRepo data-toc-label="VirtualRepo" } + +::: dstack.api.VirtualRepo + options: + show_bases: false + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.RegistryAuth` { #dstack.api.RegistryAuth data-toc-label="RegistryAuth" } + +#SCHEMA# dstack.api.RegistryAuth + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.Scaling` { #dstack.api.Scaling data-toc-label="Scaling" } + +#SCHEMA# dstack.api.Scaling + overrides: + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + +### `dstack.api.BackendType` { #dstack.api.BackendType data-toc-label="BackendType" } + +::: dstack.api.BackendType + options: + show_bases: false + show_root_heading: false + show_root_toc_entry: false + heading_level: 4 + + diff --git a/mkdocs/docs/reference/cli/dstack/apply.md b/mkdocs/docs/reference/cli/dstack/apply.md new file mode 100644 index 0000000000..4cc6215a0e --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/apply.md @@ -0,0 +1,25 @@ +# dstack apply + +This command applies a given configuration. If a resource does not exist, `dstack apply` creates the resource. +If a resource exists, `dstack apply` updates the resource in-place or re-creates the resource if the update is not possible. + +To mount a Git repo to the run's container, `dstack apply` requires that you run `dstack init` first, +or specify a repo to work with via `-P` (or `--repo`), or specify `--no-repo` if you don't need any repo for the run. + +## Usage + +
    + +```shell +$ dstack apply --help +#GENERATE# +``` + +
    + +## User SSH key + +By default, `dstack` uses its own SSH key to attach to runs (`~/.dstack/ssh/id_rsa`). +It is possible to override this key via the `--ssh-identity` argument. + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/attach.md b/mkdocs/docs/reference/cli/dstack/attach.md new file mode 100644 index 0000000000..eae8c7f176 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/attach.md @@ -0,0 +1,21 @@ +# dstack attach + +This command attaches to a given run. It establishes the SSH tunnel, forwards ports, and shows real-time run logs. + +## Usage + +
    + +```shell +$ dstack attach --help +#GENERATE# +``` + +
    + +## User SSH key + +By default, `dstack` uses its own SSH key to attach to runs (`~/.dstack/ssh/id_rsa`). +It is possible to override this key via the `--ssh-identity` argument. + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/delete.md b/mkdocs/docs/reference/cli/dstack/delete.md new file mode 100644 index 0000000000..587c073e74 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/delete.md @@ -0,0 +1,16 @@ +# dstack delete + +This command deletes the resources defined by a given configuration. + +## Usage + +
    + +```shell +$ dstack delete --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/event.md b/mkdocs/docs/reference/cli/dstack/event.md new file mode 100644 index 0000000000..8f90e456c9 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/event.md @@ -0,0 +1,16 @@ +# dstack event + +The `dstack event` command, an alias for `dstack event list`, allows you to view events. + +For more details, see [Events](../../../concepts/events.md). + +## Usage + +
    + +```shell +$ dstack event --help +#GENERATE# +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/export.md b/mkdocs/docs/reference/cli/dstack/export.md new file mode 100644 index 0000000000..6b5a3dcf34 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/export.md @@ -0,0 +1,63 @@ +# dstack export + +The `dstack export` commands manage [exports](../../../concepts/exports.md) of resources to other projects. + +## dstack export list + +The `dstack export list` command lists all exports in the project. + +##### Usage + +
    + +```shell +$ dstack export list --help +#GENERATE# +``` + +
    + +## dstack export create + +The `dstack export create` command creates a new export. + +##### Usage + +
    + +```shell +$ dstack export create --help +#GENERATE# +``` + +
    + +## dstack export update + +The `dstack export update` command updates an existing export. + +##### Usage + +
    + +```shell +$ dstack export update --help +#GENERATE# +``` + +
    + +## dstack export delete + +The `dstack export delete` command deletes the specified export. + +##### Usage + +
    + +```shell +$ dstack export delete --help +#GENERATE# +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/fleet.md b/mkdocs/docs/reference/cli/dstack/fleet.md new file mode 100644 index 0000000000..4354a6327a --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/fleet.md @@ -0,0 +1,32 @@ +# dstack fleet + +Fleets enable efficient provisioning and management of clusters and instances. + +## dstack fleet list + +The `dstack fleet list` command displays fleets and instances. + +
    + +```shell +$ dstack fleet list --help +#GENERATE# +``` + +
    + +## dstack fleet delete + +The `dstack fleet delete` command deletes fleets and instances. +Cloud instances are terminated upon deletion. + +
    + +```shell +$ dstack fleet delete --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/gateway.md b/mkdocs/docs/reference/cli/dstack/gateway.md new file mode 100644 index 0000000000..d98eb5a8ab --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/gateway.md @@ -0,0 +1,35 @@ +# dstack gateway + +A gateway allows publishing services at a custom domain with HTTPS. + +## dstack gateway list + +The `dstack gateway list` command displays the names and addresses of the gateways configured in the project. + +##### Usage + +
    + +```shell +$ dstack gateway list --help +#GENERATE# +``` + +
    + +## dstack gateway delete + +The `dstack gateway delete` command deletes the specified gateway. + +##### Usage + +
    + +```shell +$ dstack gateway delete --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/import.md b/mkdocs/docs/reference/cli/dstack/import.md new file mode 100644 index 0000000000..168b681c9b --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/import.md @@ -0,0 +1,34 @@ +# dstack import + +The `dstack import` commands list resources imported into the project from other projects. +See [Exports](../../../concepts/exports.md) for details. + +## dstack import list + +The `dstack import list` command lists all imports in the project. + +##### Usage + +
    + +```shell +$ dstack import list --help +#GENERATE# +``` + +
    + +## dstack import delete + +The `dstack import delete` command deletes the specified import. This makes the imported resources unavailable in your project, while they still exist in the host project. + +##### Usage + +
    + +```shell +$ dstack import delete --help +#GENERATE# +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/init.md b/mkdocs/docs/reference/cli/dstack/init.md new file mode 100644 index 0000000000..c89be94686 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/init.md @@ -0,0 +1,20 @@ +# dstack init + +If you’re using private Git repos in your runs via [`repos`](../../../concepts/dev-environments.md#repos), +`dstack` will automatically try to use your default Git credentials (from +`~/.ssh/config` or `~/.config/gh/hosts.yml`). + +To provide custom Git credentials, run `dstack init`. + +
    + +```shell +$ dstack init --help +#GENERATE# +``` + +
    + +You can set credentials with `--git-identity` (private SSH key) or `--token` (OAuth token). + +Run `dstack init` in the repo’s directory, or pass the repo path or URL with `--repo` (or `-P`). diff --git a/mkdocs/docs/reference/cli/dstack/login.md b/mkdocs/docs/reference/cli/dstack/login.md new file mode 100644 index 0000000000..d608476e27 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/login.md @@ -0,0 +1,17 @@ +# dstack login + +This command authorizes the CLI using Single Sign-On and automatically configures your projects. +It provides an alternative to `dstack project add`. + +## Usage + +
    + +```shell +$ dstack login --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/logs.md b/mkdocs/docs/reference/cli/dstack/logs.md new file mode 100644 index 0000000000..ca5ef3b12d --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/logs.md @@ -0,0 +1,16 @@ +# dstack logs + +This command shows the output of a given run. + +## Usage + +
    + +```shell +$ dstack logs --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/metrics.md b/mkdocs/docs/reference/cli/dstack/metrics.md new file mode 100644 index 0000000000..826d6a4e0b --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/metrics.md @@ -0,0 +1,16 @@ +# dstack metrics + +This command shows run hardware metrics such as CPU, memory, and GPU utilization. + +## Usage + +
    + +```shell +$ dstack metrics --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/cli/dstack/offer.md b/mkdocs/docs/reference/cli/dstack/offer.md new file mode 100644 index 0000000000..793f13e037 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/offer.md @@ -0,0 +1,193 @@ +# dstack offer + +Displays available offers (hardware configurations) from configured backends or from fleets you’ve already provisioned. Supports filtering and grouping. + +The output shows backend, region, instance type, resources, spot availability, and pricing. + +## Usage + +This command accepts most of the same arguments as [`dstack apply`](apply.md). + +
    + +```shell +$ dstack offer --help +#GENERATE# +``` + +
    + +## Fleet offers + +By default, `dstack offer` ignores fleet configurations and shows all available offers that match the request. + +Use `--fleet` to inspect offers available through specific fleets. With one `--fleet`, +`dstack offer` shows offers available through that fleet. With multiple `--fleet`, it +combines offers available through the selected fleets. + +
    + +```shell +$ dstack offer --gpu H100 --fleet my-fleet +``` + +
    + +The same fleet filtering applies to `--group-by` output, e.g. `--group-by gpu,backend` +or `--group-by gpu,backend,region`. + +## Examples + +### Filtering offers { #list-gpu-offers } + +The `--gpu` flag accepts the same specification format as the `gpu` property in [`dev environment`](../../../concepts/dev-environments.md), [`task`](../../../concepts/tasks.md), +[`service`](../../../concepts/services.md), and [`fleet`](../../../concepts/fleets.md) configurations. + +The general format is: `<vendor>:<comma-separated names>:<memory range>:<quantity range>`. + +Each component is optional. + +Ranges can be: + +* **Closed** (e.g. `24GB..80GB` or `1..8`) +* **Open** (e.g. `24GB..` or `1..`) +* **Single values** (e.g. `1` or `24GB`). + +Examples: + +* `--gpu nvidia` (any NVIDIA GPU) +* `--gpu nvidia:1..8` (from one to eight NVIDIA GPUs) +* `--gpu A10,A100` (single NVIDIA A10 or A100 GPU) +* `--gpu A100:80GB` (single NVIDIA A100 with 80GB VRAM) +* `--gpu 24GB..80GB` (any GPU with 24GB to 80GB VRAM) + + + + +The following example lists offers with one or more H100 GPUs: + +
    + +```shell +$ dstack offer --gpu H100:1.. --max-offers 10 +Getting offers... +---> 100% + + # BACKEND REGION INSTANCE TYPE RESOURCES SPOT PRICE + 1 verda FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 2 verda FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 3 verda FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 4 verda ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 5 runpod US-KS-2 NVIDIA H100 PCIe 16xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.39 + 6 runpod CA NVIDIA H100 80GB HBM3 24xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.69 + 7 nebius eu-north1 gpu-h100-sxm 16xCPU, 200GB, 1xH100 (80GB), 100.0GB (disk) no $2.95 + 8 runpod AP-JP-1 NVIDIA H100 80GB HBM3 20xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + 9 runpod CA-MTL-1 NVIDIA H100 80GB HBM3 28xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + 10 runpod CA-MTL-2 NVIDIA H100 80GB HBM3 26xCPU, 125GB, 1xH100 (80GB), 100.0GB (disk) no $2.99 + ... + Shown 10 of 99 offers, $127.816 max +``` + +
    + +### Grouping offers + +Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`. + +
    + +```shell +dstack offer --gpu b200 --group-by gpu,backend,region + Project main + User admin + Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1.. + Spot policy auto + Max price - + Reservation - + Group by gpu, backend, region + + # GPU SPOT $/GPU BACKEND REGION + 1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1 + 2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2 + 3 B200:180GB:8 on-demand 4.99 lambda us-east-1 + 4 B200:180GB:8 on-demand 5.5 nebius us-central1 +``` + +
    + +When using `--group-by`, `gpu` must always be included. +The `region` value can only be used together with `backend`. + +### JSON format + +Use `--json` to output offers in JSON format. + +
    + +```shell +$ dstack offer --gpu amd --json +{ + "project": "main", + "user": "admin", + "resources": { + "cpu": { + "min": 2, + "max": null + }, + "memory": { + "min": 8.0, + "max": null + }, + "shm_size": null, + "gpu": { + "vendor": "amd", + "name": null, + "count": { + "min": 1, + "max": 1 + }, + "memory": null, + "total_memory": null, + "compute_capability": null + }, + "disk": { + "size": { + "min": 100.0, + "max": null + } + } + }, + "max_price": null, + "spot": null, + "reservation": null, + "offers": [ + { + "backend": "runpod", + "region": "EU-RO-1", + "instance_type": "AMD Instinct MI300X OAM", + "resources": { + "cpus": 24, + "memory_mib": 289792, + "gpus": [ + { + "name": "MI300X", + "memory_mib": 196608, + "vendor": "amd" + } + ], + "spot": false, + "disk": { + "size_mib": 102400 + }, + "description": "24xCPU, 283GB, 1xMI300X (192GB), 100.0GB (disk)" + }, + "spot": false, + "price": 2.49, + "availability": "available" + } + ], + "total_offers": 1 +} +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/project.md b/mkdocs/docs/reference/cli/dstack/project.md new file mode 100644 index 0000000000..6e3e795564 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/project.md @@ -0,0 +1,90 @@ +# dstack project + +Before the CLI can be used, it must be configured with a [project](../../../concepts/projects.md), including a project name, server address, and user token. You can configure multiple projects using the `dstack project` CLI command. The configuration is stored in `~/.dstack/config.yml`. + +> The `dstack server` command automatically creates the default `main` project and adds its configuration to `~/.dstack/config.yml`. + +The `dstack project set-default` command can be used to switch between multiple projects. + +??? info "Environment variable" + As an alternative to `dstack project set-default`, you can set the `DSTACK_PROJECT` environment variable. It overrides the default project set in `~/.dstack/config.yml`. + +
    + + ```shell + $ DSTACK_PROJECT=main + $ dstack apply -f examples/.dstack.yml + ``` + +
    + + Also, you can install [`direnv`](https://direnv.net/) + to automatically apply environment variables from the `.envrc` file in your project directory. + +
    + + ```shell + export DSTACK_PROJECT=main + ``` + +
    + + Now, `dstack` will always use this project within this directory. + + Remember to add `.envrc` to `.gitignore` to avoid committing it to the repo. + +## dstack project add + +This command adds a new project configuration. + +
    + +```shell +$ dstack project add --help +#GENERATE# +``` + +
    + +You can find the command on the project’s settings page: + + + +## dstack project list + +This command lists the projects configured on the client. + +
    + +```shell +$ dstack project list --help +#GENERATE# +``` + +
    + +## dstack project set-default + +This command sets the given project as default. + +
    + +```shell +$ dstack project set-default --help +#GENERATE# +``` + +
    + +## dstack project delete + +This command deletes the given project configuration. + +
    + +```shell +$ dstack project delete --help +#GENERATE# +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/ps.md b/mkdocs/docs/reference/cli/dstack/ps.md new file mode 100644 index 0000000000..b8414be1f6 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/ps.md @@ -0,0 +1,16 @@ +# dstack ps + +This command shows the status of runs. + +## Usage + +
    + +```shell +$ dstack ps --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples, incl. `-a`) diff --git a/mkdocs/docs/reference/cli/dstack/secret.md b/mkdocs/docs/reference/cli/dstack/secret.md new file mode 100644 index 0000000000..9044cc37f3 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/secret.md @@ -0,0 +1,61 @@ +# dstack secret + +The `dstack secret` commands allow managing [Secrets](../../../concepts/secrets.md). + +## dstack secret set + +The `dstack secret set` command creates a new secret or updates an existing one. + +##### Usage + +
    + +```shell +$ dstack secret set --help +#GENERATE# +``` + +
    + +## dstack secret list + +The `dstack secret list` command lists all secrets set in a project. +##### Usage + +
    + +```shell +$ dstack secret list --help +#GENERATE# +``` + +
    + +## dstack secret get + +The `dstack secret get` command shows the value of a specified secret. +##### Usage + +
    + +```shell +$ dstack secret get --help +#GENERATE# +``` + +
    + +## dstack secret delete + +The `dstack secret delete` command deletes the specified secret. + +##### Usage + +
    + +```shell +$ dstack secret delete --help +#GENERATE# +``` + +
    diff --git a/mkdocs/docs/reference/cli/dstack/server.md b/mkdocs/docs/reference/cli/dstack/server.md new file mode 100644 index 0000000000..23a4f49190 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/server.md @@ -0,0 +1,16 @@ +# dstack server + +This command starts the `dstack` server. + +## Usage + +
    + +```shell +$ dstack server --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples; mention Docker; reference the deployment guide) diff --git a/mkdocs/docs/reference/cli/dstack/stop.md b/mkdocs/docs/reference/cli/dstack/stop.md new file mode 100644 index 0000000000..27dbed20c3 --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/stop.md @@ -0,0 +1,16 @@ +# dstack stop + +This command stops run(s). + +## Usage + +
    + +```shell +$ dstack stop --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples, incl. `-x`) diff --git a/mkdocs/docs/reference/cli/dstack/volume.md b/mkdocs/docs/reference/cli/dstack/volume.md new file mode 100644 index 0000000000..044c2aecef --- /dev/null +++ b/mkdocs/docs/reference/cli/dstack/volume.md @@ -0,0 +1,35 @@ +# dstack volume + +The `dstack volume` commands allow managing [volumes](../../../concepts/volumes.md). + +## dstack volume list + +The `dstack volume list` command lists volumes. + +##### Usage + +
    + +```shell +$ dstack volume list --help +#GENERATE# +``` + +
    + +## dstack volume delete + +The `dstack volume delete` command deletes volumes. + +##### Usage + +
    + +```shell +$ dstack volume delete --help +#GENERATE# +``` + +
    + +[//]: # (TODO: Provide examples) diff --git a/mkdocs/docs/reference/dstack.yml.md b/mkdocs/docs/reference/dstack.yml.md new file mode 100644 index 0000000000..6cd7b74538 --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml.md @@ -0,0 +1,5 @@ +# .dstack.yml + +- [`dev-environment`](dstack.yml/dev-environment.md) +- [`task`](dstack.yml/task.md) +- [`service`](dstack.yml/service.md) diff --git a/mkdocs/docs/reference/dstack.yml/dev-environment.md b/mkdocs/docs/reference/dstack.yml/dev-environment.md new file mode 100644 index 0000000000..b2384d679a --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/dev-environment.md @@ -0,0 +1,197 @@ +# `dev-environment` + +The `dev-environment` configuration type allows running [dev environments](../../concepts/dev-environments.md). + +## Root reference + +#SCHEMA# dstack._internal.core.models.configurations.DevEnvironmentConfiguration + overrides: + show_root_heading: false + type: + required: true + +### `retry` + +#SCHEMA# dstack._internal.core.models.profiles.ProfileRetry + overrides: + show_root_heading: false + type: + required: true + +### `utilization_policy` + +#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy + overrides: + show_root_heading: false + type: + required: true + +### `schedule` + +#SCHEMA# dstack._internal.core.models.profiles.Schedule + overrides: + show_root_heading: false + type: + required: true + +### `instances[n]` { #_instances data-toc-label="instances" } + +When `instances` is set, the run is placed only on matching existing fleet instances. 
+ +=== "By name" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceNameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By hostname" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceHostnameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By fleet and instance number" + + #SCHEMA# dstack._internal.core.models.profiles.FleetInstanceSelector + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for instances is an instance name string. + + * `my-fleet-1`, same as `{name: my-fleet-1}` + +### `resources` + +#SCHEMA# dstack._internal.core.models.resources.ResourcesSpec + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: resources- + +#### `resources.cpu` { #resources-cpu data-toc-label="cpu" } + +#SCHEMA# dstack._internal.core.models.resources.CPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.gpu` { #resources-gpu data-toc-label="gpu" } + +#SCHEMA# dstack._internal.core.models.resources.GPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.disk` { #resources-disk data-toc-label="disk" } + +#SCHEMA# dstack._internal.core.models.resources.DiskSpec + overrides: + show_root_heading: false + type: + required: true + +### `registry_auth` + +#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth + overrides: + show_root_heading: false + type: + required: true + +### `volumes[n]` { #_volumes data-toc-label="volumes" } + +=== "Network volumes" + + #SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint + overrides: + show_root_heading: false + type: + required: true + +=== "Instance volumes" + + #SCHEMA# dstack._internal.core.models.volumes.InstanceMountPoint + overrides: + show_root_heading: false + type: + required: true + +??? 
info "Short syntax" + + The short syntax for volumes is a colon-separated string in the form of `source:destination` + + * `volume-name:/container/path` for network volumes + * `/instance/path:/container/path` for instance volumes + +### `repos[n]` { #_repos data-toc-label="repos" } + +> Currently, a maximum of one repo is supported. + +> Either `local_path` or `url` must be specified. + +#SCHEMA# dstack._internal.core.models.configurations.RepoSpec + overrides: + show_root_heading: false + type: + required: true + +??? info "`if_exists` action" + + If the `path` already exists and is a non-empty directory, by default the run is terminated with an error. + This can be changed with the `if_exists` option: + + * `error` – do not try to check out, terminate the run with an error (the default action since `0.20.0`) + * `skip` – do not try to check out, skip the repo (the only action available before `0.20.0`) + + Note, if the `path` exists and is _not_ a directory (e.g., a regular file), this is always an error that + cannot be ignored with the `skip` action. + +??? info "Short syntax" + + The short syntax for repos is a colon-separated string in the form of `local_path_or_url:path`. + + * `.:/repo` + * `..:repo` + * `~/repos/demo:~/repo` + * `https://github.com/org/repo:~/data/repo` + * `git@github.com:org/repo.git:data/repo` + +### `files[n]` { #_files data-toc-label="files" } + +#SCHEMA# dstack._internal.core.models.files.FilePathMapping + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for files is a colon-separated string in the form of `local_path[:path]` where + `path` is optional and can be omitted if it's equal to `local_path`. 
+ + * `~/.bashrc`, same as `~/.bashrc:~/.bashrc` + * `/opt/myorg`, same as `/opt/myorg/` and `/opt/myorg:/opt/myorg` + * `libs/patched_libibverbs.so.1:/lib/x86_64-linux-gnu/libibverbs.so.1` + +### `backend_options` + +Backend-specific options that only take effect for offers of the respective backend. + +#### `backend_options[n][type=vastai]` { #backend_options-vastai data-toc-label="vastai" } + +#SCHEMA# dstack._internal.core.backends.vastai.profile_options.VastAIProfileOptions + overrides: + show_root_heading: false + type: + required: true diff --git a/mkdocs/docs/reference/dstack.yml/fleet.md b/mkdocs/docs/reference/dstack.yml/fleet.md new file mode 100644 index 0000000000..99b985db44 --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/fleet.md @@ -0,0 +1,106 @@ +# `fleet` + +The `fleet` configuration type allows creating and updating fleets. + + +=== "Backend fleet" + + ## Root reference + + #SCHEMA# dstack._internal.core.models.fleets.BackendFleetConfiguration + overrides: + show_root_heading: false + type: + required: true + nodes: + required: true + + ### `resources` + + #SCHEMA# dstack._internal.core.models.resources.ResourcesSpec + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: resources- + + #### `resources.cpu` { #resources-cpu data-toc-label="cpu" } + + #SCHEMA# dstack._internal.core.models.resources.CPUSpec + overrides: + show_root_heading: false + type: + required: true + + #### `resources.gpu` { #resources-gpu data-toc-label="gpu" } + + #SCHEMA# dstack._internal.core.models.resources.GPUSpec + overrides: + show_root_heading: false + type: + required: true + + #### `resources.disk` { #resources-disk data-toc-label="disk" } + + #SCHEMA# dstack._internal.core.models.resources.DiskSpec + overrides: + show_root_heading: false + type: + required: true + + ### `retry` + + #SCHEMA# dstack._internal.core.models.profiles.ProfileRetry + overrides: + show_root_heading: false + + ### `backend_options` + + Backend-specific 
options that only take effect for offers of the respective backend. + + #### `backend_options[n][type=vastai]` { #backend_options-vastai data-toc-label="vastai" } + + #SCHEMA# dstack._internal.core.backends.vastai.profile_options.VastAIProfileOptions + overrides: + show_root_heading: false + type: + required: true + +=== "SSH fleet" + + ## Root reference + + #SCHEMA# dstack._internal.core.models.fleets.SSHFleetConfiguration + overrides: + show_root_heading: false + type: + required: true + ssh_config: + required: true + + ### `ssh_config` { data-toc-label="ssh_config" } + + #SCHEMA# dstack._internal.core.models.fleets.SSHParams + overrides: + show_root_heading: false + item_id_prefix: ssh_config- + + #### `ssh_config.proxy_jump` { #ssh_config-proxy_jump data-toc-label="proxy_jump" } + + #SCHEMA# dstack._internal.core.models.fleets.SSHProxyParams + overrides: + show_root_heading: false + item_id_prefix: proxy_jump- + + #### `ssh_config.hosts[n]` { #ssh_config-hosts data-toc-label="hosts" } + + #SCHEMA# dstack._internal.core.models.fleets.SSHHostParams + overrides: + show_root_heading: false + + ##### `ssh_config.hosts[n].proxy_jump` { #proxy_jump data-toc-label="hosts[n].proxy_jump" } + + #SCHEMA# dstack._internal.core.models.fleets.SSHProxyParams + overrides: + show_root_heading: false + item_id_prefix: hosts-proxy_jump- diff --git a/mkdocs/docs/reference/dstack.yml/gateway.md b/mkdocs/docs/reference/dstack.yml/gateway.md new file mode 100644 index 0000000000..33fbeb4190 --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/gateway.md @@ -0,0 +1,41 @@ +# `gateway` + +The `gateway` configuration type allows creating and updating [gateways](../../concepts/gateways.md). 
+ +## Root reference + +#SCHEMA# dstack._internal.core.models.gateways.GatewayConfiguration + overrides: + show_root_heading: false + type: + required: true + +### `router` + +=== "SGLang Model Gateway" + + #SCHEMA# dstack._internal.core.models.routers.SGLangGatewayRouterConfig + overrides: + show_root_heading: false + type: + required: true + +### `certificate` + +Set to `null` to disable certificates (e.g. for [private gateways](../../concepts/gateways.md#public-ip)). + +=== "Let's Encrypt" + + #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate + overrides: + show_root_heading: false + type: + required: true + +=== "ACM" + + #SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate + overrides: + show_root_heading: false + type: + required: true diff --git a/mkdocs/docs/reference/dstack.yml/service.md b/mkdocs/docs/reference/dstack.yml/service.md new file mode 100644 index 0000000000..5f3aa3bd16 --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/service.md @@ -0,0 +1,277 @@ +# `service` + +The `service` configuration type allows running [services](../../concepts/services.md). + +## Root reference + +#SCHEMA# dstack._internal.core.models.configurations.ServiceConfiguration + overrides: + show_root_heading: false + type: + required: true + +### `model` { data-toc-label="model" } + +=== "OpenAI" + + #SCHEMA# dstack.api.OpenAIChatModel + overrides: + show_root_heading: false + type: + required: true + + +### `scaling` + +#SCHEMA# dstack._internal.core.models.configurations.ScalingSpec + overrides: + show_root_heading: false + type: + required: true + +### `rate_limits` + +#### `rate_limits[n]` + +#SCHEMA# dstack._internal.core.models.configurations.RateLimit + overrides: + show_root_heading: false + type: + required: true + +##### `rate_limits[n].key` { data-toc-label="key" } + +=== "IP address" + + Partition requests by client IP address. 
+ + #SCHEMA# dstack._internal.core.models.configurations.IPAddressPartitioningKey + overrides: + show_root_heading: false + type: + required: true + +=== "Header" + + Partition requests by the value of a header. + + #SCHEMA# dstack._internal.core.models.configurations.HeaderPartitioningKey + overrides: + show_root_heading: false + type: + required: true + +### `probes` + +#### `probes[n]` + +#SCHEMA# dstack._internal.core.models.configurations.ProbeConfig + overrides: + show_root_heading: false + type: + required: true + +##### `probes[n].headers` + +###### `probes[n].headers[m]` + +#SCHEMA# dstack._internal.core.models.configurations.HTTPHeaderSpec + overrides: + show_root_heading: false + type: + required: true + + +### `replicas` + +#### `replicas[n]` + +#SCHEMA# dstack._internal.core.models.configurations.ReplicaGroup + overrides: + show_root_heading: false + type: + required: true + +### `retry` + +#SCHEMA# dstack._internal.core.models.profiles.ProfileRetry + overrides: + show_root_heading: false + +### `utilization_policy` + +#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy + overrides: + show_root_heading: false + type: + required: true + +### `schedule` + +#SCHEMA# dstack._internal.core.models.profiles.Schedule + overrides: + show_root_heading: false + type: + required: true + +### `instances[n]` { #_instances data-toc-label="instances" } + +When `instances` is set, the run is placed only on matching existing fleet instances. + +=== "By name" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceNameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By hostname" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceHostnameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By fleet and instance number" + + #SCHEMA# dstack._internal.core.models.profiles.FleetInstanceSelector + overrides: + show_root_heading: false + type: + required: true + +??? 
info "Short syntax" + + The short syntax for instances is an instance name string. + + * `my-fleet-1`, same as `{name: my-fleet-1}` + +### `resources` + +#SCHEMA# dstack._internal.core.models.resources.ResourcesSpec + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: resources- + +#### `resources.cpu` { #resources-cpu data-toc-label="cpu" } + +#SCHEMA# dstack._internal.core.models.resources.CPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.gpu` { #resources-gpu data-toc-label="gpu" } + +#SCHEMA# dstack._internal.core.models.resources.GPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.disk` { #resources-disk data-toc-label="disk" } + +#SCHEMA# dstack._internal.core.models.resources.DiskSpec + overrides: + show_root_heading: false + type: + required: true + +### `registry_auth` + +#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth + overrides: + show_root_heading: false + type: + required: true + +### `volumes[n]` { #_volumes data-toc-label="volumes" } + +=== "Network volumes" + + #SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint + overrides: + show_root_heading: false + type: + required: true + +=== "Instance volumes" + + #SCHEMA# dstack._internal.core.models.volumes.InstanceMountPoint + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for volumes is a colon-separated string in the form of `source:destination` + + * `volume-name:/container/path` for network volumes + * `/instance/path:/container/path` for instance volumes + +### `repos[n]` { #_repos data-toc-label="repos" } + +> Currently, a maximum of one repo is supported. + +> Either `local_path` or `url` must be specified. + +#SCHEMA# dstack._internal.core.models.configurations.RepoSpec + overrides: + show_root_heading: false + type: + required: true + +??? 
info "`if_exists` action" + + If the `path` already exists and is a non-empty directory, by default the run is terminated with an error. + This can be changed with the `if_exists` option: + + * `error` – do not try to check out, terminate the run with an error (the default action since `0.20.0`) + * `skip` – do not try to check out, skip the repo (the only action available before `0.20.0`) + + Note, if the `path` exists and is _not_ a directory (e.g., a regular file), this is always an error that + cannot be ignored with the `skip` action. + +??? info "Short syntax" + + The short syntax for repos is a colon-separated string in the form of `local_path_or_url:path`. + + * `.:/repo` + * `..:repo` + * `~/repos/demo:~/repo` + * `https://github.com/org/repo:~/data/repo` + * `git@github.com:org/repo.git:data/repo` + +### `files[n]` { #_files data-toc-label="files" } + +#SCHEMA# dstack._internal.core.models.files.FilePathMapping + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for files is a colon-separated string in the form of `local_path[:path]` where + `path` is optional and can be omitted if it's equal to `local_path`. + + * `~/.bashrc`, same as `~/.bashrc:~/.bashrc` + * `/opt/myorg`, same as `/opt/myorg/` and `/opt/myorg:/opt/myorg` + * `libs/patched_libibverbs.so.1:/lib/x86_64-linux-gnu/libibverbs.so.1` + +### `backend_options` + +Backend-specific options that only take effect for offers of the respective backend. 
+ +#### `backend_options[n][type=vastai]` { #backend_options-vastai data-toc-label="vastai" } + +#SCHEMA# dstack._internal.core.backends.vastai.profile_options.VastAIProfileOptions + overrides: + show_root_heading: false + type: + required: true diff --git a/mkdocs/docs/reference/dstack.yml/task.md b/mkdocs/docs/reference/dstack.yml/task.md new file mode 100644 index 0000000000..104333c1bf --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/task.md @@ -0,0 +1,197 @@ +# `task` + +The `task` configuration type allows running [tasks](../../concepts/tasks.md). + +## Root reference + +#SCHEMA# dstack._internal.core.models.configurations.TaskConfiguration + overrides: + show_root_heading: false + type: + required: true + +### `retry` + +#SCHEMA# dstack._internal.core.models.profiles.ProfileRetry + overrides: + show_root_heading: false + type: + required: true + +### `utilization_policy` + +#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy + overrides: + show_root_heading: false + type: + required: true + +### `schedule` + +#SCHEMA# dstack._internal.core.models.profiles.Schedule + overrides: + show_root_heading: false + type: + required: true + +### `instances[n]` { #_instances data-toc-label="instances" } + +When `instances` is set, the run is placed only on matching existing fleet instances. + +=== "By name" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceNameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By hostname" + + #SCHEMA# dstack._internal.core.models.profiles.InstanceHostnameSelector + overrides: + show_root_heading: false + type: + required: true + +=== "By fleet and instance number" + + #SCHEMA# dstack._internal.core.models.profiles.FleetInstanceSelector + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for instances is an instance name string. 
+ + * `my-fleet-1`, same as `{name: my-fleet-1}` + +### `resources` + +#SCHEMA# dstack._internal.core.models.resources.ResourcesSpec + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: resources- + +#### `resources.cpu` { #resources-cpu data-toc-label="cpu" } + +#SCHEMA# dstack._internal.core.models.resources.CPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.gpu` { #resources-gpu data-toc-label="gpu" } + +#SCHEMA# dstack._internal.core.models.resources.GPUSpec + overrides: + show_root_heading: false + type: + required: true + +#### `resources.disk` { #resources-disk data-toc-label="disk" } + +#SCHEMA# dstack._internal.core.models.resources.DiskSpec + overrides: + show_root_heading: false + type: + required: true + +### `registry_auth` + +#SCHEMA# dstack._internal.core.models.configurations.RegistryAuth + overrides: + show_root_heading: false + type: + required: true + +### `volumes[n]` { #_volumes data-toc-label="volumes" } + +=== "Network volumes" + + #SCHEMA# dstack._internal.core.models.volumes.VolumeMountPoint + overrides: + show_root_heading: false + type: + required: true + +=== "Instance volumes" + + #SCHEMA# dstack._internal.core.models.volumes.InstanceMountPoint + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for volumes is a colon-separated string in the form of `source:destination` + + * `volume-name:/container/path` for network volumes + * `/instance/path:/container/path` for instance volumes + +### `repos[n]` { #_repos data-toc-label="repos" } + +> Currently, a maximum of one repo is supported. + +> Either `local_path` or `url` must be specified. + +#SCHEMA# dstack._internal.core.models.configurations.RepoSpec + overrides: + show_root_heading: false + type: + required: true + +??? info "`if_exists` action" + + If the `path` already exists and is a non-empty directory, by default the run is terminated with an error. 
+ This can be changed with the `if_exists` option: + + * `error` – do not try to check out, terminate the run with an error (the default action since `0.20.0`) + * `skip` – do not try to check out, skip the repo (the only action available before `0.20.0`) + + Note, if the `path` exists and is _not_ a directory (e.g., a regular file), this is always an error that + cannot be ignored with the `skip` action. + +??? info "Short syntax" + + The short syntax for repos is a colon-separated string in the form of `local_path_or_url:path`. + + * `.:/repo` + * `..:repo` + * `~/repos/demo:~/repo` + * `https://github.com/org/repo:~/data/repo` + * `git@github.com:org/repo.git:data/repo` + +### `files[n]` { #_files data-toc-label="files" } + +#SCHEMA# dstack._internal.core.models.files.FilePathMapping + overrides: + show_root_heading: false + type: + required: true + +??? info "Short syntax" + + The short syntax for files is a colon-separated string in the form of `local_path[:path]` where + `path` is optional and can be omitted if it's equal to `local_path`. + + * `~/.bashrc`, same as `~/.bashrc:~/.bashrc` + * `/opt/myorg`, same as `/opt/myorg/` and `/opt/myorg:/opt/myorg` + * `libs/patched_libibverbs.so.1:/lib/x86_64-linux-gnu/libibverbs.so.1` + +### `backend_options` + +Backend-specific options that only take effect for offers of the respective backend. + +#### `backend_options[n][type=vastai]` { #backend_options-vastai data-toc-label="vastai" } + +#SCHEMA# dstack._internal.core.backends.vastai.profile_options.VastAIProfileOptions + overrides: + show_root_heading: false + type: + required: true diff --git a/mkdocs/docs/reference/dstack.yml/volume.md b/mkdocs/docs/reference/dstack.yml/volume.md new file mode 100644 index 0000000000..d3f851c8c0 --- /dev/null +++ b/mkdocs/docs/reference/dstack.yml/volume.md @@ -0,0 +1,64 @@ +# `volume` + +The `volume` configuration type allows creating, registering, and updating [volumes](../../concepts/volumes.md). 
+ +=== "AWS" + + #SCHEMA# dstack._internal.core.models.volumes.AWSVolumeConfiguration + overrides: + show_root_heading: false + backend: + required: true + +=== "GCP" + + #SCHEMA# dstack._internal.core.models.volumes.GCPVolumeConfiguration + overrides: + show_root_heading: false + backend: + required: true + +=== "Runpod" + + #SCHEMA# dstack._internal.core.models.volumes.RunpodVolumeConfiguration + overrides: + show_root_heading: false + backend: + required: true + +=== "Kubernetes" + + Kubernetes backend volumes are mapped to [`PersistentVolumeClaim`](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) objects. + + To create a new claim, specify `size` and optionally `storage_class_name` and/or `access_modes`: + + ```yaml + type: volume + backend: kubernetes + name: new-volume + size: 100GB + # By default, storage_class_name is not set, and the decision is delegated to + # the DefaultStorageClass admission controller (if it is enabled) + storage_class_name: test-nfs + # access_modes defaults to [ReadWriteOnce]. For multi-attach-capable volumes + # use ReadWriteMany and/or ReadOnlyMany + access_modes: + - ReadWriteMany + ``` + + To reuse an existing claim, specify `claim_name`: + + ```yaml + type: volume + backend: kubernetes + name: existing-volume + claim_name: existing-pvc + ``` + + #SCHEMA# dstack._internal.core.models.volumes.KubernetesVolumeConfiguration + overrides: + show_root_heading: false + backend: + required: true + region: + required: true diff --git a/mkdocs/docs/reference/env.md b/mkdocs/docs/reference/env.md new file mode 100644 index 0000000000..86a7dd051d --- /dev/null +++ b/mkdocs/docs/reference/env.md @@ -0,0 +1,196 @@ +# Environment variables + +## .dstack.yml + +The following read-only environment variables are automatically propagated to configurations for dev environments, +tasks, and services: + +- `DSTACK_RUN_NAME`{ #DSTACK_RUN_NAME } – The name of the run. 
+ + The example below simply prints `vscode` to the output. + + ```yaml + type: task + name: vscode + + commands: + - echo $DSTACK_RUN_NAME + ``` + + If `name` is not set in the configuration, it is assigned a random name (e.g. `wet-mangust-1`). + +- `DSTACK_RUN_ID`{ #DSTACK_RUN_ID } – The UUID of the run. +- `DSTACK_JOB_ID`{ #DSTACK_JOB_ID } – The UUID of the job submission. +- `DSTACK_REPO_ID`{ #DSTACK_REPO_ID } – The ID of the repo. +- `DSTACK_GPUS_NUM`{ #DSTACK_GPUS_NUM } – The total number of GPUs in the run. + + Example: + + ```yaml + type: service + name: llama31 + + env: + - HF_TOKEN + commands: + - pip install vllm + - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct + --max-model-len 4096 + --tensor-parallel-size $DSTACK_GPUS_NUM + port: 8000 + model: meta-llama/Meta-Llama-3.1-8B-Instruct + + resources: + gpu: 24GB + ``` + +- `DSTACK_NODES_NUM`{ #DSTACK_NODES_NUM } – The number of nodes in the run. +- `DSTACK_GPUS_PER_NODE`{ #DSTACK_GPUS_PER_NODE } – The number of GPUs per node. +- `DSTACK_NODE_RANK`{ #DSTACK_NODE_RANK } – The rank of the node. +- `DSTACK_MASTER_NODE_IP`{ #DSTACK_MASTER_NODE_IP } – The internal IP address of the master node.
+ + Below is an example of using `DSTACK_NODES_NUM`, `DSTACK_GPUS_PER_NODE`, `DSTACK_NODE_RANK`, and `DSTACK_MASTER_NODE_IP` + for distributed training: + + ```yaml + type: task + name: train-distrib + + # The size of the cluster + nodes: 2 + + python: 3.12 + env: + - NCCL_DEBUG=INFO + commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - uv pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + multinode.py 50 10 + + resources: + gpu: 24GB:1..2 + # Uncomment if using multiple GPUs + #shm_size: 24GB + ``` + +- `DSTACK_NODES_IPS`{ #DSTACK_NODES_IPS } – The list of internal IP addresses of all nodes delimited by `"\n"`. +- `DSTACK_MPI_HOSTFILE`{ #DSTACK_MPI_HOSTFILE } – The path to a pre-populated MPI hostfile that can be used directly as `mpirun --hostfile $DSTACK_MPI_HOSTFILE`. + +## Server + +The following environment variables are supported by the `dstack` server and can be specified whether the server is run +via `dstack server` or deployed using Docker. + +For more details on the options below, refer to the [server deployment](../guides/server-deployment.md) guide. + +- `DSTACK_SERVER_LOG_LEVEL`{ #DSTACK_SERVER_LOG_LEVEL } – Has the same effect as `--log-level`. Defaults to `INFO`. + + Example: + +
    + + ```shell + $ DSTACK_SERVER_LOG_LEVEL=debug dstack server + ``` + +
    + +- `DSTACK_SERVER_LOG_FORMAT`{ #DSTACK_SERVER_LOG_FORMAT } – Sets format of log output. Can be `rich`, `standard`, `json`. Defaults to `rich`. +- `DSTACK_SERVER_HOST`{ #DSTACK_SERVER_HOST } – Has the same effect as `--host`. Defaults to `127.0.0.1`. +- `DSTACK_SERVER_PORT`{ #DSTACK_SERVER_PORT } – Has the same effect as `--port`. Defaults to `3000`. +- `DSTACK_SERVER_URL`{ #DSTACK_SERVER_URL } – The URL that the server is running on, e.g. `https://my-server.dstack.ai`. Defaults to `http://{DSTACK_SERVER_HOST}:{DSTACK_SERVER_PORT}`. +- `DSTACK_SERVER_ADMIN_TOKEN`{ #DSTACK_SERVER_ADMIN_TOKEN } – Has the same effect as `--token`. Defaults to `None`. +- `DSTACK_SERVER_DIR`{ #DSTACK_SERVER_DIR } – Sets path to store data and server configs. Defaults to `~/.dstack/server`. +- `DSTACK_DATABASE_URL`{ #DSTACK_DATABASE_URL } – The database URL to use instead of default SQLite. Currently `dstack` supports Postgres. Example: `postgresql+asyncpg://myuser:mypassword@localhost:5432/mydatabase`. Defaults to `None`. +- `DSTACK_SERVER_CLOUDWATCH_LOG_GROUP`{ #DSTACK_SERVER_CLOUDWATCH_LOG_GROUP } – The CloudWatch Logs group for storing workloads logs. If not set, the default file-based log storage is used. +- `DSTACK_SERVER_CLOUDWATCH_LOG_REGION`{ #DSTACK_SERVER_CLOUDWATCH_LOG_REGION } – The CloudWatch Logs region. Defaults to `None`. +- `DSTACK_SERVER_GCP_LOGGING_PROJECT`{ #DSTACK_SERVER_GCP_LOGGING_PROJECT } – The GCP Logging project for storing workloads logs. If not set, the default file-based log storage is used. +- `DSTACK_SERVER_FLUENTBIT_HOST`{ #DSTACK_SERVER_FLUENTBIT_HOST } – The Fluent-bit host for log forwarding. If set, enables Fluent-bit log storage. +- `DSTACK_SERVER_FLUENTBIT_PORT`{ #DSTACK_SERVER_FLUENTBIT_PORT } – The Fluent-bit port. Defaults to `24224`. +- `DSTACK_SERVER_FLUENTBIT_PROTOCOL`{ #DSTACK_SERVER_FLUENTBIT_PROTOCOL } – The protocol to use: `forward` or `http`. Defaults to `forward`.
+- `DSTACK_SERVER_FLUENTBIT_TAG_PREFIX`{ #DSTACK_SERVER_FLUENTBIT_TAG_PREFIX } – The tag prefix for logs. Defaults to `dstack`. +- `DSTACK_SERVER_ELASTICSEARCH_HOST`{ #DSTACK_SERVER_ELASTICSEARCH_HOST } – The Elasticsearch/OpenSearch host for reading logs back through dstack. Optional; if not set, Fluent-bit runs in ship-only mode (logs are forwarded but not readable through dstack UI/CLI). +- `DSTACK_SERVER_ELASTICSEARCH_INDEX`{ #DSTACK_SERVER_ELASTICSEARCH_INDEX } – The Elasticsearch/OpenSearch index pattern. Defaults to `dstack-logs`. +- `DSTACK_SERVER_ELASTICSEARCH_API_KEY`{ #DSTACK_SERVER_ELASTICSEARCH_API_KEY } – The Elasticsearch/OpenSearch API key for authentication. +- `DSTACK_ENABLE_PROMETHEUS_METRICS`{ #DSTACK_ENABLE_PROMETHEUS_METRICS } — Enables Prometheus metrics collection and export. +- `DSTACK_DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE`{ #DSTACK_DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE } – Request body size limit for services running with a gateway, in bytes. Defaults to 64 MiB. +- `DSTACK_SERVICE_CLIENT_TIMEOUT`{ #DSTACK_SERVICE_CLIENT_TIMEOUT } – Timeout in seconds for HTTP requests sent from the in-server proxy and gateways to service replicas. Defaults to 60. +- `DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY`{ #DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY } – Forbids registering new services without a gateway if set to any value. +- `DSTACK_SERVER_CODE_UPLOAD_LIMIT`{ #DSTACK_SERVER_CODE_UPLOAD_LIMIT } - The repo size limit when uploading diffs or local repos, in bytes. Set to `0` to disable size limits. Defaults to `2MiB`. +- `DSTACK_SERVER_S3_BUCKET`{ #DSTACK_SERVER_S3_BUCKET } - The bucket that repo diffs will be uploaded to if set. If unset, diffs are uploaded to the database. +- `DSTACK_SERVER_S3_BUCKET_REGION`{ #DSTACK_SERVER_S3_BUCKET_REGION } - The region of the S3 Bucket. +- `DSTACK_SERVER_GCS_BUCKET`{ #DSTACK_SERVER_GCS_BUCKET } - The bucket that repo diffs will be uploaded to if set. If unset, diffs are uploaded to the database. 
+- `DSTACK_DB_POOL_SIZE`{ #DSTACK_DB_POOL_SIZE } - The client DB connections pool size. Defaults to `20`. +- `DSTACK_DB_MAX_OVERFLOW`{ #DSTACK_DB_MAX_OVERFLOW } - The client DB connections pool allowed overflow. Defaults to `20`. +- `DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED`{ #DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED } - Disables background processing if set to any value. Useful to run only web frontend and API server. +- `DSTACK_SERVER_MAX_PROBES_PER_JOB`{ #DSTACK_SERVER_MAX_PROBES_PER_JOB } - Maximum number of probes allowed in a run configuration. Validated at apply time. +- `DSTACK_SERVER_MAX_PROBE_TIMEOUT`{ #DSTACK_SERVER_MAX_PROBE_TIMEOUT } - Maximum allowed timeout for a probe. Validated at apply time. +- `DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS } – Maximum age of metrics samples for running jobs. +- `DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS } – Maximum age of metrics samples for finished jobs. +- `DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS } – Maximum age of instance health checks. +- `DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS } – Minimum time interval between consecutive health checks of the same instance. +- `DSTACK_SERVER_EVENTS_TTL_SECONDS`{ #DSTACK_SERVER_EVENTS_TTL_SECONDS } - Maximum age of event records. Set to `0` to disable event storage. Defaults to 30 days. +- `DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY`{ #DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY } – A default Docker registry to use for job images that do not specify an explicit registry. E.g., if set to `registry.example`, then `image: ubuntu` becomes equivalent to `image: registry.example/ubuntu`. **Note**: This setting should only be used for configuring registries that act as a pull-through cache for Docker Hub.
The default `dstack` images are also pulled from the configured registry. +- `DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME`{ #DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME } – Username for authenticating with the default Docker registry. See `DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD`. +- `DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD`{ #DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD } – Password for authenticating with the default Docker registry. Applied only when the image has no explicit registry and the run configuration does not specify `registry_auth`. **Note**: The value may be visible to anyone who can SSH into instances managed by `dstack`, which usually includes all users of that `dstack` server. +- `DSTACK_SSHPROXY_API_TOKEN`{ #DSTACK_SSHPROXY_API_TOKEN } – Authentication token for the SSH proxy API. Required to enable SSH proxy integration; must match the token configured when deploying [`dstack-sshproxy`](https://github.com/dstackai/sshproxy). +- `DSTACK_SERVER_SSHPROXY_ADDRESS`{ #DSTACK_SERVER_SSHPROXY_ADDRESS } – Address of the SSH proxy exposed to users, in `HOSTNAME[:PORT]` form. `PORT` defaults to `22` if omitted. Required together with `DSTACK_SSHPROXY_API_TOKEN` to enable SSH proxy integration. +- `DSTACK_SERVER_SSHPROXY_ENFORCED`{ #DSTACK_SERVER_SSHPROXY_ENFORCED } – When set to any value, restricts all SSH connections to go through the SSH proxy. +- `DSTACK_SERVER_JOB_NETWORK_MODE`{ #DSTACK_SERVER_JOB_NETWORK_MODE } – Controls the network mode assigned to jobs. Accepts an integer value: `1` forces bridge networking for single-node jobs while distributed tasks still use host networking; `2` uses host networking whenever the job occupies a full instance (default); `3` forces bridge networking for all jobs including distributed tasks. +- `DSTACK_SERVER_SSH_CONNECT_TIMEOUT`{ #DSTACK_SERVER_SSH_CONNECT_TIMEOUT } – The SSH `ConnectTimeout` for server-instance connections, in seconds. Defaults to `3`. 
Increase if there are high-latency links between the server and instances. +- `DSTACK_SERVER_SSH_POOL_DISABLED`{ #DSTACK_SERVER_SSH_POOL_DISABLED } – Disables the reuse of server SSH connections to instances. If set, significantly decreases server RAM usage, but +slows down processing and may cause CPU spikes due to frequent SSH-connection establishment. + +??? info "Internal environment variables" + The following environment variables are intended for development purposes: + + * `DSTACK_SERVER_ROOT_LOG_LEVEL` – Sets root logger log level. Defaults to `ERROR`. + * `DSTACK_SERVER_UVICORN_LOG_LEVEL` – Sets uvicorn logger log level. Defaults to `ERROR`. + * `DSTACK_SERVER_MAX_OFFERS_TRIED` - Sets how many instance offers to try when starting a job. + Setting a high value can degrade server performance. + * `DSTACK_RUNNER_VERSION` – Sets exact runner version for debug. Defaults to `latest`. Ignored if `DSTACK_RUNNER_DOWNLOAD_URL` is set. + * `DSTACK_RUNNER_DOWNLOAD_URL` – Overrides `dstack-runner` binary download URL. The URL can contain `{version}` and/or `{arch}` placeholders, + where `{version}` is `dstack` version in the `X.Y.Z` format or `latest`, and `{arch}` is either `amd64` or `arm64`, for example, + `https://dstack.example.com/{arch}/{version}/dstack-runner`. + * `DSTACK_SHIM_DOWNLOAD_URL` – Overrides `dstack-shim` binary download URL. The URL can contain `{version}` and/or `{arch}` placeholders, + see `DSTACK_RUNNER_DOWNLOAD_URL` for the details. + * `DSTACK_DEFAULT_CREDS_DISABLED` – Disables default credentials detection if set. Defaults to `None`. + +## CLI + +The following environment variables are supported by the CLI. + +- `DSTACK_CLI_LOG_LEVEL`{ #DSTACK_CLI_LOG_LEVEL } – Sets the logging level for CLI output to stdout. Defaults to `INFO`. + +Example: + +
    + +```shell +$ DSTACK_CLI_LOG_LEVEL=debug dstack apply -f .dstack.yml +``` + +
    + +- `DSTACK_CLI_FILE_LOG_LEVEL`{ #DSTACK_CLI_FILE_LOG_LEVEL } – Sets the logging level for CLI log files. Defaults to `DEBUG`. + +
    + +```shell +$ find ~/.dstack/logs/cli/ + + ~/.dstack/logs/cli/latest.log + ~/.dstack/logs/cli/2025-07-31.log +``` + +
    + +- `DSTACK_PROJECT`{ #DSTACK_PROJECT } – Has the same effect as `--project`. Defaults to `None`. diff --git a/mkdocs/docs/reference/http/authentication.md b/mkdocs/docs/reference/http/authentication.md new file mode 100644 index 0000000000..539743146e --- /dev/null +++ b/mkdocs/docs/reference/http/authentication.md @@ -0,0 +1,5 @@ +--- +title: authentication +--- + +!!swagger openapi.json tag="authentication"!! diff --git a/mkdocs/docs/reference/http/backends.md b/mkdocs/docs/reference/http/backends.md new file mode 100644 index 0000000000..24a12894a2 --- /dev/null +++ b/mkdocs/docs/reference/http/backends.md @@ -0,0 +1,5 @@ +--- +title: backends +--- + +!!swagger openapi.json tag="backends"!! diff --git a/mkdocs/docs/reference/http/default.md b/mkdocs/docs/reference/http/default.md new file mode 100644 index 0000000000..ff45baea78 --- /dev/null +++ b/mkdocs/docs/reference/http/default.md @@ -0,0 +1,5 @@ +--- +title: default +--- + +!!swagger openapi.json tag="default"!! diff --git a/mkdocs/docs/reference/http/events.md b/mkdocs/docs/reference/http/events.md new file mode 100644 index 0000000000..ef60663097 --- /dev/null +++ b/mkdocs/docs/reference/http/events.md @@ -0,0 +1,5 @@ +--- +title: events +--- + +!!swagger openapi.json tag="events"!! diff --git a/mkdocs/docs/reference/http/exports.md b/mkdocs/docs/reference/http/exports.md new file mode 100644 index 0000000000..9859c5b2a0 --- /dev/null +++ b/mkdocs/docs/reference/http/exports.md @@ -0,0 +1,5 @@ +--- +title: exports +--- + +!!swagger openapi.json tag="exports"!! diff --git a/mkdocs/docs/reference/http/files.md b/mkdocs/docs/reference/http/files.md new file mode 100644 index 0000000000..b4c724c8c7 --- /dev/null +++ b/mkdocs/docs/reference/http/files.md @@ -0,0 +1,5 @@ +--- +title: files +--- + +!!swagger openapi.json tag="files"!! 
diff --git a/mkdocs/docs/reference/http/fleets.md b/mkdocs/docs/reference/http/fleets.md new file mode 100644 index 0000000000..1eb04597fc --- /dev/null +++ b/mkdocs/docs/reference/http/fleets.md @@ -0,0 +1,5 @@ +--- +title: fleets +--- + +!!swagger openapi.json tag="fleets"!! diff --git a/mkdocs/docs/reference/http/gateways.md b/mkdocs/docs/reference/http/gateways.md new file mode 100644 index 0000000000..8947d22f30 --- /dev/null +++ b/mkdocs/docs/reference/http/gateways.md @@ -0,0 +1,5 @@ +--- +title: gateways +--- + +!!swagger openapi.json tag="gateways"!! diff --git a/mkdocs/docs/reference/http/gpus.md b/mkdocs/docs/reference/http/gpus.md new file mode 100644 index 0000000000..86cc023c79 --- /dev/null +++ b/mkdocs/docs/reference/http/gpus.md @@ -0,0 +1,5 @@ +--- +title: gpus +--- + +!!swagger openapi.json tag="gpus"!! diff --git a/mkdocs/docs/reference/http/index.md b/mkdocs/docs/reference/http/index.md new file mode 100644 index 0000000000..3e3de7d75b --- /dev/null +++ b/mkdocs/docs/reference/http/index.md @@ -0,0 +1,85 @@ +--- +title: HTTP API +hide: + - toc +--- + +The HTTP API enables running tasks, services, and managing runs programmatically. + +## Usage example + +Below is a quick example of submitting a task for running and waiting for its completion. 
+ +```python +import os +from pathlib import Path +import time +import requests + +url = os.environ["DSTACK_URL"] +token = os.environ["DSTACK_TOKEN"] +project = os.environ["DSTACK_PROJECT"] +ssh_public_key = Path(os.environ["SSH_PUBLIC_KEY_PATH"]).read_text() + +print("Submitting task") +resp = requests.post( + url=f"{url}/api/project/{project}/runs/apply", + headers={"Authorization": f"Bearer {token}"}, + json={ + "plan":{ + "run_spec": { + "configuration": { + "type": "task", + "commands": [ + "echo Start", + "sleep 10", # do some work here + "echo Finish" + ], + }, + "ssh_key_pub": ssh_public_key, + } + }, + "force": False, + }, +) +run_name = resp.json()["run_spec"]["run_name"] + +print("Waiting for task completion") +while True: + resp = requests.post( + url=f"{url}/api/project/{project}/runs/get", + headers={"Authorization": f"Bearer {token}"}, + json={"run_name": run_name} + ) + if resp.json()["status"] in ["terminated", "aborted", "failed", "done"]: + print(f"Run finished with status {resp.json()['status']}") + break + time.sleep(2) +``` + +## Reference + +The HTTP API reference is split by endpoint tag. + + +- [server](server.md) +- [users](users.md) +- [authentication](authentication.md) +- [projects](projects.md) +- [backends](backends.md) +- [fleets](fleets.md) +- [repos](repos.md) +- [runs](runs.md) +- [gpus](gpus.md) +- [metrics](metrics.md) +- [logs](logs.md) +- [secrets](secrets.md) +- [gateways](gateways.md) +- [volumes](volumes.md) +- [proxy](proxy.md) +- [files](files.md) +- [events](events.md) +- [templates](templates.md) +- [exports](exports.md) +- [default](default.md) + diff --git a/mkdocs/docs/reference/http/logs.md b/mkdocs/docs/reference/http/logs.md new file mode 100644 index 0000000000..9f5e9900bc --- /dev/null +++ b/mkdocs/docs/reference/http/logs.md @@ -0,0 +1,5 @@ +--- +title: logs +--- + +!!swagger openapi.json tag="logs"!! 
diff --git a/mkdocs/docs/reference/http/metrics.md b/mkdocs/docs/reference/http/metrics.md new file mode 100644 index 0000000000..88b3b1da7a --- /dev/null +++ b/mkdocs/docs/reference/http/metrics.md @@ -0,0 +1,5 @@ +--- +title: metrics +--- + +!!swagger openapi.json tag="metrics"!! diff --git a/mkdocs/docs/reference/http/projects.md b/mkdocs/docs/reference/http/projects.md new file mode 100644 index 0000000000..814a98f1c0 --- /dev/null +++ b/mkdocs/docs/reference/http/projects.md @@ -0,0 +1,5 @@ +--- +title: projects +--- + +!!swagger openapi.json tag="projects"!! diff --git a/mkdocs/docs/reference/http/proxy.md b/mkdocs/docs/reference/http/proxy.md new file mode 100644 index 0000000000..3dc5cf57f7 --- /dev/null +++ b/mkdocs/docs/reference/http/proxy.md @@ -0,0 +1,5 @@ +--- +title: proxy +--- + +!!swagger openapi.json tag="proxy"!! diff --git a/mkdocs/docs/reference/http/repos.md b/mkdocs/docs/reference/http/repos.md new file mode 100644 index 0000000000..ab050bea6a --- /dev/null +++ b/mkdocs/docs/reference/http/repos.md @@ -0,0 +1,5 @@ +--- +title: repos +--- + +!!swagger openapi.json tag="repos"!! diff --git a/mkdocs/docs/reference/http/runs.md b/mkdocs/docs/reference/http/runs.md new file mode 100644 index 0000000000..18be3b87ed --- /dev/null +++ b/mkdocs/docs/reference/http/runs.md @@ -0,0 +1,5 @@ +--- +title: runs +--- + +!!swagger openapi.json tag="runs"!! diff --git a/mkdocs/docs/reference/http/secrets.md b/mkdocs/docs/reference/http/secrets.md new file mode 100644 index 0000000000..20e8284898 --- /dev/null +++ b/mkdocs/docs/reference/http/secrets.md @@ -0,0 +1,5 @@ +--- +title: secrets +--- + +!!swagger openapi.json tag="secrets"!! diff --git a/mkdocs/docs/reference/http/server.md b/mkdocs/docs/reference/http/server.md new file mode 100644 index 0000000000..cf30aefb31 --- /dev/null +++ b/mkdocs/docs/reference/http/server.md @@ -0,0 +1,5 @@ +--- +title: server +--- + +!!swagger openapi.json tag="server"!! 
diff --git a/mkdocs/docs/reference/http/templates.md b/mkdocs/docs/reference/http/templates.md new file mode 100644 index 0000000000..98b2185c7d --- /dev/null +++ b/mkdocs/docs/reference/http/templates.md @@ -0,0 +1,5 @@ +--- +title: templates +--- + +!!swagger openapi.json tag="templates"!! diff --git a/mkdocs/docs/reference/http/users.md b/mkdocs/docs/reference/http/users.md new file mode 100644 index 0000000000..7c526ce7d2 --- /dev/null +++ b/mkdocs/docs/reference/http/users.md @@ -0,0 +1,5 @@ +--- +title: users +--- + +!!swagger openapi.json tag="users"!! diff --git a/mkdocs/docs/reference/http/volumes.md b/mkdocs/docs/reference/http/volumes.md new file mode 100644 index 0000000000..e0f5dd15ba --- /dev/null +++ b/mkdocs/docs/reference/http/volumes.md @@ -0,0 +1,5 @@ +--- +title: volumes +--- + +!!swagger openapi.json tag="volumes"!! diff --git a/mkdocs/docs/reference/plugins/python/index.md b/mkdocs/docs/reference/plugins/python/index.md new file mode 100644 index 0000000000..278d974537 --- /dev/null +++ b/mkdocs/docs/reference/plugins/python/index.md @@ -0,0 +1,134 @@ +# Plugins + +The `dstack` plugin system allows extending `dstack` server functionality using external Python packages. + +!!! info "Experimental" + Plugins are currently an experimental feature. Backward compatibility is not guaranteed across releases. + +## Enable plugins + +To enable a plugin, list it under `plugins` in [`server/config.yml`](../../server/config.yml.md): + +
    + +```yaml +plugins: + - my_dstack_plugin + - some_other_plugin +projects: +- name: main +``` + +
    + +On the next server restart, you should see a log message indicating that the plugin is loaded. + +## Create a plugin + +To create a plugin, create a Python package that implements a subclass of +`dstack.plugins.Plugin` and exports this subclass as a "dstack.plugins" entry point. + +1. Init the plugin package: + +
    + + ```shell + $ uv init --library + ``` + +
    + +2. Define `ApplyPolicy` and `Plugin` subclasses: + +
    + + ```python + from dstack.plugins import ApplyPolicy, Plugin, RunSpec, get_plugin_logger + + logger = get_plugin_logger(__name__) + + class ExamplePolicy(ApplyPolicy): + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + # ... + return spec + + class ExamplePlugin(Plugin): + + def get_apply_policies(self) -> list[ApplyPolicy]: + return [ExamplePolicy()] + ``` + +
    + +3. Specify a `"dstack.plugins"` entry point in `pyproject.toml`: + +
    + + ```toml + [project.entry-points."dstack.plugins"] + example_plugin = "example_plugin:ExamplePlugin" + ``` + +
    + +Then you can install the plugin package into your Python environment and enable it via `server/config.yml`. + +??? info "Docker" + If you deploy `dstack` using a Docker image you can add plugins either + by including them in your custom image built upon the `dstack` [server image](https://github.com/dstackai/dstack/tree/master/docker/server), or by mounting installed plugins as volumes. + +## Apply policies + +Currently the only plugin functionality is apply policies. +Apply policies allow modifying specs of runs, fleets, volumes, and gateways submitted on `dstack apply`. +Subclass `dstack.plugins.ApplyPolicy` to implement them. + +Here's an example of how to enforce certain rules using apply policies: + +
    + +```python +class ExamplePolicy(ApplyPolicy): + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + # Forcing some limits + spec.configuration.max_price = 2.0 + spec.configuration.max_duration = "1d" + # Setting some extra tags + if spec.configuration.tags is None: + spec.configuration.tags = {} + spec.configuration.tags |= { + "team": "my_team", + } + # Forbid something + if ( + spec.configuration.privileged + or spec.configuration.docker + or ( + isinstance(spec.configuration, Service) + and isinstance(spec.configuration.replicas, list) + and any(r.privileged or r.docker for r in spec.configuration.replicas) + ) + ): + logger.warning("User %s tries to run privileged containers", user) + raise ValueError("Running privileged containers is forbidden") + # Set some service-specific properties + if isinstance(spec.configuration, Service): + spec.configuration.https = True + return spec +``` + +
    + +## Built-in plugins + +### REST plugin + +`rest_plugin` is a builtin `dstack` plugin that allows writing your custom plugins as API servers, so you don't need to install plugins as Python packages. + +Plugins implemented as API servers have advantages over plugins implemented as Python packages in some cases: + +* No dependency conflicts with `dstack`. +* You can use any programming language. +* If you run the `dstack` server via Docker, you don't need to extend the `dstack` server image with plugins or map them via volumes. + +To get started, check out the [plugin server example](https://github.com/dstackai/dstack/tree/master/examples/plugins/example_plugin_server). The `rest_plugin` server API is documented [here](../../plugins/rest/index.md). diff --git a/mkdocs/docs/reference/plugins/rest/index.md b/mkdocs/docs/reference/plugins/rest/index.md new file mode 100644 index 0000000000..b8259985d1 --- /dev/null +++ b/mkdocs/docs/reference/plugins/rest/index.md @@ -0,0 +1,33 @@ +--- +title: REST PLUGIN API +--- + +`rest_plugin` is a builtin `dstack` plugin that allows writing custom plugins as API servers. The following is an OpenAPI documentation for a `rest_plugin` API server. + + + +

    + +!!swagger rest_plugin_openapi.json!! diff --git a/mkdocs/docs/reference/profiles.yml.md b/mkdocs/docs/reference/profiles.yml.md new file mode 100644 index 0000000000..c97f9d427b --- /dev/null +++ b/mkdocs/docs/reference/profiles.yml.md @@ -0,0 +1,49 @@ +# .dstack/profiles.yml + +Sometimes, you may want to reuse the same parameters across runs or set your own defaults so you don’t have to repeat them in every run configuration. You can do this by defining a profile, either globally in `~/.dstack/profiles.yml` or locally in `.dstack/profiles.yml`. + +A profile can be set as `default` to apply automatically to any run, or specified with `--profile NAME` in `dstack apply`. + +Example: + +
    + +```yaml +profiles: + - name: my-profile + # If set to true, this profile will be applied automatically + default: true + + # The spot policy can be "spot", "on-demand", or "auto" + spot_policy: auto + # Limit the maximum price of the instance per hour + max_price: 1.5 + # Stop any run if it runs longer than this duration + max_duration: 1d + # Use only these backends + backends: [azure, lambda] +``` + +
    + +The profile configuration supports most properties that a run configuration supports — see below. + +### Root reference + +#SCHEMA# dstack._internal.core.models.profiles.Profile + overrides: + show_root_heading: false + max_price: + type: 'Optional[float]' + +### `retry` + +#SCHEMA# dstack._internal.core.models.profiles.ProfileRetry + overrides: + show_root_heading: false + +### `utilization_policy` + +#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy + overrides: + show_root_heading: false diff --git a/mkdocs/docs/reference/server/config.yml.md b/mkdocs/docs/reference/server/config.yml.md new file mode 100644 index 0000000000..217e489b5c --- /dev/null +++ b/mkdocs/docs/reference/server/config.yml.md @@ -0,0 +1,446 @@ +# ~/.dstack/server/config.yml + +The `~/.dstack/server/config.yml` file is used +to configure [backends](../../concepts/backends.md) and other [server-level settings](../../guides/server-deployment.md). + +## Root reference + +#SCHEMA# dstack._internal.server.services.config.ServerConfig + overrides: + show_root_heading: false + +### `projects[n]` { #projects data-toc-label="projects" } + +#SCHEMA# dstack._internal.server.services.config.ProjectConfig + overrides: + show_root_heading: false + +#### `projects[n].backends` { #backends data-toc-label="backends" } + +##### `projects[n].backends[type=aws]` { #aws data-toc-label="aws" } + +#SCHEMA# dstack._internal.core.backends.aws.models.AWSBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: aws- + +###### `projects[n].backends[type=aws].creds` { #aws-creds data-toc-label="creds" } + +=== "Access key" + #SCHEMA# dstack._internal.core.backends.aws.models.AWSAccessKeyCreds + overrides: + show_root_heading: false + type: + required: true + +=== "Default" + #SCHEMA# dstack._internal.core.backends.aws.models.AWSDefaultCreds + overrides: + show_root_heading: false + type: + required: true + +###### 
`projects[n].backends[type=aws].os_images` { #aws-os_images data-toc-label="os_images" } + +#SCHEMA# dstack._internal.core.backends.aws.models.AWSOSImageConfig + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: aws-os_images- + +###### `projects[n].backends[type=aws].os_images.cpu` { #aws-os_images-cpu data-toc-label="cpu" } + +#SCHEMA# dstack._internal.core.backends.aws.models.AWSOSImage + overrides: + show_root_heading: false + type: + required: true + +###### `projects[n].backends[type=aws].os_images.nvidia` { #aws-os_images-nvidia data-toc-label="nvidia" } + +#SCHEMA# dstack._internal.core.backends.aws.models.AWSOSImage + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=azure]` { #azure data-toc-label="azure" } + +#SCHEMA# dstack._internal.core.backends.azure.models.AzureBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: azure- + +###### `projects[n].backends[type=azure].creds` { #azure-creds data-toc-label="creds" } + +=== "Client" + #SCHEMA# dstack._internal.core.backends.azure.models.AzureClientCreds + overrides: + show_root_heading: false + type: + required: true + +=== "Default" + #SCHEMA# dstack._internal.core.backends.azure.models.AzureDefaultCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=gcp]` { #gcp data-toc-label="gcp" } + +#SCHEMA# dstack._internal.core.backends.gcp.models.GCPBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: gcp- + +###### `projects[n].backends[type=gcp].creds` { #gcp-creds data-toc-label="creds" } + +=== "Service account" + #SCHEMA# dstack._internal.core.backends.gcp.models.GCPServiceAccountFileCreds + overrides: + show_root_heading: false + type: + required: true + + ??? 
info "Specifying `data`" + To specify service account file contents as a string, use `jq`: + + ```shell + cat my-service-account-file.json | jq -c | jq -R + ``` + +=== "Default" + #SCHEMA# dstack._internal.core.backends.gcp.models.GCPDefaultCreds + overrides: + show_root_heading: false + type: + required: true + + +##### `projects[n].backends[type=lambda]` { #lambda data-toc-label="lambda" } + +#SCHEMA# dstack._internal.core.backends.lambdalabs.models.LambdaBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: lambda- + +###### `projects[n].backends[type=lambda].creds` { #lambda-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.lambdalabs.models.LambdaAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=nebius]` { #nebius data-toc-label="nebius" } + +#SCHEMA# dstack._internal.core.backends.nebius.models.NebiusBackendFileConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: nebius- + +###### `projects[n].backends[type=nebius].creds` { #nebius-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.nebius.models.NebiusServiceAccountFileCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=runpod]` { #runpod data-toc-label="runpod" } + +#SCHEMA# dstack._internal.core.backends.runpod.models.RunpodBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: runpod- + +###### `projects[n].backends[type=runpod].creds` { #runpod-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.runpod.models.RunpodAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=vastai]` { #vastai data-toc-label="vastai" } + +#SCHEMA# dstack._internal.core.backends.vastai.models.VastAIBackendConfigWithCreds + overrides: + 
show_root_heading: false + type: + required: true + item_id_prefix: vastai- + +###### `projects[n].backends[type=vastai].creds` { #vastai-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.vastai.models.VastAIAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + + + +##### `projects[n].backends[type=oci]` { #oci data-toc-label="oci" } + +#SCHEMA# dstack._internal.core.backends.oci.models.OCIBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: oci- + +###### `projects[n].backends[type=oci].creds` { #oci-creds data-toc-label="creds" } + +=== "Client" + #SCHEMA# dstack._internal.core.backends.oci.models.OCIClientCreds + overrides: + show_root_heading: false + type: + required: true + +=== "Default" + #SCHEMA# dstack._internal.core.backends.oci.models.OCIDefaultCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=verda]` { #verda data-toc-label="verda" } + +#SCHEMA# dstack._internal.core.backends.verda.models.VerdaBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: verda- + +###### `projects[n].backends[type=verda].creds` { #verda-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.verda.models.VerdaAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=kubernetes]` { #kubernetes data-toc-label="kubernetes" } + +#SCHEMA# dstack._internal.core.backends.kubernetes.models.KubernetesBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: kubernetes- + +###### `projects[n].backends[type=kubernetes].kubeconfig` { #kubernetes-kubeconfig data-toc-label="kubeconfig" } + +#SCHEMA# dstack._internal.core.backends.kubernetes.models.KubeconfigFileConfig + overrides: + show_root_heading: false + +??? 
info "Specifying `data`" + To specify kubeconfig contents directly via `data`, convert it to a string: + + ```shell + yq -o=json ~/.kube/config | jq -c | jq -R + ``` + +###### `projects[n].backends[type=kubernetes].contexts[n]` { #kubernetes-contexts data-toc-label="contexts" } + +#SCHEMA# dstack._internal.core.backends.kubernetes.models.KubernetesContextConfig + overrides: + show_root_heading: false + +###### `projects[n].backends[type=kubernetes].contexts[n].proxy_jump` { #kubernetes-contexts-proxy_jump data-toc-label="proxy_jump" } + +#SCHEMA# dstack._internal.core.backends.kubernetes.models.KubernetesProxyJumpConfig + overrides: + show_root_heading: false + +###### `projects[n].backends[type=kubernetes].proxy_jump` { #kubernetes-proxy_jump data-toc-label="proxy_jump" } + +#SCHEMA# dstack._internal.core.backends.kubernetes.models.KubernetesProxyJumpConfig + overrides: + show_root_heading: false + +##### `projects[n].backends[type=vultr]` { #vultr data-toc-label="vultr" } + +#SCHEMA# dstack._internal.core.backends.vultr.models.VultrBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: vultr- + +###### `projects[n].backends[type=vultr].creds` { #vultr-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.vultr.models.VultrAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=amddevcloud]` { #amddevcloud data-toc-label="amddevcloud" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: amddevcloud- + +###### `projects[n].backends[type=amddevcloud].creds` { #amddevcloud-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### 
`projects[n].backends[type=digitalocean]` { #digitalocean data-toc-label="digitalocean" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: digitalocean- + +###### `projects[n].backends[type=digitalocean].creds` { #digitalocean-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=crusoe]` { #crusoe data-toc-label="crusoe" } + +#SCHEMA# dstack._internal.core.backends.crusoe.models.CrusoeBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: crusoe- + +###### `projects[n].backends[type=crusoe].creds` { #crusoe-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.crusoe.models.CrusoeAccessKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: hotaisle- + +###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=jarvislabs]` { #jarvislabs data-toc-label="jarvislabs" } + +#SCHEMA# dstack._internal.core.backends.jarvislabs.models.JarvisLabsBackendFileConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: jarvislabs- + +###### `projects[n].backends[type=jarvislabs].creds` { #jarvislabs-creds data-toc-label="creds" } + +#SCHEMA# 
dstack._internal.core.backends.jarvislabs.models.JarvisLabsAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=cloudrift]` { #cloudrift data-toc-label="cloudrift" } + +#SCHEMA# dstack._internal.core.backends.cloudrift.models.CloudRiftBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: cloudrift- + +###### `projects[n].backends[type=cloudrift].creds` { #cloudrift-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.cloudrift.models.CloudRiftAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +### `encryption` { #encryption data-toc-label="encryption" } + +#SCHEMA# dstack._internal.server.services.config.EncryptionConfig + overrides: + show_root_heading: false + +#### `encryption.keys` { #encryption-keys data-toc-label="keys" } + +##### `encryption.keys[n][type=identity]` { #encryption-keys-identity data-toc-label="identity" } + +#SCHEMA# dstack._internal.server.services.encryption.keys.identity.IdentityEncryptionKeyConfig + overrides: + show_root_heading: false + type: + required: true + +##### `encryption.keys[n][type=aes]` { #encryption-keys-aes data-toc-label="aes" } + +#SCHEMA# dstack._internal.server.services.encryption.keys.aes.AESEncryptionKeyConfig + overrides: + show_root_heading: false + type: + required: true + +### `default_permissions` { #default_permissions data-toc-label="default_permissions" } + +#SCHEMA# dstack._internal.server.services.permissions.DefaultPermissions + overrides: + show_root_heading: false diff --git a/mkdocs/index.md b/mkdocs/index.md new file mode 100644 index 0000000000..571c05ae6e --- /dev/null +++ b/mkdocs/index.md @@ -0,0 +1,8 @@ +--- +template: home.html +title: The orchestration stack for AI infrastructure +hide: + - navigation + - toc + - footer +--- diff --git a/docs/layouts/custom.yml b/mkdocs/layouts/custom.yml similarity index 79% rename from 
docs/layouts/custom.yml rename to mkdocs/layouts/custom.yml index a11f5b7fe4..74a0637b2d 100644 --- a/docs/layouts/custom.yml +++ b/mkdocs/layouts/custom.yml @@ -11,6 +11,13 @@ definitions: {{ page.meta.get("title", page.title) }} {%- endif -%} + - &page_image >- + {% if page and page.meta.image %} + {{ page.meta.image }} + {%- else -%} + {{ image.url }} + {%- endif -%} + # Page title with site name - &page_title_with_site_name >- {%- if not page.is_homepage -%} @@ -43,17 +50,17 @@ size: { width: 1200, height: 630 } layers: - background: color: "black" - - size: { width: 44, height: 44 } - offset: { x: 970, y: 521 } + - size: { width: 65, height: 60 } + offset: { x: 908, y: 499 } background: image: *logo - - size: { width: 300, height: 42 } - offset: { x: 1018, y: 525 } + - size: { width: 360, height: 59 } + offset: { x: 975, y: 502 } typography: content: *site_name color: "white" - - size: { width: 850, height: 320 } - offset: { x: 80, y: 115 } + - size: { width: 1000, height: 220 } + offset: { x: 80, y: 280 } typography: content: *page_title overflow: shrink @@ -62,8 +69,8 @@ layers: line: amount: 3 height: 1.25 - - size: { width: 850, height: 64 } - offset: { x: 80, y: 495 } + - size: { width: 870, height: 64 } + offset: { x: 80, y: 498 } typography: content: *page_description align: start @@ -78,7 +85,7 @@ tags: og:type: website og:title: *page_title_with_site_name og:description: *page_description - og:image: "{{ image.url }}" + og:image: *page_image og:image:type: "{{ image.type }}" og:image:width: "{{ image.width }}" og:image:height: "{{ image.height }}" @@ -88,4 +95,4 @@ tags: twitter:card: summary_large_image twitter.title: *page_title_with_site_name twitter:description: *page_description - twitter:image: "{{ image.url }}" \ No newline at end of file + twitter:image: *page_image diff --git a/docs/overrides/.icons/custom/colored/discord.svg b/mkdocs/overrides/.icons/custom/colored/discord.svg similarity index 96% rename from 
docs/overrides/.icons/custom/colored/discord.svg rename to mkdocs/overrides/.icons/custom/colored/discord.svg index f2adc3ce75..d29c1a28de 100644 --- a/docs/overrides/.icons/custom/colored/discord.svg +++ b/mkdocs/overrides/.icons/custom/colored/discord.svg @@ -1,2 +1,2 @@ - \ No newline at end of file + diff --git a/mkdocs/overrides/.icons/custom/colored/github.svg b/mkdocs/overrides/.icons/custom/colored/github.svg new file mode 100644 index 0000000000..0b02fe1acb --- /dev/null +++ b/mkdocs/overrides/.icons/custom/colored/github.svg @@ -0,0 +1,2 @@ + diff --git a/docs/overrides/.icons/custom/colored/twitter.svg b/mkdocs/overrides/.icons/custom/colored/twitter.svg similarity index 82% rename from docs/overrides/.icons/custom/colored/twitter.svg rename to mkdocs/overrides/.icons/custom/colored/twitter.svg index f695fcce5b..fa739b0301 100644 --- a/docs/overrides/.icons/custom/colored/twitter.svg +++ b/mkdocs/overrides/.icons/custom/colored/twitter.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/mkdocs/overrides/.icons/custom/github.svg b/mkdocs/overrides/.icons/custom/github.svg new file mode 100644 index 0000000000..c018005444 --- /dev/null +++ b/mkdocs/overrides/.icons/custom/github.svg @@ -0,0 +1 @@ + diff --git a/docs/overrides/assets/images/github-logo.png b/mkdocs/overrides/assets/images/github-logo.png similarity index 100% rename from docs/overrides/assets/images/github-logo.png rename to mkdocs/overrides/assets/images/github-logo.png diff --git a/docs/overrides/assets/images/hero.svg b/mkdocs/overrides/assets/images/hero.svg similarity index 99% rename from docs/overrides/assets/images/hero.svg rename to mkdocs/overrides/assets/images/hero.svg index f8fc219eca..eb15ca3773 100644 --- a/docs/overrides/assets/images/hero.svg +++ b/mkdocs/overrides/assets/images/hero.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/overrides/assets/images/new.svg b/mkdocs/overrides/assets/images/new.svg similarity index 99% rename from 
docs/overrides/assets/images/new.svg rename to mkdocs/overrides/assets/images/new.svg index c3f740aedb..63bc20ee8d 100644 --- a/docs/overrides/assets/images/new.svg +++ b/mkdocs/overrides/assets/images/new.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/overrides/assets/images/quotes/alvarobartt.jpg b/mkdocs/overrides/assets/images/quotes/alvarobartt.jpg similarity index 100% rename from docs/overrides/assets/images/quotes/alvarobartt.jpg rename to mkdocs/overrides/assets/images/quotes/alvarobartt.jpg diff --git a/docs/overrides/assets/images/quotes/chansung.jpg b/mkdocs/overrides/assets/images/quotes/chansung.jpg similarity index 100% rename from docs/overrides/assets/images/quotes/chansung.jpg rename to mkdocs/overrides/assets/images/quotes/chansung.jpg diff --git a/docs/overrides/assets/images/quotes/eckart.png b/mkdocs/overrides/assets/images/quotes/eckart.png similarity index 100% rename from docs/overrides/assets/images/quotes/eckart.png rename to mkdocs/overrides/assets/images/quotes/eckart.png diff --git a/mkdocs/overrides/assets/images/quotes/jon.jpeg b/mkdocs/overrides/assets/images/quotes/jon.jpeg new file mode 100644 index 0000000000..d7d6085d3a Binary files /dev/null and b/mkdocs/overrides/assets/images/quotes/jon.jpeg differ diff --git a/mkdocs/overrides/assets/images/quotes/movchan.jpg b/mkdocs/overrides/assets/images/quotes/movchan.jpg new file mode 100644 index 0000000000..1147edd95f Binary files /dev/null and b/mkdocs/overrides/assets/images/quotes/movchan.jpg differ diff --git a/docs/overrides/assets/images/quotes/spott.jpg b/mkdocs/overrides/assets/images/quotes/spott.jpg similarity index 100% rename from docs/overrides/assets/images/quotes/spott.jpg rename to mkdocs/overrides/assets/images/quotes/spott.jpg diff --git a/docs/overrides/assets/images/slack.png b/mkdocs/overrides/assets/images/slack.png similarity index 100% rename from docs/overrides/assets/images/slack.png rename to mkdocs/overrides/assets/images/slack.png diff 
--git a/docs/overrides/assets/images/twitter.png b/mkdocs/overrides/assets/images/twitter.png similarity index 100% rename from docs/overrides/assets/images/twitter.png rename to mkdocs/overrides/assets/images/twitter.png diff --git a/docs/overrides/header-2.html b/mkdocs/overrides/header-2.html similarity index 79% rename from docs/overrides/header-2.html rename to mkdocs/overrides/header-2.html index 6bdbaee508..6310a155c6 100644 --- a/docs/overrides/header-2.html +++ b/mkdocs/overrides/header-2.html @@ -61,7 +61,19 @@ {% endif %}-->
    - Get started + GitHub + + dstack Sky
    {% if "navigation.tabs.sticky" in features %} diff --git a/docs/overrides/header.html b/mkdocs/overrides/header.html similarity index 86% rename from docs/overrides/header.html rename to mkdocs/overrides/header.html index d384b46609..0da16b35fa 100644 --- a/docs/overrides/header.html +++ b/mkdocs/overrides/header.html @@ -56,7 +56,9 @@
    {% endif %}--> {% if "navigation.tabs.sticky" in features %} diff --git a/mkdocs/overrides/home.html b/mkdocs/overrides/home.html new file mode 100644 index 0000000000..c876938795 --- /dev/null +++ b/mkdocs/overrides/home.html @@ -0,0 +1,1011 @@ +{% extends "landing.html" %} + +{% block scripts %} +{{ super() }} + + + +{% endblock %} + +{% block content %} +
    +
    +
    +
    +

    The orchestration stack
    for AI infrastructure

    + +

    + dstack is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters. + It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks. +

    +
    + + +
    + Finally, an orchestration stack that doesn’t suck. +
    +
    + +
    +
    +
    +

    One control plane for AI compute

    +

    + Managing AI infrastructure requires first-class primitives for accelerator provisioning, + workload scheduling, and observability across clouds, clusters, and open-source frameworks. +

    + +

    + dstack unifies fleets, dev environments, tasks, services, + volumes, and gateways in one control plane for AI workloads. +

    + +

    + It’s built for containerized AI workloads with a simple CLI, UI, and API. + No Kubernetes or Slurm hassle required. +

    + + + +
    + +
    + +
    +
    +
    + +
    +
    +
    + +
    + +
    +

    Provision GPU fleets across clouds

    + +
    + + + + + + + + + + + + +
    + +

    + dstack provisions GPU VMs directly through cloud APIs—no Kubernetes needed. +

    + +

    + If you already have a Kubernetes cluster, dstack can manage it too. +

    + +

    + Once a backend fleet is created, dstack will let you run + dev environments, tasks, and services on this fleet. +

    + +

    + + Backends + + + + +

    +
    +
    +
    + +
    +
    +
    +

    Bring your own clusters

    + +

    + Have bare-metal servers or pre-provisioned VMs? Use SSH fleets to connect them to dstack. +

    + +

    + Just provide SSH credentials and host addresses, and dstack creates an SSH fleet. +

    + +

    + Once created, dstack will let you run + dev environments, tasks, and services on this fleet. +

    + +

    + + SSH fleets + + + + +

    +
    + +
    + +
    +
    +
    + + +
    +
    +
    + +
    + +
    +

    Launch GPU dev environments

    +

    + If you need a remote development environment with a GPU, let dstack create a dev environment for you. +

    + +

    If you plan to work with it yourself, you can access it using your desktop IDE such as VS + Code, Cursor, and + Windsurf. dstack apply prints both the IDE URL and SSH command. +

    + +

    + + Dev environments + + +

    +
    +
    +
    + +
    +
    +
    +

    Run training and batch jobs at scale

    + +

    + Run training or batch workloads on a single GPU, or scale to multi-GPU and multi-node clusters using simple task configurations. + dstack automates cluster provisioning, resource allocation, and job scheduling. +

    + +

    + During execution, dstack reports GPU utilization, memory usage, and GPU health metrics for each job. +

    + +

    + + Tasks + + +

    +
    + +
    + +
    +
    +
    + +
    +
    +
    + +
    + +
    +

    Deploy production inference services

    + +

    + With dstack, you can deploy models as secure, + auto-scaling, OpenAI-compatible endpoints, integrating with top open-source serving frameworks + such as SGLang, vLLM, + TensorRT-LLM, or any other. +

    + +

    + dstack enables Disaggregated Prefill/Decode and cache-aware routing, providing + production-grade, optimized inference. +

    + +

    + + Services + + + + +

    +
    +
    +
    + +
    +
    +

    FAQ

    +
    + +
    +
    +
    + How does dstack differ from Slurm? +
    +
    +
    + +
    +

    + Slurm is a battle-tested system with decades of production use in HPC environments. + dstack, by contrast, is built for modern ML/AI workloads with cloud-native provisioning and a container-first architecture. + While both support distributed training and batch jobs, dstack + also natively supports development and production-grade inference. +

    + +

    + See the migration guide for a detailed comparison. +

    +
    +
    + +
    +
    + How does dstack compare to Kubernetes? +
    +
    +
    + +
    +

    + Kubernetes is a general-purpose container orchestrator. dstack also + orchestrates containers, but it provides a lightweight and streamlined interface that is + purpose-built for ML. +

    + +

    + You declare + dev environments, + tasks, + services, and + fleets + with simple configuration. dstack provisions GPUs, manages clusters via fleets with fine-grained + controls, and optimizes cost and utilization, while keeping a simple UI and CLI. +

    + +

    + If you already use Kubernetes, you can run dstack on it via the Kubernetes backend. +

    +
    +
    + +
    +
    + Can I use dstack with Kubernetes? +
    +
    +
    + +
    +

    + Yes. You can connect existing Kubernetes clusters using the Kubernetes backend and run + dev environments, + tasks, and + services on it. + Choose the Kubernetes backend if your GPUs already run on Kubernetes and your team depends on its + ecosystem and tooling. + See the + Kubernetes guide for setup and best practices. +

    +

    + If your priority is orchestrating cloud GPUs and Kubernetes isn’t a must, VM-based backends are a better fit + thanks to their native cloud integration. + For on-prem GPUs where Kubernetes is optional, SSH fleets provide a simpler and more lightweight alternative. +

    +
    +
    + +
    +
    + When should I use dstack? +
    +
    +
    + +
    +

    + dstack accelerates ML development with a simple, ML‑native interface. + Spin up dev environments, run + single‑node or distributed tasks, and deploy services without infrastructure overhead. +

    + +

    + It radically reduces GPU costs via smart orchestration and fine‑grained fleet controls, including efficient reuse, + right‑sizing, and support for spot, on‑demand, and reserved capacity. +

    + +

    + It is 100% interoperable with your stack and works with any open‑source frameworks and tools, as + well as your own Docker images and code, across GPU clouds, Kubernetes, and on‑prem GPUs. +

    +
    +
    +
    +
    + +
    +

    + Have questions, or need help? +
    + + Discord + + + Talk to us + +

    +
    + +
    +
    +

    Trusted by thousands of engineers across 100+ AI-first companies

    + +
    +
    +
    + +
    +

    Wah Loon Keng

    + +

    Sr. AI Engineer @Electronic Arts

    + +

    + With dstack, AI researchers at EA can spin up and scale experiments without touching + infrastructure. It supports everything from quick prototyping to multi-node training on any cloud. +

    +
    + +
    +
    + +
    +

    Aleksandr Movchan

    + +

    ML Engineer @Mobius Labs

    + +

    + Thanks to dstack, my team can quickly tap into affordable + GPUs and streamline our workflows + from testing and development to full-scale application deployment. +

    +
    + +
    +
    + +
    +

    Alvaro Bartolome

    + +

    ML Engineer @Argilla

    + +

    + With dstack it's incredibly easy to define a configuration + within a + repository + and run it without worrying about GPU availability. It lets you focus on + data and your research. +

    +
    + +
    +
    + +
    +

    Park Chansung

    + +

    ML Researcher @ETRI

    + +

    + Thanks to dstack, I can effortlessly access the top GPU + options across + different clouds, + saving me time and money while pushing my AI work forward. +

    +
    + +
    +
    + +
    +

    Eckart Burgwedel

    + +

    CEO @Uberchord

    + +

    + With dstack, running LLMs on a cloud GPU is as + easy as running a local Docker container. + It combines the ease of Docker with the auto-scaling capabilities of K8S. +

    +
    + +
    +
    + +
    +

    Jon Stevens

    + +

    CEO @Hot Aisle

    + +

    + dstack's advantages over Slurm are clear: it's a modern, ground-up approach to running workloads at scale. If you're choosing an orchestration platform, dstack is the place to start. +

    +
    +
    +
    +
    + + + +


    + +
    +

    Get started in minutes

    +
    + +
    +
    +
    +

    + Install dstack on your laptop with uv, + or deploy it anywhere using the dstackai/dstack Docker image. +

    + +

    Bring your compute via backends or SSH fleets, then bring your team.

    + +

    + + Quickstart + + + + + Installation + + +

    +
    + +
    + +
    +
    +
    + +
    +
    +

    dstack Sky

    + +
    +
    +

    Hosted by us. Bring your own clouds, or tap into the GPU marketplace.

    +
    +
    + + +
    + Get $5 in GPU marketplace credits. Have an account? Sign in +
    +
    +
    +
    + +
    +

    dstack Enterprise

    + +
    +
    +

    Self-hosted with SSO, air-gapped setup, and dedicated support.

    +
    +
    + + +
    + See how dstack fits your infrastructure. +
    +
    +
    +
    +
    +
    +{% endblock %} diff --git a/mkdocs/overrides/landing.html b/mkdocs/overrides/landing.html new file mode 100644 index 0000000000..34ada26ccb --- /dev/null +++ b/mkdocs/overrides/landing.html @@ -0,0 +1,5 @@ +{% extends "main.html" %} + +{% block header %} + {% include "header-2.html" %} +{% endblock %} diff --git a/mkdocs/overrides/main.html b/mkdocs/overrides/main.html new file mode 100644 index 0000000000..0e56191e19 --- /dev/null +++ b/mkdocs/overrides/main.html @@ -0,0 +1,277 @@ +{% extends "base.html" %} + +{% block extrahead %} + + + + +{# + Structured data (JSON-LD) for SEO. + - Homepage gets WebSite + Organization schema (helps Google show sitelinks and knowledge panel). + - All other pages get BreadcrumbList schema (helps Google show breadcrumb trails in results). + Breadcrumb URLs are resolved via _find_leaf_url because MkDocs nav sections don't have + their own URLs — we use the first descendant page's URL as a proxy. + Dedup by title to collapse nav levels duplicated by plugins (e.g. the blog plugin nests + "Blog" inside "Blog"). The current page is omitted when its title matches the last + ancestor (e.g. /examples/ is both the "Examples" section index and the page itself). +#} +{% macro _find_leaf_url(nav_item) -%} + {%- if nav_item.url -%} + /{{ nav_item.url }} + {%- elif nav_item.children -%} + {{ _find_leaf_url(nav_item.children | first) }} + {%- endif -%} +{%- endmacro %} +{% if page.is_homepage %} + +{% elif page.ancestors | length > 0 %} + +{% endif %} +{% endblock %} + +{% block container %} +
    + {% if "navigation.path" in features %} + {% include "path.html" %} + {% endif %} +
    + {% block content %} + {% include "partials/content.html" %} + + {% if page.previous_page or page.next_page %} + {% if page.meta and page.meta.hide %} + {% set hidden = "hidden" if "footer" in page.meta.hide %} + {% endif %} + + {% endif %} + {% endblock %} +
    +
    +{% endblock %} + +{% block header %} + {% include "header-2.html" %} +{% endblock %} + +{% block scripts %} + + +{{ super() }} + +{% endblock %} + +{% block announce %} +Infrastructure orchestration is an agent skill +{% endblock %} + +{% block footer %} + +{% endblock %} + +{% block site_nav %} + {% if nav %} + {% if page.meta and page.meta.hide %} + {% set hidden = "hidden" if "navigation" in page.meta.hide %} + {% endif %} + + {% endif %} + {% if "toc.integrate" not in features %} + {% if page.meta and page.meta.hide %} + {% set hidden = "hidden" if "toc" in page.meta.hide %} + {% endif %} + + {% endif %} +{% endblock %} diff --git a/mkdocs/overrides/partials/post.html b/mkdocs/overrides/partials/post.html new file mode 100644 index 0000000000..27589c4dca --- /dev/null +++ b/mkdocs/overrides/partials/post.html @@ -0,0 +1,69 @@ +{#- + This file was automatically generated - do not edit +-#} +
    +
    + {% if post.authors %} + + {% endif %} + +
    +
    + {{ post.content }} + {% if post.more %} + + {% endif %} + {% if post.config.pin %} +
    + {% endif %} +
    +
    diff --git a/mkdocs/overrides/path.html b/mkdocs/overrides/path.html new file mode 100644 index 0000000000..41f860d17e --- /dev/null +++ b/mkdocs/overrides/path.html @@ -0,0 +1,45 @@ +{#- + This file was automatically generated - do not edit +-#} +{% import "partials/path-item.html" as item with context %} +{% if page.meta and page.meta.hide %} + {% set hidden = "hidden" if "path" in page.meta.hide %} +{% endif %} +{% set depth = page.ancestors | length %} +{% if nav.homepage %} + {% set depth = depth + 1 %} +{% endif %} +{% macro arender_content(nav_item) %} + + {{ nav_item.title }} + +{% endmacro %} +{% macro arender(nav_item, ref = nav_item) %} + {% if nav_item.children %} + {% set first = nav_item.children | first %} + {% if first.children %} + {{ arender(first, ref) }} + {% else %} +
  • + + {{ arender_content(ref) }} + +
  • + {% endif %} + {% else %} +
  • + + {{ arender_content(ref) }} + +
  • + {% endif %} +{% endmacro %} +{% if depth > 1 and page.ancestors | length > 1 %} + +{% endif %} diff --git a/docs/overrides/pricing.html b/mkdocs/overrides/pricing.html similarity index 100% rename from docs/overrides/pricing.html rename to mkdocs/overrides/pricing.html diff --git a/docs/overrides/privacy.html b/mkdocs/overrides/privacy.html similarity index 99% rename from docs/overrides/privacy.html rename to mkdocs/overrides/privacy.html index 506ffdbe6b..e9039aa56b 100644 --- a/docs/overrides/privacy.html +++ b/mkdocs/overrides/privacy.html @@ -131,4 +131,4 @@

    Children's Information




    -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/mkdocs/overrides/toc-item.html b/mkdocs/overrides/toc-item.html new file mode 100644 index 0000000000..a4618bcbde --- /dev/null +++ b/mkdocs/overrides/toc-item.html @@ -0,0 +1,25 @@ +{#- + This file was automatically generated - do not edit +-#} +
  • + + + {% if toc_item.typeset %} + + {{ toc_item.typeset.title }} + + {% else %} + {{ toc_item.title }} + {% endif %} + + + {% if toc_item.children %} + + {% endif %} +
  • diff --git a/mkdocs/overrides/toc.html b/mkdocs/overrides/toc.html new file mode 100644 index 0000000000..577f4988ac --- /dev/null +++ b/mkdocs/overrides/toc.html @@ -0,0 +1,25 @@ +{#- + This file was automatically generated - do not edit +-#} +{% set title = lang.t("toc") %} +{% if config.mdx_configs.toc and config.mdx_configs.toc.title %} + {% set title = config.mdx_configs.toc.title %} +{% endif %} + diff --git a/docs/privacy.md b/mkdocs/privacy.md similarity index 100% rename from docs/privacy.md rename to mkdocs/privacy.md diff --git a/mkdocs/robots.txt b/mkdocs/robots.txt new file mode 100644 index 0000000000..b38b74b551 --- /dev/null +++ b/mkdocs/robots.txt @@ -0,0 +1,4 @@ +User-agent: * +Allow: / + +Sitemap: https://dstack.ai/sitemap.xml diff --git a/mkdocs/snippets/kubernetes/dstack-backend-clusterrole.yaml b/mkdocs/snippets/kubernetes/dstack-backend-clusterrole.yaml new file mode 100644 index 0000000000..a808595839 --- /dev/null +++ b/mkdocs/snippets/kubernetes/dstack-backend-clusterrole.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: dstack-backend +rules: + - apiGroups: [""] + resources: ["namespaces"] + # "create" is only needed if dstack should create the namespace. + # Could be dropped if the namespace already exists. 
+ verbs: ["get", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list"] diff --git a/mkdocs/snippets/kubernetes/dstack-backend-role.yaml b/mkdocs/snippets/kubernetes/dstack-backend-role.yaml new file mode 100644 index 0000000000..7cc37fbc16 --- /dev/null +++ b/mkdocs/snippets/kubernetes/dstack-backend-role.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: dstack-backend + namespace: ${NAMESPACE} +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "create", "delete"] + - apiGroups: [""] + resources: ["services"] + verbs: ["get", "create", "delete"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["create", "delete"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "create", "delete"] diff --git a/mkdocs/terms.md b/mkdocs/terms.md new file mode 100644 index 0000000000..27188750e3 --- /dev/null +++ b/mkdocs/terms.md @@ -0,0 +1,194 @@ +--- +hide: + - navigation + - footer +--- + +# Terms of Service + +**Last updated:** May 14, 2026 + +## 1. About these Terms + +These Terms of Service ("**Terms**") form a binding agreement between you, whether on your own behalf or on behalf of an entity ("**you**" or "**Customer**"), and **dstack Inc.**, a Delaware corporation with offices at 8 The Green, #23725, Dover, DE 19901, United States ("**dstack**," "**we**," "**us**"). dstack Inc. is the contracting party under these Terms. + +By accessing or using the Service (defined below), you accept these Terms. If you do not accept them, do not use the Service. + +## 2. What these Terms cover + +These Terms govern your access to and use of: + +- the dstack website at dstack.ai; and +- the hosted service offered as **dstack Sky**, including its Bring Your Own Cloud (BYOC) and GPU Marketplace modes + +(together, the "**Service**"). 
+ +These Terms **do not** cover: + +- **dstack OSS** — the open-source dstack project is licensed under the Mozilla Public License 2.0. Your use of dstack OSS is governed solely by that license, which is included in the [project repository](https://github.com/dstackai/dstack). +- **dstack Enterprise** — commercial deployments of dstack Enterprise are governed by a separate written enterprise agreement between you and dstack. These Terms do not apply to dstack Enterprise. + +## 3. The Service + +dstack Sky is an orchestration service for GPU compute workloads. It is offered in two modes. + +**BYOC (Bring Your Own Cloud).** You configure dstack Sky to provision and manage compute and storage on cloud accounts you own or control (for example AWS, GCP, Azure, or other supported providers). In BYOC mode, **dstack does not bill you for compute, storage, or other cloud resources** — those are billed by the upstream cloud provider directly to your cloud account, under your agreement with that provider. You are solely responsible for all charges, resources, and data on your cloud accounts, and your relationship with each underlying cloud provider is governed exclusively by your agreement with that provider. + +**GPU Marketplace.** In the GPU Marketplace, dstack relies on third-party cloud providers integrated into the marketplace to supply compute and storage capacity, which dstack makes available to you through dstack Sky. Pricing is set dynamically per provider and is shown in the console at the time of provisioning. Charges are deducted from your prepaid credit balance (see Section 5). + +## 4. Your account + +To use dstack Sky, you must register an account. 
You agree to: + +- provide accurate registration information and keep it current; +- keep your credentials confidential and be responsible for all activity under your account; and +- meet the age of majority in your jurisdiction and, if registering on behalf of an entity, have authority to bind that entity to these Terms. + +We may suspend or terminate accounts that contain materially inaccurate information or that breach these Terms. + +## 5. Pricing, credits, and payment (GPU Marketplace) + +**Credits.** You pay for GPU Marketplace usage by adding funds to a prepaid balance on your dstack Sky account. We refer to this balance as "**credits**." Credits are deducted as you consume compute and storage resources at the then-current prices. + +**How marketplace pricing works.** In the GPU Marketplace, dstack relies on third-party cloud providers integrated into the marketplace to supply compute and storage capacity, which dstack makes available to you through dstack Sky. **dstack — not the upstream cloud provider — bills you for marketplace usage**, by deducting amounts from your credit balance. The price for each compute or storage resource is set per provider and reflects dstack's then-current commercial arrangements with that upstream provider, or, where dstack has no such arrangement with that provider, dstack's then-current pricing for that resource. Pricing for each resource is shown in the console before you provision it. + +**Price changes.** dstack reserves the right to change marketplace prices at any time, without prior notice, including in response to: changes in an upstream provider's pricing or terms; changes in, or termination of, dstack's commercial arrangements with an upstream provider; the absence of an agreement between dstack and an upstream provider; changes in availability, supply, or demand for any resource; or any other reason at dstack's discretion. The applicable price is the price in effect at the time the resource is consumed. 
+ +**Payment.** We accept payment by major credit and debit cards through our third-party payment processor (currently Stripe). We may change accepted payment methods and payment processors at any time. By providing payment information, you authorize us and our payment processor to charge your selected method for credits you purchase. All amounts are in US dollars unless otherwise stated. + +**Taxes.** Prices are exclusive of taxes. You are responsible for any sales, use, value-added, or similar taxes, other than taxes on dstack's net income, and we may add such taxes to invoices where required by law. + +**No refunds.** Credits and any other amounts you pay to dstack are **non-refundable**, are not redeemable for cash, and have no monetary value outside the Service. We do not refund credits for any reason, including unused balances at account closure, marketplace price changes, service interruptions, or any act, omission, outage, or other issue caused by or attributable to an upstream cloud provider. Unused credits are forfeited upon account closure or termination (see Section 8). + +**BYOC.** dstack does not currently charge for use of dstack Sky in BYOC mode. + +## 6. Customer content and intellectual property + +**Your content remains yours.** You retain all rights, title, and interest in the code, configurations, model artifacts, datasets, logs, outputs, and other content that you upload to, generate through, or process using the Service ("**Customer Content**"). You grant dstack a worldwide, non-exclusive, royalty-free license to host, copy, transmit, display, and process Customer Content solely to provide and operate the Service and to comply with applicable law. + +**Your responsibility for Customer Content.** You are solely responsible for Customer Content, including its lawfulness, accuracy, and the rights you have in it. 
You represent and warrant that Customer Content does not infringe any third party's rights and that you have all necessary rights, consents, and authorizations to use it through the Service. + +**Feedback.** If you provide us with suggestions, ideas, bug reports, or other feedback about the Service ("**Feedback**"), you grant dstack a perpetual, irrevocable, royalty-free, worldwide license to use, modify, and incorporate the Feedback into our products and services without compensation. Feedback **does not include** Customer Content, code you author, configurations you create, datasets you upload, or model artifacts; dstack does not claim any right to any of these. + +**Our IP.** Except for dstack OSS (which is governed by its open-source license), the dstack Sky platform, the dstack website, the dstack name and logos, and all related software and documentation are owned by dstack or its licensors. We grant you a limited, non-exclusive, non-transferable, revocable right to use the Service in accordance with these Terms; no other rights are granted, whether by implication, estoppel, or otherwise. + +## 7. 
Acceptable use + +You may not, and may not permit or enable others to: + +- use the Service in violation of any applicable law or regulation; +- infringe the intellectual property, privacy, publicity, or other rights of any third party; +- mine, generate, or otherwise produce cryptocurrency or perform any cryptocurrency-related proof-of-work workloads — **cryptocurrency mining is expressly prohibited**; +- generate, store, transmit, or facilitate child sexual abuse material (CSAM) or any content that sexualizes minors, content that promotes terrorism or violent extremism, or content depicting non-consenting individuals; +- interfere with or disrupt the Service or attempt to gain unauthorized access to the Service, to other customers' workloads, or to any underlying systems or cloud-provider resources; +- probe, scan, or test the vulnerability of the Service except as expressly authorized by us in writing; +- circumvent rate limits, security controls, quotas, or other technical limitations of the Service; +- abuse, exhaust, or unreasonably burden the resources of dstack or any upstream cloud provider, including by triggering provider throttling or bans that affect other customers; +- impersonate any person or entity or misrepresent your affiliation; +- resell or commercially exploit the Service except as expressly permitted by us in writing; or +- scrape, crawl, or systematically extract content from the Service other than through documented APIs and within their published limits. + +**Enforcement.** If we reasonably believe you are violating this Section, we may, in our discretion and with or without notice, suspend or terminate your access to the Service, remove or quarantine Customer Content, throttle workloads, or take other reasonable steps. For clearly unlawful activity, we will act immediately. + +## 8. 
Suspension and termination + +**By you.** You may stop using the Service and close your account at any time through the console or by contacting hello@dstack.ai. + +**By us.** We may suspend or terminate your account, with or without notice, if: (a) you breach, or we reasonably suspect you have breached, these Terms; (b) we reasonably believe your use poses a security, legal, or operational risk to dstack, to other customers, or to any upstream provider; (c) your credit balance is insufficient to cover usage; (d) required by law; or (e) we discontinue the Service or any material part of it. + +We may suspend your access to the Service while we investigate any suspected breach of these Terms, and we are not obligated to share the results of any such investigation. + +**Effect of termination.** On termination of your account: + +- your right to use the Service ends immediately; +- any active workloads may be stopped; +- Customer Content stored within dstack Sky may be deleted in accordance with our standard retention practices, typically within 30 days of account closure (longer where required by law or legal hold); +- unused credits are forfeited, unless otherwise required by applicable law; and +- accrued payment obligations, together with Sections 5 (taxes and no refunds), 6 (Customer content and intellectual property), 8 (this section), 9 (disclaimers), 10 (limitation of liability), 11 (indemnification), 12 (export control), 13 (privacy), 15 (governing law and dispute resolution), and 16 (miscellaneous), survive any termination or expiration of these Terms. + +We will use commercially reasonable efforts to allow you to export Customer Content before deletion, except where termination is due to your material breach or to unlawful activity. + +## 9. 
Disclaimers + +THE SERVICE IS PROVIDED **"AS IS" AND "AS AVAILABLE."** TO THE MAXIMUM EXTENT PERMITTED BY LAW, DSTACK DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ACCURACY, AND ANY WARRANTIES ARISING FROM COURSE OF DEALING, USAGE, OR TRADE PRACTICE. + +WITHOUT LIMITING THE ABOVE, DSTACK DOES NOT WARRANT THAT: + +- THE SERVICE WILL BE UNINTERRUPTED, TIMELY, SECURE, OR ERROR-FREE; +- ANY DATA STORED OR PROCESSED THROUGH THE SERVICE WILL BE PRESERVED, AVAILABLE, OR RECOVERABLE; +- ANY PARTICULAR COMPUTE CAPACITY, GPU TYPE, REGION, OR PRICE WILL BE AVAILABLE FROM ANY UPSTREAM CLOUD PROVIDER; OR +- WORKLOADS WILL PRODUCE ANY PARTICULAR RESULT, INCLUDING WITH RESPECT TO MACHINE-LEARNING TRAINING OR INFERENCE OUTPUTS. + +DSTACK DOES NOT OFFER A SERVICE-LEVEL AGREEMENT OR UPTIME COMMITMENT UNDER THESE TERMS. ANY SLA OR SERVICE COMMITMENT MUST BE SEPARATELY AGREED IN WRITING (FOR EXAMPLE, UNDER A DSTACK ENTERPRISE AGREEMENT). + +**Not tailored to industry-specific regulations.** The Service is not designed or tailored to comply with industry-specific laws or regulations such as the Health Insurance Portability and Accountability Act (HIPAA), the Federal Information Security Management Act (FISMA), or the Gramm-Leach-Bliley Act (GLBA). You must not use the Service to process data subject to these or similar regulations unless we have separately agreed in writing to support your specific compliance requirements. + +## 10. Limitation of liability + +**No liability for data, content, or upstream providers.** TO THE MAXIMUM EXTENT PERMITTED BY LAW, DSTACK WILL NOT BE LIABLE FOR (A) ANY LOSS, CORRUPTION, INACCESSIBILITY, OR DELETION OF DATA OR CUSTOMER CONTENT, OR (B) ANY ACT, OMISSION, FAILURE, INTERRUPTION, OUTAGE, SECURITY INCIDENT, BREACH, OR DATA LOSS CAUSED BY OR ATTRIBUTABLE TO ANY UPSTREAM CLOUD PROVIDER USED THROUGH THE SERVICE (IN BYOC OR GPU MARKETPLACE MODE). 
+ +**Excluded damages.** TO THE MAXIMUM EXTENT PERMITTED BY LAW, DSTACK WILL NOT BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL, CONSEQUENTIAL, EXEMPLARY, OR PUNITIVE DAMAGES, OR FOR ANY LOSS OF PROFITS, REVENUE, BUSINESS, USE, GOODWILL, OR ANTICIPATED SAVINGS, ARISING OUT OF OR RELATING TO THESE TERMS, THE SERVICE, OR YOUR USE OF OR INABILITY TO USE THE SERVICE, EVEN IF DSTACK HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +**Allocation of risk.** YOU ACKNOWLEDGE THAT THE PRICING AND ALLOCATION OF RISK IN THESE TERMS REFLECT AN AGREED COMMERCIAL UNDERSTANDING, AND THAT WITHOUT THIS ALLOCATION DSTACK WOULD NOT PROVIDE THE SERVICE ON THESE TERMS. + +**Carve-outs.** Nothing in these Terms excludes or limits any liability that cannot be excluded or limited under applicable law (for example, liability for fraud, fraudulent misrepresentation, or, in certain jurisdictions, gross negligence or willful misconduct). + +## 11. Indemnification + +You will defend, indemnify, and hold harmless dstack and its affiliates, and their respective officers, directors, employees, and agents, from and against any third-party claim, demand, action, or proceeding, and any related losses, damages, liabilities, judgments, settlements, fines, costs, and reasonable attorneys' fees, arising out of or relating to: + +- your Customer Content; +- your use of the Service; +- your breach of these Terms (including the Acceptable Use section); or +- your violation of any law or any rights of any third party. + +We may assume exclusive defense and control of any matter for which you are required to indemnify us, in which case you will cooperate at your expense. + +## 12. Export control and compliance + +You represent and warrant that your use of the Service complies with all applicable export-control, sanctions, and trade laws and regulations, including those administered by the U.S. Department of the Treasury's Office of Foreign Assets Control (OFAC) and the U.S. 
Department of Commerce's Bureau of Industry and Security (BIS), and any equivalent authorities in your jurisdiction. You may not use the Service if you are located in, ordinarily resident in, or organized under the laws of a country or region subject to comprehensive U.S. sanctions, and you may not provide access to the Service to any person on a U.S. or applicable foreign government's restricted-party list. + +## 13. Privacy + +Personal data we collect through the Service is handled in accordance with our [Privacy Policy](https://dstack.ai/privacy/), which is incorporated into these Terms by reference. By using the Service, you consent to the collection, processing, and transfer of your personal data as described in the Privacy Policy. + +## 14. Modifications + +We may change these Terms from time to time. The "Last updated" date at the top of these Terms indicates when they were most recently changed. We reserve the right to update these Terms and, for material changes, will use commercially reasonable efforts to inform you in advance — for example, by email to the address on your account or by an in-product notice. Your continued use of the Service after a change takes effect constitutes acceptance of the updated Terms. If you do not agree to a change, you must stop using the Service. + +Changes to dstack Sky pricing are governed by Section 5 (Pricing, credits, and payment) and are not subject to the advance-notice provision of this Section. + +## 15. Governing law and dispute resolution + +**Governing law.** These Terms are governed by the laws of the State of Delaware, USA, without regard to its conflict-of-laws principles. The United Nations Convention on Contracts for the International Sale of Goods does not apply. 
+ +**Informal resolution.** Before bringing a formal dispute, the parties agree to first attempt to resolve any dispute, claim, or controversy arising out of or relating to these Terms or the Service (a "**Dispute**") through good-faith informal negotiation for at least thirty (30) days following written notice from one party to the other. + +**Binding arbitration.** Any Dispute not resolved through informal negotiation will be resolved by **final and binding individual arbitration** administered by the American Arbitration Association (AAA) under its Commercial Arbitration Rules and, where applicable, its Supplementary Procedures for Consumer-Related Disputes. The seat of arbitration is Wilmington, Delaware. The arbitration will be conducted in English. Judgment on the arbitrator's award may be entered in any court of competent jurisdiction. + +**Class-action waiver.** Any arbitration or proceeding under these Terms will be conducted on an **individual basis only**. The parties waive any right to bring or participate in any class, consolidated, or representative action. If this waiver is held unenforceable as to any Dispute, that Dispute will be heard in court rather than in arbitration. + +**Carve-outs.** The following are not subject to arbitration and may be brought in the state or federal courts located in Delaware: (a) claims to enforce or protect intellectual property rights; (b) claims for injunctive or equitable relief; and (c) claims that may be brought in small-claims court if they qualify. + +## 16. Miscellaneous + +**Entire agreement.** These Terms (together with any documents referenced in them) constitute the entire agreement between you and dstack regarding the Service and supersede any prior agreements on the same subject. + +**Assignment.** You may not assign these Terms without our prior written consent. We may assign these Terms in connection with a merger, acquisition, financing, sale of assets, or by operation of law. 
+ +**Severability.** If any provision of these Terms is held unenforceable, the remaining provisions will continue in full force, and the unenforceable provision will be modified to the minimum extent necessary to make it enforceable while preserving its intent. + +**No waiver.** Our failure to enforce any provision is not a waiver of that or any other provision. + +**No agency.** Nothing in these Terms creates an agency, partnership, joint venture, or employment relationship between you and dstack. + +**Force majeure.** Neither party is liable for any failure or delay in performance caused by circumstances beyond its reasonable control. + +**Electronic communications.** You consent to receive communications from us electronically. Agreements, notices, disclosures, and other communications that we provide electronically satisfy any legal requirement that they be in writing. + +**Contact.** You can contact us about these Terms at **hello@dstack.ai** or at: + +> dstack Inc.\ +> 8 The Green, #23725\ +> Dover, DE 19901\ +> United States diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..ca08438931 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,271 @@ +[project] +name = "dstack" +dynamic = ["version", "readme"] +authors = [{ name = "Andrey Cheptsov", email = "andrey@dstack.ai" }] +description = "dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises." 
+requires-python = ">=3.10" +classifiers = [ + "Development Status :: 4 - Beta", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Programming Language :: Python :: 3", +] +dependencies = [ + "pyyaml", + "requests", + "typing-extensions>=4.0.0", + "cryptography", + "packaging", + "python-dateutil", + "cachetools<7.1.0", # Pin to work around https://github.com/tkem/cachetools/issues/394 + "gitpython", + "jsonschema", + "paramiko>=3.2.0", + "cursor", + "rich", + "rich-argparse", + "tqdm", + "questionary>=2.0.1", + "pydantic>=1.10.10,<2.0.0", + "pydantic-duality>=1.2.4", + "websocket-client", + "python-multipart>=0.0.16", + "filelock", + "psutil", + "gpuhunt==0.1.25", + "argcomplete>=3.5.0", + "ignore-python>=0.2.0", + "orjson", + "apscheduler<4", +] + +[project.urls] +Homepage = "https://dstack.ai" +Source = "https://github.com/dstackai/dstack" +Documentation = "https://dstack.ai/docs" +Issues = "https://github.com/dstackai/dstack/issues" +Changelog = "https://github.com/dstackai/dstack/releases" +Discord = "https://discord.gg/u8SmfwPpMd" + +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme"] +build-backend = "hatchling.build" + +[project.scripts] +dstack = "dstack._internal.cli.main:main" + +[tool.hatch.version] +path = "src/dstack/version.py" + +[tool.hatch.build.targets.sdist] +artifacts = [ + "src/dstack/_internal/server/statics/**", +] + +[tool.hatch.build.targets.wheel] +artifacts = [ + "src/dstack/_internal/server/statics/**", +] + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +path = "README.md" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.substitutions]] +pattern = 
'\s*|]*>\s*|\s*|]*>\s*|\s*|### Demo\s*' +replacement = '' +ignore-case = true + +[tool.uv.sources] +dstack-plugin-server = { path = "examples/plugins/example_plugin_server", editable = true } + +[tool.ruff] +target-version = "py310" +line-length = 99 + +[tool.ruff.lint] +select = ["E", "F", "I", "Q", "W", "PGH", "FLY", "S113"] +ignore = [ + "E501", + "E712", +] + +[tool.ruff.lint.isort] +known-first-party = ["dstack"] + +[tool.pyright] +typeCheckingMode = "standard" +include = [ + "src/dstack/plugins", + "src/dstack/_internal/server", + "src/dstack/_internal/core/services", + "src/dstack/_internal/core/backends/aws", + "src/dstack/_internal/core/backends/kubernetes", + "src/dstack/_internal/core/backends/runpod", + "src/dstack/_internal/cli/services/configurators", + "src/dstack/_internal/cli/commands", + "src/tests/_internal/server/background/pipeline_tasks", +] +ignore = [ + "src/dstack/_internal/server/migrations/versions", +] + +[tool.pytest.ini_options] +testpaths = ["src/tests"] +addopts = [ + "--disable-socket", + "--allow-hosts=127.0.0.1,localhost", + # unix socket for Docker/testcontainers + "--allow-unix-socket", +] +markers = [ + "shim_version", + "dockerized", +] +env = [ + "DSTACK_CLI_RICH_FORCE_TERMINAL=0", + "DSTACK_SSHPROXY_API_TOKEN=test-token", +] +filterwarnings = [ + # testcontainers modules use deprecated decorators – nothing we can do: + # https://github.com/testcontainers/testcontainers-python/issues/874 + "ignore:^The @wait_container_is_ready decorator:DeprecationWarning" +] + +[dependency-groups] +dev = [ + "pre-commit>=4.2.0", + "pytest~=8.0", + "pytest-asyncio>=0.25.2", + "pytest-mock>=3.14.0", + "pytest-httpbin>=2.1.0", + "pytest-socket>=0.7.0", + "pytest-env>=1.1.0", + "pytest-unordered>=0.7.0", + "httpbin>=0.10.2", # indirect to make compatible with Werkzeug 3 + "requests-mock>=1.12.1", + "openai>=1.68.2", + "freezegun>=1.5.1", + "ruff==0.12.7", # should match .pre-commit-config.yaml + 
"testcontainers>=4.9.2", + "pytest-xdist>=3.6.1", + "pyinstrument>=5.0.0", + "kubernetes-stubs-elephant-fork>=35.0.0.post3", + {include-group = "docs"}, +] +docs = [ + "dstack[server]", + "dstack-plugin-server", + "pillow", + "cairosvg", + "mkdocs-material>=9.7.0", + "mkdocs-material[imaging]", + "mkdocs-material-extensions", + "mkdocs-redirects", + "mkdocs-gen-files", + "mkdocstrings[python]", + "mkdocs-render-swagger-plugin", +] + +[project.optional-dependencies] +gateway = [ + "fastapi", + "starlette>=0.26.0", + "uvicorn", + "aiorwlock", + "aiocache", + "httpx>=0.28.0", + "jinja2", +] +server = [ + "fastapi", + "starlette>=0.26.0", + "uvicorn[standard]", + "aiorwlock", + "aiocache", + "httpx>=0.28.0", + "requests-unixsocket>=0.4.1", + "jinja2", + "watchfiles", + "sqlalchemy[asyncio]>=2.0.0", + "sqlalchemy_utils>=0.40.0", + "alembic>=1.16.0", + "aiosqlite", + "docker>=6.0.0", + "python-dxf>=12.1.1", + "sentry-sdk[fastapi]>=2.27.0", + "alembic-postgresql-enum", + "asyncpg", + "python-json-logger>=3.1.0", + "prometheus-client", + "grpcio>=1.81.0", + "protobuf>=6.33.5", + "smg-grpc-proto==0.4.9", # Pin to avoid grpcio version mismatch when smg-grpc-proto bumps grpcio dependency https://github.com/dstackai/dstack/pull/3971 +] +aws = [ + "boto3>=1.38.13", + "botocore", + "dstack[server]", +] +azure = [ + "azure-identity>=1.12.0", + "azure-mgmt-subscription>=3.1.1", + "azure-mgmt-compute>=29.1.0", + "azure-mgmt-network>=23.0.0,<28.0.0", + "azure-mgmt-resource>=22.0.0", + "azure-mgmt-authorization>=3.0.0", + "azure-mgmt-msi>=7.0.0", + "dstack[server]", +] +gcp = [ + "google-auth>=2.3.0", + "google-cloud-storage>=2.0.0", + "google-cloud-compute>=1.5.0", + "google-cloud-logging>=2.0.0", + "google-api-python-client>=2.80.0", + "google-cloud-billing>=1.11.0", + "google-cloud-tpu>=1.18.3", + "dstack[server]", +] +datacrunch = [ + "verda>=1.23.0", + "dstack[server]", +] +verda = [ + "verda>=1.23.0", + "dstack[server]", +] +kubernetes = [ + 
"kubernetes", + "dstack[server]", +] +lambda = [ + "boto3>=1.38.13", + "botocore", + "dstack[server]", +] +oci = [ + "oci>=2.150.0", + "cryptography>=44.0.3", + # pyopenssl is indirect to avoid uv falling back to the old version + # due to an upper limit from oci + "pyopenssl>=23.2.0", + "dstack[server]", +] +nebius = [ + "nebius>=0.3.4,<0.4", + "dstack[server]", +] +fluentbit = [ + "fluent-logger>=0.10.0", + "elasticsearch>=8.0.0", + "dstack[server]", +] +crusoe = [ + "dstack[server]", +] +all = [ + "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci,crusoe,fluentbit]", +] diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 48905b799e..0000000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,6 +0,0 @@ -pre-commit -httpx>=0.23 -pytest~=7.2 -pytest-asyncio>=0.21 -freezegun>=1.2.0 -ruff==0.4.8 diff --git a/ruff.toml b/ruff.toml deleted file mode 100644 index 22ad866902..0000000000 --- a/ruff.toml +++ /dev/null @@ -1,13 +0,0 @@ -target-version = "py38" -line-length = 99 - -[lint] -select = ['E', 'F', 'I' ,'Q', 'W', 'PGH', 'FLY', 'S113'] -ignore =[ - 'E501', - 'E712', -] - -[lint.isort] -known-first-party = ["dstack"] -known-third-party = ["mkdocs_gen_files", "datacrunch"] diff --git a/runner/.golangci-lint.yml b/runner/.golangci-lint.yml deleted file mode 100644 index ee2bb3814b..0000000000 --- a/runner/.golangci-lint.yml +++ /dev/null @@ -1,379 +0,0 @@ -run: - # default concurrency is a available CPU number - concurrency: 4 - - # timeout for analysis, e.g. 
30s, 5m, default is 1m - timeout: 1m - - # exit code when at least one issue was found, default is 1 - issues-exit-code: 0 - - # include test files or not, default is true - tests: false - - skip-dirs: - - data - -# output configuration options -output: - # colored-line-number|line-number|json|tab|checkstyle|code-climate|junit-xml|github-actions - # default is "colored-line-number" - format: colored-line-number - -# all available settings of specific linters -linters-settings: - - cyclop: - # the maximal code complexity to report - max-complexity: 10 - # the maximal average package complexity. If it's higher than 0.0 (float) the check is enabled (default 0.0) - package-average: 0.0 - # should ignore tests (default false) - skip-tests: false - - errcheck: - # report about not checking of errors in type assertions: `a := b.(MyStruct)`; - # default is false: such cases aren't reported by default. - check-type-assertions: false - - # report about assignment of errors to blank identifier: `num, _ := strconv.Atoi(numStr)`; - # default is false: such cases aren't reported by default. - check-blank: false - - - errorlint: - # Check whether fmt.Errorf uses the %w verb for formatting errors. See the readme for caveats - errorf: true - # Check for plain type assertions and type switches - asserts: true - # Check for plain error comparisons - comparison: true - - exhaustive: - # check switch statements in generated files also - check-generated: false - # indicates that switch statements are to be considered exhaustive if a - # 'default' case is present, even if all enum members aren't listed in the - # switch - default-signifies-exhaustive: false - - exhaustivestruct: - # Struct Patterns is list of expressions to match struct packages and names - # The struct packages have the form example.com/package.ExampleStruct - # The matching patterns can use matching syntax from https://pkg.go.dev/path#Match - # If this list is empty, all structs are tested. 
- struct-patterns: - - '*.Test' - - 'example.com/package.ExampleStruct' - - funlen: - lines: 60 - statements: 40 - - gocognit: - # minimal code complexity to report, 30 by default (but we recommend 10-20) - min-complexity: 10 - - nestif: - # minimal complexity of if statements to report, 5 by default - min-complexity: 4 - - goconst: - # minimal length of string constant, 3 by default - min-len: 3 - # minimal occurrences count to trigger, 3 by default - min-occurrences: 3 - - gocritic: - # Which checks should be enabled; can't be combined with 'disabled-checks'; - # See https://go-critic.github.io/overview#checks-overview - # To check which checks are enabled run `GL_DEBUG=gocritic golangci-lint run` - # By default list of stable checks is used. - enabled-checks: - - rangeValCopy - - # Which checks should be disabled; can't be combined with 'enabled-checks'; default is empty - disabled-checks: - - regexpMust - - # Enable multiple checks by tags, run `GL_DEBUG=gocritic golangci-lint run` to see all tags and checks. - # Empty list by default. See https://github.com/go-critic/go-critic#usage -> section "Tags". - enabled-tags: - - performance - disabled-tags: - - experimental - - # Settings passed to gocritic. - # The settings key is the name of a supported gocritic checker. - # The list of supported checkers can be find in https://go-critic.github.io/overview. 
- settings: - captLocal: # must be valid enabled check name - # whether to restrict checker to params only (default true) - paramsOnly: true - elseif: - # whether to skip balanced if-else pairs (default true) - skipBalanced: true - hugeParam: - # size in bytes that makes the warning trigger (default 80) - sizeThreshold: 80 - nestingReduce: - # min number of statements inside a branch to trigger a warning (default 5) - bodyWidth: 5 - rangeExprCopy: - # size in bytes that makes the warning trigger (default 512) - sizeThreshold: 512 - # whether to check test functions (default true) - skipTestFuncs: true - rangeValCopy: - # size in bytes that makes the warning trigger (default 128) - sizeThreshold: 32 - # whether to check test functions (default true) - skipTestFuncs: true - truncateCmp: - # whether to skip int/uint/uintptr types (default true) - skipArchDependent: true - underef: - # whether to skip (*x).method() calls where x is a pointer receiver (default true) - skipRecvDeref: true - unnamedResult: - # whether to check exported functions - checkExported: true - - gocyclo: - # minimal code complexity to report, 30 by default (but we recommend 10-20) - min-complexity: 10 - - gofmt: - # simplify code: gofmt with `-s` option, true by default - simplify: true - - gofumpt: - # Choose whether or not to use the extra rules that are disabled - # by default - extra-rules: false - - goimports: - # put imports beginning with prefix after 3rd-party packages; - # it's a comma-separated list of prefixes - local-prefixes: github.com/dstackai/dstackai - - golint: - # minimal confidence for issues, default is 0.8 - min-confidence: 0.8 - - gosec: - # To select a subset of rules to run. - # Available rules: https://github.com/securego/gosec#available-rules - includes: - - G401 - - G306 - - G101 - # To specify a set of rules to explicitly exclude. 
- # Available rules: https://github.com/securego/gosec#available-rules - excludes: - - G204 - # To specify the configuration of rules. - # The configuration of rules is not fully documented by gosec: - # https://github.com/securego/gosec#configuration - # https://github.com/securego/gosec/blob/569328eade2ccbad4ce2d0f21ee158ab5356a5cf/rules/rulelist.go#L60-L102 - config: - G306: "0600" - G101: - pattern: "(?i)example" - ignore_entropy: false - entropy_threshold: "80.0" - per_char_threshold: "3.0" - truncate: "32" - - gosimple: - # Select the Go version to target. The default is '1.13'. - go: "1.18" - # https://staticcheck.io/docs/options#checks - checks: [ "all" ] - - govet: - # report about shadowed variables - check-shadowing: true - - # settings per analyzer - settings: - printf: # analyzer name, run `go tool vet help` to see all analyzers - funcs: # run `go tool vet help printf` to see available settings for `printf` analyzer - - (github.com/golangci/golangci-lint/pkg/logutils.Log).Infof - - (github.com/golangci/golangci-lint/pkg/logutils.Log).Warnf - - (github.com/golangci/golangci-lint/pkg/logutils.Log).Errorf - - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf - - # enable or disable analyzers by name - # run `go tool vet help` to see all analyzers - enable: - - atomicalign - enable-all: false - disable: - - shadow - disable-all: false - - ifshort: - # Maximum length of variable declaration measured in number of lines, after which linter won't suggest using short syntax. - # Has higher priority than max-decl-chars. - max-decl-lines: 1 - # Maximum length of variable declaration measured in number of characters, after which linter won't suggest using short syntax. - max-decl-chars: 30 - - importas: - # if set to `true`, force to use alias. 
- no-unaliased: true - # List of aliases - alias: - # using `servingv1` alias for `knative.dev/serving/pkg/apis/serving/v1` package - - pkg: knative.dev/serving/pkg/apis/serving/v1 - alias: servingv1 - # using `autoscalingv1alpha1` alias for `knative.dev/serving/pkg/apis/autoscaling/v1alpha1` package - - pkg: knative.dev/serving/pkg/apis/autoscaling/v1alpha1 - alias: autoscalingv1alpha1 - # You can specify the package path by regular expression, - # and alias by regular expression expansion syntax like below. - # see https://github.com/julz/importas#use-regular-expression for details - - pkg: knative.dev/serving/pkg/apis/(\w+)/(v[\w\d]+) - alias: $1$2 - - lll: - # max line length, lines longer will be reported. Default is 120. - # '\t' is counted as 1 character by default, and can be changed with the tab-width option - line-length: 120 - # tab width in spaces. Default to 1. - tab-width: 1 - - staticcheck: - # Select the Go version to target. The default is '1.13'. - go: "1.18" - # https://staticcheck.io/docs/options#checks - checks: [ "all" ] - - stylecheck: - # Select the Go version to target. The default is '1.13'. 
- go: "1.18" - # https://staticcheck.io/docs/options#checks - checks: [ "all", "-ST1000", "-ST1003", "-ST1016", "-ST1020", "-ST1021", "-ST1022" ] - # https://staticcheck.io/docs/options#dot_import_whitelist - dot-import-whitelist: - - fmt - # https://staticcheck.io/docs/options#initialisms - initialisms: [ "ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "ID", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS" ] - # https://staticcheck.io/docs/options#http_status_code_whitelist - http-status-code-whitelist: [ "200", "400", "404", "500" ] - - tagliatelle: - # check the struck tag name case - case: - # use the struct field name to check the name of the struct tag - use-field-name: true - rules: - # any struct tag type can be used. - # support string case: `camel`, `pascal`, `kebab`, `snake`, `goCamel`, `goPascal`, `goKebab`, `goSnake`, `upper`, `lower` - json: snake - yaml: camel - xml: camel - bson: camel - avro: snake - mapstructure: kebab - - testpackage: - # regexp pattern to skip files - skip-regexp: (export|internal)_test\.go - - thelper: - # The following configurations enable all checks. It can be omitted because all checks are enabled by default. - # You can enable only required checks deleting unnecessary checks. - test: - first: true - name: true - begin: true - benchmark: - first: true - name: true - begin: true - tb: - first: true - name: true - begin: true - - unused: - # Select the Go version to target. The default is '1.13'. 
- go: "1.18" - - whitespace: - multi-if: false # Enforces newlines (or comments) after every multi-line if statement - multi-func: false # Enforces newlines (or comments) after every multi-line function signatur - -linters: - enable: - - megacheck - - govet - disable: - - maligned - - prealloc - disable-all: false - presets: - - bugs - - unused - fast: false - - -issues: - # Excluding configuration per-path, per-linter, per-text and per-source - exclude-rules: - # Exclude some linters from running on tests files. - - path: _test\.go - linters: - - gocyclo - - errcheck - - dupl - - gosec - - # Exclude known linters from partially hard-vendored code, - # which is impossible to exclude via "nolint" comments. - - path: internal/hmac/ - text: "weak cryptographic primitive" - linters: - - gosec - - # Exclude some staticcheck messages - - linters: - - staticcheck - text: "SA9003:" - - # Exclude lll issues for long lines with go:generate - - linters: - - lll - source: "^//go:generate " - - # The list of ids of default excludes to include or disable. By default it's empty. - include: - - EXC0002 # disable excluding of issues about comments from golint - - -severity: - # Default value is empty string. - # Set the default severity for issues. If severity rules are defined and the issues - # do not match or no severity is provided to the rule this will be the default - # severity applied. Severities should match the supported severity names of the - # selected out format. - # - Code climate: https://docs.codeclimate.com/docs/issues#issue-severity - # - Checkstyle: https://checkstyle.sourceforge.io/property_types.html#severity - # - Github: https://help.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-error-message - default-severity: error - - # The default value is false. - # If set to true severity-rules regular expressions become case sensitive. 
- case-sensitive: false - - # Default value is empty list. - # When a list of severity rules are provided, severity information will be added to lint - # issues. Severity rules have the same filtering capability as exclude rules except you - # are allowed to specify one matcher per severity rule. - # Only affects out formats that support setting severity information. - rules: - - linters: - - dupl - severity: info diff --git a/runner/.golangci.yml b/runner/.golangci.yml new file mode 100644 index 0000000000..8a6a47bf1f --- /dev/null +++ b/runner/.golangci.yml @@ -0,0 +1,319 @@ +version: "2" + +run: + # default concurrency is the number of available CPUs + concurrency: 4 + # timeout for analysis, e.g. 30s, 5m, default is 1m + timeout: 1m + # exit code when at least one issue was found, default is 1 + issues-exit-code: 1 + # include test files or not, default is true + tests: false + +linters: + default: none + enable: + # bugs + - asasalint + - asciicheck + - bidichk + - bodyclose + - durationcheck + - errcheck + - errchkjson + - errorlint + - exhaustive + - gocheckcompilerdirectives + - gochecksumtype + - gosec + - gosmopolitan + - govet + - loggercheck + - makezero + - musttag + - nilerr + - nilnesserr + - noctx + - protogetter + - reassign + - recvcheck + - rowserrcheck + - spancheck + - sqlclosecheck + - staticcheck + - testifylint + - zerologlint + # unused + - ineffassign + - unused + # module + - gomoddirectives + - gomodguard + settings: + cyclop: + # the maximal code complexity to report + max-complexity: 10 + # the maximal average package complexity. If it's higher than 0.0 (float) the check is enabled (default 0.0) + package-average: 0.0 + errcheck: + # report about not checking of errors in type assertions: `a := b.(MyStruct)`; + # default is false: such cases aren't reported by default.
+ check-type-assertions: false + # report about assignment of errors to blank identifier: `num, _ := strconv.Atoi(numStr)`; + # default is false: such cases aren't reported by default. + check-blank: false + exclude-functions: + # FIXME: either check for close errors or wrap defer calls into closure with _ = file.Close() + - (*os.File).Close + errorlint: + # Check whether fmt.Errorf uses the %w verb for formatting errors. See the readme for caveats + errorf: true + # Check for plain type assertions and type switches + asserts: true + # Check for plain error comparisons + comparison: true + exhaustive: + # indicates that switch statements are to be considered exhaustive if a + # 'default' case is present, even if all enum members aren't listed in the + # switch + default-signifies-exhaustive: false + funlen: + lines: 60 + statements: 40 + gocognit: + # minimal code complexity to report, 30 by default (but we recommend 10-20) + min-complexity: 10 + nestif: + # minimal complexity of if statements to report, 5 by default + min-complexity: 4 + goconst: + # minimal length of string constant, 3 by default + min-len: 3 + # minimal occurrences count to trigger, 3 by default + min-occurrences: 3 + gocritic: + # Which checks should be enabled; can't be combined with 'disabled-checks'; + # See https://go-critic.github.io/overview#checks-overview + # To check which checks are enabled run `GL_DEBUG=gocritic golangci-lint run` + # By default list of stable checks is used. + enabled-checks: + - rangeValCopy + # Which checks should be disabled; can't be combined with 'enabled-checks'; default is empty + disabled-checks: + - regexpMust + # Enable multiple checks by tags, run `GL_DEBUG=gocritic golangci-lint run` to see all tags and checks. + # Empty list by default. See https://github.com/go-critic/go-critic#usage -> section "Tags". + enabled-tags: + - performance + disabled-tags: + - experimental + # Settings passed to gocritic. 
+ # The settings key is the name of a supported gocritic checker. + # The list of supported checkers can be found in https://go-critic.github.io/overview. + settings: + captLocal: # must be valid enabled check name + # whether to restrict checker to params only (default true) + paramsOnly: true + elseif: + # whether to skip balanced if-else pairs (default true) + skipBalanced: true + hugeParam: + # size in bytes that makes the warning trigger (default 80) + sizeThreshold: 80 + nestingReduce: + # min number of statements inside a branch to trigger a warning (default 5) + bodyWidth: 5 + rangeExprCopy: + # size in bytes that makes the warning trigger (default 512) + sizeThreshold: 512 + # whether to check test functions (default true) + skipTestFuncs: true + rangeValCopy: + # size in bytes that makes the warning trigger (default 128) + sizeThreshold: 32 + # whether to check test functions (default true) + skipTestFuncs: true + truncateCmp: + # whether to skip int/uint/uintptr types (default true) + skipArchDependent: true + underef: + # whether to skip (*x).method() calls where x is a pointer receiver (default true) + skipRecvDeref: true + unnamedResult: + # whether to check exported functions + checkExported: true + gocyclo: + # minimal code complexity to report, 30 by default (but we recommend 10-20) + min-complexity: 10 + gosec: + # To select a subset of rules to run. + # Available rules: https://github.com/securego/gosec#available-rules + includes: + - G401 + - G306 + - G101 + # To specify a set of rules to explicitly exclude. + # Available rules: https://github.com/securego/gosec#available-rules + excludes: + - G306 # Poor file permissions used when writing to a new file + # To specify the configuration of rules.
+ # The configuration of rules is not fully documented by gosec: + # https://github.com/securego/gosec#configuration + # https://github.com/securego/gosec/blob/569328eade2ccbad4ce2d0f21ee158ab5356a5cf/rules/rulelist.go#L60-L102 + config: + G306: "0600" + G101: + pattern: "(?i)example" + ignore_entropy: false + entropy_threshold: "80.0" + per_char_threshold: "3.0" + truncate: "32" + govet: + # settings per analyzer + settings: + printf: # analyzer name, run `go tool vet help` to see all analyzers + funcs: # run `go tool vet help printf` to see available settings for `printf` analyzer + - (github.com/golangci/golangci-lint/pkg/logutils.Log).Infof + - (github.com/golangci/golangci-lint/pkg/logutils.Log).Warnf + - (github.com/golangci/golangci-lint/pkg/logutils.Log).Errorf + - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf + # enable or disable analyzers by name + # run `go tool vet help` to see all analyzers + enable-all: true + disable: + - shadow + - fieldalignment + importas: + # if set to `true`, force to use alias. + no-unaliased: true + # List of aliases + alias: + # using `servingv1` alias for `knative.dev/serving/pkg/apis/serving/v1` package + - pkg: knative.dev/serving/pkg/apis/serving/v1 + alias: servingv1 + # using `autoscalingv1alpha1` alias for `knative.dev/serving/pkg/apis/autoscaling/v1alpha1` package + - pkg: knative.dev/serving/pkg/apis/autoscaling/v1alpha1 + alias: autoscalingv1alpha1 + # You can specify the package path by regular expression, + # and alias by regular expression expansion syntax like below. + # see https://github.com/julz/importas#use-regular-expression for details + - pkg: knative.dev/serving/pkg/apis/(\w+)/(v[\w\d]+) + alias: $1$2 + lll: + # max line length, lines longer will be reported. Default is 120. + # '\t' is counted as 1 character by default, and can be changed with the tab-width option + line-length: 120 + # tab width in spaces. 
Default to 1. + tab-width: 1 + staticcheck: + # https://staticcheck.io/docs/options#checks + checks: [ "all", "-ST1000", "-ST1003", "-ST1016", "-ST1020", "-ST1021", "-ST1022", "-QF1008" ] + # https://staticcheck.io/docs/options#dot_import_whitelist + dot-import-whitelist: + - fmt + # https://staticcheck.io/docs/options#initialisms + initialisms: [ "ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "ID", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS" ] + # https://staticcheck.io/docs/options#http_status_code_whitelist + http-status-code-whitelist: [ "200", "400", "404", "500" ] + tagliatelle: + # check the struct tag name case + case: + # use the struct field name to check the name of the struct tag + use-field-name: true + rules: + # any struct tag type can be used. + # support string case: `camel`, `pascal`, `kebab`, `snake`, `goCamel`, `goPascal`, `goKebab`, `goSnake`, `upper`, `lower` + json: snake + yaml: camel + xml: camel + bson: camel + avro: snake + mapstructure: kebab + testpackage: + # regexp pattern to skip files + skip-regexp: (export|internal)_test\.go + thelper: + # The following configurations enable all checks. It can be omitted because all checks are enabled by default. + # You can enable only required checks by deleting unnecessary checks.
+ test: + first: true + name: true + begin: true + benchmark: + first: true + name: true + begin: true + tb: + first: true + name: true + begin: true + whitespace: + multi-if: false # Enforces newlines (or comments) after every multi-line if statement + multi-func: false # Enforces newlines (or comments) after every multi-line function signature + exclusions: + # Excluding configuration per-path, per-linter, per-text and per-source + rules: + # Exclude some linters from running on test files. + - path: _test\.go + linters: + - gocyclo + - errcheck + - dupl + - gosec + # Exclude known linters from partially hard-vendored code, + # which is impossible to exclude via "nolint" comments. + - path: internal/hmac/ + text: "weak cryptographic primitive" + linters: + - gosec + # Exclude lll issues for long lines with go:generate + - linters: + - lll + source: "^//go:generate " + # Predefined exclusion presets to apply. By default it's empty. + presets: + - comments # suppress findings about missing or malformed doc comments + +formatters: + enable: + - gci + - gofmt + - gofumpt + - goimports + settings: + gci: + sections: + - standard + - default + - localmodule + gofmt: + # simplify code: gofmt with `-s` option, true by default + simplify: true + gofumpt: + # Choose whether or not to use the extra rules that are disabled + # by default + extra-rules: false + goimports: + # put imports beginning with prefix after 3rd-party packages; + local-prefixes: + - github.com/dstackai/dstack/runner + +severity: + # Default value is empty string. + # Set the default severity for issues. If severity rules are defined and the issues + # do not match or no severity is provided to the rule this will be the default + # severity applied. Severities should match the supported severity names of the + # selected output format.
+ # - Code climate: https://docs.codeclimate.com/docs/issues#issue-severity + # - Checkstyle: https://checkstyle.sourceforge.io/property_types.html#severity + # - Github: https://help.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-error-message + default: error + # Default value is empty list. + # When a list of severity rules are provided, severity information will be added to lint + # issues. Severity rules have the same filtering capability as exclude rules except you + # are allowed to specify one matcher per severity rule. + # Only affects out formats that support setting severity information. + rules: + - linters: + - dupl + severity: info diff --git a/runner/.justfile b/runner/.justfile new file mode 100644 index 0000000000..e07a8c8eb7 --- /dev/null +++ b/runner/.justfile @@ -0,0 +1,137 @@ +# Justfile for building and uploading dstack runner and shim +# +# Run `just` to see all available commands +# +# Configuration: +# - DSTACK_SHIM_UPLOAD_VERSION: Version of the runner and shim to upload +# - DSTACK_SHIM_UPLOAD_S3_BUCKET: S3 bucket to upload binaries to +# - DSTACK_SHIM_BUILD_ARCH: Target architecture for runner and shim (defaults to amd64) +# +# Build Process: +# - Runner and shim are always built for linux (GOOS=linux is the only supported OS) +# - The target architecture is configurable via DSTACK_SHIM_BUILD_ARCH (or `just --set arch ...`) +# - CGO is enabled only for native builds (Linux host with a matching architecture); +# otherwise it is disabled and DCGM support is dropped +# +# Development Workflows: +# - Local Development: +# * Use build recipes to build binaries for local testing +# * See README.md for instructions on running dstack server with local binaries +# * No need to upload binaries for local development +# +# - Remote Development: +# * Use upload recipes to build and upload binaries to S3 +# * See README.md for instructions 
on running dstack server with uploaded binaries +# * Upload is required for testing with standard backends (including SSH fleets) + +default: + @just --list + +# Version of the runner and shim to upload +export version := env("DSTACK_SHIM_UPLOAD_VERSION", "0.0.0") + +# S3 bucket to upload binaries to +export s3_bucket := env("DSTACK_SHIM_UPLOAD_S3_BUCKET", "dstack-runner-downloads-stgn") + +# Target architecture for runner and shim (GOOS is always linux) +export arch := env("DSTACK_SHIM_BUILD_ARCH", "amd64") + +# Download URLs +export runner_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-runner-linux-" + arch +export shim_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-shim-linux-" + arch + +# Go toolchain image for running tests in a container (keep in sync with go.mod) +export go_version := env("DSTACK_GO_VERSION", "1.25") + +# Build runner +[private] +build-runner-binary: + #!/usr/bin/env bash + set -e + echo "Building runner for linux/$arch" + cd {{source_directory()}}/cmd/runner && CGO_ENABLED=0 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'" + echo "Runner build complete!" + +# Build shim +[private] +build-shim-binary: + #!/usr/bin/env bash + set -e + cd {{source_directory()}}/cmd/shim + echo "Building shim for linux/$arch" + host_arch=$(uname -m) + case "$host_arch" in + x86_64) host_arch=amd64 ;; + aarch64 | arm64) host_arch=arm64 ;; + esac + if [ "$(uname -s)" = "Linux" ] && [ "$host_arch" = "$arch" ]; then + CGO_ENABLED=1 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version'" + else + echo "WARNING: Cross-compiling to linux/$arch, disabling CGO (DCGM unavailable)" + CGO_ENABLED=0 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'" + fi + echo "Shim build (version: $version) complete!" + +# Build both runner and shim +build-runner: build-runner-binary build-shim-binary + echo "Build complete! 
linux/$arch binaries are in their respective cmd directories." + +# Clean build artifacts +clean-runner: + rm -f {{source_directory()}}/cmd/runner/runner + rm -f {{source_directory()}}/cmd/shim/shim + echo "Build artifacts cleaned!" + +# Run tests for runner and shim (native; requires a Linux host) +test-runner: + cd {{source_directory()}} && go test -v ./... + +# Run tests for runner and shim in a Linux container (use on macOS/Windows, where native builds are not available) +# Examples: +# just test-runner-in-container # short suite, all packages +# just test-runner-in-container -run TestPullImage ./internal/shim/ +test-runner-in-container *args="-short ./...": + docker run --rm -t \ + -v {{source_directory()}}:/src -w /src \ + -v dstack-go-mod:/go/pkg/mod \ + -v dstack-go-build:/root/.cache/go-build \ + -v /var/run/docker.sock:/var/run/docker.sock \ + golang:{{go_version}} \ + go test -race {{args}} + +# Validate shim is built for the configured linux architecture +[private] +validate-shim-binary: + #!/usr/bin/env bash + set -e + case "$arch" in + amd64) expected="x86-64" ;; + arm64) expected="ARM aarch64" ;; + *) echo "Error: Unsupported arch '$arch'"; exit 1 ;; + esac + if ! 
file {{source_directory()}}/cmd/shim/shim | grep -q "ELF 64-bit LSB executable, $expected"; then + echo "Error: Shim must be built for linux/$arch for upload" + exit 1 + fi + +# Upload both runner and shim to S3 +upload-runner: upload-runner-binary upload-shim-binary + +# Upload runner to S3 +[private] +upload-runner-binary: + #!/usr/bin/env bash + set -e + just build-runner-binary + aws s3 cp {{source_directory()}}/cmd/runner/runner "{{runner_download_url}}" --acl public-read + echo "Uploaded runner to S3" + +# Upload shim to S3 +[private] +upload-shim-binary: + #!/usr/bin/env bash + set -e + just build-shim-binary + just validate-shim-binary + aws s3 cp {{source_directory()}}/cmd/shim/shim "{{shim_download_url}}" --acl public-read + echo "Uploaded shim to S3" diff --git a/runner/README.md b/runner/README.md index 5f229b6d65..056031fe0c 100644 --- a/runner/README.md +++ b/runner/README.md @@ -1,10 +1,20 @@ -# dstack-shim an dstack-runner +# dstack-shim and dstack-runner For overview of `dstack-shim` and `dstack-runner`, see [/contributing/RUNNER-AND-SHIM.md](../contributing/RUNNER-AND-SHIM.md). -## Development +`dstack-shim` and `dstack-runner` can be built only for GOOS=linux. Use containers for development on other OS. -Here's the steps to build `dstack-shim` and `dstack-runner` and run `dstack` with them locally: +## Testing locally + +Run shim and runner tests on any OS inside a Docker container: + +```shell +just test-runner-in-container +``` + +## Running locally (standalone) + +Build `dstack-shim` and `dstack-runner` and run them locally: 1. Build the runner executable @@ -13,8 +23,6 @@ Here's the steps to build `dstack-shim` and `dstack-runner` and run `dstack` wit go build ``` - Note: The runner runs inside the Docker container, so ensure it's compiled for linux/amd64. For example, on macOS you'd run `GOOS=linux GOARCH=amd64 go build`. - 2. 
Build the shim executable ```shell @@ -25,13 +33,12 @@ Here's the steps to build `dstack-shim` and `dstack-runner` and run `dstack` wit 3. Start the shim: ```shell - ./shim --home $RUNNER_DIR --runner-binary-path $COMPILED_RUNNER_PATH docker --ssh-key $DSTACK_PUBLIC_KEY --keep-container + ./shim --shim-home $RUNNER_DIR --runner-binary-path $COMPILED_RUNNER_PATH ``` Notes: * `$RUNNER_DIR` is any directory for storing runner files. - * `$DSTACK_PUBLIC_KEY` is `~/.dstack/ssh/id_rsa.pub` that allows the dstack CLI to connect to the ssh server inside the container. Now you can call shim API: @@ -41,38 +48,63 @@ Now you can call shim API: >>> s.submit("","", "ubuntu", None) ``` -You can also run `dstack` end-to-end with local shim and runner by enabling the `local` backend on dstack server: +## Running with `dstack` + +You can test the built shim and runner with `dstack` using standard backends (including SSH fleets). + +> [!NOTE] +> To run with standard backends, both the runner and shim must be built for linux. + +Build the runner and shim and upload them to S3 using `just` (see [`.justfile`](.justfile)). + +> [!IMPORTANT] +> Before running any `just` commands that upload to S3, you must set the following environment variables: +> +> ```shell +> export DSTACK_SHIM_UPLOAD_VERSION="your-version" +> export DSTACK_SHIM_UPLOAD_S3_BUCKET="your-bucket" +> ``` +> +> These variables are required and must be set before running any upload commands. ```shell -DSTACK_LOCAL_BACKEND_ENABLED= dstack server --log-level=debug +just upload-runner ``` -The `local` backend will submit the run to the locally started shim and runner. The CLI will attach to the container just as if it were any other cloud backend: +To use the built shim and runner with the `dstack` server, pass the URLs via `DSTACK_SHIM_DOWNLOAD_URL` and `DSTACK_RUNNER_DOWNLOAD_URL`: ```shell -✗ dstack run . - Configuration .dstack.yml - Project main - User admin - Pool name default-pool - Min resources 2..xCPU, 4GB..
- Max price - - Max duration 6h - Spot policy auto - Retry policy yes - Creation policy reuse-or-create - Termination policy destroy-after-idle - Termination idle time 300s - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 local local local 4xCPU, 8GB, 100GB no $0 - (disk) - 2 azure westeurope Standard_D2s_v3 2xCPU, 8GB, 100GB yes $0.012 - (disk) - 3 azure westeurope Standard_E2s_v4 2xCPU, 16GB, 100GB yes $0.015246 - (disk) - ... - Shown 3 of 4041 offers, $56.6266 max - -Continue? [y/n]: -``` \ No newline at end of file +export DSTACK_SHIM_DOWNLOAD_URL="https://${DSTACK_SHIM_UPLOAD_S3_BUCKET}.s3.amazonaws.com/${DSTACK_SHIM_UPLOAD_VERSION}/binaries/dstack-shim-linux-amd64" +export DSTACK_RUNNER_DOWNLOAD_URL="https://${DSTACK_SHIM_UPLOAD_S3_BUCKET}.s3.amazonaws.com/${DSTACK_SHIM_UPLOAD_VERSION}/binaries/dstack-runner-linux-amd64" + +dstack server --log-level=debug +``` + +## Dependencies (WIP) + +These are non-exhaustive lists of external dependencies (executables, libraries) of the `dstack-*` binaries. + +**TODO**: inspect codebase, add missing dependencies. + +### `dstack-shim` + +#### Executables + +* `mount` +* `umount` +* `mountpoint` +* `lsblk` +* `blkid` +* `mkfs.ext4` +* (NVIDIA GPU SSH fleet instances only) `nvidia-smi` +* (AMD SSH fleet instances only) `docker` (used for `amd-smi` container) +* (Intel Gaudi SSH fleet instances only) `hl-smi` +* ... + +Debian/Ubuntu packages: `mount` (`mount`, `umount`), `util-linux` (`mountpoint`, `lsblk`, `blkid`), `e2fsprogs` (`mkfs.ext4`) + +### `dstack-runner` + +#### Executables + +* ... diff --git a/runner/cmd/runner/cmd.go b/runner/cmd/runner/cmd.go deleted file mode 100644 index b217f0c54c..0000000000 --- a/runner/cmd/runner/cmd.go +++ /dev/null @@ -1,75 +0,0 @@ -package main - -import ( - "log" - "os" - - "github.com/urfave/cli/v2" -) - -// Version is a build-time variable. The value is overridden by ldflags. 
-var Version string - -func App() { - var paths struct{ tempDir, homeDir, workingDir string } - var httpPort int - var logLevel int - - app := &cli.App{ - Name: "dstack-runner", - Usage: "configure and start dstack-runner", - Version: Version, - Flags: []cli.Flag{ - &cli.IntFlag{ - Name: "log-level", - Value: 2, - DefaultText: "4 (Info)", - Usage: "log verbosity level: 2 (Error), 3 (Warning), 4 (Info), 5 (Debug), 6 (Trace)", - Destination: &logLevel, - }, - }, - Commands: []*cli.Command{ - { - Name: "start", - Usage: "Start dstack-runner", - Flags: []cli.Flag{ - &cli.PathFlag{ - Name: "temp-dir", - Usage: "Temporary directory for logs and other files", - Required: true, - Destination: &paths.tempDir, - }, - &cli.PathFlag{ - Name: "home-dir", - Usage: "HomeDir directory for credentials and $HOME", - Required: true, - Destination: &paths.homeDir, - }, - &cli.PathFlag{ - Name: "working-dir", - Usage: "Base path for the job", - Required: true, - Destination: &paths.workingDir, - }, - &cli.IntFlag{ - Name: "http-port", - Usage: "Set a http port", - Value: 10999, - Destination: &httpPort, - }, - }, - Action: func(c *cli.Context) error { - err := start(paths.tempDir, paths.homeDir, paths.workingDir, httpPort, logLevel, Version) - if err != nil { - return cli.Exit(err, 1) - } - return nil - }, - }, - }, - } - err := app.Run(os.Args) - if err != nil { - log.Fatal(err) - } -} diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index 443eefa927..08196b0ba4 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -1,48 +1,231 @@ +//go:build linux + +// dstack-runner is supported only in Linux environments. 
+ package main import ( "context" + "errors" "fmt" "io" - _ "net/http/pprof" "os" + "os/signal" + "path" "path/filepath" + "syscall" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/runner/api" "github.com/sirupsen/logrus" - "github.com/ztrue/tracerr" + "github.com/urfave/cli/v3" + + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/api" + "github.com/dstackai/dstack/runner/internal/runner/executor" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/ssh" ) +// Version is a build-time variable. The value is overridden by ldflags. +// The "latest" default marks a dev build; the server treats it as the newest version. +var Version = "latest" + func main() { - App() + os.Exit(mainInner()) +} + +func mainInner() int { + var tempDir string + var httpAddress string + var httpPort int + var sshPort int + var sshAuthorizedKeys []string + var sshLogLevel string + var logLevel string + + cmd := &cli.Command{ + Name: "dstack-runner", + Usage: "configure and start dstack-runner", + Version: Version, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "log-level", + Value: "info", + Usage: "log verbosity level: fatal, error, warning, info, debug, trace", + Destination: &logLevel, + }, + }, + Commands: []*cli.Command{ + { + Name: "start", + Usage: "Start dstack-runner", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "temp-dir", + Usage: "Temporary directory for logs and other files", + Value: consts.RunnerTempDir, + Destination: &tempDir, + TakesFile: true, + }, + &cli.StringFlag{ + Name: "http-address", + Usage: "Set a http bind address", + Value: "", + DefaultText: "all interfaces", + Destination: &httpAddress, + }, + &cli.IntFlag{ + Name: "http-port", + Usage: "Set a http port", + Value: 
consts.RunnerHTTPPort, + Destination: &httpPort, + }, + &cli.IntFlag{ + Name: "ssh-port", + Usage: "Set the ssh port", + Value: consts.RunnerSSHPort, + Destination: &sshPort, + }, + &cli.StringSliceFlag{ + Name: "ssh-authorized-key", + Usage: "dstack server or user authorized key. May be specified multiple times", + Destination: &sshAuthorizedKeys, + }, + &cli.StringFlag{ + Name: "ssh-log-level", + Value: "INFO", + Usage: "ssh LogLevel, see sshd_config(5)", + Destination: &sshLogLevel, + }, + // --home-dir is not used since 0.20.4, but the flag was retained as no-op + // for compatibility with pre-0.20.4 shims; remove the flag eventually + &cli.StringFlag{ + Name: "home-dir", + Hidden: true, + }, + }, + Action: func(ctx context.Context, cmd *cli.Command) error { + logLvl, err := log.ParseLevel(logLevel) + if err != nil { + return err + } + return start(ctx, logLvl, tempDir, httpAddress, httpPort, sshPort, sshAuthorizedKeys, sshLogLevel) + }, + }, + }, + } + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT) + defer stop() + + if err := cmd.Run(ctx, os.Args); err != nil { + log.Error(ctx, err.Error()) + return 1 + } + + return 0 } -func start(tempDir string, homeDir string, workingDir string, httpPort int, logLevel int, version string) error { +func start( + ctx context.Context, + logLevel int, tempDir string, + httpAddress string, httpPort int, + sshPort int, sshAuthorizedKeys []string, sshLogLevel string, +) error { if err := os.MkdirAll(tempDir, 0o755); err != nil { - return tracerr.Errorf("Failed to create temp directory: %w", err) + return fmt.Errorf("create temp directory: %w", err) } defaultLogFile, err := log.CreateAppendFile(filepath.Join(tempDir, consts.RunnerDefaultLogFileName)) if err != nil { - return tracerr.Errorf("Failed to create default log file: %w", err) + return fmt.Errorf("create default log file: %w", err) } defer func() { - err = defaultLogFile.Close() - if err != nil { - 
tracerr.Print(err) + if err := defaultLogFile.Close(); err != nil { + log.Error(ctx, "Failed to close default log file", "err", err) } }() - log.DefaultEntry.Logger.SetOutput(io.MultiWriter(os.Stdout, defaultLogFile)) log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) - server := api.NewServer(tempDir, homeDir, workingDir, fmt.Sprintf(":%d", httpPort), version) + currentUser, err := linuxuser.FromCurrentProcess() + if err != nil { + return fmt.Errorf("get current process user: %w", err) + } + if !currentUser.IsRoot() { + return fmt.Errorf("must be root: %s", currentUser) + } + if currentUser.HomeDir == "" { + log.Warning(ctx, "Current user does not have home dir, using /root as a fallback", "user", currentUser) + currentUser.HomeDir = "/root" + } + // Fix the current process HOME, just in case some internals require it (e.g., they use os.UserHomeDir() or + // spawn a child process which uses that variable) + envHome, envHomeIsSet := os.LookupEnv("HOME") + if envHome != currentUser.HomeDir { + if !envHomeIsSet { + log.Warning(ctx, "HOME is not set, setting the value", "home", currentUser.HomeDir) + } else { + log.Warning(ctx, "HOME is incorrect, fixing the value", "current", envHome, "home", currentUser.HomeDir) + } + if err := os.Setenv("HOME", currentUser.HomeDir); err != nil { + return fmt.Errorf("set HOME: %w", err) + } + } + log.Trace(ctx, "Running as", "user", currentUser) + + // NB: The Mkdir/Chown/Chmod code below relies on the fact that RunnerDstackDir path is _not_ nested (/dstack). 
+ // Adjust it if the path is changed to, e.g., /opt/dstack + const dstackDir = consts.RunnerDstackDir + dstackSshDir := path.Join(dstackDir, "ssh") + + // To ensure that all components of the authorized_keys path are owned by root and no directories + // are group or world writable, as required by sshd with "StrictModes yes" (the default value), + // we fix `/dstack` ownership and permissions and remove `/dstack/ssh` (it will be (re)created + // in Sshd.Prepare()) + // See: https://github.com/openssh/openssh-portable/blob/d01efaa1c9ed84fd9011201dbc3c7cb0a82bcee3/misc.c#L2257-L2272 + if err := os.Mkdir(dstackDir, 0o755); errors.Is(err, os.ErrExist) { + if err := os.Chown(dstackDir, 0, 0); err != nil { + return fmt.Errorf("chown dstack dir: %w", err) + } + if err := os.Chmod(dstackDir, 0o755); err != nil { + return fmt.Errorf("chmod dstack dir: %w", err) + } + } else if err != nil { + return fmt.Errorf("create dstack dir: %w", err) + } + if err := os.RemoveAll(dstackSshDir); err != nil { + return fmt.Errorf("remove dstack ssh dir: %w", err) + } + + sshd := ssh.NewSshd("/usr/sbin/sshd") + if err := sshd.Prepare(ctx, dstackSshDir, sshPort, sshLogLevel); err != nil { + return fmt.Errorf("prepare sshd: %w", err) + } + if err := sshd.AddAuthorizedKeys(ctx, sshAuthorizedKeys...); err != nil { + return fmt.Errorf("add authorized keys: %w", err) + } + if err := sshd.Start(ctx); err != nil { + return fmt.Errorf("start sshd: %w", err) + } + defer func() { + if err := sshd.Stop(ctx); err != nil { + log.Error(ctx, "Error while stopping sshd", "err", err) + } + }() + + ex, err := executor.NewRunExecutor(tempDir, dstackDir, *currentUser, sshd) + if err != nil { + return fmt.Errorf("create executor: %w", err) + } - log.Trace(context.TODO(), "Starting API server", "port", httpPort) - if err := server.Run(); err != nil { - return tracerr.Errorf("Server failed: %w", err) + server, err := api.NewServer(ctx, fmt.Sprintf("%s:%d", httpAddress, httpPort), 
Version, ex) + if err != nil { + return fmt.Errorf("create server: %w", err) + } + log.Trace(ctx, "Starting API server", "port", httpPort) + if err := server.Run(ctx); err != nil { + return fmt.Errorf("server failed: %w", err) } return nil diff --git a/runner/cmd/shim/main.go b/runner/cmd/shim/main.go index 6859d8cb1a..116e16b50c 100644 --- a/runner/cmd/shim/main.go +++ b/runner/cmd/shim/main.go @@ -1,341 +1,345 @@ +//go:build linux + +// dstack-shim is supported only in Linux environments. + package main import ( "context" - "encoding/csv" - "encoding/json" "errors" "fmt" "io" - "log" - "net" - "net/http" "os" + "os/signal" + "path" "path/filepath" - "runtime" - "strings" + "syscall" "time" - execute "github.com/alexellis/go-execute/v2" - "github.com/dstackai/dstack/runner/consts" + "github.com/sirupsen/logrus" + "github.com/urfave/cli/v3" + + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/shim" "github.com/dstackai/dstack/runner/internal/shim/api" - "github.com/shirou/gopsutil/v3/mem" - "github.com/urfave/cli/v2" - "golang.org/x/sys/unix" + "github.com/dstackai/dstack/runner/internal/shim/components" + "github.com/dstackai/dstack/runner/internal/shim/dcgm" ) // Version is a build-time variable. The value is overridden by ldflags. -var Version string +// The "latest" default marks a dev build; the server treats it as the newest version. 
+var Version = "latest" + +// https://everything.curl.dev/usingcurl/proxies/env.html +// https://cs.opensource.google/go/x/net/+/657eb1317b5dd33038d683297c6be9cae05fa97d:http/httpproxy/proxy.go +// We accept HTTP_PROXY in upper case without additional checks as it's unlikely that +// the shim is running in the CGI context +// The lower case form should be used as some applications ignore the upper case form, e.g., curl, apt +const defaultPassEnv = "http_proxy,https_proxy,no_proxy,HTTP_PROXY,HTTPS_PROXY,NO_PROXY" func main() { + os.Exit(mainInner()) +} + +func mainInner() int { var args shim.CLIArgs - args.Docker.SSHPort = 10022 var serviceMode bool - app := &cli.App{ + const defaultLogLevel = logrus.InfoLevel + + log.DefaultEntry.Logger.SetLevel(defaultLogLevel) + log.DefaultEntry.Logger.SetOutput(os.Stderr) + + shimBinaryPath, err := os.Executable() + if err != nil { + shimBinaryPath = consts.ShimBinaryPath + } + + cmd := &cli.Command{ Name: "dstack-shim", Usage: "Starts dstack-runner or docker container.", Version: Version, Flags: []cli.Flag{ /* Shim Parameters */ - &cli.PathFlag{ - Name: "home", - Usage: "Dstack home directory", + &cli.StringFlag{ + Name: "shim-home", + Usage: "Set shim's home directory", Destination: &args.Shim.HomeDir, - EnvVars: []string{"DSTACK_HOME"}, + TakesFile: true, + DefaultText: path.Join("~", consts.DstackUserDir), + Sources: cli.EnvVars("DSTACK_SHIM_HOME"), + }, + &cli.StringFlag{ + Name: "shim-binary-path", + Usage: "Path to shim's binary", + Value: shimBinaryPath, + Destination: &args.Shim.BinaryPath, + TakesFile: true, + Sources: cli.EnvVars("DSTACK_SHIM_BINARY_PATH"), }, &cli.IntFlag{ Name: "shim-http-port", - Usage: "Set's shim's http port", + Usage: "Set shim's http port", Value: 10998, Destination: &args.Shim.HTTPPort, - EnvVars: []string{"DSTACK_SHIM_HTTP_PORT"}, + Sources: cli.EnvVars("DSTACK_SHIM_HTTP_PORT"), + }, + &cli.StringFlag{ + Name: "shim-log-level", + Usage: 
"Set shim's log level", + Value: defaultLogLevel.String(), + Destination: &args.Shim.LogLevel, + Sources: cli.EnvVars("DSTACK_SHIM_LOG_LEVEL"), }, /* Runner Parameters */ + &cli.StringFlag{ + Name: "runner-download-url", + Usage: "Set runner's download URL", + Destination: &args.Runner.DownloadURL, + Sources: cli.EnvVars("DSTACK_RUNNER_DOWNLOAD_URL"), + }, + &cli.StringFlag{ + Name: "runner-binary-path", + Usage: "Path to runner's binary", + Value: consts.RunnerBinaryPath, + Destination: &args.Runner.BinaryPath, + TakesFile: true, + Sources: cli.EnvVars("DSTACK_RUNNER_BINARY_PATH"), + }, &cli.IntFlag{ Name: "runner-http-port", Usage: "Set runner's http port", - Value: 10999, + Value: consts.RunnerHTTPPort, Destination: &args.Runner.HTTPPort, - EnvVars: []string{"DSTACK_RUNNER_HTTP_PORT"}, + Sources: cli.EnvVars("DSTACK_RUNNER_HTTP_PORT"), }, &cli.IntFlag{ + Name: "runner-ssh-port", + Usage: "Set runner's ssh port", + Value: consts.RunnerSSHPort, + Destination: &args.Runner.SSHPort, + Sources: cli.EnvVars("DSTACK_RUNNER_SSH_PORT"), + }, + &cli.StringFlag{ + Name: "runner-ssh-log-level", + Usage: "Set runner's ssh log level", + Destination: &args.Runner.SSHLogLevel, + Sources: cli.EnvVars("DSTACK_RUNNER_SSH_LOG_LEVEL"), + }, + &cli.StringFlag{ Name: "runner-log-level", Usage: "Set runner's log level", - Value: 4, + Value: defaultLogLevel.String(), Destination: &args.Runner.LogLevel, - EnvVars: []string{"DSTACK_RUNNER_LOG_LEVEL"}, + Sources: cli.EnvVars("DSTACK_RUNNER_LOG_LEVEL"), + }, + /* DCGM Exporter Parameters */ + &cli.IntFlag{ + Name: "dcgm-exporter-http-port", + Usage: "DCGM Exporter http port", + Value: 10997, + Destination: &args.DCGMExporter.HTTPPort, + Sources: cli.EnvVars("DSTACK_DCGM_EXPORTER_HTTP_PORT"), + }, + &cli.IntFlag{ + Name: "dcgm-exporter-interval", + Usage: "DCGM Exporter collect interval, milliseconds", + Value: 5000, + Destination: &args.DCGMExporter.Interval, + Sources: cli.EnvVars("DSTACK_DCGM_EXPORTER_INTERVAL"), + }, + /* DCGM Parameters 
*/ + &cli.StringFlag{ + Name: "dcgm-address", + Usage: "nv-hostengine `hostname`, e.g., `localhost`", + DefaultText: "start libdcgm in embedded mode", + Destination: &args.DCGM.Address, + Sources: cli.EnvVars("DSTACK_DCGM_ADDRESS"), }, + /* Docker Parameters */ &cli.StringFlag{ - Name: "runner-version", - Usage: "Set runner's version", - Value: "latest", - Destination: &args.Runner.Version, - EnvVars: []string{"DSTACK_RUNNER_VERSION"}, + Name: "pass-env", + Usage: "Environment variables to pass on to the container, a comma-separated list of names", + Value: defaultPassEnv, + Destination: &args.Docker.PassEnv, + Sources: cli.EnvVars("DSTACK_DOCKER_PASS_ENV"), }, &cli.BoolFlag{ - Name: "dev", - Usage: "Use stgn channel", - Destination: &args.Runner.DevChannel, + Name: "privileged", + Usage: "Give extended privileges to the container", + Destination: &args.Docker.Privileged, + Sources: cli.EnvVars("DSTACK_DOCKER_PRIVILEGED"), }, - &cli.PathFlag{ - Name: "runner-binary-path", - Usage: "Path to runner's binary", - Destination: &args.Runner.BinaryPath, - EnvVars: []string{"DSTACK_RUNNER_BINARY_PATH"}, + &cli.StringFlag{ + Name: "pjrt-device", + Usage: "Set the PJRT_DEVICE environment variable (e.g., TPU, GPU)", + Destination: &args.Docker.PJRTDevice, + Sources: cli.EnvVars("PJRT_DEVICE"), }, - }, - Commands: []*cli.Command{ - { - Name: "docker", - Usage: "Starts docker container and modifies entrypoint", - Flags: []cli.Flag{ - /* Docker Parameters */ - &cli.BoolFlag{ - Name: "keep-container", - Usage: "Do not delete container on exit", - Destination: &args.Docker.KeepContainer, - }, - &cli.BoolFlag{ - Name: "privileged", - Usage: "Give extended privileges to the container", - Destination: &args.Docker.Privileged, - }, - &cli.StringFlag{ - Name: "ssh-key", - Usage: "Public SSH key", - Required: true, - Destination: &args.Docker.ConcatinatedPublicSSHKeys, - EnvVars: []string{"DSTACK_PUBLIC_SSH_KEY"}, - }, - &cli.StringFlag{ - Name: "pjrt-device", - Usage: "Set the 
PJRT_DEVICE environment variable (e.g., TPU, GPU)", - Destination: &args.Docker.PJRTDevice, - EnvVars: []string{"PJRT_DEVICE"}, - }, - &cli.BoolFlag{ - Name: "service", - Usage: "Start as a service", - Destination: &serviceMode, - EnvVars: []string{"DSTACK_SERVICE_MODE"}, - }, - }, - Action: func(c *cli.Context) error { - if args.Runner.BinaryPath == "" { - if err := args.DownloadRunner(); err != nil { - return cli.Exit(err, 1) - } - } - - args.Runner.TempDir = "/tmp/runner" - args.Runner.HomeDir = "/root" - args.Runner.WorkingDir = "/workflow" - - var err error - - // set dstack home path - args.Shim.HomeDir, err = getDstackHome(args.Shim.HomeDir) - if err != nil { - return cli.Exit(err, 1) - } - log.Printf("Config Shim: %+v\n", args.Shim) - log.Printf("Config Runner: %+v\n", args.Runner) - log.Printf("Config Docker: %+v\n", args.Docker) - - dockerRunner, err := shim.NewDockerRunner(args) - if err != nil { - return cli.Exit(err, 1) - } - - address := fmt.Sprintf(":%d", args.Shim.HTTPPort) - shimServer := api.NewShimServer(address, dockerRunner, Version) - - defer func() { - shutdownCtx, cancelShutdown := context.WithTimeout(context.Background(), 5*time.Second) - defer cancelShutdown() - _ = shimServer.HttpServer.Shutdown(shutdownCtx) - }() - - if serviceMode { - writeHostInfo() - } - - if err := shimServer.HttpServer.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { - return cli.Exit(err, 1) - } - - return nil - }, + /* Misc Parameters */ + &cli.BoolFlag{ + Name: "service", + Usage: "Start as a service", + Destination: &serviceMode, + Sources: cli.EnvVars("DSTACK_SERVICE_MODE"), }, }, + Action: func(ctx context.Context, cmd *cli.Command) error { + return start(ctx, args, serviceMode) + }, } - if err := app.Run(os.Args); err != nil { - log.Fatal(err) - } -} + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() -func getDstackHome(flag string) (string, error) { - if flag != "" { - return flag, 
nil + if err := cmd.Run(ctx, os.Args); err != nil { + log.Error(ctx, err.Error()) + return 1 } - home, err := os.UserHomeDir() - if err != nil { - return "", err - } - return filepath.Join(home, consts.DstackDirPath), nil + return 0 } -func writeHostInfo() { - // host_info exist - if _, err := os.Stat(consts.HostInfoFile); !errors.Is(err, os.ErrNotExist) { - return - } - - type Message struct { - GpuName string `json:"gpu_name"` - GpuMemory string `json:"gpu_memory"` - GpuCount int `json:"gpu_count"` - Adresses []string `json:"addresses"` - DiskSize uint64 `json:"disk_size"` - NumCPUs int `json:"cpus"` - Memory uint64 `json:"memory"` - } - - gpuCount := 0 - gpuMemory := "" - gpuName := "" - gpus := getGpuInfo() - if len(gpus) != 0 { - gpuCount = len(gpus) - gpuMemory = gpus[0][1] - gpuName = gpus[0][0] - } - m := Message{ - GpuName: gpuName, - GpuMemory: gpuMemory, - GpuCount: gpuCount, - Adresses: getInterfaces(), - DiskSize: getDiskSize(), - NumCPUs: runtime.NumCPU(), - Memory: getMemory(), - } - - b, err := json.Marshal(m) - if err != nil { - panic(err) - } - - f, err := os.Create(consts.HostInfoFile) - if err != nil { - panic(err) - } - defer f.Close() - - _, err = f.Write(b) +func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) { + _, err = log.ParseLevel(args.Runner.LogLevel) if err != nil { - panic(err) + return err } - - err = f.Sync() + logLevel, err := log.ParseLevel(args.Shim.LogLevel) if err != nil { - panic(err) + return err } -} + log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) + log.Info(ctx, "Starting dstack-shim", "version", Version) -func getGpuInfo() [][]string { - cmd := execute.ExecTask{ - Command: "docker", - Args: []string{ - "run", - "--rm", - "--gpus", "all", - "dstackai/base:py3.12-0.4-cuda-12.1", - "nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv", - }, - StreamStdio: false, + shimHomeDir := args.Shim.HomeDir + if shimHomeDir == "" { + home, err := os.UserHomeDir() + if err != nil { + 
return err + } + shimHomeDir = filepath.Join(home, consts.DstackUserDir) + args.Shim.HomeDir = shimHomeDir } - res, err := cmd.Execute(context.Background()) + shimLogFile, err := log.CreateAppendFile(filepath.Join(shimHomeDir, consts.ShimLogFileName)) if err != nil { - return [][]string{} // GPU not found - } - - if res.ExitCode != 0 { - return [][]string{} // GPU not found + return fmt.Errorf("failed to create shim log file: %w", err) } + defer func() { + _ = shimLogFile.Close() + }() - r := csv.NewReader(strings.NewReader(res.Stdout)) - - var gpus [][]string - - // Skip header - if _, err := r.Read(); err != nil { - panic("canot read csv") - } + originalLogger := log.GetLogger(ctx) + loggerOut := io.MultiWriter(originalLogger.Logger.Out, shimLogFile) + ctx = log.WithLogger(ctx, log.NewEntry(loggerOut, int(originalLogger.Logger.GetLevel()))) - for { - record, err := r.Read() - if err == io.EOF { - break - } + defer func() { + // Should be called _before_ we close shimLogFile + // If an error occurs earlier, we still log it to stderr in the main function if err != nil { - log.Fatal(err) + log.Error(ctx, err.Error()) } + }() - gpus = append(gpus, record) + runnerManager, runnerErr := components.NewRunnerManager(ctx, args.Runner.BinaryPath) + if args.Runner.DownloadURL != "" { + if err := runnerManager.Install(ctx, args.Runner.DownloadURL, false); err != nil { + return err + } + } else if runnerErr != nil { + return runnerErr } - return gpus -} + shimManager, shimErr := components.NewShimManager(ctx, args.Shim.BinaryPath) + if shimErr != nil { + return shimErr + } + + log.Debug(ctx, "Shim", "args", args.Shim) + log.Debug(ctx, "Runner", "args", args.Runner) + log.Debug(ctx, "Docker", "args", args.Docker) -func getInterfaces() []string { - var addresses []string - ifaces, err := net.Interfaces() + dockerRunner, err := shim.NewDockerRunner(ctx, &args) if err != nil { - panic("cannot get interfaces") + return err } - for _, i := range ifaces { - addrs, err := i.Addrs() - 
if err != nil { - panic("cannot get addrs") - } + var dcgmExporter *dcgm.DCGMExporter + var dcgmWrapper dcgm.DCGMWrapperInterface - for _, addr := range addrs { - switch v := addr.(type) { - case *net.IPNet: - if v.IP.IsLoopback() { - continue + if gpu.GetGpuVendor() == gpu.GpuVendorNvidia { + dcgmExporterPath, err := dcgm.GetDCGMExporterExecPath(ctx) + if err == nil { + interval := time.Duration(args.DCGMExporter.Interval * int(time.Millisecond)) + dcgmExporter = dcgm.NewDCGMExporter(dcgmExporterPath, args.DCGMExporter.HTTPPort, interval) + err = dcgmExporter.Start(ctx) + } + if err == nil { + log.Info(ctx, "using DCGM Exporter") + defer func() { + if err := dcgmExporter.Stop(ctx); err != nil { + log.Error(ctx, "failed to stop DCGM Exporter", "err", err) } + }() + } else { + log.Warning(ctx, "not using DCGM Exporter", "err", err) + } - addresses = append(addresses, addr.String()) + dcgmWrapper, err = dcgm.NewDCGMWrapper(args.DCGM.Address) + if err == nil { + log.Info(ctx, "using libdcgm") + defer func() { + if err := dcgmWrapper.Shutdown(); err != nil { + log.Error(ctx, "failed to shut down libdcgm", "err", err) + } + }() + if err := dcgmWrapper.EnableHealthChecks(); err != nil { + log.Error(ctx, "failed to enable libdcgm health checks", "err", err) } + } else { + log.Warning(ctx, "not using libdcgm", "err", err) } } - return addresses -} -func getDiskSize() uint64 { - var stat unix.Statfs_t - wd, err := os.Getwd() - if err != nil { - panic("cannot get current disk") + address := fmt.Sprintf("localhost:%d", args.Shim.HTTPPort) + shimServer := api.NewShimServer( + ctx, address, Version, + dockerRunner, dcgmExporter, dcgmWrapper, + runnerManager, shimManager, + ) + + if serviceMode { + if err := shim.WriteHostInfo(shimHomeDir, dockerRunner.Resources(ctx)); err != nil { + if errors.Is(err, os.ErrExist) { + log.Error(ctx, "write host info: file already exists") + } else { + return fmt.Errorf("write host info: %w", err) + } + } } - err = unix.Statfs(wd, &stat) - if err 
!= nil { - panic("cannot get disk size") + + var serveErr error + serveErrCh := make(chan error) + + go func() { + if err := shimServer.Serve(); err != nil { + serveErrCh <- err + } + close(serveErrCh) + }() + + select { + case serveErr = <-serveErrCh: + case <-ctx.Done(): } - size := stat.Bavail * uint64(stat.Bsize) - return size -} -func getMemory() uint64 { - v, err := mem.VirtualMemory() - if err != nil { - panic("cannot get emeory") + shutdownCtx, cancelShutdown := context.WithTimeout(ctx, 5*time.Second) + defer cancelShutdown() + shutdownErr := shimServer.Shutdown(shutdownCtx, false) + if serveErr != nil { + return serveErr } - return v.Total + return shutdownErr } diff --git a/runner/consts/consts.go b/runner/consts/consts.go deleted file mode 100644 index d829e487ec..0000000000 --- a/runner/consts/consts.go +++ /dev/null @@ -1,23 +0,0 @@ -package consts - -const DstackDirPath string = ".dstack" - -// Runner's log filenames -const ( - RunnerDefaultLogFileName = "default.log" - RunnerJobLogFileName = "job.log" - RunnerLogFileName = "runner.log" -) - -// Error-containing messages will be identified by this signature -const ExecutorFailedSignature = "Executor failed" - -const HostInfoFile = "host_info.json" - -// GPU constants -const NVIDIA_RUNTIME = "nvidia" - -const ( - REPO_HTTPS_URL = "https://%s/%s/%s.git" - REPO_GIT_URL = "git@%s:%s/%s.git" -) diff --git a/runner/consts/states/state.go b/runner/consts/states/state.go deleted file mode 100644 index a1a6823137..0000000000 --- a/runner/consts/states/state.go +++ /dev/null @@ -1,9 +0,0 @@ -package states - -const ( - Done = "done" - Failed = "failed" - Running = "running" - Terminated = "terminated" - Terminating = "terminating" -) diff --git a/runner/docs/shim.openapi.yaml b/runner/docs/shim.openapi.yaml new file mode 100644 index 0000000000..e375e4e9d3 --- /dev/null +++ b/runner/docs/shim.openapi.yaml @@ -0,0 +1,806 @@ +openapi: 3.1.2 + +info: + title: dstack-shim API + version: v2/0.20.1 + x-logo: + url: 
https://avatars.githubusercontent.com/u/54146142?s=260 + description: > + ## Versioning + + `dstack-shim` versioning and release cycles are tied to those of `dstack`, meaning that we + cannot use shim version to express shim API versioning. To get around this limitation, we use + two version schemes: + + * shim binary version, the same as the server version, in the form of `MAJOR.MINOR.MICRO`, e.g., + `0.18.33`. This version is exposed via the `/healthcheck` endpoint. Used to discriminate + different API versions (`if shim_version >= x.y.z then api_version = 2 else api_version = 1`) + and detect features within one API version (`if shim_version >= x.y.z then has_x_feature = true`) + + * shim API version (you can think of it as a “generation”), in the form of `vN`, e.g., `v2`. + Different API versions represent _totally different_ APIs. When two such API versions coexist + in the same shim binary (naturally, they should coexist for some time to ensure seamless migration), + we call the older one “Legacy API” and the newer one “Future API”. These versions are not exposed + via API, clients should figure them out via the shim binary version + +servers: + - url: http://localhost:10998/api + +paths: + /healthcheck: + get: + summary: Ping and API version negotiation + description: > + Serves two roles: + + * as the path implies, it's a healthcheck, although there is no field in the response that + indicates if shim is healthy. Basically, it is not a proper healthcheck but + a basic "ping" method + * API version negotiation. 
Server inspects `version` field to figure out which API features + it should use + + **Important**: Since this endpoint is used for negotiation, it should always stay + backward/future compatible, specifically the `version` field + tags: [shim] + responses: + "200": + description: "" + content: + application/json: + schema: + $ref: "#/components/schemas/HealthcheckResponse" + + /shutdown: + post: + summary: Request shim shutdown + description: | + (since [0.20.1](https://github.com/dstackai/dstack/releases/tag/0.20.1)) Request shim to shut down itself. + Restart must be handled by an external process supervisor, e.g., `systemd`. + + **Note**: background jobs (e.g., component installation) are canceled regardless of the `force` option. + tags: [shim] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ShutdownRequest" + responses: + "200": + description: Request accepted + $ref: "#/components/responses/PlainTextOk" + "400": + description: Malformed JSON body or validation error + $ref: "#/components/responses/PlainTextBadRequest" + + /instance/health: + get: + summary: Get instance health + description: (since [0.19.22](https://github.com/dstackai/dstack/releases/tag/0.19.22)) Returns an object of optional passive system checks + tags: [Instance] + responses: + "200": + description: "" + content: + application/json: + schema: + $ref: "#/components/schemas/InstanceHealthResponse" + + /components: + get: + summary: Get components + description: (since [0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) Returns a list of software components (e.g., `dstack-runner`) + tags: [Components] + responses: + "200": + description: "" + content: + application/json: + schema: + $ref: "#/components/schemas/ComponentListResponse" + + /components/install: + post: + summary: Install component + description: > + (since 
[0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) Request installing/updating the software component. + Components are installed asynchronously + tags: [Components] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ComponentInstallRequest" + responses: + "200": + description: Request accepted + $ref: "#/components/responses/PlainTextOk" + "400": + description: Malformed JSON body or validation error + $ref: "#/components/responses/PlainTextBadRequest" + "409": + description: The component is already being installed + $ref: "#/components/responses/PlainTextConflict" + + /tasks: + get: + summary: Get task list + description: Returns a list of all tasks known to shim, including terminated ones + tags: [Tasks] + responses: + "200": + description: "" + content: + application/json: + schema: + $ref: "#/components/schemas/TaskListResponse" + post: + summary: Submit and run new task + tags: [Tasks] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/TaskSubmitRequest" + responses: + "200": + description: Pending task info + $ref: "#/components/responses/TaskInfo" + "400": + description: Malformed JSON body or validation error + $ref: "#/components/responses/PlainTextBadRequest" + "409": + description: Task with the same ID already submitted + $ref: "#/components/responses/PlainTextConflict" + "500": + description: Internal error + $ref: "#/components/responses/PlainTextInternalError" + + /tasks/{id}: + get: + summary: Get task info + tags: [Tasks] + parameters: + - $ref: "#/components/parameters/taskId" + responses: + "200": + $ref: "#/components/responses/TaskInfo" + "404": + description: Task not found + $ref: "#/components/responses/PlainTextNotFound" + + /tasks/{id}/terminate: + post: + summary: Terminate task + description: > + Stops the task, that is, cancels image pulling if in progress, + stops the container if running, 
and sets the status to `terminated`. + No-op if the task is already terminated + tags: [Tasks] + parameters: + - in: path + name: id + schema: + $ref: "#/components/schemas/TaskID" + required: true + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/TaskTerminateRequest" + responses: + "200": + description: Updated task info + $ref: "#/components/responses/TaskInfo" + "404": + description: Task not found + $ref: "#/components/responses/PlainTextNotFound" + "500": + description: Internal error, e.g., failed to remove a container + $ref: "#/components/responses/PlainTextInternalError" + + /tasks/{id}/remove: + post: + summary: Remove task + description: > + Removes the task from in-memory storage and destroys its associated + resources: a container, logs, etc. + tags: [Tasks] + parameters: + - $ref: "#/components/parameters/taskId" + responses: + "200": + description: Task removed + $ref: "#/components/responses/PlainTextOk" + "404": + description: Task not found + $ref: "#/components/responses/PlainTextNotFound" + "409": + description: Task is not terminated, cannot remove + $ref: "#/components/responses/PlainTextConflict" + "500": + description: Internal error, e.g., failed to remove a container + $ref: "#/components/responses/PlainTextInternalError" + +components: + parameters: + taskId: + name: id + in: path + schema: + $ref: "#/components/schemas/TaskID" + required: true + + schemas: + TaskID: + description: Unique task ID assigned by dstack server + type: string + examples: + - 23a2c7a0-6c88-48ee-8028-b9ad9f6f5c24 + + TaskStatus: + title: shim.TaskStatus + type: string + enum: + - pending + - preparing + - pulling + - creating + - running + - terminated + + TerminationReason: + type: string + enum: + - executor_error + - creating_container_error + - container_exited_with_error + - done_by_runner + - terminated_by_user + - terminated_by_server + - max_duration_exceeded + + GpuID: + description: > + A 
vendor-specific unique identifier of GPU: + * NVIDIA: "globally unique immutable alphanumeric identifier of the GPU", + in the form of `GPU-<UUID>` + * AMD: `/dev/dri/renderD<N>` path + type: string + examples: + - GPU-2b79666e-d81f-f3f8-fd47-9903f118c3f5 + - /dev/dri/renderD128 + + NetworkMode: + title: shim.NetworkMode + type: string + enum: + - host + - bridge + + PortMapping: + title: shim.PortMapping + description: Task host:container port mapping pair + type: object + properties: + host: + type: integer + description: host port + examples: + - 32770 + container: + type: integer + description: container port + examples: + - 80 + required: + - host + - container + additionalProperties: false + + VolumeInfo: + title: shim.VolumeInfo + type: object + properties: + backend: + type: string + enum: [aws, gcp] + name: + type: string + default: "" + description: > + `dstack` volume [name](https://dstack.ai/docs/reference/dstack.yml/volume/#name) + volume_id: + type: string + default: "" + init_fs: + type: boolean + default: false + description: > + Create a filesystem when it doesn't exist if `true`, fail with error if `false` + + VolumeMountPoint: + title: shim.VolumeMountPoint + type: object + properties: + name: + type: string + default: "" + description: > + `dstack` volume [name](https://dstack.ai/docs/reference/dstack.yml/volume/#name) + path: + type: string + default: "" + description: Mount point inside container + + InstanceMountPoint: + title: shim.InstanceMountPoint + type: object + properties: + instance_name: + type: string + default: "" + description: Instance (host) path + path: + type: string + default: "" + description: Mount point inside container + + GPUDevice: + title: shim.GPUDevice + type: object + properties: + path_on_host: + type: string + default: "" + description: Instance (host) path + path_in_container: + type: string + default: "" + description: Path inside container + + DCGMHealth: + title: 
shim.dcgm.Health + type: object + properties: + overall_health: + type: integer + description: > + [dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum) + examples: + - 10 + incidents: + type: array + items: + $ref: "#/components/schemas/DCGMHealthIncident" + required: + - overall_health + - incidents + additionalProperties: false + + DCGMHealthIncident: + title: shim.dcgm.HealthIncident + type: object + properties: + system: + type: integer + description: > + [dcgmHealthSystems_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv422dcgmHealthSystems_enum) + examples: + - 1 + health: + type: integer + description: > + [dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum) + examples: + - 10 + error_message: + type: string + examples: + - > + Detected more than 16 PCIe replays per minute for GPU 0 : 99 Reconnect PCIe card. + Run system side PCIE diagnostic utilities to verify hops off the GPU board. If issue is on the board, run the field diagnostic. 
+ error_code: + type: integer + description: > + [dcgmError_enum](https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_errors.h) + examples: + - 3 + entity_group_id: + type: integer + description: > + [dcgm_field_entity_group_t](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-entity.html#_CPPv425dcgm_field_entity_group_t) + examples: + - 1 + entity_id: + type: integer + examples: + - 0 + required: + - system + - health + - error_message + - error_code + - entity_group_id + - entity_id + additionalProperties: false + + ComponentName: + title: shim.components.ComponentName + type: string + enum: + - dstack-runner + - dstack-shim + description: | + * (since [0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) `dstack-runner` + * (since [0.20.1](https://github.com/dstackai/dstack/releases/tag/0.20.1)) `dstack-shim` + + ComponentStatus: + title: shim.components.ComponentStatus + type: string + enum: + - not-installed + - installed + - installing + - error + + ComponentInfo: + title: shim.components.ComponentInfo + type: object + properties: + name: + $ref: "#/components/schemas/ComponentName" + version: + type: string + description: An empty string if status != installed + examples: + - 0.20.1 + status: + allOf: + - $ref: "#/components/schemas/ComponentStatus" + - examples: + - installed + required: + - name + - version + - status + additionalProperties: false + + HealthcheckResponse: + title: shim.api.HealthcheckResponse + type: object + properties: + service: + const: dstack-shim + version: + type: string + examples: + - 0.18.34 + required: + - service + - version + additionalProperties: false + + ShutdownRequest: + title: shim.api.ShutdownRequest + type: object + properties: + force: + type: boolean + examples: + - false + description: If `true`, don't wait for background job coroutines to complete after canceling them and 
close HTTP server forcefully. + required: + - force + + InstanceHealthResponse: + title: shim.api.InstanceHealthResponse + type: object + properties: + dcgm: + $ref: "#/components/schemas/DCGMHealth" + additionalProperties: false + + ComponentListResponse: + title: shim.api.ComponentListResponse + type: object + properties: + components: + type: array + items: + $ref: "#/components/schemas/ComponentInfo" + required: + - components + additionalProperties: false + + ComponentInstallRequest: + title: shim.api.ComponentInstallRequest + type: object + properties: + name: + $ref: "#/components/schemas/ComponentName" + url: + type: string + examples: + - https://dstack-runner-downloads.s3.eu-west-1.amazonaws.com/0.20.1/binaries/dstack-runner-linux-amd64 + required: + - name + - url + + TaskListResponse: + title: shim.api.TaskListResponse + type: object + properties: + tasks: + type: array + items: + type: object + properties: + id: + $ref: "#/components/schemas/TaskID" + status: + $ref: "#/components/schemas/TaskStatus" + required: + - id + - status + description: A list of all tasks tracked by shim, each with its ID and status + required: + - tasks + additionalProperties: false + + TaskInfoResponse: + title: shim.api.TaskInfoResponse + description: Same as `shim.TaskInfo` + type: object + properties: + id: + $ref: "#/components/schemas/TaskID" + status: + allOf: + - $ref: "#/components/schemas/TaskStatus" + - examples: + - terminated + termination_reason: + $ref: "#/components/schemas/TerminationReason" + termination_message: + type: string + description: A shim-generated message or N last lines from the container logs + ports: + oneOf: + - type: array + items: + $ref: "#/components/schemas/PortMapping" + - type: "null" + description: > + A host:container port mapping or `null` if this information is not yet available. + In the `host` network mode the array is empty (or `null` if it is not ready yet, see above). 
+ container_name: + type: string + examples: + - horrible-mule-1-0-0-44f7cb95 + container_id: + type: string + examples: + - a6bb8d4bb8af8ec72482ecd194ff92fac9974521aa5ad8a46abfc4f0ba858775 + gpu_ids: + oneOf: + - type: array + items: + $ref: "#/components/schemas/GpuID" + - type: "null" + description: > + An array of GPU identifiers or `null` if this information is not yet available + required: + - id + - status + - termination_reason + - termination_message + - ports + - container_name + - container_id + - gpu_ids + additionalProperties: false + + TaskSubmitRequest: + title: shim.api.TaskSubmitRequest + description: Same as `shim.TaskConfig` + type: object + properties: + id: + $ref: "#/components/schemas/TaskID" + name: + type: string + description: Task name. Used to construct unique container name + examples: + - horrible-mule-1-0-0 + registry_username: + type: string + default: "" + description: Private container registry username + examples: + - registry-user + registry_password: + type: string + default: "" + description: Private container registry password + examples: + - registry-token + image_name: + type: string + examples: + - ubuntu:22.04 + container_user: + type: string + default: root + description: > + If not set, the default image user is used. As of 0.18.24, `dstack` always uses `root` + examples: + - root + privileged: + type: boolean + default: false + description: Start container in privileged mode + gpu: + type: integer + minimum: -1 + default: 0 + description: > + Number of GPUs allocated for the container. A special value `-1` means "all available, + even if none", `0` means "zero GPUs" + cpu: + type: number + minimum: 0 + default: 0 + description: > + Amount of CPU resources available to the container. A special value `0` means "all". + Fractional values are allowed, e.g., `1.5` — one and a half CPUs + memory: + type: number + minimum: 0 + default: 0 + description: > + Amount of memory available to the container, in bytes. 
A special value `0` means "all" + shm_size: + type: integer + minimum: 0 + default: 0 + description: > + POSIX shared memory, bytes. A special value `0` means "use the default value (64MiB)". + If > 0, tmpfs is mounted with the `exec` option, unlike the default mount options + examples: + - 1073741824 + network_mode: + allOf: + - $ref: "#/components/schemas/NetworkMode" + - default: host + volumes: + type: array + items: + $ref: "#/components/schemas/VolumeInfo" + default: [] + volume_mounts: + type: array + items: + $ref: "#/components/schemas/VolumeMountPoint" + default: [] + instance_mounts: + type: array + items: + $ref: "#/components/schemas/InstanceMountPoint" + default: [] + gpu_devices: + type: array + items: + $ref: "#/components/schemas/GPUDevice" + default: [] + host_ssh_user: + type: string + default: "" + description: > + Instance (host) user for SSH access, either directly (`ssh {run_name}-host`) + or for `ProxyJump`ing inside the container. Ignored if `host_ssh_keys` is not set + examples: + - root + host_ssh_keys: + type: array + items: + type: string + default: [] + description: > + SSH public keys for access to the instance (host). If set, the keys will be added + to the `host_ssh_user`'s `~/.ssh/authorized_keys` when the run starts and removed + when the run exits. + examples: + - "ssh-ed25519 me@laptop" + container_ssh_keys: + type: array + items: + type: string + default: [] + description: > + SSH public keys for `container_user`. 
As of 0.18.24, `dstack` submits two keys: + project key (generated by the server) and user key (either generated by + the CLI client or provided by the user) + examples: + - ["ssh-rsa project@dstack", "ssh-ed25519 me@laptop"] + required: + - id + - name + - image_name + + TaskTerminateRequest: + title: shim.api.TaskTerminateRequest + type: object + properties: + termination_reason: + allOf: + - $ref: "#/components/schemas/TerminationReason" + - examples: + - terminated_by_user + - terminated_by_server + default: "" + termination_message: + type: string + default: "" + timeout: + type: integer + default: 0 + description: > + Seconds to wait before killing the container. If zero, kill + the container immediately (no graceful shutdown) + + responses: + TaskInfo: + description: Task info + content: + application/json: + schema: + $ref: "#/components/schemas/TaskInfoResponse" + + PlainTextOk: + description: "" + content: + text/plain: + schema: + type: string + examples: + - OK + + PlainTextBadRequest: + description: "" + content: + text/plain: + schema: + type: string + examples: + - bad request + + PlainTextNotFound: + description: "" + content: + text/plain: + schema: + type: string + examples: + - not found + + PlainTextConflict: + description: "" + content: + text/plain: + schema: + type: string + examples: + - conflict + + PlainTextInternalError: + description: "" + content: + text/plain: + schema: + type: string + examples: + - internal error diff --git a/runner/go.mod b/runner/go.mod index 9c133eacde..f338e78edd 100644 --- a/runner/go.mod +++ b/runner/go.mod @@ -1,43 +1,49 @@ module github.com/dstackai/dstack/runner -go 1.21 - -toolchain go1.21.9 +go 1.25 require ( + github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b github.com/alexellis/go-execute/v2 v2.2.1 github.com/bluekeyes/go-gitdiff v0.7.2 - github.com/creack/pty v1.1.21 + github.com/codeclysm/extract/v4 v4.0.0 + github.com/creack/pty v1.1.24 github.com/docker/docker v26.0.0+incompatible 
github.com/docker/go-connections v0.5.0 + github.com/docker/go-units v0.5.0 + github.com/dstackai/ansistrip v0.0.6 github.com/go-git/go-git/v5 v5.12.0 github.com/golang/gddo v0.0.0-20210115222349-20d68f94ee1f - github.com/icza/backscanner v0.0.0-20240328210400-b40c3a86dec5 + github.com/gorilla/websocket v1.5.1 github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf + github.com/prometheus/procfs v0.15.1 + github.com/shirou/gopsutil/v4 v4.24.11 github.com/sirupsen/logrus v1.9.3 - github.com/stretchr/testify v1.9.0 - github.com/urfave/cli/v2 v2.27.1 - github.com/ztrue/tracerr v0.4.0 + github.com/stretchr/testify v1.11.1 + github.com/urfave/cli/v3 v3.6.1 golang.org/x/crypto v0.22.0 + golang.org/x/sys v0.26.0 + kernel.org/pub/linux/libs/security/libcap/cap v1.2.77 ) require ( dario.cat/mergo v1.0.0 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.0.0 // indirect + github.com/bits-and-blooms/bitset v1.22.0 // indirect github.com/cloudflare/circl v1.3.7 // indirect github.com/containerd/log v0.1.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/docker/go-units v0.5.0 // indirect + github.com/ebitengine/purego v0.8.1 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.5.0 // indirect github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/h2non/filetype v1.1.3 // indirect @@ -45,6 +51,7 @@ require ( github.com/juju/errors v1.0.0 // indirect github.com/kevinburke/ssh_config 
v1.2.0 // indirect github.com/klauspost/compress v1.17.8 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/term v0.5.0 // indirect github.com/morikuni/aec v1.0.0 // indirect @@ -53,13 +60,15 @@ require ( github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect - github.com/shirou/gopsutil/v3 v3.24.3 // indirect github.com/skeema/knownhosts v1.2.2 // indirect + github.com/tidwall/btree v1.7.0 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect github.com/ulikunitz/xz v0.5.12 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect - github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0 // indirect go.opentelemetry.io/otel v1.25.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.25.0 // indirect @@ -69,16 +78,12 @@ require ( golang.org/x/mod v0.17.0 // indirect golang.org/x/net v0.24.0 // indirect golang.org/x/sync v0.7.0 // indirect - golang.org/x/sys v0.19.0 // indirect + golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.20.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240401170217-c3f982113cda // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect gopkg.in/warnings.v0 v0.1.2 // indirect - gotest.tools/v3 v3.5.0 // indirect -) - -require ( - github.com/codeclysm/extract/v3 v3.1.1 - github.com/gorilla/websocket v1.5.1 gopkg.in/yaml.v3 v3.0.1 // indirect + gotest.tools/v3 v3.5.1 
// indirect + kernel.org/pub/linux/libs/security/libcap/psx v1.2.77 // indirect ) diff --git a/runner/go.sum b/runner/go.sum index 27a3efb41f..655ea59dc0 100644 --- a/runner/go.sum +++ b/runner/go.sum @@ -7,16 +7,20 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b h1:FL0NJYUNMX1ezl2Dv0azgedHPBXDuqHnqGDtqj6aqZM= +github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b/go.mod h1:cA0Bv7+JtAd8sqCCZizhAQjj4+Z47x/d8KD60iYBT+g= github.com/ProtonMail/go-crypto v1.0.0 h1:LRuvITjQWX+WIfr930YHG2HNfjR1uOfyf5vE0kC2U78= github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0= github.com/alexellis/go-execute/v2 v2.2.1 h1:4Ye3jiCKQarstODOEmqDSRCqxMHLkC92Bhse743RdOI= github.com/alexellis/go-execute/v2 v2.2.1/go.mod h1:FMdRnUTiFAmYXcv23txrp3VYZfLo24nMpiIneWgKHTQ= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= -github.com/arduino/go-paths-helper v1.2.0 h1:qDW93PR5IZUN/jzO4rCtexiwF8P4OIcOmcSgAYLZfY4= -github.com/arduino/go-paths-helper v1.2.0/go.mod h1:HpxtKph+g238EJHq4geEPv9p+gl3v5YYu35Yb+w31Ck= +github.com/arduino/go-paths-helper v1.12.1 h1:WkxiVUxBjKWlLMiMuYy8DcmVrkxdP7aKxQOAq7r2lVM= +github.com/arduino/go-paths-helper v1.12.1/go.mod h1:jcpW4wr0u69GlXhTYydsdsqAjLaYK5n7oWHfKqOG6LM= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= 
+github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= +github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bluekeyes/go-gitdiff v0.7.2 h1:42jrcVZdjjxXtVsFNYTo/I6T1ZvIiQL+iDDLiH904hw= github.com/bluekeyes/go-gitdiff v0.7.2/go.mod h1:QpfYYO1E0fTVHVZAZKiRjtSGY9823iCdvGXBcEzHGbM= github.com/bradfitz/gomemcache v0.0.0-20170208213004-1952afaa557d/go.mod h1:PmM6Mmwb0LSuEubjR8N7PtNe1KxZLtOUHtbeikc5h60= @@ -26,14 +30,12 @@ github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyY github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA= github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= -github.com/codeclysm/extract/v3 v3.1.1 h1:iHZtdEAwSTqPrd+1n4jfhr1qBhUWtHlMTjT90+fJVXg= -github.com/codeclysm/extract/v3 v3.1.1/go.mod h1:ZJi80UG2JtfHqJI+lgJSCACttZi++dHxfWuPaMhlOfQ= +github.com/codeclysm/extract/v4 v4.0.0 h1:H87LFsUNaJTu2e/8p/oiuiUsOK/TaPQ5wxsjPnwPEIY= +github.com/codeclysm/extract/v4 v4.0.0/go.mod h1:SFju1lj6as7FvUgalpSct7torJE0zttbJUWtryPRG6s= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= -github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.21 h1:1/QdRyBaHHJP61QkWMXlOIBfsgdDeeKfK8SYVUWJKf0= -github.com/creack/pty v1.1.21/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= 
github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -47,6 +49,10 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dstackai/ansistrip v0.0.6 h1:6qqeDNWt8NoqfkY1CxKUvdHpJzBl89LOE3wMwptVpaI= +github.com/dstackai/ansistrip v0.0.6/go.mod h1:w3ejXI0twxDv6bPXhkOaPeYdbwz2nwcrcvFoZGqi9F0= +github.com/ebitengine/purego v0.8.1 h1:sdRKd6plj7KYW33EH5As6YKfe8m9zbN9JMrOjNVF/BE= +github.com/ebitengine/purego v0.8.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a h1:mATvB/9r/3gvcejNsXKSkQ6lcIaNec2nyfOdlTBR2lU= github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= @@ -70,6 +76,7 @@ github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-stack/stack v1.6.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -83,7 +90,6 @@ github.com/golang/protobuf v1.3.1/go.mod 
h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/snappy v0.0.0-20170215233205-553a64147049/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.1.1-0.20171103154506-982329095285/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/googleapis/gax-go v2.0.0+incompatible/go.mod h1:SFVmujtThgffbyetf+mdk2eWhX2bMyUtNHzFKcPA9HY= @@ -95,10 +101,6 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.0/go.mod h1:qmOFXW2epJhM0qSnUUYp github.com/h2non/filetype v1.1.3 h1:FKkx9QbD7HR/zjK1Ia5XiBsq9zdLi5Kf3zGyFTAFkGg= github.com/h2non/filetype v1.1.3/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= github.com/hashicorp/hcl v0.0.0-20170914154624-68e816d1c783/go.mod h1:oZtUIOe8dh44I2q6ScRibXws4Ajl+d+nod3AaR9vL5w= -github.com/icza/backscanner v0.0.0-20240328210400-b40c3a86dec5 h1:FcxwOojw6pUiPpsf7Q6Fw/pI+7cR6FlapLBEGV/902A= -github.com/icza/backscanner v0.0.0-20240328210400-b40c3a86dec5/go.mod h1:GYeBD1CF7AqnKZK+UCytLcY3G+UKo0ByXX/3xfdNyqQ= -github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6 h1:8UsGZ2rr2ksmEru6lToqnXgA8Mz1DP11X4zSJ159C3k= -github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6/go.mod h1:xQig96I1VNBDIWGCdTt54nHt6EeI639SmHycLYL7FkA= github.com/inconshreveable/log15 v0.0.0-20170622235902-74a0988b5f80/go.mod h1:cOaXtrgN4ScfRrD9Bre7U1thNq5RtJ8ZoP4iXVGRj6o= github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf h1:FtEj8sfIcaaBfAKrE1Cwb61YDtYq9JxChK1c7AKce7s= github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4= @@ -120,6 +122,7 @@ github.com/kr/pty v1.1.1/go.mod 
h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/magiconair/properties v1.7.4-0.20170902060319-8d7837e64d3c/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mattn/go-colorable v0.0.10-0.20170816031813-ad5389df28cd/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= @@ -144,17 +147,16 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= -github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 
h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= -github.com/shirou/gopsutil/v3 v3.24.3 h1:eoUGJSmdfLzJ3mxIhmOAhgKEKgQkeOwKpz1NbhVnuPE= -github.com/shirou/gopsutil/v3 v3.24.3/go.mod h1:JpND7O217xa72ewWz9zN2eIIkPWsDN/3pl0H8Qt0uwg= -github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= -github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= +github.com/shirou/gopsutil/v4 v4.24.11 h1:WaU9xqGFKvFfsUv94SXcUPD7rCkU0vr/asVdQOBZNj8= +github.com/shirou/gopsutil/v4 v4.24.11/go.mod h1:s4D/wg+ag4rG0WO7AiTj2BeYCRhym0vM7DHbZRxnIT8= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= @@ -166,33 +168,28 @@ github.com/spf13/jwalterweatherman v0.0.0-20170901151539-12bd96e66386/go.mod h1: github.com/spf13/pflag v1.0.1-0.20170901120850-7aff26db30c1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/viper v1.0.0/go.mod h1:A8kyI5cUJhb8N+3pkfONlcEcZbueH6nhAm0Fq7SrnBM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/btree v1.7.0 h1:L1fkJH/AuEh5zBnnBbmTwQ5Lt+bRJ5A8EWecslvo9iI= +github.com/tidwall/btree v1.7.0/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/urfave/cli/v2 v2.27.1 h1:8xSQ6szndafKVRmfyeUMxkNUJQMjL1F2zmsZ+qHpfho= -github.com/urfave/cli/v2 v2.27.1/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ= +github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= +github.com/urfave/cli/v3 v3.6.1/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= -github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 h1:+qGGcbkzsfDQNPPe9UDgpxAWQrhbbBXOYJFQDq/dtJw= -github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913/go.mod h1:4aEEwZQutDLsQv2Deui4iYQ6DWTxR14g6m8Wv88+Xqk= github.com/yuin/goldmark 
v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= -github.com/ztrue/tracerr v0.4.0 h1:vT5PFxwIGs7rCg9ZgJ/y0NmOpJkPCPFK8x0vVIYzd04= -github.com/ztrue/tracerr v0.4.0/go.mod h1:PaFfYlas0DfmXNpo7Eay4MFhZUONqvXM+T2HyGPpngk= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0 h1:cEPbyTSEHlQR89XVlyo78gqluF8Y3oMeBkXGWzQsfXY= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0/go.mod h1:DKdbWcT4GH1D0Y3Sqt/PFXt2naRKDWtU+eE6oLdFNA8= go.opentelemetry.io/otel v1.25.0 h1:gldB5FfhRl7OJQbUHt/8s0a7cE8fbsPAtdpRaApKy4k= @@ -265,9 +262,8 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -285,8 +281,9 @@ golang.org/x/text v0.7.0/go.mod 
h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/time v0.0.0-20170424234030-8be79e1e0910 h1:bCMaBn7ph495H+x72gEvgcv+mDRd9dElbzo/mVCMxX4= golang.org/x/time v0.0.0-20170424234030-8be79e1e0910/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= +golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -322,5 +319,9 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.5.0 h1:Ljk6PdHdOhAb5aDMWXjDLMMhph+BpztA4v1QdqEW2eY= -gotest.tools/v3 v3.5.0/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= +gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= +gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.77 h1:iQtQTjFUOcTT19fI8sTCzYXsjeVs56et3D8AbKS2Uks= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.77/go.mod h1:oV+IO8kGh0B7TxErbydDe2+BRmi9g/W0CkpVV+QBTJU= +kernel.org/pub/linux/libs/security/libcap/psx v1.2.77 h1:Z06sMOzc0GNCwp6efaVrIrz4ywGJ1v+DP0pjVkOfDuA= +kernel.org/pub/linux/libs/security/libcap/psx 
v1.2.77/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24= diff --git a/runner/internal/api/common.go b/runner/internal/api/common.go deleted file mode 100644 index 7c4ceba8ae..0000000000 --- a/runner/internal/api/common.go +++ /dev/null @@ -1,126 +0,0 @@ -package api - -import ( - "encoding/json" - "errors" - "fmt" - "io" - "net/http" - "strings" - - "github.com/dstackai/dstack/runner/internal/log" - "github.com/golang/gddo/httputil/header" -) - -type Error struct { - Status int - Err error - Msg string -} - -func (e *Error) Error() string { - return e.Err.Error() -} - -func DecodeJSONBody(w http.ResponseWriter, r *http.Request, dst interface{}, allowUnknown bool) error { - // From https://www.alexedwards.net/blog/how-to-properly-parse-a-json-request-body - if r.Header.Get("Content-Type") != "" { - value, _ := header.ParseValueAndParams(r.Header, "Content-Type") - msg := "Content-Type header is not application/json" - if value != "application/json" { - return &Error{Status: http.StatusUnsupportedMediaType, Msg: msg} - } - } - - r.Body = http.MaxBytesReader(w, r.Body, 1*1024*1024) - - dec := json.NewDecoder(r.Body) - if !allowUnknown { - dec.DisallowUnknownFields() - } - - err := dec.Decode(&dst) - if err != nil { - var syntaxError *json.SyntaxError - var unmarshalTypeError *json.UnmarshalTypeError - - switch { - case errors.As(err, &syntaxError): - msg := fmt.Sprintf("Request body contains badly-formed JSON (at position %d)", syntaxError.Offset) - return &Error{Status: http.StatusBadRequest, Msg: msg} - - case errors.Is(err, io.ErrUnexpectedEOF): - msg := "Request body contains badly-formed JSON" - return &Error{Status: http.StatusBadRequest, Msg: msg} - - case errors.As(err, &unmarshalTypeError): - msg := fmt.Sprintf("Request body contains an invalid value for the %q field (at position %d)", unmarshalTypeError.Field, unmarshalTypeError.Offset) - return &Error{Status: http.StatusBadRequest, Msg: msg} - - case 
strings.HasPrefix(err.Error(), "json: unknown field "): - fieldName := strings.TrimPrefix(err.Error(), "json: unknown field ") - msg := fmt.Sprintf("Request body contains unknown field %s", fieldName) - return &Error{Status: http.StatusBadRequest, Msg: msg} - - case errors.Is(err, io.EOF): - msg := "Request body must not be empty" - return &Error{Status: http.StatusBadRequest, Msg: msg} - - case err.Error() == "http: request body too large": - msg := "Request body must not be larger than 1MB" - return &Error{Status: http.StatusRequestEntityTooLarge, Msg: msg} - - default: - return err - } - } - - err = dec.Decode(&struct{}{}) - if !errors.Is(err, io.EOF) { - msg := "Request body must only contain a single JSON object" - return &Error{Status: http.StatusBadRequest, Msg: msg} - } - - return nil -} - -func JSONResponseHandler(method string, handler func(http.ResponseWriter, *http.Request) (interface{}, error)) func(http.ResponseWriter, *http.Request) { - return func(w http.ResponseWriter, r *http.Request) { - status := 200 - msg := "" - var body interface{} - var err error - var apiErr *Error - - if r.Method == method { - body, err = handler(w, r) - } else { - body = nil - err = &Error{Status: http.StatusMethodNotAllowed, Err: nil} - } - - if err != nil { - if errors.As(err, &apiErr) { - status = apiErr.Status - msg = apiErr.Msg - log.Warning(r.Context(), "API error", "err", apiErr.Err) - } else { - status = http.StatusInternalServerError - log.Error(r.Context(), "Unexpected API error", "err", err) - } - } - - if status != 500 && body != nil { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(status) - _ = json.NewEncoder(w).Encode(body) - } else { - if msg == "" { - msg = http.StatusText(status) - } - http.Error(w, msg, status) - } - - log.Debug(r.Context(), "", "method", r.Method, "endpoint", r.URL.Path, "status", status) - } -} diff --git a/runner/internal/common/api/api.go b/runner/internal/common/api/api.go new file mode 100644 index 
0000000000..85cab57164 --- /dev/null +++ b/runner/internal/common/api/api.go @@ -0,0 +1,134 @@ +package api + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/golang/gddo/httputil/header" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +type Error struct { + Status int + Err error + Msg string +} + +func (e *Error) Error() string { + if e.Msg != "" { + return e.Msg + } + if e.Err != nil { + return e.Err.Error() + } + return http.StatusText(e.Status) +} + +type Router struct { + *http.ServeMux +} + +func (r *Router) AddHandler(method string, pattern string, handler func(http.ResponseWriter, *http.Request) (interface{}, error)) { + r.HandleFunc(fmt.Sprintf("%s %s", method, pattern), JSONResponseHandler(handler)) +} + +func NewRouter() Router { + return Router{http.NewServeMux()} +} + +func DecodeJSONBody(w http.ResponseWriter, r *http.Request, dst interface{}, allowUnknown bool) error { + // From https://www.alexedwards.net/blog/how-to-properly-parse-a-json-request-body + if r.Header.Get("Content-Type") != "" { + value, _ := header.ParseValueAndParams(r.Header, "Content-Type") + msg := "Content-Type header is not application/json" + if value != "application/json" { + return &Error{Status: http.StatusUnsupportedMediaType, Msg: msg} + } + } + + r.Body = http.MaxBytesReader(w, r.Body, 1*1024*1024) + + dec := json.NewDecoder(r.Body) + if !allowUnknown { + dec.DisallowUnknownFields() + } + + err := dec.Decode(&dst) + if err != nil { + var syntaxError *json.SyntaxError + var unmarshalTypeError *json.UnmarshalTypeError + + switch { + case errors.As(err, &syntaxError): + msg := fmt.Sprintf("Request body contains badly-formed JSON (at position %d)", syntaxError.Offset) + return &Error{Status: http.StatusBadRequest, Msg: msg} + + case errors.Is(err, io.ErrUnexpectedEOF): + msg := "Request body contains badly-formed JSON" + return &Error{Status: http.StatusBadRequest, Msg: msg} + + case 
errors.As(err, &unmarshalTypeError): + msg := fmt.Sprintf("Request body contains an invalid value for the %q field (at position %d)", unmarshalTypeError.Field, unmarshalTypeError.Offset) + return &Error{Status: http.StatusBadRequest, Msg: msg} + + case strings.HasPrefix(err.Error(), "json: unknown field "): + fieldName := strings.TrimPrefix(err.Error(), "json: unknown field ") + msg := fmt.Sprintf("Request body contains unknown field %s", fieldName) + return &Error{Status: http.StatusBadRequest, Msg: msg} + + case errors.Is(err, io.EOF): + msg := "Request body must not be empty" + return &Error{Status: http.StatusBadRequest, Msg: msg} + + case err.Error() == "http: request body too large": + msg := "Request body must not be larger than 1MB" + return &Error{Status: http.StatusRequestEntityTooLarge, Msg: msg} + + default: + return err + } + } + + err = dec.Decode(&struct{}{}) + if !errors.Is(err, io.EOF) { + msg := "Request body must only contain a single JSON object" + return &Error{Status: http.StatusBadRequest, Msg: msg} + } + + return nil +} + +func JSONResponseHandler(handler func(http.ResponseWriter, *http.Request) (interface{}, error)) func(http.ResponseWriter, *http.Request) { + return func(w http.ResponseWriter, r *http.Request) { + status := 200 + errMsg := "" + var apiErr *Error + + body, err := handler(w, r) + if err != nil { + if errors.As(err, &apiErr) { + status = apiErr.Status + errMsg = apiErr.Error() + log.Warning(r.Context(), "API error", "err", errMsg, "method", r.Method, "endpoint", r.URL.Path, "status", status) + } else { + status = http.StatusInternalServerError + log.Error(r.Context(), "Unexpected API error", "err", err, "method", r.Method, "endpoint", r.URL.Path, "status", status) + } + } else { + log.Trace(r.Context(), "", "method", r.Method, "endpoint", r.URL.Path, "status", status) + } + + if status != 500 && body != nil { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(body) + } 
else { + http.Error(w, errMsg, status) + } + } +} diff --git a/runner/internal/common/consts/consts.go b/runner/internal/common/consts/consts.go new file mode 100644 index 0000000000..99f405c29d --- /dev/null +++ b/runner/internal/common/consts/consts.go @@ -0,0 +1,43 @@ +package consts + +// A directory inside user's home used for dstack-related files +const DstackUserDir string = ".dstack" + +// Runner's log filenames +const ( + RunnerDefaultLogFileName = "default.log" + RunnerJobLogFileName = "job.log" + RunnerLogFileName = "runner.log" +) + +// 1. A fixed path inside the container +// 2. A default path on the host unless overridden via shim CLI +const RunnerBinaryPath = "/usr/local/bin/dstack-runner" + +// A fallback path on the host used if os.Executable() has failed +const ShimBinaryPath = "/usr/local/bin/dstack-shim" + +// Error-containing messages will be identified by this signature +const ExecutorFailedSignature = "Executor failed" + +// All the following are directories inside the container +const ( + // A directory where runner stores its files (logs, etc.) + // NOTE: RunnerRuntimeDir would be a more appropriate name, but it's called tempDir + // throughout runner's codebase + RunnerTempDir = "/tmp/runner" + // A directory for: + // 1. Files used by the runner and related components (e.g., sshd stores its config and log inside /dstack/ssh) + // 2. Files shared between users (e.g., sshd authorized_keys, MPI hostfile) + // The inner structure should be considered private and subject to change, the users should not make assumptions + // about its structure. + // The only way to access its content/paths should be via public environment variables such as DSTACK_MPI_HOSTFILE. 
+ RunnerDstackDir = "/dstack" +) + +const ( + RunnerHTTPPort = 10999 + RunnerSSHPort = 10022 +) + +const ShimLogFileName = "shim.log" diff --git a/runner/internal/common/gpu/gpu.go b/runner/internal/common/gpu/gpu.go new file mode 100644 index 0000000000..72ae83bb56 --- /dev/null +++ b/runner/internal/common/gpu/gpu.go @@ -0,0 +1,40 @@ +package gpu + +import ( + "errors" + "os" +) + +type GpuVendor string + +const ( + GpuVendorNone GpuVendor = "none" + GpuVendorNvidia GpuVendor = "nvidia" + GpuVendorAmd GpuVendor = "amd" + GpuVendorIntel GpuVendor = "intel" + GpuVendorTenstorrent GpuVendor = "tenstorrent" +) + +func GetGpuVendor() GpuVendor { + // FIXME: There might be errors other than os.ErrNotExist that are ignored silently. + // Propagate and log. + if _, err := os.Stat("/dev/kfd"); !errors.Is(err, os.ErrNotExist) { + return GpuVendorAmd + } + if _, err := os.Stat("/dev/nvidiactl"); !errors.Is(err, os.ErrNotExist) { + return GpuVendorNvidia + } + if _, err := os.Stat("/dev/accel"); !errors.Is(err, os.ErrNotExist) { + return GpuVendorIntel + } + if _, err := os.Stat("/dev/tenstorrent"); !errors.Is(err, os.ErrNotExist) { + return GpuVendorTenstorrent + } + if _, err := os.Stat("/dev/dxg"); !errors.Is(err, os.ErrNotExist) { + // WSL2 + if _, err := os.Stat("/usr/lib/wsl/lib/nvidia-smi"); !errors.Is(err, os.ErrNotExist) { + return GpuVendorNvidia + } + } + return GpuVendorNone +} diff --git a/runner/internal/common/interpolator.go b/runner/internal/common/interpolator.go deleted file mode 100644 index 7331141810..0000000000 --- a/runner/internal/common/interpolator.go +++ /dev/null @@ -1,68 +0,0 @@ -package common - -import ( - "context" - "fmt" - "strings" - - "github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" -) - -const ( - PatternOpening = "${{" - PatternClosing = "}}" -) - -type VariablesInterpolator struct { - Variables map[string]string -} - -func (vi *VariablesInterpolator) Add(namespace string, vars 
map[string]string) { - if vi.Variables == nil { - vi.Variables = make(map[string]string, len(vars)) - } - for k, v := range vars { - vi.Variables[fmt.Sprintf("%s.%s", namespace, k)] = v - } -} - -func (vi *VariablesInterpolator) Interpolate(ctx context.Context, s string) (string, error) { - log.Trace(ctx, "Interpolating", "s", s) - var sb strings.Builder - - start := 0 - for start < len(s) { - dollar := IndexWithOffset(s, "$", start) - if dollar == -1 || dollar == len(s)-1 { - sb.WriteString(s[start:]) - break - } - if s[dollar+1] == '$' { // $$ = escaped $ - sb.WriteString(s[start : dollar+1]) - start = dollar + 2 - continue - } - - opening := IndexWithOffset(s, PatternOpening, start) - if opening == -1 { - sb.WriteString(s[start:]) - break - } - sb.WriteString(s[start:opening]) - closing := IndexWithOffset(s, PatternClosing, opening) - if closing == -1 { - return "", gerrors.Newf("no pattern closing: %s", s[opening:]) - } - - name := strings.TrimSpace(s[opening+len(PatternOpening) : closing]) - value, ok := vi.Variables[name] - if ok { - sb.WriteString(value) - } else { - log.Warning(ctx, "Variable is missing", "name", name) - } - start = closing + len(PatternClosing) - } - return sb.String(), nil -} diff --git a/runner/internal/common/interpolator_test.go b/runner/internal/common/interpolator_test.go deleted file mode 100644 index e14a248744..0000000000 --- a/runner/internal/common/interpolator_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package common - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestPlainText(t *testing.T) { - var vi VariablesInterpolator - s := "plain text" - result, err := vi.Interpolate(context.Background(), s) - assert.Equal(t, nil, err) - assert.Equal(t, s, result) -} - -func TestMissingVariable(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "${{ VAR_NAME }} is here") - assert.Equal(t, nil, err) - assert.Equal(t, " is here", result) -} - -func 
TestDollarEscape(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "it is not a variable $$!") - assert.Equal(t, nil, err) - assert.Equal(t, "it is not a variable $!", result) -} - -func TestDollarWithoutEscape(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "it is not a variable $!") - assert.Equal(t, nil, err) - assert.Equal(t, "it is not a variable $!", result) -} - -func TestEscapeOpening(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "$${{ VAR_NAME }}") - assert.Equal(t, nil, err) - assert.Equal(t, "${{ VAR_NAME }}", result) -} - -func TestWithoutClosing(t *testing.T) { - var vi VariablesInterpolator - _, err := vi.Interpolate(context.Background(), "the end ${{") - assert.NotEqual(t, nil, err) -} - -func TestUnexpectedEOL(t *testing.T) { - var vi VariablesInterpolator - _, err := vi.Interpolate(context.Background(), "the end ${{ VAR }") - assert.NotEqual(t, nil, err) -} - -func TestSecrets(t *testing.T) { - var vi VariablesInterpolator - vi.Add("secrets", map[string]string{"user": "qwerty"}) - result, err := vi.Interpolate(context.Background(), "${{ secrets.user }}") - assert.Equal(t, nil, err) - assert.Equal(t, "qwerty", result) -} diff --git a/runner/internal/log/log.go b/runner/internal/common/log/log.go similarity index 78% rename from runner/internal/log/log.go rename to runner/internal/common/log/log.go index 99478a8f9a..94749157a7 100644 --- a/runner/internal/log/log.go +++ b/runner/internal/common/log/log.go @@ -6,7 +6,6 @@ import ( "io" "os" - "github.com/dstackai/dstack/runner/internal/gerrors" "github.com/sirupsen/logrus" ) @@ -28,6 +27,31 @@ func NewEntry(out io.Writer, level int) *logrus.Entry { var DefaultEntry = NewEntry(os.Stderr, int(logrus.InfoLevel)) +// ParseLevel accepts the following values: +// * fatal, error, warn(ing), info, debug, trace, in any letter case +// * any digit in a 
range from 1 (fatal) to 6 (trace) +func ParseLevel(lvl string) (int, error) { + var level int + if len(lvl) == 1 && lvl[0] >= '0' && lvl[0] <= '9' { + level = int(lvl[0] - 48) + } else { + logrusLevel, err := logrus.ParseLevel(lvl) + if err != nil { + return 0, fmt.Errorf("invalid log level: %s", lvl) + } + level = int(logrusLevel) + } + if level < 1 || level > 6 { + return 0, fmt.Errorf("invalid log level: %s", lvl) + } + return level, nil +} + +func Fatal(ctx context.Context, msg string, args ...interface{}) { + logger := AppendArgs(GetLogger(ctx), args...) + logger.Fatal(msg) +} + func Error(ctx context.Context, msg string, args ...interface{}) { logger := AppendArgs(GetLogger(ctx), args...) logger.Error(msg) @@ -100,9 +124,9 @@ func GetLogger(ctx context.Context) *logrus.Entry { } func CreateAppendFile(path string) (*os.File, error) { - f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0o644) if err != nil { - return nil, gerrors.Wrap(err) + return nil, fmt.Errorf("open file: %w", err) } return f, nil } diff --git a/runner/internal/common/log/log_test.go b/runner/internal/common/log/log_test.go new file mode 100644 index 0000000000..340a0416bd --- /dev/null +++ b/runner/internal/common/log/log_test.go @@ -0,0 +1,63 @@ +package log + +import ( + "testing" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" +) + +func TestParseLevel(t *testing.T) { + tests := []struct { + name string + input string + want int + }{ + {name: "digit 1", input: "1", want: int(logrus.FatalLevel)}, + {name: "digit 2", input: "2", want: int(logrus.ErrorLevel)}, + {name: "digit 3", input: "3", want: int(logrus.WarnLevel)}, + {name: "digit 4", input: "4", want: int(logrus.InfoLevel)}, + {name: "digit 5", input: "5", want: int(logrus.DebugLevel)}, + {name: "digit 6", input: "6", want: int(logrus.TraceLevel)}, + {name: "fatal", input: "fatal", want: int(logrus.FatalLevel)}, + {name: 
"error", input: "error", want: int(logrus.ErrorLevel)}, + {name: "warn", input: "warn", want: int(logrus.WarnLevel)}, + {name: "warning", input: "warning", want: int(logrus.WarnLevel)}, + {name: "info", input: "info", want: int(logrus.InfoLevel)}, + {name: "debug", input: "debug", want: int(logrus.DebugLevel)}, + {name: "trace", input: "trace", want: int(logrus.TraceLevel)}, + {name: "uppercase", input: "INFO", want: int(logrus.InfoLevel)}, + {name: "mixed case", input: "Debug", want: int(logrus.DebugLevel)}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseLevel(tt.input) + require.NoError(t, err) + require.Equal(t, tt.want, got) + }) + } +} + +func TestParseLevelError(t *testing.T) { + tests := []struct { + name string + input string + }{ + {name: "empty", input: ""}, + {name: "unknown word", input: "verbose"}, + {name: "panic out of range", input: "panic"}, + {name: "digit 0 out of range", input: "0"}, + {name: "digit 7 out of range", input: "7"}, + {name: "digit 9 out of range", input: "9"}, + {name: "multi-digit", input: "10"}, + {name: "negative digit", input: "-1"}, + {name: "non-ascii digit", input: "౧"}, + {name: "digit with whitespace", input: "4 "}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := ParseLevel(tt.input) + require.Error(t, err) + }) + } +} diff --git a/runner/internal/common/string.go b/runner/internal/common/string.go deleted file mode 100644 index 28a5ae0756..0000000000 --- a/runner/internal/common/string.go +++ /dev/null @@ -1,11 +0,0 @@ -package common - -import "strings" - -func IndexWithOffset(hay string, needle string, start int) int { - idx := strings.Index(hay[start:], needle) - if idx < 0 { - return -1 - } - return start + idx -} diff --git a/runner/internal/common/types/types.go b/runner/internal/common/types/types.go new file mode 100644 index 0000000000..057c0248ca --- /dev/null +++ b/runner/internal/common/types/types.go @@ -0,0 +1,14 @@ +package types 
+ +type TerminationReason string + +const ( + TerminationReasonExecutorError TerminationReason = "executor_error" + TerminationReasonCreatingContainerError TerminationReason = "creating_container_error" + TerminationReasonContainerExitedWithError TerminationReason = "container_exited_with_error" + TerminationReasonDoneByRunner TerminationReason = "done_by_runner" + TerminationReasonTerminatedByUser TerminationReason = "terminated_by_user" + TerminationReasonTerminatedByServer TerminationReason = "terminated_by_server" + TerminationReasonMaxDurationExceeded TerminationReason = "max_duration_exceeded" + TerminationReasonLogQuotaExceeded TerminationReason = "log_quota_exceeded" +) diff --git a/runner/internal/common/utils/utils.go b/runner/internal/common/utils/utils.go new file mode 100644 index 0000000000..5bfc17d867 --- /dev/null +++ b/runner/internal/common/utils/utils.go @@ -0,0 +1,76 @@ +package utils + +import ( + "context" + "errors" + "os" + "path" + "slices" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +func PathExists(pth string) (bool, error) { + _, err := os.Stat(pth) + if err == nil { + return true, nil + } + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, err +} + +func RemoveIfExists(pth string) (bool, error) { + err := os.Remove(pth) + if err == nil { + return true, nil + } + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, err +} + +func ExpandPath(pth string, base string, home string) (string, error) { + pth = path.Clean(pth) + if pth == "~" { + return path.Clean(home), nil + } + if len(pth) >= 2 && pth[0] == '~' { + if pth[1] == '/' { + return path.Join(home, pth[2:]), nil + } + return "", errors.New("~username syntax is not supported") + } + if base != "" && !path.IsAbs(pth) { + return path.Join(base, pth), nil + } + return pth, nil +} + +func MkdirAll(ctx context.Context, pth string, uid int, gid int, perm os.FileMode) error { + paths := []string{pth} + for { + pth = 
path.Dir(pth) + if pth == "/" || pth == "." { + break + } + paths = append(paths, pth) + } + for _, p := range slices.Backward(paths) { + if _, err := os.Stat(p); errors.Is(err, os.ErrNotExist) { + if err := os.Mkdir(p, perm); err != nil { + return err + } + if uid != -1 || gid != -1 { + if err := os.Chown(p, uid, gid); err != nil { + log.Warning(ctx, "Failed to chown", "path", p, "err", err) + } + } + } else if err != nil { + return err + } + } + return nil +} diff --git a/runner/internal/common/utils/utils_test.go b/runner/internal/common/utils/utils_test.go new file mode 100644 index 0000000000..f38ac57925 --- /dev/null +++ b/runner/internal/common/utils/utils_test.go @@ -0,0 +1,157 @@ +package utils + +import ( + "context" + "os" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestExpandPath_NoPath_NoBase(t *testing.T) { + path, err := ExpandPath("", "", "") + require.NoError(t, err) + require.Equal(t, ".", path) +} + +func TestExpandPath_NoPath_RelBase(t *testing.T) { + testCases := []string{"repo", "./repo"} + for _, base := range testCases { + path, err := ExpandPath("", base, "") + require.NoError(t, err) + require.Equal(t, "repo", path) + } +} + +func TestExpandPath_NoPath_AbsBase(t *testing.T) { + path, err := ExpandPath("", "/repo", "") + require.NoError(t, err) + require.Equal(t, "/repo", path) +} + +func TestExpandtPath_RelPath_NoBase(t *testing.T) { + testCases := []string{"repo", "./repo"} + for _, pth := range testCases { + path, err := ExpandPath(pth, "", "") + require.NoError(t, err) + require.Equal(t, "repo", path) + } +} + +func TestExpandtPath_RelPath_RelBase(t *testing.T) { + path, err := ExpandPath("repo", "data", "") + require.NoError(t, err) + require.Equal(t, "data/repo", path) +} + +func TestExpandtPath_RelPath_AbsBase(t *testing.T) { + path, err := ExpandPath("repo", "/data", "") + require.NoError(t, err) + require.Equal(t, "/data/repo", path) +} + +func TestExpandtPath_AbsPath_NoBase(t *testing.T) { + path, err 
:= ExpandPath("/repo", "", "") + require.NoError(t, err) + require.Equal(t, "/repo", path) +} + +func TestExpandtPath_AbsPath_RelBase(t *testing.T) { + path, err := ExpandPath("/repo", "data", "") + require.NoError(t, err) + require.Equal(t, "/repo", path) +} + +func TestExpandtPath_AbsPath_AbsBase(t *testing.T) { + path, err := ExpandPath("/repo", "/data", "") + require.NoError(t, err) + require.Equal(t, "/repo", path) +} + +func TestExpandPath_BareTilde_NoHome(t *testing.T) { + path, err := ExpandPath("~", "", "") + require.NoError(t, err) + require.Equal(t, ".", path) +} + +func TestExpandPath_BareTilde_RelHome(t *testing.T) { + path, err := ExpandPath("~", "", "user") + require.NoError(t, err) + require.Equal(t, "user", path) +} + +func TestExpandPath_BareTilde_AbsHome(t *testing.T) { + path, err := ExpandPath("~", "", "/home/user") + require.NoError(t, err) + require.Equal(t, "/home/user", path) +} + +func TestExpandtPath_TildeWithPath_NoHome(t *testing.T) { + path, err := ExpandPath("~/repo", "", "") + require.NoError(t, err) + require.Equal(t, "repo", path) +} + +func TestExpandtPath_TildeWithPath_RelHome(t *testing.T) { + path, err := ExpandPath("~/repo", "", "user") + require.NoError(t, err) + require.Equal(t, "user/repo", path) +} + +func TestExpandtPath_TildeWithPath_AbsHome(t *testing.T) { + path, err := ExpandPath("~/repo", "", "/home/user") + require.NoError(t, err) + require.Equal(t, "/home/user/repo", path) +} + +func TestExpandtPath_ErrorTildeUsernameNotSupported_BareTildeUsername(t *testing.T) { + path, err := ExpandPath("~username", "", "") + require.ErrorContains(t, err, "~username syntax is not supported") + require.Equal(t, "", path) +} + +func TestExpandtPath_ErrorTildeUsernameNotSupported_TildeUsernameWithPath(t *testing.T) { + path, err := ExpandPath("~username/repo", "", "") + require.ErrorContains(t, err, "~username syntax is not supported") + require.Equal(t, "", path) +} + +func TestMkdirAll_AbsPath_NotExists(t *testing.T) { + absPath 
:= path.Join(t.TempDir(), "a/b/c") + require.NoDirExists(t, absPath) + err := MkdirAll(context.Background(), absPath, -1, -1, 0o755) + require.NoError(t, err) + require.DirExists(t, absPath) +} + +func TestMkdirAll_AbsPath_Exists(t *testing.T) { + absPath, err := os.Getwd() + require.NoError(t, err) + err = MkdirAll(context.Background(), absPath, -1, -1, 0o755) + require.NoError(t, err) + require.DirExists(t, absPath) +} + +func TestMkdirAll_RelPath_NotExists(t *testing.T) { + cwd := t.TempDir() + os.Chdir(cwd) + relPath := "a/b/c" + absPath := path.Join(cwd, relPath) + require.NoDirExists(t, absPath) + err := MkdirAll(context.Background(), relPath, -1, -1, 0o755) + require.NoError(t, err) + require.DirExists(t, absPath) +} + +func TestMkdirAll_RelPath_Exists(t *testing.T) { + cwd := t.TempDir() + os.Chdir(cwd) + relPath := "a/b/c" + absPath := path.Join(cwd, relPath) + err := os.MkdirAll(absPath, 0o755) + require.NoError(t, err) + err = MkdirAll(context.Background(), relPath, -1, -1, 0o755) + require.NoError(t, err) + require.DirExists(t, absPath) +} diff --git a/runner/internal/executor/base.go b/runner/internal/executor/base.go deleted file mode 100644 index bf3eeb2916..0000000000 --- a/runner/internal/executor/base.go +++ /dev/null @@ -1,22 +0,0 @@ -package executor - -import ( - "context" - - "github.com/dstackai/dstack/runner/internal/schemas" -) - -type Executor interface { - GetHistory(timestamp int64) *schemas.PullResponse - GetJobLogsHistory() []schemas.LogEvent - GetRunnerState() string - Run(ctx context.Context) error - SetCodePath(codePath string) - SetJob(job schemas.SubmitBody) - SetJobState(ctx context.Context, state string) - SetRunnerState(state string) - Lock() - RLock() - RUnlock() - Unlock() -} diff --git a/runner/internal/executor/exec.go b/runner/internal/executor/exec.go deleted file mode 100644 index ae974864da..0000000000 --- a/runner/internal/executor/exec.go +++ /dev/null @@ -1,32 +0,0 @@ -package executor - -import ( - "fmt" - "os" - 
"path/filepath" - "strings" - - "github.com/dstackai/dstack/runner/internal/gerrors" -) - -func makeEnv(homeDir string, mappings ...map[string]string) []string { - list := os.Environ() - for _, mapping := range mappings { - for key, value := range mapping { - list = append(list, fmt.Sprintf("%s=%s", key, value)) - } - } - list = append(list, fmt.Sprintf("HOME=%s", homeDir)) - return list -} - -func joinRelPath(rootDir string, path string) (string, error) { - if filepath.IsAbs(path) { - return "", gerrors.New("path must be relative") - } - targetPath := filepath.Join(rootDir, path) - if !strings.HasPrefix(targetPath, rootDir) { - return "", gerrors.New("path is outside of the root directory") - } - return targetPath, nil -} diff --git a/runner/internal/executor/exec_test.go b/runner/internal/executor/exec_test.go deleted file mode 100644 index 841f4e6b13..0000000000 --- a/runner/internal/executor/exec_test.go +++ /dev/null @@ -1,27 +0,0 @@ -package executor - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestJoinRelPath(t *testing.T) { - base := "/tmp/repo" - var err error - var res string - - res, err = joinRelPath(base, ".") - assert.NoError(t, err) - assert.Equal(t, "/tmp/repo", res) - - _, err = joinRelPath(base, "..") - assert.Error(t, err) - - res, err = joinRelPath(base, "task") - assert.NoError(t, err) - assert.Equal(t, "/tmp/repo/task", res) - - _, err = joinRelPath(base, "/tmp/repo/task") - assert.Error(t, err) -} diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go deleted file mode 100644 index 49573f1fd5..0000000000 --- a/runner/internal/executor/executor.go +++ /dev/null @@ -1,338 +0,0 @@ -package executor - -import ( - "context" - "errors" - "fmt" - "io" - "os" - "os/exec" - "path/filepath" - "strconv" - "strings" - "sync" - "syscall" - "time" - - "github.com/creack/pty" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/consts/states" - 
"github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" -) - -type RunExecutor struct { - tempDir string - homeDir string - workingDir string - - run schemas.RunSpec - jobSpec schemas.JobSpec - clusterInfo schemas.ClusterInfo - secrets map[string]string - repoCredentials *schemas.RepoCredentials - codePath string - - mu *sync.RWMutex - state string - jobStateHistory []schemas.JobStateEvent - jobLogs *appendWriter - runnerLogs *appendWriter - timestamp *MonotonicTimestamp - - killDelay time.Duration -} - -func NewRunExecutor(tempDir string, homeDir string, workingDir string) *RunExecutor { - mu := &sync.RWMutex{} - timestamp := NewMonotonicTimestamp() - return &RunExecutor{ - tempDir: tempDir, - homeDir: homeDir, - workingDir: workingDir, - - mu: mu, - state: WaitSubmit, - jobStateHistory: make([]schemas.JobStateEvent, 0), - jobLogs: newAppendWriter(mu, timestamp), - runnerLogs: newAppendWriter(mu, timestamp), - timestamp: timestamp, - - killDelay: 10 * time.Second, - } -} - -// Run must be called after SetJob and SetCodePath -func (ex *RunExecutor) Run(ctx context.Context) (err error) { - runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) - if err != nil { - ex.SetJobState(ctx, states.Failed) - return gerrors.Wrap(err) - } - defer func() { _ = runnerLogFile.Close() }() - - jobLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerJobLogFileName)) - if err != nil { - ex.SetJobState(ctx, states.Failed) - return gerrors.Wrap(err) - } - defer func() { _ = jobLogFile.Close() }() - - defer func() { - // recover goes after runnerLogFile.Close() to keep the log - if r := recover(); r != nil { - log.Error(ctx, "Executor PANIC", "err", r) - ex.SetJobState(ctx, states.Failed) - err = gerrors.Newf("recovered: %v", r) - } - // no more logs will be written after this - ex.mu.Lock() - 
ex.SetRunnerState(WaitLogsFinished) - ex.mu.Unlock() - }() - defer func() { - if err != nil { - // TODO: refactor error handling and logs - log.Error(ctx, consts.ExecutorFailedSignature, "err", err) - } - }() - - logger := io.MultiWriter(runnerLogFile, os.Stdout, ex.runnerLogs) - ctx = log.WithLogger(ctx, log.NewEntry(logger, int(log.DefaultEntry.Logger.Level))) // todo loglevel - log.Info(ctx, "Run job", "log_level", log.GetLogger(ctx).Logger.Level.String()) - - if err := ex.setupRepo(ctx); err != nil { - ex.SetJobState(ctx, states.Failed) - return gerrors.Wrap(err) - } - cleanupCredentials, err := ex.setupCredentials(ctx) - if err != nil { - ex.SetJobState(ctx, states.Failed) - return gerrors.Wrap(err) - } - defer cleanupCredentials() - - // var gatewayControl *gateway.SSHControl - //if ex.run.Configuration.Type == "service" { - // log.Info(ctx, "Forwarding service port to the gateway", "hostname", ex.jobSpec.Gateway.Hostname) - // gatewayControl, err = gateway.NewSSHControl(ex.jobSpec.Gateway.Hostname, ex.jobSpec.Gateway.SSHKey) - // if err != nil { - // ex.SetJobState(ctx, states.Failed) - // return gerrors.Wrap(err) - // } - // defer gatewayControl.Cleanup() - // if err = gatewayControl.Publish(strconv.Itoa(ex.jobSpec.Gateway.ServicePort), ex.jobSpec.Gateway.SockPath); err != nil { - // ex.SetJobState(ctx, states.Failed) - // return gerrors.Wrap(err) - // } - // log.Info(ctx, "SSH tunnel established", "sock_path", ex.jobSpec.Gateway.SockPath, "service_port", ex.jobSpec.Gateway.ServicePort) - //} - - ex.SetJobState(ctx, states.Running) - timeoutCtx := ctx - var cancelTimeout context.CancelFunc - if ex.jobSpec.MaxDuration != 0 { - timeoutCtx, cancelTimeout = context.WithTimeout(ctx, time.Duration(ex.jobSpec.MaxDuration)*time.Second) - defer cancelTimeout() - } - if err := ex.execJob(timeoutCtx, jobLogFile); err != nil { - select { - case <-ctx.Done(): - log.Error(ctx, "Job canceled") - ex.SetJobState(ctx, states.Terminated) - return gerrors.Wrap(err) - default: 
- } - - select { - case <-timeoutCtx.Done(): - log.Error(ctx, "Max duration exceeded", "max_duration", ex.jobSpec.MaxDuration) - ex.SetJobState(ctx, states.Terminated) - return gerrors.Wrap(err) - default: - } - - // todo fail reason? - log.Error(ctx, "Exec failed", "err", err) - ex.SetJobState(ctx, states.Failed) - return gerrors.Wrap(err) - } - - ex.SetJobState(ctx, states.Done) - return nil -} - -func (ex *RunExecutor) SetJob(body schemas.SubmitBody) { - ex.run = body.RunSpec - ex.jobSpec = body.JobSpec - ex.clusterInfo = body.ClusterInfo - ex.secrets = body.Secrets - ex.repoCredentials = body.RepoCredentials - ex.state = WaitCode -} - -func (ex *RunExecutor) SetCodePath(codePath string) { - ex.codePath = codePath - ex.state = WaitRun -} - -func (ex *RunExecutor) SetJobState(ctx context.Context, state string) { - ex.mu.Lock() - ex.jobStateHistory = append(ex.jobStateHistory, schemas.JobStateEvent{State: state, Timestamp: ex.timestamp.Next()}) - ex.mu.Unlock() - log.Info(ctx, "Job state changed", "new", state) -} - -func (ex *RunExecutor) SetRunnerState(state string) { - ex.state = state -} - -func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error { - node_rank := ex.jobSpec.JobNum - nodes_num := ex.jobSpec.JobsPerReplica - gpus_per_node_num := ex.clusterInfo.GPUSPerJob - gpus_num := nodes_num * gpus_per_node_num - - jobEnvs := map[string]string{ - "RUN_NAME": ex.run.RunName, // deprecated, remove in 0.19 - "REPO_ID": ex.run.RepoId, // deprecated, remove in 0.19 - "DSTACK_RUN_NAME": ex.run.RunName, - "DSTACK_REPO_ID": ex.run.RepoId, - "DSTACK_MASTER_NODE_IP": ex.clusterInfo.MasterJobIP, - "DSTACK_NODE_RANK": strconv.Itoa(node_rank), - "DSTACK_NODES_NUM": strconv.Itoa(nodes_num), - "DSTACK_GPUS_PER_NODE": strconv.Itoa(gpus_per_node_num), - "DSTACK_GPUS_NUM": strconv.Itoa(gpus_num), - } - - // Call buildLDLibraryPathEnv and update jobEnvs if no error occurs - newLDPath, err := buildLDLibraryPathEnv() - if err != nil { - log.Info(ctx, 
"Continuing without updating LD_LIBRARY_PATH") - } else { - jobEnvs["LD_LIBRARY_PATH"] = newLDPath - log.Info(ctx, "New LD_LIBRARY_PATH set", newLDPath) - } - - cmd := exec.CommandContext(ctx, ex.jobSpec.Commands[0], ex.jobSpec.Commands[1:]...) - cmd.Env = makeEnv(ex.homeDir, jobEnvs, ex.jobSpec.Env, ex.secrets) - cmd.Cancel = func() error { - // returns error on Windows - return gerrors.Wrap(cmd.Process.Signal(os.Interrupt)) - } - cmd.WaitDelay = ex.killDelay // kills the process if it doesn't exit in time - - if ex.jobSpec.WorkingDir != nil { - workingDir, err := joinRelPath(ex.workingDir, *ex.jobSpec.WorkingDir) - if err != nil { - return gerrors.Wrap(err) - } - cmd.Dir = workingDir - } - - log.Trace(ctx, "Starting exec", "cmd", cmd.String(), "working_dir", cmd.Dir, "env", cmd.Env) - - ptmx, err := pty.Start(cmd) - if err != nil { - return gerrors.Wrap(err) - } - defer func() { _ = ptmx.Close() }() - defer func() { _ = cmd.Wait() }() // release resources if copy fails - - logger := io.MultiWriter(jobLogFile, ex.jobLogs) - _, err = io.Copy(logger, ptmx) - if err != nil && !isPtyError(err) { - return gerrors.Wrap(err) - } - return gerrors.Wrap(cmd.Wait()) -} - -func (ex *RunExecutor) setupCredentials(ctx context.Context) (func(), error) { - if ex.repoCredentials == nil { - return func() {}, nil - } - switch ex.repoCredentials.Protocol { - case "ssh": - if ex.repoCredentials.PrivateKey == nil { - return nil, gerrors.New("private key is missing") - } - keyPath := filepath.Join(ex.homeDir, ".ssh/id_rsa") - if _, err := os.Stat(keyPath); err == nil { - return nil, gerrors.New("private key already exists") - } - if err := os.MkdirAll(filepath.Dir(keyPath), 0700); err != nil { - return nil, gerrors.Wrap(err) - } - log.Info(ctx, "Writing private key", "path", keyPath) - if err := os.WriteFile(keyPath, []byte(*ex.repoCredentials.PrivateKey), 0600); err != nil { - return nil, gerrors.Wrap(err) - } - return func() { - log.Info(ctx, "Removing private key", "path", keyPath) - 
_ = os.Remove(keyPath) - }, nil - case "https": - if ex.repoCredentials.OAuthToken == nil { - return func() {}, nil - } - hostsPath := filepath.Join(ex.homeDir, ".config/gh/hosts.yml") - if _, err := os.Stat(hostsPath); err == nil { - return nil, gerrors.New("hosts.yml file already exists") - } - if err := os.MkdirAll(filepath.Dir(hostsPath), 0700); err != nil { - return nil, gerrors.Wrap(err) - } - log.Info(ctx, "Writing OAuth token", "path", hostsPath) - ghHost := fmt.Sprintf("%s:\n oauth_token: \"%s\"\n", ex.run.RepoData.RepoHostName, *ex.repoCredentials.OAuthToken) - if err := os.WriteFile(hostsPath, []byte(ghHost), 0644); err != nil { - return nil, gerrors.Wrap(err) - } - return func() { - log.Info(ctx, "Removing OAuth token", "path", hostsPath) - _ = os.Remove(hostsPath) - }, nil - } - return nil, gerrors.Newf("unknown protocol %s", ex.repoCredentials.Protocol) -} - -func isPtyError(err error) bool { - /* read /dev/ptmx: input/output error */ - var e *os.PathError - return errors.As(err, &e) && e.Err == syscall.EIO -} - -func buildLDLibraryPathEnv() (string, error) { - // Execute shell command to get Python prefix - cmd := exec.Command("bash", "-i", "-c", "python3-config --prefix") - output, err := cmd.Output() - - if err != nil { - return "", fmt.Errorf("error executing command: %v", err) - } - - // Extract and trim the prefix path - prefixPath := strings.TrimSpace(string(output)) - - // Check if the prefix path exists - if _, err := os.Stat(prefixPath); os.IsNotExist(err) { - return "", fmt.Errorf("python prefix path does not exist: %s", prefixPath) - } - - // Construct the path to Python's shared libraries - sharedLibPath := fmt.Sprintf("%s/lib", prefixPath) - - // Get current LD_LIBRARY_PATH - currentLDPath := os.Getenv("LD_LIBRARY_PATH") - - // Append Python's shared library path if not already present - if !strings.Contains(currentLDPath, sharedLibPath) { - if currentLDPath == "" { - currentLDPath = sharedLibPath - } else { - currentLDPath = 
fmt.Sprintf("%s:%s", currentLDPath, sharedLibPath) - } - } - - return currentLDPath, nil -} diff --git a/runner/internal/executor/executor_test.go b/runner/internal/executor/executor_test.go deleted file mode 100644 index 93a3e4fa35..0000000000 --- a/runner/internal/executor/executor_test.go +++ /dev/null @@ -1,198 +0,0 @@ -package executor - -import ( - "archive/tar" - "bytes" - "context" - "fmt" - "io" - "os" - "path/filepath" - "testing" - "time" - - "github.com/dstackai/dstack/runner/internal/schemas" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// todo test get history - -func TestExecutor_WorkingDir(t *testing.T) { - var b bytes.Buffer - ex := makeTestExecutor(t) - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "pwd") - - err := ex.execJob(context.TODO(), io.Writer(&b)) - assert.NoError(t, err) - assert.Equal(t, ex.workingDir+"\r\n", b.String()) -} - -func TestExecutor_HomeDir(t *testing.T) { - var b bytes.Buffer - ex := makeTestExecutor(t) - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo ~") - - err := ex.execJob(context.TODO(), io.Writer(&b)) - assert.NoError(t, err) - assert.Equal(t, ex.homeDir+"\r\n", b.String()) -} - -func TestExecutor_NonZeroExit(t *testing.T) { - ex := makeTestExecutor(t) - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "ehco 1") // note: intentional misspelling - - err := ex.execJob(context.TODO(), io.Discard) - assert.Error(t, err) -} - -func TestExecutor_SSHCredentials(t *testing.T) { - key := "== ssh private key ==" - - var b bytes.Buffer - ex := makeTestExecutor(t) - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "cat ~/.ssh/id_rsa") - ex.repoCredentials = &schemas.RepoCredentials{ - Protocol: "ssh", - PrivateKey: &key, - } - - clean, err := ex.setupCredentials(context.TODO()) - defer clean() - require.NoError(t, err) - - err = ex.execJob(context.TODO(), io.Writer(&b)) - assert.NoError(t, err) - assert.Equal(t, key, b.String()) -} - -func TestExecutor_LocalRepo(t 
*testing.T) { - var b bytes.Buffer - ex := makeTestExecutor(t) - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "cat foo") - makeCodeTar(t, ex.codePath) - - err := ex.setupRepo(context.TODO()) - require.NoError(t, err) - - err = ex.execJob(context.TODO(), io.Writer(&b)) - assert.NoError(t, err) - assert.Equal(t, "bar\r\n", b.String()) -} - -func TestExecutor_Recover(t *testing.T) { - ex := makeTestExecutor(t) - ex.jobSpec.Commands = nil // cause a panic - makeCodeTar(t, ex.codePath) - - err := ex.Run(context.TODO()) - assert.ErrorContains(t, err, "recovered: ") -} - -/* Long tests */ - -func TestExecutor_MaxDuration(t *testing.T) { - if testing.Short() { - t.Skip() - } - - ex := makeTestExecutor(t) - ex.killDelay = 500 * time.Millisecond - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo 1 && sleep 2 && echo 2") - ex.jobSpec.MaxDuration = 1 // seconds - makeCodeTar(t, ex.codePath) - - err := ex.Run(context.TODO()) - assert.ErrorContains(t, err, "killed") -} - -func TestExecutor_RemoteRepo(t *testing.T) { - if testing.Short() { - t.Skip() - } - - var b bytes.Buffer - ex := makeTestExecutor(t) - ex.run.RepoData = schemas.RepoData{ - RepoType: "remote", - RepoHostName: "github.com", - RepoPort: 0, - RepoUserName: "dstackai", - RepoName: "dstack-examples", - RepoBranch: "main", - RepoHash: "2b83592e506ed6fe8e49f4eaa97c3866bc9402b1", - RepoConfigName: "Dstack Developer", - RepoConfigEmail: "developer@dstack.ai", - } - ex.jobSpec.Commands = append(ex.jobSpec.Commands, "git rev-parse HEAD && git config user.name && git config user.email") - err := os.WriteFile(ex.codePath, []byte{}, 0600) // empty diff - require.NoError(t, err) - - err = ex.setupRepo(context.TODO()) - require.NoError(t, err) - - err = ex.execJob(context.TODO(), io.Writer(&b)) - assert.NoError(t, err) - expected := fmt.Sprintf("%s\r\n%s\r\n%s\r\n", ex.run.RepoData.RepoHash, ex.run.RepoData.RepoConfigName, ex.run.RepoData.RepoConfigEmail) - assert.Equal(t, expected, b.String()) -} - -/* Helpers */ 
- -func makeTestExecutor(t *testing.T) *RunExecutor { - t.Helper() - baseDir, err := filepath.EvalSymlinks(t.TempDir()) - workingDir := "." - require.NoError(t, err) - - body := schemas.SubmitBody{ - RunSpec: schemas.RunSpec{ - RunName: "red-turtle-1", - RepoId: "test-000000", - RepoData: schemas.RepoData{RepoType: "local"}, - Configuration: schemas.Configuration{ - Type: "task", - }, - ConfigurationPath: ".dstack.yml", - }, - JobSpec: schemas.JobSpec{ - Commands: []string{"/bin/bash", "-c"}, - Env: make(map[string]string), - MaxDuration: 0, // no timeout - WorkingDir: &workingDir, - }, - Secrets: make(map[string]string), - RepoCredentials: &schemas.RepoCredentials{Protocol: "https"}, - } - - temp := filepath.Join(baseDir, "temp") - _ = os.Mkdir(temp, 0700) - home := filepath.Join(baseDir, "home") - _ = os.Mkdir(home, 0700) - repo := filepath.Join(baseDir, "repo") - _ = os.Mkdir(repo, 0700) - ex := NewRunExecutor(temp, home, repo) - ex.SetJob(body) - ex.SetCodePath(filepath.Join(baseDir, "code")) // note: create file before run - return ex -} - -func makeCodeTar(t *testing.T, path string) { - t.Helper() - file, err := os.Create(path) - require.NoError(t, err) - defer func() { _ = file.Close() }() - tw := tar.NewWriter(file) - - var files = []struct{ name, body string }{ - {"foo", "bar\n"}, - } - - for _, f := range files { - hdr := &tar.Header{Name: f.name, Mode: 0600, Size: int64(len(f.body))} - require.NoError(t, tw.WriteHeader(hdr)) - _, err := tw.Write([]byte(f.body)) - require.NoError(t, err) - } - require.NoError(t, tw.Close()) -} diff --git a/runner/internal/executor/logs.go b/runner/internal/executor/logs.go deleted file mode 100644 index 807071eeb9..0000000000 --- a/runner/internal/executor/logs.go +++ /dev/null @@ -1,32 +0,0 @@ -package executor - -import ( - "sync" - - "github.com/dstackai/dstack/runner/internal/schemas" -) - -type appendWriter struct { - mu *sync.RWMutex // shares with executor - history []schemas.LogEvent - timestamp 
*MonotonicTimestamp // shares with executor -} - -func newAppendWriter(mu *sync.RWMutex, timestamp *MonotonicTimestamp) *appendWriter { - return &appendWriter{ - mu: mu, - history: make([]schemas.LogEvent, 0), - timestamp: timestamp, - } -} - -func (w *appendWriter) Write(p []byte) (n int, err error) { - w.mu.Lock() - defer w.mu.Unlock() - - pCopy := make([]byte, len(p)) - copy(pCopy, p) - w.history = append(w.history, schemas.LogEvent{Message: pCopy, Timestamp: w.timestamp.Next()}) - - return len(p), nil -} diff --git a/runner/internal/executor/query.go b/runner/internal/executor/query.go deleted file mode 100644 index 6e7f2d2880..0000000000 --- a/runner/internal/executor/query.go +++ /dev/null @@ -1,41 +0,0 @@ -package executor - -import ( - "github.com/dstackai/dstack/runner/internal/schemas" -) - -func (ex *RunExecutor) GetJobLogsHistory() []schemas.LogEvent { - return ex.jobLogs.history -} - -func (ex *RunExecutor) GetHistory(timestamp int64) *schemas.PullResponse { - return &schemas.PullResponse{ - JobStates: eventsAfter(ex.jobStateHistory, timestamp), - JobLogs: eventsAfter(ex.jobLogs.history, timestamp), - RunnerLogs: eventsAfter(ex.runnerLogs.history, timestamp), - LastUpdated: ex.timestamp.GetLatest(), - HasMore: ex.state != WaitLogsFinished, - } -} - -func (ex *RunExecutor) GetRunnerState() string { - return ex.state -} - -type OrderedEvent interface { - GetTimestamp() int64 -} - -func eventsAfter[T OrderedEvent](events []T, timestamp int64) []T { - left := 0 - right := len(events) - for left < right { - mid := (left + right) / 2 - if events[mid].GetTimestamp() <= timestamp { - left = mid + 1 - } else { - right = mid - } - } - return events[left:] -} diff --git a/runner/internal/executor/repo.go b/runner/internal/executor/repo.go deleted file mode 100644 index e7339ee700..0000000000 --- a/runner/internal/executor/repo.go +++ /dev/null @@ -1,96 +0,0 @@ -package executor - -import ( - "context" - "os" - - "github.com/codeclysm/extract/v3" - 
"github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/repo" -) - -// setupRepo must be called from Run -func (ex *RunExecutor) setupRepo(ctx context.Context) error { - if _, err := os.Stat(ex.workingDir); err != nil { - if err = os.MkdirAll(ex.workingDir, 0777); err != nil { - return gerrors.Wrap(err) - } - } - switch ex.run.RepoData.RepoType { - case "remote": - log.Trace(ctx, "Fetching git repository") - if err := ex.prepareGit(ctx); err != nil { - return gerrors.Wrap(err) - } - case "local", "virtual": - log.Trace(ctx, "Extracting tar archive") - if err := ex.prepareArchive(ctx); err != nil { - return gerrors.Wrap(err) - } - default: - return gerrors.Newf("unknown RepoType: %s", ex.run.RepoData.RepoType) - } - return nil -} - -func (ex *RunExecutor) prepareGit(ctx context.Context) error { - repoManager := repo.NewManager(ctx, ex.run.RepoData.FormatURL(consts.REPO_HTTPS_URL), ex.run.RepoData.RepoBranch, ex.run.RepoData.RepoHash).WithLocalPath(ex.workingDir) - if ex.repoCredentials != nil { - log.Trace(ctx, "Credentials is not empty") - switch ex.repoCredentials.Protocol { - case "https": - log.Trace(ctx, "Select HTTPS protocol") - if ex.repoCredentials.OAuthToken == nil { - log.Warning(ctx, "OAuth token is empty") - break - } - repoManager.WithTokenAuth(*ex.repoCredentials.OAuthToken) - case "ssh": - log.Trace(ctx, "Select SSH protocol") - if ex.repoCredentials.PrivateKey == nil { - return gerrors.Newf("private key is empty") - } - repoManager = repo.NewManager(ctx, ex.run.RepoData.FormatURL(consts.REPO_GIT_URL), ex.run.RepoData.RepoBranch, ex.run.RepoData.RepoHash).WithLocalPath(ex.workingDir) - repoManager.WithSSHAuth(*ex.repoCredentials.PrivateKey, "") // we don't support passphrase - default: - return gerrors.Newf("unsupported remote repo protocol: %s", ex.repoCredentials.Protocol) - } - } else { - log.Trace(ctx, 
"Credentials is empty") - } - - log.Trace(ctx, "Checking out remote repo", "GIT URL", repoManager.URL()) - if err := repoManager.Checkout(); err != nil { - return gerrors.Wrap(err) - } - if err := repoManager.SetConfig(ex.run.RepoData.RepoConfigName, ex.run.RepoData.RepoConfigEmail); err != nil { - return gerrors.Wrap(err) - } - - log.Trace(ctx, "Applying diff") - repoDiff, err := os.ReadFile(ex.codePath) - if err != nil { - return err - } - if len(repoDiff) > 0 { - if err := repo.ApplyDiff(ctx, ex.workingDir, string(repoDiff)); err != nil { - return gerrors.Wrap(err) - } - } - return nil -} - -func (ex *RunExecutor) prepareArchive(ctx context.Context) error { - file, err := os.Open(ex.codePath) - if err != nil { - return gerrors.Wrap(err) - } - defer func() { _ = file.Close() }() - log.Trace(ctx, "Extracting code archive", "src", ex.codePath, "dst", ex.workingDir) - if err := extract.Tar(ctx, file, ex.workingDir, nil); err != nil { - return gerrors.Wrap(err) - } - return nil -} diff --git a/runner/internal/executor/states.go b/runner/internal/executor/states.go deleted file mode 100644 index cfa6dc15e7..0000000000 --- a/runner/internal/executor/states.go +++ /dev/null @@ -1,9 +0,0 @@ -package executor - -const ( - WaitSubmit = "wait_submit" - WaitCode = "wait_code" - WaitRun = "wait_run" - ServeLogs = "serve_logs" - WaitLogsFinished = "wait_logs_finished" -) diff --git a/runner/internal/executor/timestamp.go b/runner/internal/executor/timestamp.go deleted file mode 100644 index a9463c04cc..0000000000 --- a/runner/internal/executor/timestamp.go +++ /dev/null @@ -1,46 +0,0 @@ -package executor - -import ( - "context" - "sync" - "time" - - "github.com/dstackai/dstack/runner/internal/log" -) - -type MonotonicTimestamp struct { - unix int64 - counter int - mu sync.RWMutex -} - -func NewMonotonicTimestamp() *MonotonicTimestamp { - return &MonotonicTimestamp{ - unix: time.Now().Unix(), - counter: 0, - mu: sync.RWMutex{}, - } -} - -func (t *MonotonicTimestamp) GetLatest() 
int64 { - t.mu.RLock() - defer t.mu.RUnlock() - return t.unix*1000 + int64(t.counter) -} - -func (t *MonotonicTimestamp) Next() int64 { - // warning: time.Now() is not monotonic in general - t.mu.Lock() - now := time.Now().Unix() - if now == t.unix { - t.counter++ - if t.counter == 1000 { - log.Warning(context.TODO(), "Monotonic timestamp counter overflowed", "timestamp", now) - } - } else { - t.unix = now - t.counter = 0 - } - t.mu.Unlock() - return t.GetLatest() -} diff --git a/runner/internal/gateway/ssh.go b/runner/internal/gateway/ssh.go deleted file mode 100644 index f62677fc6d..0000000000 --- a/runner/internal/gateway/ssh.go +++ /dev/null @@ -1,92 +0,0 @@ -package gateway - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - - "github.com/dstackai/dstack/runner/internal/gerrors" -) - -type SSHControl struct { - keyPath string - controlPath string - hostname string - user string - localTempDir string -} - -func NewSSHControl(hostname, sshKey string) (*SSHControl, error) { - localTempDir, err := os.MkdirTemp("", "") - if err != nil { - return nil, gerrors.Wrap(err) - } - keyPath := filepath.Join(localTempDir, "id_rsa") - if err := os.WriteFile(keyPath, []byte(sshKey), 0o600); err != nil { - return nil, gerrors.Wrap(err) - } - c := &SSHControl{ - keyPath: keyPath, - controlPath: filepath.Join(localTempDir, "ssh.control"), - hostname: hostname, - user: "www-data", - localTempDir: localTempDir, - } - return c, gerrors.Wrap(err) -} - -func (c *SSHControl) exec(args []string, command string) ([]byte, error) { - allArgs := []string{ - "-i", c.keyPath, - "-o", "StrictHostKeyChecking=accept-new", - "-o", fmt.Sprintf("ControlPath=%s", c.controlPath), - "-o", "ControlMaster=auto", - "-o", "ControlPersist=yes", - "-o", "ServerAliveInterval=60", - } - if args != nil { - allArgs = append(allArgs, args...) 
- } - allArgs = append(allArgs, fmt.Sprintf("%s@%s", c.user, c.hostname)) - if command != "" { - allArgs = append(allArgs, command) - } - cmd := exec.Command("ssh", allArgs...) - - stdoutFile, err := os.CreateTemp("", "") - if err != nil { - panic(err) - } - defer func() { _ = os.Remove(stdoutFile.Name()) }() - stderrFile, err := os.CreateTemp("", "") - if err != nil { - panic(err) - } - defer func() { _ = os.Remove(stderrFile.Name()) }() - // OpenSSH 8.2 (on Ubuntu 20.04) doesn't close stdout/stderr when running in the background (-f option). - // Run command waits indefinitely for closing pipes, but exits immediately if we are using files. - cmd.Stdout = stdoutFile - cmd.Stderr = stderrFile - - if err := cmd.Run(); err != nil { - stderr, _ := os.ReadFile(stderrFile.Name()) - return nil, gerrors.Newf("ssh exec: %s", string(stderr)) - } - stdout, _ := os.ReadFile(stdoutFile.Name()) - return stdout, nil -} - -func (c *SSHControl) Publish(localPort, sockPath string) error { - _, err := c.exec([]string{ - "-f", "-N", - "-R", fmt.Sprintf("%s:localhost:%s", sockPath, localPort), - }, "") - return gerrors.Wrap(err) -} - -func (c *SSHControl) Cleanup() { - // todo cleanup remote - _ = exec.Command("ssh", "-F", "none", "-o", "ControlPath="+c.controlPath, "-O", "exit", c.hostname).Run() - _ = os.RemoveAll(c.localTempDir) -} diff --git a/runner/internal/gerrors/stacktrace.go b/runner/internal/gerrors/stacktrace.go deleted file mode 100644 index 102c7221fa..0000000000 --- a/runner/internal/gerrors/stacktrace.go +++ /dev/null @@ -1,72 +0,0 @@ -package gerrors - -import ( - "errors" - "fmt" - "path/filepath" - "runtime" - "strings" -) - -type withStack struct { - err error - pointFrame uintptr -} - -func (ws withStack) Error() string { - if ws.pointFrame == 0 { - return ws.err.Error() - } - - f := getFrame(ws.pointFrame) - if f.File == "" { - return "[unknown] " + ws.err.Error() - } - - _, file := filepath.Split(f.File) - l := fmt.Sprintf("%s:%d", file, f.Line) - if f.Function 
!= "" { - idx := strings.LastIndex(f.Function, "/") - l += " " + f.Function[idx+1:] - } - return fmt.Sprintf("[%s] %s", l, ws.err) -} - -func (ws withStack) Unwrap() error { - return ws.err -} - -func New(s string) error { - return withStack{ - err: errors.New(s), - pointFrame: pointFrame(), - } -} - -func Newf(format string, a ...interface{}) error { - return withStack{ - err: fmt.Errorf(format, a...), - pointFrame: pointFrame(), - } -} - -func Wrap(err error) error { - if err == nil { - return nil - } - return withStack{ - err: err, - pointFrame: pointFrame(), - } -} - -func pointFrame() uintptr { - pc := make([]uintptr, 1) - runtime.Callers(3, pc) - return pc[0] -} - -func getFrame(pc uintptr) *runtime.Frame { - f, _ := runtime.CallersFrames([]uintptr{pc}).Next() - return &f -} diff --git a/runner/internal/repo/manager.go b/runner/internal/repo/manager.go deleted file mode 100644 index 9a60ca8a1b..0000000000 --- a/runner/internal/repo/manager.go +++ /dev/null @@ -1,161 +0,0 @@ -package repo - -import ( - "context" - "fmt" - "os" - - "github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/go-git/go-git/v5" - "github.com/go-git/go-git/v5/plumbing" - "github.com/go-git/go-git/v5/plumbing/transport/http" - gitssh "github.com/go-git/go-git/v5/plumbing/transport/ssh" - "golang.org/x/crypto/ssh" -) - -type Manager struct { - ctx context.Context - localPath string - clo git.CloneOptions - hash string -} - -func NewManager(ctx context.Context, url, branch, hash string) *Manager { - ctx = log.AppendArgsCtx(ctx, "url", url, "branch", branch, "hash", hash) - m := &Manager{ - ctx: ctx, - clo: git.CloneOptions{ - URL: url, - RecurseSubmodules: git.DefaultSubmoduleRecursionDepth, - ReferenceName: plumbing.NewBranchReferenceName(branch), - SingleBranch: true, - }, - hash: hash, - } - - return m -} - -func (m *Manager) WithLocalPath(path string) *Manager { - m.localPath = path - m.ctx = log.AppendArgsCtx(m.ctx, 
"path", path) - return m -} - -// TODO: works with Github, possibly not with others -func (m *Manager) WithTokenAuth(token string) *Manager { - auth := &http.BasicAuth{ - Username: "anything", - Password: token, - } - m.clo.Auth = auth - return m -} - -func (m *Manager) WithSSHAuth(pem, password string) *Manager { - keys, err := gitssh.NewPublicKeys("git", []byte(pem), password) - if err != nil { - log.Warning(m.ctx, "fail to parse SSH private key", "err", err) - } else { - keys.HostKeyCallbackHelper.HostKeyCallback = ssh.InsecureIgnoreHostKey() - m.clo.Auth = keys - } - return m -} - -func (m *Manager) Checkout() error { - log.Info(m.ctx, "git checkout", "auth", fmt.Sprintf("%T", (&m.clo).Auth)) - if _, err := os.Stat(m.localPath); err == nil { - if err = os.RemoveAll(m.localPath); err != nil { - log.Error(m.ctx, "Failed clear directory") - } - } - ref, err := git.PlainClone(m.localPath, false, &m.clo) - if err != nil && err != git.ErrRepositoryAlreadyExists { - return err - } - if ref != nil { - branchRef, err := ref.Reference(m.clo.ReferenceName, true) - if err != nil { - return gerrors.Wrap(err) - } - var cho git.CheckoutOptions - if m.hash == "" || m.hash == branchRef.Hash().String() { - cho.Branch = m.clo.ReferenceName - } else { - cho.Hash = plumbing.NewHash(m.hash) - } - - workTree, err := ref.Worktree() - if err != nil { - return err - } - err = workTree.Checkout(&cho) - if err != nil { - return err - } - - } else { - log.Warning(m.ctx, "git clone ref==nil") - } - - return nil -} - -func (m *Manager) CheckoutBranch(branch string) error { - log.Info(m.ctx, "git checkout", "auth", fmt.Sprintf("%T", (&m.clo).Auth)) - ref, err := git.PlainClone(m.localPath, false, &m.clo) - if err != nil && err != git.ErrRepositoryAlreadyExists { - return err - } - if ref != nil { - workTree, err := ref.Worktree() - if err != nil { - return err - } - cho := git.CheckoutOptions{Branch: plumbing.NewBranchReferenceName(branch)} - err = workTree.Checkout(&cho) - if err != nil { - 
return err - } - } else { - log.Warning(m.ctx, "git clone ref==nil") - } - - return nil -} - -func (m *Manager) CheckoutMaster() error { - clo := git.CloneOptions{ - URL: m.clo.URL, - } - log.Info(m.ctx, "git checkout", "auth", fmt.Sprintf("%T", clo.Auth)) - _, err := git.PlainClone(m.localPath, false, &clo) - if err != nil { - return err - } - - return nil -} - -func (m *Manager) URL() string { - return m.clo.URL -} - -func (m *Manager) SetConfig(name, email string) error { - repo, err := git.PlainOpen(m.localPath) - if err != nil { - return gerrors.Wrap(err) - } - config, err := repo.Config() - if err != nil { - return gerrors.Wrap(err) - } - config.User.Name = name - config.User.Email = email - if err := repo.SetConfig(config); err != nil { - return gerrors.Wrap(err) - } - return nil -} diff --git a/runner/internal/runner/api/http.go b/runner/internal/runner/api/http.go index 1e154a9030..4919852aff 100644 --- a/runner/internal/runner/api/http.go +++ b/runner/internal/runner/api/http.go @@ -2,33 +2,51 @@ package api import ( "context" + "errors" + "fmt" "io" + "math" + "mime" + "mime/multipart" "net/http" - "os" - "path/filepath" "strconv" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/executor" - "github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/executor" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) +// TODO: set some reasonable value; (optional) make configurable +const maxBodySize = math.MaxInt64 + func (s *Server) healthcheckGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - s.executor.RLock() - defer s.executor.RUnlock() return &schemas.HealthcheckResponse{ Service: "dstack-runner", Version: 
s.version, }, nil } +func (s *Server) metricsGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + if s.metricsCollector == nil { + return nil, &api.Error{Status: http.StatusNotFound, Msg: "Metrics collector is not available"} + } + metrics, err := s.metricsCollector.GetSystemMetrics(r.Context()) + if err != nil { + return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} + } + return metrics, nil +} + +// submitPostHandler must be called first +// It's safe to call it more than once func (s *Server) submitPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { s.executor.Lock() defer s.executor.Unlock() state := s.executor.GetRunnerState() - if state != executor.WaitSubmit { + if state == executor.WaitRun { + log.Warning(r.Context(), "Job already submitted, submitting again", "current_state", state) + } else if state != executor.WaitSubmit { log.Warning(r.Context(), "Executor doesn't wait submit", "current_state", state) return nil, &api.Error{Status: http.StatusConflict} } @@ -38,53 +56,121 @@ func (s *Server) submitPostHandler(w http.ResponseWriter, r *http.Request) (inte log.Error(r.Context(), "Failed to decode submit body", "err", err) return nil, err } - // todo go-playground/validator s.executor.SetJob(body) - s.jobBarrierCh <- nil // notify server that job submitted + s.executor.SetRunnerState(executor.WaitRun) return nil, nil } -func (s *Server) uploadCodePostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { +// If uploadArchivePostHandler is called, it must be called after submitPostHandler and before runPostHandler +// It's safe to call it more than once with the same archive +func (s *Server) uploadArchivePostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { s.executor.Lock() defer s.executor.Unlock() - if s.executor.GetRunnerState() != executor.WaitCode { + if s.executor.GetRunnerState() != executor.WaitRun { return nil, &api.Error{Status: 
http.StatusConflict} } - r.Body = http.MaxBytesReader(w, r.Body, 10*1024*1024) - codePath := filepath.Join(s.tempDir, "code") // todo random name? - file, err := os.Create(codePath) + contentType := r.Header.Get("Content-Type") + if contentType == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing content-type header"} + } + mediaType, params, err := mime.ParseMediaType(contentType) + if err != nil { + return nil, fmt.Errorf("parse request content-type: %w", err) + } + if mediaType != "multipart/form-data" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: fmt.Sprintf("multipart/form-data expected, got %s", mediaType)} + } + boundary := params["boundary"] + if boundary == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing boundary"} + } + + r.Body = http.MaxBytesReader(w, r.Body, maxBodySize) + formReader := multipart.NewReader(r.Body, boundary) + part, err := formReader.NextPart() if err != nil { - return nil, gerrors.Wrap(err) + if errors.Is(err, io.EOF) { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty form"} + } + if isMaxBytesError(err) { + return nil, &api.Error{Status: http.StatusRequestEntityTooLarge} + } + return nil, fmt.Errorf("read multipart form: %w", err) + } + defer func() { _ = part.Close() }() + + fieldName := part.FormName() + if fieldName == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing field name"} } - defer func() { _ = file.Close() }() - if _, err = io.Copy(file, r.Body); err != nil { - if err.Error() == "http: request body too large" { + if fieldName != "archive" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: fmt.Sprintf("unexpected field %s", fieldName)} + } + archiveId := part.FileName() + if archiveId == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing file name"} + } + if err := s.executor.WriteFileArchive(archiveId, part); err != nil { + if isMaxBytesError(err) { return nil, 
&api.Error{Status: http.StatusRequestEntityTooLarge} } - return nil, gerrors.Wrap(err) + return nil, fmt.Errorf("write file archive: %w", err) + } + if _, err := formReader.NextPart(); !errors.Is(err, io.EOF) { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "extra form field(s)"} } - s.executor.SetCodePath(codePath) return nil, nil } -func (s *Server) runPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { +// If uploadCodePostHandler is called, it must be called after submitPostHandler and before runPostHandler +// It's safe to call it more than once +func (s *Server) uploadCodePostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { s.executor.Lock() defer s.executor.Unlock() if s.executor.GetRunnerState() != executor.WaitRun { return nil, &api.Error{Status: http.StatusConflict} } + r.Body = http.MaxBytesReader(w, r.Body, maxBodySize) + + if err := s.executor.WriteRepoBlob(r.Body); err != nil { + if isMaxBytesError(err) { + return nil, &api.Error{Status: http.StatusRequestEntityTooLarge} + } + return nil, fmt.Errorf("copy request body: %w", err) + } + + return nil, nil +} + +func (s *Server) runPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + s.executor.Lock() + if s.executor.GetRunnerState() != executor.WaitRun { + s.executor.Unlock() + return nil, &api.Error{Status: http.StatusConflict} + } + s.executor.SetRunnerState(executor.ServeLogs) + s.jobBarrierCh <- nil // notify server that job started + s.executor.Unlock() + var runCtx context.Context runCtx, s.cancelRun = context.WithCancel(context.Background()) + username, workingDir, err := s.executor.GetJobInfo(runCtx) go func() { _ = s.executor.Run(runCtx) // INFO: all errors are handled inside the Run() s.jobBarrierCh <- nil // notify server that job finished }() - s.executor.SetRunnerState(executor.ServeLogs) + + if err == nil { + return &schemas.JobInfoResponse{ + Username: username, + WorkingDir: workingDir, + }, nil + } return 
nil, nil } @@ -109,6 +195,10 @@ func (s *Server) pullGetHandler(w http.ResponseWriter, r *http.Request) (interfa func (s *Server) stopPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { s.stop() - return nil, nil } + +func isMaxBytesError(err error) bool { + var maxBytesError *http.MaxBytesError + return errors.As(err, &maxBytesError) +} diff --git a/runner/internal/runner/api/http_test.go b/runner/internal/runner/api/http_test.go deleted file mode 100644 index 98a6f51808..0000000000 --- a/runner/internal/runner/api/http_test.go +++ /dev/null @@ -1,48 +0,0 @@ -package api - -import ( - "context" - "net/http/httptest" - "strings" - "testing" - - common "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/shim" - "github.com/dstackai/dstack/runner/internal/shim/api" -) - -type DummyRunner struct { - State shim.RunnerStatus - ContainerStatus shim.ContainerStatus - JobResult shim.JobResult -} - -func (ds DummyRunner) GetState() (shim.RunnerStatus, shim.ContainerStatus, string, shim.JobResult) { - return ds.State, ds.ContainerStatus, "", ds.JobResult -} - -func (ds DummyRunner) Run(context.Context, shim.TaskConfig) error { - return nil -} - -func (ds DummyRunner) Stop(force bool) {} - -func TestHealthcheck(t *testing.T) { - request := httptest.NewRequest("GET", "/api/healthcheck", nil) - responseRecorder := httptest.NewRecorder() - - server := api.NewShimServer(":12345", DummyRunner{}, "0.0.1.dev2") - - f := common.JSONResponseHandler("GET", server.HealthcheckGetHandler) - f(responseRecorder, request) - - if responseRecorder.Code != 200 { - t.Errorf("Want status '%d', got '%d'", 200, responseRecorder.Code) - } - - expected := "{\"service\":\"dstack-shim\",\"version\":\"0.0.1.dev2\"}" - - if strings.TrimSpace(responseRecorder.Body.String()) != expected { - t.Errorf("Want '%s', got '%s'", expected, responseRecorder.Body.String()) - } -} diff --git a/runner/internal/runner/api/server.go 
b/runner/internal/runner/api/server.go index 9c469e0703..227ea1dbb3 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -4,98 +4,102 @@ import ( "context" "errors" "net/http" - "os" - "os/signal" - "syscall" + _ "net/http/pprof" "time" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/executor" - "github.com/dstackai/dstack/runner/internal/gerrors" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/executor" + "github.com/dstackai/dstack/runner/internal/runner/metrics" ) type Server struct { - srv *http.Server - tempDir string - workingDir string + srv *http.Server shutdownCh chan interface{} // server closes this chan on shutdown jobBarrierCh chan interface{} // only server listens on this chan pullDoneCh chan interface{} // Closed then /api/pull gave everything wsDoneCh chan interface{} // Closed then /logs_ws gave everything - submitWaitDuration time.Duration - logsWaitDuration time.Duration + startWaitDuration time.Duration + logsWaitDuration time.Duration executor executor.Executor cancelRun context.CancelFunc + metricsCollector *metrics.MetricsCollector + version string } -func NewServer(tempDir string, homeDir string, workingDir string, address string, version string) *Server { - mux := http.NewServeMux() +func NewServer(ctx context.Context, address string, version string, ex executor.Executor) (*Server, error) { + r := api.NewRouter() + + metricsCollector, err := metrics.NewMetricsCollector(ctx) + if err != nil { + log.Warning(ctx, "Metrics collector is not available", "err", err) + } + s := &Server{ srv: &http.Server{ Addr: address, - Handler: mux, + Handler: r, }, - tempDir: tempDir, - workingDir: workingDir, shutdownCh: make(chan interface{}), jobBarrierCh: make(chan interface{}), pullDoneCh: 
make(chan interface{}), wsDoneCh: make(chan interface{}), - submitWaitDuration: 2 * time.Minute, - logsWaitDuration: 30 * time.Second, + startWaitDuration: 5 * time.Minute, + logsWaitDuration: 5 * time.Minute, - executor: executor.NewRunExecutor(tempDir, homeDir, workingDir), + executor: ex, + + metricsCollector: metricsCollector, version: version, } - mux.HandleFunc("/api/healthcheck", api.JSONResponseHandler("GET", s.healthcheckGetHandler)) - mux.HandleFunc("/api/submit", api.JSONResponseHandler("POST", s.submitPostHandler)) - mux.HandleFunc("/api/upload_code", api.JSONResponseHandler("POST", s.uploadCodePostHandler)) - mux.HandleFunc("/api/run", api.JSONResponseHandler("POST", s.runPostHandler)) - mux.HandleFunc("/api/pull", api.JSONResponseHandler("GET", s.pullGetHandler)) - mux.HandleFunc("/api/stop", api.JSONResponseHandler("POST", s.stopPostHandler)) - mux.HandleFunc("/logs_ws", api.JSONResponseHandler("GET", s.logsWsGetHandler)) - return s + r.AddHandler("GET", "/api/healthcheck", s.healthcheckGetHandler) + r.AddHandler("GET", "/api/metrics", s.metricsGetHandler) + r.AddHandler("POST", "/api/submit", s.submitPostHandler) + r.AddHandler("POST", "/api/upload_archive", s.uploadArchivePostHandler) + r.AddHandler("POST", "/api/upload_code", s.uploadCodePostHandler) + r.AddHandler("POST", "/api/run", s.runPostHandler) + r.AddHandler("GET", "/api/pull", s.pullGetHandler) + r.AddHandler("POST", "/api/stop", s.stopPostHandler) + r.AddHandler("GET", "/logs_ws", s.logsWsGetHandler) + return s, nil } -func (s *Server) Run() error { - signals := []os.Signal{os.Interrupt, syscall.SIGTERM, syscall.SIGKILL, syscall.SIGQUIT} - signalCh := make(chan os.Signal, 1) - +func (s *Server) Run(ctx context.Context) error { go func() { if err := s.srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { - log.Error(context.TODO(), "Server failed", "err", err) + log.Error(ctx, "Server failed", "err", err) } }() - defer func() { _ = s.srv.Shutdown(context.TODO()) }() 
+ defer func() { _ = s.srv.Shutdown(ctx) }() select { case <-s.jobBarrierCh: // job started - case <-time.After(s.submitWaitDuration): - log.Error(context.TODO(), "Job didn't start in time, shutting down") - return gerrors.Newf("no job") + case <-time.After(s.startWaitDuration): + log.Error(ctx, "Job didn't start in time, shutting down") + return errors.New("no job submitted") + case <-ctx.Done(): + log.Error(ctx, "Received interrupt signal, shutting down") + return ctx.Err() } // todo timeout on code and run - signal.Notify(signalCh, signals...) select { - case <-signalCh: - log.Error(context.TODO(), "Received interrupt signal, shutting down") - s.stop() case <-s.jobBarrierCh: - log.Info(context.TODO(), "Job finished, shutting down") + log.Info(ctx, "Job finished, shutting down") + case <-ctx.Done(): + log.Error(ctx, "Received interrupt signal, shutting down") + s.stop() } close(s.shutdownCh) - signal.Reset(signals...) logsToWait := []struct { ch <-chan interface{} @@ -109,9 +113,9 @@ loop: for _, ch := range logsToWait { select { case <-ch.ch: - log.Info(context.TODO(), "Logs streaming finished", "endpoint", ch.name) + log.Info(ctx, "Logs streaming finished", "endpoint", ch.name) case <-waitLogsDone: - log.Error(context.TODO(), "Logs streaming didn't finish in time") + log.Error(ctx, "Logs streaming didn't finish in time") break loop // break the loop, not the select } } diff --git a/runner/internal/runner/api/submit_test.go b/runner/internal/runner/api/submit_test.go deleted file mode 100644 index b170c81be3..0000000000 --- a/runner/internal/runner/api/submit_test.go +++ /dev/null @@ -1,46 +0,0 @@ -//go:build !race - -package api - -import ( - "net/http/httptest" - "strings" - "testing" - - common "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/shim" - "github.com/dstackai/dstack/runner/internal/shim/api" -) - -func TestSubmit(t *testing.T) { - request := httptest.NewRequest("POST", "/api/submit", 
strings.NewReader("{\"image_name\":\"ubuntu\"}")) - responseRecorder := httptest.NewRecorder() - - dummyRunner := DummyRunner{} - dummyRunner.State = shim.Pending - - server := api.NewShimServer(":12340", &dummyRunner, "0.0.1.dev2") - - firstSubmitPost := common.JSONResponseHandler("POST", server.SubmitPostHandler) - firstSubmitPost(responseRecorder, request) - - if responseRecorder.Code != 200 { - t.Errorf("Want status '%d', got '%d'", 200, responseRecorder.Code) - } - - t.Logf("%v", responseRecorder.Result()) - - dummyRunner.State = shim.Pulling - - request = httptest.NewRequest("POST", "/api/submit", strings.NewReader("{\"image_name\":\"ubuntu\"}")) - responseRecorder = httptest.NewRecorder() - - secondSubmitPost := common.JSONResponseHandler("POST", server.SubmitPostHandler) - secondSubmitPost(responseRecorder, request) - - t.Logf("%v", responseRecorder.Result()) - - if responseRecorder.Code != 409 { - t.Errorf("Want status '%d', got '%d'", 409, responseRecorder.Code) - } -} diff --git a/runner/internal/runner/api/ws.go b/runner/internal/runner/api/ws.go index cade1170a2..3229701a68 100644 --- a/runner/internal/runner/api/ws.go +++ b/runner/internal/runner/api/ws.go @@ -2,13 +2,19 @@ package api import ( "context" + "errors" "net/http" "time" - "github.com/dstackai/dstack/runner/internal/log" "github.com/gorilla/websocket" + + "github.com/dstackai/dstack/runner/internal/common/log" ) +type logsWsRequestParams struct { + startTimestamp int64 +} + var upgrader = websocket.Upgrader{ CheckOrigin: func(r *http.Request) bool { return true @@ -20,39 +26,75 @@ func (s *Server) logsWsGetHandler(w http.ResponseWriter, r *http.Request) (inter if err != nil { return nil, err } + requestParams, err := parseRequestParams(r) + if err != nil { + _ = conn.WriteMessage( + websocket.CloseMessage, + websocket.FormatCloseMessage(websocket.CloseUnsupportedData, err.Error()), + ) + _ = conn.Close() + return nil, nil + } // todo memorize clientId? 
- go s.streamJobLogs(conn) + go s.streamJobLogs(r.Context(), conn, requestParams) return nil, nil } -func (s *Server) streamJobLogs(conn *websocket.Conn) { - currentPos := 0 +func parseRequestParams(r *http.Request) (logsWsRequestParams, error) { + query := r.URL.Query() + startTimeStr := query.Get("start_time") + var startTimestamp int64 + if startTimeStr != "" { + t, err := time.Parse(time.RFC3339, startTimeStr) + if err != nil { + return logsWsRequestParams{}, errors.New("failed to parse start_time value") + } + startTimestamp = t.Unix() + } + return logsWsRequestParams{startTimestamp: startTimestamp}, nil +} + +func (s *Server) streamJobLogs(ctx context.Context, conn *websocket.Conn, params logsWsRequestParams) { defer func() { _ = conn.WriteMessage(websocket.CloseMessage, nil) _ = conn.Close() }() - + currentPos := 0 + startTimestampMs := params.startTimestamp * 1000 + if startTimestampMs != 0 { + // TODO: Replace currentPos linear search with binary search + s.executor.RLock() + jobLogsWsHistory := s.executor.GetJobWsLogsHistory() + for _, logEntry := range jobLogsWsHistory { + if logEntry.Timestamp < startTimestampMs { + currentPos += 1 + } else { + break + } + } + s.executor.RUnlock() + } for { s.executor.RLock() - jobLogsHistory := s.executor.GetJobLogsHistory() + jobLogsWsHistory := s.executor.GetJobWsLogsHistory() select { case <-s.shutdownCh: - if currentPos >= len(jobLogsHistory) { + if currentPos >= len(jobLogsWsHistory) { s.executor.RUnlock() close(s.wsDoneCh) return } default: - if currentPos >= len(jobLogsHistory) { + if currentPos >= len(jobLogsWsHistory) { s.executor.RUnlock() time.Sleep(100 * time.Millisecond) continue } } - for currentPos < len(jobLogsHistory) { - if err := conn.WriteMessage(websocket.BinaryMessage, jobLogsHistory[currentPos].Message); err != nil { + for currentPos < len(jobLogsWsHistory) { + if err := conn.WriteMessage(websocket.BinaryMessage, jobLogsWsHistory[currentPos].Message); err != nil { s.executor.RUnlock() - 
log.Error(context.TODO(), "Failed to write message", "err", err) + log.Error(ctx, "failed to write message", "err", err) return } currentPos++ diff --git a/runner/internal/runner/connections/connections.go b/runner/internal/runner/connections/connections.go new file mode 100644 index 0000000000..4a56a6f172 --- /dev/null +++ b/runner/internal/runner/connections/connections.go @@ -0,0 +1,131 @@ +package connections + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/prometheus/procfs" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +const connStateEstablished = 1 + +type connection struct { + fromAddr string + fromPort uint64 +} + +type trackingInfo struct { + firstSeenAt time.Time +} + +type ConnectionTrackerConfig struct { + Port uint64 + MinConnDuration time.Duration + Procfs procfs.FS +} + +// Tracks TCP connections to a specified port. +type ConnectionTracker struct { + cfg ConnectionTrackerConfig + connections map[connection]trackingInfo + lastConnectionAt *time.Time + lastCheckedAt *time.Time + stopChan chan struct{} + mu sync.RWMutex +} + +func NewConnectionTracker(cfg ConnectionTrackerConfig) *ConnectionTracker { + tracker := ConnectionTracker{ + cfg: cfg, + connections: make(map[connection]trackingInfo), + lastConnectionAt: nil, + lastCheckedAt: nil, + stopChan: make(chan struct{}), + mu: sync.RWMutex{}, + } + return &tracker +} + +// Returns the number of seconds since the last connection was closed or +// since tracking started. If tracking hasn't started yet, returns 0. 
+func (t *ConnectionTracker) GetNoConnectionsSecs() int64 { + t.mu.RLock() + defer t.mu.RUnlock() + if t.lastConnectionAt == nil || t.lastCheckedAt == nil { + return 0 + } + return int64(t.lastCheckedAt.Sub(*t.lastConnectionAt).Seconds()) +} + +func (t *ConnectionTracker) Track(ticker <-chan time.Time) { + for { + select { + case now := <-ticker: + t.updateConnections(now) + case <-t.stopChan: + return + } + } +} + +func (t *ConnectionTracker) Stop() { + t.stopChan <- struct{}{} +} + +func (t *ConnectionTracker) updateConnections(now time.Time) { + currentConnections, err := t.getCurrentConnections() + if err != nil { + log.Error(context.TODO(), "Failed to retrieve connections: %v", err) + return + } + t.mu.Lock() + defer t.mu.Unlock() + // evict closed connections + for conn := range t.connections { + if _, ok := currentConnections[conn]; !ok { + delete(t.connections, conn) + } + } + // add new connections + for conn := range currentConnections { + if _, ok := t.connections[conn]; !ok { + t.connections[conn] = trackingInfo{firstSeenAt: now} + } + } + // update lastConnectionAt + for _, connInfo := range t.connections { + if now.Sub(connInfo.firstSeenAt) > t.cfg.MinConnDuration { + t.lastConnectionAt = &now + break + } + } + if t.lastConnectionAt == nil { // first call to updateConnections + t.lastConnectionAt = &now + } + t.lastCheckedAt = &now +} + +func (t *ConnectionTracker) getCurrentConnections() (map[connection]struct{}, error) { + connections := make(map[connection]struct{}) + netTCP, err := t.cfg.Procfs.NetTCP() + if err != nil { + return nil, fmt.Errorf("failed to retrieve IPv4 network connections: %w", err) + } + netTCP6, err := t.cfg.Procfs.NetTCP6() + if err != nil { + return nil, fmt.Errorf("failed to retrieve IPv6 network connections: %w", err) + } + for _, conn := range append(netTCP, netTCP6...) 
{ + if conn.LocalPort == t.cfg.Port && conn.St == connStateEstablished { + connections[connection{ + fromAddr: conn.RemAddr.String(), + fromPort: conn.RemPort, + }] = struct{}{} + } + } + return connections, nil +} diff --git a/runner/internal/runner/connections/connections_test.go b/runner/internal/runner/connections/connections_test.go new file mode 100644 index 0000000000..4d46fb5439 --- /dev/null +++ b/runner/internal/runner/connections/connections_test.go @@ -0,0 +1,94 @@ +package connections + +import ( + "io/fs" + "os" + "testing" + "time" + + "github.com/prometheus/procfs" + "github.com/stretchr/testify/assert" +) + +func TestConnectionTracker(t *testing.T) { + procfsDir := t.TempDir() + proc, err := procfs.NewFS(procfsDir) + assert.NoError(t, err) + err = os.Mkdir(procfsDir+"/net", os.ModePerm) + assert.NoError(t, err) + tracker := NewConnectionTracker(ConnectionTrackerConfig{ + Port: 4096, + MinConnDuration: 5 * time.Second, + Procfs: proc, + }) + ticker := make(chan time.Time) + // Open sockets on ports 53 and 4096 + established connection to port 53 (irrelevant) + noConnTcp := ` sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode + 0: 3500007F:0035 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 + 1: 00000000:1000 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 + 2: 3500007F:0035 0100007F:1234 01 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 +` + noConnTcp6 := ` sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode + 0: 00000000000000000000000000000000:0035 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 + 1: 00000000000000000000000000000000:1000 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 + 2: 
00000000000000000000000000000000:0035 00000000000000000000000001000000:1234 01 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0 +` + // Established connection to port 4096 (relevant) + connTcp := " 3: 00000000:1000 0100007F:4321 01 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0" + connTcp6 := " 3: 00000000000000000000000000000000:1000 00000000000000000000000001000000:4321 01 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0" + + // Tracking did not start yet + // Returns 0 secs + assert.Equal(t, int64(0), tracker.GetNoConnectionsSecs()) + + go tracker.Track(ticker) + defer tracker.Stop() + assert.Equal(t, int64(0), tracker.GetNoConnectionsSecs()) + + // There is a 2-second-old connection + // Returns 2 secs (the connection doesn't count as it's < MinConnDuration) + writeProcfs(t, procfsDir, noConnTcp+connTcp, noConnTcp6) + tick := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + ticker <- tick + wait() + tick = tick.Add(2 * time.Second) + ticker <- tick + wait() + assert.Equal(t, int64(2), tracker.GetNoConnectionsSecs()) + + // There is a 6-second-old connection + // Returns 0 secs (the connection is >= MinConnDuration) + tick = tick.Add(4 * time.Second) + ticker <- tick + wait() + assert.Equal(t, int64(0), tracker.GetNoConnectionsSecs()) + + // The connection is closed and there are no connections for 15 secs. 
+ // Returns 15 secs + writeProcfs(t, procfsDir, noConnTcp, noConnTcp6) + tick = tick.Add(15 * time.Second) + ticker <- tick + wait() + assert.Equal(t, int64(15), tracker.GetNoConnectionsSecs()) + + // There is a 7-second-old connection over IPv6 + // Returns 0 secs (the connection is >= MinConnDuration) + writeProcfs(t, procfsDir, noConnTcp, noConnTcp6+connTcp6) + tick = tick.Add(1 * time.Second) + ticker <- tick + tick = tick.Add(7 * time.Second) + ticker <- tick + wait() + assert.Equal(t, int64(0), tracker.GetNoConnectionsSecs()) +} + +func writeProcfs(t *testing.T, procfsDir, tcp, tcp6 string) { + err := os.WriteFile(procfsDir+"/net/tcp", []byte(tcp), fs.ModePerm) + assert.NoError(t, err) + err = os.WriteFile(procfsDir+"/net/tcp6", []byte(tcp6), fs.ModePerm) + assert.NoError(t, err) +} + +func wait() { + time.Sleep(30 * time.Millisecond) +} diff --git a/runner/internal/runner/executor/base.go b/runner/internal/runner/executor/base.go new file mode 100644 index 0000000000..bafe714bc9 --- /dev/null +++ b/runner/internal/runner/executor/base.go @@ -0,0 +1,39 @@ +package executor + +import ( + "context" + "io" + + "github.com/dstackai/dstack/runner/internal/common/types" + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +type Executor interface { + // It must be safe to call SetJob more than once + SetJob(job schemas.SubmitBody) + // It must be safe to call WriteFileArchive more than once with the same archive + WriteFileArchive(id string, src io.Reader) error + // It must be safe to call WriteRepoBlob more than once + WriteRepoBlob(src io.Reader) error + Run(ctx context.Context) error + + GetHistory(timestamp int64) *schemas.PullResponse + GetJobWsLogsHistory() []schemas.LogEvent + + GetRunnerState() string + SetRunnerState(state string) + + GetJobInfo(ctx context.Context) (username string, workingDir string, err error) + SetJobState(ctx context.Context, state schemas.JobState) + SetJobStateWithTerminationReason( + ctx context.Context, + state 
schemas.JobState, + terminationReason types.TerminationReason, + terminationMessage string, + ) + + Lock() + RLock() + RUnlock() + Unlock() +} diff --git a/runner/internal/runner/executor/env.go b/runner/internal/runner/executor/env.go new file mode 100644 index 0000000000..ff91a0c8c4 --- /dev/null +++ b/runner/internal/runner/executor/env.go @@ -0,0 +1,134 @@ +package executor + +import ( + "fmt" + "strings" +) + +type EnvMap map[string]string + +func (em EnvMap) Get(key string) string { + return em[key] +} + +func (em EnvMap) Update(src map[string]string, interpolate bool) { + for key, value := range src { + if interpolate { + value = interpolateVariables(value, em.Get) + } + em[key] = value + } +} + +func (em EnvMap) Render() []string { + var list []string + for key, value := range em { + list = append(list, fmt.Sprintf("%s=%s", key, value)) + } + return list +} + +func NewEnvMap(sources ...map[string]string) EnvMap { + em := make(EnvMap) + for _, src := range sources { + em.Update(src, false) + } + return em +} + +func ParseEnvList(list []string) EnvMap { + em := make(EnvMap) + for _, item := range list { + parts := strings.SplitN(item, "=", 2) + if len(parts) == 2 { + em[parts[0]] = parts[1] + } + } + return em +} + +// interpolateVariables expands variables as follows: +// `$VARNAME` -> literal `$VARNAME` (curly brackets are mandatory, bare $ means nothing) +// `${VARNAME}` -> getter("VARNAME") return value +// `$${VARNAME}` -> literal `${VARNAME}` +// `$$${VARNAME}` -> literal `$` + getter("VARNAME") return value +// `$$$${VARNAME}` -> literal `$${VARNAME}` +// `${no_closing_bracket`, `${0nonalphafirstchar}`, `${non-alphanum char}`, `${}` -> +// -> corresponding literal as is (only valid placeholder is treated specially requiring +// doubling $ to avoid interpolation, any non-valid syntax with `${` sequence is passed as is) +// See test cases for more examples +func interpolateVariables(s string, getter func(string) string) string { + // assuming that most 
strings don't contain vars, + // allocate the buffer the same size as input string + buf := make([]byte, 0, len(s)) + dollarCount := 0 + for i := 0; i < len(s); i++ { + switch char := s[i]; char { + case '$': + dollarCount += 1 + case '{': + name, w := getVariableName(s[i+1:]) + if name != "" { + // valid variable name, unescaping $ + for range dollarCount / 2 { + buf = append(buf, '$') + } + if dollarCount%2 != 0 { + // ${var} -> var_value, $$${var} -> $var_value + buf = append(buf, getter(name)...) + } else { + // $${var} -> ${var}, $$$${var} -> $${var} + buf = append(buf, s[i:i+w+1]...) + } + } else { + // not a valid variable name or unclosed ${}, keeping all $ as is + for range dollarCount { + buf = append(buf, '$') + } + buf = append(buf, s[i:i+w+1]...) + } + i += w + dollarCount = 0 + default: + // flush accumulated $, if any + for range dollarCount { + buf = append(buf, '$') + } + dollarCount = 0 + buf = append(buf, char) + } + } + // flush trailing $, if any + for range dollarCount { + buf = append(buf, '$') + } + return string(buf) +} + +func getVariableName(s string) (string, int) { + if len(s) < 2 { + return "", len(s) + } + if !isAlpha(s[0]) { + return "", 1 + } + var i int + for i = 1; i < len(s); i++ { + char := s[i] + if char == '}' { + return s[:i], i + 1 + } + if !isAlphaNum(char) { + return "", i + } + } + return "", i +} + +func isAlpha(c uint8) bool { + return c == '_' || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' +} + +func isAlphaNum(c uint8) bool { + return isAlpha(c) || '0' <= c && c <= '9' +} diff --git a/runner/internal/runner/executor/env_test.go b/runner/internal/runner/executor/env_test.go new file mode 100644 index 0000000000..10cfc25fb4 --- /dev/null +++ b/runner/internal/runner/executor/env_test.go @@ -0,0 +1,115 @@ +package executor + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func dummyGetter(s string) string { + return "" +} + +func TestInterpolateVariables_DollarEscape(t *testing.T) { + testCases := 
[]struct { + input, expected string + }{ + {"", ""}, + {"just a string", "just a string"}, + {"$ $$ $$$", "$ $$ $$$"}, + {"foo $notavar", "foo $notavar"}, + {"foo $$notavar", "foo $$notavar"}, + {"trailing$", "trailing$"}, + {"trailing$$", "trailing$$"}, + {"trailing${", "trailing${"}, + {"trailing$${", "trailing$${"}, + {"empty${}", "empty${}"}, + {"empty${}empty", "empty${}empty"}, + {"empty$${}empty", "empty$${}empty"}, + {"foo${notavar", "foo${notavar"}, + {"foo${notavar bar", "foo${notavar bar"}, + {"foo$${notavar", "foo$${notavar"}, + {"foo$${notavar bar", "foo$${notavar bar"}, + {"foo${!notavar}", "foo${!notavar}"}, + {"foo${!notavar}bar", "foo${!notavar}bar"}, + {"foo${not!a!var}", "foo${not!a!var}"}, + {"foo$${not!a!var}", "foo$${not!a!var}"}, + {"foo${not!a!var}bar", "foo${not!a!var}bar"}, + {"foo$${not!a!var}bar", "foo$${not!a!var}bar"}, + {"${0notavar}", "${0notavar}"}, + {"foo ${0notavar}bar", "foo ${0notavar}bar"}, + {"foo $$${0notavar}bar", "foo $$${0notavar}bar"}, + {"foo$${escaped}", "foo${escaped}"}, + {"foo$$$${escaped}bar", "foo$${escaped}bar"}, + {"${var}", ""}, + {"$$${var}", "$"}, + {"$$${var}$", "$$"}, + {"$$${var}$$", "$$$"}, + {"foo${var}bar", "foobar"}, + {"hi ${var_WITH_all_allowed_char_types_013}", "hi "}, + } + for _, tc := range testCases { + interpolated := interpolateVariables(tc.input, dummyGetter) + assert.Equal(t, tc.expected, interpolated) + } +} + +func TestEnvMapUpdate_Expand(t *testing.T) { + envMap := EnvMap{"PATH": "/bin:/sbin"} + envMap.Update(EnvMap{"PATH": "/opt/bin:${PATH}"}, true) + assert.Equal(t, EnvMap{"PATH": "/opt/bin:/bin:/sbin"}, envMap) +} + +func TestEnvMapUpdate_Expand_NoCurlyBrackets(t *testing.T) { + envMap := EnvMap{"PATH": "/bin:/sbin"} + envMap.Update(EnvMap{"PATH": "/opt/bin:$PATH"}, true) + assert.Equal(t, EnvMap{"PATH": "/opt/bin:$PATH"}, envMap) +} + +func TestEnvMapUpdate_Expand_MissingVar(t *testing.T) { + envMap := EnvMap{} + envMap.Update(EnvMap{"PATH": "/opt/bin:${PATH}"}, true) + 
assert.Equal(t, EnvMap{"PATH": "/opt/bin:"}, envMap) +} + +func TestEnvMapUpdate_Expand_VarLike(t *testing.T) { + envMap := EnvMap{} + envMap.Update(EnvMap{"TOKEN": "deadf00d${notavar ${$NOTaVAR}"}, true) + assert.Equal(t, EnvMap{"TOKEN": "deadf00d${notavar ${$NOTaVAR}"}, envMap) +} + +func TestEnvMapUpdate_Merge_NoExpand(t *testing.T) { + envMap := EnvMap{ + "VAR1": "var1_oldvalue", + "VAR2": "var2_value", + } + envMap.Update(map[string]string{ + "VAR1": "var1_newvalue", + "VAR3": "var3_${VAR2}", + }, false) + + expected := EnvMap{ + "VAR1": "var1_newvalue", + "VAR2": "var2_value", + "VAR3": "var3_${VAR2}", + } + assert.Equal(t, expected, envMap) +} + +func TestEnvMapUpdate_Merge_Expand(t *testing.T) { + envMap := EnvMap{ + "VAR1": "var1_oldvalue", + "VAR2": "var2_value", + } + envMap.Update(map[string]string{ + "VAR1": "var1_newvalue", + "VAR3": "var3_${VAR2}", + }, true) + + expected := EnvMap{ + "VAR1": "var1_newvalue", + "VAR2": "var2_value", + "VAR3": "var3_var2_value", + } + assert.Equal(t, expected, envMap) +} diff --git a/runner/internal/runner/executor/executor.go b/runner/internal/runner/executor/executor.go new file mode 100644 index 0000000000..31f3d7fe92 --- /dev/null +++ b/runner/internal/runner/executor/executor.go @@ -0,0 +1,838 @@ +package executor + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "net/url" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "github.com/creack/pty" + "github.com/dstackai/ansistrip" + "github.com/prometheus/procfs" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/types" + "github.com/dstackai/dstack/runner/internal/common/utils" + "github.com/dstackai/dstack/runner/internal/runner/connections" + cap "github.com/dstackai/dstack/runner/internal/runner/linux/capabilities" + 
linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" + "github.com/dstackai/dstack/runner/internal/runner/ssh" +) + +// TODO: Tune these parameters for optimal experience/performance +const ( + // Output is flushed when the cursor doesn't move for this duration + AnsiStripFlushInterval = 500 * time.Millisecond + + // Output is flushed regardless of cursor activity after this maximum delay + AnsiStripMaxDelay = 3 * time.Second + + // Maximum buffer size for ansistrip + MaxBufferSize = 32 * 1024 // 32KB +) + +type ConnectionTracker interface { + GetNoConnectionsSecs() int64 + Track(ticker <-chan time.Time) + Stop() +} + +type RunExecutor struct { + tempDir string + dstackDir string + currentUser linuxuser.User + sshd ssh.SshdManager + + fileArchiveDir string + repoBlobDir string + + runnerLogFile *os.File + runnerLogStripper *ansistrip.Writer + runnerLogger *logrus.Entry + + run schemas.Run + jobSpec schemas.JobSpec + jobSubmission schemas.JobSubmission + clusterInfo schemas.ClusterInfo + secrets map[string]string + repoCredentials *schemas.RepoCredentials + repoDir string + repoBlobPath string + // If the user is not specified in the JobSpec, jobUser should point to currentUser + jobUser *linuxuser.User + jobWorkingDir string + + mu *sync.RWMutex + state string + jobStateHistory []schemas.JobStateEvent + jobLogs *appendWriter + jobWsLogs *appendWriter + runnerLogs *appendWriter + timestamp *MonotonicTimestamp + + killDelay time.Duration + connectionTracker ConnectionTracker +} + +func NewRunExecutor(tempDir string, dstackDir string, currentUser linuxuser.User, sshd ssh.SshdManager) (*RunExecutor, error) { + mu := &sync.RWMutex{} + timestamp := NewMonotonicTimestamp() + + proc, err := procfs.NewDefaultFS() + if err != nil { + return nil, fmt.Errorf("initialize procfs: %w", err) + } + connectionTracker := connections.NewConnectionTracker(connections.ConnectionTrackerConfig{ + Port: 
uint64(sshd.Port()), + MinConnDuration: 10 * time.Second, // shorter connections are likely from dstack-server + Procfs: proc, + }) + + return &RunExecutor{ + tempDir: tempDir, + dstackDir: dstackDir, + currentUser: currentUser, + sshd: sshd, + + fileArchiveDir: filepath.Join(tempDir, "file_archives"), + repoBlobDir: filepath.Join(tempDir, "repo_blobs"), + + mu: mu, + state: WaitSubmit, + jobStateHistory: make([]schemas.JobStateEvent, 0), + jobLogs: newAppendWriter(mu, timestamp), + jobWsLogs: newAppendWriter(mu, timestamp), + runnerLogs: newAppendWriter(mu, timestamp), + timestamp: timestamp, + + killDelay: 10 * time.Second, + connectionTracker: connectionTracker, + }, nil +} + +// GetJobInfo must be called after SetJob +func (ex *RunExecutor) GetJobInfo(ctx context.Context) (string, string, error) { + // preRun() sets ex.jobUser and ex.jobWorkingDir + if err := ex.preRun(ctx); err != nil { + return "", "", err + } + return ex.jobUser.Username, ex.jobWorkingDir, nil +} + +// Run must be called after SetJob and WriteRepoBlob +func (ex *RunExecutor) Run(ctx context.Context) (err error) { + // If jobStateHistory is not empty, either Run() has already been called or + // preRun() has already been called via GetJobInfo() and failed + if len(ex.jobStateHistory) > 0 { + return errors.New("already running or finished") + } + if err := ex.preRun(ctx); err != nil { + return err + } + defer ex.postRun(ctx) + + jobLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerJobLogFileName)) + if err != nil { + ex.SetJobState(ctx, schemas.JobStateFailed) + return fmt.Errorf("create job log file: %w", err) + } + defer func() { _ = jobLogFile.Close() }() + + defer func() { + // recover goes after postRun(), which closes runnerLogFile, to keep the log + if r := recover(); r != nil { + log.Error(ctx, "Executor PANIC", "err", r) + ex.SetJobState(ctx, schemas.JobStateFailed) + err = fmt.Errorf("recovered: %v", r) + } + // no more logs will be written after this + 
ex.mu.Lock() + ex.SetRunnerState(WaitLogsFinished) + ex.mu.Unlock() + }() + defer func() { + if err != nil { + // TODO: refactor error handling and logs + log.Error(ctx, consts.ExecutorFailedSignature, "err", err) + } + }() + + ctx = log.WithLogger(ctx, ex.runnerLogger) + log.Info(ctx, "Run job") + + // setJobUser sets User.HomeDir to "/" if the original home dir is not set or not accessible, + // in that case we skip home dir provisioning + if ex.jobUser.HomeDir == "/" { + log.Info(ctx, "Skipping home dir provisioning") + } else { + // All home dir-related errors are considered non-fatal + cleanupGitCredentials, err := ex.setupGitCredentials(ctx) + if err != nil { + log.Error(ctx, "Failed to set up Git credentials", "err", err) + } else { + defer cleanupGitCredentials() + } + if err := ex.setupClusterSsh(ctx); err != nil { + log.Error(ctx, "Failed to set up cluster SSH", "err", err) + } + } + + if err := ex.setupRepo(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateFailed, + types.TerminationReasonContainerExitedWithError, + fmt.Sprintf("Failed to set up the repo (%s)", err), + ) + return fmt.Errorf("setup repo: %w", err) + } + + if err := ex.setupFiles(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateFailed, + types.TerminationReasonExecutorError, + fmt.Sprintf("Failed to set up files (%s)", err), + ) + return fmt.Errorf("setup files: %w", err) + } + + connectionTrackerTicker := time.NewTicker(2500 * time.Millisecond) + go ex.connectionTracker.Track(connectionTrackerTicker.C) + defer ex.connectionTracker.Stop() + + ex.SetJobState(ctx, schemas.JobStateRunning) + timeoutCtx := ctx + var cancelTimeout context.CancelFunc + if ex.jobSpec.MaxDuration != 0 { + timeoutCtx, cancelTimeout = context.WithTimeout(ctx, time.Duration(ex.jobSpec.MaxDuration)*time.Second) + defer cancelTimeout() + } + if err := ex.execJob(timeoutCtx, jobLogFile); err != nil { + select { + case <-ctx.Done(): + log.Error(ctx, "Job 
canceled") + ex.SetJobState(ctx, schemas.JobStateTerminated) + return fmt.Errorf("job canceled: %w", err) + default: + } + + select { + case <-timeoutCtx.Done(): + log.Error(ctx, "Max duration exceeded", "max_duration", ex.jobSpec.MaxDuration) + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateTerminated, + types.TerminationReasonMaxDurationExceeded, + "Max duration exceeded", + ) + return fmt.Errorf("max duration exceeded: %w", err) + default: + } + + if errors.Is(err, ErrLogQuotaExceeded) { + log.Error(ctx, "Log quota exceeded", "quota", ex.jobLogs.quota) + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateFailed, + types.TerminationReasonLogQuotaExceeded, + fmt.Sprintf("Job log output exceeded the hourly quota of %d bytes", ex.jobLogs.quota), + ) + return fmt.Errorf("log quota exceeded: %w", err) + } + + // todo fail reason? + log.Error(ctx, "Exec failed", "err", err) + var exitError *exec.ExitError + if errors.As(err, &exitError) { + ex.SetJobStateWithExitStatus(ctx, schemas.JobStateFailed, exitError.ExitCode()) + } else { + ex.SetJobState(ctx, schemas.JobStateFailed) + } + return fmt.Errorf("exec job failed: %w", err) + } + + ex.SetJobStateWithExitStatus(ctx, schemas.JobStateDone, 0) + return nil +} + +func (ex *RunExecutor) SetJob(body schemas.SubmitBody) { + ex.run = body.Run + ex.jobSubmission = body.JobSubmission + ex.jobSpec = body.JobSpec + ex.clusterInfo = body.ClusterInfo + ex.secrets = body.Secrets + ex.repoCredentials = body.RepoCredentials + ex.jobLogs.SetQuota(body.LogQuotaHour) +} + +func (ex *RunExecutor) SetJobState(ctx context.Context, state schemas.JobState) { + ex.SetJobStateWithTerminationReason(ctx, state, "", "") +} + +func (ex *RunExecutor) SetJobStateWithTerminationReason( + ctx context.Context, state schemas.JobState, terminationReason types.TerminationReason, terminationMessage string, +) { + ex.mu.Lock() + ex.jobStateHistory = append( + ex.jobStateHistory, + schemas.JobStateEvent{ + State: state, + Timestamp: 
ex.timestamp.Next(), + TerminationReason: terminationReason, + TerminationMessage: terminationMessage, + }, + ) + ex.mu.Unlock() + if terminationReason != "" { + ctx = log.AppendArgsCtx(ctx, "termination_reason", terminationReason, "termination_message", terminationMessage) + } + log.Info(ctx, "Job state changed", "new", state) +} + +func (ex *RunExecutor) SetJobStateWithExitStatus( + ctx context.Context, state schemas.JobState, exitStatus int, +) { + ex.mu.Lock() + ex.jobStateHistory = append( + ex.jobStateHistory, + schemas.JobStateEvent{ + State: state, + Timestamp: ex.timestamp.Next(), + ExitStatus: &exitStatus, + }, + ) + ex.mu.Unlock() + log.Info(ctx, "Job state changed", "new", state) +} + +func (ex *RunExecutor) SetRunnerState(state string) { + ex.state = state +} + +// preRun performs actions that were once part of Run() but were moved to a separate function +// to implement GetJobInfo() +// preRun must not execute long-running operations, as GetJobInfo() is called synchronously +// in the /api/run method +func (ex *RunExecutor) preRun(ctx context.Context) error { + // Already called once + if ex.runnerLogFile != nil { + return nil + } + + // logging is required for the subsequent setJob{User,WorkingDir} calls + runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) + if err != nil { + ex.SetJobState(ctx, schemas.JobStateFailed) + return fmt.Errorf("create runner log file: %w", err) + } + ex.runnerLogFile = runnerLogFile + ex.runnerLogStripper = ansistrip.NewWriter(ex.runnerLogs, AnsiStripFlushInterval, AnsiStripMaxDelay, MaxBufferSize) + runnerLogWriter := io.MultiWriter(ex.runnerLogFile, os.Stdout, ex.runnerLogStripper) + runnerLogLevel := log.DefaultEntry.Logger.Level + ex.runnerLogger = log.NewEntry(runnerLogWriter, int(runnerLogLevel)) + ctx = log.WithLogger(ctx, ex.runnerLogger) + log.Info(ctx, "Logging configured", "log_level", runnerLogLevel.String()) + + // jobUser and jobWorkingDir are required for 
GetJobInfo() + if err := ex.setJobUser(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateFailed, + types.TerminationReasonExecutorError, + fmt.Sprintf("Failed to set job user (%s)", err), + ) + return fmt.Errorf("set job user: %w", err) + } + if err := ex.setJobWorkingDir(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + schemas.JobStateFailed, + types.TerminationReasonExecutorError, + fmt.Sprintf("Failed to set job working dir (%s)", err), + ) + return fmt.Errorf("set job working dir: %w", err) + } + + return nil +} + +func (ex *RunExecutor) postRun(ctx context.Context) { + if ex.runnerLogFile != nil { + if err := ex.runnerLogFile.Close(); err != nil { + log.Error(ctx, "Failed to close runnerLogFile", "err", err) + } + } + if ex.runnerLogStripper != nil { + if err := ex.runnerLogStripper.Close(); err != nil { + log.Error(ctx, "Failed to close runnerLogStripper", "err", err) + } + } +} + +// setJobWorkingDir must be called from Run after setJobUser +func (ex *RunExecutor) setJobWorkingDir(ctx context.Context) error { + var err error + if ex.jobSpec.WorkingDir == nil { + ex.jobWorkingDir, err = os.Getwd() + if err != nil { + return fmt.Errorf("get working directory: %w", err) + } + } else { + ex.jobWorkingDir, err = utils.ExpandPath(*ex.jobSpec.WorkingDir, "", ex.jobUser.HomeDir) + if err != nil { + return fmt.Errorf("expand working dir path: %w", err) + } + if !path.IsAbs(ex.jobWorkingDir) { + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) + } + } + log.Trace(ctx, "Job working dir", "path", ex.jobWorkingDir) + return nil +} + +// setupClusterSsh must be called from Run after setJobUser +func (ex *RunExecutor) setupClusterSsh(ctx context.Context) error { + if ex.jobSpec.SSHKey == nil || len(ex.clusterInfo.JobIPs) < 2 { + return nil + } + + sshDir, err := prepareUserSshDir(ex.jobUser) + if err != nil { + return fmt.Errorf("prepare user ssh dir: %w", err) + } + + privatePath := 
filepath.Join(sshDir, "dstack_job") + privateFile, err := os.OpenFile(privatePath, os.O_TRUNC|os.O_WRONLY|os.O_CREATE, 0o600) + if err != nil { + return fmt.Errorf("open private key file: %w", err) + } + defer privateFile.Close() + if err := os.Chown(privatePath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return fmt.Errorf("chown private key: %w", err) + } + if _, err := privateFile.WriteString(ex.jobSpec.SSHKey.Private); err != nil { + return fmt.Errorf("write private key: %w", err) + } + + // TODO: move job hosts config to ~/.dstack/ssh/config.d/current_job.conf + // and add "Include ~/.dstack/ssh/config.d/*.conf" directive to ~/.ssh/config if not present + // instead of appending job hosts config directly (don't bloat user's ssh_config) + configPath := filepath.Join(sshDir, "config") + configFile, err := os.OpenFile(configPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o600) + if err != nil { + return fmt.Errorf("open SSH config: %w", err) + } + defer configFile.Close() + if err := os.Chown(configPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return fmt.Errorf("chown SSH config: %w", err) + } + configBuffer := new(bytes.Buffer) + for _, ip := range ex.clusterInfo.JobIPs { + fmt.Fprintf(configBuffer, "\nHost %s\n", ip) + fmt.Fprintf(configBuffer, " Port %d\n", ex.sshd.Port()) + configBuffer.WriteString(" StrictHostKeyChecking no\n") + configBuffer.WriteString(" UserKnownHostsFile /dev/null\n") + fmt.Fprintf(configBuffer, " IdentityFile %s\n", privatePath) + } + if _, err := configFile.Write(configBuffer.Bytes()); err != nil { + return fmt.Errorf("write SSH config: %w", err) + } + + if err := ex.sshd.AddAuthorizedKeys(ctx, ex.jobSpec.SSHKey.Public); err != nil { + return fmt.Errorf("add authorized key: %w", err) + } + + return nil +} + +func (ex *RunExecutor) getRepoData() schemas.RepoData { + if ex.jobSpec.RepoData == nil { + // jobs submitted before 0.19.17 do not have jobSpec.RepoData + return ex.run.RunSpec.RepoData + } + return *ex.jobSpec.RepoData +} 
+ +func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error { + nodeRank := ex.jobSpec.JobNum + nodesNum := ex.jobSpec.JobsPerReplica + gpusPerNodeNum := ex.clusterInfo.GPUSPerJob + gpusNum := nodesNum * gpusPerNodeNum + + mpiHostfilePath := filepath.Join(ex.dstackDir, "mpi/hostfile") + + jobEnvs := map[string]string{ + "DSTACK_RUN_ID": ex.run.Id, + "DSTACK_JOB_ID": ex.jobSubmission.Id, + "DSTACK_RUN_NAME": ex.run.RunSpec.RunName, + "DSTACK_REPO_ID": ex.run.RunSpec.RepoId, + "DSTACK_REPO_DIR": ex.repoDir, + "DSTACK_WORKING_DIR": ex.jobWorkingDir, + "DSTACK_NODES_IPS": strings.Join(ex.clusterInfo.JobIPs, "\n"), + "DSTACK_MASTER_NODE_IP": ex.clusterInfo.MasterJobIP, + "DSTACK_NODE_RANK": strconv.Itoa(nodeRank), + "DSTACK_NODES_NUM": strconv.Itoa(nodesNum), + "DSTACK_GPUS_PER_NODE": strconv.Itoa(gpusPerNodeNum), + "DSTACK_GPUS_NUM": strconv.Itoa(gpusNum), + "DSTACK_MPI_HOSTFILE": mpiHostfilePath, + } + + cmd := exec.CommandContext(ctx, ex.jobSpec.Commands[0], ex.jobSpec.Commands[1:]...) 
+ cmd.Cancel = func() error { + // returns error on Windows + if signalErr := cmd.Process.Signal(os.Interrupt); signalErr != nil { + return fmt.Errorf("send interrupt signal: %w", signalErr) + } + return nil + } + cmd.WaitDelay = ex.killDelay // kills the process if it doesn't exit in time + + if err := utils.MkdirAll(ctx, ex.jobWorkingDir, ex.jobUser.Uid, ex.jobUser.Gid, 0o755); err != nil { + return fmt.Errorf("create working directory: %w", err) + } + cmd.Dir = ex.jobWorkingDir + + // CAP_SET{UID,GID} for startCommand() -> Cmd.Start() -> set{uid,gid,groups} syscalls during fork-exec + // CAP_CHOWN for startCommand() -> os.Chown(pts.Name()) + if missing, err := cap.Check(cap.SETUID, cap.SETGID, cap.CHOWN); err != nil { + log.Error( + ctx, "Failed to check capabilities, won't try to set process credentials", + "err", err, "user", ex.currentUser, + ) + } else if len(missing) > 0 { + log.Info( + ctx, "Required capabilities are missing, cannot set process credentials", + "missing", missing, "user", ex.currentUser, + ) + } else { + log.Trace(ctx, "Using credentials", "user", ex.jobUser) + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + creds, err := ex.jobUser.ProcessCredentials() + if err != nil { + return fmt.Errorf("prepare process credentials: %w", err) + } + cmd.SysProcAttr.Credential = creds + } + + envMap := NewEnvMap(ParseEnvList(os.Environ()), jobEnvs, ex.secrets) + // `env` interpolation feature is postponed to some future release + envMap.Update(ex.jobSpec.Env, false) + + const profilePath = "/etc/profile" + dstackProfilePath := path.Join(ex.dstackDir, "profile") + if err := writeDstackProfile(envMap, dstackProfilePath); err != nil { + log.Warning(ctx, "failed to write dstack_profile", "path", dstackProfilePath, "err", err) + } else if err := includeDstackProfile(profilePath, dstackProfilePath); err != nil { + log.Warning(ctx, "failed to include dstack_profile", "path", profilePath, "err", err) + } + + if err := 
writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpusPerNodeNum, mpiHostfilePath); err != nil { + return fmt.Errorf("write MPI hostfile: %w", err) + } + + // Configure process resource limits + // TODO: Make rlimits customizable in the run configuration. Currently, we only set max locked memory + // to unlimited to fix the issue with InfiniBand/RDMA: "Cannot allocate memory". + // See: https://github.com/ofiwg/libfabric/issues/6437 + // See: https://github.com/openucx/ucx/issues/8229 + // Note: we already set RLIMIT_MEMLOCK to unlimited in the shim if we've detected IB devices + // (see configureHpcNetworkingIfAvailable() function), but, as it's on the shim side, it only works + // with VM-based backends. + if ok, err := cap.Has(cap.SYS_RESOURCE); err != nil { + log.Error(ctx, "Failed to check capabilities, won't try to set resource limits", "err", err) + } else if !ok { + log.Info(ctx, "Required capability is missing, cannot set resource limits", "missing", cap.SYS_RESOURCE) + } else { + rlimitMemlock := unix.Rlimit{Cur: unix.RLIM_INFINITY, Max: unix.RLIM_INFINITY} + if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlimitMemlock); err != nil { + log.Error(ctx, "Failed to set resource limits", "err", err) + } + } + + // HOME must be added after writeDstackProfile to avoid overriding the correct per-user value set by sshd + envMap["HOME"] = ex.jobUser.HomeDir + cmd.Env = envMap.Render() + + log.Trace(ctx, "Starting exec", "cmd", cmd.String(), "working_dir", cmd.Dir, "env", cmd.Env) + + ptm, err := startCommand(cmd) + if err != nil { + return fmt.Errorf("start command: %w", err) + } + defer func() { _ = ptm.Close() }() + defer func() { _ = cmd.Wait() }() // release resources if copy fails + + stripper := ansistrip.NewWriter(ex.jobLogs, AnsiStripFlushInterval, AnsiStripMaxDelay, MaxBufferSize) + logger := io.MultiWriter(jobLogFile, ex.jobWsLogs, stripper) + + if err := ex.copyOutputWithQuota(cmd, ptm, stripper, logger); 
err != nil { + return err + } + if err = cmd.Wait(); err != nil { + return fmt.Errorf("wait for command: %w", err) + } + return nil +} + +// copyOutputWithQuota streams process output through the log pipeline and +// monitors for log quota exceeded. The quota signal is out-of-band (via channel) +// because the ansistrip writer is async and swallows downstream write errors. +func (ex *RunExecutor) copyOutputWithQuota(cmd *exec.Cmd, ptm io.Reader, stripper io.Closer, logger io.Writer) error { + copyDone := make(chan error, 1) + go func() { + _, err := io.Copy(logger, ptm) + copyDone <- err + }() + + // Wait for either io.Copy to finish or quota to be exceeded. + var copyErr error + select { + case copyErr = <-copyDone: + case <-ex.jobLogs.QuotaExceeded(): + _ = cmd.Process.Kill() + <-copyDone + } + + // Flush the ansistrip buffer — may also trigger quota exceeded. + _ = stripper.Close() + + select { + case <-ex.jobLogs.QuotaExceeded(): + return ErrLogQuotaExceeded + default: + } + + if copyErr != nil && !isPtyError(copyErr) { + return fmt.Errorf("copy command output: %w", copyErr) + } + return nil +} + +// setupGitCredentials must be called from Run after setJobUser +func (ex *RunExecutor) setupGitCredentials(ctx context.Context) (func(), error) { + if ex.repoCredentials == nil { + return func() {}, nil + } + + switch ex.repoCredentials.GetProtocol() { + case "ssh": + if ex.repoCredentials.PrivateKey == nil { + return nil, fmt.Errorf("private key is missing") + } + sshDir, err := prepareUserSshDir(ex.jobUser) + if err != nil { + return nil, fmt.Errorf("prepare user ssh dir: %w", err) + } + keyPath := filepath.Join(sshDir, "id_rsa") + if _, err := os.Stat(keyPath); err == nil { + return nil, fmt.Errorf("private key already exists") + } + log.Info(ctx, "Writing private key", "path", keyPath) + if err := os.WriteFile(keyPath, []byte(*ex.repoCredentials.PrivateKey), 0o600); err != nil { + return nil, fmt.Errorf("write private key: %w", err) + } + if err := 
os.Chown(keyPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return nil, fmt.Errorf("chown private key: %w", err) + } + return func() { + log.Info(ctx, "Removing private key", "path", keyPath) + _ = os.Remove(keyPath) + }, nil + case "https": + if ex.repoCredentials.OAuthToken == nil { + return func() {}, nil + } + hostsPath := filepath.Join(ex.jobUser.HomeDir, ".config/gh/hosts.yml") + if _, err := os.Stat(hostsPath); err == nil { + return nil, fmt.Errorf("hosts.yml file already exists") + } + if err := utils.MkdirAll(ctx, filepath.Dir(hostsPath), ex.jobUser.Uid, ex.jobUser.Gid, 0o700); err != nil { + return nil, fmt.Errorf("create gh config directory: %w", err) + } + log.Info(ctx, "Writing OAuth token", "path", hostsPath) + cloneURL, err := url.Parse(ex.repoCredentials.CloneURL) + if err != nil { + return nil, fmt.Errorf("parse clone URL: %w", err) + } + ghHost := fmt.Sprintf("%s:\n oauth_token: \"%s\"\n", cloneURL.Hostname(), *ex.repoCredentials.OAuthToken) + if err := os.WriteFile(hostsPath, []byte(ghHost), 0o600); err != nil { + return nil, fmt.Errorf("write OAuth token: %w", err) + } + if err := os.Chown(hostsPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return nil, fmt.Errorf("chown OAuth token: %w", err) + } + return func() { + log.Info(ctx, "Removing OAuth token", "path", hostsPath) + _ = os.Remove(hostsPath) + }, nil + } + return nil, fmt.Errorf("unknown protocol %s", ex.repoCredentials.GetProtocol()) +} + +func isPtyError(err error) bool { + /* read /dev/ptmx: input/output error */ + var e *os.PathError + return errors.As(err, &e) && errors.Is(e.Err, syscall.EIO) +} + +// A simplified copypasta of creack/pty Start->StartWithSize->StartWithAttrs +// with two additions: +// * controlling terminal is properly set (cmd.Extrafiles, Cmd.SysProcAttr.Ctty) +// * owner of slave pty is changed to the child process uid +func startCommand(cmd *exec.Cmd) (*os.File, error) { + ptm, pts, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("open 
pty: %w", err) + } + defer func() { _ = pts.Close() }() + + cmd.Stdout = pts + cmd.Stderr = pts + cmd.Stdin = pts + cmd.ExtraFiles = []*os.File{pts} + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + // see https://github.com/creack/pty/issues/96#issuecomment-624372400 + cmd.SysProcAttr.Ctty = 3 // cmd.ExtraFiles[0] + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Setsid = true + + if cmd.SysProcAttr.Credential != nil { + // Initially, /dev/pts/N is owned by the user who open()'ed /dev/ptmx (runner_uid) + // If the runner started by root, we can chown to any user + // If the runner started by non-root, we can chown only to the same user (noop) + // In the latter case, the situation when runner_uid != 0 and + // runner_uid != job_uid should be already handled outside this function + uid := cmd.SysProcAttr.Credential.Uid + if err := os.Chown(pts.Name(), int(uid), -1); err != nil { + _ = ptm.Close() + return nil, fmt.Errorf("chown pty slave: %w", err) + } + } + + if err := cmd.Start(); err != nil { + _ = ptm.Close() + return nil, fmt.Errorf("start command: %w", err) + } + return ptm, nil +} + +func prepareUserSshDir(user *linuxuser.User) (string, error) { + sshDir := filepath.Join(user.HomeDir, ".ssh") + info, err := os.Stat(sshDir) + if err == nil { + if !info.IsDir() { + return "", fmt.Errorf("not a directory: %s", sshDir) + } + if err := os.Chmod(sshDir, 0o700); err != nil { + return "", fmt.Errorf("chmod ssh dir: %w", err) + } + } else if errors.Is(err, os.ErrNotExist) { + if err := os.MkdirAll(sshDir, 0o700); err != nil { + return "", fmt.Errorf("create ssh dir: %w", err) + } + } else { + return "", err + } + if err := os.Chown(sshDir, user.Uid, user.Gid); err != nil { + return "", fmt.Errorf("chown ssh dir: %w", err) + } + return sshDir, nil +} + +func writeMpiHostfile(ctx context.Context, ips []string, gpusPerNode int, path string) error { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + 
return fmt.Errorf("create MPI hostfile directory: %w", err) + } + file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open MPI hostfile: %w", err) + } + defer file.Close() + nonEmptyIps := []string{} + for _, ip := range ips { + if ip != "" { + nonEmptyIps = append(nonEmptyIps, ip) + } + } + if len(nonEmptyIps) == len(ips) { + var template string + if gpusPerNode == 0 { + // CPU node: the number of slots defaults to the number of processor cores on that host + // See: https://docs.open-mpi.org/en/main/launching-apps/scheduling.html#calculating-the-number-of-slots + template = "%s\n" + } else { + template = fmt.Sprintf("%%s slots=%d\n", gpusPerNode) + } + for _, ip := range nonEmptyIps { + if _, err = fmt.Fprintf(file, template, ip); err != nil { + return fmt.Errorf("write MPI hostfile line: %w", err) + } + } + } else { + log.Info(ctx, "creating empty MPI hostfile: no internal IPs assigned") + } + return nil +} + +func writeDstackProfile(env map[string]string, pth string) error { + if err := os.MkdirAll(path.Dir(pth), 0o755); err != nil { + return fmt.Errorf("create dstack profile directory: %w", err) + } + file, err := os.OpenFile(pth, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open dstack profile: %w", err) + } + defer file.Close() + for key, value := range env { + switch key { + case "HOSTNAME", "USER", "HOME", "SHELL", "SHLVL", "PWD", "_": + continue + } + line := fmt.Sprintf("export %s='%s'\n", key, strings.ReplaceAll(value, `'`, `'"'"'`)) + if _, err = file.WriteString(line); err != nil { + return fmt.Errorf("write dstack profile: %w", err) + } + } + if _, err = file.WriteString("cd \"$DSTACK_WORKING_DIR\"\n"); err != nil { + return fmt.Errorf("write dstack profile: %w", err) + } + if err = os.Chmod(pth, 0o644); err != nil { + return fmt.Errorf("chmod dstack profile: %w", err) + } + return nil +} + +func 
includeDstackProfile(profilePath string, dstackProfilePath string) error { + file, err := os.OpenFile(profilePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open profile file: %w", err) + } + defer file.Close() + if _, err = fmt.Fprintf(file, "\n. '%s'\n", dstackProfilePath); err != nil { + return fmt.Errorf("write profile include: %w", err) + } + if err = os.Chmod(profilePath, 0o644); err != nil { + return fmt.Errorf("chmod profile file: %w", err) + } + return nil +} diff --git a/runner/internal/runner/executor/executor_test.go b/runner/internal/runner/executor/executor_test.go new file mode 100644 index 0000000000..2330cd6f3c --- /dev/null +++ b/runner/internal/runner/executor/executor_test.go @@ -0,0 +1,401 @@ +package executor + +import ( + "archive/tar" + "bytes" + "context" + "fmt" + "io" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "testing" + "time" + + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestExecutor_WorkingDir_Set(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + baseDir, err := filepath.EvalSymlinks(t.TempDir()) + require.NoError(t, err) + workingDir := path.Join(baseDir, "path/to/wd") + + ex.jobSpec.WorkingDir = &workingDir + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "pwd") + err = ex.setJobWorkingDir(t.Context()) + require.NoError(t, err) + require.Equal(t, workingDir, ex.jobWorkingDir) + err = os.MkdirAll(workingDir, 0o755) + require.NoError(t, err) + + err = ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + // Normalize line endings for cross-platform compatibility. 
+ assert.Equal(t, workingDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) +} + +func TestExecutor_WorkingDir_NotSet(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + cwd, err := os.Getwd() + require.NoError(t, err) + ex.jobSpec.WorkingDir = nil + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "pwd") + err = ex.setJobWorkingDir(t.Context()) + require.NoError(t, err) + require.Equal(t, cwd, ex.jobWorkingDir) + + err = ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + assert.Equal(t, cwd+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) +} + +func TestExecutor_HomeDir(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo ~") + + err := ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + assert.Equal(t, ex.currentUser.HomeDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) +} + +func TestExecutor_NonZeroExit(t *testing.T) { + ex := makeTestExecutor(t) + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "exit 100") + makeCodeTar(t, ex) + + err := ex.Run(t.Context()) + assert.Error(t, err) + require.NotEmpty(t, ex.jobStateHistory) + exitStatus := ex.jobStateHistory[len(ex.jobStateHistory)-1].ExitStatus + require.NotNil(t, exitStatus) + assert.Equal(t, 100, *exitStatus) +} + +func TestExecutor_SSHCredentials(t *testing.T) { + key := "== ssh private key ==" + + var b bytes.Buffer + ex := makeTestExecutor(t) + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "cat ~/.ssh/id_rsa") + ex.repoCredentials = &schemas.RepoCredentials{ + CloneURL: "ssh://git@github.com/dstackai/dstack-examples.git", + PrivateKey: &key, + } + + clean, err := ex.setupGitCredentials(t.Context()) + require.NoError(t, err) + defer clean() + + err = ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + assert.Equal(t, key, b.String()) +} + +func TestExecutor_LocalRepo(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + cmd := fmt.Sprintf("cat 
%s/foo", *ex.jobSpec.RepoDir) + ex.jobSpec.Commands = append(ex.jobSpec.Commands, cmd) + makeCodeTar(t, ex) + + err := ex.setupRepo(t.Context()) + require.NoError(t, err) + + err = ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + assert.Equal(t, "bar\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) +} + +func TestExecutor_Recover(t *testing.T) { + ex := makeTestExecutor(t) + ex.jobSpec.Commands = nil // cause a panic + makeCodeTar(t, ex) + + err := ex.Run(t.Context()) + assert.ErrorContains(t, err, "recovered: ") +} + +/* Long tests */ + +func TestExecutor_MaxDuration(t *testing.T) { + if testing.Short() { + t.Skip() + } + + ex := makeTestExecutor(t) + ex.killDelay = 500 * time.Millisecond + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo 1 && sleep 2 && echo 2") + ex.jobSpec.MaxDuration = 1 // seconds + makeCodeTar(t, ex) + + err := ex.Run(t.Context()) + assert.ErrorContains(t, err, "killed") +} + +func TestExecutor_LogQuota(t *testing.T) { + if testing.Short() { + t.Skip() + } + + ex := makeTestExecutor(t) + ex.killDelay = 500 * time.Millisecond + // Output >100 bytes to trigger the quota + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "for i in $(seq 1 20); do echo 'This line is long enough to exceed the quota easily'; done") + ex.jobLogs.SetQuota(100) + makeCodeTar(t, ex) + + err := ex.Run(t.Context()) + assert.ErrorContains(t, err, "log quota exceeded") + + // Verify the termination state was set + history := ex.GetHistory(0) + lastState := history.JobStates[len(history.JobStates)-1] + assert.Equal(t, schemas.JobStateFailed, lastState.State) +} + +func TestExecutor_RemoteRepo(t *testing.T) { + if testing.Short() { + t.Skip() + } + + var b bytes.Buffer + ex := makeTestExecutor(t) + ex.jobSpec.RepoData = &schemas.RepoData{ + RepoType: "remote", + RepoBranch: "main", + RepoHash: "2b83592e506ed6fe8e49f4eaa97c3866bc9402b1", + RepoConfigName: "Dstack Developer", + RepoConfigEmail: "developer@dstack.ai", + } + ex.jobSpec.Commands = 
append(ex.jobSpec.Commands, "git rev-parse HEAD && git config user.name && git config user.email") + err := ex.WriteRepoBlob(bytes.NewReader([]byte{})) // empty diff + require.NoError(t, err) + + err = ex.setJobWorkingDir(t.Context()) + require.NoError(t, err) + err = ex.setupRepo(t.Context()) + require.NoError(t, err) + + err = ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + expected := fmt.Sprintf("%s\n%s\n%s\n", ex.getRepoData().RepoHash, ex.getRepoData().RepoConfigName, ex.getRepoData().RepoConfigEmail) + assert.Equal(t, expected, strings.ReplaceAll(b.String(), "\r\n", "\n")) +} + +/* Helpers */ + +func makeTestExecutor(t *testing.T) *RunExecutor { + t.Helper() + baseDir, err := filepath.EvalSymlinks(t.TempDir()) + require.NoError(t, err) + + repo := filepath.Join(baseDir, "repo") + body := schemas.SubmitBody{ + Run: schemas.Run{ + Id: "12346", + RunSpec: schemas.RunSpec{ + RunName: "red-turtle-1", + RepoId: "test-000000", + RepoData: schemas.RepoData{RepoType: "local"}, + Configuration: schemas.Configuration{ + Type: "task", + }, + ConfigurationPath: ".dstack.yml", + }, + }, + JobSpec: schemas.JobSpec{ + Commands: []string{"/bin/bash", "-c"}, + Env: make(map[string]string), + MaxDuration: 0, // no timeout + WorkingDir: &repo, + RepoDir: &repo, + RepoData: &schemas.RepoData{RepoType: "local"}, + }, + Secrets: make(map[string]string), + RepoCredentials: &schemas.RepoCredentials{ + CloneURL: "https://github.com/dstackai/dstack-examples.git", + }, + } + + tempDir := filepath.Join(baseDir, "temp") + require.NoError(t, os.Mkdir(tempDir, 0o700)) + + dstackDir := filepath.Join(baseDir, "dstack") + require.NoError(t, os.Mkdir(dstackDir, 0o755)) + + currentUser, err := linuxuser.FromCurrentProcess() + require.NoError(t, err) + homeDir := filepath.Join(baseDir, "home") + require.NoError(t, os.Mkdir(homeDir, 0o700)) + currentUser.HomeDir = homeDir + + ex, err := NewRunExecutor(tempDir, dstackDir, *currentUser, new(sshdMock)) + 
require.NoError(t, err) + + ex.SetJob(body) + require.NoError(t, ex.setJobUser(t.Context())) + require.NoError(t, ex.setJobWorkingDir(t.Context())) + + return ex +} + +func makeCodeTar(t *testing.T, ex *RunExecutor) { + t.Helper() + var b bytes.Buffer + tw := tar.NewWriter(&b) + + files := []struct{ name, body string }{ + {"foo", "bar\n"}, + } + + for _, f := range files { + hdr := &tar.Header{Name: f.name, Mode: 0o600, Size: int64(len(f.body))} + require.NoError(t, tw.WriteHeader(hdr)) + _, err := tw.Write([]byte(f.body)) + require.NoError(t, err) + } + require.NoError(t, tw.Close()) + + require.NoError(t, ex.WriteRepoBlob(&b)) +} + +func TestWriteDstackProfile(t *testing.T) { + testCases := []string{ + "", + "string 'with 'single' quotes", + "multi\nline\tstring", + } + tmp := t.TempDir() + path := tmp + "/dstack_profile" + script := fmt.Sprintf(`. '%s'; printf '%%s' "$VAR"`, path) + for _, value := range testCases { + env := map[string]string{"VAR": value} + writeDstackProfile(env, path) + cmd := exec.CommandContext(t.Context(), "/bin/sh", "-c", script) + out, err := cmd.Output() + assert.NoError(t, err) + assert.Equal(t, value, string(out)) + } +} + +func TestExecutor_Logs(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + // Use printf to generate ANSI control codes. 
+ // \033[31m = red text, \033[1;32m = bold green text, \033[0m = reset + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "printf '\\033[31mRed Hello World\\033[0m\\n' && printf '\\033[1;32mBold Green Line 2\\033[0m\\n' && printf 'Line 3\\n'") + + err := ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + + logHistory := ex.GetHistory(0).JobLogs + assert.NotEmpty(t, logHistory) + + logString := combineLogMessages(logHistory) + normalizedLogString := strings.ReplaceAll(logString, "\r\n", "\n") + + expectedOutput := "Red Hello World\nBold Green Line 2\nLine 3\n" + assert.Equal(t, expectedOutput, normalizedLogString, "Should strip ANSI codes from regular logs") + + // Verify timestamps are in order + assert.Greater(t, len(logHistory), 0) + for i := 1; i < len(logHistory); i++ { + assert.GreaterOrEqual(t, logHistory[i].Timestamp, logHistory[i-1].Timestamp) + } +} + +func TestExecutor_LogsWithErrors(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo 'Success message' && echo 'Error message' >&2 && exit 1") + + err := ex.execJob(t.Context(), io.Writer(&b)) + assert.Error(t, err) + + logHistory := ex.GetHistory(0).JobLogs + assert.NotEmpty(t, logHistory) + + logString := combineLogMessages(logHistory) + normalizedLogString := strings.ReplaceAll(logString, "\r\n", "\n") + + expectedOutput := "Success message\nError message\n" + assert.Equal(t, expectedOutput, normalizedLogString) +} + +func TestExecutor_LogsAnsiCodeHandling(t *testing.T) { + var b bytes.Buffer + ex := makeTestExecutor(t) + + // Test a variety of ANSI escape sequences on stdout and stderr. + cmd := "printf '\\033[31mRed\\033[0m \\033[32mGreen\\033[0m\\n' && " + + "printf '\\033[1mBold\\033[0m \\033[4mUnderline\\033[0m\\n' && " + + "printf '\\033[s\\033[uPlain text\\n' >&2" + + ex.jobSpec.Commands = append(ex.jobSpec.Commands, cmd) + + err := ex.execJob(t.Context(), io.Writer(&b)) + assert.NoError(t, err) + + // 1. 
Check WebSocket logs, which should preserve ANSI codes. + wsLogHistory := ex.GetJobWsLogsHistory() + assert.NotEmpty(t, wsLogHistory) + wsLogString := combineLogMessages(wsLogHistory) + normalizedWsLogString := strings.ReplaceAll(wsLogString, "\r\n", "\n") + + expectedWsOutput := "\033[31mRed\033[0m \033[32mGreen\033[0m\n" + + "\033[1mBold\033[0m \033[4mUnderline\033[0m\n" + + "\033[s\033[uPlain text\n" + assert.Equal(t, expectedWsOutput, normalizedWsLogString, "Websocket logs should preserve ANSI codes") + + // 2. Check regular job logs, which should have ANSI codes stripped. + regularLogHistory := ex.GetHistory(0).JobLogs + assert.NotEmpty(t, regularLogHistory) + regularLogString := combineLogMessages(regularLogHistory) + normalizedRegularLogString := strings.ReplaceAll(regularLogString, "\r\n", "\n") + + expectedRegularOutput := "Red Green\n" + + "Bold Underline\n" + + "Plain text\n" + assert.Equal(t, expectedRegularOutput, normalizedRegularLogString, "Regular logs should have ANSI codes stripped") + + // Verify timestamps are ordered for both log types. 
+ assert.Greater(t, len(wsLogHistory), 0) + for i := 1; i < len(wsLogHistory); i++ { + assert.GreaterOrEqual(t, wsLogHistory[i].Timestamp, wsLogHistory[i-1].Timestamp) + } +} + +type sshdMock struct{} + +func (d *sshdMock) Port() int { + return 0 +} + +func (d *sshdMock) Start(context.Context) error { + return nil +} + +func (d *sshdMock) Stop(context.Context) error { + return nil +} + +func (d *sshdMock) AddAuthorizedKeys(context.Context, ...string) error { + return nil +} + +func combineLogMessages(logHistory []schemas.LogEvent) string { + var logOutput bytes.Buffer + for _, logEvent := range logHistory { + logOutput.Write(logEvent.Message) + } + return logOutput.String() +} diff --git a/runner/internal/runner/executor/files.go b/runner/internal/runner/executor/files.go new file mode 100644 index 0000000000..f61b9e2429 --- /dev/null +++ b/runner/internal/runner/executor/files.go @@ -0,0 +1,101 @@ +package executor + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + "regexp" + + "github.com/codeclysm/extract/v4" + + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" +) + +var renameRegex = regexp.MustCompile(`^([^/]*)(/|$)`) + +func (ex *RunExecutor) WriteFileArchive(id string, src io.Reader) error { + if err := os.MkdirAll(ex.fileArchiveDir, 0o755); err != nil { + return fmt.Errorf("create archive directory: %w", err) + } + archivePath := path.Join(ex.fileArchiveDir, id) + archive, err := os.Create(archivePath) + if err != nil { + return fmt.Errorf("create archive file: %w", err) + } + defer func() { _ = archive.Close() }() + if _, err = io.Copy(archive, src); err != nil { + return fmt.Errorf("copy archive data: %w", err) + } + return nil +} + +// setupFiles must be called from Run after setJobUser and setJobWorkingDir +func (ex *RunExecutor) setupFiles(ctx context.Context) error { + log.Trace(ctx, "Setting up files") + if ex.jobWorkingDir == "" { + return 
errors.New("working dir is not set") + } + if !filepath.IsAbs(ex.jobWorkingDir) { + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) + } + for _, fa := range ex.jobSpec.FileArchives { + archivePath := path.Join(ex.fileArchiveDir, fa.Id) + err := extractFileArchive( + ctx, archivePath, fa.Path, ex.jobWorkingDir, ex.jobUser.HomeDir, + ex.jobUser.Uid, ex.jobUser.Gid, + ) + if err != nil { + return fmt.Errorf("extract file archive %s: %w", fa.Id, err) + } + } + if err := os.RemoveAll(ex.fileArchiveDir); err != nil { + log.Warning(ctx, "Failed to remove file archives dir", "path", ex.fileArchiveDir, "err", err) + } + return nil +} + +func extractFileArchive(ctx context.Context, archivePath string, destPath string, baseDir string, homeDir string, uid int, gid int) error { + log.Trace(ctx, "Extracting file archive", "archive", archivePath, "dest", destPath, "base", baseDir, "home", homeDir) + + destPath, err := utils.ExpandPath(destPath, baseDir, homeDir) + if err != nil { + return fmt.Errorf("expand destination path: %w", err) + } + destBase, destName := path.Split(destPath) + if err := utils.MkdirAll(ctx, destBase, uid, gid, 0o755); err != nil { + return fmt.Errorf("create destination directory: %w", err) + } + if err := os.RemoveAll(destPath); err != nil { + log.Warning(ctx, "Failed to remove", "path", destPath, "err", err) + } + + archive, err := os.Open(archivePath) + if err != nil { + return fmt.Errorf("open archive file: %w", err) + } + defer archive.Close() + + var paths []string + repl := fmt.Sprintf("%s$2", destName) + renameAndRemember := func(s string) string { + s = renameRegex.ReplaceAllString(s, repl) + paths = append(paths, s) + return s + } + if err := extract.Tar(ctx, archive, destBase, renameAndRemember); err != nil { + return fmt.Errorf("extract tar archive: %w", err) + } + + for _, p := range paths { + if err := os.Chown(path.Join(destBase, p), uid, gid); err != nil { + log.Warning(ctx, "Failed to chown", "path", p, "err", err) 
+ } + } + + return nil +} diff --git a/runner/internal/executor/lock.go b/runner/internal/runner/executor/lock.go similarity index 100% rename from runner/internal/executor/lock.go rename to runner/internal/runner/executor/lock.go diff --git a/runner/internal/runner/executor/logs.go b/runner/internal/runner/executor/logs.go new file mode 100644 index 0000000000..54b087e324 --- /dev/null +++ b/runner/internal/runner/executor/logs.go @@ -0,0 +1,68 @@ +package executor + +import ( + "errors" + "math" + "sync" + "time" + + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +var ErrLogQuotaExceeded = errors.New("log quota exceeded") + +type appendWriter struct { + mu *sync.RWMutex // shares with executor + history []schemas.LogEvent + timestamp *MonotonicTimestamp // shares with executor + + quota int // bytes per hour, 0 = unlimited + bytesInHour int // bytes written in current hour bucket + currentHour int // monotonic hour bucket index since timeStarted + timeStarted time.Time // monotonic reference point for hour buckets + quotaExceeded chan struct{} // closed when quota is exceeded (out-of-band signal) + exceededOnce sync.Once +} + +func newAppendWriter(mu *sync.RWMutex, timestamp *MonotonicTimestamp) *appendWriter { + return &appendWriter{ + mu: mu, + history: make([]schemas.LogEvent, 0), + timestamp: timestamp, + quotaExceeded: make(chan struct{}), + } +} + +func (w *appendWriter) SetQuota(quota int) { + w.quota = quota + w.timeStarted = time.Now() +} + +// QuotaExceeded returns a channel that is closed when the log quota is exceeded. 
+func (w *appendWriter) QuotaExceeded() <-chan struct{} { + return w.quotaExceeded +} + +func (w *appendWriter) Write(p []byte) (n int, err error) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.quota > 0 { + hour := int(math.Floor(time.Since(w.timeStarted).Hours())) + if hour != w.currentHour { + w.bytesInHour = 0 + w.currentHour = hour + } + if w.bytesInHour+len(p) > w.quota { + w.exceededOnce.Do(func() { close(w.quotaExceeded) }) + return 0, ErrLogQuotaExceeded + } + w.bytesInHour += len(p) + } + + pCopy := make([]byte, len(p)) + copy(pCopy, p) + w.history = append(w.history, schemas.LogEvent{Message: pCopy, Timestamp: w.timestamp.Next()}) + + return len(p), nil +} diff --git a/runner/internal/runner/executor/query.go b/runner/internal/runner/executor/query.go new file mode 100644 index 0000000000..f3acbf20ac --- /dev/null +++ b/runner/internal/runner/executor/query.go @@ -0,0 +1,42 @@ +package executor + +import ( + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +func (ex *RunExecutor) GetJobWsLogsHistory() []schemas.LogEvent { + return ex.jobWsLogs.history +} + +func (ex *RunExecutor) GetHistory(timestamp int64) *schemas.PullResponse { + return &schemas.PullResponse{ + JobStates: eventsAfter(ex.jobStateHistory, timestamp), + JobLogs: eventsAfter(ex.jobLogs.history, timestamp), + RunnerLogs: eventsAfter(ex.runnerLogs.history, timestamp), + LastUpdated: ex.timestamp.GetLatest(), + NoConnectionsSecs: ex.connectionTracker.GetNoConnectionsSecs(), + HasMore: ex.state != WaitLogsFinished, + } +} + +func (ex *RunExecutor) GetRunnerState() string { + return ex.state +} + +type OrderedEvent interface { + GetTimestamp() int64 +} + +func eventsAfter[T OrderedEvent](events []T, timestamp int64) []T { + left := 0 + right := len(events) + for left < right { + mid := (left + right) / 2 + if events[mid].GetTimestamp() <= timestamp { + left = mid + 1 + } else { + right = mid + } + } + return events[left:] +} diff --git a/runner/internal/runner/executor/repo.go 
b/runner/internal/runner/executor/repo.go new file mode 100644 index 0000000000..40cb495fbf --- /dev/null +++ b/runner/internal/runner/executor/repo.go @@ -0,0 +1,286 @@ +package executor + +import ( + "context" + "errors" + "fmt" + "io" + "io/fs" + "os" + "os/exec" + "path" + "path/filepath" + + "github.com/codeclysm/extract/v4" + + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" + "github.com/dstackai/dstack/runner/internal/runner/repo" + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +// WriteRepoBlob must be called after SetJob +func (ex *RunExecutor) WriteRepoBlob(src io.Reader) error { + if err := os.MkdirAll(ex.repoBlobDir, 0o755); err != nil { + return fmt.Errorf("create blob directory: %w", err) + } + ex.repoBlobPath = path.Join(ex.repoBlobDir, ex.run.RunSpec.RepoId) + blob, err := os.Create(ex.repoBlobPath) + if err != nil { + return fmt.Errorf("create blob file: %w", err) + } + defer func() { _ = blob.Close() }() + if _, err = io.Copy(blob, src); err != nil { + return fmt.Errorf("copy blob data: %w", err) + } + return nil +} + +// setupRepo must be called from Run after setJobUser and setJobWorkingDir +func (ex *RunExecutor) setupRepo(ctx context.Context) error { + log.Trace(ctx, "Setting up repo") + if ex.jobWorkingDir == "" { + return errors.New("working dir is not set") + } + if !filepath.IsAbs(ex.jobWorkingDir) { + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) + } + if ex.jobSpec.RepoDir == nil { + return errors.New("repo dir is not set") + } + + var err error + ex.repoDir, err = utils.ExpandPath(*ex.jobSpec.RepoDir, ex.jobWorkingDir, ex.jobUser.HomeDir) + if err != nil { + return fmt.Errorf("expand repo dir path: %w", err) + } + log.Trace(ctx, "Job repo dir", "path", ex.repoDir) + + repoDirIsEmpty, repoDirMustBeMoved, err := ex.checkRepoDir(ctx) + if err != nil { + return fmt.Errorf("prepare repo dir: %w", err) + } + if !repoDirIsEmpty { + 
var repoExistsAction schemas.RepoExistsAction + if ex.jobSpec.RepoExistsAction != nil { + repoExistsAction = *ex.jobSpec.RepoExistsAction + } else { + log.Debug(ctx, "repo_exists_action is not set, using legacy 'skip' action") + repoExistsAction = schemas.RepoExistsActionSkip + } + switch repoExistsAction { + case schemas.RepoExistsActionError: + return fmt.Errorf("repo dir is not empty: %s", ex.repoDir) + case schemas.RepoExistsActionSkip: + log.Info(ctx, "Skipping repo checkout: repo dir is not empty", "path", ex.repoDir) + return nil + default: + return fmt.Errorf("unsupported action: %s", repoExistsAction) + } + } + + if repoDirMustBeMoved { + // Move existing repo files from the repo dir and back to be able to git clone. + // Currently, only needed for volumes mounted inside repo with lost+found present. + tmpRepoDir, err := os.MkdirTemp(ex.tempDir, "repo_dir_copy") + if err != nil { + return fmt.Errorf("create temp repo dir: %w", err) + } + defer func() { _ = os.RemoveAll(tmpRepoDir) }() + err = ex.moveRepoDir(ctx, tmpRepoDir) + if err != nil { + return fmt.Errorf("move repo dir: %w", err) + } + defer func() { + err_ := ex.restoreRepoDir(ctx, tmpRepoDir) + if err == nil { + err = fmt.Errorf("restore repo dir: %w", err_) + } + }() + } + + switch ex.getRepoData().RepoType { + case "remote": + log.Trace(ctx, "Fetching git repository") + if err := ex.prepareGit(ctx); err != nil { + return fmt.Errorf("prepare git repo: %w", err) + } + case "local", "virtual": + if err := ex.extractCodeArchive(ctx); err != nil { + return fmt.Errorf("extract code archive: %w", err) + } + default: + return fmt.Errorf("unknown RepoType: %s", ex.getRepoData().RepoType) + } + + if err := ex.chownRepoDir(ctx); err != nil { + return fmt.Errorf("chown repo dir: %w", err) + } + + if err := os.RemoveAll(ex.repoBlobDir); err != nil { + log.Warning(ctx, "Failed to remove repo blobs dir", "path", ex.repoBlobDir, "err", err) + } + + return err +} + +func (ex *RunExecutor) prepareGit(ctx 
context.Context) error { + repoManager := repo.NewManager( + ctx, + ex.repoCredentials.CloneURL, + ex.getRepoData().RepoBranch, + ex.getRepoData().RepoHash, + ex.jobSpec.SingleBranch, + ).WithLocalPath(ex.repoDir) + if ex.repoCredentials != nil { + log.Trace(ctx, "Credentials is not empty") + switch ex.repoCredentials.GetProtocol() { + case "https": + log.Trace(ctx, "Select HTTPS protocol") + if ex.repoCredentials.OAuthToken == nil { + log.Warning(ctx, "OAuth token is empty") + break + } + repoManager.WithTokenAuth(*ex.repoCredentials.OAuthToken) + case "ssh": + log.Trace(ctx, "Select SSH protocol") + if ex.repoCredentials.PrivateKey == nil { + return fmt.Errorf("private key is empty") + } + repoManager.WithSSHAuth(*ex.repoCredentials.PrivateKey, "") // we don't support passphrase + default: + return fmt.Errorf("unsupported remote repo protocol: %s", ex.repoCredentials.GetProtocol()) + } + } else { + log.Trace(ctx, "Credentials is empty") + } + + log.Trace(ctx, "Checking out remote repo", "GIT URL", repoManager.URL()) + if err := repoManager.Checkout(ctx); err != nil { + return fmt.Errorf("checkout repo: %w", err) + } + if err := repoManager.SetConfig(ex.getRepoData().RepoConfigName, ex.getRepoData().RepoConfigEmail); err != nil { + return fmt.Errorf("set repo config: %w", err) + } + + if ex.repoBlobPath == "" { + log.Trace(ctx, "No diff to apply") + return nil + } + log.Trace(ctx, "Applying diff") + repoDiff, err := os.ReadFile(ex.repoBlobPath) + if err != nil { + return fmt.Errorf("read repo diff: %w", err) + } + if err := repo.ApplyDiff(ctx, ex.repoDir, string(repoDiff)); err != nil { + return fmt.Errorf("apply diff: %w", err) + } + return nil +} + +func (ex *RunExecutor) extractCodeArchive(ctx context.Context) error { + if ex.repoBlobPath == "" { + log.Trace(ctx, "No code archive to extract") + return nil + } + log.Trace(ctx, "Extracting code archive", "src", ex.repoBlobPath, "dst", ex.repoDir) + file, err := os.Open(ex.repoBlobPath) + if err != nil { + return 
fmt.Errorf("open code archive: %w", err) + } + defer func() { _ = file.Close() }() + if err := extract.Tar(ctx, file, ex.repoDir, nil); err != nil { + return fmt.Errorf("extract tar archive: %w", err) + } + return nil +} + +func (ex *RunExecutor) checkRepoDir(ctx context.Context) (isEmpty bool, mustBeMoved bool, err error) { + log.Trace(ctx, "Checking repo dir") + info, err := os.Stat(ex.repoDir) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + // No repo dir + return true, false, nil + } + return false, false, fmt.Errorf("stat repo dir: %w", err) + } + if !info.IsDir() { + return false, false, fmt.Errorf("stat repo dir: %s is not a dir", ex.repoDir) + } + entries, err := os.ReadDir(ex.repoDir) + if err != nil { + return false, false, fmt.Errorf("read repo dir: %w", err) + } + if len(entries) == 0 { + // Repo dir is empty + return true, false, nil + } + if len(entries) == 1 && entries[0].Name() == "lost+found" { + // lost+found may be present on a newly created volume + // We (but not Git, thus mustBeMoved = true) consider such a dir "empty" + return true, true, nil + } + // Repo dir is not empty + return false, false, nil +} + +func (ex *RunExecutor) moveRepoDir(ctx context.Context, tmpDir string) error { + if err := moveDir(ctx, ex.repoDir, tmpDir); err != nil { + return fmt.Errorf("move directory: %w", err) + } + return nil +} + +func (ex *RunExecutor) restoreRepoDir(ctx context.Context, tmpDir string) error { + if err := moveDir(ctx, tmpDir, ex.repoDir); err != nil { + return fmt.Errorf("move directory: %w", err) + } + return nil +} + +func (ex *RunExecutor) chownRepoDir(ctx context.Context) error { + log.Trace(ctx, "Chowning repo dir") + exists, err := utils.PathExists(ex.repoDir) + // We consider all errors here non-fatal + if err != nil { + log.Warning(ctx, "Failed to check if repo dir exists", "err", err) + return nil + } + if !exists { + log.Trace(ctx, "Repo dir does not exist") + return nil + } + return filepath.WalkDir( + ex.repoDir, + func(p 
string, d fs.DirEntry, err error) error { + if err != nil { + log.Warning(ctx, "Error while walking repo dir", "path", p, "err", err) + return nil + } + if err := os.Chown(p, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + log.Warning(ctx, "Error while chowning repo dir", "path", p, "err", err) + } + return nil + }, + ) +} + +func moveDir(ctx context.Context, srcDir, dstDir string) error { + // We cannot just move/rename files because with volumes they'll be on different devices + cmd := exec.CommandContext(ctx, "cp", "-a", srcDir+"/.", dstDir) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to cp: %w, output: %s", err, string(output)) + } + entries, err := os.ReadDir(srcDir) + if err != nil { + return fmt.Errorf("read source directory: %w", err) + } + for _, entry := range entries { + err := os.RemoveAll(filepath.Join(srcDir, entry.Name())) + if err != nil { + return fmt.Errorf("remove file from source: %w", err) + } + } + return nil +} diff --git a/runner/internal/runner/executor/states.go b/runner/internal/runner/executor/states.go new file mode 100644 index 0000000000..f188871442 --- /dev/null +++ b/runner/internal/runner/executor/states.go @@ -0,0 +1,8 @@ +package executor + +const ( + WaitSubmit = "wait_submit" + WaitRun = "wait_run" + ServeLogs = "serve_logs" + WaitLogsFinished = "wait_logs_finished" +) diff --git a/runner/internal/runner/executor/timestamp.go b/runner/internal/runner/executor/timestamp.go new file mode 100644 index 0000000000..b06d8cf47e --- /dev/null +++ b/runner/internal/runner/executor/timestamp.go @@ -0,0 +1,61 @@ +package executor + +import ( + "context" + "sync" + "time" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +type MonotonicTimestamp struct { + initial time.Time + initialUnix int64 // seconds + elapsed int64 // seconds since initial + counter int // surrogate milliseconds + overflow bool + mu sync.RWMutex + getNow func() time.Time +} + +func NewMonotonicTimestamp() 
*MonotonicTimestamp { + return newMonotonicTimestamp(time.Now) +} + +func newMonotonicTimestamp(getNow func() time.Time) *MonotonicTimestamp { + // getNow must return time.Time with monotonic reading + now := getNow() + return &MonotonicTimestamp{ + initial: now, + initialUnix: now.Unix(), + mu: sync.RWMutex{}, + getNow: getNow, + } +} + +func (t *MonotonicTimestamp) GetLatest() int64 { + t.mu.RLock() + defer t.mu.RUnlock() + return (t.initialUnix+t.elapsed)*1000 + int64(t.counter) +} + +func (t *MonotonicTimestamp) Next() int64 { + t.mu.Lock() + now := t.getNow() + elapsed := int64(now.Sub(t.initial) / time.Second) + if elapsed == t.elapsed { + if t.counter < 999 { + t.counter++ + } else if !t.overflow { + // warn only once per second to avoid log spamming + log.Warning(context.TODO(), "Monotonic timestamp counter overflowed", "unix", t.initialUnix+elapsed) + t.overflow = true + } + } else { + t.elapsed = elapsed + t.counter = 0 + t.overflow = false + } + t.mu.Unlock() + return t.GetLatest() +} diff --git a/runner/internal/runner/executor/timestamp_test.go b/runner/internal/runner/executor/timestamp_test.go new file mode 100644 index 0000000000..ab28bf416e --- /dev/null +++ b/runner/internal/runner/executor/timestamp_test.go @@ -0,0 +1,43 @@ +package executor + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestTimestamp_Counter(t *testing.T) { + now := time.Now() + ts := newMonotonicTimestamp(func() time.Time { return now }) + initial := ts.GetLatest() + assert.Equal(t, int64(1), ts.Next()-initial) + assert.Equal(t, int64(2), ts.Next()-initial) + now = now.Add(999 * time.Millisecond) + assert.Equal(t, int64(3), ts.Next()-initial) + now = now.Add(100 * time.Millisecond) + assert.Equal(t, int64(1000), ts.Next()-initial) + assert.Equal(t, int64(1001), ts.Next()-initial) +} + +func TestTimestamp_CounterOverflow(t *testing.T) { + now := time.Now() + ts := newMonotonicTimestamp(func() time.Time { return now }) + initial := 
ts.GetLatest() + for i := 0; i < 997; i++ { + ts.Next() + } + assert.Equal(t, int64(998), ts.Next()-initial) + assert.False(t, ts.overflow) + assert.Equal(t, int64(999), ts.Next()-initial) + assert.False(t, ts.overflow) + assert.Equal(t, int64(999), ts.Next()-initial) + assert.True(t, ts.overflow) + assert.Equal(t, int64(999), ts.Next()-initial) + assert.True(t, ts.overflow) + now = now.Add(time.Second) + assert.Equal(t, int64(1000), ts.Next()-initial) + assert.False(t, ts.overflow) + assert.Equal(t, int64(1001), ts.Next()-initial) + assert.False(t, ts.overflow) +} diff --git a/runner/internal/runner/executor/user.go b/runner/internal/runner/executor/user.go new file mode 100644 index 0000000000..df9f0fe45c --- /dev/null +++ b/runner/internal/runner/executor/user.go @@ -0,0 +1,184 @@ +package executor + +import ( + "context" + "errors" + "fmt" + "os" + osuser "os/user" + "path" + "strconv" + "strings" + + "github.com/dstackai/dstack/runner/internal/common/log" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +func (ex *RunExecutor) setJobUser(ctx context.Context) error { + if ex.jobSpec.User == nil { + // JobSpec.User is nil if the user is not specified either in the dstack configuration + // (the `user` property) or in the image (the `USER` Dockerfile instruction). + // In such cases, the root user should be used as a fallback, and we use the current user, + // assuming that the runner is started by root. 
+ ex.jobUser = &ex.currentUser + } else { + jobUser, err := jobUserFromJobSpecUser( + ex.jobSpec.User, + osuser.LookupId, osuser.Lookup, + osuser.LookupGroup, (*osuser.User).GroupIds, + ) + if err != nil { + return fmt.Errorf("job user from job spec: %w", err) + } + ex.jobUser = jobUser + } + + if err := checkHomeDir(ex.jobUser.HomeDir); err != nil { + log.Warning(ctx, "Error while checking job user home dir, using / instead", "err", err) + ex.jobUser.HomeDir = "/" + } + + log.Trace(ctx, "Job user", "user", ex.jobUser) + return nil +} + +func jobUserFromJobSpecUser( + jobSpecUser *schemas.User, + userLookupIdFunc func(string) (*osuser.User, error), + userLookupNameFunc func(string) (*osuser.User, error), + groupLookupNameFunc func(string) (*osuser.Group, error), + userGroupIdsFunc func(*osuser.User) ([]string, error), +) (*linuxuser.User, error) { + if jobSpecUser.Uid == nil && jobSpecUser.Username == nil { + return nil, errors.New("neither uid nor username is set") + } + + var err error + var osUser *osuser.User + + // -1 is a placeholder value, the actual value must be >= 0 + //nolint:ineffassign + uid := -1 + if jobSpecUser.Uid != nil { + uid = int(*jobSpecUser.Uid) + osUser, err = userLookupIdFunc(strconv.Itoa(uid)) + if err != nil { + var notFoundErr osuser.UnknownUserIdError + if !errors.As(err, ¬FoundErr) { + return nil, fmt.Errorf("lookup user by id: %w", err) + } + } + } else { + osUser, err = userLookupNameFunc(*jobSpecUser.Username) + if err != nil { + return nil, fmt.Errorf("lookup user by name: %w", err) + } + uid, err = parseStringId(osUser.Uid) + if err != nil { + return nil, fmt.Errorf("parse user id: %w", err) + } + } + if uid == -1 { + // Assertion, should never occur + return nil, errors.New("failed to infer user id") + } + + // -1 is a placeholder value, the actual value must be >= 0 + //nolint:ineffassign + gid := -1 + // Must include at least one gid, see len(gids) == 0 assertion below + var gids []int + if jobSpecUser.Gid != nil { + gid = 
int(*jobSpecUser.Gid) + // Here and below: + // > Note that when specifying a group for the user, the user will have + // > only the specified group membership. + // > Any other configured group memberships will be ignored. + // See: https://docs.docker.com/reference/dockerfile/#user + gids = []int{gid} + } else if jobSpecUser.Groupname != nil { + osGroup, err := groupLookupNameFunc(*jobSpecUser.Groupname) + if err != nil { + return nil, fmt.Errorf("lookup group by name: %w", err) + } + gid, err = parseStringId(osGroup.Gid) + if err != nil { + return nil, fmt.Errorf("parse group id: %w", err) + } + gids = []int{gid} + } else if osUser != nil { + gid, err = parseStringId(osUser.Gid) + if err != nil { + return nil, fmt.Errorf("parse group id: %w", err) + } + rawGids, err := userGroupIdsFunc(osUser) + if err != nil { + return nil, fmt.Errorf("get user supplementary group ids: %w", err) + } + // [main_gid, supplementary_gid_1, supplementary_gid_2, ...] + gids = make([]int, len(rawGids)+1) + gids[0] = gid + for index, rawGid := range rawGids { + supplementaryGid, err := parseStringId(rawGid) + if err != nil { + return nil, fmt.Errorf("parse supplementary group id: %w", err) + } + gids[index+1] = supplementaryGid + } + } else { + // > When the user doesn't have a primary group then the image + // > (or the next instructions) will be run with the root group. 
+ // See: https://docs.docker.com/reference/dockerfile/#user + gid = 0 + gids = []int{gid} + } + if gid == -1 { + // Assertion, should never occur + return nil, errors.New("failed to infer group id") + } + if len(gids) == 0 { + // Assertion, should never occur + return nil, errors.New("failed to infer supplementary group ids") + } + + username := "" + homeDir := "" + if osUser != nil { + username = osUser.Username + homeDir = osUser.HomeDir + } + + return linuxuser.NewUser(uid, gid, gids, username, homeDir), nil +} + +func parseStringId(stringId string) (int, error) { + id, err := strconv.Atoi(stringId) + if err != nil { + return 0, err + } + if id < 0 { + return 0, fmt.Errorf("negative id value: %d", id) + } + return id, nil +} + +func checkHomeDir(homeDir string) error { + if homeDir == "" { + return errors.New("not set") + } + if !path.IsAbs(homeDir) { + return fmt.Errorf("must be absolute: %s", homeDir) + } + if info, err := os.Stat(homeDir); errors.Is(err, os.ErrNotExist) { + if strings.Contains(homeDir, "nonexistent") { + // let `/nonexistent` stay non-existent + return fmt.Errorf("non-existent: %s", homeDir) + } + } else if err != nil { + return err + } else if !info.IsDir() { + return fmt.Errorf("not a directory: %s", homeDir) + } + return nil +} diff --git a/runner/internal/runner/executor/user_test.go b/runner/internal/runner/executor/user_test.go new file mode 100644 index 0000000000..c0fc202f2e --- /dev/null +++ b/runner/internal/runner/executor/user_test.go @@ -0,0 +1,232 @@ +package executor + +import ( + "errors" + osuser "os/user" + "strconv" + "testing" + + "github.com/stretchr/testify/require" + + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +var shouldNotBeCalledErr = errors.New("this function should not be called") + +func unknownUserIdError(t *testing.T, strUid string) osuser.UnknownUserIdError { + t.Helper() + uid, err := 
strconv.Atoi(strUid) + require.NoError(t, err) + return osuser.UnknownUserIdError(uid) +} + +func TestJobUserFromJobSpecUser_Uid_UserDoesNotExist(t *testing.T) { + specUid := uint32(2000) + specUser := schemas.User{Uid: &specUid} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 0, + Gids: []int{0}, + Username: "", + HomeDir: "", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, unknownUserIdError(t, id) }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_Gid_UserDoesNotExist(t *testing.T) { + specUid := uint32(2000) + specGid := uint32(200) + specUser := schemas.User{Uid: &specUid, Gid: &specGid} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "", + HomeDir: "", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, unknownUserIdError(t, id) }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_UserExists(t *testing.T) { + specUid := uint32(2000) + specUser := schemas.User{Uid: &specUid} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osUserGids := []string{"300", "400", "500"} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 300, + Gids: []int{300, 400, 500}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := 
jobUserFromJobSpecUser( + &specUser, + func(uid string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(gid string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return osUserGids, nil }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_Gid_UserExists(t *testing.T) { + specUid := uint32(2000) + specGid := uint32(200) + specUser := schemas.User{Uid: &specUid, Gid: &specGid} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_UserDoesNotExist(t *testing.T) { + specUsername := "unknownuser" + specUser := schemas.User{Username: &specUsername} + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return nil, osuser.UnknownUserError(name) }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.ErrorContains(t, err, "lookup user by name") + require.Nil(t, user) +} + +func TestJobUserFromJobSpecUser_Username_UserExists(t *testing.T) { + specUsername := "testnuser" + 
specUser := schemas.User{Username: &specUsername} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osUserGids := []string{"300", "400", "500"} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 300, + Gids: []int{300, 400, 500}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return osUserGids, nil }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_Groupname_UserExists_GroupExists(t *testing.T) { + specUsername := "testnuser" + specGroupname := "testgroup" + specUser := schemas.User{Username: &specUsername, Groupname: &specGroupname} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osGroup := osuser.Group{ + Gid: "200", + Name: specGroupname, + } + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return &osGroup, nil }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_Groupname_UserExists_GroupDoesNotExist(t *testing.T) { + specUsername := "testnuser" + specGroupname := "testgroup" + specUser := schemas.User{Username: &specUsername, Groupname: &specGroupname} + osUser := 
osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return nil, osuser.UnknownGroupError(name) }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.ErrorContains(t, err, "lookup group by name") + require.Nil(t, user) +} diff --git a/runner/internal/runner/linux/capabilities/capabilities.go b/runner/internal/runner/linux/capabilities/capabilities.go new file mode 100644 index 0000000000..fb70b60930 --- /dev/null +++ b/runner/internal/runner/linux/capabilities/capabilities.go @@ -0,0 +1,50 @@ +package capabilities + +import ( + "strings" + + "kernel.org/pub/linux/libs/security/libcap/cap" +) + +type Capability cap.Value + +const ( + SETUID = Capability(cap.SETUID) + SETGID = Capability(cap.SETGID) + CHOWN = Capability(cap.CHOWN) + SYS_RESOURCE = Capability(cap.SYS_RESOURCE) +) + +// String returns a text representation of the capability in the form used by container folks: +// UPPER_CASE, no CAP_ prefix: cap_sys_admin -> SYS_ADMIN +func (c Capability) String() string { + return strings.ToUpper(cap.Value(c).String()[4:]) +} + +// Has returns true if the current process has the specified capability in its effective set +func Has(c Capability) (bool, error) { + set, err := cap.GetPID(0) + if err != nil { + return false, err + } + return set.GetFlag(cap.Effective, cap.Value(c)) +} + +// Check checks and returns those capabilities that are _missing_ from the effective set +// of the current process +func Check(cs ...Capability) (missing []Capability, err error) { + set, err := cap.GetPID(0) + if err != nil { + return nil, err + } + for _, c := range cs { + ok, err := set.GetFlag(cap.Effective, cap.Value(c)) + if err != nil { + return nil, err + 
} + if !ok { + missing = append(missing, c) + } + } + return missing, nil +} diff --git a/runner/internal/runner/linux/user/user.go b/runner/internal/runner/linux/user/user.go new file mode 100644 index 0000000000..caecc1324f --- /dev/null +++ b/runner/internal/runner/linux/user/user.go @@ -0,0 +1,96 @@ +// Despite this package is being located inside the linux package, it should work on any Unix-like system. +package user + +import ( + "fmt" + osuser "os/user" + "slices" + "strconv" + "syscall" +) + +// User represents the user part of process `credentials(7)` +// (real user ID, real group ID, supplementary group IDs) enriched with +// some info from the user database `passwd(5)` (login name, home dir). +// Note, unlike the User struct from os/user, User does not necessarily +// correspond to any existing user account, for example, any of IDs may not exist +// in passwd(5) or group(5) databases at all or the user may not belong to +// the primary group or any of the specified supplementary groups. +type User struct { + // Real user ID + Uid int + // Real group ID + Gid int + // Supplementary group IDs. 
The primary group should be always included and + // the resulting list should be sorted in ascending order with duplicates removed; + // NewUser() performs such normalization + Gids []int + // May be empty, e.g., if the user does not exist + Username string + // May be Empty, e.g., if the user does not exist + HomeDir string +} + +func (u *User) String() string { + // The format is inspired by `id(1)` + formattedUsername := "" + if u.Username != "" { + formattedUsername = fmt.Sprintf("(%s)", u.Username) + } + return fmt.Sprintf("uid=%d%s gid=%d groups=%v home=%s", u.Uid, formattedUsername, u.Gid, u.Gids, u.HomeDir) +} + +func (u *User) ProcessCredentials() (*syscall.Credential, error) { + if u.Uid < 0 { + return nil, fmt.Errorf("negative user id: %d", u.Uid) + } + if u.Gid < 0 { + return nil, fmt.Errorf("negative group id: %d", u.Gid) + } + groups := make([]uint32, len(u.Gids)) + for index, gid := range u.Gids { + if gid < 0 { + return nil, fmt.Errorf("negative supplementary group id: %d", gid) + } + groups[index] = uint32(gid) + } + creds := syscall.Credential{ + Uid: uint32(u.Uid), + Gid: uint32(u.Gid), + Groups: groups, + } + return &creds, nil +} + +func (u *User) IsRoot() bool { + return u.Uid == 0 +} + +func NewUser(uid int, gid int, gids []int, username string, homeDir string) *User { + normalizedGids := append([]int{gid}, gids...) 
+ slices.Sort(normalizedGids) + normalizedGids = slices.Compact(normalizedGids) + return &User{ + Uid: uid, + Gid: gid, + Gids: normalizedGids, + Username: username, + HomeDir: homeDir, + } +} + +func FromCurrentProcess() (*User, error) { + uid := syscall.Getuid() + gid := syscall.Getgid() + gids, err := syscall.Getgroups() + if err != nil { + return nil, fmt.Errorf("get supplementary groups: %w", err) + } + username := "" + homeDir := "" + if osUser, err := osuser.LookupId(strconv.Itoa(uid)); err == nil { + username = osUser.Username + homeDir = osUser.HomeDir + } + return NewUser(uid, gid, gids, username, homeDir), nil +} diff --git a/runner/internal/runner/metrics/cgroups.go b/runner/internal/runner/metrics/cgroups.go new file mode 100644 index 0000000000..7ac89db4a1 --- /dev/null +++ b/runner/internal/runner/metrics/cgroups.go @@ -0,0 +1,107 @@ +package metrics + +import ( + "bufio" + "context" + "errors" + "fmt" + "os" + "strings" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +func getProcessCgroupMountPoint(ctx context.Context, ProcPidMountsPath string) (string, error) { + // See proc_pid_mounts(5) for the ProcPidMountsPath file description + file, err := os.Open(ProcPidMountsPath) + if err != nil { + return "", fmt.Errorf("open mounts file: %w", err) + } + defer func() { + _ = file.Close() + }() + + mountPoint := "" + hasCgroupV1 := false + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + // See fstab(5) for the format description + fields := strings.Fields(line) + if len(fields) != 6 { + log.Warning(ctx, "Unexpected number of fields in mounts file", "num", len(fields), "line", line) + continue + } + fsType := fields[2] + if fsType == "cgroup2" { + mountPoint = fields[1] + break + } + if fsType == "cgroup" { + hasCgroupV1 = true + } + } + if err := scanner.Err(); err != nil { + log.Warning(ctx, "Error while scanning mounts file", "err", err) + } + + if mountPoint != "" { + return mountPoint, nil + } + + 
if hasCgroupV1 { + return "", errors.New("only cgroup v1 mounts found") + } + + return "", errors.New("no cgroup mounts found") +} + +// getProcessCgroupPathname scans the file at procPidCgroupPath and returns the pathname of the first +// well-formed cgroup v2 entry (hierarchy ID "0" with an empty controller list). +// Malformed lines are logged and skipped. An error is returned if the file cannot be opened or +// if no v2 pathname is found, with a distinct message when only v1 entries were seen. +func getProcessCgroupPathname(ctx context.Context, procPidCgroupPath string) (string, error) { + // See cgroups(7) for the procPidCgroupPath file description + file, err := os.Open(procPidCgroupPath) + if err != nil { + return "", fmt.Errorf("open cgroup file: %w", err) + } + defer func() { + _ = file.Close() + }() + + pathname := "" + hasCgroupV1 := false + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + // See cgroups(7) for the format description. + // Split into at most 3 fields so that a pathname containing ":" stays intact + fields := strings.SplitN(line, ":", 3) + if len(fields) != 3 { + log.Warning(ctx, "Unexpected number of fields in cgroup file", "num", len(fields), "line", line) + continue + } + if fields[0] != "0" { + hasCgroupV1 = true + continue + } + if fields[1] != "" { + // Must be empty for v2 + log.Warning(ctx, "Unexpected v2 entry in cgroup file", "line", line) + continue + } + pathname = fields[2] + break + } + if err := scanner.Err(); err != nil { + log.Warning(ctx, "Error while scanning cgroup file", "err", err) + } + + if pathname != "" { + return pathname, nil + } + + if hasCgroupV1 { + return "", errors.New("only cgroup v1 pathnames found") + } + + return "", errors.New("no cgroup pathname found") +} diff --git a/runner/internal/runner/metrics/cgroups_test.go b/runner/internal/runner/metrics/cgroups_test.go new file mode 100644 index 0000000000..3e6e0abca7 --- /dev/null +++ b/runner/internal/runner/metrics/cgroups_test.go @@ -0,0 +1,87 @@ +package metrics + +import ( + "fmt" + "os" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +const ( + cgroup2MountLine = "cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0" + cgroupMountLine = "cgroup /sys/fs/cgroup/cpu,cpuacct cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct 0 0" + rootMountLine = "/dev/nvme0n1p5 / ext4 rw,relatime 0 0" +) +
+func TestGetProcessCgroupMountPoint_ErrorNoCgroupMounts(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, "malformed line") + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.ErrorContains(t, err, "no cgroup mounts found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupMountPoint_ErrorOnlyCgroupV1Mounts(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, cgroupMountLine) + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.ErrorContains(t, err, "only cgroup v1 mounts found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupMountPoint_OK(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, cgroupMountLine, cgroup2MountLine) + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", mountPoint) +} + +func TestGetProcessCgroupPathname_ErrorNoCgroup(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "malformed entry") + + mountPoint, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.ErrorContains(t, err, "no cgroup pathname found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupPathname_ErrorOnlyCgroupV1(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "7:cpu,cpuacct:/user.slice") + + pathname, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.ErrorContains(t, err, "only cgroup v1 pathnames found") + require.Equal(t, "", pathname) +} + +func TestGetProcessCgroupPathname_OK(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "7:cpu,cpuacct:/user.slice", "0::/user.slice/user-1000.slice/session-1.scope") + + mountPoint, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.NoError(t, err) + require.Equal(t, 
"/user.slice/user-1000.slice/session-1.scope", mountPoint) +} + +func createProcFile(t *testing.T, name string, lines ...string) string { + t.Helper() + tmpDir := t.TempDir() + pth := path.Join(tmpDir, name) + file, err := os.OpenFile(pth, os.O_WRONLY|os.O_CREATE, 0o600) + require.NoError(t, err) + defer func() { + err := file.Close() + require.NoError(t, err) + }() + for _, line := range lines { + _, err := fmt.Fprintln(file, line) + require.NoError(t, err) + } + return pth +} diff --git a/runner/internal/runner/metrics/metrics.go b/runner/internal/runner/metrics/metrics.go new file mode 100644 index 0000000000..56c27a2bb1 --- /dev/null +++ b/runner/internal/runner/metrics/metrics.go @@ -0,0 +1,271 @@ +package metrics + +import ( + "bytes" + "context" + "errors" + "fmt" + "os" + "os/exec" + "path" + "strconv" + "strings" + "time" + + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/schemas" +) + +type MetricsCollector struct { + cgroupMountPoint string + gpuVendor gpu.GpuVendor +} + +func NewMetricsCollector(ctx context.Context) (*MetricsCollector, error) { + // It's unlikely that cgroup mount point will change during container lifetime, + // so we detect it only once and reuse. + cgroupMountPoint, err := getProcessCgroupMountPoint(ctx, "/proc/self/mounts") + if err != nil { + return nil, fmt.Errorf("get cgroup mount point: %w", err) + } + gpuVendor := gpu.GetGpuVendor() + return &MetricsCollector{ + cgroupMountPoint: cgroupMountPoint, + gpuVendor: gpuVendor, + }, nil +} + +func (s *MetricsCollector) GetSystemMetrics(ctx context.Context) (*schemas.SystemMetrics, error) { + // It's possible to move a process from one control group to another (it's unlikely, but nonetheless), + // so we detect the current group each time. 
+ cgroupPathname, err := getProcessCgroupPathname(ctx, "/proc/self/cgroup") + if err != nil { + return nil, fmt.Errorf("get cgroup pathname: %w", err) + } + cgroupPath := path.Join(s.cgroupMountPoint, cgroupPathname) + timestamp := time.Now() + cpuUsage, err := s.GetCPUUsageMicroseconds(cgroupPath) + if err != nil { + return nil, err + } + memoryUsage, err := s.GetMemoryUsageBytes(cgroupPath) + if err != nil { + return nil, err + } + memoryCache, err := s.GetMemoryCacheBytes(cgroupPath) + if err != nil { + return nil, err + } + // memoryUsage and memoryCache come from two separate file reads, so clamp at zero + // instead of letting the unsigned subtraction wrap around + memoryWorkingSet := uint64(0) + if memoryUsage > memoryCache { + memoryWorkingSet = memoryUsage - memoryCache + } + gpuMetrics, err := s.GetGPUMetrics(ctx) + if err != nil { + // GPU metrics are best-effort: log the failure and still return the other metrics + log.Debug(ctx, "Failed to get gpu metrics", "err", err) + } + return &schemas.SystemMetrics{ + Timestamp: timestamp.UnixMicro(), + CpuUsage: cpuUsage, + MemoryUsage: memoryUsage, + MemoryWorkingSet: memoryWorkingSet, + GPUMetrics: gpuMetrics, + }, nil +} + +// GetCPUUsageMicroseconds reads cpu.stat under cgroupPath and returns the value of its usage_usec entry. +func (s *MetricsCollector) GetCPUUsageMicroseconds(cgroupPath string) (uint64, error) { + cgroupCPUUsagePath := path.Join(cgroupPath, "cpu.stat") + + data, err := os.ReadFile(cgroupCPUUsagePath) + if err != nil { + return 0, fmt.Errorf("could not read CPU usage: %w", err) + } + + lines := strings.Split(string(data), "\n") + for _, line := range lines { + if strings.HasPrefix(line, "usage_usec") { + parts := strings.Fields(line) + if len(parts) != 2 { + return 0, fmt.Errorf("unexpected format in cpu.stat") + } + usageMicroseconds, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return 0, fmt.Errorf("could not parse usage_usec: %w", err) + } + return usageMicroseconds, nil + } + } + return 0, fmt.Errorf("usage_usec not found in cpu.stat") +} + +func (s *MetricsCollector) GetMemoryUsageBytes(cgroupPath string) (uint64, error) { + cgroupMemoryUsagePath := path.Join(cgroupPath, "memory.current") + + data, err := os.ReadFile(cgroupMemoryUsagePath) + if err != nil { + return 0, fmt.Errorf("could not read memory usage: %w", err) + } + usageStr :=
strings.TrimSpace(string(data)) + + usedMemory, err := strconv.ParseUint(usageStr, 10, 64) + if err != nil { + return 0, fmt.Errorf("could not parse memory usage: %w", err) + } + return usedMemory, nil +} + +// GetMemoryCacheBytes reads memory.stat under cgroupPath and returns the value of its inactive_file entry. +func (s *MetricsCollector) GetMemoryCacheBytes(cgroupPath string) (uint64, error) { + cgroupMemoryStatPath := path.Join(cgroupPath, "memory.stat") + + statData, err := os.ReadFile(cgroupMemoryStatPath) + if err != nil { + return 0, fmt.Errorf("could not read memory.stat: %w", err) + } + + lines := strings.Split(string(statData), "\n") + for _, line := range lines { + if strings.HasPrefix(line, "inactive_file") { + parts := strings.Fields(line) + if len(parts) != 2 { + return 0, fmt.Errorf("unexpected format in memory.stat") + } + cacheBytes, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return 0, fmt.Errorf("could not parse cache value: %w", err) + } + return cacheBytes, nil + } + } + return 0, fmt.Errorf("inactive_file not found in memory.stat") +} + +// GetGPUMetrics dispatches to the vendor-specific collector selected by s.gpuVendor. +// The returned slice is never nil: it is replaced with an empty slice when no metrics were collected. +func (s *MetricsCollector) GetGPUMetrics(ctx context.Context) ([]schemas.GPUMetrics, error) { + var metrics []schemas.GPUMetrics + var err error + switch s.gpuVendor { + case gpu.GpuVendorNvidia: + metrics, err = s.GetNVIDIAGPUMetrics(ctx) + case gpu.GpuVendorAmd: + metrics, err = s.GetAMDGPUMetrics(ctx) + case gpu.GpuVendorIntel: + metrics, err = s.GetIntelAcceleratorMetrics(ctx) + case gpu.GpuVendorTenstorrent: + err = errors.New("tenstorrent metrics not supported") + case gpu.GpuVendorNone: + // pass + } + if metrics == nil { + metrics = []schemas.GPUMetrics{} + } + return metrics, err +} + +func (s *MetricsCollector) GetNVIDIAGPUMetrics(ctx context.Context) ([]schemas.GPUMetrics, error) { + cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=memory.used,utilization.gpu", "--format=csv,noheader,nounits") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return []schemas.GPUMetrics{}, fmt.Errorf("failed to execute nvidia-smi: %w", err) + } + return
parseNVIDIASMILikeMetrics(out.String()) +} + +func (s *MetricsCollector) GetAMDGPUMetrics(ctx context.Context) ([]schemas.GPUMetrics, error) { + cmd := exec.CommandContext(ctx, "amd-smi", "monitor", "-vu", "--csv") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("failed to execute amd-smi: %w", err) + } + return s.getAMDGPUMetrics(out.String()) +} + +func (s *MetricsCollector) getAMDGPUMetrics(csv string) ([]schemas.GPUMetrics, error) { + lines := strings.Split(strings.TrimSpace(csv), "\n") + if len(lines) < 2 { + return nil, errors.New("too few lines in amd-smi output") + } + + gpuUtilIndex := -1 + memUsedIndex := -1 + for index, header := range strings.Split(lines[0], ",") { + switch header { + case "gfx": + gpuUtilIndex = index + case "vram_used": + memUsedIndex = index + } + } + if gpuUtilIndex == -1 { + return nil, errors.New("GPU utilization column not found") + } + if memUsedIndex == -1 { + return nil, errors.New("used VRAM column not found") + } + + metrics := []schemas.GPUMetrics{} + for _, line := range lines[1:] { + fields := strings.Split(line, ",") + if len(fields) <= gpuUtilIndex || len(fields) <= memUsedIndex { + return nil, errors.New("too few columns in amd-smi output") + } + + gpuUtilRaw := strings.TrimSpace(fields[gpuUtilIndex]) + if strings.ToUpper(gpuUtilRaw) == "N/A" { + return nil, errors.New("GPU utilization is N/A") + } + gpuUtil, err := strconv.ParseUint(gpuUtilRaw, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to parse GPU utilization: %w", err) + } + + memUsedRaw := strings.TrimSpace(fields[memUsedIndex]) + if strings.ToUpper(memUsedRaw) == "N/A" { + return nil, errors.New("used VRAM is N/A") + } + memUsed, err := strconv.ParseUint(memUsedRaw, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to parse used VRAM: %w", err) + } + metrics = append(metrics, schemas.GPUMetrics{ + GPUMemoryUsage: memUsed * 1024 * 1024, + GPUUtil: gpuUtil, + }) + } + + return metrics, 
nil +} + +func (s *MetricsCollector) GetIntelAcceleratorMetrics(ctx context.Context) ([]schemas.GPUMetrics, error) { + cmd := exec.CommandContext(ctx, "hl-smi", "--query-aip=memory.used,utilization.aip", "--format=csv,noheader,nounits") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return []schemas.GPUMetrics{}, fmt.Errorf("failed to execute hl-smi: %w", err) + } + return parseNVIDIASMILikeMetrics(out.String()) +} + +func parseNVIDIASMILikeMetrics(output string) ([]schemas.GPUMetrics, error) { + metrics := []schemas.GPUMetrics{} + + lines := strings.Split(strings.TrimSpace(output), "\n") + for _, line := range lines { + parts := strings.Split(line, ", ") + if len(parts) != 2 { + continue + } + memUsed, err := strconv.ParseUint(strings.TrimSpace(parts[0]), 10, 64) + if err != nil { + return metrics, fmt.Errorf("failed to parse memory used: %w", err) + } + utilization, err := strconv.ParseUint(strings.TrimSpace(strings.TrimSuffix(parts[1], "%")), 10, 64) + if err != nil { + return metrics, fmt.Errorf("failed to parse accelerator utilization: %w", err) + } + metrics = append(metrics, schemas.GPUMetrics{ + GPUMemoryUsage: memUsed * 1024 * 1024, // Convert MiB to bytes + GPUUtil: utilization, + }) + } + + return metrics, nil +} diff --git a/runner/internal/runner/metrics/metrics_test.go b/runner/internal/runner/metrics/metrics_test.go new file mode 100644 index 0000000000..844b02bd30 --- /dev/null +++ b/runner/internal/runner/metrics/metrics_test.go @@ -0,0 +1,47 @@ +package metrics + +import ( + "testing" + + "github.com/dstackai/dstack/runner/internal/runner/schemas" + "github.com/stretchr/testify/assert" +) + +func TestGetAMDGPUMetrics_OK(t *testing.T) { + collector, err := NewMetricsCollector(t.Context()) + assert.NoError(t, err) + + cases := []struct { + csv string + expected []schemas.GPUMetrics + }{ + // AMDSMI Tool: 24.7.1+0012a68 | AMDSMI Library version: 24.7.1.0 | ROCm version: 6.3.1 + { + csv: 
"gpu,gfx,gfx_clock,vram_used,vram_total\n0,10,132,283,196300\n", + expected: []schemas.GPUMetrics{ + {GPUUtil: 10, GPUMemoryUsage: 296747008}, + }, + }, + // AMDSMI Tool: 25.3.0+ede62f2 | AMDSMI Library version: 25.3.0 | ROCm version: 6.4.0 + { + csv: "gpu,gfx_clk,gfx,vram_used,vram_free,vram_total,vram_percent\n0,132,10,283,196309,196592,0.0\n", + expected: []schemas.GPUMetrics{ + {GPUUtil: 10, GPUMemoryUsage: 296747008}, + }, + }, + } + + for _, tc := range cases { + metrics, err := collector.getAMDGPUMetrics(tc.csv) + assert.NoError(t, err) + assert.Equal(t, tc.expected, metrics) + } +} + +func TestGetAMDGPUMetrics_ErrorGPUUtilNA(t *testing.T) { + collector, err := NewMetricsCollector(t.Context()) + assert.NoError(t, err) + metrics, err := collector.getAMDGPUMetrics("gpu,gfx,gfx_clock,vram_used,vram_total\n0,N/A,N/A,283,196300\n") + assert.ErrorContains(t, err, "GPU utilization is N/A") + assert.Nil(t, metrics) +} diff --git a/runner/internal/repo/diff.go b/runner/internal/runner/repo/diff.go similarity index 76% rename from runner/internal/repo/diff.go rename to runner/internal/runner/repo/diff.go index f39d151a6c..a7f33cad6c 100644 --- a/runner/internal/repo/diff.go +++ b/runner/internal/runner/repo/diff.go @@ -11,7 +11,8 @@ import ( "strings" "github.com/bluekeyes/go-gitdiff/gitdiff" - "github.com/dstackai/dstack/runner/internal/log" + + "github.com/dstackai/dstack/runner/internal/common/log" ) func ApplyDiff(ctx context.Context, dir, patch string) error { @@ -19,11 +20,11 @@ func ApplyDiff(ctx context.Context, dir, patch string) error { log.Info(ctx, "apply diff start", "dir", dir) files, _, err := gitdiff.Parse(strings.NewReader(patch + "\n")) if err != nil { - return err + return fmt.Errorf("parse git diff: %w", err) } - var output = &bytes.Buffer{} - var empty = bytes.NewReader([]byte{}) + output := &bytes.Buffer{} + empty := bytes.NewReader([]byte{}) for _, fileInfo := range files { log.Trace(ctx, "apply diff file", "file", fileInfo.OldName, 
"text_fragments_cnt", len(fileInfo.TextFragments)) @@ -34,8 +35,7 @@ func ApplyDiff(ctx context.Context, dir, patch string) error { oldFile, err = os.Open(path.Join(dir, fileInfo.OldName)) input = oldFile if err != nil { - log.Error(ctx, "apply diff can not open file", "filename", fileInfo.OldName, "err", err) - return err + return fmt.Errorf("open file %s: %w", fileInfo.OldName, err) } } err = gitdiff.Apply(output, input, fileInfo) @@ -47,14 +47,13 @@ func ApplyDiff(ctx context.Context, dir, patch string) error { aes = fmt.Sprintf("ApplyError{Fragment: %d, FragmentLine: %d, Line: %d}", ae.Fragment, ae.FragmentLine, ae.Line) } - log.Error(ctx, "diff applier error", "filename", fileInfo.OldName, "err", err, "ae", aes) - return err + return fmt.Errorf("diff applier error for file %s (%s): %w", fileInfo.OldName, aes, ae) } if !fileInfo.IsDelete { if fileInfo.IsNew || fileInfo.IsRename { dd := path.Dir(path.Join(dir, fileInfo.NewName)) - err = os.MkdirAll(dd, 0755) + err = os.MkdirAll(dd, 0o755) if err != nil { log.Warning(ctx, "diff apply new file mkdir fail", "filename", fileInfo.NewName, @@ -64,20 +63,19 @@ func ApplyDiff(ctx context.Context, dir, patch string) error { mode := fileModeHeuristic(ctx, dir, fileInfo) err = os.WriteFile(path.Join(dir, fileInfo.NewName), output.Bytes(), mode) if err != nil { - log.Error(ctx, "diff apply write file", "filename", fileInfo.NewName, "err", err) - return err + return fmt.Errorf("write file %s: %w", fileInfo.NewName, err) } // WriteFile does not change perm for existing files err = os.Chmod(path.Join(dir, fileInfo.NewName), mode) if err != nil { - log.Warning(ctx, "diff apply can not chmod", "filename", fileInfo.NewName, "err", err) + log.Warning(ctx, "diff apply cannot chmod", "filename", fileInfo.NewName, "err", err) } } if fileInfo.IsDelete || fileInfo.IsRename { err = os.Remove(path.Join(dir, fileInfo.OldName)) if err != nil { - log.Warning(ctx, "diff apply can not delete", "filename", fileInfo.OldName, "err", err) + 
log.Warning(ctx, "diff apply cannot delete", "filename", fileInfo.OldName, "err", err) } } } @@ -102,7 +100,7 @@ func fileModeHeuristic(ctx context.Context, dir string, fileInfo *gitdiff.File) } } if mode == 0 { - mode = 0644 // fallback to git no-exec default + mode = 0o644 // fallback to git no-exec default } return mode } diff --git a/runner/internal/repo/diff_test.go b/runner/internal/runner/repo/diff_test.go similarity index 99% rename from runner/internal/repo/diff_test.go rename to runner/internal/runner/repo/diff_test.go index a9de2a3c7d..3976c1b957 100644 --- a/runner/internal/repo/diff_test.go +++ b/runner/internal/runner/repo/diff_test.go @@ -220,7 +220,7 @@ LcmZQzWMT#Y01f~L }, { name: "Executable perm", - expMode: 0100, + expMode: 0o100, diff: "diff --git a/original b/original\nold mode 100644\nnew mode 100755\n", }, { @@ -290,8 +290,7 @@ index 9ce1261..f9c7821 100644 }, } - content := - `First line. + content := `First line. Second line. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
@@ -320,7 +319,7 @@ Last line.` if cont == "" { cont = content + cc.contAdd } - err = os.WriteFile(path.Join(dir, "original"), []byte(cont), 0660) + err = os.WriteFile(path.Join(dir, "original"), []byte(cont), 0o660) assert.NoError(t, err, "write original file") ctx := context.Background() err = ApplyDiff(ctx, dir, cc.diff) diff --git a/runner/internal/runner/repo/manager.go b/runner/internal/runner/repo/manager.go new file mode 100644 index 0000000000..baeec40fad --- /dev/null +++ b/runner/internal/runner/repo/manager.go @@ -0,0 +1,141 @@ +package repo + +import ( + "context" + "fmt" + + "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5/plumbing" + "github.com/go-git/go-git/v5/plumbing/transport/http" + gitssh "github.com/go-git/go-git/v5/plumbing/transport/ssh" + "golang.org/x/crypto/ssh" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +type Manager struct { + ctx context.Context + localPath string + clo git.CloneOptions + branch string + hash string +} + +func NewManager(ctx context.Context, url, branch, hash string, singleBranch bool) *Manager { + ctx = log.AppendArgsCtx(ctx, "url", url, "branch", branch, "hash", hash) + m := &Manager{ + ctx: ctx, + branch: branch, + hash: hash, + clo: git.CloneOptions{ + URL: url, + RecurseSubmodules: git.DefaultSubmoduleRecursionDepth, + SingleBranch: singleBranch, + }, + } + // Only set ReferenceName if branch is non-empty + // If empty, it will default to HEAD in CloneOptions.Validate() + if branch != "" { + m.clo.ReferenceName = plumbing.NewBranchReferenceName(branch) + } + + return m +} + +func (m *Manager) WithLocalPath(path string) *Manager { + m.localPath = path + m.ctx = log.AppendArgsCtx(m.ctx, "path", path) + return m +} + +// TODO: works with Github, possibly not with others +func (m *Manager) WithTokenAuth(token string) *Manager { + auth := &http.BasicAuth{ + Username: "anything", + Password: token, + } + m.clo.Auth = auth + return m +} + +func (m *Manager) WithSSHAuth(pem, password 
string) *Manager { + keys, err := gitssh.NewPublicKeys("git", []byte(pem), password) + if err != nil { + log.Warning(m.ctx, "fail to parse SSH private key", "err", err) + } else { + keys.HostKeyCallback = ssh.InsecureIgnoreHostKey() + m.clo.Auth = keys + } + return m +} + +func (m *Manager) Checkout(ctx context.Context) error { + log.Info(m.ctx, "git checkout", "auth", fmt.Sprintf("%T", m.clo.Auth)) + ref, err := git.PlainCloneContext(ctx, m.localPath, false, &m.clo) + if err != nil { + return fmt.Errorf("clone repo: %w", err) + } + if ref != nil { + var cho git.CheckoutOptions + needCheckout := false + + if m.branch != "" { + branchRef, err := ref.Reference(m.clo.ReferenceName, true) + if err != nil { + return fmt.Errorf("get branch reference: %w", err) + } + if m.hash == "" || m.hash == branchRef.Hash().String() { + // Hash is empty or matches branch head: checkout branch + cho.Branch = m.clo.ReferenceName + needCheckout = true + } else { + // Hash is specified and different: checkout by hash + cho.Hash = plumbing.NewHash(m.hash) + needCheckout = true + } + } else { + // Branch is empty: checkout by hash if specified, otherwise HEAD is already checked out + if m.hash != "" { + cho.Hash = plumbing.NewHash(m.hash) + needCheckout = true + } + // If hash is also empty, HEAD is already checked out by clone, no need to checkout again + } + + if needCheckout { + workTree, err := ref.Worktree() + if err != nil { + return fmt.Errorf("get worktree: %w", err) + } + err = workTree.Checkout(&cho) + if err != nil { + return fmt.Errorf("checkout: %w", err) + } + } + } else { + log.Warning(m.ctx, "git clone ref==nil") + } + + return nil +} + +func (m *Manager) URL() string { + return m.clo.URL +} + +func (m *Manager) SetConfig(name, email string) error { + repo, err := git.PlainOpen(m.localPath) + if err != nil { + return fmt.Errorf("open repo: %w", err) + } + config, err := repo.Config() + if err != nil { + return fmt.Errorf("get repo config: %w", err) + } + config.User.Name = 
name + config.User.Email = email + if err := repo.SetConfig(config); err != nil { + return fmt.Errorf("set repo config: %w", err) + } + return nil +} diff --git a/runner/internal/runner/schemas/schemas.go b/runner/internal/runner/schemas/schemas.go new file mode 100644 index 0000000000..47706228cd --- /dev/null +++ b/runner/internal/runner/schemas/schemas.go @@ -0,0 +1,172 @@ +package schemas + +import ( + "strings" + + "github.com/dstackai/dstack/runner/internal/common/types" +) + +type JobState string + +const ( + JobStateDone JobState = "done" + JobStateFailed JobState = "failed" + JobStateRunning JobState = "running" + JobStateTerminated JobState = "terminated" + JobStateTerminating JobState = "terminating" +) + +type JobStateEvent struct { + State JobState `json:"state"` + Timestamp int64 `json:"timestamp"` + TerminationReason types.TerminationReason `json:"termination_reason"` + TerminationMessage string `json:"termination_message"` + ExitStatus *int `json:"exit_status"` +} + +type LogEvent struct { + Message []byte `json:"message"` + Timestamp int64 `json:"timestamp"` // milliseconds +} + +type SubmitBody struct { + Run Run `json:"run"` + JobSpec JobSpec `json:"job_spec"` + JobSubmission JobSubmission `json:"job_submission"` + ClusterInfo ClusterInfo `json:"cluster_info"` + Secrets map[string]string `json:"secrets"` + RepoCredentials *RepoCredentials `json:"repo_credentials"` + LogQuotaHour int `json:"log_quota_hour"` // bytes per hour, 0 = unlimited +} + +type PullResponse struct { + JobStates []JobStateEvent `json:"job_states"` + JobLogs []LogEvent `json:"job_logs"` + RunnerLogs []LogEvent `json:"runner_logs"` + LastUpdated int64 `json:"last_updated"` + NoConnectionsSecs int64 `json:"no_connections_secs"` + HasMore bool `json:"has_more"` +} + +type JobInfoResponse struct { + WorkingDir string `json:"working_dir"` + Username string `json:"username"` +} + +type Run struct { + Id string `json:"id"` + RunSpec RunSpec `json:"run_spec"` +} + +type RunSpec struct 
{ + RunName string `json:"run_name"` + RepoId string `json:"repo_id"` + RepoData RepoData `json:"repo_data"` + Configuration Configuration `json:"configuration"` + ConfigurationPath string `json:"configuration_path"` +} + +type JobSubmission struct { + Id string `json:"id"` +} + +type JobSpec struct { + ReplicaNum int `json:"replica_num"` + JobNum int `json:"job_num"` + JobsPerReplica int `json:"jobs_per_replica"` + User *User `json:"user"` + Commands []string `json:"commands"` + Entrypoint []string `json:"entrypoint"` + Env map[string]string `json:"env"` + SingleBranch bool `json:"single_branch"` + MaxDuration int `json:"max_duration"` + SSHKey *SSHKey `json:"ssh_key"` + WorkingDir *string `json:"working_dir"` + RepoDir *string `json:"repo_dir"` + // `RepoData` is optional for compatibility with jobs submitted before 0.19.17. + // Use `RunExecutor.getRepoData()` to get non-nil `RepoData`. + // TODO: make required when supporting jobs submitted before 0.19.17 is no longer relevant. + RepoData *RepoData `json:"repo_data"` + RepoExistsAction *RepoExistsAction `json:"repo_exists_action"` + FileArchives []FileArchive `json:"file_archives"` +} + +type ClusterInfo struct { + JobIPs []string `json:"job_ips"` + MasterJobIP string `json:"master_job_ip"` + GPUSPerJob int `json:"gpus_per_job"` +} + +type SSHKey struct { + Private string `json:"private"` + Public string `json:"public"` +} + +type RepoCredentials struct { + CloneURL string `json:"clone_url"` + PrivateKey *string `json:"private_key"` + OAuthToken *string `json:"oauth_token"` +} + +type RepoData struct { + RepoType string `json:"repo_type"` + + RepoBranch string `json:"repo_branch"` + RepoHash string `json:"repo_hash"` + + RepoConfigName string `json:"repo_config_name"` + RepoConfigEmail string `json:"repo_config_email"` +} + +type RepoExistsAction string + +const ( + RepoExistsActionError RepoExistsAction = "error" + RepoExistsActionSkip RepoExistsAction = "skip" +) + +type FileArchive struct { + Id string 
`json:"id"` + Path string `json:"path"` +} + +type Configuration struct { + Type string `json:"type"` +} + +type User struct { + Uid *uint32 `json:"uid"` + Username *string `json:"username"` + Gid *uint32 `json:"gid"` + Groupname *string `json:"groupname"` +} + +type HealthcheckResponse struct { + Service string `json:"service"` + Version string `json:"version"` +} + +type GPUMetrics struct { + GPUMemoryUsage uint64 `json:"gpu_memory_usage_bytes"` + GPUUtil uint64 `json:"gpu_util_percent"` +} + +type SystemMetrics struct { + Timestamp int64 `json:"timestamp_micro"` + CpuUsage uint64 `json:"cpu_usage_micro"` + MemoryUsage uint64 `json:"memory_usage_bytes"` + MemoryWorkingSet uint64 `json:"memory_working_set_bytes"` + GPUMetrics []GPUMetrics `json:"gpus"` +} + +func (c *RepoCredentials) GetProtocol() string { + return strings.SplitN(c.CloneURL, "://", 2)[0] +} + +func (e JobStateEvent) GetTimestamp() int64 { + return e.Timestamp +} + +func (e LogEvent) GetTimestamp() int64 { + return e.Timestamp +} diff --git a/runner/internal/runner/ssh/sshd.go b/runner/internal/runner/ssh/sshd.go new file mode 100644 index 0000000000..07aa1ec82d --- /dev/null +++ b/runner/internal/runner/ssh/sshd.go @@ -0,0 +1,290 @@ +package ssh + +import ( + "context" + "errors" + "fmt" + "os" + "os/exec" + "path" + "sync" + "syscall" + "time" + + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" +) + +type SshdManager interface { + Port() int + + Start(context.Context) error + Stop(context.Context) error + AddAuthorizedKeys(context.Context, ...string) error +} + +// Host keys generated and configured for sshd. RSA is intentionally omitted: +// ssh-keygen -A's RSA-3072 generation dominates runner startup (~1000ms vs +// <50ms for ECDSA + Ed25519 combined). Both ECDSA and Ed25519 are +// universally supported by modern SSH clients (OpenSSH >= 6.5). 
+var hostKeys = [...]struct { + name string + keyType string +}{ + {name: "ssh_host_ecdsa_key", keyType: "ecdsa"}, + {name: "ssh_host_ed25519_key", keyType: "ed25519"}, +} + +// Implements SshdManager +type Sshd struct { + binPath string + confPath string + logPath string + akPath string + port int + + akMu sync.Mutex + + cmd *exec.Cmd +} + +func NewSshd(binPath string) *Sshd { + return &Sshd{ + binPath: binPath, + } +} + +func (d *Sshd) Port() int { + return d.port +} + +func (d *Sshd) Prepare(ctx context.Context, baseDir string, port int, logLevel string) error { + confDir := path.Join(baseDir, "conf") + if err := os.MkdirAll(confDir, 0o755); err != nil { + return fmt.Errorf("create conf dir: %w", err) + } + + if err := generateHostKeys(ctx, confDir); err != nil { + return fmt.Errorf("generate host keys: %w", err) + } + + akPath, err := prepareAuthorizedKeysFile(confDir) + if err != nil { + return fmt.Errorf("prepare authorized_keys: %w", err) + } + d.akPath = akPath + + confPath, err := createSshdConfig(ctx, confDir, port, logLevel, akPath) + if err != nil { + return fmt.Errorf("create sshd config: %w", err) + } + d.confPath = confPath + d.port = port + + logDir := path.Join(baseDir, "log") + logPath, err := prepareLogPath(logDir) + if err != nil { + return fmt.Errorf("prepare log path: %w", err) + } + d.logPath = logPath + + // /var/empty is the default path if not configured via ./configure --with-privsep-path=... 
+ // /run/sshd is used in Debian-based distros, including Ubuntu: + // https://salsa.debian.org/ssh-team/openssh/-/blob/debian/1%259.7p1-7/debian/rules#L60 + // TODO: change to a custom path if a custom OpenSSH build with overridden PRIVSEP_PATH is used + if err := preparePrivsepPath("/var/empty"); err != nil { + return fmt.Errorf("prepare PRIVSEP_PATH: %w", err) + } + if err := preparePrivsepPath("/run/sshd"); err != nil { + return fmt.Errorf("prepare PRIVSEP_PATH: %w", err) + } + + return nil +} + +func (d *Sshd) AddAuthorizedKeys(ctx context.Context, authorizedKeys ...string) error { + d.akMu.Lock() + defer d.akMu.Unlock() + + file, err := os.OpenFile(d.akPath, os.O_WRONLY|os.O_APPEND, 0o700) + if err != nil { + return fmt.Errorf("open authorized_keys: %w", err) + } + defer func() { + if err := file.Close(); err != nil { + log.Error(ctx, "Close authorized_keys", "err", err) + } + }() + + for _, key := range authorizedKeys { + if _, err := fmt.Fprintln(file, key); err != nil { + return fmt.Errorf("write authorized_keys: %w", err) + } + } + + return nil +} + +func (d *Sshd) Start(ctx context.Context) error { + if d.confPath == "" { + return errors.New("not configured") + } + cmd := exec.CommandContext(ctx, d.binPath, "-D", "-f", d.confPath, "-E", d.logPath) + cmd.Cancel = func() error { + return d.sendSigterm() + } + cmd.WaitDelay = time.Second * 10 + d.cmd = cmd + return cmd.Start() +} + +func (d *Sshd) Stop(ctx context.Context) error { + if d.cmd == nil { + return errors.New("not started") + } + if err := d.sendSigterm(); err != nil { + return err + } + return d.cmd.Wait() +} + +func (d *Sshd) sendSigterm() error { + return d.cmd.Process.Signal(syscall.SIGTERM) +} + +func generateHostKeys(ctx context.Context, confDir string) error { + tmpDir, err := os.MkdirTemp("", "dstack-sshd-*") + if err != nil { + return err + } + defer func() { + if err := os.RemoveAll(tmpDir); err != nil { + log.Error(ctx, "Remove host keys temp dir", "err", 
err) + } + }() + + // TODO: change if a custom OpenSSH build with overridden SSHDIR is used + keyDir := path.Join(tmpDir, "etc/ssh") + if err := os.MkdirAll(keyDir, 0o700); err != nil { + return err + } + + for _, k := range hostKeys { + // TODO: specify the full path if a custom OpenSSH build is used + cmd := exec.CommandContext(ctx, "ssh-keygen", + "-t", k.keyType, "-q", "-N", "", "-f", path.Join(keyDir, k.name)) + if err := cmd.Run(); err != nil { + return fmt.Errorf("generate %s host key: %w", k.keyType, err) + } + if err := copyHostKey(keyDir, confDir, k.name); err != nil { + return err + } + } + + return nil +} + +func copyHostKey(srcDir string, destDir string, key string) error { + srcPath := path.Join(srcDir, key) + destPath := path.Join(destDir, key) + privKey, err := os.ReadFile(srcPath) + if err != nil { + return err + } + if err := os.WriteFile(destPath, privKey, 0o600); err != nil { + return err + } + + pubKey, err := os.ReadFile(srcPath + ".pub") + if err != nil { + return err + } + if err := os.WriteFile(destPath+".pub", pubKey, 0o644); err != nil { + return err + } + + return nil +} + +func prepareAuthorizedKeysFile(confDir string) (string, error) { + // Ensures that the file exists, has correct ownership and permissions, and is empty + akPath := path.Join(confDir, "authorized_keys") + if _, err := utils.RemoveIfExists(akPath); err != nil { + return "", err + } + file, err := os.OpenFile(akPath, os.O_CREATE|os.O_EXCL|os.O_RDONLY, 0o644) + if err != nil { + return "", err + } + if err := file.Close(); err != nil { + return "", err + } + return akPath, nil +} + +func createSshdConfig(ctx context.Context, confDir string, port int, logLevel string, akPath string) (string, error) { + confPath := path.Join(confDir, "sshd_config") + file, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644) + if err != nil { + return "", err + } + defer func() { + if err := file.Close(); err != nil { + log.Error(ctx, "Close sshd config", "err", err) + } 
+ }() + + lines := []string{ + fmt.Sprintf("LogLevel %s", logLevel), + fmt.Sprintf("Port %d", port), + "PidFile none", + "Subsystem sftp internal-sftp", + "PasswordAuthentication no", + "KbdInteractiveAuthentication no", + // The default is `no`, but in this case sshd does not allow the user without password to log in, + // as useradd creates a locked user (with a `!` in the second field of /etc/shadow entry) if no password provided, + // that is, you cannot log in as `ubuntu` in Ubuntu images or `dstack` in dstack images. + // See: https://github.com/openssh/openssh-portable/blob/d01efaa1c9ed84fd9011201dbc3c7cb0a82bcee3/auth.c#L108, + // See: https://github.com/openssh/openssh-portable/blob/master/platform.c#L192-L199 + // See: https://github.com/openssh/openssh-portable/blob/d01efaa1c9ed84fd9011201dbc3c7cb0a82bcee3/configure.ac#L949 + // See: shadow(5) + // See: useradd(8) + // TODO: Change to `no` if a custom OpenSSH build without LOCKED_PASSWD_PREFIX is used + "UsePAM yes", + // Keep ~/.ssh/authorized_keys as a fallback in case our sshd server is also used by the user for their purposes + fmt.Sprintf("AuthorizedKeysFile %s .ssh/authorized_keys", akPath), + "AcceptEnv LANG LC_* COLORTERM NO_COLOR", + "ClientAliveInterval 30", + "ClientAliveCountMax 4", + } + for _, k := range hostKeys { + lines = append(lines, fmt.Sprintf("HostKey %s/%s", confDir, k.name)) + } + for _, line := range lines { + if _, err := fmt.Fprintln(file, line); err != nil { + return "", err + } + } + + return confPath, nil +} + +func prepareLogPath(logDir string) (string, error) { + if err := os.MkdirAll(logDir, 0o755); err != nil { + return "", err + } + logPath := path.Join(logDir, "sshd.log") + if _, err := utils.RemoveIfExists(logPath); err != nil { + return "", err + } + return logPath, nil +} + +func preparePrivsepPath(privsepPath string) error { + // Ensure that PRIVSEP_PATH 1) exists 2) empty 3) owned by 
root, + // see https://github.com/dstackai/dstack/issues/1999 + if err := os.RemoveAll(privsepPath); err != nil { + return err + } + return os.MkdirAll(privsepPath, 0o755) +} diff --git a/runner/internal/schemas/schemas.go b/runner/internal/schemas/schemas.go deleted file mode 100644 index 9399ff14e3..0000000000 --- a/runner/internal/schemas/schemas.go +++ /dev/null @@ -1,110 +0,0 @@ -package schemas - -import "fmt" - -type JobStateEvent struct { - State string `json:"state"` - Timestamp int64 `json:"timestamp"` -} - -type LogEvent struct { - Message []byte `json:"message"` - Timestamp int64 `json:"timestamp"` -} - -type SubmitBody struct { - RunSpec RunSpec `json:"run_spec"` - JobSpec JobSpec `json:"job_spec"` - ClusterInfo ClusterInfo `json:"cluster_info"` - Secrets map[string]string `json:"secrets"` - RepoCredentials *RepoCredentials `json:"repo_credentials"` -} - -type PullResponse struct { - JobStates []JobStateEvent `json:"job_states"` - JobLogs []LogEvent `json:"job_logs"` - RunnerLogs []LogEvent `json:"runner_logs"` - LastUpdated int64 `json:"last_updated"` - HasMore bool `json:"has_more"` - // todo Result -} - -type RunSpec struct { - RunName string `json:"run_name"` - RepoId string `json:"repo_id"` - RepoData RepoData `json:"repo_data"` - Configuration Configuration `json:"configuration"` - ConfigurationPath string `json:"configuration_path"` -} - -type JobSpec struct { - ReplicaNum int `json:"replica_num"` - JobNum int `json:"job_num"` - JobsPerReplica int `json:"jobs_per_replica"` - Commands []string `json:"commands"` - Entrypoint []string `json:"entrypoint"` - Env map[string]string `json:"env"` - Gateway *Gateway `json:"gateway"` - MaxDuration int `json:"max_duration"` - WorkingDir *string `json:"working_dir"` -} - -type ClusterInfo struct { - MasterJobIP string `json:"master_job_ip"` - GPUSPerJob int `json:"gpus_per_job"` -} - -type RepoCredentials struct { - Protocol string `json:"protocol"` - PrivateKey *string 
`json:"private_key"` - OAuthToken *string `json:"oauth_token"` -} - -type RepoData struct { - RepoType string `json:"repo_type"` - RepoHostName string `json:"repo_host_name"` - RepoPort int `json:"repo_port"` - RepoUserName string `json:"repo_user_name"` - RepoName string `json:"repo_name"` - - RepoBranch string `json:"repo_branch"` - RepoHash string `json:"repo_hash"` - - RepoConfigName string `json:"repo_config_name"` - RepoConfigEmail string `json:"repo_config_email"` -} - -type Configuration struct { - Type string `json:"type"` -} - -type Gateway struct { - GatewayName string `json:"gateway_name"` - ServicePort int `json:"service_port"` - SSHKey string `json:"ssh_key"` - SockPath string `json:"sock_path"` - Hostname string `json:"hostname"` - PublicPort int `json:"public_port"` - Secure bool `json:"secure"` -} - -type HealthcheckResponse struct { - Service string `json:"service"` - Version string `json:"version"` -} - -func (d *RepoData) FormatURL(format string) string { - host := d.RepoHostName - if d.RepoPort != 0 { - host = fmt.Sprintf("%s:%d", d.RepoHostName, d.RepoPort) - } - return fmt.Sprintf(format, host, d.RepoUserName, d.RepoName) -} - -func (e JobStateEvent) GetTimestamp() int64 { - return e.Timestamp -} - -func (e LogEvent) GetTimestamp() int64 { - return e.Timestamp -} diff --git a/runner/internal/shim/api/api_test.go b/runner/internal/shim/api/api_test.go new file mode 100644 index 0000000000..b6879187af --- /dev/null +++ b/runner/internal/shim/api/api_test.go @@ -0,0 +1,53 @@ +package api + +import ( + "context" + "sync" + + "github.com/dstackai/dstack/runner/internal/shim" +) + +type DummyRunner struct { + tasks map[string]bool + mu sync.Mutex +} + +func (ds *DummyRunner) Submit(ctx context.Context, cfg shim.TaskConfig) error { + ds.mu.Lock() + defer ds.mu.Unlock() + if _, ok := ds.tasks[cfg.ID]; ok { + return shim.ErrRequest + } + ds.tasks[cfg.ID] = true + return nil +} + +func (ds *DummyRunner) Run(context.Context, string) error { + return nil 
+} + +func (ds *DummyRunner) Terminate(context.Context, string, uint, string, string) error { + return nil +} + +func (ds *DummyRunner) Remove(context.Context, string) error { + return nil +} + +func (ds *DummyRunner) TaskList() []*shim.TaskListItem { + return []*shim.TaskListItem{} +} + +func (ds *DummyRunner) TaskInfo(taskID string) shim.TaskInfo { + return shim.TaskInfo{} +} + +func (ds *DummyRunner) Resources(context.Context) shim.Resources { + return shim.Resources{} +} + +func NewDummyRunner() *DummyRunner { + return &DummyRunner{ + tasks: map[string]bool{}, + } +} diff --git a/runner/internal/shim/api/handlers.go b/runner/internal/shim/api/handlers.go new file mode 100644 index 0000000000..b3382d0f26 --- /dev/null +++ b/runner/internal/shim/api/handlers.go @@ -0,0 +1,223 @@ +package api + +import ( + "context" + "errors" + "net/http" + + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/shim" + "github.com/dstackai/dstack/runner/internal/shim/components" + "github.com/dstackai/dstack/runner/internal/shim/dcgm" +) + +func (s *ShimServer) HealthcheckHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + return &HealthcheckResponse{ + Service: "dstack-shim", + Version: s.version, + }, nil +} + +func (s *ShimServer) ShutdownHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + var req ShutdownRequest + if err := api.DecodeJSONBody(w, r, &req, true); err != nil { + return nil, err + } + + go func() { + if err := s.Shutdown(s.ctx, req.Force); err != nil { + log.Error(s.ctx, "Shutdown", "err", err) + } + }() + + return nil, nil +} + +func (s *ShimServer) InstanceHealthHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + ctx := r.Context() + response := InstanceHealthResponse{} + if s.dcgmWrapper != nil { + if dcgmHealth, err := s.dcgmWrapper.GetHealth(); err != 
nil { + log.Error(ctx, "failed to get health from DCGM", "err", err) + } else { + response.DCGM = &dcgmHealth + } + } + + return &response, nil +} + +func (s *ShimServer) TaskListHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + tasks := s.runner.TaskList() + return &TaskListResponse{tasks}, nil +} + +func (s *ShimServer) TaskInfoHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + taskInfo := s.runner.TaskInfo(r.PathValue("id")) + if taskInfo.ID == "" { + return nil, &api.Error{Status: http.StatusNotFound} + } + return TaskInfoResponse(taskInfo), nil +} + +// TaskSubmitHandler submits AND runs a task +func (s *ShimServer) TaskSubmitHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + var req TaskSubmitRequest + if err := api.DecodeJSONBody(w, r, &req, true); err != nil { + return nil, err + } + if req.ID == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty id"} + } + if req.Name == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty name"} + } + if req.ImageName == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty image_name"} + } + if req.ContainerUser == "" { + req.ContainerUser = "root" + } + if req.NetworkMode == "" { + req.NetworkMode = shim.NetworkModeHost + } + ctx := r.Context() + taskConfig := shim.TaskConfig(req) + if err := s.runner.Submit(ctx, taskConfig); err != nil { + if errors.Is(err, shim.ErrRequest) { + log.Info(ctx, "already submitted", "task", taskConfig.ID, "err", err) + return nil, &api.Error{Status: http.StatusConflict, Err: err} + } + log.Error(ctx, "conflict", "task", taskConfig.ID, "err", err) + return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} + } + log.Info(ctx, "submitted", "task", taskConfig.ID) + + ctx = log.WithLogger(context.Background(), log.GetLogger(ctx)) + go func() { + if err := s.runner.Run(ctx, taskConfig.ID); err != nil { + log.Error(ctx, "failed to run", "task", 
taskConfig.ID, "err", err) + } + }() + + return s.runner.TaskInfo(taskConfig.ID), nil +} + +func (s *ShimServer) TaskTerminateHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + ctx := r.Context() + taskID := r.PathValue("id") + var req TaskTerminateRequest + if err := api.DecodeJSONBody(w, r, &req, true); err != nil { + return nil, err + } + if err := s.runner.Terminate(ctx, taskID, req.Timeout, req.TerminationReason, req.TerminationMessage); err != nil { + if errors.Is(err, shim.ErrNotFound) { + log.Info(ctx, "not found", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusNotFound, Err: err} + } + if errors.Is(err, shim.ErrRequest) { + log.Info(ctx, "conflict", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusConflict, Err: err} + } + log.Error(ctx, "failed to terminate", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} + } + log.Info(ctx, "terminated", "task", taskID) + + taskInfo := s.runner.TaskInfo(taskID) + if taskInfo.ID == "" { + return nil, &api.Error{Status: http.StatusNotFound} + } + return TaskInfoResponse(taskInfo), nil +} + +func (s *ShimServer) TaskRemoveHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + ctx := r.Context() + taskID := r.PathValue("id") + if err := s.runner.Remove(ctx, taskID); err != nil { + if errors.Is(err, shim.ErrNotFound) { + log.Info(ctx, "not found", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusNotFound, Err: err} + } + if errors.Is(err, shim.ErrRequest) { + log.Info(ctx, "not terminated", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusConflict, Err: err} + } + log.Error(ctx, "failed to remove", "task", taskID, "err", err) + return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} + } + log.Info(ctx, "removed", "task", taskID) + return nil, nil +} + +func (s *ShimServer) TaskMetricsHandler(w http.ResponseWriter, r 
*http.Request) { + if s.dcgmExporter == nil { + http.Error(w, "DCGM Exporter is not available", http.StatusNotFound) + return + } + taskInfo := s.runner.TaskInfo(r.PathValue("id")) + if taskInfo.ID == "" { + http.Error(w, "Task not found", http.StatusNotFound) + return + } + expfmtBody, err := s.dcgmExporter.Fetch(r.Context()) + if err != nil { + http.Error(w, err.Error(), http.StatusBadGateway) + return + } + response := dcgm.FilterMetrics(expfmtBody, taskInfo.GpuIDs) + _, _ = w.Write(response) +} + +func (s *ShimServer) ComponentListHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + response := &ComponentListResponse{ + Components: []components.ComponentInfo{ + s.runnerManager.GetInfo(r.Context()), + s.shimManager.GetInfo(r.Context()), + }, + } + return response, nil +} + +func (s *ShimServer) ComponentInstallHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + var req ComponentInstallRequest + if err := api.DecodeJSONBody(w, r, &req, true); err != nil { + return nil, err + } + + if req.Name == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty name"} + } + + var componentManager components.ComponentManager + switch components.ComponentName(req.Name) { + case components.ComponentNameRunner: + componentManager = s.runnerManager + case components.ComponentNameShim: + componentManager = s.shimManager + default: + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "unknown component"} + } + + if req.URL == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty url"} + } + + // There is still a small chance of time-of-check race condition, but we ignore it. 
+ componentInfo := componentManager.GetInfo(r.Context()) + if componentInfo.Status == components.ComponentStatusInstalling { + return nil, &api.Error{Status: http.StatusConflict, Msg: "already installing"} + } + + s.bgJobsGroup.Go(func() { + if err := componentManager.Install(s.bgJobsCtx, req.URL, true); err != nil { + log.Error(s.bgJobsCtx, "component background install", "name", componentInfo.Name, "err", err) + } + }) + + return nil, nil +} diff --git a/runner/internal/shim/api/handlers_test.go b/runner/internal/shim/api/handlers_test.go new file mode 100644 index 0000000000..bb19ebbf1b --- /dev/null +++ b/runner/internal/shim/api/handlers_test.go @@ -0,0 +1,55 @@ +package api + +import ( + "context" + "net/http/httptest" + "strings" + "testing" + + commonapi "github.com/dstackai/dstack/runner/internal/common/api" +) + +func TestHealthcheck(t *testing.T) { + request := httptest.NewRequest("GET", "/api/healthcheck", nil) + responseRecorder := httptest.NewRecorder() + + server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil, nil) + + f := commonapi.JSONResponseHandler(server.HealthcheckHandler) + f(responseRecorder, request) + + if responseRecorder.Code != 200 { + t.Errorf("Want status '%d', got '%d'", 200, responseRecorder.Code) + } + + expected := "{\"service\":\"dstack-shim\",\"version\":\"0.0.1.dev2\"}" + + if strings.TrimSpace(responseRecorder.Body.String()) != expected { + t.Errorf("Want '%s', got '%s'", expected, responseRecorder.Body.String()) + } +} + +func TestTaskSubmit(t *testing.T) { + server := NewShimServer(context.Background(), ":12340", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil, nil) + requestBody := `{ + "id": "dummy-id", + "name": "dummy-name", + "image_name": "ubuntu" + }` + + request := httptest.NewRequest("POST", "/api/tasks", strings.NewReader(requestBody)) + responseRecorder := httptest.NewRecorder() + firstSubmitPost := commonapi.JSONResponseHandler(server.TaskSubmitHandler) + 
firstSubmitPost(responseRecorder, request) + if responseRecorder.Code != 200 { + t.Errorf("Want status '%d', got '%d'", 200, responseRecorder.Code) + } + + request = httptest.NewRequest("POST", "/api/tasks", strings.NewReader(requestBody)) + responseRecorder = httptest.NewRecorder() + secondSubmitPost := commonapi.JSONResponseHandler(server.TaskSubmitHandler) + secondSubmitPost(responseRecorder, request) + if responseRecorder.Code != 409 { + t.Errorf("Want status '%d', got '%d'", 409, responseRecorder.Code) + } +} diff --git a/runner/internal/shim/api/http.go b/runner/internal/shim/api/http.go deleted file mode 100644 index c0fbce5fd2..0000000000 --- a/runner/internal/shim/api/http.go +++ /dev/null @@ -1,89 +0,0 @@ -package api - -import ( - "context" - "fmt" - "log" - "net/http" - - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/shim" -) - -func (s *ShimServer) HealthcheckGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - return &HealthcheckResponse{ - Service: "dstack-shim", - Version: s.version, - }, nil -} - -func (s *ShimServer) SubmitPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - s.mu.RLock() - defer s.mu.RUnlock() - state, _, _, _ := s.runner.GetState() - if state != shim.Pending { - return nil, &api.Error{Status: http.StatusConflict} - } - - var body TaskConfigBody - if err := api.DecodeJSONBody(w, r, &body, true); err != nil { - log.Println("Failed to decode submit body", "err", err) - return nil, err - } - - go func(taskConfig shim.TaskConfig) { - err := s.runner.Run(context.Background(), taskConfig) - if err != nil { - fmt.Printf("failed Run %v\n", err) - } - }(body) - - return nil, nil -} - -func (s *ShimServer) PullGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - state, containerStatus, executorError, jobResult := s.runner.GetState() - - return 
&PullResponse{ - State: string(state), - ExecutorError: executorError, - ContainerName: containerStatus.ContainerName, - Status: containerStatus.Status, - Running: containerStatus.Running, - OOMKilled: containerStatus.OOMKilled, - Dead: containerStatus.Dead, - ExitCode: containerStatus.ExitCode, - Error: containerStatus.Error, - Result: jobResult, - }, nil -} - -func (s *ShimServer) StopPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - state, _, _, _ := s.runner.GetState() - if state == shim.Pending { - return &StopResponse{ - State: string(state), - }, nil - } - - var body StopBody - if err := api.DecodeJSONBody(w, r, &body, true); err != nil { - log.Println("Failed to decode submit stop body", "err", err) - return nil, err - } - - s.runner.Stop(body.Force) - - return &StopResponse{ - State: string(state), - }, nil -} diff --git a/runner/internal/shim/api/schemas.go b/runner/internal/shim/api/schemas.go index e3f0343f7f..0e96028a5b 100644 --- a/runner/internal/shim/api/schemas.go +++ b/runner/internal/shim/api/schemas.go @@ -1,32 +1,56 @@ package api -import "github.com/dstackai/dstack/runner/internal/shim" +import ( + "github.com/dstackai/dstack/runner/internal/shim" + "github.com/dstackai/dstack/runner/internal/shim/components" + "github.com/dstackai/dstack/runner/internal/shim/dcgm" +) -type TaskConfigBody = shim.TaskConfig +type HealthcheckResponse struct { + Service string `json:"service"` + Version string `json:"version"` +} -type StopBody struct { +type ShutdownRequest struct { Force bool `json:"force"` } -type HealthcheckResponse struct { - Service string `json:"service"` - Version string `json:"version"` +type InstanceHealthResponse struct { + DCGM *dcgm.Health `json:"dcgm"` } -type PullResponse struct { - State string `json:"state"` - ExecutorError string `json:"executor_error"` - ContainerName string `json:"container_name"` - Status string `json:"status"` - Running bool `json:"running"` 
// TaskInfoResponse is the JSON representation of a single task's state.
// NOTE(review): presumably the body returned for GET /api/tasks/{id}
// (TaskInfoHandler) — the handler is not visible here, confirm.
type TaskInfoResponse struct {
	ID                 string             `json:"id"`
	Status             shim.TaskStatus    `json:"status"`
	TerminationReason  string             `json:"termination_reason"`
	TerminationMessage string             `json:"termination_message"`
	Ports              []shim.PortMapping `json:"ports"`

	// Pointer without omitempty: serialized as JSON null when unset.
	ImagePullProgress *shim.ImagePullProgress `json:"image_pull_progress"`

	// The following fields are for debugging only, server doesn't need them
	ContainerName string `json:"container_name"`
	ContainerID   string `json:"container_id"`
	// Note the wire name is "gpus_ids" (not "gpu_ids").
	GpuIDs []string `json:"gpus_ids"`
}
// ShimServer bundles the shim HTTP API server with the task runner, the
// optional DCGM integrations, the component managers and the bookkeeping
// needed for shutdown.
type ShimServer struct {
	// httpServer is the underlying HTTP server, configured in NewShimServer.
	httpServer *http.Server
	// mu guards inShutdown and inForceShutdown (see Shutdown).
	mu sync.RWMutex
	// ctx is the context passed to NewShimServer; it is also used as the
	// base context of every incoming request.
	ctx context.Context
	// inShutdown is set once any shutdown has been requested.
	inShutdown bool
	// inForceShutdown is set once a forced shutdown has been requested.
	inForceShutdown bool

	// bgJobsCtx is derived from ctx and cancelled by Shutdown via bgJobsCancel.
	bgJobsCtx    context.Context
	bgJobsCancel context.CancelFunc
	// bgJobsGroup tracks background jobs; a graceful Shutdown waits for it.
	bgJobsGroup *sync.WaitGroup

	runner TaskRunner

	dcgmExporter *dcgm.DCGMExporter
	dcgmWrapper  dcgm.DCGMWrapperInterface // interface with nil value normalized to plain nil

	// Managers for the dstack-runner and dstack-shim components.
	runnerManager components.ComponentManager
	shimManager   components.ComponentManager

	// version is the shim version string passed to NewShimServer.
	version string
}
mux.HandleFunc("/api/submit", api.JSONResponseHandler("POST", s.SubmitPostHandler)) - mux.HandleFunc("/api/healthcheck", api.JSONResponseHandler("GET", s.HealthcheckGetHandler)) - mux.HandleFunc("/api/pull", api.JSONResponseHandler("GET", s.PullGetHandler)) - mux.HandleFunc("/api/stop", api.JSONResponseHandler("POST", s.StopPostHandler)) + + // The healthcheck endpoint should stay backward compatible, as it is used for negotiation + r.AddHandler("GET", "/api/healthcheck", s.HealthcheckHandler) + r.AddHandler("POST", "/api/shutdown", s.ShutdownHandler) + r.AddHandler("GET", "/api/instance/health", s.InstanceHealthHandler) + r.AddHandler("GET", "/api/components", s.ComponentListHandler) + r.AddHandler("POST", "/api/components/install", s.ComponentInstallHandler) + r.AddHandler("GET", "/api/tasks", s.TaskListHandler) + r.AddHandler("GET", "/api/tasks/{id}", s.TaskInfoHandler) + r.AddHandler("POST", "/api/tasks", s.TaskSubmitHandler) + r.AddHandler("POST", "/api/tasks/{id}/terminate", s.TaskTerminateHandler) + r.AddHandler("POST", "/api/tasks/{id}/remove", s.TaskRemoveHandler) + r.HandleFunc("GET /metrics/tasks/{id}", s.TaskMetricsHandler) + return s } + +func (s *ShimServer) Serve() error { + if err := s.httpServer.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + return err + } + return nil +} + +func (s *ShimServer) Shutdown(ctx context.Context, force bool) error { + s.mu.Lock() + + if s.inForceShutdown || s.inShutdown && !force { + log.Info(ctx, "Already shutting down, ignoring request") + s.mu.Unlock() + return nil + } + + s.inShutdown = true + if force { + s.inForceShutdown = true + } + s.mu.Unlock() + + log.Info(ctx, "Shutting down", "force", force) + s.bgJobsCancel() + if force { + return s.httpServer.Close() + } + err := s.httpServer.Shutdown(ctx) + s.bgJobsGroup.Wait() + return err +} diff --git a/runner/internal/shim/authorized_keys.go b/runner/internal/shim/authorized_keys.go index 35d5b9d57e..491172ce85 100644 --- 
a/runner/internal/shim/authorized_keys.go +++ b/runner/internal/shim/authorized_keys.go @@ -9,14 +9,13 @@ import ( "path/filepath" "slices" - "github.com/ztrue/tracerr" "golang.org/x/crypto/ssh" ) func PublicKeyFingerprint(key string) (string, error) { pk, _, _, _, err := ssh.ParseAuthorizedKey([]byte(key)) if err != nil { - return "", tracerr.Wrap(err) + return "", fmt.Errorf("parse authorized key: %w", err) } keyFingerprint := ssh.FingerprintSHA256(pk) return keyFingerprint, nil @@ -54,8 +53,8 @@ func AppendPublicKeys(fileKeys []string, keysToAppend []string) []string { } type AuthorizedKeys struct { - user string - lookup func(username string) (*user.User, error) + user string + lookup func(username string) (*user.User, error) } func (ak AuthorizedKeys) AppendPublicKeys(publicKeys []string) error { @@ -74,7 +73,7 @@ func (ak AuthorizedKeys) read(r io.Reader) ([]string, error) { lines = append(lines, text) } if err := scanner.Err(); err != nil { - return []string{}, tracerr.Wrap(err) + return []string{}, fmt.Errorf("scan authorized keys: %w", err) } return lines, nil } @@ -84,7 +83,7 @@ func (ak AuthorizedKeys) write(w io.Writer, lines []string) error { for _, line := range lines { _, err := fmt.Fprintln(wr, line) if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("write line: %w", err) } } return wr.Flush() @@ -109,40 +108,40 @@ func (ak AuthorizedKeys) GetAuthorizedKeysPath() (string, error) { func (ak AuthorizedKeys) transformAuthorizedKeys(transform func([]string, []string) []string, publicKeys []string) error { authorizedKeysPath, err := ak.GetAuthorizedKeysPath() if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("get authorized keys path: %w", err) } info, err := os.Stat(authorizedKeysPath) if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("stat authorized keys: %w", err) } fileMode := info.Mode().Perm() authorizedKeysFile, err := os.OpenFile(authorizedKeysPath, os.O_RDWR, fileMode) if err != nil { - return 
tracerr.Wrap(err) + return fmt.Errorf("open authorized keys: %w", err) } defer authorizedKeysFile.Close() lines, err := ak.read(authorizedKeysFile) if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("read authorized keys: %w", err) } // write backup authorizedKeysPath, err = ak.GetAuthorizedKeysPath() if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("get authorized keys path: %w", err) } authorizedKeysPathBackup := authorizedKeysPath + ".bak" authorizedKeysBackup, err := os.OpenFile(authorizedKeysPathBackup, os.O_RDWR|os.O_CREATE|os.O_TRUNC, fileMode) if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("open authorized keys backup: %w", err) } defer authorizedKeysBackup.Close() if err := ak.write(authorizedKeysBackup, lines); err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("write authorized keys backup: %w", err) } // transform lines @@ -150,13 +149,13 @@ func (ak AuthorizedKeys) transformAuthorizedKeys(transform func([]string, []stri // write authorized_keys if err := authorizedKeysFile.Truncate(0); err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("truncate authorized keys: %w", err) } if _, err := authorizedKeysFile.Seek(0, 0); err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("seek authorized keys: %w", err) } if err := ak.write(authorizedKeysFile, newLines); err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("write authorized keys: %w", err) } return nil diff --git a/runner/internal/shim/backends/aws.go b/runner/internal/shim/backends/aws.go new file mode 100644 index 0000000000..1fe7fb890a --- /dev/null +++ b/runner/internal/shim/backends/aws.go @@ -0,0 +1,83 @@ +package backends + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "strings" +) + +type AWSBackend struct{} + +func NewAWSBackend() *AWSBackend { + return &AWSBackend{} +} + +// GetRealDeviceName returns the device name for the given EBS volume ID and virtual deviceName. 
+// If the volume has no partitions, returns the volume device. +// If the volume has partitions, return the first partition device. +// The device name on the instance can be different from the device name specified in block-device mapping: +// * Nitro-based instances: /dev/sda => nvme0n1. lsblk returns volume-id in SERIAL. +// * Xen-based Ubuntu instances: /dev/sda => /dev/xvda. +// * Red Hat and CentOS: may increment trailing letters in some versions – not supported. +// * Other legacy systems: /dev/sda => /dev/sda. +// More: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html +func (e *AWSBackend) GetRealDeviceName(volumeID, deviceName string) (string, error) { + // Run the lsblk command to get block device information + // On AWS, SERIAL contains volume id. + cmd := exec.CommandContext(context.TODO(), "lsblk", "-o", "NAME,SERIAL") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("failed to list block devices: %w", err) + } + + baseDevice := "" + + // Parse the output to find the device that matches the volume ID + lines := strings.Split(out.String(), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) == 2 && strings.HasPrefix(fields[1], "vol") { + serial := strings.TrimPrefix(fields[1], "vol") + if "vol-"+serial == volumeID { + baseDevice = "/dev/" + fields[0] + } + } + } + + // If no match is found, fall back to mapping AWS device name + if baseDevice == "" && deviceName != "" { + // Try mapping deviceName to possible OS device names + mappedDevices := []string{ + deviceName, + strings.Replace(deviceName, "/dev/sd", "/dev/xvd", 1), // sdX => xvdX + } + for _, dev := range mappedDevices { + if _, err := os.Stat(dev); err == nil { + baseDevice = dev + break + } + } + } + + if baseDevice == "" { + return "", fmt.Errorf("volume %s not found among block devices", volumeID) + } + + // Run lsblk again to check for partitions on the 
base device + cmd = exec.CommandContext(context.TODO(), "lsblk", "-ln", "-o", "NAME", baseDevice) + out.Reset() + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("failed to list partitions for device %s: %w", baseDevice, err) + } + partitions := strings.Split(strings.TrimSpace(out.String()), "\n") + if len(partitions) > 1 { + return "/dev/" + partitions[1], nil + } + + return baseDevice, nil +} diff --git a/runner/internal/shim/backends/backends.go b/runner/internal/shim/backends/backends.go new file mode 100644 index 0000000000..e99910db1d --- /dev/null +++ b/runner/internal/shim/backends/backends.go @@ -0,0 +1,15 @@ +package backends + +import ( + "fmt" +) + +func GetBackend(backendType string) (Backend, error) { + switch backendType { + case "aws": + return NewAWSBackend(), nil + case "gcp": + return NewGCPBackend(), nil + } + return nil, fmt.Errorf("unknown backend: %q", backendType) +} diff --git a/runner/internal/shim/backends/base.go b/runner/internal/shim/backends/base.go new file mode 100644 index 0000000000..54cb40121d --- /dev/null +++ b/runner/internal/shim/backends/base.go @@ -0,0 +1,6 @@ +package backends + +type Backend interface { + // GetRealDeviceName returns the real device name for the given volume ID and virtual device name. 
+ GetRealDeviceName(volumeID, deviceName string) (string, error) +} diff --git a/runner/internal/shim/backends/gcp.go b/runner/internal/shim/backends/gcp.go new file mode 100644 index 0000000000..65632b2371 --- /dev/null +++ b/runner/internal/shim/backends/gcp.go @@ -0,0 +1,30 @@ +package backends + +import ( + "fmt" + "os" + "path/filepath" +) + +type GCPBackend struct{} + +func NewGCPBackend() *GCPBackend { + return &GCPBackend{} +} + +// GetRealDeviceName resolves device names according to https://cloud.google.com/compute/docs/disks/disk-symlinks +func (e *GCPBackend) GetRealDeviceName(volumeID, deviceName string) (string, error) { + // Try resolving first partition or external volumes + realDeviceName, err := os.Readlink(fmt.Sprintf("/dev/disk/by-id/google-%s-part1", deviceName)) + if err != nil { + realDeviceName, err = os.Readlink(fmt.Sprintf("/dev/disk/by-id/google-%s", deviceName)) + if err != nil { + return "", fmt.Errorf("failed to resolve symlink for volume %s: %w", volumeID, err) + } + } + realDeviceName, err = filepath.Abs(filepath.Join("/dev/disk/by-id/", realDeviceName)) + if err != nil { + return "", fmt.Errorf("get device absolute path: %w", err) + } + return realDeviceName, nil +} diff --git a/runner/internal/shim/components/runner.go b/runner/internal/shim/components/runner.go new file mode 100644 index 0000000000..3dc361a251 --- /dev/null +++ b/runner/internal/shim/components/runner.go @@ -0,0 +1,61 @@ +package components + +import ( + "context" + "fmt" + "sync" +) + +type RunnerManager struct { + path string + version string + status ComponentStatus + + mu *sync.RWMutex +} + +func NewRunnerManager(ctx context.Context, pth string) (*RunnerManager, error) { + m := RunnerManager{ + path: pth, + mu: &sync.RWMutex{}, + } + err := m.check(ctx) + return &m, err +} + +func (m *RunnerManager) GetInfo(ctx context.Context) ComponentInfo { + m.mu.RLock() + defer m.mu.RUnlock() + return ComponentInfo{ + Name: ComponentNameRunner, 
+ Version: m.version, + Status: m.status, + } +} + +func (m *RunnerManager) Install(ctx context.Context, url string, force bool) error { + m.mu.Lock() + if m.status == ComponentStatusInstalling { + m.mu.Unlock() + return fmt.Errorf("install %s: already installing", ComponentNameRunner) + } + m.status = ComponentStatusInstalling + m.version = "" + m.mu.Unlock() + + downloadErr := downloadFile(ctx, url, m.path, 0o755, force) + // Recheck the binary even if the download has failed, just in case. + checkErr := m.check(ctx) + if downloadErr != nil { + return downloadErr + } + return checkErr +} + +func (m *RunnerManager) check(ctx context.Context) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.status, m.version, err = checkDstackComponent(ctx, ComponentNameRunner, m.path) + return err +} diff --git a/runner/internal/shim/components/shim.go b/runner/internal/shim/components/shim.go new file mode 100644 index 0000000000..5ac9b08d39 --- /dev/null +++ b/runner/internal/shim/components/shim.go @@ -0,0 +1,61 @@ +package components + +import ( + "context" + "fmt" + "sync" +) + +type ShimManager struct { + path string + version string + status ComponentStatus + + mu *sync.RWMutex +} + +func NewShimManager(ctx context.Context, pth string) (*ShimManager, error) { + m := ShimManager{ + path: pth, + mu: &sync.RWMutex{}, + } + err := m.check(ctx) + return &m, err +} + +func (m *ShimManager) GetInfo(ctx context.Context) ComponentInfo { + m.mu.RLock() + defer m.mu.RUnlock() + return ComponentInfo{ + Name: ComponentNameShim, + Version: m.version, + Status: m.status, + } +} + +func (m *ShimManager) Install(ctx context.Context, url string, force bool) error { + m.mu.Lock() + if m.status == ComponentStatusInstalling { + m.mu.Unlock() + return fmt.Errorf("install %s: already installing", ComponentNameShim) + } + m.status = ComponentStatusInstalling + m.version = "" + m.mu.Unlock() + + downloadErr := downloadFile(ctx, url, m.path, 0o755, force) + // Recheck the binary even if the 
download has failed, just in case. + checkErr := m.check(ctx) + if downloadErr != nil { + return downloadErr + } + return checkErr +} + +func (m *ShimManager) check(ctx context.Context) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.status, m.version, err = checkDstackComponent(ctx, ComponentNameShim, m.path) + return err +} diff --git a/runner/internal/shim/components/types.go b/runner/internal/shim/components/types.go new file mode 100644 index 0000000000..57c205af53 --- /dev/null +++ b/runner/internal/shim/components/types.go @@ -0,0 +1,30 @@ +package components + +import "context" + +type ComponentName string + +const ( + ComponentNameRunner ComponentName = "dstack-runner" + ComponentNameShim ComponentName = "dstack-shim" +) + +type ComponentStatus string + +const ( + ComponentStatusNotInstalled ComponentStatus = "not-installed" + ComponentStatusInstalled ComponentStatus = "installed" + ComponentStatusInstalling ComponentStatus = "installing" + ComponentStatusError ComponentStatus = "error" +) + +type ComponentInfo struct { + Name ComponentName `json:"name"` + Version string `json:"version"` + Status ComponentStatus `json:"status"` +} + +type ComponentManager interface { + GetInfo(ctx context.Context) ComponentInfo + Install(ctx context.Context, url string, force bool) error +} diff --git a/runner/internal/shim/components/utils.go b/runner/internal/shim/components/utils.go new file mode 100644 index 0000000000..a4456acaa3 --- /dev/null +++ b/runner/internal/shim/components/utils.go @@ -0,0 +1,116 @@ +package components + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" +) + +const downloadTimeout = 10 * time.Minute + +func downloadFile(ctx context.Context, url string, path string, mode os.FileMode, force bool) error { + if _, err := os.Stat(path); err == nil { + if 
force { + log.Debug(ctx, "file exists, forcing download", "path", path) + } else { + log.Debug(ctx, "file exists, skipping download", "path", path) + return nil + } + } else if !os.IsNotExist(err) { + return fmt.Errorf("check file exists: %w", err) + } + dir, name := filepath.Split(path) + tempFile, err := os.CreateTemp(dir, fmt.Sprintf(".*-%s", name)) + if err != nil { + return fmt.Errorf("create temp file for %s: %w", name, err) + } + defer func() { + if err := tempFile.Close(); err != nil { + log.Error(ctx, "close temp file", "err", err) + } + if err := os.Remove(tempFile.Name()); err != nil && !errors.Is(err, os.ErrNotExist) { + log.Error(ctx, "remove temp file", "err", err) + } + }() + + log.Debug(ctx, "downloading", "path", path, "url", url) + ctx, cancel := context.WithTimeout(ctx, downloadTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return fmt.Errorf("create download request: %w", err) + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("execute download request: %w", err) + } + + defer func() { + err := resp.Body.Close() + if err != nil { + log.Error(ctx, "downloadFile: close body error", "err", err) + } + }() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("unexpected status code %s downloading %s from %s", resp.Status, name, url) + } + + written, err := io.Copy(tempFile, resp.Body) + if err != nil { + log.Error(ctx, "download file", "err", err, "bytes", written, "total", resp.ContentLength) + if err := os.Remove(tempFile.Name()); err != nil { + log.Error(ctx, "remove temp file", "err", err) + } + return fmt.Errorf("copy %s: %w", name, err) + } + log.Debug(ctx, "file has been downloaded", "path", path, "bytes", written) + + if err := tempFile.Chmod(mode); err != nil { + return fmt.Errorf("chmod %s: %w", path, err) + } + + if err := os.Rename(tempFile.Name(), path); err != nil { + return fmt.Errorf("move %s to %s: %w", name, path, err) 
+ } + + return nil +} + +func checkDstackComponent(ctx context.Context, name ComponentName, pth string) (status ComponentStatus, version string, err error) { + exists, err := utils.PathExists(pth) + if err != nil { + return ComponentStatusError, "", fmt.Errorf("check %s: %w", name, err) + } + if !exists { + return ComponentStatusNotInstalled, "", nil + } + + cmd := exec.CommandContext(ctx, pth, "--version") + output, err := cmd.Output() + if err != nil { + return ComponentStatusError, "", fmt.Errorf("check %s: %w", name, err) + } + + rawVersion := string(output) // dstack-{shim,runner} version 0.19.38 + versionFields := strings.Fields(rawVersion) + if len(versionFields) != 3 { + return ComponentStatusError, "", fmt.Errorf("check %s: unexpected version output: %s", name, rawVersion) + } + if versionFields[0] != string(name) { + return ComponentStatusError, "", fmt.Errorf("check %s: unexpected component name: %s", name, versionFields[0]) + } + return ComponentStatusInstalled, versionFields[2], nil +} diff --git a/runner/internal/shim/dcgm/exporter.go b/runner/internal/shim/dcgm/exporter.go new file mode 100644 index 0000000000..ed861eb524 --- /dev/null +++ b/runner/internal/shim/dcgm/exporter.go @@ -0,0 +1,213 @@ +package dcgm + +import ( + "context" + "encoding/csv" + "errors" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "github.com/alexellis/go-execute/v2" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +// Counter represents a single line in counters.csv, see +// https://github.com/NVIDIA/dcgm-exporter/tree/5f9250c211?tab=readme-ov-file#changing-metrics +// For list of supported types see +// https://github.com/NVIDIA/dcgm-exporter/blob/5f9250c211/internal/pkg/counters/variables.go#L23 +// NB: Although it is called "counter" in dcgm-exporter, in fact it can be any Prometheus +// metric type or even a label +type Counter struct { + Name 
string + Type string + Help string +} + +// Full list: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html +var counters = [...]Counter{ + {"DCGM_FI_DEV_GPU_UTIL", "gauge", "GPU utilization (in %)."}, + {"DCGM_FI_DEV_MEM_COPY_UTIL", "gauge", "Memory utilization (in %)."}, + {"DCGM_FI_DEV_ENC_UTIL", "gauge", "Encoder utilization (in %)."}, + {"DCGM_FI_DEV_DEC_UTIL", "gauge", "Decoder utilization (in %)."}, + {"DCGM_FI_DEV_FB_FREE", "gauge", "Framebuffer memory free (in MiB)."}, + {"DCGM_FI_DEV_FB_USED", "gauge", "Framebuffer memory used (in MiB)."}, + {"DCGM_FI_PROF_GR_ENGINE_ACTIVE", "gauge", "The ratio of cycles during which a graphics engine or compute engine remains active."}, + {"DCGM_FI_PROF_SM_ACTIVE", "gauge", "The ratio of cycles an SM has at least 1 warp assigned."}, + {"DCGM_FI_PROF_SM_OCCUPANCY", "gauge", "The ratio of number of warps resident on an SM."}, + {"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE", "gauge", "Ratio of cycles the tensor (HMMA) pipe is active."}, + {"DCGM_FI_PROF_PIPE_FP64_ACTIVE", "gauge", "Ratio of cycles the fp64 pipes are active."}, + {"DCGM_FI_PROF_PIPE_FP32_ACTIVE", "gauge", "Ratio of cycles the fp32 pipes are active."}, + {"DCGM_FI_PROF_PIPE_FP16_ACTIVE", "gauge", "Ratio of cycles the fp16 pipes are active."}, + {"DCGM_FI_PROF_PIPE_INT_ACTIVE", "gauge", "Ratio of cycles the integer pipe is active."}, + {"DCGM_FI_PROF_DRAM_ACTIVE", "gauge", "Ratio of cycles the device memory interface is active sending or receiving data."}, + {"DCGM_FI_PROF_PCIE_TX_BYTES", "counter", "The number of bytes of active PCIe tx (transmit) data including both header and payload."}, + {"DCGM_FI_PROF_PCIE_RX_BYTES", "counter", "The number of bytes of active PCIe rx (read) data including both header and payload."}, + {"DCGM_FI_DEV_SM_CLOCK", "gauge", "SM clock frequency (in MHz)."}, + {"DCGM_FI_DEV_MEM_CLOCK", "gauge", "Memory clock frequency (in MHz)."}, + {"DCGM_FI_DEV_MEMORY_TEMP", "gauge", "Memory 
temperature (in C)."}, + {"DCGM_FI_DEV_GPU_TEMP", "gauge", "GPU temperature (in C)."}, + {"DCGM_FI_DEV_POWER_USAGE", "gauge", "Power draw (in W)."}, + {"DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "counter", "Total energy consumption since boot (in mJ)."}, + {"DCGM_FI_DEV_PCIE_REPLAY_COUNTER", "counter", "Total number of PCIe retries."}, + {"DCGM_FI_DEV_XID_ERRORS", "gauge", "Value of the last XID error encountered."}, + {"DCGM_FI_DEV_POWER_VIOLATION", "counter", "Throttling duration due to power constraints (in us)."}, + {"DCGM_FI_DEV_THERMAL_VIOLATION", "counter", "Throttling duration due to thermal constraints (in us)."}, + {"DCGM_FI_DEV_SYNC_BOOST_VIOLATION", "counter", "Throttling duration due to sync-boost constraints (in us)."}, + {"DCGM_FI_DEV_BOARD_LIMIT_VIOLATION", "counter", "Throttling duration due to board limit constraints (in us)."}, + {"DCGM_FI_DEV_LOW_UTIL_VIOLATION", "counter", "Throttling duration due to low utilization (in us)."}, + {"DCGM_FI_DEV_RELIABILITY_VIOLATION", "counter", "Throttling duration due to reliability constraints (in us)."}, + {"DCGM_FI_DEV_ECC_SBE_VOL_TOTAL", "counter", "Total number of single-bit volatile ECC errors."}, + {"DCGM_FI_DEV_ECC_DBE_VOL_TOTAL", "counter", "Total number of double-bit volatile ECC errors."}, + {"DCGM_FI_DEV_ECC_SBE_AGG_TOTAL", "counter", "Total number of single-bit persistent ECC errors."}, + {"DCGM_FI_DEV_ECC_DBE_AGG_TOTAL", "counter", "Total number of double-bit persistent ECC errors."}, + {"DCGM_FI_DEV_RETIRED_SBE", "counter", "Total number of retired pages due to single-bit errors."}, + {"DCGM_FI_DEV_RETIRED_DBE", "counter", "Total number of retired pages due to double-bit errors."}, + {"DCGM_FI_DEV_RETIRED_PENDING", "counter", "Total number of pages pending retirement."}, + {"DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS", "counter", "Number of remapped rows for uncorrectable errors"}, + {"DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS", "counter", "Number of remapped rows for correctable errors"}, + 
{"DCGM_FI_DEV_ROW_REMAP_FAILURE", "gauge", "Whether remapping of rows has failed"}, + {"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", "counter", "Total number of NVLink flow-control CRC errors."}, + {"DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL", "counter", "Total number of NVLink data CRC errors."}, + {"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL", "counter", "Total number of NVLink retries."}, + {"DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL", "counter", "Total number of NVLink recovery errors."}, + {"DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL", "counter", "Total number of NVLink bandwidth counters for all lanes."}, + {"DCGM_FI_DEV_NVLINK_BANDWIDTH_L0", "counter", "The number of bytes of active NVLink rx or tx data including both header and payload."}, + {"DCGM_FI_PROF_NVLINK_RX_BYTES", "counter", "The number of bytes of active PCIe rx (read) data including both header and payload. "}, + {"DCGM_FI_PROF_NVLINK_TX_BYTES", "counter", "The number of bytes of active NvLink tx (transmit) data including both header and payload. 
"}, +} + +const dcgmExporterExecName = "dcgm-exporter" + +type DCGMExporter struct { + cmd *exec.Cmd + cancel context.CancelFunc + execPath string + listenAddr string + client *http.Client + url string + interval time.Duration + configPath string + mu sync.Mutex + lastFetchedAt time.Time + lastResponse []byte +} + +func (c *DCGMExporter) Start(ctx context.Context) error { + if c.cmd != nil { + return errors.New("already started") + } + + configFile, err := os.CreateTemp("", "counters-*.csv") + if err != nil { + return fmt.Errorf("create config file: %w", err) + } + defer configFile.Close() + c.configPath = configFile.Name() + configWriter := csv.NewWriter(configFile) + for _, counter := range counters { + err := configWriter.Write([]string{counter.Name, counter.Type, counter.Help}) + if err != nil { + return fmt.Errorf("write config file: %w", err) + } + } + configWriter.Flush() + + cmdCtx, cmdCancel := context.WithCancel(ctx) + c.cancel = cmdCancel + cmd := exec.CommandContext( + cmdCtx, c.execPath, + "-f", c.configPath, + "-a", c.listenAddr, + "-c", strconv.Itoa(int(c.interval.Milliseconds())), + ) + c.cmd = cmd + cmd.Cancel = func() error { + return cmd.Process.Signal(syscall.SIGTERM) + } + cmd.WaitDelay = 5 * time.Second + return cmd.Start() +} + +func (c *DCGMExporter) Stop(context.Context) error { + if c.cmd == nil { + return errors.New("not started") + } + c.cancel() + _ = os.Remove(c.configPath) + return c.cmd.Wait() +} + +func (c *DCGMExporter) Fetch(ctx context.Context) ([]byte, error) { + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + + if now.Sub(c.lastFetchedAt) < c.interval { + return c.lastResponse, nil + } + + req, err := http.NewRequestWithContext(ctx, "GET", c.url, nil) + if err != nil { + return nil, err + } + resp, err := c.client.Do(req) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("status is not OK: %d", resp.StatusCode) + } + 
response, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + c.lastFetchedAt = now + c.lastResponse = response + return response, nil +} + +func NewDCGMExporter(execPath string, port int, interval time.Duration) *DCGMExporter { + listenAddr := fmt.Sprintf("localhost:%d", port) + client := &http.Client{ + Timeout: 10 * time.Second, + } + return &DCGMExporter{ + execPath: execPath, + listenAddr: listenAddr, + client: client, + url: fmt.Sprintf("http://%s/metrics", listenAddr), + interval: interval, + } +} + +func GetDCGMExporterExecPath(ctx context.Context) (string, error) { + path, err := exec.LookPath(dcgmExporterExecName) + if err != nil { + return "", err + } + cmd := execute.ExecTask{ + Command: path, + Args: []string{"-v"}, + StreamStdio: false, + } + res, err := cmd.Execute(ctx) + if err != nil { + return "", err + } + if res.ExitCode != 0 { + return "", fmt.Errorf("%s returned %d, stderr: %s, stdout: %s", path, res.ExitCode, res.Stderr, res.Stdout) + } + log.Debug(ctx, "detected", "path", path, "version", strings.TrimSpace(res.Stdout)) + return path, nil +} diff --git a/runner/internal/shim/dcgm/metrics.go b/runner/internal/shim/dcgm/metrics.go new file mode 100644 index 0000000000..6b0c8983d4 --- /dev/null +++ b/runner/internal/shim/dcgm/metrics.go @@ -0,0 +1,57 @@ +package dcgm + +import ( + "bufio" + "bytes" + "strings" +) + +// FilterMetrics returns subset of metrics filtered by GPU UUIDs +func FilterMetrics(expfmtBody []byte, uuids []string) []byte { + // DCGM Exporter returns metrics in the following format: + // # HELP DCGM_FIELD_1 Docstring for field 1 + // # TYPE DCGM_FIELD_1 gauge|counter|... + // DCGM_FIELD{gpu="0", UUID="..." [...other labels...]} 0.0 + // DCGM_FIELD{gpu="1", UUID="..." [...other labels...]} 0.5 + // ... + // HELP DCGM_FIELD_2 Docstring for field 2 + // ... 
+ var buffer bytes.Buffer + scanner := bufio.NewScanner(bytes.NewReader(expfmtBody)) + helpComment := "" + typeComment := "" + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if len(line) == 0 { + continue + } + if strings.HasPrefix(line, "# HELP") { + helpComment = line + continue + } + if strings.HasPrefix(line, "# TYPE") { + typeComment = line + continue + } + if strings.HasPrefix(line, "#") { + continue + } + for _, uuid := range uuids { + if strings.Contains(line, uuid) { + if helpComment != "" { + buffer.WriteString(helpComment) + buffer.WriteRune('\n') + helpComment = "" + } + if typeComment != "" { + buffer.WriteString(typeComment) + buffer.WriteRune('\n') + typeComment = "" + } + buffer.WriteString(line) + buffer.WriteRune('\n') + } + } + } + return buffer.Bytes() +} diff --git a/runner/internal/shim/dcgm/metrics_test.go b/runner/internal/shim/dcgm/metrics_test.go new file mode 100644 index 0000000000..133f534a52 --- /dev/null +++ b/runner/internal/shim/dcgm/metrics_test.go @@ -0,0 +1,43 @@ +package dcgm + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFilterMetrics(t *testing.T) { + body := []byte(` +# Comment +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-0781f3bb-da15-f334-d5db-37b3f19542d0",pci_bus_id="00000000:00:1B.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 1365 +DCGM_FI_DEV_SM_CLOCK{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 300 +DCGM_FI_DEV_SM_CLOCK{gpu="2",UUID="GPU-cc8e8c03-ebaa-f217-8e4c-d9cd98e20aed",pci_bus_id="00000000:00:1D.0",device="nvidia2",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 300 +DCGM_FI_DEV_SM_CLOCK{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 300 + +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-0781f3bb-da15-f334-d5db-37b3f19542d0",pci_bus_id="00000000:00:1B.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 5000 +DCGM_FI_DEV_MEM_CLOCK{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 405 +# comment +DCGM_FI_DEV_MEM_CLOCK{gpu="2",UUID="GPU-cc8e8c03-ebaa-f217-8e4c-d9cd98e20aed",pci_bus_id="00000000:00:1D.0",device="nvidia2",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 405 +DCGM_FI_DEV_MEM_CLOCK{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 405 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-0781f3bb-da15-f334-d5db-37b3f19542d0",pci_bus_id="00000000:00:1B.0",device="nvidia0",modelName="Tesla 
T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 +DCGM_FI_DEV_MEMORY_TEMP{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 + +DCGM_FI_DEV_MEMORY_TEMP{gpu="2",UUID="GPU-cc8e8c03-ebaa-f217-8e4c-d9cd98e20aed",pci_bus_id="00000000:00:1D.0",device="nvidia2",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 +DCGM_FI_DEV_MEMORY_TEMP{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 + `) + filtered := FilterMetrics(body, []string{"GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9", "GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee"}) + expected := []byte(`# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 300 +DCGM_FI_DEV_SM_CLOCK{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 300 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). 
+DCGM_FI_DEV_MEM_CLOCK{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 405 +DCGM_FI_DEV_MEM_CLOCK{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 405 +DCGM_FI_DEV_MEMORY_TEMP{gpu="1",UUID="GPU-41cc2907-3249-5a6b-f0e4-d04063b183a9",pci_bus_id="00000000:00:1C.0",device="nvidia1",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 +DCGM_FI_DEV_MEMORY_TEMP{gpu="3",UUID="GPU-fb615fb7-3f5a-5600-0ab1-debad8dc80ee",pci_bus_id="00000000:00:1E.0",device="nvidia3",modelName="Tesla T4",Hostname="ip-172-31-16-106",DCGM_FI_DRIVER_VERSION="535.183.06"} 0 +`) + assert.Equal(t, expected, filtered) +} diff --git a/runner/internal/shim/dcgm/wrapper.go b/runner/internal/shim/dcgm/wrapper.go new file mode 100644 index 0000000000..bc1979a858 --- /dev/null +++ b/runner/internal/shim/dcgm/wrapper.go @@ -0,0 +1,29 @@ +package dcgm + +type HealthStatus string + +const ( + HealthStatusHealthy HealthStatus = "healthy" + HealthStatusWarning HealthStatus = "warning" + HealthStatusFailure HealthStatus = "failure" +) + +type HealthIncident struct { + System int `json:"system"` + Health int `json:"health"` + ErrorMessage string `json:"error_message"` + ErrorCode int `json:"error_code"` + EntityGroupID int `json:"entity_group_id"` + EntityID int `json:"entity_id"` +} + +type Health struct { + OverallHealth int `json:"overall_health"` + Incidents []HealthIncident `json:"incidents"` +} + +type DCGMWrapperInterface interface { + Shutdown() error + EnableHealthChecks() error + GetHealth() (Health, error) +} diff --git a/runner/internal/shim/dcgm/wrapper_cgo.go b/runner/internal/shim/dcgm/wrapper_cgo.go new file mode 100644 index 0000000000..779decea9b --- /dev/null +++ 
b/runner/internal/shim/dcgm/wrapper_cgo.go @@ -0,0 +1,97 @@ +//go:build cgo + +package dcgm + +import ( + "errors" + "fmt" + "sync" + + godcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) + +// DCGMWrapper is a wrapper around go-dcgm (which, in turn, is a wrapper around libdcgm.so) +type DCGMWrapper struct { + group godcgm.GroupHandle + healthCheckEnabled bool + + mu *sync.Mutex +} + +// NewDCGMWrapper initializes and starts DCGM in the specific mode: +// - If address is empty, then libdcgm starts embedded hostengine within the current process. +// This is the main mode. +// - If address is not empty, then libdcgm connects to already running nv-hostengine service via TCP. +// This mode is useful for debugging, e.g., one can start nv-hostengine via systemd and inject +// errors via dcgmi: +// - systemctl start nvidia-dcgm.service +// - dcgmi test --inject --gpuid 0 -f 202 -v 99999 +// +// Note: embedded hostengine is started in AUTO operation mode, which means that +// the library handles periodic tasks by itself executing them in additional threads. 
+func NewDCGMWrapper(address string) (*DCGMWrapper, error) { + var err error + if address == "" { + _, err = godcgm.Init(godcgm.Embedded) + } else { + // "address is a unix socket filename (1) or a TCP/IP address (0)" + _, err = godcgm.Init(godcgm.Standalone, address, "0") + } + if err != nil { + return nil, fmt.Errorf("failed to initialize or start DCGM: %w", err) + } + return &DCGMWrapper{ + group: godcgm.GroupAllGPUs(), + mu: new(sync.Mutex), + }, nil +} + +func (w *DCGMWrapper) Shutdown() error { + if err := godcgm.Shutdown(); err != nil { + return fmt.Errorf("failed to shut down DCGM: %w", err) + } + return nil +} + +func (w *DCGMWrapper) EnableHealthChecks() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.healthCheckEnabled { + return errors.New("health check system already enabled") + } + if err := godcgm.HealthSet(w.group, godcgm.DCGM_HEALTH_WATCH_ALL); err != nil { + return fmt.Errorf("failed to configure health watches: %w", err) + } + // "On the first call, stateful information about all of the enabled watches within a group + // is created but no error results are provided. On subsequent calls, any error information + // will be returned." 
+ if _, err := godcgm.HealthCheck(w.group); err != nil { + return fmt.Errorf("failed to initialize health watches state: %w", err) + } + w.healthCheckEnabled = true + return nil +} + +func (w *DCGMWrapper) GetHealth() (Health, error) { + health := Health{} + if !w.healthCheckEnabled { + return health, errors.New("health check system is not enabled") + } + response, err := godcgm.HealthCheck(w.group) + if err != nil { + return health, fmt.Errorf("failed to fetch health status: %w", err) + } + health.OverallHealth = int(response.OverallHealth) + health.Incidents = make([]HealthIncident, 0, len(response.Incidents)) + for _, incident := range response.Incidents { + health.Incidents = append(health.Incidents, HealthIncident{ + System: int(incident.System), + Health: int(incident.Health), + ErrorMessage: incident.Error.Message, + ErrorCode: int(incident.Error.Code), + EntityGroupID: int(incident.EntityInfo.EntityGroupId), + EntityID: int(incident.EntityInfo.EntityId), + }) + } + return health, nil +} diff --git a/runner/internal/shim/dcgm/wrapper_cgo_test.go b/runner/internal/shim/dcgm/wrapper_cgo_test.go new file mode 100644 index 0000000000..e596b4eeba --- /dev/null +++ b/runner/internal/shim/dcgm/wrapper_cgo_test.go @@ -0,0 +1,82 @@ +//go:build cgo + +package dcgm + +import ( + "strings" + "testing" + "time" + + godcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/require" +) + +func TestDCGMWrapperGetHealth(t *testing.T) { + dcgmw := getDCGMWrapper(t) + defer dcgmw.Shutdown() + + gpuID := getGpuID(t) + + err := dcgmw.EnableHealthChecks() + require.NoError(t, err) + + health, err := dcgmw.GetHealth() + require.NoError(t, err) + require.Equal(t, health.OverallHealth, 0) // DCGM_HEALTH_RESULT_PASS + require.Len(t, health.Incidents, 0) + + injectError(t, gpuID, godcgm.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, godcgm.DCGM_FT_INT64, int64(888)) + injectError(t, gpuID, godcgm.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, godcgm.DCGM_FT_INT64, int64(999)) + + health, err = 
dcgmw.GetHealth() + require.NoError(t, err) + require.Equal(t, health.OverallHealth, 20) // DCGM_HEALTH_RESULT_FAIL + require.Len(t, health.Incidents, 2) + for _, incident := range health.Incidents { + switch incident.System { + case 0x1: // DCGM_HEALTH_WATCH_PCIE + require.Equal(t, incident.Health, 10) // DCGM_HEALTH_RESULT_WARN + require.Contains(t, incident.ErrorMessage, "PCIe replay") + case 0x10: // DCGM_HEALTH_WATCH_MEM + require.Equal(t, incident.Health, 20) // DCGM_HEALTH_RESULT_FAIL + require.Contains(t, incident.ErrorMessage, "volatile double-bit ECC error") + default: + t.Logf("unexpected HealthSystem: 0x%x", incident.System) + t.FailNow() + } + require.Equal(t, incident.EntityGroupID, 1) // FE_GPU + require.Equal(t, incident.EntityID, int(gpuID)) + } +} + +// Utils. Must be called after NewDCGMWrapper(), as it indirectly calls dlopen("libdcgm.so.4") + +func getDCGMWrapper(t *testing.T) *DCGMWrapper { + dcgmw, err := NewDCGMWrapper("") + if err != nil && strings.Contains(err.Error(), "libdcgm.so") { + t.Skip("Skipping test that requires ligdcm.so") + } + require.NoError(t, err) + gpuIDs, err := godcgm.GetSupportedDevices() + require.NoError(t, err) + if len(gpuIDs) < 1 { + t.Skip("Skipping test that requires live GPUs. None were found") + } + return dcgmw +} + +func getGpuID(t *testing.T) uint { + t.Helper() + gpuIDs, err := godcgm.GetSupportedDevices() + require.NoError(t, err) + if len(gpuIDs) < 1 { + t.Skip("Skipping test that requires live GPUs. 
None were found") + } + return gpuIDs[0] +} + +func injectError(t *testing.T, gpuID uint, fieldID godcgm.Short, fieldType uint, value any) { + t.Helper() + err := godcgm.InjectFieldValue(gpuID, fieldID, fieldType, 0, time.Now().UnixMicro(), value) + require.NoError(t, err) +} diff --git a/runner/internal/shim/dcgm/wrapper_nocgo.go b/runner/internal/shim/dcgm/wrapper_nocgo.go new file mode 100644 index 0000000000..730c376e8c --- /dev/null +++ b/runner/internal/shim/dcgm/wrapper_nocgo.go @@ -0,0 +1,9 @@ +//go:build !cgo + +package dcgm + +import "fmt" + +func NewDCGMWrapper(address string) (DCGMWrapperInterface, error) { + return nil, fmt.Errorf("DCGM unavailable: built with CGO disabled (cross-compilation)") +} diff --git a/runner/internal/shim/docker.go b/runner/internal/shim/docker.go index 9a7567aceb..81381125cf 100644 --- a/runner/internal/shim/docker.go +++ b/runner/internal/shim/docker.go @@ -4,329 +4,613 @@ import ( "bufio" "bytes" "context" + "encoding/base64" "encoding/json" + "errors" "fmt" "io" - "log" "os" - "os/exec" "os/user" "path/filepath" - rt "runtime" "strconv" "strings" + "sync" "time" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/filters" "github.com/docker/docker/api/types/image" "github.com/docker/docker/api/types/mount" + "github.com/docker/docker/api/types/registry" + dockersystem "github.com/docker/docker/api/types/system" docker "github.com/docker/docker/client" + "github.com/docker/docker/errdefs" + "github.com/docker/docker/pkg/stdcopy" "github.com/docker/go-connections/nat" - "github.com/dstackai/dstack/runner/consts" - "github.com/icza/backscanner" + "github.com/docker/go-units" bytesize "github.com/inhies/go-bytesize" - "github.com/ztrue/tracerr" + + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/types" + 
"github.com/dstackai/dstack/runner/internal/shim/host" ) // TODO: Allow for configuration via cli arguments or environment variables. const ImagePullTimeout time.Duration = 20 * time.Minute -// Depricated: Remove on next release (0.19) -type ContainerStatus struct { - ContainerID string - ContainerName string - Status string - Running bool - OOMKilled bool - Dead bool - ExitCode int - Error string +const ( + LabelKeyPrefix = "ai.dstack.shim." + // Set to "true" on containers spawned by DockerRunner, used for identification. + LabelKeyIsTask = LabelKeyPrefix + "is-task" + LabelKeyTaskID = LabelKeyPrefix + "task-id" + LabelValueTrue = "true" +) + +// dockerd reports pulling progress as a stream of JSON Lines. The format of records is not documented in the API documentation, +// although it's occasionally mentioned, e.g., https://docs.docker.com/reference/api/engine/version-history/#v148-api-changes +// https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/pkg/jsonmessage/jsonmessage.go#L144 +// All fields are optional. +type PullMessage struct { + Id string `json:"id"` // layer id + Status string `json:"status"` + ProgressDetail ProgressDetail `json:"progressDetail"` + ErrorDetail struct { + Message string `json:"message"` + } `json:"errorDetail"` } -type JobResult struct { - Reason string `json:"reason"` - ReasonMessage string `json:"reason_message"` +type ProgressDetail struct { + Current uint64 `json:"current"` + Total uint64 `json:"total"` + Units string `json:"units"` } -type DockerRunner struct { - client *docker.Client - dockerParams DockerParameters - currentContainer string - state RunnerStatus +func (p *ProgressDetail) isUnitBytes() bool { + // > Units is the unit to print for progress. 
It defaults to "bytes" if empty + // https://github.com/moby/moby/blob/8151a55a776f5f83f68bcf0030c19031439ea357/api/types/jsonstream/progress.go#L9 + return p.Units == "bytes" || p.Units == "" +} - cancelPull context.CancelFunc +type layerProgress struct { + Status string + DownloadedBytes uint64 + ExtractedBytes uint64 + TotalBytes uint64 +} + +type PullTracker struct { + mu sync.RWMutex + layers map[string]layerProgress +} - containerStatus ContainerStatus // TODO: remove on next release (0.19) - executorError string // TODO: remove on next release (0.19) - jobResult JobResult +func newPullTracker() *PullTracker { + return &PullTracker{layers: make(map[string]layerProgress)} } -func NewDockerRunner(dockerParams DockerParameters) (*DockerRunner, error) { +func (t *PullTracker) Update(msg PullMessage) { + if msg.Id == "" { + return + } + t.mu.Lock() + defer t.mu.Unlock() + layer := t.layers[msg.Id] + switch msg.Status { + case "Pulling fs layer", "Waiting", "Verifying Checksum", "Already exists": + // no bytes to update, just track status + case "Downloading": + if msg.ProgressDetail.isUnitBytes() { + layer.DownloadedBytes = msg.ProgressDetail.Current + layer.TotalBytes = msg.ProgressDetail.Total + } + case "Download complete": + layer.DownloadedBytes = layer.TotalBytes + case "Extracting": + if msg.ProgressDetail.isUnitBytes() { + layer.ExtractedBytes = msg.ProgressDetail.Current + layer.DownloadedBytes = msg.ProgressDetail.Total + layer.TotalBytes = msg.ProgressDetail.Total + } + case "Pull complete": + layer.ExtractedBytes = layer.TotalBytes + layer.DownloadedBytes = layer.TotalBytes + default: + // Non-layer events, such as {"status":"Pulling from library/python","id":"3.11"} + return + } + layer.Status = msg.Status + t.layers[msg.Id] = layer +} + +func (t *PullTracker) Progress() *ImagePullProgress { + t.mu.RLock() + defer t.mu.RUnlock() + if len(t.layers) == 0 { + return nil + } + p := ImagePullProgress{IsTotalBytesFinal: true} + 
for _, l := range t.layers { + if l.TotalBytes == 0 && l.Status != "Already exists" && l.Status != "Pull complete" { + p.IsTotalBytesFinal = false + } + p.DownloadedBytes += l.DownloadedBytes + p.ExtractedBytes += l.ExtractedBytes + p.TotalBytes += l.TotalBytes + } + return &p +} + +type DockerRunner struct { + client *docker.Client + dockerParams DockerParameters + dockerInfo dockersystem.Info + baseEnv []string + gpus []host.GpuInfo + gpuVendor gpu.GpuVendor + gpuLock *GpuLock + tasks TaskStorage +} + +func NewDockerRunner(ctx context.Context, dockerParams DockerParameters) (*DockerRunner, error) { client, err := docker.NewClientWithOpts(docker.FromEnv, docker.WithAPIVersionNegotiation()) if err != nil { - return nil, tracerr.Wrap(err) + return nil, fmt.Errorf("create docker client: %w", err) + } + dockerInfo, err := client.Info(ctx) + if err != nil { + return nil, fmt.Errorf("get docker info: %w", err) + } + + // Copy variables once rather than on a per-task basis + // We don't expect variables to change during the shim's lifetime + baseEnv := []string{} + for _, name := range dockerParams.DockerPassEnv() { + if value, ok := os.LookupEnv(name); ok { + baseEnv = append(baseEnv, fmt.Sprintf("%s=%s", name, value)) + } + } + + var gpuVendor gpu.GpuVendor + gpus := host.GetGpuInfo(ctx) + if len(gpus) > 0 { + gpuVendor = gpus[0].Vendor + } else { + gpuVendor = gpu.GpuVendorNone + } + gpuLock, err := NewGpuLock(gpus) + if err != nil { + return nil, fmt.Errorf("create GPU lock: %w", err) } runner := &DockerRunner{ client: client, dockerParams: dockerParams, - state: Pending, + dockerInfo: dockerInfo, + baseEnv: baseEnv, + gpus: gpus, + gpuVendor: gpuVendor, + gpuLock: gpuLock, + tasks: NewTaskStorage(), + } + + if err := runner.restoreStateFromContainers(ctx); err != nil { + return nil, fmt.Errorf("failed to restore state from containers: %w", err) } + return runner, nil } -func (d *DockerRunner) Run(ctx context.Context, cfg TaskConfig) error { - var err error - - if 
cfg.SshKey != "" { - ak := AuthorizedKeys{user: cfg.SshUser, lookup: user.Lookup} - if err := ak.AppendPublicKeys([]string{cfg.SshKey}); err != nil { - d.state = Pending - errMessage := fmt.Sprintf("ak.AppendPublicKeys error: %s", err.Error()) - d.containerStatus.Error = errMessage - log.Println(errMessage) - d.jobResult = JobResult{Reason: "EXECUTOR_ERROR", ReasonMessage: errMessage} - return tracerr.Wrap(err) +// restoreStateFromContainers regenerates TaskStorage and GpuLock inspecting containers +// Used to restore shim state on restarts +func (d *DockerRunner) restoreStateFromContainers(ctx context.Context) error { + listOptions := container.ListOptions{ + All: true, + Filters: filters.NewArgs(filters.Arg("label", fmt.Sprintf("%s=%s", LabelKeyIsTask, LabelValueTrue))), + } + containers, err := d.client.ContainerList(ctx, listOptions) + if err != nil { + return fmt.Errorf("failed to get container list: %w", err) + } + for _, containerShort := range containers { + containerID := containerShort.ID + taskID := containerShort.Labels[LabelKeyTaskID] + if taskID == "" { + log.Error(ctx, "container has no label", "id", containerID, "label", LabelKeyTaskID) + continue } - defer func(cfg TaskConfig) { - err := ak.RemovePublicKeys([]string{cfg.SshKey}) - if err != nil { - log.Printf("Error RemovePublicKeys: %s\n", err.Error()) + var status TaskStatus + if containerShort.State == "exited" { + status = TaskStatusTerminated + } else { + status = TaskStatusRunning + } + var containerName string + if len(containerShort.Names) > 0 { + // "Names are prefixed with their parent and / == the docker daemon" + // https://github.com/moby/moby/issues/6705 + containerName = strings.TrimLeft(containerShort.Names[0], "/") + } + var gpuIDs []string + var ports []PortMapping + if containerFull, err := d.client.ContainerInspect(ctx, containerID); err != nil { + log.Error(ctx, "failed to inspect container", "id", containerID, "task", taskID) + } else { + switch 
d.gpuVendor { + case gpu.GpuVendorNvidia: + deviceRequests := containerFull.HostConfig.DeviceRequests + if len(deviceRequests) == 1 { + gpuIDs = deviceRequests[0].DeviceIDs + } else if len(deviceRequests) != 0 { + log.Error( + ctx, + "cannot extract GPU IDs from container: more than one DeviceRequest", + "id", containerID, "task", taskID, + ) + } + case gpu.GpuVendorAmd: + for _, device := range containerFull.HostConfig.Devices { + if host.IsRenderNodePath(device.PathOnHost) { + gpuIDs = append(gpuIDs, device.PathOnHost) + } + } + case gpu.GpuVendorTenstorrent: + for _, device := range containerFull.HostConfig.Devices { + if strings.HasPrefix(device.PathOnHost, "/dev/tenstorrent/") { + // Extract the device ID from the path + deviceID := strings.TrimPrefix(device.PathOnHost, "/dev/tenstorrent/") + gpuIDs = append(gpuIDs, deviceID) + } + } + case gpu.GpuVendorIntel: + for _, envVar := range containerFull.Config.Env { + if indices, found := strings.CutPrefix(envVar, "HABANA_VISIBLE_DEVICES="); found { + gpuIDs = strings.Split(indices, ",") + break + } + } + case gpu.GpuVendorNone: + gpuIDs = []string{} } - }(cfg) + ports = extractPorts(ctx, containerFull.NetworkSettings.Ports) + } + var runnerDir string + for _, mount := range containerShort.Mounts { + if mount.Destination == consts.RunnerTempDir { + runnerDir = mount.Source + break + } + } + task := NewTask(taskID, status, containerName, containerID, gpuIDs, ports, runnerDir) + if !d.tasks.Add(task) { + log.Error(ctx, "duplicate restored task", "task", taskID) + } else { + log.Debug(ctx, "restored task", "task", taskID, "status", status, "gpus", gpuIDs) + } + if status == TaskStatusRunning && len(gpuIDs) > 0 { + lockedGpuIDs := d.gpuLock.Lock(ctx, gpuIDs) + log.Debug(ctx, "locked GPU(s) due to running task", "task", taskID, "gpus", lockedGpuIDs) + } } + return nil +} - log.Println("Preparing volumes") - err = prepareVolumes(cfg) +func (d *DockerRunner) Resources(ctx context.Context) Resources { + cpuCount := 
host.GetCpuCount(ctx) + totalMemory, err := host.GetTotalMemory(ctx) if err != nil { - d.state = Pending - errMessage := fmt.Sprintf("prepareVolumes error: %s", err.Error()) - d.containerStatus.Error = errMessage - log.Println(errMessage) - d.jobResult = JobResult{Reason: "EXECUTOR_ERROR", ReasonMessage: errMessage} - return tracerr.Wrap(err) + log.Error(ctx, err.Error()) } - - d.containerStatus = ContainerStatus{ - ContainerName: cfg.ContainerName, + netAddresses, err := host.GetNetworkAddresses(ctx) + if err != nil { + log.Error(ctx, err.Error()) } - d.executorError = "" - - pullCtx, cancel := context.WithTimeout(ctx, ImagePullTimeout) - defer cancel() - d.cancelPull = cancel + diskSize, err := host.GetDiskSize(ctx, d.dockerInfo.DockerRootDir) + if err != nil { + log.Error(ctx, err.Error()) + } + return Resources{ + Gpus: d.gpus, + CpuCount: cpuCount, + TotalMemory: totalMemory, + DiskSize: diskSize, + NetAddresses: netAddresses, + } +} - log.Println("Pulling image") - d.state = Pulling - if err = pullImage(pullCtx, d.client, cfg); err != nil { - d.state = Pending - errMessage := fmt.Sprintf("pullImage error: %s", err.Error()) - d.containerStatus.Error = errMessage - log.Print(errMessage + "\n") - d.jobResult = JobResult{Reason: "CREATING_CONTAINER_ERROR", ReasonMessage: errMessage} - return tracerr.Wrap(err) +func (d *DockerRunner) TaskList() []*TaskListItem { + tasks := d.tasks.List() + result := make([]*TaskListItem, 0, len(tasks)) + for _, task := range tasks { + result = append(result, &TaskListItem{ID: task.ID, Status: task.Status}) } + return result +} - runnerDir, err := d.dockerParams.MakeRunnerDir() - if err != nil { - d.state = Pending - errMessage := fmt.Sprintf("Cannot create dir for runner: %s", err.Error()) - d.containerStatus.Error = errMessage - log.Print(errMessage + "\n") - d.jobResult = JobResult{Reason: "CREATING_CONTAINER_ERROR", ReasonMessage: errMessage} - return tracerr.Wrap(err) +func (d *DockerRunner) TaskInfo(taskID string) TaskInfo { 
+ task, ok := d.tasks.Get(taskID) + if !ok { + return TaskInfo{} } + return TaskInfo{ + ID: task.ID, + Status: task.Status, + TerminationReason: task.TerminationReason, + TerminationMessage: task.TerminationMessage, + Ports: task.ports, + ContainerName: task.containerName, + ContainerID: task.containerID, + GpuIDs: task.gpuIDs, + ImagePullProgress: task.pullTracker.Progress(), + } +} - log.Println("Creating container") - d.state = Creating - containerID, err := createContainer(ctx, d.client, runnerDir, d.dockerParams, cfg) - if err != nil { - d.state = Pending - errMessage := fmt.Sprintf("createContainer error: %s", err.Error()) - d.containerStatus.Error = errMessage - d.jobResult = JobResult{Reason: "CREATING_CONTAINER_ERROR", ReasonMessage: errMessage} - log.Print(errMessage + "\n") - return tracerr.Wrap(err) +func (d *DockerRunner) Submit(ctx context.Context, cfg TaskConfig) error { + task := NewTaskFromConfig(cfg) + if ok := d.tasks.Add(task); !ok { + return fmt.Errorf("%w: task %s is already submitted", ErrRequest, task.ID) } + log.Debug(ctx, "new task submitted", "task", task.ID) + return nil +} - if !d.dockerParams.DockerKeepContainer() { - defer func() { - log.Println("Deleting container") - err := d.client.ContainerRemove(ctx, containerID, container.RemoveOptions{Force: true}) - if err != nil { - log.Printf("ContainerRemove error: %s\n", err.Error()) - } - }() +func (d *DockerRunner) Run(ctx context.Context, taskID string) error { + task, ok := d.tasks.Get(taskID) + if !ok { + log.Error(ctx, "cannot run: not found", "task", taskID) + return fmt.Errorf("task %s: %w", taskID, ErrNotFound) } - d.containerStatus, _ = inspectContainer(d.client, containerID) - d.state = Running - d.currentContainer = containerID - d.executorError = "" - log.Printf("Running container, name=%s, id=%s\n", d.containerStatus.ContainerName, containerID) + if task.Status != TaskStatusPending { + return fmt.Errorf("%w: cannot run task %s with %s status", ErrRequest, task.ID, 
task.Status) + } - if err = runContainer(ctx, d.client, containerID); err != nil { - log.Printf("runContainer error: %s\n", err.Error()) - d.state = Pending - d.containerStatus, _ = inspectContainer(d.client, containerID) - d.executorError = FindExecutorError(runnerDir) - d.currentContainer = "" - var errMessage string = d.containerStatus.Error - if d.containerStatus.OOMKilled { - errMessage = "Container killed by OOM" + defer func() { + if err := d.tasks.Update(task); err != nil { + if currentTask, ok := d.tasks.Get(task.ID); ok && currentTask.Status != task.Status { + // ignore error if task is gone or status has not changed, e.g., terminated -> terminated + log.Error(ctx, "failed to update", "task", task.ID, "err", err) + } } - d.jobResult = JobResult{Reason: "CONTAINER_EXITED_WITH_ERROR", ReasonMessage: errMessage} - return tracerr.Wrap(err) + }() + + task.SetStatusPreparing() + if err := d.tasks.Update(task); err != nil { + return fmt.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err) } - log.Printf("Container finished successfully, name=%s, id=%s", d.containerStatus.ContainerName, containerID) - d.containerStatus, _ = inspectContainer(d.client, containerID) - d.executorError = FindExecutorError(runnerDir) - d.state = Pending - d.currentContainer = "" + cfg := task.config + var err error - jobResult := JobResult{Reason: "DONE_BY_RUNNER"} - if d.containerStatus.ExitCode != 0 { - jobResult = JobResult{Reason: "CONTAINER_EXITED_WITH_ERROR", ReasonMessage: d.containerStatus.Error} + runnerDir, err := d.dockerParams.MakeRunnerDir(task.containerName) + if err != nil { + return fmt.Errorf("make runner dir: %w", err) } - d.jobResult = jobResult + task.runnerDir = runnerDir + log.Debug(ctx, "runner dir", "task", task.ID, "path", runnerDir) - return nil -} + if cfg.GPU != 0 { + gpuIDs, err := d.gpuLock.Acquire(ctx, cfg.GPU) + if err != nil { + log.Error(ctx, err.Error()) + task.SetStatusTerminated(string(types.TerminationReasonExecutorError), 
err.Error()) + return fmt.Errorf("acquire GPU: %w", err) + } + task.gpuIDs = gpuIDs + log.Debug(ctx, "acquired GPU(s)", "task", task.ID, "gpus", gpuIDs) -func (d *DockerRunner) Stop(force bool) { - if d.state == Pulling && d.currentContainer == "" { - d.cancelPull() - return + defer func() { + releasedGpuIDs := d.gpuLock.Release(ctx, task.gpuIDs) + log.Debug(ctx, "released GPU(s)", "task", task.ID, "gpus", releasedGpuIDs) + }() + } else { + task.gpuIDs = []string{} } - stopOptions := container.StopOptions{} - if force { - timeout := int(0) - stopOptions.Timeout = &timeout + if len(cfg.HostSshKeys) > 0 { + ak := AuthorizedKeys{user: cfg.HostSshUser, lookup: user.Lookup} + if err := ak.AppendPublicKeys(cfg.HostSshKeys); err != nil { + errMessage := fmt.Sprintf("ak.AppendPublicKeys error: %s", err.Error()) + log.Error(ctx, errMessage) + task.SetStatusTerminated(string(types.TerminationReasonExecutorError), errMessage) + return fmt.Errorf("append public keys: %w", err) + } + defer func(cfg TaskConfig) { + err := ak.RemovePublicKeys(cfg.HostSshKeys) + if err != nil { + log.Error(ctx, "Error RemovePublicKeys", "err", err) + } + }(cfg) } - err := d.client.ContainerStop(context.Background(), d.currentContainer, stopOptions) + log.Debug(ctx, "Preparing volumes") + // defer unmountVolumes() before calling prepareVolumes(), as the latter + // may fail when some volumes are already mounted; if the volume is not mounted, + // unmountVolumes() simply skips it + defer func() { _ = unmountVolumes(ctx, cfg) }() + err = prepareVolumes(ctx, cfg) if err != nil { - log.Printf("Failed to stop container: %s", err) + errMessage := fmt.Sprintf("prepareVolumes error: %s", err.Error()) + log.Error(ctx, errMessage) + task.SetStatusTerminated(string(types.TerminationReasonExecutorError), errMessage) + return fmt.Errorf("prepare volumes: %w", err) + } + err = prepareInstanceMountPoints(cfg) + if err != nil { + errMessage := fmt.Sprintf("prepareInstanceMountPoints error: %s", err.Error()) + 
log.Error(ctx, errMessage) + task.SetStatusTerminated(string(types.TerminationReasonExecutorError), errMessage) + return fmt.Errorf("prepare instance mount points: %w", err) } -} -func (d DockerRunner) GetState() (RunnerStatus, ContainerStatus, string, JobResult) { - return d.state, d.containerStatus, d.executorError, d.jobResult -} + log.Debug(ctx, "Pulling image") + pullCtx, cancelPull := context.WithTimeout(ctx, ImagePullTimeout) + defer cancelPull() + task.SetStatusPulling(cancelPull) + if err := d.tasks.Update(task); err != nil { + return fmt.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err) + } + // Although it's called "runner dir", we also use it for shim task-related data. + // Maybe we should rename it to "task dir" (including the `/root/.dstack/runners` dir on the host). + pullLogPath := filepath.Join(runnerDir, "pull.log") + if err = pullImage(pullCtx, d.client, cfg, pullLogPath, task.pullTracker); err != nil { + errMessage := fmt.Sprintf("pullImage error: %s", err.Error()) + log.Error(ctx, errMessage) + task.SetStatusTerminated(string(types.TerminationReasonCreatingContainerError), errMessage) + return fmt.Errorf("pull image: %w", err) + } -func prepareVolumes(taskConfig TaskConfig) error { - for _, volume := range taskConfig.Volumes { - err := formatAndMountVolume(volume) - if err != nil { - return tracerr.Wrap(err) - } + log.Debug(ctx, "Creating container", "task", task.ID, "name", task.containerName) + task.SetStatusCreating() + if err := d.tasks.Update(task); err != nil { + return fmt.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err) + } + if err := d.createContainer(ctx, &task); err != nil { + errMessage := fmt.Sprintf("createContainer error: %s", err.Error()) + log.Error(ctx, errMessage) + task.SetStatusTerminated(string(types.TerminationReasonCreatingContainerError), errMessage) + return fmt.Errorf("create container: %w", err) } - return nil -} -func formatAndMountVolume(volume VolumeInfo) error { - 
deviceName, err := getRealDeviceName(volume.VolumeId) - if err != nil { - return tracerr.Wrap(err) + log.Debug(ctx, "Running container", "task", task.ID, "name", task.containerName) + task.SetStatusRunning() + if err := d.tasks.Update(task); err != nil { + return fmt.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err) } - _, err = initFileSystem(deviceName, !volume.InitFs) - if err != nil { - return tracerr.Wrap(err) + err = d.startContainer(ctx, &task) + if err == nil { + // startContainer sets `ports` field, committing update + if err := d.tasks.Update(task); err != nil { + return fmt.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err) + } + err = d.waitContainer(ctx, &task) } - err = mountDisk(deviceName, getVolumeMountPoint(volume.Name)) if err != nil { - return tracerr.Wrap(err) + log.Error(ctx, "failed to run container", "err", err) + var errMessage string + if lastLogs, err := getContainerLastLogs(ctx, d.client, task.containerID, 5); err == nil { + errMessage = strings.Join(lastLogs, "\n") + } else { + log.Error(ctx, "getContainerLastLogs error", "err", err) + errMessage = "" + } + task.SetStatusTerminated(string(types.TerminationReasonContainerExitedWithError), errMessage) + return fmt.Errorf("wait container: %w", err) } + + log.Debug(ctx, "Container finished successfully", "task", task.ID, "name", task.containerName) + task.SetStatusTerminated(string(types.TerminationReasonDoneByRunner), "") + return nil } -func getVolumeMountPoint(volumeName string) string { - // Put volumes in data-specific dir to avoid clashes with host dirs - return fmt.Sprintf("/dstack-volumes/%s", volumeName) -} - -// getRealDeviceName returns the device name for the given EBS volume ID. -// The device name on instance can be different from device name specified in block-device mapping -// (e.g. NVMe block devices built on the Nitro System). 
-func getRealDeviceName(volumeID string) (string, error) { - // Run the lsblk command to get block device information - // TODO: On AWS SERIAL contains volume id. This may not be true for other clouds. - cmd := exec.Command("lsblk", "-o", "NAME,SERIAL") - var out bytes.Buffer - cmd.Stdout = &out - if err := cmd.Run(); err != nil { - return "", fmt.Errorf("failed to list block devices: %v", err) - } - - // Parse the output to find the device that matches the volume ID - lines := strings.Split(out.String(), "\n") - for _, line := range lines { - fields := strings.Fields(line) - if len(fields) == 2 && strings.HasPrefix(fields[1], "vol") { - serial := strings.TrimPrefix(fields[1], "vol") - if "vol-"+serial == volumeID { - return "/dev/" + fields[0], nil - } - } +// Terminate aborts running operations (pulling an image, running a container) and sets task status to terminated +// Associated resources (container, logs, etc.) are not destroyed, use Remove() for cleanup +func (d *DockerRunner) Terminate(ctx context.Context, taskID string, timeout uint, reason string, message string) (err error) { + task, ok := d.tasks.Get(taskID) + if !ok { + log.Error(ctx, "cannot terminate task: not found", "task", taskID) + return fmt.Errorf("task %s: %w", taskID, ErrNotFound) } - - return "", fmt.Errorf("volume %s not found among block devices", volumeID) + task.Lock(ctx) + defer func() { task.Release(ctx) }() + defer func() { + if err := d.tasks.Update(task); err != nil { + log.Error(ctx, "failed to update task", "task", task.ID, "err", err) + } + }() + return d.terminate(ctx, &task, timeout, reason, message) } -// initFileSystem creates an ext4 file system on a disk only if the disk is not already has a file system. -// Returns true if the file system is created. 
-func initFileSystem(deviceName string, errorIfNotExists bool) (bool, error) { - // Run the lsblk command to get filesystem type - cmd := exec.Command("lsblk", "-no", "FSTYPE", deviceName) - var out bytes.Buffer - cmd.Stdout = &out - if err := cmd.Run(); err != nil { - return false, fmt.Errorf("failed to check if disk is formatted: %v", err) +func (d *DockerRunner) terminate(ctx context.Context, task *Task, timeout uint, reason string, message string) (err error) { + log.Debug(ctx, "terminating", "task", task.ID) + defer func() { + if err != nil { + log.Error(ctx, "cannot terminate task", "task", task.ID, "err", err) + } + }() + if !task.IsTransitionAllowed(TaskStatusTerminated) { + return fmt.Errorf("%w: cannot terminate task %s with %s status", ErrRequest, task.ID, task.Status) } - - // If the output is not empty, the disk is already formatted - fsType := strings.TrimSpace(out.String()) - if fsType != "" { - return false, nil + switch task.Status { + case TaskStatusPending, TaskStatusPreparing, TaskStatusCreating, TaskStatusTerminated: + // nothing to do + case TaskStatusPulling: + task.cancelPull() + case TaskStatusRunning: + stopOptions := container.StopOptions{} + timeout := int(timeout) + stopOptions.Timeout = &timeout + if err := d.client.ContainerStop(ctx, task.containerID, stopOptions); err != nil { + return fmt.Errorf("%w: failed to stop container: %w", ErrInternal, err) + } + default: + return fmt.Errorf("%w: should not reach here", ErrInternal) } - - if errorIfNotExists { - return false, fmt.Errorf("disk has no file system") + if len(task.gpuIDs) > 0 { + releasedGpuIDs := d.gpuLock.Release(ctx, task.gpuIDs) + log.Debug(ctx, "released GPU(s)", "task", task.ID, "gpus", releasedGpuIDs) } + task.SetStatusTerminated(reason, message) + log.Debug(ctx, "terminated", "task", task.ID) + return nil +} - log.Printf("Formatting disk %s with ext4 filesystem...\n", deviceName) - cmd = exec.Command("mkfs.ext4", "-F", deviceName) - if output, err := 
cmd.CombinedOutput(); err != nil { - return false, fmt.Errorf("failed to format disk: %s, output: %s", err, string(output)) +// Remove destroys resources associated with task (container, logs, etc.), if any +// On success, it also removes the task from TaskStorage +func (d *DockerRunner) Remove(ctx context.Context, taskID string) error { + task, ok := d.tasks.Get(taskID) + if !ok { + log.Error(ctx, "cannot remove: not found", "task", taskID) + return fmt.Errorf("task %s: %w", taskID, ErrNotFound) } - log.Println("Disk formatted succesfully!") - return true, nil + task.Lock(ctx) + defer func() { task.Release(ctx) }() + err := d.remove(ctx, &task) + if err == nil { + d.tasks.Delete(taskID) + } + return err } -func mountDisk(deviceName, mountPoint string) error { - // Create the mount point directory if it doesn't exist - if _, err := os.Stat(mountPoint); os.IsNotExist(err) { - fmt.Printf("Creating mount point %s...\n", mountPoint) - if err := os.MkdirAll(mountPoint, 0755); err != nil { - return fmt.Errorf("failed to create mount point: %s", err) +func (d *DockerRunner) remove(ctx context.Context, task *Task) (err error) { + log.Debug(ctx, "removing", "task", task.ID) + defer func() { + if err != nil { + log.Error(ctx, "cannot remove", "task", task.ID, "err", err) } + }() + if task.Status != TaskStatusTerminated { + return fmt.Errorf("%w: cannot remove task %s with %s status", ErrRequest, task.ID, task.Status) } - - // Mount the disk to the mount point - log.Printf("Mounting disk %s to %s...\n", deviceName, mountPoint) - cmd := exec.Command("mount", deviceName, mountPoint) - if output, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("failed to mount disk: %s, output: %s", err, string(output)) + removeOptions := container.RemoveOptions{Force: true, RemoveVolumes: true} + // Normally, it should not be empty + if task.containerID != "" { + err := d.client.ContainerRemove(ctx, task.containerID, removeOptions) + if err != nil { + if errdefs.IsNotFound(err) { 
+ log.Error(ctx, "cannot remove container: not found", "task", task.ID) + } else { + return fmt.Errorf("%w: failed to remove container task=%s: %w", ErrInternal, task.ID, err) + } + } } - - log.Println("Disk mounted successfully!") + // Normally, it should not be empty + if task.runnerDir != "" { + // Failed attempts to remove or rename runner dir are considered non-fatal + if err := os.RemoveAll(task.runnerDir); err != nil { + log.Error(ctx, "failed to remove runner directory", "dir", task.runnerDir, "err", err) + trashName := fmt.Sprintf(".trash-%s-%d", task.runnerDir, time.Now().UnixMicro()) + if err := os.Rename(task.runnerDir, trashName); err != nil { + log.Error(ctx, "failed to rename runner directory", "dir", task.runnerDir, "err", err) + } + } + } + log.Debug(ctx, "removed", "task", task.ID) return nil } -func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConfig) error { +func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConfig, logPath string, tracker *PullTracker) error { if !strings.Contains(taskConfig.ImageName, ":") { taskConfig.ImageName += ":latest" } @@ -334,7 +618,7 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf Filters: filters.NewArgs(filters.Arg("reference", taskConfig.ImageName)), }) if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("list images: %w", err) } // TODO: force pull latset @@ -343,7 +627,10 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf } opts := image.PullOptions{} - regAuth, _ := taskConfig.EncodeRegistryAuth() + regAuth, err := encodeRegistryAuth(taskConfig.RegistryUsername, taskConfig.RegistryPassword) + if err != nil { + log.Error(ctx, err.Error()) + } if regAuth != "" { opts.RegistryAuth = regAuth } @@ -351,171 +638,224 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf startTime := time.Now() reader, err := client.ImagePull(ctx, taskConfig.ImageName, opts) 
if err != nil { - return tracerr.Wrap(err) + return fmt.Errorf("pull image: %w", err) } defer func() { _ = reader.Close() }() - current := make(map[string]uint) - total := make(map[string]uint) - - type ProgressDetail struct { - Current uint `json:"current"` - Total uint `json:"total"` - } - type Progress struct { - Id string `json:"id"` - Status string `json:"status"` - ProgressDetail ProgressDetail `json:"progressDetail"` //nolint:tagliatelle + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open pull log file: %w", err) } + defer logFile.Close() - var status bool + teeReader := io.TeeReader(reader, logFile) - scanner := bufio.NewScanner(reader) + var pullCompleted bool + pullErrors := make([]string, 0) + + scanner := bufio.NewScanner(teeReader) for scanner.Scan() { line := scanner.Bytes() - var progressRow Progress - if err := json.Unmarshal(line, &progressRow); err != nil { + var pullMessage PullMessage + if err := json.Unmarshal(line, &pullMessage); err != nil { continue } - if progressRow.Status == "Downloading" { - current[progressRow.Id] = progressRow.ProgressDetail.Current - total[progressRow.Id] = progressRow.ProgressDetail.Total - } - if progressRow.Status == "Download complete" { - current[progressRow.Id] = total[progressRow.Id] + tracker.Update(pullMessage) + if pullMessage.ErrorDetail.Message != "" { + log.Error(ctx, "error pulling image", "name", taskConfig.ImageName, "err", pullMessage.ErrorDetail.Message) + pullErrors = append(pullErrors, pullMessage.ErrorDetail.Message) } - if strings.HasPrefix(progressRow.Status, "Status:") { - status = true - log.Println(progressRow.Status) + // If the pull is successful, the last two entries must be: + // "Digest: sha256:" + // "Status: " + // where is either "Downloaded newer image for " or "Image is up to date for ". 
+ // See: https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/daemon/containerd/image_pull.go#L134-L152 + // See: https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/daemon/containerd/image_pull.go#L257-L263 + if strings.HasPrefix(pullMessage.Status, "Status:") { + pullCompleted = true + log.Debug(ctx, pullMessage.Status) } } duration := time.Since(startTime) - - var currentBytes uint - var totalBytes uint - for _, v := range current { - currentBytes += v + p := tracker.Progress() + var currentBytes, totalBytes uint64 + if p != nil { + currentBytes, totalBytes = p.DownloadedBytes, p.TotalBytes } - for _, v := range total { - totalBytes += v + speed := bytesize.New(float64(currentBytes) / duration.Seconds()) + + if err := ctx.Err(); err != nil { + return fmt.Errorf("image pull interrupted: downloaded %d bytes out of %d (%s/s): %w", currentBytes, totalBytes, speed, err) } - speed := bytesize.New(float64(currentBytes) / duration.Seconds()) - if status && currentBytes == totalBytes { - log.Printf("Image Pull successfully downloaded: %d bytes (%s/s)", currentBytes, speed) + if pullCompleted { + log.Debug(ctx, "image successfully pulled", "bytes", currentBytes, "bps", speed) } else { - log.Printf("Image Pull interrupted: downloaded %d bytes out of %d (%s/s)", currentBytes, totalBytes, speed) + return fmt.Errorf( + "failed pulling %s: downloaded %d/%d bytes (%s/s), errors: %q", + taskConfig.ImageName, currentBytes, totalBytes, speed, pullErrors, + ) } - err = ctx.Err() - if err != nil { - return tracerr.Errorf("imagepull interrupted: downloaded %d bytes out of %d (%s/s): %w", currentBytes, totalBytes, speed, err) - } return nil } -func createContainer(ctx context.Context, client docker.APIClient, runnerDir string, dockerParams DockerParameters, taskConfig TaskConfig) (string, error) { - timeout := int(0) - stopOptions := container.StopOptions{Timeout: &timeout} - err := 
client.ContainerStop(ctx, taskConfig.ContainerName, stopOptions) +func (d *DockerRunner) createContainer(ctx context.Context, task *Task) error { + mounts, err := d.dockerParams.DockerMounts(task.runnerDir) if err != nil { - log.Printf("Cleanup routine: Cannot stop container: %s", err) + return fmt.Errorf("get docker mounts: %w", err) } - - removeOptions := container.RemoveOptions{Force: true} - err = client.ContainerRemove(ctx, taskConfig.ContainerName, removeOptions) + volumeMounts, err := getVolumeMounts(task.config.VolumeMounts) if err != nil { - log.Printf("Cleanup routine: Cannot remove container: %s", err) + return fmt.Errorf("get volume mounts: %w", err) } - - gpuRequest, err := requestGpuIfAvailable(ctx, client) + mounts = append(mounts, volumeMounts...) + instanceMounts, err := getInstanceMounts(task.config.InstanceMounts) if err != nil { - return "", tracerr.Wrap(err) + return fmt.Errorf("get instance mounts: %w", err) } - mounts, err := dockerParams.DockerMounts(runnerDir) - if err != nil { - return "", tracerr.Wrap(err) + mounts = append(mounts, instanceMounts...) + + // Set the environment variables + envVars := []string{} + envVars = append(envVars, d.baseEnv...) + if pjrtDevice := d.dockerParams.DockerPJRTDevice(); pjrtDevice != "" { + envVars = append(envVars, fmt.Sprintf("PJRT_DEVICE=%s", pjrtDevice)) } - volumeMounts, err := getVolumeMounts(taskConfig.Mounts) - if err != nil { - return "", tracerr.Wrap(err) + + // Override /dev/shm with tmpfs mount with `exec` option (the default is `noexec`) + // if ShmSize is specified (i.e. not zero, which is the default value). + // This is required by some workloads, e.g., Oracle Database with Java Stored Procedures, + // see https://github.com/moby/moby/issues/6758 + var tmpfs map[string]string + if task.config.ShmSize > 0 { + // No need to specify all default options (`nosuid`, etc.), + // the docker daemon will merge our options with the defaults. 
+ tmpfs = map[string]string{ + "/dev/shm": fmt.Sprintf("exec,size=%d", task.config.ShmSize), + } } - mounts = append(mounts, volumeMounts...) - //Set the environment variables - envVars := []string{} - if dockerParams.DockerPJRTDevice() != "" { - envVars = append(envVars, fmt.Sprintf("PJRT_DEVICE=%s", dockerParams.DockerPJRTDevice())) + networkMode := getNetworkMode(task.config.NetworkMode) + ports := d.dockerParams.DockerPorts() + + // Bridge mode - all interfaces + runnerHttpAddress := "" + if networkMode.IsHost() { + runnerHttpAddress = "localhost" } + shellCommands := d.dockerParams.DockerShellCommands(task.config.ContainerSshKeys, runnerHttpAddress) containerConfig := &container.Config{ - Image: taskConfig.ImageName, - Cmd: []string{strings.Join(dockerParams.DockerShellCommands(taskConfig.PublicKeys), " && ")}, + Image: task.config.ImageName, + Cmd: []string{strings.Join(shellCommands, " && ")}, Entrypoint: []string{"/bin/sh", "-c"}, - ExposedPorts: exposePorts(dockerParams.DockerPorts()...), + ExposedPorts: exposePorts(ports), Env: envVars, + Labels: map[string]string{ + LabelKeyIsTask: LabelValueTrue, + LabelKeyTaskID: task.ID, + }, + } + if task.config.ContainerUser != "" { + containerConfig.User = task.config.ContainerUser } hostConfig := &container.HostConfig{ - Privileged: dockerParams.DockerPrivileged(), - NetworkMode: getNetworkMode(), - PortBindings: bindPorts(dockerParams.DockerPorts()...), - PublishAllPorts: true, - Sysctls: map[string]string{}, - Resources: container.Resources{ - DeviceRequests: gpuRequest, - }, - Mounts: mounts, - ShmSize: taskConfig.ShmSize, + Privileged: task.config.Privileged || d.dockerParams.DockerPrivileged(), + NetworkMode: networkMode, + PortBindings: bindPorts(ports), + Mounts: mounts, + ShmSize: task.config.ShmSize, + Tmpfs: tmpfs, + } + hostConfig.NanoCPUs = int64(task.config.CPU * 1000000000) + hostConfig.Memory = task.config.Memory + if len(task.gpuIDs) > 0 { + if len(task.config.GPUDevices) > 0 { + 
configureGpuDevices(hostConfig, task.config.GPUDevices) + } else { + configureGpus(containerConfig, hostConfig, d.gpuVendor, task.gpuIDs) + } } + configureHpcNetworkingIfAvailable(hostConfig) - log.Printf("Creating container %s:\nconfig: %v\nhostConfig:%v", taskConfig.ContainerName, containerConfig, hostConfig) - resp, err := client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, taskConfig.ContainerName) + resp, err := d.client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, task.containerName) if err != nil { - return "", tracerr.Wrap(err) + return fmt.Errorf("create container: %w", err) } - return resp.ID, nil + task.containerID = resp.ID + return nil } -func runContainer(ctx context.Context, client docker.APIClient, containerID string) error { - if err := client.ContainerStart(ctx, containerID, container.StartOptions{}); err != nil { - return tracerr.Wrap(err) +func (d *DockerRunner) startContainer(ctx context.Context, task *Task) error { + if err := d.client.ContainerStart(ctx, task.containerID, container.StartOptions{}); err != nil { + return fmt.Errorf("start container: %w", err) } + if getNetworkMode(task.config.NetworkMode).IsHost() { + task.ports = []PortMapping{} + return nil + } + container_, err := d.client.ContainerInspect(ctx, task.containerID) + if err != nil { + return fmt.Errorf("inspect container: %w", err) + } + task.ports = extractPorts(ctx, container_.NetworkSettings.Ports) + return nil +} - waitCh, errorCh := client.ContainerWait(ctx, containerID, "") +func (d *DockerRunner) waitContainer(ctx context.Context, task *Task) error { + waitCh, errorCh := d.client.ContainerWait(ctx, task.containerID, "") select { - case <-waitCh: + case waitResp := <-waitCh: + { + if waitResp.StatusCode != 0 { + return fmt.Errorf("container exited with exit code %d", waitResp.StatusCode) + } + } case err := <-errorCh: - return tracerr.Wrap(err) + return fmt.Errorf("wait for container: %w", err) } - return nil } -func 
getSSHShellCommands(openSSHPort int, publicSSHKey string) []string { +func encodeRegistryAuth(username string, password string) (string, error) { + if username == "" && password == "" { + return "", nil + } + + authConfig := registry.AuthConfig{ + Username: username, + Password: password, + } + + encodedConfig, err := json.Marshal(authConfig) + if err != nil { + return "", fmt.Errorf("failed to encode auth config: %w", err) + } + + return base64.URLEncoding.EncodeToString(encodedConfig), nil +} + +func getSSHShellCommands() []string { return []string{ - // note: &> redirection doesn't work in /bin/sh + `( :`, + // See https://github.com/dstackai/dstack/issues/1769 + `unset LD_LIBRARY_PATH && unset LD_PRELOAD`, + // common functions + `exists() { command -v "$1" > /dev/null 2>&1; }`, + // package manager detection/abstraction + `install_pkg() { NAME=Distribution; test -f /etc/os-release && . /etc/os-release; echo $NAME not supported; exit 11; }`, + `if exists apt-get; then install_pkg() { apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y "$1"; }; fi`, + `if exists yum; then install_pkg() { yum install -y "$1"; }; fi`, + `if exists apk; then install_pkg() { apk add -U "$1"; }; fi`, // check in sshd is here, install if not - "if ! 
command -v sshd >/dev/null 2>&1; then { apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server; } || { yum -y install openssh-server; }; fi", - // prohibit password authentication - "sed -i \"s/.*PasswordAuthentication.*/PasswordAuthentication no/g\" /etc/ssh/sshd_config", - // create ssh dirs and add public key - "mkdir -p /run/sshd ~/.ssh", - "chmod 700 ~/.ssh", - fmt.Sprintf("echo '%s' > ~/.ssh/authorized_keys", publicSSHKey), - "chmod 600 ~/.ssh/authorized_keys", - // preserve environment variables for SSH clients - "env >> ~/.ssh/environment", - "sed -ie '1s@^@export PATH=\"'\"$PATH\"':$PATH\"\\n\\n@' ~/.profile", - // regenerate host keys - "rm -rf /etc/ssh/ssh_host_*", - "ssh-keygen -A > /dev/null", - // start sshd - fmt.Sprintf("/usr/sbin/sshd -p %d -o PermitUserEnvironment=yes", openSSHPort), - } -} - -func exposePorts(ports ...int) nat.PortSet { + `if ! exists sshd; then install_pkg openssh-server; fi`, + `: )`, + } +} + +func exposePorts(ports []int) nat.PortSet { portSet := make(nat.PortSet) for _, port := range ports { portSet[nat.Port(fmt.Sprintf("%d/tcp", port))] = struct{}{} @@ -524,44 +864,181 @@ func exposePorts(ports ...int) nat.PortSet { } // bindPorts does identity mapping only -func bindPorts(ports ...int) nat.PortMap { +func bindPorts(ports []int) nat.PortMap { portMap := make(nat.PortMap) for _, port := range ports { portMap[nat.Port(fmt.Sprintf("%d/tcp", port))] = []nat.PortBinding{ { - HostIP: "0.0.0.0", - HostPort: strconv.Itoa(port), + HostIP: "127.0.0.1", + HostPort: "", // use ephemeral port from ip_local_port_range }, } } return portMap } -func getNetworkMode() container.NetworkMode { - if rt.GOOS == "linux" { - return "host" +func extractPorts(ctx context.Context, portMap nat.PortMap) []PortMapping { + ports := make([]PortMapping, 0, len(portMap)) + for containerPortWithProto, bindings := range portMap { + // 8080/tcp -> ["8080", "tcp"] + containerPortParts := strings.Split(string(containerPortWithProto), 
"/") + if len(containerPortParts) != 2 { + log.Error(ctx, "unexpected container port format", "port", containerPortWithProto) + continue + } + if containerPortParts[1] != "tcp" { + continue + } + containerPort, err := strconv.Atoi(containerPortParts[0]) + if err != nil { + log.Error(ctx, "failed to parse container port", "port", containerPortWithProto) + continue + } + for _, binding := range bindings { + // skip IPv6 + if strings.Contains(binding.HostIP, ":") { + continue + } + hostPort, err := strconv.Atoi(binding.HostPort) + if err != nil { + log.Error(ctx, "failed to parse host port", "port", binding.HostPort) + continue + } + ports = append(ports, PortMapping{ + Host: hostPort, + Container: containerPort, + }) + } } - return "default" + return ports } -func requestGpuIfAvailable(ctx context.Context, client docker.APIClient) ([]container.DeviceRequest, error) { - info, err := client.Info(ctx) - if err != nil { - return nil, tracerr.Wrap(err) +func getNetworkMode(networkMode NetworkMode) container.NetworkMode { + return container.NetworkMode(networkMode) +} + +func configureGpuDevices(hostConfig *container.HostConfig, gpuDevices []GPUDevice) { + for _, gpuDevice := range gpuDevices { + hostConfig.Devices = append( + hostConfig.Devices, + container.DeviceMapping{ + PathOnHost: gpuDevice.PathOnHost, + PathInContainer: gpuDevice.PathInContainer, + CgroupPermissions: "rwm", + }, + ) } +} - for runtime := range info.Runtimes { - if runtime == consts.NVIDIA_RUNTIME { - return []container.DeviceRequest{ - {Capabilities: [][]string{{"gpu"}}, Count: -1}, // --gpus=all - }, nil +func configureGpus(config *container.Config, hostConfig *container.HostConfig, vendor gpu.GpuVendor, ids []string) { + // NVIDIA: ids are identifiers reported by nvidia-smi, GPU- strings + // AMD: ids are DRI render node paths, e.g., /dev/dri/renderD128 + // Tenstorrent: ids are device indices to be used with /dev/tenstorrent/ + switch vendor { + case gpu.GpuVendorNvidia: + 
hostConfig.DeviceRequests = append( + hostConfig.DeviceRequests, + container.DeviceRequest{ + // Request all capabilities to maximize compatibility with all sorts of GPU workloads. + // Default capabilities: utility, compute. + // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html + Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}}, + DeviceIDs: ids, + }, + ) + case gpu.GpuVendorAmd: + // All options are listed here: https://hub.docker.com/r/rocm/pytorch + // Only --device are mandatory, other seem to be performance-related. + // --device=/dev/kfd + hostConfig.Devices = append( + hostConfig.Devices, + container.DeviceMapping{ + PathOnHost: "/dev/kfd", + PathInContainer: "/dev/kfd", + CgroupPermissions: "rwm", + }, + ) + // --device=/dev/dri/renderD + for _, renderNodePath := range ids { + hostConfig.Devices = append( + hostConfig.Devices, + container.DeviceMapping{ + PathOnHost: renderNodePath, + PathInContainer: renderNodePath, + CgroupPermissions: "rwm", + }, + ) } + // --ipc=host + hostConfig.IpcMode = container.IPCModeHost + // --cap-add=SYS_PTRACE + hostConfig.CapAdd = append(hostConfig.CapAdd, "SYS_PTRACE") + // --security-opt=seccomp=unconfined + hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, "seccomp=unconfined") + // TODO: in addition, for non-root user, --group-add=video, and possibly --group-add=render, are required. 
+ case gpu.GpuVendorTenstorrent: + // For Tenstorrent, simply add each device + for _, id := range ids { + devicePath := fmt.Sprintf("/dev/tenstorrent/%s", id) + hostConfig.Devices = append( + hostConfig.Devices, + container.DeviceMapping{ + PathOnHost: devicePath, + PathInContainer: devicePath, + CgroupPermissions: "rwm", + }, + ) + } + // Check and mount hugepages-1G if it exists + if _, err := os.Stat("/dev/hugepages-1G"); err == nil { + hostConfig.Mounts = append(hostConfig.Mounts, mount.Mount{ + Type: mount.TypeBind, + Source: "/dev/hugepages-1G", + Target: "/dev/hugepages-1G", + }) + } + case gpu.GpuVendorIntel: + // All options are listed here: + // https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html + // --runtime=habana + hostConfig.Runtime = "habana" + // --ipc=host + hostConfig.IpcMode = container.IPCModeHost + // --cap-add=SYS_NICE + hostConfig.CapAdd = append(hostConfig.CapAdd, "SYS_NICE") + // -e HABANA_VISIBLE_DEVICES=0,1,... + config.Env = append(config.Env, fmt.Sprintf("HABANA_VISIBLE_DEVICES=%s", strings.Join(ids, ","))) + case gpu.GpuVendorNone: + // nothing to do } +} - return nil, nil +func configureHpcNetworkingIfAvailable(hostConfig *container.HostConfig) { + // Although AWS EFA is not InfiniBand, EFA adapters are exposed as /dev/infiniband/uverbsN (N=0,1,...) + if _, err := os.Stat("/dev/infiniband"); !errors.Is(err, os.ErrNotExist) { + hostConfig.Devices = append( + hostConfig.Devices, + container.DeviceMapping{ + PathOnHost: "/dev/infiniband", + PathInContainer: "/dev/infiniband", + CgroupPermissions: "rwm", + }, + ) + // Set max locked memory (ulimit -l) to unlimited. Fixes "Libfabric error: (-12) Cannot allocate memory". 
+ // See: https://github.com/ofiwg/libfabric/issues/6437 + // See: https://aws.amazon.com/blogs/compute/leveraging-efa-to-run-hpc-and-ml-workloads-on-aws-batch/ + hostConfig.Ulimits = append( + hostConfig.Ulimits, + &units.Ulimit{ + Name: "memlock", + Soft: -1, + Hard: -1, + }, + ) + } } -func getVolumeMounts(mountPoints []MountPoint) ([]mount.Mount, error) { +func getVolumeMounts(mountPoints []VolumeMountPoint) ([]mount.Mount, error) { mounts := []mount.Mount{} for _, mountPoint := range mountPoints { source := getVolumeMountPoint(mountPoint.Name) @@ -570,104 +1047,110 @@ func getVolumeMounts(mountPoints []MountPoint) ([]mount.Mount, error) { return mounts, nil } +func getInstanceMounts(mountPoints []InstanceMountPoint) ([]mount.Mount, error) { + mounts := []mount.Mount{} + for _, mountPoint := range mountPoints { + mounts = append(mounts, mount.Mount{Type: mount.TypeBind, Source: mountPoint.InstancePath, Target: mountPoint.Path}) + } + return mounts, nil +} + +func getContainerLastLogs(ctx context.Context, client docker.APIClient, containerID string, n int) ([]string, error) { + options := container.LogsOptions{ + ShowStdout: true, + ShowStderr: true, + Tail: fmt.Sprintf("%d", n), + } + + muxedReader, err := client.ContainerLogs(ctx, containerID, options) + if err != nil { + return nil, err + } + defer func() { _ = muxedReader.Close() }() + + demuxedBuffer := new(bytes.Buffer) + // Using the same Writer for both stdout and stderr should be roughly equivalent to 2>&1 + if _, err := stdcopy.StdCopy(demuxedBuffer, demuxedBuffer, muxedReader); err != nil { + return nil, err + } + + var lines []string + scanner := bufio.NewScanner(demuxedBuffer) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + if err := scanner.Err(); err != nil && !errors.Is(err, io.EOF) { + return nil, err + } + + return lines, nil +} + /* DockerParameters interface implementation for CLIArgs */ -func (c CLIArgs) 
DockerKeepContainer() bool { - return c.Docker.KeepContainer +func (c *CLIArgs) DockerPassEnv() []string { + names := []string{} + for _, name := range strings.Split(c.Docker.PassEnv, ",") { + if name = strings.TrimSpace(name); name != "" { + names = append(names, name) + } + } + return names } -func (c CLIArgs) DockerPrivileged() bool { +func (c *CLIArgs) DockerPrivileged() bool { return c.Docker.Privileged } -func (c CLIArgs) DockerPJRTDevice() string { +func (c *CLIArgs) DockerPJRTDevice() string { return c.Docker.PJRTDevice } -func (c CLIArgs) DockerShellCommands(publicKeys []string) []string { - concatinatedPublicKeys := c.Docker.ConcatinatedPublicSSHKeys - if len(publicKeys) > 0 { - concatinatedPublicKeys = strings.Join(publicKeys, "\n") +func (c *CLIArgs) DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string { + commands := getSSHShellCommands() + runnerCommand := []string{ + consts.RunnerBinaryPath, + "--log-level", c.Runner.LogLevel, + "start", + "--temp-dir", consts.RunnerTempDir, + "--http-port", strconv.Itoa(c.Runner.HTTPPort), + "--ssh-port", strconv.Itoa(c.Runner.SSHPort), + } + if runnerHttpAddress != "" { + runnerCommand = append(runnerCommand, "--http-address", runnerHttpAddress) } - commands := getSSHShellCommands(c.Docker.SSHPort, concatinatedPublicKeys) - commands = append(commands, fmt.Sprintf("%s %s", DstackRunnerBinaryName, strings.Join(c.getRunnerArgs(), " "))) - return commands + for _, key := range authorizedKeys { + runnerCommand = append(runnerCommand, "--ssh-authorized-key", fmt.Sprintf("'%s'", key)) + } + if c.Runner.SSHLogLevel != "" { + runnerCommand = append(runnerCommand, "--ssh-log-level", c.Runner.SSHLogLevel) + } + return append(commands, strings.Join(runnerCommand, " ")) } -func (c CLIArgs) DockerMounts(hostRunnerDir string) ([]mount.Mount, error) { +func (c *CLIArgs) DockerMounts(hostRunnerDir string) ([]mount.Mount, error) { return []mount.Mount{ { Type: mount.TypeBind, Source: hostRunnerDir, - 
Target: c.Runner.TempDir, + Target: consts.RunnerTempDir, }, { Type: mount.TypeBind, Source: c.Runner.BinaryPath, - Target: DstackRunnerBinaryName, + Target: consts.RunnerBinaryPath, }, }, nil } -func (c CLIArgs) DockerPorts() []int { - return []int{c.Runner.HTTPPort, c.Docker.SSHPort} +func (c *CLIArgs) DockerPorts() []int { + return []int{c.Runner.HTTPPort, c.Runner.SSHPort} } -func (c CLIArgs) MakeRunnerDir() (string, error) { - runnerTemp := filepath.Join(c.Shim.HomeDir, "runners", time.Now().Format("20060102-150405")) +func (c *CLIArgs) MakeRunnerDir(name string) (string, error) { + runnerTemp := filepath.Join(c.Shim.HomeDir, "runners", name) if err := os.MkdirAll(runnerTemp, 0o755); err != nil { - return "", tracerr.Wrap(err) + return "", fmt.Errorf("create runner directory: %w", err) } return runnerTemp, nil } - -func inspectContainer(client *docker.Client, containerID string) (ContainerStatus, error) { - inspection, err := client.ContainerInspect(context.Background(), containerID) - if err != nil { - s := ContainerStatus{} - return s, err - } - containerStatus := ContainerStatus{ - ContainerID: containerID, - ContainerName: strings.TrimLeft(inspection.Name, "/"), - Status: inspection.State.Status, - Running: inspection.State.Running, - OOMKilled: inspection.State.OOMKilled, - Dead: inspection.State.Dead, - ExitCode: inspection.State.ExitCode, - Error: inspection.State.Error, - } - return containerStatus, nil -} - -func FindExecutorError(runnerDir string) string { - filename := filepath.Join(runnerDir, consts.RunnerLogFileName) - file, err := os.Open(filename) - if err != nil { - log.Printf("Cannot open file %s: %s\n", filename, err) - return "" - } - defer file.Close() - - fileStatus, err := file.Stat() - if err != nil { - log.Printf("Cannot stat file %s: %s\n", filename, err) - return "" - } - - scanner := backscanner.New(file, int(fileStatus.Size())) - what := []byte(consts.ExecutorFailedSignature) - for { - line, _, err := scanner.LineBytes() - if err != 
nil { - if err == io.EOF { - return "" // consts.ExecutorFailedSignature is not found in file - } - log.Printf("FindExecutorError scan error: %s\n", err) - return "" - } - if bytes.Contains(line, what) { - return string(line) - } - } -} diff --git a/runner/internal/shim/docker_test.go b/runner/internal/shim/docker_test.go index 5823bdfdf1..41627da20a 100644 --- a/runner/internal/shim/docker_test.go +++ b/runner/internal/shim/docker_test.go @@ -2,12 +2,9 @@ package shim import ( "context" - "os" - "os/exec" - "strconv" - "strings" + "encoding/hex" + "math/rand" "sync" - "sync/atomic" "testing" "time" @@ -17,6 +14,7 @@ import ( ) // TestDocker_SSHServer pulls ubuntu image (without sshd), installs openssh-server and exits +// Basically, it indirectly tests a shell script generated by getSSHShellCommands func TestDocker_SSHServer(t *testing.T) { if testing.Short() { t.Skip() @@ -24,78 +22,82 @@ func TestDocker_SSHServer(t *testing.T) { t.Parallel() params := &dockerParametersMock{ - commands: []string{"echo 1"}, - sshPort: nextPort(), + commands: []string{"/usr/sbin/sshd -V 2>&1 | grep OpenSSH"}, + sshShellCommands: true, + runnerDir: t.TempDir(), } timeout := 180 // seconds - ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) + ctx, cancel := context.WithTimeout(t.Context(), time.Duration(timeout)*time.Second) defer cancel() - dockerRunner, _ := NewDockerRunner(params) - assert.NoError(t, dockerRunner.Run(ctx, TaskConfig{ImageName: "ubuntu"})) + dockerRunner, err := NewDockerRunner(ctx, params) + require.NoError(t, err) + + taskConfig := createTaskConfig(t) + defer dockerRunner.Remove(t.Context(), taskConfig.ID) + + assert.NoError(t, dockerRunner.Submit(ctx, taskConfig)) + assert.NoError(t, dockerRunner.Run(ctx, taskConfig.ID)) } -// TestDocker_SSHServerConnect pulls ubuntu image (without sshd), installs openssh-server and tries to connect via SSH -func TestDocker_SSHServerConnect(t *testing.T) { +func 
TestDocker_ShmNoexecByDefault(t *testing.T) { if testing.Short() { t.Skip() } t.Parallel() - tempDir := t.TempDir() - require.NoError(t, exec.Command("ssh-keygen", "-t", "rsa", "-b", "2048", "-f", tempDir+"/id_rsa", "-q", "-N", "").Run()) - publicBytes, err := os.ReadFile(tempDir + "/id_rsa.pub") + params := &dockerParametersMock{ + commands: []string{"mount | grep '/dev/shm .*size=65536k' | grep noexec"}, + runnerDir: t.TempDir(), + } + + timeout := 180 // seconds + ctx, cancel := context.WithTimeout(t.Context(), time.Duration(timeout)*time.Second) + defer cancel() + + dockerRunner, err := NewDockerRunner(ctx, params) require.NoError(t, err) + taskConfig := createTaskConfig(t) + defer dockerRunner.Remove(t.Context(), taskConfig.ID) + + assert.NoError(t, dockerRunner.Submit(ctx, taskConfig)) + assert.NoError(t, dockerRunner.Run(ctx, taskConfig.ID)) +} + +func TestDocker_ShmExecIfSizeSpecified(t *testing.T) { + if testing.Short() { + t.Skip() + } + t.Parallel() + params := &dockerParametersMock{ - commands: []string{"sleep 5"}, - sshPort: nextPort(), - publicSSHKey: string(publicBytes), + commands: []string{"mount | grep '/dev/shm .*size=1024k' | grep -v noexec"}, + runnerDir: t.TempDir(), } timeout := 180 // seconds - ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) + ctx, cancel := context.WithTimeout(t.Context(), time.Duration(timeout)*time.Second) defer cancel() - dockerRunner, _ := NewDockerRunner(params) - - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - assert.NoError(t, dockerRunner.Run(ctx, TaskConfig{ImageName: "ubuntu"})) - }() - - for i := 0; i < timeout; i++ { - cmd := exec.Command("ssh", - "-F", "none", - "-o", "StrictHostKeyChecking=no", - "-o", "UserKnownHostsFile=/dev/null", - "-i", tempDir+"/id_rsa", - "-p", strconv.Itoa(params.sshPort), - "root@localhost", "whoami", - ) - output, err := cmd.Output() - if err == nil { - assert.Equal(t, "root\n", string(output)) - break - } - 
time.Sleep(time.Second) // 1 attempt per second - } - wg.Wait() + dockerRunner, err := NewDockerRunner(ctx, params) + require.NoError(t, err) + + taskConfig := createTaskConfig(t) + taskConfig.ShmSize = 1024 * 1024 + defer dockerRunner.Remove(t.Context(), taskConfig.ID) + + assert.NoError(t, dockerRunner.Submit(ctx, taskConfig)) + assert.NoError(t, dockerRunner.Run(ctx, taskConfig.ID)) } /* Mocks */ type dockerParametersMock struct { - commands []string - sshPort int - publicSSHKey string -} - -func (c *dockerParametersMock) DockerKeepContainer() bool { - return false + commands []string + sshShellCommands bool + runnerDir string } func (c *dockerParametersMock) DockerPrivileged() bool { @@ -106,35 +108,189 @@ func (c *dockerParametersMock) DockerPJRTDevice() string { return "" } -func (c *dockerParametersMock) DockerShellCommands(publicKeys []string) []string { - userPublicKey := c.publicSSHKey - if len(publicKeys) > 0 { - userPublicKey = strings.Join(publicKeys, "\n") - } +func (c *dockerParametersMock) DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string { commands := make([]string, 0) - commands = append(commands, getSSHShellCommands(c.sshPort, userPublicKey)...) + if c.sshShellCommands { + commands = append(commands, getSSHShellCommands()...) + } commands = append(commands, c.commands...) 
return commands } +func (c *dockerParametersMock) DockerPassEnv() []string { + return []string{} +} + func (c *dockerParametersMock) DockerPorts() []int { - ports := make([]int, 0) - ports = append(ports, c.sshPort) - return ports + return []int{} } func (c *dockerParametersMock) DockerMounts(string) ([]mount.Mount, error) { return nil, nil } -func (c *dockerParametersMock) MakeRunnerDir() (string, error) { - return "", nil +func (c *dockerParametersMock) MakeRunnerDir(string) (string, error) { + return c.runnerDir, nil } /* Utilities */ -var portNumber int32 = 10000 +var ( + randSrc = rand.New(rand.NewSource(time.Now().UnixNano())) + randMu = sync.Mutex{} +) + +func generateID(t *testing.T) string { + const idLen = 16 + b := make([]byte, idLen/2) + randMu.Lock() + defer randMu.Unlock() + _, err := randSrc.Read(b) + require.Nil(t, err) + return hex.EncodeToString(b)[:idLen] +} + +func createTaskConfig(t *testing.T) TaskConfig { + return TaskConfig{ + ID: generateID(t), + Name: t.Name(), + ImageName: "ubuntu", + } +} + +func pullMsg(id, status string, current, total uint64) PullMessage { + m := PullMessage{Id: id, Status: status} + m.ProgressDetail.Current = current + m.ProgressDetail.Total = total + return m +} + +func TestPullTracker_Empty(t *testing.T) { + tracker := newPullTracker() + assert.Nil(t, tracker.Progress()) +} + +func TestPullTracker_AlreadyExists(t *testing.T) { + tracker := newPullTracker() + tracker.Update(PullMessage{Id: "3.11", Status: "Pulling from library/python"}) + for _, id := range []string{"aaa", "bbb", "ccc"} { + tracker.Update(PullMessage{Id: id, Status: "Already exists"}) + } + tracker.Update(PullMessage{Status: "Digest: sha256:***"}) + tracker.Update(PullMessage{Status: "Status: Image is up to date for python:3.11"}) + p := tracker.Progress() + require.NotNil(t, p) + assert.Equal(t, uint64(0), p.DownloadedBytes) + assert.Equal(t, uint64(0), p.ExtractedBytes) + assert.Equal(t, uint64(0), p.TotalBytes) + assert.True(t, 
p.IsTotalBytesFinal) +} + +func TestPullTracker_FullPull(t *testing.T) { + const sizeA, sizeB uint64 = 111, 222 + + tracker := newPullTracker() + tracker.Update(PullMessage{Id: "3.11", Status: "Pulling from library/python"}) + tracker.Update(PullMessage{Id: "aaa", Status: "Pulling fs layer"}) + tracker.Update(PullMessage{Id: "bbb", Status: "Pulling fs layer"}) + tracker.Update(PullMessage{Id: "aaa", Status: "Waiting"}) + tracker.Update(PullMessage{Id: "bbb", Status: "Waiting"}) + + // Layers announced but sizes unknown yet + p := tracker.Progress() + require.NotNil(t, p) + assert.Equal(t, uint64(0), p.DownloadedBytes) + assert.Equal(t, uint64(0), p.ExtractedBytes) + assert.Equal(t, uint64(0), p.TotalBytes) + assert.False(t, p.IsTotalBytesFinal) + + // Both layers start downloading - sizes now known + tracker.Update(pullMsg("aaa", "Downloading", 100, sizeA)) + tracker.Update(pullMsg("bbb", "Downloading", 200, sizeB)) + + p = tracker.Progress() + assert.Equal(t, uint64(300), p.DownloadedBytes) + assert.Equal(t, uint64(0), p.ExtractedBytes) + assert.Equal(t, sizeA+sizeB, p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) + + // Downloads complete + tracker.Update(pullMsg("aaa", "Downloading", sizeA, sizeA)) + tracker.Update(PullMessage{Id: "aaa", Status: "Download complete"}) + tracker.Update(pullMsg("bbb", "Downloading", sizeB, sizeB)) + tracker.Update(PullMessage{Id: "bbb", Status: "Download complete"}) + + p = tracker.Progress() + assert.Equal(t, sizeA+sizeB, p.DownloadedBytes) + assert.Equal(t, uint64(0), p.ExtractedBytes) + assert.Equal(t, sizeA+sizeB, p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) + + // Both layers start extracting + tracker.Update(pullMsg("aaa", "Extracting", 100, sizeA)) + tracker.Update(pullMsg("bbb", "Extracting", 200, sizeB)) + + p = tracker.Progress() + assert.Equal(t, sizeA+sizeB, p.DownloadedBytes) + assert.Equal(t, uint64(300), p.ExtractedBytes) + assert.Equal(t, sizeA+sizeB, p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) + 
+ // Extractions complete + tracker.Update(pullMsg("aaa", "Extracting", sizeA, sizeA)) + tracker.Update(PullMessage{Id: "aaa", Status: "Pull complete"}) + tracker.Update(pullMsg("bbb", "Extracting", sizeB, sizeB)) + tracker.Update(PullMessage{Id: "bbb", Status: "Pull complete"}) + tracker.Update(PullMessage{Status: "Digest: sha256:***"}) + tracker.Update(PullMessage{Status: "Status: Downloaded newer image for python:3.11"}) + + p = tracker.Progress() + assert.Equal(t, sizeA+sizeB, p.DownloadedBytes) + assert.Equal(t, sizeA+sizeB, p.ExtractedBytes) + assert.Equal(t, sizeA+sizeB, p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) +} + +func TestPullTracker_MixedLayerStatuses(t *testing.T) { + tracker := newPullTracker() + + tracker.Update(PullMessage{Id: "layer-exists", Status: "Already exists"}) + tracker.Update(pullMsg("layer-downloading", "Downloading", 50, 100)) + tracker.Update(pullMsg("layer-extracting", "Extracting", 100, 200)) + tracker.Update(PullMessage{Id: "layer-waiting", Status: "Waiting"}) + + p := tracker.Progress() + require.NotNil(t, p) + assert.Equal(t, uint64(50+200), p.DownloadedBytes) + assert.Equal(t, uint64(100), p.ExtractedBytes) + assert.Equal(t, uint64(100+200), p.TotalBytes) + assert.False(t, p.IsTotalBytesFinal) // layer-waiting size unknown +} + +func TestPullTracker_NonBytesExtractingUnit(t *testing.T) { + tracker := newPullTracker() + tracker.Update(PullMessage{Id: "3.11", Status: "Pulling from library/python"}) + tracker.Update(PullMessage{Id: "aaa", Status: "Pulling fs layer"}) + tracker.Update(pullMsg("aaa", "Downloading", 100, 200)) + tracker.Update(pullMsg("aaa", "Downloading", 200, 200)) + tracker.Update(PullMessage{Id: "aaa", Status: "Download complete"}) + // Newer Docker daemons report extraction progress in seconds. 
Tested with 29.5.2 + tracker.Update(PullMessage{Id: "aaa", Status: "Extracting", ProgressDetail: ProgressDetail{Current: 1, Units: "s"}}) + tracker.Update(PullMessage{Id: "aaa", Status: "Extracting", ProgressDetail: ProgressDetail{Current: 2, Units: "s"}}) + + p := tracker.Progress() + require.NotNil(t, p) + assert.Equal(t, uint64(200), p.DownloadedBytes) + assert.Equal(t, uint64(0), p.ExtractedBytes) // reported in seconds, bytes unknown + assert.Equal(t, uint64(200), p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) + + tracker.Update(PullMessage{Id: "aaa", Status: "Pull complete"}) -func nextPort() int { - return int(atomic.AddInt32(&portNumber, 1)) + p = tracker.Progress() + require.NotNil(t, p) + assert.Equal(t, uint64(200), p.DownloadedBytes) + assert.Equal(t, uint64(200), p.ExtractedBytes) // pull complete => extracted == total + assert.Equal(t, uint64(200), p.TotalBytes) + assert.True(t, p.IsTotalBytesFinal) } diff --git a/runner/internal/shim/errs.go b/runner/internal/shim/errs.go new file mode 100644 index 0000000000..92f99fe72e --- /dev/null +++ b/runner/internal/shim/errs.go @@ -0,0 +1,33 @@ +package shim + +import "errors" + +/* +Definitions of common error types used throughout shim. +Errors should wrap these errors to simplify error classifications, e.g.: + + func cleanup(containerID string) { + ... + return fmt.Errorf("%w: failed to remove container") + } + + if err := cleanup(containerID); errors.Is(err, ErrInternal) { + return ErrorResponse { + Status: 500, + Message: err.Error(), + } + } else if errors.Is(err, ErrNotFound) { + return ErrorResponse { + Status: 404, + Message: err.Error(), + } + } +*/ +var ( + // shim failed to process request due to internal error + ErrInternal = errors.New("internal error") + // shim rejected to process request, e.g., bad params, state conflict, etc. 
+ ErrRequest = errors.New("request error") + // referenced object does not exist + ErrNotFound = errors.New("not found") +) diff --git a/runner/internal/shim/host/gpu.go b/runner/internal/shim/host/gpu.go new file mode 100644 index 0000000000..eff57f2e00 --- /dev/null +++ b/runner/internal/shim/host/gpu.go @@ -0,0 +1,516 @@ +package host + +import ( + "context" + "encoding/csv" + "encoding/json" + "errors" + "fmt" + "io" + "path/filepath" + "strconv" + "strings" + "time" + + execute "github.com/alexellis/go-execute/v2" + + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" +) + +const ( + amdSmiImage = "dstackai/amd-smi:latest" + ttSmiImage = "dstackai/tt-smi:latest" +) + +type GpuInfo struct { + Vendor gpu.GpuVendor + Name string + Vram int // MiB + // NVIDIA: uuid field from nvidia-smi, "globally unique immutable alphanumeric identifier of the GPU", + // in the form of `GPU-2b79666e-d81f-f3f8-fd47-9903f118c3f5` + // AMD: empty string (AMD devices have IDs in `amd-smi list`, but we don't need them) + // Intel: empty string (Gaudi devices have IDs called `uuid`, e.g., `01P0-HL2080A0-15-TNPS14-20-07-07`, + // but habana Docker runtime only accepts indices, see below) + ID string + // NVIDIA: empty string (NVIDIA devices have DRI nodes in udev FS, but we don't need them) + // AMD: `/dev/dri/renderD` path + // Intel: empty string + RenderNodePath string + // NVIDIA: empty string + // AMD: empty string + // Intel: accelerator index: ("0", "1", ...), as reported by `hl-smi -Q index` + Index string +} + +func GetGpuInfo(ctx context.Context) []GpuInfo { + switch gpuVendor := gpu.GetGpuVendor(); gpuVendor { + case gpu.GpuVendorNvidia: + return getNvidiaGpuInfo(ctx) + case gpu.GpuVendorAmd: + return getAmdGpuInfo(ctx) + case gpu.GpuVendorIntel: + return getIntelGpuInfo(ctx) + case gpu.GpuVendorTenstorrent: + return getTenstorrentGpuInfo(ctx) + case gpu.GpuVendorNone: + return []GpuInfo{} + } + return []GpuInfo{} 
+} + +func getNvidiaGpuInfo(ctx context.Context) []GpuInfo { + gpus := []GpuInfo{} + + cmd := execute.ExecTask{ + Command: "nvidia-smi", + Args: []string{"--query-gpu=name,memory.total,uuid", "--format=csv,noheader,nounits"}, + StreamStdio: false, + } + res, err := cmd.Execute(ctx) + if err != nil { + log.Error(ctx, "failed to execute nvidia-smi", "err", err) + return gpus + } + if res.ExitCode != 0 { + log.Error( + ctx, "failed to execute nvidia-smi", + "exitcode", res.ExitCode, "stdout", res.Stdout, "stderr", res.Stderr, + ) + return gpus + } + + r := csv.NewReader(strings.NewReader(res.Stdout)) + for { + record, err := r.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + log.Error(ctx, "cannot read csv", "err", err) + return gpus + } + if len(record) != 3 { + log.Error(ctx, "3 csv fields expected", "len", len(record)) + return gpus + } + vram, err := strconv.Atoi(strings.TrimSpace(record[1])) + if err != nil { + log.Error(ctx, "invalid VRAM value", "value", record[1]) + vram = 0 + } + gpus = append(gpus, GpuInfo{ + Vendor: gpu.GpuVendorNvidia, + Name: strings.TrimSpace(record[0]), + Vram: vram, + ID: strings.TrimSpace(record[2]), + }) + } + return gpus +} + +type amdGpu struct { + Asic amdAsic `json:"asic"` + Vram amdVram `json:"vram"` + Bus amdBus `json:"bus"` +} + +// amd-smi >= 7.x wraps the array in {"gpu_data": [...]} +type amdSmiOutput struct { + GpuData []amdGpu `json:"gpu_data"` +} + +type amdAsic struct { + Name string `json:"market_name"` +} + +type amdVram struct { + Size amdVramSize `json:"size"` +} + +type amdVramSize struct { + Value int `json:"value"` +} + +type amdBus struct { + BDF string `json:"bdf"` // PCIe Domain:Bus:Device.Function notation +} + +// parseAmdSmiOutput handles both amd-smi output formats: +// ROCm 6.x returns a flat array: [{"gpu": 0, ...}, ...] 
+// ROCm 7.x wraps it: {"gpu_data": [{"gpu": 0, ...}, ...]} +func parseAmdSmiOutput(data []byte) ([]amdGpu, error) { + var amdGpus []amdGpu + if err := json.Unmarshal(data, &amdGpus); err == nil { + return amdGpus, nil + } + var wrapped amdSmiOutput + if err := json.Unmarshal(data, &wrapped); err != nil { + return nil, err + } + return wrapped.GpuData, nil +} + +func getAmdGpuInfo(ctx context.Context) []GpuInfo { + gpus := []GpuInfo{} + + ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + + cmd := execute.ExecTask{ + Command: "docker", + Args: []string{ + "run", + "--rm", + "--device", "/dev/kfd", + "--device", "/dev/dri", + amdSmiImage, + "static", "--json", "--asic", "--vram", "--bus", + }, + StreamStdio: false, + } + res, err := cmd.Execute(ctx) + if err != nil { + log.Error(ctx, "failed to execute amd-smi", "err", err) + return gpus + } + if res.ExitCode != 0 { + log.Error( + ctx, "failed to execute amd-smi", + "exitcode", res.ExitCode, "stdout", res.Stdout, "stderr", res.Stderr, + ) + return gpus + } + + amdGpus, err := parseAmdSmiOutput([]byte(res.Stdout)) + if err != nil { + log.Error(ctx, "cannot read json", "err", err) + return gpus + } + for _, amdGpu := range amdGpus { + renderNodePath, err := getAmdRenderNodePath(amdGpu.Bus.BDF) + if err != nil { + log.Error(ctx, "failed to resolve render node path", "bdf", amdGpu.Bus.BDF, "err", err) + continue + } + gpus = append(gpus, GpuInfo{ + Vendor: gpu.GpuVendorAmd, + Name: amdGpu.Asic.Name, + Vram: amdGpu.Vram.Size.Value, + RenderNodePath: renderNodePath, + }) + } + return gpus +} + +type ttSmiSnapshot struct { + DeviceInfo []ttDeviceInfo `json:"device_info"` +} + +type ttDeviceInfo struct { + BoardInfo ttBoardInfo `json:"board_info"` +} + +type ttBoardInfo struct { + BoardType string `json:"board_type"` + BoardID string `json:"board_id"` + BusID string `json:"bus_id"` +} + +func unmarshalTtSmiSnapshot(data []byte) (*ttSmiSnapshot, error) { + var snapshot ttSmiSnapshot + if err := 
json.Unmarshal(data, &snapshot); err != nil { + return nil, err + } + return &snapshot, nil +} + +func normalizeTtBoardName(name string) string { + switch { + case name == "bh-scrappy" || name == "p100": + return "p100a" + case strings.HasPrefix(name, "p150"): + return "p150" + case strings.HasPrefix(name, "p300"): + return "p300" + default: + return name + } +} + +func splitTtBoardType(boardType string) (name string, suffix string) { + boardType = strings.TrimSpace(boardType) + if strings.HasSuffix(boardType, " L") || strings.HasSuffix(boardType, " R") { + suffix = boardType[len(boardType)-1:] + boardType = strings.TrimSpace(boardType[:len(boardType)-2]) + } + return normalizeTtBoardName(boardType), suffix +} + +func ttBoardVramMib(name string) int { + switch { + case strings.HasPrefix(name, "n150"), + strings.HasPrefix(name, "n300"), + strings.HasPrefix(name, "tt-galaxy-wh"): + return 12 * 1024 + } + switch name { + case "p100a": + return 28 * 1024 + case "p150", "p300", "tt-galaxy-bh": + return 32 * 1024 + default: + return 0 + } +} + +func isTtBlackholeBoard(name string) bool { + switch name { + case "p100a", "p150", "p300", "tt-galaxy-bh": + return true + default: + return false + } +} + +func isRemoteTtDevice(device ttDeviceInfo) bool { + return strings.EqualFold(strings.TrimSpace(device.BoardInfo.BusID), "N/A") +} + +func getGpusFromTtSmiSnapshot(snapshot *ttSmiSnapshot) []GpuInfo { + gpuMap := make(map[string]*GpuInfo) + gpuKeys := []string{} + indexCounter := 0 + addGpu := func(key string, gpuInfo GpuInfo) { + gpuMap[key] = &gpuInfo + gpuKeys = append(gpuKeys, key) + } + + // First pass: identify all "L" and "R" devices + for i, device := range snapshot.DeviceInfo { + boardID := device.BoardInfo.BoardID + name, suffix := splitTtBoardType(device.BoardInfo.BoardType) + + if suffix == "L" { + // Create unique identifier for this "L" device + uniqueID := fmt.Sprintf("%s_L_%d", boardID, i) + + // Determine base VRAM based on board type + baseVram := 
ttBoardVramMib(name) + + // Create new GPU entry for "L" device + addGpu(uniqueID, GpuInfo{ + Vendor: gpu.GpuVendorTenstorrent, + Name: name, + Vram: baseVram, + ID: boardID, + Index: strconv.Itoa(indexCounter), + }) + indexCounter++ + } + } + + // Second pass: add memory from "R" devices to corresponding "L" devices + for _, device := range snapshot.DeviceInfo { + boardID := device.BoardInfo.BoardID + name, suffix := splitTtBoardType(device.BoardInfo.BoardType) + + if suffix == "R" { + // Find the corresponding "L" device with the same board_id + // Since we need to match "R" to "L", we'll use the board_id as the key + // and add memory to the first "L" device we find with that board_id + for _, key := range gpuKeys { + gpu := gpuMap[key] + if gpu.ID == boardID && (gpu.Name == name || !isTtBlackholeBoard(name)) { + // Add memory to the "L" device + gpu.Vram += ttBoardVramMib(name) + break // Only add to the first matching "L" device + } + } + } + } + + // Handle devices without L/R suffix (backward compatibility) + for i, device := range snapshot.DeviceInfo { + boardID := device.BoardInfo.BoardID + name, suffix := splitTtBoardType(device.BoardInfo.BoardType) + + if suffix == "" { + // For devices without L/R suffix, treat them as standalone GPUs + // This maintains backward compatibility with existing data + uniqueID := fmt.Sprintf("%s_standalone_%d", boardID, i) + + // Determine base VRAM based on board type + baseVram := ttBoardVramMib(name) + + if isTtBlackholeBoard(name) { + if isRemoteTtDevice(device) { + continue + } + addGpu(uniqueID, GpuInfo{ + Vendor: gpu.GpuVendorTenstorrent, + Name: name, + Vram: baseVram, + ID: boardID, + Index: strconv.Itoa(indexCounter), + }) + indexCounter++ + continue + } + + // Check if we already have a GPU with this board_id (old behavior) + existingGpu := false + for _, key := range gpuKeys { + gpu := gpuMap[key] + if gpu.ID == boardID { + gpu.Vram += baseVram + existingGpu = true + break + } + } + + if !existingGpu { + // 
Create new GPU entry + addGpu(uniqueID, GpuInfo{ + Vendor: gpu.GpuVendorTenstorrent, + Name: name, + Vram: baseVram, + ID: boardID, + Index: strconv.Itoa(indexCounter), + }) + indexCounter++ + } + } + } + + // Add memory from remote Blackhole chips to the matching local board. + for _, device := range snapshot.DeviceInfo { + boardID := device.BoardInfo.BoardID + name, suffix := splitTtBoardType(device.BoardInfo.BoardType) + if suffix != "" || !isTtBlackholeBoard(name) || !isRemoteTtDevice(device) { + continue + } + for _, key := range gpuKeys { + gpu := gpuMap[key] + if gpu.ID == boardID && gpu.Name == name { + gpu.Vram += ttBoardVramMib(name) + break + } + } + } + + // Convert map to slice + var gpus []GpuInfo + for _, key := range gpuKeys { + gpus = append(gpus, *gpuMap[key]) + } + + // Reassign indices sequentially based on discovery order. + for i := range gpus { + gpus[i].Index = strconv.Itoa(i) + } + + return gpus +} + +func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo { + gpus := []GpuInfo{} + + cmd := execute.ExecTask{ + Command: "docker", + Args: []string{ + "run", + "--rm", + "--device", "/dev/tenstorrent", + ttSmiImage, + "-s", + }, + StreamStdio: false, + } + res, err := cmd.Execute(ctx) + if err != nil { + log.Error(ctx, "failed to execute tt-smi", "err", err) + return gpus + } + if res.ExitCode != 0 { + log.Error( + ctx, "failed to execute tt-smi", + "exitcode", res.ExitCode, "stdout", res.Stdout, "stderr", res.Stderr, + ) + return gpus + } + + ttSmiSnapshot, err := unmarshalTtSmiSnapshot([]byte(res.Stdout)) + if err != nil { + log.Error(ctx, "cannot read tt-smi json", "err", err) + log.Debug(ctx, "tt-smi output", "stdout", res.Stdout) + return gpus + } + + return getGpusFromTtSmiSnapshot(ttSmiSnapshot) +} + +func getAmdRenderNodePath(bdf string) (string, error) { + // amd-smi uses extended BDF Notation with domain: Domain:Bus:Device.Function, e.g., 0000:5f:00.0 + // udev creates /dev/dri/by-path/pci--render -> ../renderD symlinks + symlink := 
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", bdf) + path, err := filepath.EvalSymlinks(symlink) + if err != nil { + return "", err + } + return path, nil +} + +func IsRenderNodePath(path string) bool { + return strings.HasPrefix(path, "/dev/dri/renderD") +} + +func getIntelGpuInfo(ctx context.Context) []GpuInfo { + gpus := []GpuInfo{} + + cmd := execute.ExecTask{ + Command: "hl-smi", + Args: []string{"--query-aip=name,memory.total,index", "--format=csv,noheader,nounits"}, + StreamStdio: false, + } + res, err := cmd.Execute(ctx) + if err != nil { + log.Error(ctx, "failed to execute hl-smi", "err", err) + return gpus + } + if res.ExitCode != 0 { + log.Error( + ctx, "failed to execute hl-smi", + "exitcode", res.ExitCode, "stdout", res.Stdout, "stderr", res.Stderr, + ) + return gpus + } + + r := csv.NewReader(strings.NewReader(res.Stdout)) + for { + record, err := r.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + log.Error(ctx, "cannot read csv", "err", err) + return gpus + } + if len(record) != 3 { + log.Error(ctx, "3 csv fields expected", "len", len(record)) + return gpus + } + vram, err := strconv.Atoi(strings.TrimSpace(record[1])) + if err != nil { + log.Error(ctx, "invalid memory value", "value", record[1]) + vram = 0 + } + gpus = append(gpus, GpuInfo{ + Vendor: gpu.GpuVendorIntel, + Name: strings.TrimSpace(record[0]), + Vram: vram, + Index: strings.TrimSpace(record[2]), + }) + } + return gpus +} diff --git a/runner/internal/shim/host/gpu_test.go b/runner/internal/shim/host/gpu_test.go new file mode 100644 index 0000000000..4110816003 --- /dev/null +++ b/runner/internal/shim/host/gpu_test.go @@ -0,0 +1,495 @@ +package host + +import ( + "os" + "path/filepath" + "reflect" + "strconv" + "testing" + + "github.com/dstackai/dstack/runner/internal/common/gpu" +) + +func loadTestData(filename string) ([]byte, error) { + path := filepath.Join("testdata", filename) + return os.ReadFile(path) +} + +func TestUnmarshalTtSmiSnapshot(t *testing.T) { + 
tests := []struct { + name string + filename string + want *ttSmiSnapshot + wantErr bool + }{ + { + name: "valid single device", + filename: "tenstorrent/valid_single_device.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + { + BoardInfo: ttBoardInfo{ + BoardType: "n150 L", + BoardID: "100018611902010", + }, + }, + }, + }, + wantErr: false, + }, + { + name: "valid multiple devices", + filename: "tenstorrent/valid_multiple_devices.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "10001451172208f", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "100014511722053", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "10001451172209c", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "100014511722058", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "10001451172208f", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "100014511722053", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "10001451172209c", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "100014511722058", + }, + }, + }, + }, + wantErr: false, + }, + { + name: "empty device info", + filename: "tenstorrent/empty_device_info.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{}, + }, + wantErr: false, + }, + { + name: "invalid JSON", + filename: "tenstorrent/invalid_json.json", + want: nil, + wantErr: true, + }, + { + name: "missing device_info field", + filename: "tenstorrent/missing_device_info.json", + want: &ttSmiSnapshot{DeviceInfo: nil}, + wantErr: false, + }, + { + name: "empty JSON", + filename: "tenstorrent/empty_json.json", + want: &ttSmiSnapshot{DeviceInfo: nil}, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := loadTestData(tt.filename) + if err != nil { + 
t.Fatalf("Failed to load test data from %s: %v", tt.filename, err) + } + + got, err := unmarshalTtSmiSnapshot(data) + if (err != nil) != tt.wantErr { + t.Errorf("unmarshalTtSmiSnapshot() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + if got == nil { + t.Errorf("unmarshalTtSmiSnapshot() returned nil, expected non-nil result") + return + } + if len(got.DeviceInfo) != len(tt.want.DeviceInfo) { + t.Errorf("unmarshalTtSmiSnapshot() device count = %v, want %v", len(got.DeviceInfo), len(tt.want.DeviceInfo)) + return + } + for i, device := range got.DeviceInfo { + if i >= len(tt.want.DeviceInfo) { + break + } + expected := tt.want.DeviceInfo[i] + if device.BoardInfo.BoardType != expected.BoardInfo.BoardType { + t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardType = %v, want %v", i, device.BoardInfo.BoardType, expected.BoardInfo.BoardType) + } + if device.BoardInfo.BoardID != expected.BoardInfo.BoardID { + t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardID = %v, want %v", i, device.BoardInfo.BoardID, expected.BoardInfo.BoardID) + } + } + } + }) + } +} + +func TestGetGpusFromTtSmiSnapshot(t *testing.T) { + data, err := loadTestData("tenstorrent/single_n150_gpu.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + expectedGpus := []GpuInfo{ + { + Vendor: gpu.GpuVendorTenstorrent, + Name: "n150", + Vram: 12 * 1024, + ID: "100018611902010", + Index: "0", + }, + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + if !reflect.DeepEqual(gpus, expectedGpus) { + t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expectedGpus) + } +} + +func TestGetGpusFromTtSmiSnapshotMultipleDevices(t *testing.T) { + data, err := loadTestData("tenstorrent/valid_multiple_devices.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := 
unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + // Verify we have 4 unique GPUs (grouped by board_id) + if len(gpus) != 4 { + t.Errorf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 4", len(gpus)) + } + + // Create a map to check the results by board_id + gpusByID := make(map[string]GpuInfo) + for _, gpu := range gpus { + gpusByID[gpu.ID] = gpu + } + + // Verify specific GPUs and their aggregated VRAM + expectedGpus := map[string]struct { + name string + vram int + }{ + "10001451172208f": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "100014511722053": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "10001451172209c": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "100014511722058": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + } + + for boardID, expected := range expectedGpus { + gpu_, exists := gpusByID[boardID] + if !exists { + t.Errorf("Expected GPU with board_id %s not found", boardID) + continue + } + if gpu_.Name != expected.name { + t.Errorf("GPU %s: name = %s, want %s", boardID, gpu_.Name, expected.name) + } + if gpu_.Vram != expected.vram { + t.Errorf("GPU %s: VRAM = %d, want %d", boardID, gpu_.Vram, expected.vram) + } + if gpu_.Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU %s: vendor = %v, want %v", boardID, gpu_.Vendor, gpu.GpuVendorTenstorrent) + } + } +} + +func TestGetGpusFromTtSmiSnapshotWormholePrefixMemoryCompatibility(t *testing.T) { + snapshot := &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + {BoardInfo: ttBoardInfo{BoardType: "n150-custom L", BoardID: "100018000000001"}}, + {BoardInfo: ttBoardInfo{BoardType: "n300-custom L", BoardID: "100014000000001"}}, + {BoardInfo: ttBoardInfo{BoardType: "n300-custom R", BoardID: "100014000000001"}}, + {BoardInfo: ttBoardInfo{BoardType: "tt-galaxy-wh-custom L", BoardID: "100035000000001"}}, + }, + } + + gpus := 
getGpusFromTtSmiSnapshot(snapshot) + + expected := []GpuInfo{ + {Vendor: gpu.GpuVendorTenstorrent, Name: "n150-custom", Vram: 12 * 1024, ID: "100018000000001", Index: "0"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "n300-custom", Vram: 24 * 1024, ID: "100014000000001", Index: "1"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "tt-galaxy-wh-custom", Vram: 12 * 1024, ID: "100035000000001", Index: "2"}, + } + if !reflect.DeepEqual(gpus, expected) { + t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expected) + } +} + +func TestGetGpusFromTtSmiSnapshotGalaxy(t *testing.T) { + data, err := loadTestData("tenstorrent/galaxy.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + // Galaxy.json contains 32 devices with board_type "tt-galaxy-wh L" + // Each "L" device should be treated as a separate GPU + // Each "tt-galaxy-wh" device has 12GB VRAM + if len(gpus) != 32 { + t.Errorf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 32", len(gpus)) + } + + // Calculate total VRAM: 32 devices × 12GB = 384GB + totalVram := 32 * 12 * 1024 // 32 devices × 12GB × 1024 MiB/GB + actualTotalVram := 0 + + // Verify all GPUs have the correct properties + for i, gpu_ := range gpus { + if gpu_.Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU[%d] vendor = %v, want %v", i, gpu_.Vendor, gpu.GpuVendorTenstorrent) + } + if gpu_.Name != "tt-galaxy-wh" { + t.Errorf("GPU[%d] name = %s, want tt-galaxy-wh", i, gpu_.Name) + } + if gpu_.ID != "100035100000000" { + t.Errorf("GPU[%d] ID = %s, want 100035100000000", i, gpu_.ID) + } + if gpu_.Vram != 12*1024 { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu_.Vram, 12*1024) + } + // Verify indices are sequential (0, 1, 2, ..., 31) + expectedIndex := strconv.Itoa(i) + if gpu_.Index != expectedIndex { + t.Errorf("GPU[%d] index = %s, want %s", i, 
gpu_.Index, expectedIndex) + } + actualTotalVram += gpu_.Vram + } + + // Verify total VRAM is 384GB + if actualTotalVram != totalVram { + t.Errorf("Total VRAM = %d MiB, want %d MiB (384GB)", actualTotalVram, totalVram) + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeRevisions(t *testing.T) { + snapshot := &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + {BoardInfo: ttBoardInfo{BoardType: "bh-scrappy", BoardID: "0000360000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p100", BoardID: "0000430000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p100a", BoardID: "0000430000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p150a", BoardID: "0000400000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p150b", BoardID: "0000410000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p150c", BoardID: "0000420000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p300b", BoardID: "0000440000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p300a", BoardID: "0000450000000000"}}, + {BoardInfo: ttBoardInfo{BoardType: "p300c", BoardID: "0000460000000000"}}, + }, + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + expectedNames := []string{"p100a", "p100a", "p100a", "p150", "p150", "p150", "p300", "p300", "p300"} + expectedVram := []int{28 * 1024, 28 * 1024, 28 * 1024, 32 * 1024, 32 * 1024, 32 * 1024, 32 * 1024, 32 * 1024, 32 * 1024} + if len(gpus) != len(expectedNames) { + t.Fatalf("getGpusFromTtSmiSnapshot() returned %d GPUs, want %d", len(gpus), len(expectedNames)) + } + for i, expectedName := range expectedNames { + if gpus[i].Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU[%d] vendor = %v, want %v", i, gpus[i].Vendor, gpu.GpuVendorTenstorrent) + } + if gpus[i].Name != expectedName { + t.Errorf("GPU[%d] name = %s, want %s", i, gpus[i].Name, expectedName) + } + if gpus[i].Vram != expectedVram[i] { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpus[i].Vram, expectedVram[i]) + } + if gpus[i].Index != strconv.Itoa(i) { + t.Errorf("GPU[%d] index = %s, want %s", i, gpus[i].Index, 
strconv.Itoa(i)) + } + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeSourceFixtures(t *testing.T) { + // Synthetic tt-smi snapshot derived from TT-SMI board name mappings and + // TT-Metal UMD Blackhole board descriptors. + data, err := loadTestData("tenstorrent/blackhole_boards.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + expected := []GpuInfo{ + {Vendor: gpu.GpuVendorTenstorrent, Name: "p100a", Vram: 28 * 1024, ID: "000004323191b040", Index: "0"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "p150", Vram: 32 * 1024, ID: "0000040100000000", Index: "1"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "p150", Vram: 32 * 1024, ID: "000004123191110e", Index: "2"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "p300", Vram: 32 * 1024, ID: "000004513190f004", Index: "3"}, + {Vendor: gpu.GpuVendorTenstorrent, Name: "p300", Vram: 32 * 1024, ID: "000004513190f004", Index: "4"}, + } + if !reflect.DeepEqual(gpus, expected) { + t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expected) + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeEightP150(t *testing.T) { + // Derived from TT-Metal UMD blackhole_8xP150 cluster descriptor. + // The p150b name follows TT-SMI's board ID to board type mapping. 
+ data, err := loadTestData("tenstorrent/blackhole_8xp150.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + expectedIDs := []string{ + "0000041231915018", + "0000041231915002", + "0000041231915009", + "000004123191500f", + "0000041231914064", + "0000041231915006", + "0000041231914087", + "000004123191402f", + } + if len(gpus) != len(expectedIDs) { + t.Fatalf("getGpusFromTtSmiSnapshot() returned %d GPUs, want %d", len(gpus), len(expectedIDs)) + } + for i, gpu_ := range gpus { + if gpu_.Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU[%d] vendor = %v, want %v", i, gpu_.Vendor, gpu.GpuVendorTenstorrent) + } + if gpu_.Name != "p150" { + t.Errorf("GPU[%d] name = %s, want p150", i, gpu_.Name) + } + if gpu_.Vram != 32*1024 { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu_.Vram, 32*1024) + } + if gpu_.ID != expectedIDs[i] { + t.Errorf("GPU[%d] ID = %s, want %s", i, gpu_.ID, expectedIDs[i]) + } + if gpu_.Index != strconv.Itoa(i) { + t.Errorf("GPU[%d] index = %s, want %s", i, gpu_.Index, strconv.Itoa(i)) + } + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeP300SameBoardID(t *testing.T) { + snapshot := &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + {BoardInfo: ttBoardInfo{BoardType: "p300a", BoardID: "0000450000000000", BusID: "0000:01:00.0"}}, + {BoardInfo: ttBoardInfo{BoardType: "p300a", BoardID: "0000450000000000", BusID: "0000:02:00.0"}}, + }, + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + if len(gpus) != 2 { + t.Fatalf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 2", len(gpus)) + } + for i, gpu_ := range gpus { + if gpu_.Name != "p300" { + t.Errorf("GPU[%d] name = %s, want p300", i, gpu_.Name) + } + if gpu_.Vram != 32*1024 { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu_.Vram, 32*1024) + } + if gpu_.Index != strconv.Itoa(i) { + t.Errorf("GPU[%d] 
index = %s, want %s", i, gpu_.Index, strconv.Itoa(i)) + } + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeP300RemoteChip(t *testing.T) { + snapshot := &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + {BoardInfo: ttBoardInfo{BoardType: "p300a", BoardID: "0000450000000000", BusID: "0000:01:00.0"}}, + {BoardInfo: ttBoardInfo{BoardType: "p300a", BoardID: "0000450000000000", BusID: "N/A"}}, + }, + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + expected := []GpuInfo{ + {Vendor: gpu.GpuVendorTenstorrent, Name: "p300", Vram: 64 * 1024, ID: "0000450000000000", Index: "0"}, + } + if !reflect.DeepEqual(gpus, expected) { + t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expected) + } +} + +func TestGetGpusFromTtSmiSnapshotBlackholeGalaxy(t *testing.T) { + data, err := loadTestData("tenstorrent/blackhole_galaxy.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + if len(gpus) != 32 { + t.Fatalf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 32", len(gpus)) + } + for i, gpu_ := range gpus { + if gpu_.Name != "tt-galaxy-bh" { + t.Errorf("GPU[%d] name = %s, want tt-galaxy-bh", i, gpu_.Name) + } + if gpu_.Vram != 32*1024 { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu_.Vram, 32*1024) + } + if gpu_.Index != strconv.Itoa(i) { + t.Errorf("GPU[%d] index = %s, want %s", i, gpu_.Index, strconv.Itoa(i)) + } + } +} diff --git a/runner/internal/shim/host/host.go b/runner/internal/shim/host/host.go new file mode 100644 index 0000000000..84d15d1ae8 --- /dev/null +++ b/runner/internal/shim/host/host.go @@ -0,0 +1,61 @@ +package host + +import ( + "context" + "fmt" + "net" + "runtime" + + "github.com/shirou/gopsutil/v4/mem" + "golang.org/x/sys/unix" + + "github.com/dstackai/dstack/runner/internal/common/log" +) + +func GetCpuCount(ctx context.Context) int { + return 
runtime.NumCPU() +} + +// GetTotalMemory returns total amount of RAM on this system +func GetTotalMemory(ctx context.Context) (uint64, error) { + v, err := mem.VirtualMemory() + if err != nil { + return 0, fmt.Errorf("cannot get total memory: %w", err) + } + return v.Total, nil +} + +func GetDiskSize(ctx context.Context, path string) (uint64, error) { + var stat unix.Statfs_t + err := unix.Statfs(path, &stat) + if err != nil { + return 0, fmt.Errorf("cannot get disk size: %w", err) + } + size := stat.Bavail * uint64(stat.Bsize) + return size, nil +} + +func GetNetworkAddresses(ctx context.Context) ([]string, error) { + var addresses []string + ifaces, err := net.Interfaces() + if err != nil { + return addresses, fmt.Errorf("cannot get interfaces: %w", err) + } + for _, iface := range ifaces { + addrs, err := iface.Addrs() + if err != nil { + log.Error(ctx, "cannot get addrs", "iface", iface, "err", err) + continue + } + for _, addr := range addrs { + switch v := addr.(type) { + case *net.IPNet: + if v.IP.IsLoopback() { + continue + } + addresses = append(addresses, addr.String()) + } + } + } + return addresses, nil +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/README.md b/runner/internal/shim/host/testdata/tenstorrent/README.md new file mode 100644 index 0000000000..db7a8cbc4c --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/README.md @@ -0,0 +1,19 @@ +# Tenstorrent fixtures + +The Wormhole fixtures are captured `tt-smi -s` snapshots used by the existing +Tenstorrent tests. + +The Blackhole Galaxy fixture is a captured `tt-smi -s` snapshot from a 32-chip +Galaxy Blackhole host. + +The Blackhole PCIe fixtures are source-derived compatibility fixtures: + +- `blackhole_boards.json` covers `tt-smi` Blackhole board names and P300 + same-board dual-MMIO behavior. Board names are based on + `tt_smi/utils.py::get_board_type` and UMD board type mappings. 
+- `blackhole_8xp150.json` is derived from UMD's + `blackhole_8xP150.yaml` cluster descriptor. The board IDs and PCI bus IDs are + from that descriptor; the `p150b` board name follows the UPI mapping used by + `tt-smi`. +These fixtures are not substitutes for live hardware smoke tests. They preserve +captured `tt-smi` JSON shapes and source-derived UMD topology cases. diff --git a/runner/internal/shim/host/testdata/tenstorrent/blackhole_8xp150.json b/runner/internal/shim/host/testdata/tenstorrent/blackhole_8xp150.json new file mode 100644 index 0000000000..c313c5be0f --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/blackhole_8xp150.json @@ -0,0 +1,60 @@ +{ + "device_info": [ + { + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "p150b", + "board_id": "0000041231915018" + } + }, + { + "board_info": { + "bus_id": "0000:21:00.0", + "board_type": "p150b", + "board_id": "0000041231915002" + } + }, + { + "board_info": { + "bus_id": "0000:41:00.0", + "board_type": "p150b", + "board_id": "0000041231915009" + } + }, + { + "board_info": { + "bus_id": "0000:61:00.0", + "board_type": "p150b", + "board_id": "000004123191500f" + } + }, + { + "board_info": { + "bus_id": "0000:81:00.0", + "board_type": "p150b", + "board_id": "0000041231914064" + } + }, + { + "board_info": { + "bus_id": "0000:a1:00.0", + "board_type": "p150b", + "board_id": "0000041231915006" + } + }, + { + "board_info": { + "bus_id": "0000:c1:00.0", + "board_type": "p150b", + "board_id": "0000041231914087" + } + }, + { + "board_info": { + "bus_id": "0000:e1:00.0", + "board_type": "p150b", + "board_id": "000004123191402f" + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/blackhole_boards.json b/runner/internal/shim/host/testdata/tenstorrent/blackhole_boards.json new file mode 100644 index 0000000000..8dfdd62e64 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/blackhole_boards.json @@ -0,0 +1,39 @@ +{ + "device_info": [ + { + "board_info": { + 
"bus_id": "0000:61:00.0", + "board_type": "p100a", + "board_id": "000004323191b040" + } + }, + { + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "p150a", + "board_id": "0000040100000000" + } + }, + { + "board_info": { + "bus_id": "0000:02:00.0", + "board_type": "p150b", + "board_id": "000004123191110e" + } + }, + { + "board_info": { + "bus_id": "0000:03:00.0", + "board_type": "p300a", + "board_id": "000004513190f004" + } + }, + { + "board_info": { + "bus_id": "0000:04:00.0", + "board_type": "p300a", + "board_id": "000004513190f004" + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/blackhole_galaxy.json b/runner/internal/shim/host/testdata/tenstorrent/blackhole_galaxy.json new file mode 100644 index 0000000000..2a85a80786 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/blackhole_galaxy.json @@ -0,0 +1,3348 @@ +{ + "time": "2026-05-24T07:51:28.084906", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 22.04.5 LTS", + "Kernel": "6.8.0-111-generic", + "Hostname": "de55091aab2e", + "Platform": "x86_64", + "Python": "3.10.12", + "Memory": "566.12 GB", + "Driver": "TT-KMD 2.8.0" + }, + "host_sw_vers": { + "tt_smi": "5.2.0", + "pyluwen": "0.8.5", + "tt_umd": "0.9.5" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e2", + "TDP": "0x27", + "TDC": "0x35", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2d682a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + 
"FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x2fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36363834", + "GDDR_2_3_TEMP": "0x36363838", + "GDDR_4_5_TEMP": "0x36323432", + "GDDR_6_7_TEMP": "0x36323634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x1", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x30da68d3", + "ASIC_ID_LOW": "0x319cdf5f", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 53.0", + "power": " 39.0", + "aiclk": " 800", + "asic_temperature": "45.4", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": 
"0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x20", + "TDC": "0x2c", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28ba82", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfa", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fdf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36363834", + "GDDR_2_3_TEMP": "0x38363836", + "GDDR_4_5_TEMP": "0x38383636", + "GDDR_6_7_TEMP": "0x38383838", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x2", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xafa174e", + "ASIC_ID_LOW": "0x22393ea0", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:02:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 44.0", + "power": " 32.0", + "aiclk": " 800", + "asic_temperature": "40.7", + 
"fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x11", + "TDC": "0x18", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28db52", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df6", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3f7f", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36363836", + "GDDR_2_3_TEMP": "0x38363836", + "GDDR_4_5_TEMP": "0x3a363834", + "GDDR_6_7_TEMP": "0x38343836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x3", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": 
"0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xc722c9ef", + "ASIC_ID_LOW": "0x5253bcaa", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:03:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 24.0", + "power": " 17.0", + "aiclk": " 800", + "asic_temperature": "40.9", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x24", + "TDC": "0x31", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2dca9a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfb", + "TELEMETRY_ENUM_COUNT": "0x45", + 
"ENABLED_TENSIX_COL": "0x3bff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36323434", + "GDDR_2_3_TEMP": "0x36323834", + "GDDR_4_5_TEMP": "0x3a363636", + "GDDR_6_7_TEMP": "0x3a363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x4", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x5f105885", + "ASIC_ID_LOW": "0x8d9e9a28", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:04:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 49.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "45.8", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e1", + "TDP": "0x26", + "TDC": "0x34", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x295e92", 
+ "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df7", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ff7", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x302e302e", + "GDDR_2_3_TEMP": "0x32303230", + "GDDR_4_5_TEMP": "0x34323430", + "GDDR_6_7_TEMP": "0x34343632", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x5", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x38a62485", + "ASIC_ID_LOW": "0x910f7598", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:05:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 52.0", + "power": " 38.0", + "aiclk": " 800", + "asic_temperature": "41.4", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": 
"0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x25", + "TDC": "0x33", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a2372", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ffd", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323430", + "GDDR_2_3_TEMP": "0x36363634", + "GDDR_4_5_TEMP": "0x34343432", + "GDDR_6_7_TEMP": "0x36343434", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x6", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x3b0df290", + "ASIC_ID_LOW": "0x34ed9b6c", + "AICLK_LIMIT_MAX": "0x546", 
+ "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:06:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.74", + "current": " 51.0", + "power": " 37.0", + "aiclk": " 800", + "asic_temperature": "42.1", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x11", + "TDC": "0x18", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x29a032", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3f7f", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + 
"GDDR_0_1_TEMP": "0x36323230", + "GDDR_2_3_TEMP": "0x36343634", + "GDDR_4_5_TEMP": "0x3634342e", + "GDDR_6_7_TEMP": "0x38343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x7", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x470e750f", + "ASIC_ID_LOW": "0xe3463989", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:07:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 24.0", + "power": " 17.0", + "aiclk": " 800", + "asic_temperature": "41.6", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x10", + "TDC": "0x16", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28894a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": 
"0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x1fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323230", + "GDDR_2_3_TEMP": "0x36343834", + "GDDR_4_5_TEMP": "0x3230322e", + "GDDR_6_7_TEMP": "0x32323632", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x8", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x2ee8b99a", + "ASIC_ID_LOW": "0x700a32f4", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:08:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 22.0", + "power": " 16.0", + "aiclk": " 800", + "asic_temperature": "40.5", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + 
"asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e2", + "TDP": "0x2a", + "TDC": "0x39", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2e4dda", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ffe", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x3a363834", + "GDDR_2_3_TEMP": "0x36363836", + "GDDR_4_5_TEMP": "0x36343632", + "GDDR_6_7_TEMP": "0x34343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x1", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xbc2704c4", + "ASIC_ID_LOW": "0x6388dc72", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:41:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": 
"N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 57.0", + "power": " 42.0", + "aiclk": " 800", + "asic_temperature": "46.3", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x24", + "TDC": "0x31", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28ebba", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ffd", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x38363836", + "GDDR_2_3_TEMP": "0x3a383836", + "GDDR_4_5_TEMP": "0x38363836", + "GDDR_6_7_TEMP": "0x38363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + 
"GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x2", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x3bbbc600", + "ASIC_ID_LOW": "0x3117fbe1", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:42:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 49.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "40.9", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x23", + "TDC": "0x30", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2b198a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + 
"DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df6", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fef", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x3a363836", + "GDDR_2_3_TEMP": "0x38363836", + "GDDR_4_5_TEMP": "0x38343836", + "GDDR_6_7_TEMP": "0x3a363834", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x3", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x1c2b9a0", + "ASIC_ID_LOW": "0x58a60566", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:43:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 48.0", + "power": " 35.0", + "aiclk": " 800", + "asic_temperature": "43.1", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + 
"BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e1", + "TDP": "0x27", + "TDC": "0x35", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2cb3b2", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfb", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x2fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323432", + "GDDR_2_3_TEMP": "0x36343632", + "GDDR_4_5_TEMP": "0x38343834", + "GDDR_6_7_TEMP": "0x36363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x4", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x701faaab", + "ASIC_ID_LOW": "0x606f3f29", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:44:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 53.0", + 
"power": " 39.0", + "aiclk": " 800", + "asic_temperature": "44.7", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e2", + "TDP": "0x24", + "TDC": "0x31", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a757a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfb", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fef", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x302e2e2a", + "GDDR_2_3_TEMP": "0x3430322e", + "GDDR_4_5_TEMP": "0x34303230", + "GDDR_6_7_TEMP": "0x36343432", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + 
"ASIC_LOCATION": "0x5", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xae417056", + "ASIC_ID_LOW": "0x353a8f9c", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:45:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 49.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "42.5", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x10", + "TDC": "0x16", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28ebba", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": 
"0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3f7f", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323430", + "GDDR_2_3_TEMP": "0x36343634", + "GDDR_4_5_TEMP": "0x36343232", + "GDDR_6_7_TEMP": "0x38343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x6", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x3e58f3fa", + "ASIC_ID_LOW": "0xc0c22e55", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:46:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.74", + "current": " 22.0", + "power": " 16.0", + "aiclk": " 800", + "asic_temperature": "40.9", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x12", + "TDC": "0x19", + "VDD_LIMITS": 
"0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x295e92", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfa", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fdf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34303030", + "GDDR_2_3_TEMP": "0x38343632", + "GDDR_4_5_TEMP": "0x34323432", + "GDDR_6_7_TEMP": "0x36343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x7", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x9d5d96f", + "ASIC_ID_LOW": "0xeed74c01", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:47:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 25.0", + "power": " 18.0", + "aiclk": " 800", + "asic_temperature": "41.4", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": 
"N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e4", + "TDP": "0x22", + "TDC": "0x2f", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2b198a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3bff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323430", + "GDDR_2_3_TEMP": "0x38323634", + "GDDR_4_5_TEMP": "0x3030322e", + "GDDR_6_7_TEMP": "0x3430322e", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x8", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": 
"0xf3bf945a", + "ASIC_ID_LOW": "0xdcd2b7ef", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:48:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 47.0", + "power": " 34.0", + "aiclk": " 800", + "asic_temperature": "43.1", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e9", + "TDP": "0x10", + "TDC": "0x16", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x28894a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df7", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fdf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + 
"PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36383836", + "GDDR_2_3_TEMP": "0x3a383838", + "GDDR_4_5_TEMP": "0x36343636", + "GDDR_6_7_TEMP": "0x34343636", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x1", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x5cfe5073", + "ASIC_ID_LOW": "0x42cfe4e1", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:81:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 22.0", + "power": " 16.0", + "aiclk": " 800", + "asic_temperature": "40.5", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x24", + "TDC": "0x31", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x292d5a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": 
"0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fdf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x38363636", + "GDDR_2_3_TEMP": "0x3a383838", + "GDDR_4_5_TEMP": "0x38383a38", + "GDDR_6_7_TEMP": "0x36383836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x2", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xff51a2a2", + "ASIC_ID_LOW": "0x23203905", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:82:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 49.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "41.2", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": 
"0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e7", + "TDP": "0x11", + "TDC": "0x18", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a4442", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df7", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3f7f", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36363836", + "GDDR_2_3_TEMP": "0x38383836", + "GDDR_4_5_TEMP": "0x38363836", + "GDDR_6_7_TEMP": "0x38383836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x3", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x8d27d7ec", + "ASIC_ID_LOW": "0xe45d68f6", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:83:00.0", + 
"board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 24.0", + "power": " 17.0", + "aiclk": " 800", + "asic_temperature": "42.3", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e2", + "TDP": "0x27", + "TDC": "0x35", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2c0fa2", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df6", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fbf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323432", + "GDDR_2_3_TEMP": "0x36323632", + "GDDR_4_5_TEMP": "0x36343834", + 
"GDDR_6_7_TEMP": "0x38363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x4", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xef2552b4", + "ASIC_ID_LOW": "0x90ff9acc", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:84:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 53.0", + "power": " 39.0", + "aiclk": " 800", + "asic_temperature": "44.1", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x14", + "TDC": "0x1b", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a757a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": 
"0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfa", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ff7", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x302e302e", + "GDDR_2_3_TEMP": "0x34303230", + "GDDR_4_5_TEMP": "0x34323432", + "GDDR_6_7_TEMP": "0x38343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x5", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x443a0b54", + "ASIC_ID_LOW": "0x29c8374c", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:85:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 27.0", + "power": " 20.0", + "aiclk": " 800", + "asic_temperature": "42.5", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + 
} + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e1", + "TDP": "0x26", + "TDC": "0x34", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2beed2", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfa", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x2fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36343432", + "GDDR_2_3_TEMP": "0x36363636", + "GDDR_4_5_TEMP": "0x34323230", + "GDDR_6_7_TEMP": "0x36343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x6", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xe2505c71", + "ASIC_ID_LOW": "0xeb8d9097", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:86:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + 
"telemetry": { + "voltage": "0.74", + "current": " 52.0", + "power": " 38.0", + "aiclk": " 800", + "asic_temperature": "43.9", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x24", + "TDC": "0x32", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a757a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfe", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ffe", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323030", + "GDDR_2_3_TEMP": "0x36343632", + "GDDR_4_5_TEMP": "0x34323630", + "GDDR_6_7_TEMP": "0x34343632", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + 
"GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x7", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x4300de24", + "ASIC_ID_LOW": "0xb9034c67", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:87:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 50.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "42.5", + "fan_speed": " 0", + "heartbeat": "67498" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e9", + "TDP": "0xf", + "TDC": "0x15", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x27f5a2", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": 
"0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df7", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3f7f", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323232", + "GDDR_2_3_TEMP": "0x36343634", + "GDDR_4_5_TEMP": "0x322e322e", + "GDDR_6_7_TEMP": "0x34323430", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x8", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xc4ac0ba6", + "ASIC_ID_LOW": "0x46891f5b", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:88:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 21.0", + "power": " 15.0", + "aiclk": " 800", + "asic_temperature": "40.0", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": 
"0x2e3", + "TDP": "0x20", + "TDC": "0x2c", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2ad7ea", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fbf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x38343634", + "GDDR_2_3_TEMP": "0x38343634", + "GDDR_4_5_TEMP": "0x34323434", + "GDDR_6_7_TEMP": "0x34323632", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x1", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x942a743e", + "ASIC_ID_LOW": "0x637088a4", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c1:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 44.0", + "power": " 32.0", + "aiclk": " 800", + "asic_temperature": "42.8", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": 
{ + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x23", + "TDC": "0x30", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2cd482", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df6", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ffe", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x38383836", + "GDDR_2_3_TEMP": "0x36363636", + "GDDR_4_5_TEMP": "0x38363634", + "GDDR_6_7_TEMP": "0x38363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x2", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, 
+ "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xc5898618", + "ASIC_ID_LOW": "0x78af4f3e", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c2:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 48.0", + "power": " 35.0", + "aiclk": " 800", + "asic_temperature": "44.8", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x11", + "TDC": "0x17", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2a4442", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x1fff", + "ENABLED_ETH": "0x3edf", + 
"ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x36363836", + "GDDR_2_3_TEMP": "0x38363832", + "GDDR_4_5_TEMP": "0x3a363834", + "GDDR_6_7_TEMP": "0x38363836", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x3a", + "ASIC_LOCATION": "0x3", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xd8f9ffdc", + "ASIC_ID_LOW": "0x3d1b9f23", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c3:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 23.0", + "power": " 17.0", + "aiclk": " 800", + "asic_temperature": "42.3", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e4", + "TDP": "0x22", + "TDC": "0x2e", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2af8ba", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + 
"AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x2fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34343432", + "GDDR_2_3_TEMP": "0x36343632", + "GDDR_4_5_TEMP": "0x38343638", + "GDDR_6_7_TEMP": "0x36363834", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x38", + "ASIC_LOCATION": "0x4", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xea25c6db", + "ASIC_ID_LOW": "0xab35c1ca", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c4:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 46.0", + "power": " 34.0", + "aiclk": " 800", + "asic_temperature": "43.0", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + 
"gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e3", + "TDP": "0x25", + "TDC": "0x33", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2deb6a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62dfc", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fef", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x302e302c", + "GDDR_2_3_TEMP": "0x34323230", + "GDDR_4_5_TEMP": "0x34323432", + "GDDR_6_7_TEMP": "0x36343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x5", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x9019d0f8", + "ASIC_ID_LOW": "0x5c61ff49", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + 
"board_info": { + "bus_id": "0000:c5:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 51.0", + "power": " 37.0", + "aiclk": " 800", + "asic_temperature": "45.9", + "fan_speed": " 0", + "heartbeat": "67498" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e8", + "TDP": "0x10", + "TDC": "0x16", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x2847aa", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3ff7", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x32323232", + "GDDR_2_3_TEMP": "0x36343434", + 
"GDDR_4_5_TEMP": "0x34323430", + "GDDR_6_7_TEMP": "0x36343634", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x6", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x89f9d5c7", + "ASIC_ID_LOW": "0x22a507e9", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c6:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.74", + "current": " 22.0", + "power": " 16.0", + "aiclk": " 800", + "asic_temperature": "40.3", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e9", + "TDP": "0x11", + "TDC": "0x17", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x293dc2", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x0", + "L2CPUCLK1": "0x0", + "L2CPUCLK2": "0x0", + "L2CPUCLK3": "0x0", + "ETH_LIVE_STATUS": "0x0", + 
"DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df9", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x1fff", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34303230", + "GDDR_2_3_TEMP": "0x36343432", + "GDDR_4_5_TEMP": "0x34303430", + "GDDR_6_7_TEMP": "0x36323432", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x7", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0xa4d189d2", + "ASIC_ID_LOW": "0x740a2f25", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c7:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 23.0", + "power": " 17.0", + "aiclk": " 800", + "asic_temperature": "41.2", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + 
"thm_limit": "110", + "bus_peak_limit": 0 + } + }, + { + "smbus_telem": { + "BOARD_ID_HIGH": "0x471", + "BOARD_ID_LOW": "0x31831011", + "ASIC_ID": null, + "HARVESTING_STATE": "0x0", + "UPDATE_TELEM_SPEED": "0x64", + "VCORE": "0x2e9", + "TDP": "0x24", + "TDC": "0x31", + "VDD_LIMITS": "0x38402bc", + "THM_LIMIT_SHUTDOWN": "0x6e", + "ASIC_TEMPERATURE": "0x27e53a", + "VREG_TEMPERATURE": "0x0", + "BOARD_TEMPERATURE": "0x0", + "AICLK": "0x320", + "AXICLK": "0x3c0", + "ARCCLK": "0x320", + "L2CPUCLK0": "0x320", + "L2CPUCLK1": "0x320", + "L2CPUCLK2": "0x320", + "L2CPUCLK3": "0x320", + "ETH_LIVE_STATUS": "0x0", + "DDR_STATUS": "0x5555", + "DDR_SPEED": "0x36b0", + "ETH_FW_VERSION": "0x0", + "GDDR_FW_VERSION": "0x2000b", + "DM_APP_FW_VERSION": "0x0", + "DM_BL_FW_VERSION": "0x0", + "FLASH_BUNDLE_VERSION": "0x13050000", + "CM_FW_VERSION": "0x1b0000", + "L2CPU_FW_VERSION": "0x0", + "FAN_SPEED": "0x0", + "TIMER_HEARTBEAT": "0x62df8", + "TELEMETRY_ENUM_COUNT": "0x45", + "ENABLED_TENSIX_COL": "0x3fdf", + "ENABLED_ETH": "0x3edf", + "ENABLED_GDDR": "0xff", + "ENABLED_L2CPU": "0xf", + "PCIE_USAGE": "0x4", + "NOC_TRANSLATION": "0x1", + "FAN_RPM": "0x0", + "GDDR_0_1_TEMP": "0x34323230", + "GDDR_2_3_TEMP": "0x36323432", + "GDDR_4_5_TEMP": "0x322e302e", + "GDDR_6_7_TEMP": "0x32303030", + "GDDR_0_1_CORR_ERRS": "0xff00ff00", + "GDDR_2_3_CORR_ERRS": "0xff00ff00", + "GDDR_4_5_CORR_ERRS": "0xff00ff00", + "GDDR_6_7_CORR_ERRS": "0xff00ff00", + "GDDR_UNCORR_ERRS": "0xaaaa", + "MAX_GDDR_TEMP": "0x36", + "ASIC_LOCATION": "0x8", + "BOARD_POWER_LIMIT": "0x0", + "TDC_LIMIT_MAX": "0x1f4", + "THM_LIMIT_THROTTLE": "0x5a", + "TT_FLASH_VERSION": null, + "THERM_TRIP_COUNT": "0x0", + "ASIC_ID_HIGH": "0x1092e6d0", + "ASIC_ID_LOW": "0xdbd47817", + "AICLK_LIMIT_MAX": "0x546", + "TDP_LIMIT_MAX": "0x82", + "NUMBER_OF_TAGS": "0x320" + }, + "board_info": { + "bus_id": "0000:c8:00.0", + "board_type": "tt-galaxy-bh", + "board_id": "0000047131831011", + "coords": "N/A", + "dram_status": true, + "dram_speed": "14G", + 
"pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.74", + "current": " 49.0", + "power": " 36.0", + "aiclk": " 800", + "asic_temperature": "39.9", + "fan_speed": " 0", + "heartbeat": "67497" + }, + "firmwares": { + "fw_bundle_version": "19.5.0.0", + "tt_flash_version": "N/A", + "cm_fw": "0.27.0.0", + "cm_fw_date": "2020-00-27", + "eth_fw": "0.0.0", + "dm_bl_fw": "0.0.0.0", + "dm_app_fw": "0.0.0.0", + "gddr_fw": "2.11" + }, + "limits": { + "vdd_min": "0.70", + "vdd_max": "0.90", + "tdp_limit": "130", + "tdc_limit": "500", + "asic_fmax": "1350", + "therm_trip_l1_limit": "90", + "thm_limit": "110", + "bus_peak_limit": 0 + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json b/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json new file mode 100644 index 0000000000..6aab9062ab --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json @@ -0,0 +1,18 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "empty-system", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "16.00 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/empty_json.json b/runner/internal/shim/host/testdata/tenstorrent/empty_json.json new file mode 100644 index 0000000000..451cbbb316 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/empty_json.json @@ -0,0 +1 @@ +{} diff --git a/runner/internal/shim/host/testdata/tenstorrent/galaxy.json b/runner/internal/shim/host/testdata/tenstorrent/galaxy.json new file mode 100644 index 0000000000..efca1df451 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/galaxy.json @@ -0,0 +1,2996 @@ + +{ + "time": "2025-08-04T06:58:41.208516", + "host_info": { + "OS": "Linux", + 
"Distro": "Ubuntu 22.04.5 LTS", + "Kernel": "5.15.0-151-generic", + "Hostname": "UF-MN-A6-GWH02", + "Platform": "x86_64", + "Python": "3.10.12", + "Memory": "566.12 GB", + "Driver": "TT-KMD 2.3.0" + }, + "host_sw_vers": { + "tt_smi": "3.0.26", + "pyluwen": "0.7.9" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2c02a", + "ASIC_IDD": "0x819", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b161", + "ARC1_HEALTH": "0x13fede", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab6", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x20901f9", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0013", + "TDC": "0xf00018", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f21201f", + "ASIC_TMON1": "0x1f20", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b161" + }, + "board_info": { + "bus_id": "0000:c1:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + 
"pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 24.0", + "power": " 19.0", + "aiclk": " 500", + "asic_temperature": "31.6", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e1ed", + "ASIC_IDD": "0xb70", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b4a4", + "ARC1_HEALTH": "0x13fede", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab4", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2100202", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001a", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f202120", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, 
+ "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b4a4" + }, + "board_info": { + "bus_id": "0000:c2:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 26.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "32.1", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e5e3", + "ASIC_IDD": "0xf56", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ab6b", + "ARC1_HEALTH": "0x13fede", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab3", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x20d0201", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0016", + "TDC": "0xf0001b", + 
"VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x20212020", + "ASIC_TMON1": "0x2020", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ab6b" + }, + "board_info": { + "bus_id": "0000:c3:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 22.0", + "aiclk": " 500", + "asic_temperature": "32.1", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2db02", + "ASIC_IDD": "0xbcb", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ad7c", + "ARC1_HEALTH": "0x13fede", + 
"ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab4", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x219020e", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x2123211f", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ad7c" + }, + "board_info": { + "bus_id": "0000:c4:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "32.9", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2c6e5", + "ASIC_IDD": "0xa07", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", 
+ "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424acf8", + "ARC1_HEALTH": "0x13ffd0", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab6", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x1f801ea", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f1f201e", + "ASIC_TMON1": "0x1f1e", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424acf8" + }, + "board_info": { + "bus_id": "0000:c5:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "30.6", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + 
"ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2c81a", + "ASIC_IDD": "0x90c", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10840000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424a4c4", + "ARC1_HEALTH": "0x13ffd0", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab1", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x20201f4", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0012", + "TDC": "0xf00017", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f201f20", + "ASIC_TMON1": "0x201f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424a4c4" + }, + "board_info": { + "bus_id": "0000:c6:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.80", + "current": " 23.0", + "power": " 18.0", + "aiclk": " 500", + "asic_temperature": "31.2", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", 
+ "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e0cc", + "ASIC_IDD": "0xca8", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ad4c", + "ARC1_HEALTH": "0x13fede", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab5", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x20201f2", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x201f1f1f", + "ASIC_TMON1": "0x201f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ad4c" + }, + "board_info": { + "bus_id": "0000:c7:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": 
"1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "31.1", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2bede", + "ASIC_IDD": "0x8a5", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424bcc6", + "ARC1_HEALTH": "0x13fede", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab9", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31e", + "ASIC_TEMPERATURE": "0x2100202", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0013", + "TDC": "0xf00017", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x2120201f", + "ASIC_TMON1": "0x2120", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + 
"TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424bcc6" + }, + "board_info": { + "bus_id": "0000:c8:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 23.0", + "power": " 19.0", + "aiclk": " 500", + "asic_temperature": "32.1", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e9d6", + "ASIC_IDD": "0xe1d", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ad91", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab1", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x2120202", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0016", + "TDC": "0xf0001b", + "VDD_LIMITS": "0x3e80320", + 
"THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f212120", + "ASIC_TMON1": "0x2021", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55352", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ad91" + }, + "board_info": { + "bus_id": "0000:81:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 22.0", + "aiclk": " 500", + "asic_temperature": "32.1", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e546", + "ASIC_IDD": "0xd51", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ac36", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + 
"ARC3_HEALTH": "0x346ab4", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x1f801ea", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1e201f20", + "ASIC_TMON1": "0x1c1f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ac36" + }, + "board_info": { + "bus_id": "0000:82:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "30.6", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d8ef", + "ASIC_IDD": "0xb08", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": 
"0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424acdc", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab2", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2120204", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x20212120", + "ASIC_TMON1": "0x2120", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424acdc" + }, + "board_info": { + "bus_id": "0000:83:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "32.2", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": 
"0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d71f", + "ASIC_IDD": "0xcb8", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x54249348", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346aa2", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x2210218", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001a", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21222322", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55351", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x54249348" + }, + "board_info": { + "bus_id": "0000:84:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 26.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "33.5", + "heartbeat": "687034" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": 
"129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2ddc8", + "ASIC_IDD": "0xb97", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b28e", + "ARC1_HEALTH": "0x13ffcc", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab6", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x20201f0", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0016", + "TDC": "0xf0001b", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1e201e1f", + "ASIC_TMON1": "0x201f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b28e" + }, + "board_info": { + "bus_id": "0000:85:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + 
"telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 22.0", + "aiclk": " 500", + "asic_temperature": "31.0", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e9ef", + "ASIC_IDD": "0xf88", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10840000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424afa8", + "ARC1_HEALTH": "0x13ffcc", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab4", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x1fd01f0", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001b", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f1f1f1f", + "ASIC_TMON1": "0x201f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + 
"TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424afa8" + }, + "board_info": { + "bus_id": "0000:86:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "31.0", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e92d", + "ASIC_IDD": "0xf4f", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ab0f", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab3", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2120202", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001a", + "VDD_LIMITS": "0x3e80320", + 
"THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f212020", + "ASIC_TMON1": "0x2021", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ab0f" + }, + "board_info": { + "bus_id": "0000:87:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 26.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "32.1", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d980", + "ASIC_IDD": "0xc62", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ae80", + "ARC1_HEALTH": "0x13feda", + "ARC2_HEALTH": null, + 
"ARC3_HEALTH": "0x346ab6", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x2110205", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x2021201f", + "ASIC_TMON1": "0x2221", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ae80" + }, + "board_info": { + "bus_id": "0000:88:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "32.3", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d282", + "ASIC_IDD": "0xacb", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": 
"0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b6c8", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab4", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x218020a", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x20212121", + "ASIC_TMON1": "0x2120", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b6c8" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "32.6", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": 
"0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2c1a6", + "ASIC_IDD": "0x85c", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424a78c", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab0", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x22d021a", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00018", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21212322", + "ASIC_TMON1": "0x2222", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55352", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424a78c" + }, + "board_info": { + "bus_id": "0000:02:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 24.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "33.6", + "heartbeat": "687036" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": 
"129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2cd3f", + "ASIC_IDD": "0x998", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b908", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab5", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2550246", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x23252525", + "ASIC_TMON1": "0x2525", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b908" + }, + "board_info": { + "bus_id": "0000:03:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + 
"telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "36.4", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2ccb9", + "ASIC_IDD": "0x844", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424a2cd", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346aa9", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x228021a", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21222221", + "ASIC_TMON1": "0x2322", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55352", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + 
"TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424a2cd" + }, + "board_info": { + "bus_id": "0000:04:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "33.6", + "heartbeat": "687035" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d76b", + "ASIC_IDD": "0x94b", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b432", + "ARC1_HEALTH": "0x13ffc8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab5", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x220020d", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + 
"THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x2121221f", + "ASIC_TMON1": "0x2122", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b432" + }, + "board_info": { + "bus_id": "0000:05:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "32.8", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d806", + "ASIC_IDD": "0x812", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10840000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b4ca", + "ARC1_HEALTH": "0x13ffc8", + "ARC2_HEALTH": null, + 
"ARC3_HEALTH": "0x346ab7", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x2200210", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f212222", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b4ca" + }, + "board_info": { + "bus_id": "0000:06:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "33.0", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2cb16", + "ASIC_IDD": "0x7be", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": 
"0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ac4d", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab2", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31e", + "ASIC_TEMPERATURE": "0x2180208", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0012", + "TDC": "0xf00017", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x20212120", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ac4d" + }, + "board_info": { + "bus_id": "0000:07:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 23.0", + "power": " 18.0", + "aiclk": " 500", + "asic_temperature": "32.5", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": 
"0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2cf32", + "ASIC_IDD": "0x945", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ac14", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab0", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2220210", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0013", + "TDC": "0xf00018", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x20222221", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55352", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ac14" + }, + "board_info": { + "bus_id": "0000:08:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 24.0", + "power": " 19.0", + "aiclk": " 500", + "asic_temperature": "33.0", + "heartbeat": "687036" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": 
"129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2d763", + "ASIC_IDD": "0xbf8", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424ad88", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab3", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x236022a", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf0001a", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x22232322", + "ASIC_TMON1": "0x2323", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424ad88" + }, + "board_info": { + "bus_id": "0000:41:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + 
"telemetry": { + "voltage": "0.80", + "current": " 26.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "34.6", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e387", + "ASIC_IDD": "0xaa7", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b0b9", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab9", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31c", + "ASIC_TEMPERATURE": "0x2340228", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001b", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x22232321", + "ASIC_TMON1": "0x2423", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + 
"TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b0b9" + }, + "board_info": { + "bus_id": "0000:42:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "34.5", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2cc08", + "ASIC_IDD": "0x8e4", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424aadd", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab2", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x225021c", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0013", + "TDC": "0xf00018", + "VDD_LIMITS": "0x3e80320", + 
"THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21222322", + "ASIC_TMON1": "0x2122", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424aadd" + }, + "board_info": { + "bus_id": "0000:43:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 24.0", + "power": " 19.0", + "aiclk": " 500", + "asic_temperature": "33.8", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2ec5a", + "ASIC_IDD": "0xd6a", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b0d4", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + 
"ARC3_HEALTH": "0x346ab6", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x235022e", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0016", + "TDC": "0xf0001b", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x23232422", + "ASIC_TMON1": "0x2423", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b0d4" + }, + "board_info": { + "bus_id": "0000:44:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 27.0", + "power": " 22.0", + "aiclk": " 500", + "asic_temperature": "34.9", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e682", + "ASIC_IDD": "0xc78", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": 
"0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b600", + "ARC1_HEALTH": "0x13ffca", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab5", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2380229", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21232324", + "ASIC_TMON1": "0x2323", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b600" + }, + "board_info": { + "bus_id": "0000:45:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "34.6", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": 
"0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2db26", + "ASIC_IDD": "0x9d8", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10840000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b0e5", + "ARC1_HEALTH": "0x13ffca", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab7", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2400231", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x23242423", + "ASIC_TMON1": "0x2224", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b0e5" + }, + "board_info": { + "bus_id": "0000:46:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "8" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "35.1", + "heartbeat": "687038" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": 
"129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2f20e", + "ASIC_IDD": "0xdce", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x5424b1ad", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346ab3", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x21c0209", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0015", + "TDC": "0xf0001a", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x1f212121", + "ASIC_TMON1": "0x2121", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55353", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + "TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x5424b1ad" + }, + "board_info": { + "bus_id": "0000:47:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + 
"telemetry": { + "voltage": "0.80", + "current": " 26.0", + "power": " 21.0", + "aiclk": " 500", + "asic_temperature": "32.6", + "heartbeat": "687037" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100035100000000", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2e671", + "ASIC_IDD": "0xce7", + "BOARD_ID_HIGH": "0x1000351", + "BOARD_ID_LOW": null, + "ARC0_FW_VERSION": "0x2220000", + "ARC1_FW_VERSION": "0x2220000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2220000", + "SPIBOOTROM_FW_VERSION": "0x30d0000", + "ETH_FW_VERSION": "0x70000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x5040000", + "DDR_SPEED": null, + "DDR_STATUS": "0x1222222", + "ETH_STATUS0": "0x22222222", + "ETH_STATUS1": "0x22222222", + "PCIE_STATUS": "0x10140000", + "FAULTS": null, + "ARC0_HEALTH": "0x54249971", + "ARC1_HEALTH": "0x13fed8", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x346aa7", + "FAN_SPEED": "0xffffffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31d", + "ASIC_TEMPERATURE": "0x2320224", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x212f2d", + "TDP": "0xaa0014", + "TDC": "0xf00019", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x5616150e", + "ASIC_TMON0": "0x21232322", + "ASIC_TMON1": "0x2323", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0xda7e", + "RT_SECONDS": "0x55352", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": null, + "ETH_DEBUG_STATUS1": null, + 
"TT_FLASH_VERSION": "0x30400", + "FW_BUNDLE_VERSION": "0x12060000", + "TIMER_HEARTBEAT": "0x54249971" + }, + "board_info": { + "bus_id": "0000:48:00.0", + "board_type": "tt-galaxy-wh L", + "board_id": "100035100000000", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "14G", + "pcie_speed": 4, + "pcie_width": "1" + }, + "telemetry": { + "voltage": "0.80", + "current": " 25.0", + "power": " 20.0", + "aiclk": " 500", + "asic_temperature": "34.2", + "heartbeat": "687035" + }, + "firmwares": { + "fw_bundle_version": "18.6.0.0", + "tt_flash_version": "0.3.4.0", + "cm_fw": "2.34.0.0", + "cm_fw_date": "2025-06-22", + "eth_fw": "7.0.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.4.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "170", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json b/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json new file mode 100644 index 0000000000..3cd2b46b13 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json @@ -0,0 +1 @@ +{"device_info": [{"board_info": {"board_type": "n150"}} diff --git a/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json b/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json new file mode 100644 index 0000000000..97563f9ccb --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json @@ -0,0 +1,18 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "incomplete-system", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "16.00 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "other_field": "value" +} diff --git 
a/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json b/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json new file mode 100644 index 0000000000..5b3ca7371d --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json @@ -0,0 +1,110 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "7330093c7194", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "30.46 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID": "0x100018611902010", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000186", + "BOARD_ID_LOW": "0x11902010", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + 
"AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n150 L", + "board_id": "100018611902010", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "4" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json b/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json new file mode 100644 index 0000000000..4fd62a0b38 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json @@ -0,0 +1,732 @@ +{ + "time": "2025-06-11T03:37:23.927792", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 22.04.5 LTS", + "Kernel": "5.15.0-141-generic", + "Hostname": "TT-QuietBox", + "Platform": "x86_64", + "Python": "3.10.12", + "Memory": "503.45 GB", + "Driver": "TT-KMD 1.34" + }, + "host_sw_vers": { + "tt_smi": "3.0.20", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID": "0x10001451172208f", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172208f", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": 
"0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:c1:00.0", + "board_type": "n300 L", + "board_id": "10001451172208f", + "coords": "(1, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722053", + 
"ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722053", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n300 L", + "board_id": "100014511722053", + "coords": "(1, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "temperature": 48.7, + "power_consumption": 15.2 + }, + "firmwares": { + "version": "1.2.5" + }, + "limits": { + "max_temp": 85.0, + "max_power": 25.0 + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172209c", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": 
"0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172209c", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:02:00.0", + "board_type": "n300 L", + "board_id": "10001451172209c", + "coords": "(2, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + 
"tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722058", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722058", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:41:00.0", + "board_type": "n300 L", + "board_id": "100014511722058", + "coords": "(2, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": 
{ + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172208f", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172208f", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": 
"10001451172208f", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722053", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722053", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": 
"0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "100014511722053", + "coords": "(0, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172209c", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172209c", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + 
"TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "10001451172209c", + "coords": "(3, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722058", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722058", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + 
"ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "100014511722058", + "coords": "(3, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json b/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json new file mode 100644 index 0000000000..0f48fffe9f --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json @@ -0,0 +1,46 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + 
"Kernel": "5.15.0-138-generic", + "Hostname": "7330093c7194", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "30.46 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "temp": 45.2, + "power": 12.5 + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n150 L", + "board_id": "100018611902010", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "4" + }, + "telemetry": { + "temperature": 45.2, + "power_consumption": 12.5 + }, + "firmwares": { + "version": "1.2.3" + }, + "limits": { + "max_temp": 85.0, + "max_power": 25.0 + } + } + ] +} diff --git a/runner/internal/shim/host_info.go b/runner/internal/shim/host_info.go new file mode 100644 index 0000000000..2634d939c3 --- /dev/null +++ b/runner/internal/shim/host_info.go @@ -0,0 +1,75 @@ +package shim + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + + "github.com/dstackai/dstack/runner/internal/common/gpu" +) + +type hostInfo struct { + GpuVendor gpu.GpuVendor `json:"gpu_vendor"` + GpuName string `json:"gpu_name"` + GpuMemory int `json:"gpu_memory"` // MiB + GpuCount int `json:"gpu_count"` + Addresses []string `json:"addresses"` + DiskSize uint64 `json:"disk_size"` // bytes + NumCPUs int `json:"cpus"` + Memory uint64 `json:"memory"` // bytes +} + +func WriteHostInfo(dir string, resources Resources) error { + path := filepath.Join(dir, "host_info.json") + // if host_info.json already exists, do nothing and return os.ErrExist + if _, err := os.Stat(path); !errors.Is(err, os.ErrNotExist) { + return err + } + + gpuVendor := gpu.GpuVendorNone + gpuCount := 0 + gpuMemory := 0 + gpuName := "" + gpus := resources.Gpus + if len(gpus) > 0 { + gpuCount = len(gpus) + gpuVendor = gpus[0].Vendor + gpuMemory = gpus[0].Vram + gpuName = gpus[0].Name + } + info := hostInfo{ + GpuVendor: gpuVendor, + GpuName: gpuName, + GpuMemory: 
gpuMemory, + GpuCount: gpuCount, + Addresses: resources.NetAddresses, + DiskSize: resources.DiskSize, + NumCPUs: resources.CpuCount, + Memory: resources.TotalMemory, + } + + b, err := json.Marshal(info) + if err != nil { + return fmt.Errorf("failed to marshal %s: %w", path, err) + } + + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("failed to create %s: %w", path, err) + } + defer f.Close() + + _, err = f.Write(b) + if err != nil { + return fmt.Errorf("failed to write %s: %w", path, err) + } + + err = f.Sync() + if err != nil { + return fmt.Errorf("failed to fsync %s: %w", path, err) + } + + return nil +} diff --git a/runner/internal/shim/models.go b/runner/internal/shim/models.go index 2ca7fa9ef2..6cc0935053 100644 --- a/runner/internal/shim/models.go +++ b/runner/internal/shim/models.go @@ -1,89 +1,131 @@ package shim import ( - "encoding/base64" - "encoding/json" - "log" - "github.com/docker/docker/api/types/mount" - "github.com/docker/docker/api/types/registry" ) type DockerParameters interface { + DockerPassEnv() []string DockerPrivileged() bool - DockerKeepContainer() bool - DockerShellCommands([]string) []string + DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string DockerMounts(string) ([]mount.Mount, error) DockerPorts() []int - MakeRunnerDir() (string, error) + MakeRunnerDir(name string) (string, error) DockerPJRTDevice() string } type CLIArgs struct { Shim struct { - HTTPPort int - HomeDir string + HTTPPort int + HomeDir string + BinaryPath string + LogLevel string } Runner struct { - HTTPPort int - LogLevel int - Version string - DevChannel bool - BinaryPath string - TempDir string - HomeDir string - WorkingDir string + HTTPPort int + SSHPort int + SSHLogLevel string + DownloadURL string + BinaryPath string + LogLevel string + } + + DCGMExporter struct { + HTTPPort int + Interval int // milliseconds + } + + DCGM struct { + Address string } Docker struct { - SSHPort int - KeepContainer bool - 
// TaskConfig is the configuration of a single task; the json tags define
// its wire format.
// NOTE(review): presumably decoded from the request body sent by the dstack
// server - confirm against the API handlers.
type TaskConfig struct {
	ID               string               `json:"id"`
	Name             string               `json:"name"`
	RegistryUsername string               `json:"registry_username"`
	RegistryPassword string               `json:"registry_password"`
	ImageName        string               `json:"image_name"`
	ContainerUser    string               `json:"container_user"`
	Privileged       bool                 `json:"privileged"`
	GPU              int                  `json:"gpu"`      // -1 = all available, even if zero; 0 = zero, ...
	CPU              float64              `json:"cpu"`      // 0.0 = all available; 0.5 = a half of CPU, ...
	Memory           int64                `json:"memory"`   // bytes; 0 = all available
	ShmSize          int64                `json:"shm_size"` // bytes; 0 = default (64MiB)
	NetworkMode      NetworkMode          `json:"network_mode"`
	Volumes          []VolumeInfo         `json:"volumes"`
	VolumeMounts     []VolumeMountPoint   `json:"volume_mounts"`
	InstanceMounts   []InstanceMountPoint `json:"instance_mounts"`
	// GPUDevices allows the server to set gpu devices instead of relying on the runner default logic.
	// E.g. passing nvidia devices directly instead of using nvidia-container-toolkit.
	GPUDevices       []GPUDevice `json:"gpu_devices"`
	HostSshUser      string      `json:"host_ssh_user"`
	HostSshKeys      []string    `json:"host_ssh_keys"`
	ContainerSshKeys []string    `json:"container_ssh_keys"`
}
"github.com/dstackai/dstack/runner/internal/shim/host" +) + +var ErrNoCapacity = errors.New("no capacity") + +type Resources struct { + Gpus []host.GpuInfo + CpuCount int + TotalMemory uint64 // bytes + DiskSize uint64 // bytes + NetAddresses []string +} + +type GpuLock struct { + // resource ID: locked mapping, where resource ID is vendor-specific: + // NVIDIA: host.GpuInfo.ID + // AMD: host.GpuInfo.RenderNodePath + lock map[string]bool + mu sync.Mutex +} + +func NewGpuLock(gpus []host.GpuInfo) (*GpuLock, error) { + lock := make(map[string]bool, len(gpus)) + if len(gpus) > 0 { + vendor := gpus[0].Vendor + for _, gpu_ := range gpus { + if gpu_.Vendor != vendor { + return nil, errors.New("multiple GPU vendors detected") + } + var resourceID string + switch vendor { + case gpu.GpuVendorNvidia: + resourceID = gpu_.ID + case gpu.GpuVendorAmd: + resourceID = gpu_.RenderNodePath + case gpu.GpuVendorTenstorrent: + resourceID = gpu_.Index + case gpu.GpuVendorIntel: + resourceID = gpu_.Index + case gpu.GpuVendorNone: + return nil, fmt.Errorf("unexpected GPU vendor %s", vendor) + default: + return nil, fmt.Errorf("unexpected GPU vendor %s", vendor) + } + lock[resourceID] = false + } + } + return &GpuLock{lock: lock}, nil +} + +// Acquire returns a requested number of GPU resource IDs, marking them locked (busy) +// If there are not enough idle GPUs, none is locked and ErrNoCapacity is returned +// -1 means "all available GPUs", even if none, that is, Acquire(-1) never fails, +// even on hosts without GPU +// To release acquired GPUs, pass the returned resource IDs to Release() method +func (gl *GpuLock) Acquire(ctx context.Context, count int) ([]string, error) { + if count == 0 || count < -1 { + return nil, fmt.Errorf("count must be either positive or -1, got %d", count) + } + gl.mu.Lock() + defer gl.mu.Unlock() + var size int + if count > 0 { + size = count + } else { + size = len(gl.lock) + } + ids := make([]string, 0, size) + for id, locked := range gl.lock { + if !locked 
{ + ids = append(ids, id) + } + if count > 0 && len(ids) >= count { + break + } + } + if len(ids) < count { + return nil, fmt.Errorf("%w: %d GPUs requested, %d available", ErrNoCapacity, count, len(ids)) + } + for _, id := range ids { + gl.lock[id] = true + } + return ids, nil +} + +// Lock marks passed Resource IDs as locked (busy) +// This method never fails, it's safe to lock already locked resource or try to lock unknown resource +// The returned slice contains only actually locked resource IDs +func (gl *GpuLock) Lock(ctx context.Context, ids []string) []string { + gl.mu.Lock() + defer gl.mu.Unlock() + lockedIDs := make([]string, 0, len(ids)) + for _, id := range ids { + if locked, ok := gl.lock[id]; !ok { + log.Warning(ctx, "skip locking: unknown GPU resource", "id", id) + } else if locked { + log.Info(ctx, "skip locking: GPU already locked", "id", id) + } else { + gl.lock[id] = true + lockedIDs = append(lockedIDs, id) + } + } + return lockedIDs +} + +// Release marks passed Resource IDs as idle +// This method never fails, it's safe to release already idle resource or try to release unknown resource +// The returned slice contains only actually released resource IDs +func (gl *GpuLock) Release(ctx context.Context, ids []string) []string { + gl.mu.Lock() + defer gl.mu.Unlock() + releasedIDs := make([]string, 0, len(ids)) + for _, id := range ids { + if locked, ok := gl.lock[id]; !ok { + log.Warning(ctx, "skip releasing: unknown GPU resource", "id", id) + } else if !locked { + log.Info(ctx, "skip releasing: GPU not locked", "id", id) + } else { + gl.lock[id] = false + releasedIDs = append(releasedIDs, id) + } + } + return releasedIDs +} diff --git a/runner/internal/shim/resources_test.go b/runner/internal/shim/resources_test.go new file mode 100644 index 0000000000..424ff55b41 --- /dev/null +++ b/runner/internal/shim/resources_test.go @@ -0,0 +1,209 @@ +package shim + +import ( + "context" + "testing" + + "github.com/dstackai/dstack/runner/internal/common/gpu" 
+ "github.com/dstackai/dstack/runner/internal/shim/host" + "github.com/stretchr/testify/assert" +) + +func TestNewGpuLock_NoGpus(t *testing.T) { + var gpus []host.GpuInfo + gl, err := NewGpuLock(gpus) + assert.Nil(t, err) + assert.Equal(t, map[string]bool{}, gl.lock) +} + +func TestNewGpuLock_NvidiaGpus(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + } + gl, err := NewGpuLock(gpus) + assert.Nil(t, err) + expected := map[string]bool{ + "GPU-beef": false, + "GPU-f00d": false, + } + assert.Equal(t, expected, gl.lock) +} + +func TestNewGpuLock_AmdGpus(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD128"}, + {Vendor: gpu.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD129"}, + } + gl, err := NewGpuLock(gpus) + assert.Nil(t, err) + expected := map[string]bool{ + "/dev/dri/renderD128": false, + "/dev/dri/renderD129": false, + } + assert.Equal(t, expected, gl.lock) +} + +func TestNewGpuLock_ErrorMultipleVendors(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorAmd}, + {Vendor: gpu.GpuVendorNvidia}, + } + gl, err := NewGpuLock(gpus) + assert.Nil(t, gl) + assert.ErrorContains(t, err, "multiple GPU vendors") +} + +func TestGpuLock_Acquire_ErrorBadCount(t *testing.T) { + gl, _ := NewGpuLock([]host.GpuInfo{}) + + ids, err := gl.Acquire(context.Background(), 0) + assert.ErrorContains(t, err, "count must be either positive or -1, got 0") + assert.Equal(t, 0, len(ids)) + + ids, err = gl.Acquire(context.Background(), -2) + assert.ErrorContains(t, err, "count must be either positive or -1, got -2") + assert.Equal(t, 0, len(ids)) +} + +func TestGpuLock_Acquire_All_Available(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-f00d"] = true + 
ids, err := gl.Acquire(context.Background(), -1) + assert.Nil(t, err) + assert.ElementsMatch(t, []string{"GPU-beef", "GPU-c0de"}, ids) + assert.True(t, gl.lock["GPU-beef"], "GPU-beef") + assert.True(t, gl.lock["GPU-f00d"], "GPU-f00d") + assert.True(t, gl.lock["GPU-c0de"], "GPU-c0de") +} + +func TestGpuLock_Acquire_All_NoneAvailable(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-beef"] = true + gl.lock["GPU-f00d"] = true + ids, err := gl.Acquire(context.Background(), -1) + assert.Nil(t, err) + assert.Equal(t, 0, len(ids)) +} + +func TestGpuLock_Acquire_All_NoGpus(t *testing.T) { + gl, _ := NewGpuLock([]host.GpuInfo{}) + ids, err := gl.Acquire(context.Background(), -1) + assert.Nil(t, err) + assert.Equal(t, 0, len(ids)) +} + +func TestGpuLock_Acquire_Count_OK(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-cafe"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-f00d"] = true + ids, err := gl.Acquire(context.Background(), 2) + assert.Nil(t, err) + assert.Equal(t, 2, len(ids)) + assert.NotEqual(t, ids[0], ids[1]) + assert.NotContains(t, ids, "GPU-f00d") + for id, locked := range gl.lock { + switch id { + case "GPU-f00d", ids[0], ids[1]: + assert.True(t, locked, id) + default: + assert.False(t, locked, id) + } + } +} + +func TestGpuLock_Acquire_Count_ErrNoCapacity(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-f00d"] = true + ids, err := gl.Acquire(context.Background(), 2) + assert.ErrorContains(t, err, "2 GPUs requested, 1 available") + assert.Equal(t, 0, len(ids)) + assert.False(t, gl.lock["GPU-beef"], "GPU-beef") + 
assert.True(t, gl.lock["GPU-f00d"], "GPU-f00d") +} + +func TestGpuLock_Lock(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-beef"] = true + gl.lock["GPU-f00d"] = true + locked := gl.Lock(context.Background(), []string{ + "GPU-beef", // already locked + "GPU-dead", // unknown + "GPU-c0de", // not locked + }) + assert.Equal(t, []string{"GPU-c0de"}, locked) + assert.True(t, gl.lock["GPU-beef"], "GPU-beef") // was already locked + assert.True(t, gl.lock["GPU-f00d"], "GPU-f00d") // was already locked + assert.True(t, gl.lock["GPU-c0de"], "GPU-c0de") // has been locked +} + +func TestGpuLock_Lock_Nil(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-beef"] = true + var ids []string + locked := gl.Lock(context.Background(), ids) + assert.Equal(t, []string{}, locked) + assert.True(t, gl.lock["GPU-beef"], "GPU-beef") + assert.False(t, gl.lock["GPU-f00d"], "GPU-f00d") +} + +func TestGpuLock_Release(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-beef"] = true + gl.lock["GPU-f00d"] = true + released := gl.Release(context.Background(), []string{ + "GPU-beef", // locked + "GPU-dead", // unknown + "GPU-c0de", // not locked + }) + assert.Equal(t, []string{"GPU-beef"}, released) + assert.False(t, gl.lock["GPU-beef"], "GPU-beef") // has been unlocked + assert.True(t, gl.lock["GPU-f00d"], "GPU-f00d") // still locked + assert.False(t, gl.lock["GPU-c0de"], "GPU-c0de") // was already unlocked +} + +func TestGpuLock_Release_Nil(t *testing.T) { + gpus := []host.GpuInfo{ + {Vendor: 
gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + } + gl, _ := NewGpuLock(gpus) + gl.lock["GPU-beef"] = true + var ids []string + released := gl.Release(context.Background(), ids) + assert.Equal(t, []string{}, released) + assert.True(t, gl.lock["GPU-beef"], "GPU-beef") + assert.False(t, gl.lock["GPU-f00d"], "GPU-f00d") +} diff --git a/runner/internal/shim/runner.go b/runner/internal/shim/runner.go deleted file mode 100644 index f96e9afafa..0000000000 --- a/runner/internal/shim/runner.go +++ /dev/null @@ -1,126 +0,0 @@ -package shim - -import ( - "context" - "errors" - "fmt" - "io" - "log" - "net/http" - "os" - "strconv" - "strings" - "time" - - "github.com/dstackai/dstack/runner/internal/gerrors" -) - -const ( - DstackRunnerURL = "https://%s.s3.eu-west-1.amazonaws.com/%s/binaries/dstack-runner-%s-%s" - DstackReleaseBucket = "dstack-runner-downloads" - DstackStagingBucket = "dstack-runner-downloads-stgn" - DstackRunnerBinaryName = "/usr/local/bin/dstack-runner" -) - -func (c *CLIArgs) GetDockerCommands() []string { - return []string{ - // start runner - fmt.Sprintf("%s %s", DstackRunnerBinaryName, strings.Join(c.getRunnerArgs(), " ")), - } -} - -func (c *CLIArgs) DownloadRunner() error { - url := makeDownloadRunnerURL(c.Runner.Version, c.Runner.DevChannel) - - runnerBinaryPath, err := downloadRunner(url) - if err != nil { - return gerrors.Wrap(err) - } - - c.Runner.BinaryPath = runnerBinaryPath - - return nil -} - -func (c *CLIArgs) getRunnerArgs() []string { - return []string{ - "--log-level", strconv.Itoa(c.Runner.LogLevel), - "start", - "--http-port", strconv.Itoa(c.Runner.HTTPPort), - "--temp-dir", c.Runner.TempDir, - "--home-dir", c.Runner.HomeDir, - "--working-dir", c.Runner.WorkingDir, - } -} - -func makeDownloadRunnerURL(version string, staging bool) string { - bucket := DstackReleaseBucket - if staging { - bucket = DstackStagingBucket - } - - osName := "linux" - archName := "amd64" - - url := 
fmt.Sprintf(DstackRunnerURL, bucket, version, osName, archName) - return url -} - -func downloadRunner(url string) (string, error) { - tempFile, err := os.CreateTemp("", "dstack-runner") - if err != nil { - return "", gerrors.Wrap(err) - } - defer func() { - err := tempFile.Close() - if err != nil { - log.Printf("close file error: %s\n", err) - } - }() - - log.Printf("Downloading runner from %s\n", url) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - return "", gerrors.Wrap(err) - } - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", gerrors.Wrap(err) - } - defer func() { - err := resp.Body.Close() - if err != nil { - log.Printf("downloadRunner: close body error: %s\n", err) - } - }() - - if resp.StatusCode != http.StatusOK { - return "", gerrors.Newf("unexpected status code: %s", resp.Status) - } - - written, err := io.Copy(tempFile, resp.Body) - if err != nil { - return "", gerrors.Wrap(err) - } - - select { - case <-ctx.Done(): - err := ctx.Err() - if errors.Is(err, context.DeadlineExceeded) { - fmt.Printf("downloadRunner: %s, %d bytes out of %d bytes were downloaded", err, written, resp.ContentLength) - return "", gerrors.Newf("Cannot download runner %w", err) - } - default: - log.Printf("The runner was downloaded successfully (%d bytes)", written) - } - - if err := tempFile.Chmod(0o755); err != nil { - return "", gerrors.Wrap(err) - } - - return tempFile.Name(), nil -} diff --git a/runner/internal/shim/states.go b/runner/internal/shim/states.go deleted file mode 100644 index e12f66041b..0000000000 --- a/runner/internal/shim/states.go +++ /dev/null @@ -1,10 +0,0 @@ -package shim - -type RunnerStatus string - -const ( - Pending RunnerStatus = "pending" - Pulling RunnerStatus = "pulling" - Creating RunnerStatus = "creating" - Running RunnerStatus = "running" -) diff --git a/runner/internal/shim/task.go 
// Task represents shim-specific part of dstack server's Job entity,
// both configuration submitted by the server (container image,
// container user, etc.) and state managed by the shim (container ID,
// status, etc.)
type Task struct {
	ID     string
	Status TaskStatus // one of the TaskStatus* constants
	// TerminationReason and TerminationMessage are set by SetStatusTerminated
	TerminationReason  string
	TerminationMessage string

	config        TaskConfig
	containerName string
	containerID   string
	// cancelPull is set by SetStatusPulling and reset to nil by
	// SetStatusCreating and SetStatusTerminated
	cancelPull context.CancelFunc
	gpuIDs     []string
	ports      []PortMapping
	runnerDir  string // path on host mapped to consts.RunnerDir in container

	pullTracker *PullTracker

	// mu backs the Lock/Release methods. It is a pointer, so copies of
	// a Task value refer to the same mutex
	mu *sync.Mutex
}
+func (t *Task) Lock(ctx context.Context) { + if !t.mu.TryLock() { + log.Fatal(ctx, "already locked!", "task", t.ID) + } + log.Debug(ctx, "locked", "task", t.ID) +} + +// Release should be called Unlock, but this name triggers govet copylocks check, +// since "thanks" to Go implicit interfaces, a struct with Lock/Unlock method pair +// looks like lock: https://github.com/golang/go/issues/18451 +func (t *Task) Release(ctx context.Context) { + t.mu.Unlock() + log.Debug(ctx, "unlocked", "task", t.ID) +} + +func (t *Task) IsTransitionAllowed(toStatus TaskStatus) bool { + // same-state transitions are not allowed unless stated otherwise, meaning that + // task.Update(); task.Update() is not allowed is most cases. + // This is mainly done to avoid erroneous/concurrent updates, though this limits + // our ability to commit internal state more often. + // If this becomes a problem, consider allowing sameState->sameState transitions in general. + switch toStatus { + case TaskStatusPending: + // initial status, task should be Add()ed with it, not Update()d + return false + case TaskStatusPreparing: + return t.Status == TaskStatusPending + case TaskStatusPulling: + return t.Status == TaskStatusPreparing + case TaskStatusCreating: + return t.Status == TaskStatusPulling + case TaskStatusRunning: + // allow running->running transition to update internal state, e.g., ports + return t.Status == TaskStatusCreating || t.Status == TaskStatusRunning + case TaskStatusTerminated: + // terminated -> terminated is also allowed since server _always_ tries to + // terminate the task, even if it is already terminated, but this is a special case, + // see TaskStorage.Update() for details + return true + } + return false +} + +// NB: Some SetStatus* methods also accept and set state fields, but this is for convenience only, +// and does not mean that all state fields are managed that way (quite contrary, most of the fields +// are set directly) + +func (t *Task) 
SetStatusPreparing() { + t.Status = TaskStatusPreparing +} + +func (t *Task) SetStatusPulling(cancelPull context.CancelFunc) { + t.Status = TaskStatusPulling + t.cancelPull = cancelPull +} + +func (t *Task) SetStatusCreating() { + t.Status = TaskStatusCreating + t.cancelPull = nil +} + +func (t *Task) SetStatusRunning() { + t.Status = TaskStatusRunning +} + +func (t *Task) SetStatusTerminated(reason string, message string) { + t.Status = TaskStatusTerminated + t.TerminationReason = reason + t.TerminationMessage = message + t.cancelPull = nil +} + +func NewTask(id string, status TaskStatus, containerName string, containerID string, gpuIDs []string, ports []PortMapping, runnerDir string) Task { + return Task{ + ID: id, + Status: status, + containerName: containerName, + containerID: containerID, + runnerDir: runnerDir, + gpuIDs: gpuIDs, + ports: ports, + pullTracker: newPullTracker(), + mu: &sync.Mutex{}, + } +} + +func NewTaskFromConfig(cfg TaskConfig) Task { + return Task{ + ID: cfg.ID, + Status: TaskStatusPending, + config: cfg, + containerName: generateUniqueName(cfg.Name, cfg.ID), + pullTracker: newPullTracker(), + mu: &sync.Mutex{}, + } +} + +type TaskStorage struct { + // Task.ID: Task mapping + tasks map[string]Task + mu sync.RWMutex +} + +// Get a _copy_ of all tasks. To "commit" changes, use Update() +func (ts *TaskStorage) List() []Task { + ts.mu.RLock() + defer ts.mu.RUnlock() + tasks := make([]Task, 0, len(ts.tasks)) + for _, task := range ts.tasks { + tasks = append(tasks, task) + } + return tasks +} + +// Get a _copy_ of the task. To "commit" changes, use Update() +func (ts *TaskStorage) Get(id string) (Task, bool) { + ts.mu.RLock() + defer ts.mu.RUnlock() + task, ok := ts.tasks[id] + return task, ok +} + +// Add a _new_ task. 
// generateUniqueName returns a unique name in the form {name}-{suffix},
// where {name} is the non-unique human-readable name provided by the server,
// and {suffix} is a relatively short unique hex string generated from
// the (name, id) pair
func generateUniqueName(name string, id string) string {
	return name + "-" + generateNameSuffix(name, id)
}

// generateNameSuffix returns a (semi-)unique hex string based on (name, id) pair
// Used to avoid possible name clashes
// The generated string is unique as long as
//   - (name, id) pair is unique
//   - there is no collision within first nameSuffixLen / 2 bytes of hash
func generateNameSuffix(name string, id string) string {
	const nameSuffixLen = 8
	digest := sha256.Sum256([]byte(name + "/" + id))
	// every byte renders as two hex digits
	return fmt.Sprintf("%x", digest[:nameSuffixLen/2])
}
NewTaskStorage() + + err := storage.Update(Task{ID: "1", Status: TaskStatusPending}) + assert.ErrorIs(t, err, ErrNotFound) + assert.Equal(t, 0, len(storage.tasks)) +} + +func TestTaskStorage_Update_TransitionNotAllowed(t *testing.T) { + storage := NewTaskStorage() + storedTask := Task{ID: "1", Status: TaskStatusPending} + storage.tasks["1"] = storedTask + updatedTask := Task{ID: "1", Status: TaskStatusRunning} + + err := storage.Update(updatedTask) + assert.ErrorIs(t, err, ErrRequest) + assert.ErrorContains(t, err, fmt.Sprintf("%s -> %s", storedTask.Status, updatedTask.Status)) + assert.Equal(t, storedTask, storage.tasks["1"]) +} + +func TestTaskStorage_Delete(t *testing.T) { + storage := NewTaskStorage() + storage.tasks["1"] = Task{ID: "1", Status: TaskStatusRunning} + + storage.Delete("2") + assert.Equal(t, 1, len(storage.tasks)) + + storage.Delete("1") + assert.Equal(t, 0, len(storage.tasks)) +} + +func TestTask_IsTransitionAllowed_true(t *testing.T) { + testCases := []struct { + oldStatus, newStatus TaskStatus + }{ + {TaskStatusPending, TaskStatusPreparing}, + {TaskStatusPending, TaskStatusTerminated}, + {TaskStatusPreparing, TaskStatusPulling}, + {TaskStatusPreparing, TaskStatusTerminated}, + {TaskStatusPulling, TaskStatusCreating}, + {TaskStatusPulling, TaskStatusTerminated}, + {TaskStatusCreating, TaskStatusRunning}, + {TaskStatusCreating, TaskStatusTerminated}, + {TaskStatusRunning, TaskStatusRunning}, + {TaskStatusRunning, TaskStatusTerminated}, + {TaskStatusTerminated, TaskStatusTerminated}, + } + for _, tc := range testCases { + task := Task{ID: "1", Status: tc.oldStatus} + assert.True(t, task.IsTransitionAllowed(tc.newStatus), "%s -> %s", tc.oldStatus, tc.newStatus) + } +} + +func TestTask_IsTransitionAllowed_false(t *testing.T) { + testCases := []struct { + oldStatus, newStatus TaskStatus + }{ + // non-exhaustive list of impossible transitions + {TaskStatusPending, TaskStatusPending}, + {TaskStatusPending, TaskStatusRunning}, + {TaskStatusPulling, 
TaskStatusPending}, + } + for _, tc := range testCases { + task := Task{ID: "1", Status: tc.oldStatus} + assert.False(t, task.IsTransitionAllowed(tc.newStatus), "%s -> %s", tc.oldStatus, tc.newStatus) + } +} + +func TestNewTaskFromConfig(t *testing.T) { + cfg := TaskConfig{ + ID: "66a886db-86db-4cf9-8c06-8984ad15dde2", + Name: "vllm-0-0", + } + task := NewTaskFromConfig(cfg) + + assert.Equal(t, "66a886db-86db-4cf9-8c06-8984ad15dde2", task.ID) + assert.Equal(t, "vllm-0-0-cff1b8da", task.containerName) + assert.Equal(t, TaskStatusPending, task.Status) + assert.Equal(t, cfg, task.config) +} + +func TestGenerateUniqueName(t *testing.T) { + testCases := []struct { + name, id, expected string + }{ + {"vllm-0-0", "66a886db-86db-4cf9-8c06-8984ad15dde2", "vllm-0-0-cff1b8da"}, + {"vllm-0-0", "41728e34-bf7e-41da-bf0e-0f46764b1752", "vllm-0-0-bb2a28c3"}, + {"llamacpp-0-0", "66a886db-86db-4cf9-8c06-8984ad15dde2", "llamacpp-0-0-58d1283d"}, + } + for _, tc := range testCases { + generated := generateUniqueName(tc.name, tc.id) + assert.Equal(t, tc.expected, generated) + } +} diff --git a/runner/internal/shim/volumes.go b/runner/internal/shim/volumes.go new file mode 100644 index 0000000000..eb6fe024f6 --- /dev/null +++ b/runner/internal/shim/volumes.go @@ -0,0 +1,282 @@ +package shim + +import ( + "bytes" + "context" + "errors" + "fmt" + "os" + "os/exec" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/shim/backends" +) + +func prepareVolumes(ctx context.Context, taskConfig TaskConfig) error { + for _, volume := range taskConfig.Volumes { + err := formatAndMountVolume(ctx, volume) + if err != nil { + return fmt.Errorf("format and mount volume: %w", err) + } + } + return nil +} + +func unmountVolumes(ctx context.Context, taskConfig TaskConfig) error { + if len(taskConfig.Volumes) == 0 { + return nil + } + log.Debug(ctx, "Unmounting volumes...") + var failed []string + for _, 
volume := range taskConfig.Volumes { + mountPoint := getVolumeMountPoint(volume.Name) + cmd := exec.CommandContext(ctx, "mountpoint", mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + log.Info(ctx, "skipping", "mountpoint", mountPoint, "output", output) + continue + } + cmd = exec.CommandContext(ctx, "umount", "-qf", mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + log.Error(ctx, "failed to unmount", "mountpoint", mountPoint, "output", output) + failed = append(failed, mountPoint) + } else { + log.Debug(ctx, "unmounted", "mountpoint", mountPoint) + } + } + if len(failed) > 0 { + return fmt.Errorf("failed to unmount volume(s): %v", failed) + } + return nil +} + +func formatAndMountVolume(ctx context.Context, volume VolumeInfo) error { + backend, err := backends.GetBackend(volume.Backend) + if err != nil { + return fmt.Errorf("get backend: %w", err) + } + deviceName, err := backend.GetRealDeviceName(volume.VolumeId, volume.DeviceName) + if err != nil { + return fmt.Errorf("get real device name: %w", err) + } + fsCreated, err := initFileSystem(ctx, deviceName, !volume.InitFs) + if err != nil { + return fmt.Errorf("init file system: %w", err) + } + // Make FS root directory world-writable (0777) to give any job user + // a permission to create new files + // NOTE: mke2fs (that is, mkfs.ext4) supports `-E root_perms=0777` since 1.47.1: + // https://e2fsprogs.sourceforge.net/e2fsprogs-release.html#1.47.1 + // but, as of 2024-12-04, this version is too new to rely on, for example, + // Ubuntu 24.04 LTS has only 1.47.0 + // 0 means "do not chmod root directory" + var fsRootPerms os.FileMode = 0 + // Change permissions only if the FS was created by us, don't mess with + // user-formatted volumes + if fsCreated { + fsRootPerms = 0o777 + } + err = mountDisk(ctx, deviceName, getVolumeMountPoint(volume.Name), fsRootPerms) + if err != nil { + return fmt.Errorf("mount disk: %w", err) + } + return nil +} + +func 
getVolumeMountPoint(volumeName string) string { + // Put volumes in dstack-specific dir to avoid clashes with host dirs. + // /mnt/disks is used since on some VM images other places may not be writable (e.g. GCP COS). + return fmt.Sprintf("/mnt/disks/dstack-volumes/%s", volumeName) +} + +func prepareInstanceMountPoints(taskConfig TaskConfig) error { + // If the instance volume directory doesn't exist, create it with world-writable permissions (0777) + // to give any job user a permission to create new files + // If the directory already exists, do nothing, don't mess with already set permissions, especially + // on SSH fleets where permissions are managed by the host admin + for _, mountPoint := range taskConfig.InstanceMounts { + if _, err := os.Stat(mountPoint.InstancePath); errors.Is(err, os.ErrNotExist) { + // All missing parent dirs are created with 0755 permissions + if err = os.MkdirAll(mountPoint.InstancePath, 0o755); err != nil { + return fmt.Errorf("create instance mount directory: %w", err) + } + if err = os.Chmod(mountPoint.InstancePath, 0o777); err != nil { + return fmt.Errorf("chmod instance mount directory: %w", err) + } + } else if err != nil { + return fmt.Errorf("stat instance mount directory: %w", err) + } + } + return nil +} + +// initFileSystem creates an ext4 file system on a disk only if it does not +// already have one. Returns true if the file system was created. +// +// Safety contract: mkfs is reached ONLY after the device is confirmed to be a +// real, ready, non-zero-sized block device AND a direct superblock probe +// repeatedly confirms no signature. 
+func initFileSystem(ctx context.Context, deviceName string, errorIfNotExists bool) (bool, error) { + if err := waitForBlockDevice(ctx, deviceName, 10*time.Second); err != nil { + return false, fmt.Errorf("device %s not ready: %w", deviceName, err) + } + + fsType, hasFS, err := hasFilesystem(ctx, deviceName) + if err != nil { + return false, fmt.Errorf("failed to check if disk is formatted: %w", err) + } + if hasFS { + log.Debug(ctx, "disk already has a filesystem, skipping format", + "device", deviceName, "fstype", fsType) + return false, nil + } + + if errorIfNotExists { + return false, fmt.Errorf("disk %s has no file system", deviceName) + } + + log.Debug(ctx, "formatting disk with ext4 filesystem...", "device", deviceName) + cmd := exec.CommandContext(ctx, "mkfs.ext4", "-F", deviceName) + if output, err := cmd.CombinedOutput(); err != nil { + return false, fmt.Errorf("failed to format disk: %w, output: %s", err, string(output)) + } + log.Debug(ctx, "disk formatted succesfully!", "device", deviceName) + return true, nil +} + +// waitForBlockDevice blocks until deviceName is a block device with non-zero +// size, or until timeout. The retry loop is for availability (don't fail a job +// on a transient mid-attach state); the non-zero-block-device requirement is +// for safety (don't make a format decision about a not-ready device). 
+func waitForBlockDevice(ctx context.Context, deviceName string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var lastErr error + for { + size, err := blockDeviceSize(deviceName) + if err == nil && size > 0 { + return nil + } + if err != nil { + lastErr = err + } else { + lastErr = fmt.Errorf("device has zero size") + } + if time.Now().After(deadline) { + return fmt.Errorf("not a ready non-zero block device within %s: %w", timeout, lastErr) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(200 * time.Millisecond): + } + } +} + +// blockDeviceSize returns the size in bytes of a block device, erroring if the +// path is not a block device or cannot be opened/queried. +func blockDeviceSize(deviceName string) (uint64, error) { + fi, err := os.Stat(deviceName) + if err != nil { + return 0, err + } + if fi.Mode()&os.ModeDevice == 0 || fi.Mode()&os.ModeCharDevice != 0 { + return 0, fmt.Errorf("%s is not a block device", deviceName) + } + f, err := os.OpenFile(deviceName, os.O_RDONLY, 0) + if err != nil { + return 0, err + } + defer func() { _ = f.Close() }() + // BLKGETSIZE64 returns the device size in bytes. + size, err := unix.IoctlGetInt(int(f.Fd()), unix.BLKGETSIZE64) + if err != nil { + return 0, fmt.Errorf("BLKGETSIZE64 ioctl on %s: %w", deviceName, err) + } + return uint64(size), nil +} + +// hasFilesystem reports whether deviceName has a filesystem, re-confirming a +// "no filesystem" verdict before believing it. +// +// The check is asymmetric on purpose: it prevents a hypothetical +// transient false "no-fs" from leading to a destructive mkfs. 
+func hasFilesystem(ctx context.Context, deviceName string) (string, bool, error) { + const confirmAttempts = 3 + const confirmInterval = 1 * time.Second + + fsType, hasFS, err := probeFilesystem(ctx, deviceName) + if err != nil || hasFS { + return fsType, hasFS, err + } + for attempt := range confirmAttempts { + select { + case <-ctx.Done(): + return "", false, ctx.Err() + case <-time.After(confirmInterval): + } + fsType, hasFS, err = probeFilesystem(ctx, deviceName) + if err != nil { + return "", false, err + } + if hasFS { + log.Warning(ctx, "filesystem appeared on re-probe, not formatting", + "fstype", fsType, "attempt", attempt) + return fsType, true, nil + } + } + return "", false, nil +} + +// probeFilesystem reports the filesystem type on deviceName via a direct +// superblock probe (blkid -p), independent of the udev/lsblk cache. +func probeFilesystem(ctx context.Context, deviceName string) (string, bool, error) { + cmd := exec.CommandContext(ctx, "blkid", "-p", "-o", "value", "-s", "TYPE", deviceName) + var out bytes.Buffer + cmd.Stdout = &out + runErr := cmd.Run() + fsType := strings.TrimSpace(out.String()) + if fsType != "" { + return fsType, true, nil // a filesystem signature was found + } + + var exitErr *exec.ExitError + if errors.As(runErr, &exitErr) && exitErr.ExitCode() == 2 { + return "", false, nil // exit 2: no signature at all -> genuinely blank + } + if runErr == nil { + return "", false, fmt.Errorf( + "device %s has a non-filesystem signature but no filesystem; likely wrong device resolved", + deviceName) + } + return "", false, fmt.Errorf("blkid probe of %s failed: %w (output: %q)", + deviceName, runErr, out.String()) +} + +func mountDisk(ctx context.Context, deviceName, mountPoint string, fsRootPerms os.FileMode) error { + // Create the mount point directory if it doesn't exist + if _, err := os.Stat(mountPoint); os.IsNotExist(err) { + log.Debug(ctx, "creating mount point...", "mountpoint", mountPoint) + if err := os.MkdirAll(mountPoint, 
0o755); err != nil { + return fmt.Errorf("failed to create mount point: %w", err) + } + } + + // Mount the disk to the mount point + log.Debug(ctx, "mounting disk...", "device", deviceName, "mountpoint", mountPoint) + cmd := exec.CommandContext(ctx, "mount", deviceName, mountPoint) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to mount disk: %w, output: %s", err, string(output)) + } + + if fsRootPerms != 0 { + if err := os.Chmod(mountPoint, fsRootPerms); err != nil { + return fmt.Errorf("failed to chmod volume root directory %s: %w", mountPoint, err) + } + } + + log.Debug(ctx, "disk mounted successfully!") + return nil +} diff --git a/scripts/add_backend.py b/scripts/add_backend.py new file mode 100644 index 0000000000..a18e48c7f2 --- /dev/null +++ b/scripts/add_backend.py @@ -0,0 +1,46 @@ +import argparse +from pathlib import Path + +import jinja2 + + +def main(): + parser = argparse.ArgumentParser( + description="This script generates boilerplate code for a new backend" + ) + parser.add_argument( + "-n", + "--name", + help=( + "The backend name in CamelCase, e.g. AWS, Runpod, VastAI." + " It'll be used for naming backend classes, models, etc." 
+ ), + required=True, + ) + args = parser.parse_args() + generate_backend_code(args.name) + + +def generate_backend_code(backend_name: str): + template_dir_path = Path(__file__).parent.parent.joinpath( + "src/dstack/_internal/core/backends/template" + ) + env = jinja2.Environment( + loader=jinja2.FileSystemLoader( + searchpath=template_dir_path, + ), + keep_trailing_newline=True, + ) + backend_dir_path = Path(__file__).parent.parent.joinpath( + f"src/dstack/_internal/core/backends/{backend_name.lower()}" + ) + backend_dir_path.mkdir(exist_ok=True) + for filename in ["backend.py", "compute.py", "configurator.py", "models.py"]: + template = env.get_template(f"{filename}.jinja") + with open(backend_dir_path.joinpath(filename), "w+") as f: + f.write(template.render({"backend_name": backend_name})) + backend_dir_path.joinpath("__init__.py").write_text("") + + +if __name__ == "__main__": + main() diff --git a/scripts/aws_image_tools.py b/scripts/aws_image_tools.py new file mode 100644 index 0000000000..f9f8660901 --- /dev/null +++ b/scripts/aws_image_tools.py @@ -0,0 +1,410 @@ +""" +Tools for managing dstack AWS AMIs across regions. + +dstack publishes public AMIs (see scripts/packer/aws-image.json) to all regions +listed in scripts/packer/aws-vars-prod.json. Over time these accumulate and hit +the per-region AMI service quota (the AWS error looks like "the maximum number of +AMIs has been reached"). This script helps to: + + 1. request-quota Request a service quota increase across regions (e.g. the EC2 + "AMIs" / "Public AMIs" quota). + 2. list-quotas Discover quota codes and/or quota names + (e.g. search for "AMI") to use with request-quota. + 3. delete-amis Deregister AMIs older than a date and delete their snapshots. + Dry-run by default — pass --yes to actually delete. 
+""" + +import logging +import sys +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import List, Optional + +import boto3 + +# Regions dstack copies AMIs to, kept in sync with scripts/packer/aws-vars-prod.json. +PROD_REGIONS = [ + "us-east-2", + "us-east-1", + "us-west-1", + "us-west-2", + "ca-central-1", + "eu-central-1", + "eu-west-1", + "eu-west-2", + "eu-west-3", + "eu-north-1", + "ap-southeast-1", +] + +# Default name prefix of dstack AMIs (e.g. dstack-0.18, dstack-cuda-0.18). +DEFAULT_NAME_PREFIX = "dstack-" + +EC2_SERVICE_CODE = "ec2" + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", +) +logger = logging.getLogger("aws_image_tools") + + +class ScriptError(Exception): + pass + + +@dataclass +class RequestQuotaCommandArgs: + regions: List[str] + service_code: str + quota_code: str + value: float + yes: bool + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument( + "--regions", + metavar="REGION", + nargs="*", + help="Regions to request the increase in (default: dstack prod regions)", + ) + parser.add_argument("--service-code", default=EC2_SERVICE_CODE) + parser.add_argument( + "--quota-code", + required=True, + help="Quota code, e.g. 
L-0E3CBAB9 (same across regions; find it with list-quotas)", + ) + parser.add_argument("--value", type=float, required=True, help="Desired new quota value") + parser.add_argument( + "--yes", + action="store_true", + help="Actually submit the requests (default: preview only)", + ) + parser.set_defaults(to_struct=cls.from_namespace, run_command=request_quota_command) + + @staticmethod + def from_namespace(args: Namespace) -> "RequestQuotaCommandArgs": + return RequestQuotaCommandArgs( + regions=args.regions or list(PROD_REGIONS), + service_code=args.service_code, + quota_code=args.quota_code, + value=args.value, + yes=args.yes, + ) + + +@dataclass +class ListQuotasCommandArgs: + region: str + service_code: str + search: Optional[str] + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument("--region", default=PROD_REGIONS[0], help="Region to list quotas in") + parser.add_argument("--service-code", default=EC2_SERVICE_CODE) + parser.add_argument("--search", help="Case-insensitive substring to filter quota names by") + parser.set_defaults(to_struct=cls.from_namespace, run_command=list_quotas_command) + + @staticmethod + def from_namespace(args: Namespace) -> "ListQuotasCommandArgs": + return ListQuotasCommandArgs( + region=args.region, + service_code=args.service_code, + search=args.search, + ) + + +@dataclass +class DeleteAmisCommandArgs: + regions: List[str] + before: datetime + name_prefix: str + name_contains: Optional[str] + keep_latest: int + yes: bool + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument( + "--regions", + metavar="REGION", + nargs="*", + help="Regions to delete AMIs in (default: dstack prod regions)", + ) + parser.add_argument( + "--before", + required=True, + metavar="YYYY-MM-DD", + help="Delete AMIs created strictly before this date (UTC)", + ) + parser.add_argument( + "--name-prefix", + default=DEFAULT_NAME_PREFIX, + help=f"Only consider AMIs whose name starts 
with this (default: {DEFAULT_NAME_PREFIX!r})", + ) + parser.add_argument( + "--name-contains", + help="Further restrict to AMIs whose name contains this substring " + "(case-insensitive), e.g. a version like 0.18", + ) + parser.add_argument( + "--keep-latest", + type=int, + default=0, + help="Always keep this many newest matching AMIs per region, " + "regardless of --before (default: 0)", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Actually deregister AMIs and delete snapshots (default: preview only)", + ) + parser.set_defaults(to_struct=cls.from_namespace, run_command=delete_amis_command) + + @staticmethod + def from_namespace(args: Namespace) -> "DeleteAmisCommandArgs": + try: + before = datetime.strptime(args.before, "%Y-%m-%d").replace(tzinfo=timezone.utc) + except ValueError: + raise ScriptError(f"Invalid --before date {args.before!r}, expected YYYY-MM-DD") + if args.keep_latest < 0: + raise ScriptError("--keep-latest must be >= 0") + return DeleteAmisCommandArgs( + regions=args.regions or list(PROD_REGIONS), + before=before, + name_prefix=args.name_prefix, + name_contains=args.name_contains, + keep_latest=args.keep_latest, + yes=args.yes, + ) + + +def main() -> None: + parser = ArgumentParser(description="Tools for managing dstack AWS AMIs") + subparsers = parser.add_subparsers() + + request_quota_parser = subparsers.add_parser( + name="request-quota", + description="Request a service quota increase across regions.", + ) + RequestQuotaCommandArgs.setup_parser(request_quota_parser) + + list_quotas_parser = subparsers.add_parser( + name="list-quotas", + description="List service quotas (and their codes) in a region.", + ) + ListQuotasCommandArgs.setup_parser(list_quotas_parser) + + delete_amis_parser = subparsers.add_parser( + name="delete-amis", + description=( + "Deregister AMIs older than a date and delete their snapshots. " + "Dry-run by default; pass --yes to actually delete." 
+ ), + ) + DeleteAmisCommandArgs.setup_parser(delete_amis_parser) + + args = parser.parse_args() + if not hasattr(args, "run_command"): + parser.print_help() + sys.exit(1) + try: + args.run_command(args.to_struct(args)) + except ScriptError as e: + logger.error("%s", e) + sys.exit(1) + + +def request_quota_command(args: RequestQuotaCommandArgs) -> None: + failed = False + quota_code = args.quota_code + for region in args.regions: + client = boto3.client("service-quotas", region_name=region) + current = _get_quota_value(client, args.service_code, quota_code) + if current is not None and current >= args.value: + logger.info( + "[%s] %s already at %g >= %g, skipping", + region, + quota_code, + current, + args.value, + ) + continue + + pending = _get_pending_request_value(client, args.service_code, quota_code) + if pending is not None and pending >= args.value: + logger.info( + "[%s] %s already has a pending request for %g, skipping", + region, + quota_code, + pending, + ) + continue + + if not args.yes: + logger.info( + "[%s] would request %s increase to %g (current: %s)", + region, + quota_code, + args.value, + "unknown" if current is None else f"{current:g}", + ) + continue + + try: + client.request_service_quota_increase( + ServiceCode=args.service_code, + QuotaCode=quota_code, + DesiredValue=args.value, + ) + logger.info("[%s] requested %s increase to %g", region, quota_code, args.value) + except Exception as e: + logger.error("[%s] failed to request %s increase: %s", region, quota_code, e) + failed = True + + if not args.yes: + logger.info("Preview only. 
Re-run with --yes to submit the requests.") + if failed: + raise ScriptError("Some quota requests failed or were skipped") + + +def _get_quota_value(client, service_code: str, quota_code: str) -> Optional[float]: + try: + resp = client.get_service_quota(ServiceCode=service_code, QuotaCode=quota_code) + return resp["Quota"]["Value"] + except Exception: + return None + + +def _get_pending_request_value(client, service_code: str, quota_code: str) -> Optional[float]: + try: + paginator = client.get_paginator("list_requested_service_quota_change_history_by_quota") + latest_value = None + for page in paginator.paginate(ServiceCode=service_code, QuotaCode=quota_code): + for req in page["RequestedQuotas"]: + if req["Status"] in ("PENDING", "CASE_OPENED"): + value = req["DesiredValue"] + if latest_value is None or value > latest_value: + latest_value = value + return latest_value + except Exception: + return None + + +def list_quotas_command(args: ListQuotasCommandArgs) -> None: + client = boto3.client("service-quotas", region_name=args.region) + needle = args.search.lower() if args.search else None + paginator = client.get_paginator("list_service_quotas") + rows = [] + for page in paginator.paginate(ServiceCode=args.service_code): + for quota in page["Quotas"]: + if needle and needle not in quota["QuotaName"].lower(): + continue + rows.append((quota["QuotaCode"], quota["Value"], quota["QuotaName"])) + rows.sort(key=lambda r: r[2]) + if not rows: + logger.info("No quotas found matching the filter.") + return + print(f"{'QUOTA CODE':<16} {'VALUE':>10} NAME") + for code, value, name in rows: + print(f"{code:<16} {value:>10g} {name}") + + +def delete_amis_command(args: DeleteAmisCommandArgs) -> None: + total_deleted = 0 + for region in args.regions: + ec2 = boto3.client("ec2", region_name=region) + images = _find_self_owned_images(ec2, args.name_prefix, args.name_contains) + # Sort newest first so --keep-latest preserves the most recent images. 
+ images.sort(key=lambda img: img["_created"], reverse=True) + + to_delete = [] + for index, image in enumerate(images): + if index < args.keep_latest: + continue + if image["_created"] < args.before: + to_delete.append(image) + + keep = [img for img in images if img not in to_delete] + logger.info( + "[%s] %d matching AMIs: %d to delete, %d to keep", + region, + len(images), + len(to_delete), + len(keep), + ) + for image in keep: + logger.info( + "[%s] KEEP %s %s (%s)", + region, + image["ImageId"], + image["Name"], + image["CreationDate"], + ) + for image in to_delete: + snapshot_ids = _image_snapshot_ids(image) + logger.info( + "[%s] DELETE %s %s (%s) snapshots=%s", + region, + image["ImageId"], + image["Name"], + image["CreationDate"], + ",".join(snapshot_ids) or "none", + ) + if args.yes: + _deregister_image(ec2, region, image, snapshot_ids) + total_deleted += 1 + + if not args.yes: + logger.info("Preview only. Re-run with --yes to deregister AMIs and delete snapshots.") + else: + logger.info("Deleted %d AMIs.", total_deleted) + + +def _find_self_owned_images(ec2, name_prefix: str, name_contains: Optional[str]) -> List[dict]: + resp = ec2.describe_images( + Owners=["self"], + Filters=[{"Name": "name", "Values": [f"{name_prefix}*"]}], + ) + images = resp["Images"] + if name_contains: + needle = name_contains.lower() + images = [img for img in images if needle in img["Name"].lower()] + for image in images: + image["_created"] = datetime.strptime( + image["CreationDate"], "%Y-%m-%dT%H:%M:%S.%fZ" + ).replace(tzinfo=timezone.utc) + return images + + +def _image_snapshot_ids(image: dict) -> List[str]: + snapshot_ids = [] + for mapping in image.get("BlockDeviceMappings", []): + ebs = mapping.get("Ebs") + if ebs and ebs.get("SnapshotId"): + snapshot_ids.append(ebs["SnapshotId"]) + return snapshot_ids + + +def _deregister_image(ec2, region: str, image: dict, snapshot_ids: List[str]) -> None: + try: + ec2.deregister_image(ImageId=image["ImageId"]) + logger.info("[%s] 
deregistered %s", region, image["ImageId"]) + except Exception as e: + logger.error("[%s] failed to deregister %s: %s", region, image["ImageId"], e) + return + for snapshot_id in snapshot_ids: + try: + ec2.delete_snapshot(SnapshotId=snapshot_id) + logger.info("[%s] deleted snapshot %s", region, snapshot_id) + except Exception as e: + logger.error("[%s] failed to delete snapshot %s: %s", region, snapshot_id, e) + + +if __name__ == "__main__": + main() diff --git a/scripts/build_frontend.sh b/scripts/build_frontend.sh index 06366221c9..31c39f3502 100755 --- a/scripts/build_frontend.sh +++ b/scripts/build_frontend.sh @@ -3,7 +3,9 @@ script_path="$(realpath $0)" root_dir="$(dirname $(dirname $script_path))" -cd hub +cd $root_dir +cd frontend +npm install npm run build -rm -rf ../cli/dstack/_internal/hub/statics -cp -r build ../cli/dstack/_internal/hub/statics +rm -rf ../src/dstack/_internal/server/statics +cp -a build ../src/dstack/_internal/server/statics diff --git a/scripts/docs/gen_cli_reference.py b/scripts/docs/gen_cli_reference.py index 8f77370a42..86ddae329b 100644 --- a/scripts/docs/gen_cli_reference.py +++ b/scripts/docs/gen_cli_reference.py @@ -3,6 +3,8 @@ Finds the pattern in docs/references/cli/*.md and replace it with the output of the command. 
""" +import concurrent +import concurrent.futures import logging import os import re @@ -12,15 +14,13 @@ from functools import cache import mkdocs_gen_files + from mkdocs.structure.files import File -FILE_PATTERN = "docs/reference/cli/*.md" +FILE_PATTERN = "docs/reference/cli/dstack/*.md" logger = logging.getLogger("mkdocs.plugins.dstack.cli") -disable_env = "DSTACK_DOCS_DISABLE_CLI_REFERENCE" -if os.environ.get(disable_env): - logger.warning(f"CLI reference generation is disabled: {disable_env} is set") - exit() +DISABLE_ENV = "DSTACK_DOCS_DISABLE_CLI_REFERENCE" @cache # TODO make caching work @@ -29,7 +29,7 @@ def call_dstack(command: str) -> str: def sub_help(match: re.Match) -> str: - logger.info("Generating help for `%s`", match.group(1)) + logger.debug("Generating CLI reference for `%s`", match.group(1)) try: output = call_dstack(match.group(1)) except subprocess.CalledProcessError: @@ -38,10 +38,10 @@ def sub_help(match: re.Match) -> str: return f"```shell\n$ {match.group(1)}\n{output}\n```" -file: File -for file in mkdocs_gen_files.files: +def process_file(file: File): + logger.debug(file.src_uri) if not fnmatch(file.src_uri, FILE_PATTERN): - continue + return logger.debug("Looking for CLI `dstack --help` calls in %s", file.src_uri) with mkdocs_gen_files.open(file.src_uri, "r") as f: text = f.read() @@ -53,3 +53,20 @@ def sub_help(match: re.Match) -> str: text = re.sub(r"```shell\s*\n\$ (dstack .*--help)\s*\n#GENERATE#\s*\n```", sub_help, text) with mkdocs_gen_files.open(file.src_uri, "w") as f: f.write(text) + + +def main(): + if os.environ.get(DISABLE_ENV): + logger.warning("CLI reference generation is disabled") + exit() + + logger.info("Generating CLI reference...") + # Sequential processing take > 10s + with concurrent.futures.ThreadPoolExecutor() as pool: + futures = [] + for file in mkdocs_gen_files.files: + futures.append(pool.submit(process_file, file)) + concurrent.futures.wait(futures) + + +main() diff --git a/scripts/docs/gen_llms_files.py 
b/scripts/docs/gen_llms_files.py new file mode 100644 index 0000000000..3e1b9ec6d0 --- /dev/null +++ b/scripts/docs/gen_llms_files.py @@ -0,0 +1,235 @@ +""" +Generate llms.txt and llms-full.txt from documentation files. + +llms.txt: Generated from mkdocs nav structure with descriptions from page frontmatter +llms-full.txt: Full concatenation of all markdown content +""" + +import os +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + +# Configuration for which sections to include/exclude +INCLUDE_SECTIONS = ["Getting started", "Concepts", "Guides", "Examples"] +EXCLUDE_SECTIONS = ["Reference"] + + +def read_frontmatter(file_path: Path) -> Dict[str, Any]: + """Read YAML frontmatter from markdown file.""" + try: + content = file_path.read_text(encoding="utf-8") + if content.startswith("---"): + parts = content.split("---", 2) + if len(parts) >= 3: + frontmatter = yaml.safe_load(parts[1]) + if isinstance(frontmatter, dict): + return frontmatter + except Exception as e: + print(f"Warning: Failed to read frontmatter from {file_path}: {e}") + return {} + + +def get_page_info(page_path: str, docs_dir: Path) -> Optional[Dict[str, str]]: + """Get title and description for a page from its frontmatter.""" + # page_path is relative to docs_dir + full_path = docs_dir / page_path + + if not full_path.exists(): + return None + + frontmatter = read_frontmatter(full_path) + + # Get title from frontmatter or filename + title = frontmatter.get("title") + if not title: + # Use filename as fallback + title = full_path.stem.replace("-", " ").title() + + # Get description from frontmatter + description = frontmatter.get("description", "") + + return {"title": title, "description": description} + + +def parse_mkdocs_nav(mkdocs_config: Dict[str, Any], repo_root: str) -> List[Dict[str, Any]]: + """Parse mkdocs nav structure and extract relevant sections.""" + nav = mkdocs_config.get("nav", []) + sections = [] + + # Get docs_dir from config + docs_dir = 
Path(repo_root) / mkdocs_config.get("docs_dir", "docs") + + def extract_pages(content_list): + """Recursively extract all pages from a section's content, including nested subsections.""" + items = [] + for item in content_list: + if isinstance(item, str): + # Plain string path like "examples.md" + page_info = get_page_info(item, docs_dir) + if page_info: + items.append( + { + "type": "page", + "title": page_info["title"], + "path": item, + "description": page_info["description"], + } + ) + elif isinstance(item, dict): + for title, path in item.items(): + if isinstance(path, str): + # Page with title + page_info = get_page_info(path, docs_dir) + if page_info: + items.append( + { + "type": "page", + "title": title, # Use title from nav + "path": path, + "description": page_info["description"], + } + ) + elif isinstance(path, list): + # Nested subsection - create subsection with its pages + subsection_items = extract_pages(path) + if subsection_items: + items.append( + { + "type": "subsection", + "title": title, + "items": subsection_items, + } + ) + return items + + def process_nav_items(nav_items): + """Recursively process nav items to find matching sections.""" + for item in nav_items: + if isinstance(item, dict): + for section_name, section_content in item.items(): + # Check if this section should be included + if section_name in INCLUDE_SECTIONS and section_name not in EXCLUDE_SECTIONS: + # Extract all pages from this section, including nested subsections + items = [] + if isinstance(section_content, list): + items = extract_pages(section_content) + + if items: + sections.append( + { + "title": section_name, + "items": items, + } + ) + + # Recursively process nested sections + elif isinstance(section_content, list): + process_nav_items(section_content) + + process_nav_items(nav) + return sections + + +def generate_llms_txt(repo_root: str, mkdocs_config: Dict[str, Any], output_path: str) -> None: + """Generate llms.txt from mkdocs nav structure.""" + # Get title, 
description, and base_url from mkdocs config + title = mkdocs_config.get("site_name", "") + description = mkdocs_config.get("site_description", "") + base_url = mkdocs_config.get("site_url", "").rstrip("/") + + lines = [] + + # Title and description + lines.append(f"# {title}\n") + lines.append(f"> {description}\n") + + # Parse sections from mkdocs nav + sections = parse_mkdocs_nav(mkdocs_config, repo_root) + + # Generate sections + def render_items(items, indent_level=0): + """Render items (pages and subsections) with proper formatting.""" + rendered = [] + for item in items: + if item["type"] == "page": + # Use .md paths as-is since hooks.py copies them to site + url = f"{base_url}/{item['path']}" + if item["description"]: + rendered.append(f"- [{item['title']}]({url}): {item['description']}") + else: + rendered.append(f"- [{item['title']}]({url})") + elif item["type"] == "subsection": + # Render subsection header + rendered.append(f"\n### {item['title']}\n") + # Render subsection items + rendered.extend(render_items(item["items"], indent_level + 1)) + return rendered + + for section in sections: + lines.append(f"## {section['title']}\n") + lines.extend(render_items(section["items"])) + lines.append("") + + # Write to file + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) + + +def generate_llms_full_txt( + repo_root: str, mkdocs_config: Dict[str, Any], output_path: str +) -> None: + """Generate llms-full.txt by concatenating all pages from llms.txt sections.""" + content_parts = [] + + # Get docs_dir from config + docs_dir = Path(repo_root) / mkdocs_config.get("docs_dir", "docs") + + # Parse sections from mkdocs nav (same as llms.txt) + sections = parse_mkdocs_nav(mkdocs_config, repo_root) + + def extract_page_paths(items): + """Recursively extract all page paths from items (including nested subsections).""" + paths = [] + for item in items: + if item["type"] == "page": + paths.append(item["path"]) + elif item["type"] == 
"subsection": + paths.extend(extract_page_paths(item["items"])) + return paths + + # Concatenate all pages from all sections + for section in sections: + for page_path in extract_page_paths(section["items"]): + full_path = docs_dir / page_path + + if full_path.is_file(): + try: + content = full_path.read_text(encoding="utf-8") + content_parts.append(f"# {page_path}\n\n{content}\n\n") + except Exception as e: + print(f"Warning: Failed to read {page_path}: {e}") + else: + print(f"Warning: File not found: {page_path}") + + # Write to file + if content_parts: + with open(output_path, "w", encoding="utf-8") as f: + f.write("".join(content_parts)) + else: + print("Warning: No content found for llms-full.txt") + + +def generate_llms_files(repo_root: str, site_dir: str, mkdocs_config: Dict[str, Any]) -> None: + """Generate both llms.txt and llms-full.txt.""" + llms_txt_path = os.path.join(site_dir, "llms.txt") + llms_full_txt_path = os.path.join(site_dir, "llms-full.txt") + + print("Generating llms.txt from mkdocs nav...") + generate_llms_txt(repo_root, mkdocs_config, llms_txt_path) + print("Generated llms.txt") + + print("Generating llms-full.txt...") + generate_llms_full_txt(repo_root, mkdocs_config, llms_full_txt_path) + print("Generated llms-full.txt") diff --git a/scripts/docs/gen_openapi_reference.py b/scripts/docs/gen_openapi_reference.py index f6e136b5fe..def7cf24b6 100644 --- a/scripts/docs/gen_openapi_reference.py +++ b/scripts/docs/gen_openapi_reference.py @@ -3,20 +3,136 @@ """ import json +import logging +import os +import re +from pathlib import Path +from typing import Any import mkdocs_gen_files -import dstack.version from dstack._internal.server.main import app +from dstack._internal.settings import DSTACK_VERSION -app.title = "REST API" -app.description = ( - "The REST API enables running tasks, services, and managing runs programmatically." 
-) -app.servers = [ - {"url": "http://localhost:3000", "description": "Local server"}, - {"url": "https://sky.dstack.ai", "description": "Managed server"}, -] -app.version = dstack.version.__version__ or "0.0.0" -with mkdocs_gen_files.open("docs/reference/api/rest/openapi.json", "w") as f: - json.dump(app.openapi(), f) +logger = logging.getLogger("mkdocs.plugins.dstack.openapi") +disable_env = "DSTACK_DOCS_DISABLE_OPENAPI_REFERENCE" +output_dir = Path("docs/reference/http") +source_output_dir = Path("mkdocs") / output_dir +openapi_path = output_dir / "openapi.json" + +TAG_LIST_BEGIN = "" +TAG_LIST_END = "" +HTTP_METHODS = {"get", "put", "post", "delete", "options", "head", "patch", "trace"} +UNTAGGED_TAG = "default" +OPENAPI_VERSION = "3.0.3" + +if os.environ.get(disable_env): + logger.warning("OpenAPI reference generation is disabled") + exit(0) + + +def _write_tag_references(tags: list[str]) -> None: + page_filenames = {_tag_page_filename(tag) for tag in tags} + for tag in tags: + page_filename = _tag_page_filename(tag) + _write_text(output_dir / page_filename, _tag_page_content(tag)) + _remove_stale_tag_pages(page_filenames) + _remove_stale_openapi_files() + + +def _update_index(tags: list[str]) -> None: + index_path = output_dir / "index.md" + try: + text = _read_text(index_path) + except FileNotFoundError: + return + tag_links = "\n".join(f"- [{_tag_title(tag)}]({_tag_page_filename(tag)})" for tag in tags) + generated = f"{TAG_LIST_BEGIN}\n{tag_links}\n{TAG_LIST_END}" + pattern = re.compile(f"{re.escape(TAG_LIST_BEGIN)}.*?{re.escape(TAG_LIST_END)}", re.S) + new_text, count = pattern.subn(generated, text) + if count == 0: + logger.warning("HTTP API index is missing generated tag list markers") + return + _write_text(index_path, new_text) + + +def _remove_stale_openapi_files() -> None: + for path in source_output_dir.glob("*.openapi.json"): + path.unlink() + + +def _remove_stale_tag_pages(page_filenames: 
set[str]) -> None: + for path in source_output_dir.glob("*.md"): + if path.name != "index.md" and path.name not in page_filenames: + path.unlink() + + +def _tag_page_content(tag: str) -> str: + return f"""--- +title: {_tag_title(tag)} +--- + +!!swagger {openapi_path.name} tag={json.dumps(tag)}!! +""" + + +def _tag_title(tag: str) -> str: + return tag + + +def _tag_page_filename(tag: str) -> str: + return f"{_tag_slug(tag)}.md" + + +def _tag_slug(tag: str) -> str: + return re.sub(r"[^a-z0-9]+", "-", tag.lower()).strip("-") or UNTAGGED_TAG + + +def _write_json(path: Path, data: dict[str, Any]) -> None: + _write_text(path, json.dumps(data) + "\n") + + +def _write_text(path: Path, content: str) -> None: + with mkdocs_gen_files.open(path.as_posix(), "w") as f: + f.write(content) + + +def _read_text(path: Path) -> str: + with mkdocs_gen_files.open(path.as_posix(), "r") as f: + return f.read() + + +def main() -> None: + app.title = "OpenAPI Spec" + app.servers = [ + {"url": "https://sky.dstack.ai", "description": "dstack Sky"}, + {"url": "http://localhost:3000", "description": "Local server"}, + ] + app.version = DSTACK_VERSION or "0.0.0" + app.openapi_version = OPENAPI_VERSION + app.openapi_schema = None + schema = app.openapi() + tags = _get_tags(schema) + + logger.info("Generating OpenAPI reference...") + source_output_dir.mkdir(parents=True, exist_ok=True) + _write_json(openapi_path, schema) + _write_tag_references(tags) + _update_index(tags) + + +def _get_tags(schema: dict[str, Any]) -> list[str]: + tags = [] + for path, path_item in schema.get("paths", {}).items(): + if not isinstance(path_item, dict): + continue + for method, operation in path_item.items(): + if method.lower() not in HTTP_METHODS or not isinstance(operation, dict): + continue + for tag in operation.get("tags") or [UNTAGGED_TAG]: + if tag not in tags: + tags.append(tag) + return tags + + +main() diff --git 
a/scripts/docs/gen_rest_plugin_spec_reference.py b/scripts/docs/gen_rest_plugin_spec_reference.py new file mode 100644 index 0000000000..8c89a84596 --- /dev/null +++ b/scripts/docs/gen_rest_plugin_spec_reference.py @@ -0,0 +1,37 @@ +""" +Generates OpenAPI schema from an example REST plugin. +""" + +import json +import logging +import os +from pathlib import Path + +from dstack._internal.settings import DSTACK_VERSION + +logger = logging.getLogger("mkdocs.plugins.dstack.rest_plugin_schema") +disable_env = "DSTACK_DOCS_DISABLE_REST_PLUGIN_SPEC_REFERENCE" +if os.environ.get(disable_env): + logger.warning("REST plugin spec reference generation is disabled") + exit(0) + +try: + from example_plugin_server.main import app +except ImportError: + logger.warning( + "No module named 'example_plugin_server'." + " The REST Plugin API won't be generated." + " Run 'uv pip install examples/plugins/example_plugin_server' to install 'example_plugin_server'." + ) + exit(0) + +app.title = "REST Plugin OpenAPI Spec" +app.servers = [ + {"url": "http://localhost:8000", "description": "Local server"}, +] +app.version = DSTACK_VERSION or "0.0.0" +output_path = Path("mkdocs/docs/reference/plugins/rest/rest_plugin_openapi.json") +output_path.parent.mkdir(parents=True, exist_ok=True) +new_content = json.dumps(app.openapi()) + "\n" +if not output_path.exists() or output_path.read_text() != new_content: + output_path.write_text(new_content) diff --git a/scripts/docs/gen_schema_reference.py b/scripts/docs/gen_schema_reference.py index 250a7508b4..62f379c821 100644 --- a/scripts/docs/gen_schema_reference.py +++ b/scripts/docs/gen_schema_reference.py @@ -4,50 +4,218 @@ import importlib import inspect +import json import logging import re +from enum import Enum from fnmatch import fnmatch +from typing import Optional import mkdocs_gen_files import yaml -from mkdocs.structure.files import File from pydantic.main import BaseModel from typing_extensions import Annotated, 
Any, Dict, Literal, Type, Union, get_args, get_origin from dstack._internal.core.models.resources import Range +from mkdocs.structure.files import File FILE_PATTERN = "docs/reference/**.md" logger = logging.getLogger("mkdocs.plugins.dstack.schema") +logger.info("Generating schema reference...") + + +def _is_linkable_type(annotation: Any) -> bool: + """Check if a type annotation contains a BaseModel subclass (excluding Range).""" + origin = get_origin(annotation) + type_ = origin if origin is not None else annotation + if inspect.isclass(type_): + return issubclass(type_, BaseModel) and not issubclass(type_, Range) + if origin is Annotated: + return _is_linkable_type(get_args(annotation)[0]) + if origin is Union: + return any(_is_linkable_type(arg) for arg in get_args(annotation)) + if origin is list: + args = get_args(annotation) + return bool(args) and _is_linkable_type(args[0]) + return False + + +def _type_sort_key(t: str) -> tuple: + """Sort key for type parts: primitives first, then literals, then compound types.""" + order = {"bool": 0, "int": 1, "float": 2, "str": 3} + if t in order: + return (0, order[t]) + if t.startswith('"'): + return (1, t) + if t.startswith("list"): + return (2, t) + if t == "dict": + return (3, "") + if t == "object": + return (4, "") + return (5, t) -def get_type(annotation: Type) -> str: + +def get_friendly_type(annotation: Type) -> str: + """Get a user-friendly type string for documentation. + + Produces types like: ``int | str``, ``"vscode" | "cursor"``, ``list[object]``. + """ + # Unwrap Annotated if get_origin(annotation) is Annotated: - return get_type(get_args(annotation)[0]) + return get_friendly_type(get_args(annotation)[0]) + + # Handle Union (including Optional) if get_origin(annotation) is Union: - # Optional is Union with None. 
- # We don't want to show Optional[A, None] but just Optional[A] - if annotation.__name__ == "Optional": - args = ",".join(get_type(arg) for arg in get_args(annotation)[:-1]) - else: - args = ",".join(get_type(arg) for arg in get_args(annotation)) - return f"{annotation.__name__}[{args}]" + args = [a for a in get_args(annotation) if a is not type(None)] + if not args: + return "" + parts: list = [] + for arg in args: + friendly = get_friendly_type(arg) + # Split compound types (e.g., "int | str" from Range) to deduplicate, + # but avoid splitting types that contain brackets (e.g., list[...]) + if "[" not in friendly: + for part in friendly.split(" | "): + if part and part not in parts: + parts.append(part) + else: + if friendly and friendly not in parts: + parts.append(friendly) + parts.sort(key=_type_sort_key) + return " | ".join(parts) + + # Handle Literal — list values if get_origin(annotation) is Literal: - return str(annotation).split(".", maxsplit=1)[-1] + values = [v.value if isinstance(v, Enum) else v for v in get_args(annotation)] + return " | ".join(f'"{v}"' for v in values) + + # Handle list if get_origin(annotation) is list: - return f"List[{get_type(get_args(annotation)[0])}]" + args = get_args(annotation) + if args: + inner = get_friendly_type(args[0]) + return f"list[{inner}]" + return "list" + + # Handle dict if get_origin(annotation) is dict: - return f"Dict[{get_type(get_args(annotation)[0])}, {get_type(get_args(annotation)[1])}]" - return annotation.__name__ + return "dict" + + # Handle concrete classes + if inspect.isclass(annotation): + # Enum — list values + if issubclass(annotation, Enum): + values = [e.value for e in annotation] + return " | ".join(f'"{v}"' for v in values) + + # Range — depends on inner type parameter + if issubclass(annotation, Range): + min_field = annotation.__fields__.get("min") + if min_field and inspect.isclass(min_field.type_): + # Range[Memory] → str, Range[int] → int | str + if issubclass(min_field.type_, float): + 
return "str" + return "int | str" + + # Memory (float subclass that parses "8GB" strings) + from dstack._internal.core.models.resources import Memory as _Memory + + if issubclass(annotation, _Memory): + return "str" + + # BaseModel subclass (not Range) + if issubclass(annotation, BaseModel) and not issubclass(annotation, Range): + # Root models (with __root__ field) — resolve from the root type + if "__root__" in annotation.__fields__: + return get_friendly_type(annotation.__fields__["__root__"].annotation) + # Models with custom __get_validators__ accept primitive input (int, str) + # in addition to the full object form (e.g., GPUSpec, CPUSpec, DiskSpec) + if "__get_validators__" in annotation.__dict__: + return "int | str | object" + return "object" + + # ComputeCapability (tuple subclass that parses "7.5" strings) + if annotation.__name__ == "ComputeCapability": + return "float | str" + + # Constrained and primitive types — check MRO + # bool must come before int (bool is a subclass of int) + if issubclass(annotation, bool): + return "bool" + if issubclass(annotation, int): + # Duration (int subclass that parses "5m" strings) + if annotation.__name__ == "Duration": + return "int | str" + return "int" + if issubclass(annotation, float): + return "float" + if issubclass(annotation, str): + return "str" + if issubclass(annotation, (list, tuple)): + return "list" + if issubclass(annotation, dict): + return "dict" + + return annotation.__name__ + + return str(annotation) + + +_JSON_SCHEMA_TYPE_MAP = { + "string": "str", + "integer": "int", + "number": "float", + "boolean": "bool", + "array": "list", + "object": "object", +} + + +def _enrich_type_from_schema(friendly_type: str, prop_schema: Dict[str, Any]) -> str: + """Enrich the friendly type with extra accepted types from the JSON schema. 
+ + Models may define ``schema_extra`` that adds ``anyOf`` entries for fields + that accept alternative input types (e.g., duration fields typed as ``int`` + but also accepting ``str`` like ``"5m"``). + """ + any_of = prop_schema.get("anyOf") + if not any_of: + return friendly_type + # Only consider string/integer — the most common alternative input types. + # Skip boolean (typically a backward-compat artifact) and object/array. + _ENRICHABLE = {"string": "str", "integer": "int"} + schema_types = set() + for entry in any_of: + # Skip entries with enum constraints — those are already captured as literal values + if "enum" in entry: + continue + mapped = _ENRICHABLE.get(entry.get("type", "")) + if mapped: + schema_types.add(mapped) + # Add any schema types not already present in the friendly type + current_parts = [p.strip() for p in friendly_type.split(" | ")] + new_parts = schema_types - set(current_parts) + if not new_parts: + return friendly_type + all_parts = list(set(current_parts) | new_parts) + # If str is now present, single-value literals are redundant + if "str" in all_parts: + all_parts = [p for p in all_parts if not p.startswith('"') or p in all_parts] + all_parts.sort(key=_type_sort_key) + return " | ".join(all_parts) def generate_schema_reference( model_path: str, *, - overrides: Dict[str, Dict[str, Any]] = None, + overrides: Optional[dict[str, dict[str, Any]]] = None, prefix: str = "", ) -> str: module, model_name = model_path.rsplit(".", maxsplit=1) cls = getattr(importlib.import_module(module), model_name) + assert issubclass(cls, BaseModel) rows = [] if ( not overrides @@ -60,12 +228,33 @@ def generate_schema_reference( "", ] ) + # Get JSON schema to detect extra accepted types from schema_extra + try: + schema_props = cls.schema().get("properties", {}) + except Exception: + schema_props = {} for name, field in cls.__fields__.items(): + default = field.default + default_repr: Optional[str] + if default is None: + default_repr = None + elif 
isinstance(default, (list, tuple, dict)) and len(default) == 0: + default_repr = None + elif isinstance(default, Enum): + default_repr = str(default.value) + elif isinstance(default, BaseModel): + default_repr = str(default) + elif isinstance(default, str): + default_repr = default + else: + default_repr = json.dumps(default) + friendly_type = get_friendly_type(field.annotation) + friendly_type = _enrich_type_from_schema(friendly_type, schema_props.get(name, {})) values = dict( name=name, description=field.field_info.description, - type=get_type(field.annotation), - default=field.default, + type=friendly_type, + default=default_repr, required=field.required, ) # TODO: If the field doesn't have description (e.g. BaseConfiguration.type), we could fallback to docstring @@ -76,17 +265,9 @@ def generate_schema_reference( # TODO: This is a dirty workaround if field_type: if field.annotation.__name__ == "Annotated": - if field_type.__name__ == "Optional": - field_type = get_args(get_args(field.annotation)[0])[0] - if field_type.__name__ == "List": - field_type = get_args(get_args(field.annotation)[0])[0] - if field_type.__name__ == "Union": - field_type = get_args(get_args(field.annotation)[0])[0] - base_model = ( - inspect.isclass(field_type) - and issubclass(field_type, BaseModel) - and not issubclass(field_type, Range) - ) + if field_type.__name__ in ["Optional", "List", "list", "Union"]: + field_type = get_args(field_type)[0] + base_model = _is_linkable_type(field_type) else: base_model = False _defaults = ( @@ -112,34 +293,32 @@ def generate_schema_reference( if not base_model else f"[`{values['name']}`](#{item_id_prefix}{link_name})" ) - item_optional_marker = "(Optional)" if not values["required"] else "" + item_required_marker = "(Required)" if values["required"] else "(Optional)" + item_type_display = f"`{values['type']}`" if values.get("type") else "" item_description = (values["description"]).replace("\n", "
    ") + "." item_default = _defaults if not values["required"] else _must_be item_id = f"#{values['name']}" if not base_model else f"#_{values['name']}" item_toc_label = f"data-toc-label='{values['name']}'" item_css_cass = "class='reference-item'" - rows.append( - prefix - + " ".join( - [ - f"#### {item_header}", - "-", - item_optional_marker, - item_description, - item_default, - "{", - item_id, - item_toc_label, - item_css_cass, - "}", - ] - ) - ) + parts = [ + f"###### {item_header}", + "-", + item_required_marker, + item_type_display, + item_description, + item_default, + "{", + item_id, + item_toc_label, + item_css_cass, + "}", + ] + rows.append(prefix + " ".join(p for p in parts if p)) return "\n".join(rows) def sub_schema_reference(match: re.Match) -> str: - logger.info("Generating schema reference for `%s`", match.group(2)) + logger.debug("Generating schema reference for `%s`", match.group(2)) options = yaml.safe_load("\n".join(row[4:] for row in match.group(3).split("\n"))) logger.debug("Options: %s", options) return ( @@ -148,10 +327,18 @@ def sub_schema_reference(match: re.Match) -> str: ) -file: File -for file in mkdocs_gen_files.files: +def expand_schema_references(text: str) -> str: + """Expand #SCHEMA# placeholders in markdown text. 
Used by hooks when gen-files is not used.""" + return re.sub( + r"( *)#SCHEMA#\s+(dstack\.[.a-z_0-9A-Z]+)\s*((?:\n {4}[^\n]+)*)\n", + sub_schema_reference, + text, + ) + + +def process_file(file: File): if not fnmatch(file.src_uri, FILE_PATTERN): - continue + return logger.debug("Looking for schema references in `%s`", file.src_uri) with mkdocs_gen_files.open(file.src_uri, "r") as f: text = f.read() @@ -160,10 +347,15 @@ def sub_schema_reference(match: re.Match) -> str: # overrides: # name: # required: true - text = re.sub( - r"( *)#SCHEMA#\s+(dstack\.[.a-z_0-9A-Z]+)\s*((?:\n {4}[^\n]+)*)\n", - sub_schema_reference, - text, - ) + text = expand_schema_references(text) with mkdocs_gen_files.open(file.src_uri, "w") as f: f.write(text) + + +def main(): + # Processing sequentially since there is no speed up with concurrent processing + for file in mkdocs_gen_files.files: + process_file(file) + + +main() diff --git a/scripts/docs/hooks.py b/scripts/docs/hooks.py new file mode 100644 index 0000000000..ca78a48cf5 --- /dev/null +++ b/scripts/docs/hooks.py @@ -0,0 +1,365 @@ +import importlib.util +import json +import logging +import mimetypes +import os +import posixpath +import re +import shutil +import sys +from pathlib import Path +from xml.sax.saxutils import escape + +import yaml + +from mkdocs.structure.files import File + +mimetypes.add_type("text/plain", ".md") + +log = logging.getLogger("mkdocs") + +WELL_KNOWN_SKILLS_DIR = ".well-known/skills" +SKILL_PATH = ("skills", "dstack", "SKILL.md") +DISABLE_LLM_TXT_ENV = "DSTACK_DOCS_DISABLE_LLM_TXT" +DISABLE_YAML_SCHEMAS_ENV = "DSTACK_DOCS_DISABLE_YAML_SCHEMAS" +SCHEMA_REFERENCE_PREFIX = "docs/reference/" +SWAGGER_TAG_ARG = r"(?:\s+tag=(?P<tag_quote>[\"'])(?P<tag>.*?)(?P=tag_quote))?" +SWAGGER_TOKEN = re.compile(rf"!!swagger(?:\s+(?P<path>[^\s<>&:!]+){SWAGGER_TAG_ARG})?!!") +SWAGGER_HTTP_TOKEN = re.compile( + rf"!!swagger-http(?:\s+(?P<path>https?://[^\s!]+){SWAGGER_TAG_ARG})?!!" +) +SWAGGER_USAGE_MSG = ( + "Usage: '!!swagger <filename> [tag=\"tag name\"]!!' 
or " + "'!!swagger-http <url> [tag=\"tag name\"]!!'. " + "File must either exist locally and be placed next to the .md that contains " + "the swagger statement, or be an http(s) URL." +) +HTTP_METHODS = {"get", "put", "post", "delete", "options", "head", "patch", "trace"} +UNTAGGED_OPENAPI_TAG = "default" + + +def _expand_schema_references(text: str) -> str: + """Lazy load gen_schema_reference by file path so it works regardless of sys.path.""" + hooks_dir = os.path.dirname(os.path.abspath(__file__)) + gen_path = os.path.join(hooks_dir, "gen_schema_reference.py") + spec = importlib.util.spec_from_file_location("gen_schema_reference", gen_path) + if spec is None or spec.loader is None: + raise ImportError(f"Cannot load {gen_path}") + module = importlib.util.module_from_spec(spec) + sys.modules["gen_schema_reference"] = module + spec.loader.exec_module(module) + return module.expand_schema_references(text) + + +def _get_schema_expanded_content(rel_path, config, src_path=None): + """Return expanded markdown for reference/**/*.md that contain #SCHEMA#, else None. + If src_path is given (e.g. from on_post_build loop), read from it; else build path from config. 
+ """ + if os.environ.get(DISABLE_YAML_SCHEMAS_ENV): + return None + if not rel_path.startswith(SCHEMA_REFERENCE_PREFIX) or not rel_path.endswith(".md"): + log.debug(f"Skipping {rel_path}: not in {SCHEMA_REFERENCE_PREFIX} or not .md") + return None + if src_path is None: + repo_root = os.path.dirname(config["config_file_path"]) + docs_dir = config["docs_dir"] + if not os.path.isabs(docs_dir): + docs_dir = os.path.join(repo_root, docs_dir) + src_path = os.path.join(docs_dir, rel_path.replace("/", os.sep)) + if not os.path.isfile(src_path): + log.debug(f"Skipping {rel_path}: source file not found at {src_path}") + return None + try: + with open(src_path, "r", encoding="utf-8") as f: + text = f.read() + except OSError as e: + log.debug(f"Skipping {rel_path}: error reading file: {e}") + return None + if "#SCHEMA#" not in text: + log.debug(f"Skipping {rel_path}: no #SCHEMA# placeholders found") + return None + log.debug(f"Expanding schema references in {rel_path}") + return _expand_schema_references(text) + + +def on_page_read_source(page, config): + """Use expanded schema content for reference docs when rendering HTML.""" + rel_path = page.file.src_uri + content = _get_schema_expanded_content(rel_path, config) + if content is not None: + return content + return None + + +def on_page_markdown(markdown, page, config, files): + """Render Swagger UI tokens with the project's preferred defaults.""" + while True: + match = SWAGGER_TOKEN.search(markdown) + is_http = False + if match is None: + match = SWAGGER_HTTP_TOKEN.search(markdown) + is_http = True + if match is None: + return markdown + markdown = _replace_swagger_token(markdown, match, is_http, page, files) + + +def _replace_swagger_token(markdown, match, is_http, page, files): + pre_token = markdown[: match.start()] + post_token = markdown[match.end() :] + path = match.group("path") + tag = match.groupdict().get("tag") + operation_headings = "" + if path is None: + return _swagger_error(pre_token, post_token, 
SWAGGER_USAGE_MSG) + if is_http: + url = path + else: + try: + api_file = Path(page.file.abs_src_path).parent / path + except ValueError as exc: # pragma: no cover + return _swagger_error(pre_token, post_token, f"Invalid path. {exc.args[0]}") + if not api_file.exists(): + return _swagger_error(pre_token, post_token, f"File {path} not found.") + try: + src_uri = api_file.relative_to(page.file.src_dir).as_posix() + except ValueError as exc: + return _swagger_error( + pre_token, + post_token, + f"File {path} must be inside the docs directory. {exc.args[0]}", + ) + new_file = File(src_uri, page.file.src_dir, page.file.dest_dir, False) + url = _relative_url(page.file.dest_uri, new_file.dest_uri) + for file in files: + if file.dest_uri != new_file.dest_uri: + continue + if file.abs_src_path == new_file.abs_src_path: + break + return _swagger_error( + pre_token, + post_token, + "Cannot use 2 different swagger files with same filename in same page.", + ) + else: + files.append(new_file) + operation_headings = _openapi_operation_headings(api_file, tag) + return pre_token + operation_headings + _swagger_html(url, tag) + post_token + + +def _relative_url(page_dest_uri: str, asset_dest_uri: str) -> str: + page_dir = posixpath.dirname(page_dest_uri) + return posixpath.relpath(asset_dest_uri, page_dir) + + +def _swagger_html(url: str, tag: str | None) -> str: + tag_attr = "" + if tag is not None: + tag_attr = f' data-openapi-tag="{_escape_html_attr(tag)}"' + return f""" + +
    + +""" + + +def _openapi_operation_headings(api_file: Path, tag: str | None) -> str: + try: + schema = json.loads(api_file.read_text()) + except (OSError, json.JSONDecodeError) as exc: + log.warning(f"Cannot generate Swagger operation headings from {api_file}: {exc}") + return "" + + operations = _get_openapi_operations(schema, tag) + if not operations: + return "" + + used_ids: set[str] = set() + headings = [_openapi_operation_heading(operation, used_ids) for operation in operations] + return "\n".join(headings) + "\n\n" + + +def _get_openapi_operations( + schema: dict, + tag: str | None, +) -> list[dict[str, str]]: + operations = [] + for path, path_item in schema.get("paths", {}).items(): + if not isinstance(path_item, dict): + continue + for method, operation in path_item.items(): + method = method.lower() + if method not in HTTP_METHODS or not isinstance(operation, dict): + continue + operation_tags = operation.get("tags") or [UNTAGGED_OPENAPI_TAG] + if tag is not None and tag not in operation_tags: + continue + operations.append( + { + "method": method, + "path": path, + "summary": str(operation.get("summary") or ""), + } + ) + return operations + + +def _openapi_operation_heading(operation: dict[str, str], used_ids: set[str]) -> str: + method = operation["method"] + path = operation["path"] + label = _openapi_operation_label(operation) + anchor_id = _openapi_operation_anchor_id(method, path, used_ids) + attrs = [ + f"#{anchor_id}", + ".dstack-swagger-operation-anchor", + f"data-toc-label={json.dumps(label)}", + f"data-openapi-method={json.dumps(method)}", + f"data-openapi-path={json.dumps(path)}", + ] + return f"## {label} {{ {' '.join(attrs)} }}" + + +def _openapi_operation_label(operation: dict[str, str]) -> str: + summary = operation.get("summary", "").strip() + if summary: + return summary + return operation["path"] + + +def _openapi_operation_anchor_id(method: str, path: str, used_ids: set[str]) -> str: + base = re.sub(r"[^a-z0-9]+", "-", 
f"{method}-{path}".lower()).strip("-") or method + anchor_id = base + index = 2 + while anchor_id in used_ids: + anchor_id = f"{base}-{index}" + index += 1 + used_ids.add(anchor_id) + return anchor_id + + +def _escape_html_attr(value: str) -> str: + return escape(value, {'"': "&quot;"}) + + +def _swagger_error(pre_token: str, post_token: str, message: str) -> str: + return pre_token + escape(f"!! SWAGGER ERROR: {message} !!") + post_token + + +def on_config(config): + if os.environ.get(DISABLE_YAML_SCHEMAS_ENV): + log.warning("YAML schema reference generation is disabled") + if os.environ.get(DISABLE_LLM_TXT_ENV): + log.warning("llms.txt generation is disabled") + return config + + +def on_post_build(config): + """Copy .md files to site (raw) and write .well-known/skills index.""" + site_dir = config["site_dir"] + docs_dir = config["docs_dir"] + + # Create .nojekyll to prevent GitHub Pages from ignoring .well-known directory + nojekyll_path = os.path.join(site_dir, ".nojekyll") + with open(nojekyll_path, "w") as f: + f.write("") + + # Create _config.yml to explicitly include .well-known directory + # This ensures Jekyll (if it runs) includes the .well-known directory + config_yml_path = os.path.join(site_dir, "_config.yml") + with open(config_yml_path, "w") as f: + f.write('include: [".well-known"]\n') + + for root, _, files in os.walk(docs_dir): + for file in files: + if not file.endswith(".md"): + continue + + src_path = os.path.join(root, file) + rel_path = os.path.relpath(src_path, docs_dir).replace(os.sep, "/") + content = _get_schema_expanded_content(rel_path, config, src_path=src_path) + dest_path = os.path.join(site_dir, rel_path) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + if content is not None: + # Write expanded schema content + log.info(f"Expanding schema references in {rel_path}") + with open(dest_path, "w", encoding="utf-8") as f: + f.write(content) + else: + # Just copy the file as-is + shutil.copy2(src_path, dest_path) + + 
_write_well_known_skills(config, site_dir) + _generate_llms_files(config, site_dir) + + +def _write_well_known_skills(config, site_dir): + """Parse skills/dstack/SKILL.md and write .well-known/skills/index.json. name and description come from frontmatter only.""" + repo_root = os.path.dirname(config["config_file_path"]) + skill_src = os.path.join(repo_root, *SKILL_PATH) + if not os.path.isfile(skill_src): + return + + name = None + description = None + try: + with open(skill_src, "r", encoding="utf-8") as f: + text = f.read() + if text.startswith("---"): + parts = text.split("---", 2) + if len(parts) >= 3: + data = yaml.safe_load(parts[1]) + if isinstance(data, dict): + name = data.get("name") + description = data.get("description") + except Exception as e: + log.error(f"Skill parsing error: {e}") + + if not name or not description: + log.warning( + "skills/dstack/SKILL.md missing name or description in frontmatter; skipping .well-known/skills" + ) + return + + out_dir = os.path.join(site_dir, WELL_KNOWN_SKILLS_DIR, name) + os.makedirs(out_dir, exist_ok=True) + shutil.copy2(skill_src, os.path.join(out_dir, "SKILL.md")) + # Serve skill at site root (both skill.md and SKILL.md) from skills/dstack/SKILL.md + shutil.copy2(skill_src, os.path.join(site_dir, "skill.md")) + shutil.copy2(skill_src, os.path.join(site_dir, "SKILL.md")) + + index_path = os.path.join(site_dir, WELL_KNOWN_SKILLS_DIR, "index.json") + index = { + "skills": [ + {"name": name, "description": description.strip()[:1024], "files": ["SKILL.md"]} + ] + } + with open(index_path, "w", encoding="utf-8") as f: + json.dump(index, f, indent=2) + + log.info(f"Published skill: {name}") + + +def _generate_llms_files(config, site_dir): + """Generate llms.txt and llms-full.txt using external script.""" + if os.environ.get(DISABLE_LLM_TXT_ENV): + return + + repo_root = os.path.dirname(config["config_file_path"]) + + # Import and run the generator + hooks_dir = os.path.dirname(os.path.abspath(__file__)) + gen_path = 
os.path.join(hooks_dir, "gen_llms_files.py") + spec = importlib.util.spec_from_file_location("gen_llms_files", gen_path) + if spec is None or spec.loader is None: + log.error(f"Cannot load {gen_path}") + return + module = importlib.util.module_from_spec(spec) + sys.modules["gen_llms_files"] = module + spec.loader.exec_module(module) + + try: + # Pass mkdocs config to generator + module.generate_llms_files(repo_root, site_dir, config) + log.info("Generated llms.txt and llms-full.txt") + except Exception as e: + log.error(f"Failed to generate llms files: {e}") diff --git a/scripts/merge_kubeconfigs.sh b/scripts/merge_kubeconfigs.sh new file mode 100755 index 0000000000..e7087f1aca --- /dev/null +++ b/scripts/merge_kubeconfigs.sh @@ -0,0 +1,12 @@ +#!/bin/sh +set -eu + +if [ ${#} -lt 2 ]; then + echo "usage: $(basename "${0}") PATH1 PATH2 [PATH3 ...]" >&2 + exit 1 +fi + +# Windows is not supported; on Windows a path separator is ';', not ':' +KUBECONFIG=$(IFS=':'; echo "${*}") +export KUBECONFIG +kubectl config view --raw --flatten | grep -Ev '^current-context: ' diff --git a/scripts/oci_image_tools.py b/scripts/oci_image_tools.py index ab401c837e..07021fbf86 100644 --- a/scripts/oci_image_tools.py +++ b/scripts/oci_image_tools.py @@ -3,20 +3,20 @@ import time from argparse import ArgumentParser, Namespace from dataclasses import dataclass -from datetime import datetime, timedelta -from typing import Dict, Iterable, List, Mapping +from datetime import datetime, timedelta, timezone +from typing import Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, TypeVar import oci from oci.object_storage.models import Bucket from oci.work_requests.models import WorkRequest from dstack._internal.core.backends.oci import resources +from dstack._internal.core.backends.oci.models import OCIDefaultCreds from dstack._internal.core.backends.oci.region import ( OCIRegionClient, get_subscribed_regions, make_region_clients_map, ) -from 
dstack._internal.core.models.backends.oci import OCIDefaultCreds WORK_REQUEST_UPDATE_INTERVAL_SECS = 15 MAX_IMAGE_IMPORT_OR_EXPORT_SECS = 40 * 60 @@ -121,7 +121,7 @@ def setup_parser(cls, parser: ArgumentParser) -> None: parser.set_defaults(to_struct=cls.from_namespace, run_command=check_command) @staticmethod - def from_namespace(args: Namespace) -> "PublishCommandArgs": + def from_namespace(args: Namespace) -> "CheckCommandArgs": return CheckCommandArgs( image_name=args.image_name, regions=args.regions or [], @@ -129,6 +129,128 @@ def from_namespace(args: Namespace) -> "PublishCommandArgs": ) +@dataclass +class DeletePublicationsCommandArgs: + compartment_id: str + regions: List[str] + before: datetime + name_contains: Optional[str] + keep_latest: int + yes: bool + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument("--compartment", dest="compartment_id", required=True) + parser.add_argument("--regions", metavar="REGION_NAME", nargs="*") + _add_cleanup_filter_arguments(parser) + parser.set_defaults(to_struct=cls.from_namespace, run_command=delete_publications_command) + + @staticmethod + def from_namespace(args: Namespace) -> "DeletePublicationsCommandArgs": + return DeletePublicationsCommandArgs( + compartment_id=args.compartment_id, + regions=args.regions or [], + before=_parse_before(args.before), + name_contains=args.name_contains, + keep_latest=_validate_keep_latest(args.keep_latest), + yes=args.yes, + ) + + +@dataclass +class DeleteImagesCommandArgs: + compartment_id: str + regions: List[str] + before: datetime + name_contains: Optional[str] + keep_latest: int + yes: bool + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument("--compartment", dest="compartment_id", required=True) + parser.add_argument("--regions", metavar="REGION_NAME", nargs="*") + _add_cleanup_filter_arguments(parser) + parser.set_defaults(to_struct=cls.from_namespace, run_command=delete_images_command) + + 
@staticmethod + def from_namespace(args: Namespace) -> "DeleteImagesCommandArgs": + return DeleteImagesCommandArgs( + compartment_id=args.compartment_id, + regions=args.regions or [], + before=_parse_before(args.before), + name_contains=args.name_contains, + keep_latest=_validate_keep_latest(args.keep_latest), + yes=args.yes, + ) + + +@dataclass +class DeleteBucketsCommandArgs: + compartment_id: str + regions: List[str] + before: datetime + name_contains: Optional[str] + keep_latest: int + yes: bool + + @classmethod + def setup_parser(cls, parser: ArgumentParser) -> None: + parser.add_argument("--compartment", dest="compartment_id", required=True) + parser.add_argument("--regions", metavar="REGION_NAME", nargs="*") + _add_cleanup_filter_arguments(parser) + parser.set_defaults(to_struct=cls.from_namespace, run_command=delete_buckets_command) + + @staticmethod + def from_namespace(args: Namespace) -> "DeleteBucketsCommandArgs": + return DeleteBucketsCommandArgs( + compartment_id=args.compartment_id, + regions=args.regions or [], + before=_parse_before(args.before), + name_contains=args.name_contains, + keep_latest=_validate_keep_latest(args.keep_latest), + yes=args.yes, + ) + + +def _add_cleanup_filter_arguments(parser: ArgumentParser) -> None: + parser.add_argument( + "--before", + required=True, + metavar="YYYY-MM-DD", + help="Delete resources created strictly before this date (UTC)", + ) + parser.add_argument( + "--name-contains", + help="Only consider resources whose name contains this substring (case-insensitive)", + ) + parser.add_argument( + "--keep-latest", + type=int, + default=0, + help="Always keep this many newest matching resources per region, " + "regardless of --before (default: 0)", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Actually delete (default: preview only)", + ) + + +def _parse_before(value: str) -> datetime: + try: + return datetime.strptime(value, "%Y-%m-%d").replace(tzinfo=timezone.utc) + except ValueError: + raise 
ScriptError(f"Invalid --before date {value!r}, expected YYYY-MM-DD") + + +def _validate_keep_latest(value: int) -> int: + if value < 0: + raise ScriptError("--keep-latest must be >= 0") + return value + + def main() -> None: parser = ArgumentParser(description="Tools for delivering OCI images") subparsers = parser.add_subparsers() @@ -161,7 +283,44 @@ def main() -> None: ) CheckCommandArgs.setup_parser(check_parser) + delete_publications_parser = subparsers.add_parser( + name="delete-publications", + description=( + "Delete OCI Marketplace community publications (a.k.a. Community " + "Applications) older than a date to free up the marketplace quota. " + "Dry-run by default; pass --yes to actually delete. Run this before " + "delete-images, since an image cannot be deleted while a publication " + "still references it." + ), + ) + DeletePublicationsCommandArgs.setup_parser(delete_publications_parser) + + delete_images_parser = subparsers.add_parser( + name="delete-images", + description=( + "Delete Custom Images older than a date in the given compartment and " + "regions. Dry-run by default; pass --yes to actually delete." + ), + ) + DeleteImagesCommandArgs.setup_parser(delete_images_parser) + + delete_buckets_parser = subparsers.add_parser( + name="delete-buckets", + description=( + "Delete Object Storage buckets older than a date in the given compartment " + "and regions, along with their contents (objects, pre-authenticated " + "requests, in-progress uploads). The copy command creates a bucket named " + "after the image to transfer it between regions and normally deletes it; " + "use this to clean up buckets left over by interrupted copies. " + "Dry-run by default; pass --yes to actually delete." 
+ ), + ) + DeleteBucketsCommandArgs.setup_parser(delete_buckets_parser) + args = parser.parse_args() + if not hasattr(args, "run_command"): + parser.print_help() + sys.exit(1) args.run_command(args.to_struct(args)) @@ -241,6 +400,176 @@ def check_command(args: CheckCommandArgs) -> None: ) +def delete_publications_command(args: DeletePublicationsCommandArgs) -> None: + region_clients = get_region_clients(required_regions=args.regions) + regions_to_clean = args.regions or list(region_clients) + total_deleted = 0 + + for region in sorted(regions_to_clean): + client = region_clients[region].marketplace_client + publications = list_community_publications(args.compartment_id, client) + publications = _filter_by_name(publications, lambda p: p.name, args.name_contains) + keep, to_delete = _partition_for_deletion( + publications, lambda p: p.time_created, args.before, args.keep_latest + ) + _report_selection(region, "publications", keep, to_delete, lambda p: (p.name, p.id)) + + if args.yes: + for publication in to_delete: + client.delete_publication(publication.id) + logging.info( + "[%s] deleted publication %s (%s)", region, publication.name, publication.id + ) + total_deleted += 1 + + _report_outcome(args.yes, "publications", total_deleted) + + +def delete_images_command(args: DeleteImagesCommandArgs) -> None: + region_clients = get_region_clients(required_regions=args.regions) + regions_to_clean = args.regions or list(region_clients) + total_deleted = 0 + + for region in sorted(regions_to_clean): + client = region_clients[region].compute_client + images = list_compartment_images(args.compartment_id, client) + images = _filter_by_name(images, lambda i: i.display_name, args.name_contains) + keep, to_delete = _partition_for_deletion( + images, lambda i: i.time_created, args.before, args.keep_latest + ) + _report_selection(region, "images", keep, to_delete, lambda i: (i.display_name, i.id)) + + if args.yes: + for image in to_delete: + client.delete_image(image.id) + 
logging.info("[%s] deleted image %s (%s)", region, image.display_name, image.id) + total_deleted += 1 + + _report_outcome(args.yes, "images", total_deleted) + + +def delete_buckets_command(args: DeleteBucketsCommandArgs) -> None: + region_clients = get_region_clients(required_regions=args.regions) + regions_to_clean = args.regions or list(region_clients) + total_deleted = 0 + + for region in sorted(regions_to_clean): + client = region_clients[region].object_storage_client + namespace: str = client.get_namespace().data + buckets = list_compartment_buckets(namespace, args.compartment_id, client) + buckets = _filter_by_name(buckets, lambda b: b.name, args.name_contains) + keep, to_delete = _partition_for_deletion( + buckets, lambda b: b.time_created, args.before, args.keep_latest + ) + _report_selection(region, "buckets", keep, to_delete, lambda b: (b.name, namespace)) + + if args.yes: + for bucket in to_delete: + resources.delete_bucket(namespace, bucket.name, client) + logging.info("[%s] deleted bucket %s", region, bucket.name) + total_deleted += 1 + + _report_outcome(args.yes, "buckets", total_deleted) + + +def list_community_publications( + compartment_id: str, client: oci.marketplace.MarketplaceClient +) -> List[oci.marketplace.models.PublicationSummary]: + """ + List community publications (a.k.a. "Community Applications") created in + `compartment_id`. These are the publisher-side counterparts of marketplace + listings and count against the marketplace "Community Applications" quota. + """ + return list( + resources.chain_paginated_responses( + client.list_publications, + compartment_id=compartment_id, + listing_type=oci.marketplace.models.PublicationSummary.LISTING_TYPE_COMMUNITY, + ) + ) + + +def list_compartment_images( + compartment_id: str, client: oci.core.ComputeClient +) -> List[oci.core.models.Image]: + """ + List Custom Images owned by `compartment_id`. 
`list_images` also returns + Oracle platform images (with no compartment), which must never be deleted, + so they are filtered out here. + """ + images = resources.chain_paginated_responses(client.list_images, compartment_id=compartment_id) + return [image for image in images if image.compartment_id == compartment_id] + + +def list_compartment_buckets( + namespace: str, compartment_id: str, client: oci.object_storage.ObjectStorageClient +) -> List[oci.object_storage.models.BucketSummary]: + return list( + resources.chain_paginated_responses( + client.list_buckets, namespace_name=namespace, compartment_id=compartment_id + ) + ) + + +T = TypeVar("T") + + +def _filter_by_name( + items: Iterable[T], get_name: Callable[[T], str], name_contains: Optional[str] +) -> List[T]: + if not name_contains: + return list(items) + needle = name_contains.lower() + return [item for item in items if needle in get_name(item).lower()] + + +def _partition_for_deletion( + items: Iterable[T], + get_time: Callable[[T], datetime], + before: datetime, + keep_latest: int, +) -> Tuple[List[T], List[T]]: + # Sort newest first so --keep-latest preserves the most recent resources. 
+ ordered = sorted(items, key=get_time, reverse=True) + keep, to_delete = [], [] + for index, item in enumerate(ordered): + if index < keep_latest or get_time(item) >= before: + keep.append(item) + else: + to_delete.append(item) + return keep, to_delete + + +def _report_selection( + region: str, + kind: str, + keep: Sequence[T], + to_delete: Sequence[T], + describe: Callable[[T], Tuple[str, str]], +) -> None: + logging.info( + "[%s] %d matching %s: %d to delete, %d to keep", + region, + len(keep) + len(to_delete), + kind, + len(to_delete), + len(keep), + ) + for item in keep: + name, ocid = describe(item) + logging.info("[%s] KEEP %s (%s)", region, name, ocid) + for item in to_delete: + name, ocid = describe(item) + logging.info("[%s] DELETE %s (%s)", region, name, ocid) + + +def _report_outcome(deleted_for_real: bool, kind: str, total_deleted: int) -> None: + if not deleted_for_real: + logging.info("Preview only. Re-run with --yes to delete the %s.", kind) + else: + logging.info("Deleted %d %s.", total_deleted, kind) + + def get_region_clients( required_regions: Iterable[str] = frozenset(), ) -> Dict[str, OCIRegionClient]: @@ -420,8 +749,7 @@ def find_image_in_regions( ) if image is None: raise ScriptError( - f"Image {image_name} does not exist is {region_name}, " - f"compartment {compartment_id}" + f"Image {image_name} does not exist is {region_name}, compartment {compartment_id}" ) images[region_name] = image return images diff --git a/scripts/packer/README.md b/scripts/packer/README.md index 1cd91feaac..be77673054 100644 --- a/scripts/packer/README.md +++ b/scripts/packer/README.md @@ -1,13 +1,18 @@ -To run, you need to specify AWS credentials in ENV +# Packer templates for `dstack` VM images -## Build Ubuntu AMI (with CUDA) -```shell -packer build packer.json -``` +This directory contains HashiCorp Packer templates for building VM images that are then used by `dstack` when running instances on some of the VM-based backends. 
While `dstack` uses standard OS images for some backends, backends with custom-built images have an advantage because these images are optimized for `dstack`, e.g. they contain pre-pulled `dstack` Docker images, which reduces the startup time of `dstack` jobs. + +For most backends, we build two images: one for CPU-only instances, typically published as `dstack-X.Y`, and one for NVIDIA GPU instances, typically published as `dstack-cuda-X.Y`, where `X.Y` is the image version. Some backends may have additional images, e.g. Azure has `dstack-grid-X.Y` for instances requiring NVIDIA Grid drivers. + +## Builds + +Production builds are triggered manually in GitHub Actions, see `.github/workflows/docker.yml`. -# Azure +The GitHub Actions workflow also allows for staging builds. Staging builds are more limited than production builds, e.g. the resulting image can be restricted to a single region and not made public, but this is usually sufficient for testing. -## Allocate resources (make credentials) +If you still need to build the images locally, see the GitHub Actions workflow for examples of how to use the packer templates. Additional instructions for some backends are provided below. + +### Azure Follow [installation instruction](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt) for Azure CLI `az`. [Login](https://learn.microsoft.com/en-us/cli/azure/authenticate-azure-cli) for managing resources: @@ -53,27 +58,3 @@ Set environment variables. | AZURE_CLIENT_SECRET | client_secret | | AZURE_TENANT_ID | tenant_id | | AZURE_SUBSCRIPTION_ID | subscription_id | - -# Nebius - -## Setup Nebius credentials - -> `compute.admin` is not sufficient for packer. Use `admin` role instead. 
- -```shell -ncp config profile create packer -ncp config set service-account-key path/to/service_account.json -ncp config set endpoint api.ai.nebius.cloud:443 -export PKR_VAR_nebius_token=$(ncp iam create-token) -``` - -## Build images - -```shell -export PKR_VAR_nebius_folder_id=... -export PKR_VAR_nebius_subnet_id=... -# no CUDA -packer build -only yandex.nebius -var image_version=0.4rc3 . -# with CUDA -packer build -only yandex.nebius-cuda -var image_version=0.4rc3 . -``` diff --git a/scripts/packer/aws-image-cuda.json b/scripts/packer/aws-image-cuda.json index 012da212d2..c60ee6d325 100644 --- a/scripts/packer/aws-image-cuda.json +++ b/scripts/packer/aws-image-cuda.json @@ -4,7 +4,7 @@ "aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}", "region": "eu-west-1", "ssh_username": "ubuntu", - "base_ami": "ami-0cffefff2d52e0a23", + "base_ami": "ami-0bc691261a82b32bc", "instance_type": "c5.large", "subnet_id": "subnet-c39cb6a5", "docker_version": "", @@ -79,12 +79,7 @@ }, { "type": "shell", - "script": "provisioners/install-nvidia-docker.sh" - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" + "script": "provisioners/install-nvidia-container-toolkit.sh" } ] } diff --git a/scripts/packer/aws-image.json b/scripts/packer/aws-image.json index 84d136b005..ab6922e9fb 100644 --- a/scripts/packer/aws-image.json +++ b/scripts/packer/aws-image.json @@ -4,7 +4,7 @@ "aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}", "region": "eu-west-1", "ssh_username": "ubuntu", - "base_ami": "ami-0cffefff2d52e0a23", + "base_ami": "ami-0bc691261a82b32bc", "instance_type": "c5.large", "subnet_id": "subnet-c39cb6a5", "docker_version": "", @@ -70,11 +70,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" } ] } 
diff --git a/scripts/packer/azure-image-cuda.json b/scripts/packer/azure-image-cuda.json index 1cef65f862..0a200e7855 100644 --- a/scripts/packer/azure-image-cuda.json +++ b/scripts/packer/azure-image-cuda.json @@ -6,7 +6,7 @@ "azure_tenant_id": "{{env `AZURE_TENANT_ID`}}", "azure_subscription_id": "{{env `AZURE_SUBSCRIPTION_ID`}}", "azure_location": "westeurope", - "azure_vm_size": "Standard_DS1_v2", + "azure_vm_size": "Standard_D2s_v6", "build_prefix": "", "docker_version": "", "cuda_drivers_version": "", @@ -22,8 +22,8 @@ "managed_image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}", "os_type": "Linux", "image_publisher": "canonical", - "image_offer": "0001-com-ubuntu-server-jammy", - "image_sku": "22_04-lts-gen2", + "image_offer": "ubuntu-24_04-lts", + "image_sku": "server", "azure_tags": { "Name": "DSTACK-CUDA" }, @@ -70,12 +70,7 @@ }, { "type": "shell", - "script": "provisioners/install-nvidia-docker.sh" - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" + "script": "provisioners/install-nvidia-container-toolkit.sh" }, { "type": "shell", diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index 726959d4cf..bad64a24aa 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -6,7 +6,7 @@ "azure_tenant_id": "{{env `AZURE_TENANT_ID`}}", "azure_subscription_id": "{{env `AZURE_SUBSCRIPTION_ID`}}", "azure_location": "westeurope", - "azure_vm_size": "Standard_DS1_v2", + "azure_vm_size": "Standard_D2s_v6", "build_prefix": "", "docker_version": "", "image_version": "" @@ -21,8 +21,8 @@ "managed_image_name": "{{user `build_prefix`}}dstack-grid-{{user `image_version` | clean_resource_name}}", "os_type": "Linux", "image_publisher": "canonical", - "image_offer": "0001-com-ubuntu-server-jammy", - "image_sku": "22_04-lts-gen2", + "image_offer": "ubuntu-24_04-lts", + 
"image_sku": "server", "azure_tags": { "Name": "DSTACK-GRID" }, @@ -64,16 +64,20 @@ }, { "type": "shell", - "script": "provisioners/install-nvidia-grid-driver-for-azure.sh" + "script": "provisioners/downgrade-azure-kernel.sh" }, { "type": "shell", - "script": "provisioners/install-nvidia-docker.sh" + "inline": ["sudo reboot"], + "expect_disconnect": true + }, + { + "type": "shell", + "script": "provisioners/install-nvidia-grid-driver-for-azure.sh" }, { "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" + "script": "provisioners/install-nvidia-container-toolkit.sh" }, { "type": "shell", diff --git a/scripts/packer/azure-image.json b/scripts/packer/azure-image.json index 3a0967328c..7d7d0d9c82 100644 --- a/scripts/packer/azure-image.json +++ b/scripts/packer/azure-image.json @@ -6,7 +6,7 @@ "azure_tenant_id": "{{env `AZURE_TENANT_ID`}}", "azure_subscription_id": "{{env `AZURE_SUBSCRIPTION_ID`}}", "azure_location": "westeurope", - "azure_vm_size": "Standard_DS1_v2", + "azure_vm_size": "Standard_D2s_v6", "build_prefix": "", "docker_version": "", "image_version": "" @@ -21,8 +21,8 @@ "managed_image_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}", "os_type": "Linux", "image_publisher": "canonical", - "image_offer": "0001-com-ubuntu-server-jammy", - "image_sku": "22_04-lts-gen2", + "image_offer": "ubuntu-24_04-lts", + "image_sku": "server", "azure_tags": { "Name": "DSTACK" }, @@ -62,11 +62,6 @@ "./install-docker.sh --version {{user `docker_version`}}" ] }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/build-cuda-image.pkr.hcl b/scripts/packer/build-cuda-image.pkr.hcl index ddfe2ce43c..48b1c20024 100644 --- 
a/scripts/packer/build-cuda-image.pkr.hcl +++ b/scripts/packer/build-cuda-image.pkr.hcl @@ -1,11 +1,5 @@ build { - source "source.yandex.nebius" { - name = "nebius-cuda" - image_description = "Ubuntu 22.04 with CUDA, Docker and dstackai/base:cuda images" - image_family = "dstack-cuda" - image_name = "${local.image_name}-cuda" - } - # TODO(egor-s) add other sources + # TODO: transition to this generic template from legacy per-backend JSON templates provisioner "shell" { inline = ["cloud-init status --long --wait"] @@ -35,11 +29,6 @@ build { } provisioner "shell" { - script = "provisioners/install-nvidia-docker.sh" - } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" + script = "provisioners/install-nvidia-container-toolkit.sh" } } diff --git a/scripts/packer/build-image.pkr.hcl b/scripts/packer/build-image.pkr.hcl index e486d3fd31..6033ee4b1f 100644 --- a/scripts/packer/build-image.pkr.hcl +++ b/scripts/packer/build-image.pkr.hcl @@ -1,10 +1,5 @@ build { - source "source.yandex.nebius" { - image_description = "Ubuntu 22.04 with Docker and dstackai/base images" - image_family = "dstack" - image_name = local.image_name - } - # TODO(egor-s) add other sources + # TODO: transition to this generic template from legacy per-backend JSON templates provisioner "shell" { inline = ["cloud-init status --long --wait"] @@ -27,9 +22,4 @@ build { provisioner "shell" { inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"] } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" - } } diff --git a/scripts/packer/config.pkr.hcl b/scripts/packer/config.pkr.hcl index 6e5af302a9..521b167092 100644 --- a/scripts/packer/config.pkr.hcl +++ b/scripts/packer/config.pkr.hcl @@ -5,4 +5,4 @@ packer { source = "github.com/hashicorp/yandex" } } -} \ No newline at end of file +} diff 
--git a/scripts/packer/gcp-a3mega-image.json b/scripts/packer/gcp-a3mega-image.json new file mode 100644 index 0000000000..ed9876fb97 --- /dev/null +++ b/scripts/packer/gcp-a3mega-image.json @@ -0,0 +1,32 @@ +{ + "variables": { + "image_version": "" + }, + "builders": [ + { + "type": "googlecompute", + "project_id": "dstack", + "source_image": "dstack-a3mega-20250401t065024z", + "image_name": "dstack-a3mega-{{user `image_version`}}", + "instance_name": "dstack-a3mega-{{user `image_version`}}", + "image_description": "dstack VM image for A3 Mega instances with pre-pulled Docker images. The source image is based on https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster.", + "ssh_username": "ubuntu", + "zone": "us-central1-a", + "disk_size": 100 + } + ], + "provisioners": [ + { + "type": "shell", + "inline": [ + "sudo rm /etc/apt/sources.list.d/ar_us_apt_pkg_dev_projects_gce_ai_infra.list", + "sudo apt-get update", + "sudo apt-get install -y --no-install-recommends datacenter-gpu-manager-4-proprietary datacenter-gpu-manager-exporter", + "sudo systemctl disable google-cloud-ops-agent.service", + "gcloud -q auth configure-docker us-docker.pkg.dev", + "docker pull us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14", + "docker pull us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1" + ] + } + ] +} diff --git a/scripts/packer/gcp-image-cuda.json b/scripts/packer/gcp-image-cuda.json index 443fa27aac..6f68d6e277 100644 --- a/scripts/packer/gcp-image-cuda.json +++ b/scripts/packer/gcp-image-cuda.json @@ -9,7 +9,7 @@ { "type": "googlecompute", "project_id": "dstack", - "source_image": "ubuntu-2204-jammy-v20230714", + "source_image": "ubuntu-2404-noble-amd64-v20250828", "image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}", "instance_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version` | clean_resource_name}}", 
"ssh_username": "ubuntu", @@ -54,12 +54,7 @@ }, { "type": "shell", - "script": "provisioners/install-nvidia-docker.sh" - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" + "script": "provisioners/install-nvidia-container-toolkit.sh" } ] } diff --git a/scripts/packer/gcp-image.json b/scripts/packer/gcp-image.json index 7fa2058af6..b500a78989 100644 --- a/scripts/packer/gcp-image.json +++ b/scripts/packer/gcp-image.json @@ -8,7 +8,7 @@ { "type": "googlecompute", "project_id": "dstack", - "source_image": "ubuntu-2204-jammy-v20230714", + "source_image": "ubuntu-2404-noble-amd64-v20250828", "image_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}", "instance_name": "{{user `build_prefix`}}dstack-{{user `image_version` | clean_resource_name}}", "ssh_username": "ubuntu", @@ -45,11 +45,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/nebius.pkr.hcl b/scripts/packer/nebius.pkr.hcl deleted file mode 100644 index e19b6b0ba0..0000000000 --- a/scripts/packer/nebius.pkr.hcl +++ /dev/null @@ -1,12 +0,0 @@ -source "yandex" "nebius" { - disk_size_gb = 30 - disk_type = "network-ssd" - endpoint = "api.ai.nebius.cloud:443" - folder_id = var.nebius_folder_id - source_image_family = "ubuntu-2204-lts" - ssh_username = "ubuntu" - subnet_id = var.nebius_subnet_id - token = var.nebius_token - use_ipv4_nat = true - zone = "eu-north1-c" -} diff --git a/scripts/packer/oci-image-cuda.json b/scripts/packer/oci-image-cuda.json index 7ba151ec3e..df2d69af9f 100644 --- a/scripts/packer/oci-image-cuda.json +++ b/scripts/packer/oci-image-cuda.json @@ -15,7 +15,7 @@ "compartment_ocid": "{{user `oci_compartment_ocid`}}", "subnet_ocid": "{{user 
`oci_subnet_ocid`}}", "shape": "VM.Standard2.1", - "base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaaxroekfbow3kdrdjlwao6tsxxfcb23xmqrdjtjcay2ow52eijvzqa", + "base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaahelib4o7g4fsjgck2lhxjmzonvbniwcmjjn2im4cxlksjgyzw5gq", "image_name": "{{user `build_prefix`}}dstack-cuda-{{user `image_version`}}", "instance_name": "packer-{{user `build_prefix`}}dstack-cuda-{{user `image_version`}}", "ssh_username": "ubuntu" @@ -26,10 +26,6 @@ "type": "shell", "inline": ["cloud-init status --long --wait"] }, - { - "type": "shell", - "script": "provisioners/wait-for-dpkg-lock.sh" - }, { "type": "shell", "scripts": [ @@ -63,12 +59,7 @@ }, { "type": "shell", - "script": "provisioners/install-nvidia-docker.sh" - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" + "script": "provisioners/install-nvidia-container-toolkit.sh" } ] } diff --git a/scripts/packer/oci-image.json b/scripts/packer/oci-image.json index cc0945eebf..c80626723f 100644 --- a/scripts/packer/oci-image.json +++ b/scripts/packer/oci-image.json @@ -14,7 +14,7 @@ "compartment_ocid": "{{user `oci_compartment_ocid`}}", "subnet_ocid": "{{user `oci_subnet_ocid`}}", "shape": "VM.Standard2.1", - "base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaaxroekfbow3kdrdjlwao6tsxxfcb23xmqrdjtjcay2ow52eijvzqa", + "base_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaahelib4o7g4fsjgck2lhxjmzonvbniwcmjjn2im4cxlksjgyzw5gq", "image_name": "{{user `build_prefix`}}dstack-{{user `image_version`}}", "instance_name": "packer-{{user `build_prefix`}}dstack-{{user `image_version`}}", "ssh_username": "ubuntu" @@ -25,10 +25,6 @@ "type": "shell", "inline": ["cloud-init status --long --wait"] }, - { - "type": "shell", - "script": "provisioners/wait-for-dpkg-lock.sh" - }, { "type": "shell", "scripts": [ @@ -54,11 +50,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user 
`docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": ["IMAGE_VERSION={{user `image_version`}}"], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/provisioners/cuda.sh b/scripts/packer/provisioners/cuda.sh index 4e7ab85dd0..6815683e2d 100644 --- a/scripts/packer/provisioners/cuda.sh +++ b/scripts/packer/provisioners/cuda.sh @@ -9,14 +9,17 @@ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y linux-headers-$(uname -r) ARCH=$(uname -m) CUDA_DISTRO=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') -# based on https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts -wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb -rm cuda-keyring_1.0-1_all.deb +# based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ubuntu.html +wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +rm cuda-keyring_1.1-1_all.deb -CUDA_BRANCH=$(cut -d '.' 
-f 1 <<< "$CUDA_DRIVERS_VERSION") sudo apt-get update sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - cuda-drivers-$CUDA_BRANCH=$CUDA_DRIVERS_VERSION \ - nvidia-fabricmanager-$CUDA_BRANCH=$CUDA_DRIVERS_VERSION + nvidia-driver-pinning-$CUDA_DRIVERS_VERSION + +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + nvidia-open \ + nvidia-fabricmanager \ + datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary datacenter-gpu-manager-exporter sudo systemctl enable nvidia-fabricmanager diff --git a/scripts/packer/provisioners/downgrade-azure-kernel.sh b/scripts/packer/provisioners/downgrade-azure-kernel.sh new file mode 100755 index 0000000000..6cfc4a7687 --- /dev/null +++ b/scripts/packer/provisioners/downgrade-azure-kernel.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# based on https://learn.microsoft.com/en-us/azure/virtual-machines/extensions/hpccompute-gpu-linux#known-issues +# this is a temporary solution only required until the issue is fixed + +set -e + +# Install the latest available 6.8 Azure kernel. The exact revision Azure ships +# in the repos changes over time, so we resolve it dynamically instead of pinning +# a specific one (which eventually gets removed and breaks the build). 
+sudo apt-get update +KERNEL_VERSION=$(apt-cache search --names-only '^linux-image-6\.8\.0-[0-9]+-azure$' | awk '{print $1}' | sed 's/^linux-image-//' | sort -V | tail -1) + +if [ -z "$KERNEL_VERSION" ]; then + echo "No linux-image-6.8.0-*-azure kernel available in the repositories" >&2 + exit 1 +fi +echo "Installing Azure kernel $KERNEL_VERSION" +sudo DEBIAN_FRONTEND=noninteractive apt install "linux-image-$KERNEL_VERSION" "linux-headers-$KERNEL_VERSION" -y + +# Update the Grub entry name +grub_entry_name="$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6\.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1)" +sudo sed -i "s/^\s*GRUB_DEFAULT=.*$/GRUB_DEFAULT='Advanced options for Ubuntu>$grub_entry_name'/" /etc/default/grub +sudo update-grub + +# Disable the kernel package upgrade +sudo apt-mark hold $(dpkg --get-selections | grep -Po "^linux[^\t]+${grub_entry_name##* }") diff --git a/scripts/packer/provisioners/install-nvidia-container-toolkit.sh b/scripts/packer/provisioners/install-nvidia-container-toolkit.sh new file mode 100644 index 0000000000..bfaf1a3d0f --- /dev/null +++ b/scripts/packer/provisioners/install-nvidia-container-toolkit.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -e + +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl + +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +sudo apt-get update +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y nvidia-container-toolkit + +sudo nvidia-ctk runtime configure --runtime=docker diff --git a/scripts/packer/provisioners/install-nvidia-docker.sh 
b/scripts/packer/provisioners/install-nvidia-docker.sh deleted file mode 100644 index 311e962985..0000000000 --- a/scripts/packer/provisioners/install-nvidia-docker.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -e - -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl -NVDOCKER_DISTRO=$(. /etc/os-release;echo $ID$VERSION_ID) - -curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - -curl -s -L https://nvidia.github.io/nvidia-docker/$NVDOCKER_DISTRO/nvidia-docker.list \ - | sudo tee /etc/apt/sources.list.d/nvidia-docker.list -curl -s -L https://nvidia.github.io/nvidia-container-runtime/experimental/$NVDOCKER_DISTRO/nvidia-container-runtime.list \ - | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list - -sudo apt-get update -sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nvidia-docker2 diff --git a/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh b/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh index 4d10c4c1c7..8a57b2a7cf 100755 --- a/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh +++ b/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh @@ -8,7 +8,7 @@ sudo apt-get update sudo DEBIAN_FRONTEND=noninteractive apt-get install build-essential linux-azure -y wget --no-verbose -O NVIDIA-Linux-x86_64-grid.run \ - https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run + https://download.microsoft.com/download/2a04ca6a-9eec-40d9-9564-9cdea1ab795f/NVIDIA-Linux-x86_64-570.211.01-grid-azure.run chmod +x NVIDIA-Linux-x86_64-grid.run sudo ./NVIDIA-Linux-x86_64-grid.run --silent --disable-nouveau rm NVIDIA-Linux-x86_64-grid.run diff --git a/scripts/packer/provisioners/kernel/apt-packages.sh 
b/scripts/packer/provisioners/kernel/apt-packages.sh index 3eafe73c6d..3b53230bb0 100644 --- a/scripts/packer/provisioners/kernel/apt-packages.sh +++ b/scripts/packer/provisioners/kernel/apt-packages.sh @@ -2,8 +2,6 @@ set -e -sudo apt-get update - # Common packages across all versions DEPS=" net-tools @@ -19,16 +17,20 @@ DEPS=" tree jq gdb + ufw python3-pip python3-boto python3-boto3 " +# No `apt-get update` here on purpose: the apt package indexes are already +# refreshed by apt-upgrade.sh, which always runs right before this script. + # Install basic packages for dep in $DEPS; do if ! dpkg -s $dep > /dev/null 2>&1; then echo "Attempting installation of missing package: $dep" - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -q $dep + sudo DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=60 install -y -q $dep fi done @@ -36,9 +38,9 @@ done sudo snap remove amazon-ssm-agent || true # Uninstall snapd, which is not used by us. -sudo apt-get purge -y snapd +sudo apt-get -o DPkg::Lock::Timeout=60 purge -y snapd # Uninstall ec2-instance-connect, which is not used by us. # This resolves ec2-instance-connect.service failure during boot, # which causes "systemctl status" in "degraded" state. -sudo apt-get purge -y --auto-remove ec2-instance-connect +sudo apt-get -o DPkg::Lock::Timeout=60 purge -y --auto-remove ec2-instance-connect diff --git a/scripts/packer/provisioners/kernel/apt-upgrade.sh b/scripts/packer/provisioners/kernel/apt-upgrade.sh index 1f60c5383e..f30209e374 100644 --- a/scripts/packer/provisioners/kernel/apt-upgrade.sh +++ b/scripts/packer/provisioners/kernel/apt-upgrade.sh @@ -5,5 +5,44 @@ set -e -sudo apt-get update -sudo DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=60 dist-upgrade -y -q +# +# Run apt-get update, but retry in case the lock is held by another process. +# +# A better way of handling apt races is the `-o DPkg::Lock::Timeout=X` option, +# but it does not work with `apt-get update`. 
+# +# This function was added specifically for the `oci` backend, where our build +# process conflicts with OCI's instance agent. +# +apt_update_with_retry() { + local MAX_RETRIES=10 + local RETRY_DELAY=3 + local COUNT=0 + local LOGFILE=$(mktemp) + + while [ $COUNT -lt $MAX_RETRIES ]; do + set +e + sudo apt-get update 2>&1 | tee "$LOGFILE" + local EXIT_CODE=${PIPESTATUS[0]} + set -e + + if grep -q "Could not get lock" "$LOGFILE"; then + echo "apt lock file is held by another process. Retrying in $RETRY_DELAY seconds..." + COUNT=$((COUNT + 1)) + sleep $RETRY_DELAY + else + return $EXIT_CODE + fi + done + + echo "apt-get update failed due to lock being held after $MAX_RETRIES attempts." + return 1 +} + +apt_update_with_retry +# See https://man7.org/linux/man-pages/man1/dpkg.1.html#OPTIONS for confold/confdef +sudo DEBIAN_FRONTEND=noninteractive apt-get \ + -o DPkg::Lock::Timeout=60 \ + -o Dpkg::Options::=--force-confold \ + -o Dpkg::Options::=--force-confdef \ + dist-upgrade -y -q diff --git a/scripts/packer/provisioners/pull-docker-images.sh b/scripts/packer/provisioners/pull-docker-images.sh deleted file mode 100644 index 23da8cbc4a..0000000000 --- a/scripts/packer/provisioners/pull-docker-images.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -set -e - -IMAGES=" - dstackai/base:py3.12-${IMAGE_VERSION}-cuda-12.1 - dstackai/base:py3.11-${IMAGE_VERSION}-cuda-12.1 - dstackai/base:py3.10-${IMAGE_VERSION}-cuda-12.1 - dstackai/base:py3.9-${IMAGE_VERSION}-cuda-12.1 - dstackai/base:py3.8-${IMAGE_VERSION}-cuda-12.1 -" -echo "START pull image" -for img in $IMAGES; do - docker pull --platform linux/amd64 $img -done -echo "LIST installed images" -docker image ls --all -echo "END " diff --git a/scripts/packer/provisioners/wait-for-dpkg-lock.sh b/scripts/packer/provisioners/wait-for-dpkg-lock.sh deleted file mode 100644 index cc6448a7cf..0000000000 --- a/scripts/packer/provisioners/wait-for-dpkg-lock.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# 
-# Wait until another process releases an apt/dpkg lock. -# -# This is a hack and it might not work for all cases. A better way of handling -# apt races is the `-o DPkg::Lock::Timeout=X` option, but it does not work for -# `apt-get update`. -# - -while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do - sleep 1 -done diff --git a/scripts/packer/variables.pkr.hcl b/scripts/packer/variables.pkr.hcl index f2f3aae4f0..2768467984 100644 --- a/scripts/packer/variables.pkr.hcl +++ b/scripts/packer/variables.pkr.hcl @@ -6,22 +6,3 @@ variable "build_prefix" { variable "image_version" { type = string } - -# Nebius -variable "nebius_folder_id" { - type = string - default = null - sensitive = true -} - -variable "nebius_subnet_id" { - type = string - default = null - sensitive = true -} - -variable "nebius_token" { - type = string - default = null - sensitive = true -} diff --git a/scripts/packer/versions.json b/scripts/packer/versions.json index bc4cf2bc61..584a832abb 100644 --- a/scripts/packer/versions.json +++ b/scripts/packer/versions.json @@ -1,4 +1,4 @@ { - "docker_version": "20.10.17", - "cuda_drivers_version": "535.183.01-1" + "docker_version": "27.1.1", + "cuda_drivers_version": "580" } diff --git a/scripts/publish_azure_image.sh b/scripts/publish_azure_image.sh index fd253bd620..3139693809 100755 --- a/scripts/publish_azure_image.sh +++ b/scripts/publish_azure_image.sh @@ -20,7 +20,7 @@ function get_image_definition { } # We create a separate image definition for each dstack version since -# gallery-image-version can't be in one-to-one correspondance with dstack versions +# gallery-image-version can't be in one-to-one correspondence with dstack versions # (it has to follow semver, e.g. no rc) function create_image_definition() { echo Creating image definition... 
@@ -33,7 +33,8 @@ function create_image_definition() { --sku $image_definition \ --os-type Linux \ --os-state generalized \ - --hyper-v-generation V2 + --hyper-v-generation V2 \ + --features DiskControllerTypes=SCSI,NVMe } function create_image_version() { diff --git a/scripts/release_notes.py b/scripts/release_notes.py new file mode 100644 index 0000000000..ab2da2d210 --- /dev/null +++ b/scripts/release_notes.py @@ -0,0 +1,116 @@ +# /// script +# dependencies = [ +# "requests", +# "litellm", +# "boto3", # for AWS Bedrock +# ] +# /// + +import argparse +import os +import re +from pathlib import Path + +import requests +from litellm import completion + +REPO = "dstackai/dstack" +BRANCH = "master" + +# GITHUB_TOKEN to avoid rate limiting +GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] + +# Model can be any supported by LiteLLM: https://docs.litellm.ai/docs/providers +# Prover-specific credentials are picked up from env +# e.g. AWS_REGION=us-east-1 AWS_PROFILE=... for AWS Bedrock +MODEL = os.getenv("LLM_MODEL", "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0") + + +def get_draft_release_by_tag(tag: str) -> dict: + r = requests.get( + f"https://api.github.com/repos/{REPO}/releases", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + timeout=10, + ) + if not r.ok: + msg = f"Error getting GitHub releases; status: {r.status_code}, body: {r.text}" + raise Exception(msg) + for release in r.json(): + if release["tag_name"] == tag and release["draft"]: + return release + # May error if the draft not on the first page - we assume draft was created recently + raise ValueError(f"Release for tag {tag} not found") + + +def get_prs_from_draft(draft_body: str) -> list[dict]: + prs = [] + pr_numbers = extract_pr_numbers_from_draft(draft_body) + for pr_number in pr_numbers: + r = requests.get( + f"https://api.github.com/repos/{REPO}/pulls/{pr_number}", + headers={"Authorization": f"token 
{GITHUB_TOKEN}"}, + timeout=10, + ) + prs.append(r.json()) + return prs + + +def extract_pr_numbers_from_draft(notes: str) -> list[int]: + return [int(num) for num in re.findall(r"/pull/(\d+)", notes)] + + +def generate_release_notes( + draft_body: str, + prs: list[dict], + examples: str, +) -> str: + pr_summaries = "\n\n".join(f"PR #{pr['number']}: {pr['title']}\n{pr['body']}" for pr in prs) + prompt = f""" +You are a release notes generator. + +Here are the draft GitHub release notes: +{draft_body} + +Here are the PR details (titles + descriptions): +{pr_summaries} + +Task: +* Keep the 'What's Changed' and 'Contributors' sections as they are. +* Add expanded sections in the beginning for major features and changes. Do not mention minor fixes. +* Use clear, user-friendly prose. Avoid emojis. +* Use the PR descriptions to enrich the expanded sections. +* Include examples of how to use the new features when they are available in the PR descriptions. +* Do not group sections based on functionality (like "New features"). Instead, group by domain (e.g. "Runs", "Backends", "Examples") or do not group at all. +* Include "Deprecations" and "Breaking changes" sections if there are any. + +Examples of good release notes: +{examples} + +""" + response = completion( + model=MODEL, + messages=[{"role": "user", "content": prompt}], + ) + return response["choices"][0]["message"]["content"] + + +if __name__ == "__main__": + # TODO: When the script is sufficiently polished, we may automate draft release generation and its update, + # and integrate the script into the CI. + parser = argparse.ArgumentParser( + description=( + "Generate expanded `dstack` release notes from a release draft using LLM." + " The script accepts a release tag for which you must generate automatic release notes beforehand." + " The script does not publish or change anything on GitHub and only outputs the generated release notes." 
+ ) + ) + parser.add_argument("tag", help="Release tag (e.g., 0.19.25)") + args = parser.parse_args() + + with open(Path(__file__).parent / "release_notes_examples.md") as f: + examples = f.read() + draft_release = get_draft_release_by_tag(args.tag) + draft_body = draft_release["body"] + prs = get_prs_from_draft(draft_body) + notes = generate_release_notes(draft_body, prs, examples) + print(notes) diff --git a/scripts/release_notes_examples.md b/scripts/release_notes_examples.md new file mode 100644 index 0000000000..7220d7b206 --- /dev/null +++ b/scripts/release_notes_examples.md @@ -0,0 +1,112 @@ +## Run configurations + +### Repo directory + +It's now possible to specify the directory in the container where the repo is mounted: + +```yaml +type: dev-environment + +ide: vscode + +repos: + - local_path: . + path: my_repo + + # or using short syntax: + # - .:my_repo +``` + +The `path` property can be an absolute path or a relative path (with respect to `working_dir`). It's available inside the run as the `$DSTACK_REPO_DIR` environment variable. If `path` is not set, the `/workflow` path is used. + +### Working directory + +Previously, the `working_dir` property had complicated semantics: it defaulted to the repo path (`/workflow`), but for tasks and services without `commands`, the image working directory was used. You could also specify a custom `working_dir` relative to the repo directory. This is now reversed: you specify `working_dir` as an absolute path, and the repo path can be specified relative to it. + +> [!NOTE] +> During the transition period, the legacy behavior of using `/workflow` is preserved if `working_dir` is not set. In future releases, this will be simplified, and `working_dir` will always default to the image working directory. + +## Fleet configuration + +### Nodes, retry, and target + +`dstack` now indefinitely maintains `nodes.min` specified for cloud fleets.
If instances get terminated for any reason and there are fewer instances than `nodes.min`, `dstack` will provision new fleet instances in the background. + +There is also a new `nodes.target` property that specifies the number of instances to provision on fleet apply. Since `nodes.min` is now always maintained, you may specify `nodes.target` different from `nodes.min` to provision more instances than need to be maintained. + +Example: + +```yaml +type: fleet +name: default-fleet +nodes: + min: 1 # Maintain one instance + target: 2 # Provision two instances initially + max: 3 +``` + +`dstack` will provision two instances. After deleting one instance, there will be one instance left. Deleting the last instance will trigger `dstack` to re-create the instance. + +## Offers + +The UI now has a dedicated page showing GPU offers available across all configured backends. + + + +## DigitalOcean and AMD Developer Cloud + +The release adds native integration with [DigitalOcean](https://www.digitalocean.com/products/gradient/gpu-droplets) and +[AMD Developer Cloud](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html). + +A backend configuration example: + +```yaml +projects: +- name: main + backends: + - type: amddevcloud + project_name: TestProject + creds: + type: api_key + api_key: ... +``` + +For DigitalOcean, set `type` to `digitalocean`. + +The `digitalocean` and `amddevcloud` backends support NVIDIA and AMD GPU VMs, respectively, and allow you to run +[dev environments](../../docs/concepts/dev-environments.md) (interactive development), [tasks](../../docs/concepts/tasks.md) +(training, fine-tuning, or other batch jobs), and [services](../../docs/concepts/services.md) (inference). + +## Security + +> [!IMPORTANT] +> This update fixes a vulnerability in the `cloudrift`, `cudo`, and `datacrunch` backends.
Instances created with earlier `dstack` versions lack proper firewall rules, potentially exposing internal APIs and allowing unauthorized access. +> +> Users of these backends are advised to update to the latest version and re-create any running instances. + +## What's changed + +* Minor Hot Aisle Cleanup by @Bihan in https://github.com/dstackai/dstack/pull/2978 +* UI for offers #3004 by @olgenn in https://github.com/dstackai/dstack/pull/3042 +* Add `repos[].path` property by @un-def in https://github.com/dstackai/dstack/pull/3041 +* style(frontend): Add missing final newline by @un-def in https://github.com/dstackai/dstack/pull/3044 +* Implement fleet state-spec consolidation to maintain `nodes.min` by @r4victor in https://github.com/dstackai/dstack/pull/3047 +* Add digital ocean and amd dev backend by @Bihan in https://github.com/dstackai/dstack/pull/3030 +* test: include amddevcloud and digitalocean in backend types by @Bihan in https://github.com/dstackai/dstack/pull/3053 +* Fix missing digitaloceanbase configurator methods by @Bihan in https://github.com/dstackai/dstack/pull/3055 +* Expose job working dir via environment variable by @un-def in https://github.com/dstackai/dstack/pull/3049 +* [runner] Ensure `working_dir` exists by @un-def in https://github.com/dstackai/dstack/pull/3052 +* Fix server compatibility with pre-0.19.27 runners by @un-def in https://github.com/dstackai/dstack/pull/3054 +* Bind shim and exposed container ports to localhost by @jvstme in https://github.com/dstackai/dstack/pull/3057 +* Fix client compatibility with pre-0.19.27 servers by @un-def in https://github.com/dstackai/dstack/pull/3063 
+* [Docs] Reflect the repo and working directory changes (#3041) by @peterschmidt85 in https://github.com/dstackai/dstack/pull/3064 +* Show a CLI warning when using autocreated fleets by @r4victor in https://github.com/dstackai/dstack/pull/3060 +* Improve UX with private repos by @un-def in https://github.com/dstackai/dstack/pull/3065 +* Set up instance-level firewall on all backends by @jvstme in https://github.com/dstackai/dstack/pull/3058 +* Exclude target when equal to min for responses by @r4victor in https://github.com/dstackai/dstack/pull/3070 +* [Docs] Shorten the default `working_dir` warning by @peterschmidt85 in https://github.com/dstackai/dstack/pull/3072 +* Do not issue empty update for deleted_fleets_placement_groups by @r4victor in https://github.com/dstackai/dstack/pull/3071 +* Exclude target when equal to min for responses (attempt 2) by @r4victor in https://github.com/dstackai/dstack/pull/3074 + + +**Full changelog**: https://github.com/dstackai/dstack/compare/0.19.26...0.19.27 diff --git a/scripts/setup_kubernetes.py b/scripts/setup_kubernetes.py new file mode 100644 index 0000000000..22295ffebf --- /dev/null +++ b/scripts/setup_kubernetes.py @@ -0,0 +1,324 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [] +# /// +""" +This script prepares a K8s cluster for dstack integration, namely: + + * Creates/updates required objects: a namespace, a service account, roles, etc. + * Generates and prints a kubeconfig that can be used with dstack. + +By default, a plain kubeconfig is generated. With `--output-format=dstack`, a dstack backend +config with kubeconfig contents embedded is generated. + +The config is printed to stdout. Use shell redirects (`> /path/to/file`) to save it. 
+ +Example: + + # Generate a dstack backend config with the embedded kubeconfig data and save it to a file + uv run scripts/setup_kubernetes.py --namespace dstack --output-format dstack > k8s.yml + # Now you can copy k8s.yml's contents into the web UI project settings or server/config.yml +""" + +import argparse +import base64 +import logging +import os +import pathlib +import shlex +import subprocess +import tempfile +import textwrap +import time +from typing import Literal + +REPO_DIR = pathlib.Path(__file__).parent.parent +MANIFESTS_DIR = REPO_DIR / "mkdocs/snippets/kubernetes" + +# See MANIFESTS_DIR/*.yaml +ROLE_NAME = "dstack-backend" +NAMESPACE_PLACEHOLDER = "${NAMESPACE}" + +DEFAULT_SERVICE_ACCOUNT_NAME = ROLE_NAME +DEFAULT_OUTPUT_FORMAT = "kubeconfig" +DEFAULT_LOG_LEVEL = "INFO" + + +class Kubectl: + def __init__(self, kubeconfig: str | None = None, context: str | None = None) -> None: + self._kubeconfig = kubeconfig + self._context = context + if context is None: + logging.debug("using current-context") + current_context = self.call("config", "current-context", capture_stdout=True) + # Always use the once-resolved current-context to avoid in-flight context switching + self._context = current_context + + @property + def context(self) -> str: + assert self._context is not None + return self._context + + def call(self, *args: str, input: str | None = None, capture_stdout: bool = False) -> str: + cmd = ["kubectl"] + if self._kubeconfig is not None: + cmd.extend(["--kubeconfig", self._kubeconfig]) + if self._context is not None: + cmd.extend(["--context", self._context]) + cmd.extend(args) + logging.debug("kubectl call: %s", shlex.join(cmd)) + cp = subprocess.run(cmd, text=True, input=input, stdout=subprocess.PIPE) + output = cp.stdout.strip() + if cp.returncode != 0: + logging.error("kubectl command failed: %s", output) + cp.check_returncode() + if capture_stdout: + return output + logging.debug("kubectl output: %s", output) + return "" + + def apply(self, 
manifest: str) -> None: + manifest = textwrap.dedent(manifest).strip() + logging.debug("applying manifest:\n%s", textwrap.indent(manifest, " ")) + self.call("apply", "-f", "-", input=manifest) + + +def create_resources( + *, + kubectl: Kubectl, + namespace: str, + service_account_name: str, + cluster_role_name: str, + role_name: str, +) -> str: + logging.info("creating required resources") + + # Namespace + kubectl.apply(f""" + apiVersion: v1 + kind: Namespace + metadata: + name: {namespace} + """) + + # ServiceAccount + kubectl.apply(f""" + apiVersion: v1 + kind: ServiceAccount + metadata: + name: {service_account_name} + namespace: {namespace} + """) + + # Secret for service-account-token + service_account_token_name = f"{service_account_name}-service-account-token" + kubectl.apply(f""" + apiVersion: v1 + kind: Secret + metadata: + name: {service_account_token_name} + namespace: {namespace} + annotations: + kubernetes.io/service-account.name: {service_account_name} + type: kubernetes.io/service-account-token + """) + for _ in range(10): + token = kubectl.call( + "get", + "secret", + service_account_token_name, + "-n", + namespace, + "-o", + "jsonpath={.data.token}", + "--ignore-not-found", + capture_stdout=True, + ) + if token: + break + logging.debug("service account token does not exist yet, waiting") + time.sleep(1) + else: + raise AssertionError(f"service account token does not exist: {service_account_token_name}") + token = base64.b64decode(token).decode() + + # ClusterRole + with open(MANIFESTS_DIR / "dstack-backend-clusterrole.yaml") as f: + cluster_role_manifest = f.read() + assert f" name: {cluster_role_name}\n" in cluster_role_manifest + kubectl.apply(cluster_role_manifest) + + # ClusterRoleBinding + kubectl.apply(f""" + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRoleBinding + metadata: + name: {cluster_role_name} + subjects: + - kind: ServiceAccount + name: {service_account_name} + namespace: {namespace} + roleRef: + apiGroup: 
rbac.authorization.k8s.io + kind: ClusterRole + name: {cluster_role_name} + """) + + # Role + with open(MANIFESTS_DIR / "dstack-backend-role.yaml") as f: + role_manifest = f.read() + assert f" namespace: {NAMESPACE_PLACEHOLDER}\n" in role_manifest + assert f" name: {role_name}\n" in role_manifest + role_manifest = role_manifest.replace(NAMESPACE_PLACEHOLDER, namespace) + kubectl.apply(role_manifest) + + # RoleBinding + kubectl.apply(f""" + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: {role_name} + namespace: {namespace} + subjects: + - kind: ServiceAccount + name: {service_account_name} + namespace: {namespace} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {role_name} + """) + + logging.info("all resources created") + return token + + +def generate_kubeconfig( + *, + kubectl: Kubectl, + namespace: str, + service_account_name: str, + service_account_token: str, +) -> str: + logging.info("generating kubeconfig") + kubeconfig_content = kubectl.call( + "config", "view", "--minify", "--raw", "--flatten", capture_stdout=True + ) + with tempfile.NamedTemporaryFile("w+") as f: + f.write(kubeconfig_content) + f.flush() + tmp_kubectl = Kubectl(kubeconfig=f.name) + old_user = tmp_kubectl.call( + "config", "view", "-o", "jsonpath={.contexts[0].context.user}", capture_stdout=True + ) + assert old_user + tmp_kubectl.call("config", "delete-user", old_user) + cluster = tmp_kubectl.call( + "config", "view", "-o", "jsonpath={.contexts[0].context.cluster}", capture_stdout=True + ) + assert cluster + new_user = f"{cluster}-{service_account_name}" + tmp_kubectl.call("config", "set-credentials", new_user, "--token", service_account_token) + tmp_kubectl.call( + "config", "set-context", "--current", "--user", new_user, "--namespace", namespace + ) + logging.info("kubeconfig generated") + return tmp_kubectl.call("config", "view", "--raw", capture_stdout=True) + + +class Args(argparse.Namespace): + kubeconfig: str | None + context: 
str | None + namespace: str + service_account: str + output_format: Literal["kubeconfig", "dstack"] + log_level: str + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--kubeconfig", + required=False, + help="path to kubeconfig file (default: same as with kubectl)", + ) + parser.add_argument( + "--context", + required=False, + help="kubeconfig context to use (default: same as with kubectl)", + ) + parser.add_argument( + "--namespace", + required=True, + help="namespace for all resources managed by dstack (required)", + ) + parser.add_argument( + "--service-account", + default=DEFAULT_SERVICE_ACCOUNT_NAME, + help=f"name of dstack service account (default: {DEFAULT_SERVICE_ACCOUNT_NAME})", + ) + parser.add_argument( + "--output-format", + choices=["kubeconfig", "dstack"], + default=DEFAULT_OUTPUT_FORMAT, + help=( + "output format, kubeconfig for plain kubeconfig file," + " dstack for dstack backend config with embedded kubeconfig" + f" (default: {DEFAULT_OUTPUT_FORMAT})" + ), + ) + parser.add_argument( + "--log-level", + default=DEFAULT_LOG_LEVEL, + help=f"script logging level (default: {DEFAULT_LOG_LEVEL})", + ) + return parser.parse_args(namespace=Args()) + + +def main() -> None: + args = parse_args() + logging.basicConfig(level=args.log_level.upper(), format="%(levelname)s: %(message)s") + kubectl = Kubectl(kubeconfig=args.kubeconfig, context=args.context) + if args.kubeconfig is not None: + logging.info("using kubeconfig: %s", args.kubeconfig) + elif kubeconfig_env_value := os.getenv("KUBECONFIG"): + logging.info("using kubeconfig(s) from env: %s", kubeconfig_env_value) + else: + logging.info("using default kubeconfig") + logging.info("using context: %s", kubectl.context) + logging.info("using namespace: %s", args.namespace) + whoami = kubectl.call("auth", "whoami", "-o", "name", capture_stdout=True) + logging.debug("whoami: %s", whoami) + 
service_account_token = create_resources( + kubectl=kubectl, + namespace=args.namespace, + service_account_name=args.service_account, + cluster_role_name=ROLE_NAME, + role_name=ROLE_NAME, + ) + generated_kubeconfig_content = generate_kubeconfig( + kubectl=kubectl, + namespace=args.namespace, + service_account_name=args.service_account, + service_account_token=service_account_token, + ) + logging.info("generated config in `%s` format is printed to stdout\n", args.output_format) + if args.output_format == "dstack": + print( + textwrap.dedent(f""" + type: kubernetes + namespace: {args.namespace} + kubeconfig: + data: | + """).strip() + ) + print(textwrap.indent(generated_kubeconfig_content, " ")) + else: + print(generated_kubeconfig_content) + + +if __name__ == "__main__": + main() diff --git a/scripts/sqlite_to_psql.load b/scripts/sqlite_to_psql.load new file mode 100644 index 0000000000..48385da72d --- /dev/null +++ b/scripts/sqlite_to_psql.load @@ -0,0 +1,9 @@ +LOAD DATABASE + FROM {{SOURCE_PATH}} /* e.g. sqlite:///Users/me/.dstack/server/data/sqlite.db */ + INTO {{TARGET_PATH}} /* e.g. 
postgresql://postgres:postgres@localhost:5432/postgres */ + +WITH preserve index names, data only + +EXCLUDING TABLE NAMES LIKE 'alembic_version' + +SET work_mem to '16MB', maintenance_work_mem to '512 MB'; diff --git a/setup.py b/setup.py deleted file mode 100644 index 33c70fd041..0000000000 --- a/setup.py +++ /dev/null @@ -1,150 +0,0 @@ -import re -import sys -from pathlib import Path - -from setuptools import find_packages, setup - -project_dir = Path(__file__).parent - - -def get_version(): - text = (project_dir / "src" / "dstack" / "version.py").read_text() - match = re.compile(r"__version__\s*=\s*\"?([^\n\"]+)\"?.*").match(text) - if match: - if match.group(1) != "None": - return match.group(1) - else: - return None - else: - sys.exit("Can't parse version.py") - - -def get_long_description(): - return re.sub( - r"\s*|]*>\s*|\s*|]*>\s*|\s*|### Demo\s*", - "", - open(project_dir / "README.md").read(), - ) - - -BASE_DEPS = [ - "pyyaml", - "requests", - "typing-extensions>=4.0.0", - "cryptography", - "packaging", - "python-dateutil", - "gitpython", - "jsonschema", - "paramiko", - "git-url-parse", - "cursor", - "rich", - "rich-argparse", - "tqdm", - "simple-term-menu", - "fastapi", - "starlette>=0.26.0", - "uvicorn", - "pydantic>=1.10.10,<2.0.0", - "pydantic-duality>=1.2.0", - "sqlalchemy[asyncio]>=2.0.0", - "sqlalchemy_utils>=0.40.0", - "alembic>=1.10.2", - "apscheduler<4", - "aiosqlite", - "aiohttp", - "websocket-client", - "watchfiles", - "python-multipart", - "filelock", - "docker>=6.0.0", - "python-dxf>=11.0.0", - "cachetools", - "dnspython", - "grpcio>=1.50", # indirect - "gpuhunt>=0.0.11", - "sentry-sdk[fastapi]", - "httpx", - "aiorwlock", - "python-json-logger", - "alembic-postgresql-enum", - "asyncpg", -] - -AWS_DEPS = [ - "boto3", - "botocore", -] - -AZURE_DEPS = [ - "azure-identity>=1.12.0", - "azure-mgmt-subscription>=3.1.1", - "azure-mgmt-compute>=29.1.0", - "azure-mgmt-network>=23.0.0", - "azure-mgmt-resource>=22.0.0", - 
"azure-mgmt-authorization>=3.0.0", -] - -GCP_DEPS = [ - "google-auth>=2.3.0", # indirect - "google-cloud-storage>=2.0.0", - "google-cloud-compute>=1.5.0", - "google-cloud-logging>=2.0.0", - "google-api-python-client>=2.80.0", - "google-cloud-billing>=1.11.0", - "google-cloud-tpu>=1.18.3", -] - -DATACRUNCH_DEPS = ["datacrunch"] - -KUBERNETES_DEPS = ["kubernetes"] - -LAMBDA_DEPS = AWS_DEPS - -OCI_DEPS = ["oci"] - -ALL_DEPS = AWS_DEPS + AZURE_DEPS + GCP_DEPS + DATACRUNCH_DEPS + KUBERNETES_DEPS + OCI_DEPS - - -setup( - name="dstack", - version=get_version(), - author="Andrey Cheptsov", - author_email="andrey@dstack.ai", - package_dir={"": "src"}, - packages=find_packages("src"), - package_data={ - "dstack.api._public.huggingface.finetuning.sft": ["requirements.txt"], - }, - include_package_data=True, - scripts=[], - entry_points={ - "console_scripts": ["dstack=dstack._internal.cli.main:main"], - }, - url="https://dstack.ai", - project_urls={ - "Source": "https://github.com/dstackai/dstack", - }, - description="dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.", - long_description=get_long_description(), - long_description_content_type="text/markdown", - python_requires=">=3.8", - install_requires=BASE_DEPS, - extras_require={ - "all": ALL_DEPS, - "aws": AWS_DEPS, - "azure": AZURE_DEPS, - "datacrunch": DATACRUNCH_DEPS, - "gcp": GCP_DEPS, - "kubernetes": KUBERNETES_DEPS, - "lambda": LAMBDA_DEPS, - "oci": OCI_DEPS, - }, - classifiers=[ - "Development Status :: 2 - Pre-Alpha", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", - "Programming Language :: Python :: 3", - ], -) diff --git a/skills/dstack/SKILL.md b/skills/dstack/SKILL.md new file mode 100644 index 0000000000..4df1843b75 --- /dev/null +++ b/skills/dstack/SKILL.md @@ -0,0 +1,566 @@ +--- +name: dstack +description: | + dstack is 
an open-source control plane for GPU provisioning and orchestration across GPU clouds, Kubernetes, and on-prem clusters. +--- + +# dstack + +## Overview + +`dstack` provisions and orchestrates workloads across GPU clouds, Kubernetes, and on-prem via fleets. + +**When to use this skill:** +- Running or managing dev environments, tasks, or services on dstack +- Creating, editing, or applying `*.dstack.yml` configurations +- Managing fleets, volumes, gateways, and checking available offers + +## How it works + +`dstack` operates through three core components: + +1. `dstack` server - Can run locally, remotely, or via dstack Sky (managed) +2. `dstack` CLI - Applies configurations and manages or inspects fleets, runs, + logs, events, volumes, gateways, and offers; it uses project configurations + stored in `~/.dstack/config.yml`, which can be managed with `dstack project` +3. `dstack` configuration files - YAML files ending with `.dstack.yml` + +`dstack apply` shows a plan and submits configuration changes. For run +configurations, it attaches when the run reaches `running` by default: it +configures SSH access, forwards declared ports, and streams logs. With `-d`, it +submits and exits. 
+ +## Quick agent flow (detached runs) + +1) Show plan: `echo "n" | dstack apply -f <file>` +2) If plan is OK and user confirms, apply detached: `dstack apply -f <file> -y -d` +3) Check status once: `dstack ps -v` +4) If dev-environment or task with ports and running: attach to surface IDE link/ports/SSH alias (agent runs attach in background); ask to open link +5) If attach fails in sandbox: request escalation; if not approved, ask the user to run `dstack attach` locally and share the output + +**CRITICAL: Never propose `dstack` CLI commands or YAML syntax that don't exist.** +- Only use CLI commands and YAML syntax documented here or verified via `--help` +- If uncertain about a command or its syntax, check the links or use `--help` + +**NEVER do the following:** +- Invent CLI flags not documented here or shown in `--help` +- Guess YAML property names - verify in configuration reference links +- Run `dstack apply` for runs without `-d` in automated contexts (blocks indefinitely) +- Retry failed commands without addressing the underlying error +- Summarize or reformat tabular CLI output - show it as-is +- Use `echo "y" |` when `-y` flag is available +- Assume a command succeeded without checking output for errors + +## Agent execution guidelines + +### Output accuracy +- **NEVER reformat, summarize, or paraphrase CLI output.** Display tables, status output, and error messages exactly as returned. +- When showing command results, use code blocks to preserve formatting. +- If output is truncated due to length, indicate this clearly (e.g., "Output truncated. Full output shows X entries."). + +### Verification before execution +- **When uncertain about any CLI flag or YAML property, run `dstack --help` first.** +- Never guess or invent flags.
Example verification commands: + ```bash + dstack --help # List all commands + dstack apply -h # Flags for apply per configuration type (dev-environment, task, service, fleet, etc) + dstack fleet --help # Fleet subcommands + dstack ps --help # Flags for ps + ``` +- If a command or flag isn't documented, it doesn't exist. + +### Command timing and confirmation handling + +**Commands that stream indefinitely in the foreground:** +- `dstack attach` +- `dstack apply` without `-d` for runs +- `dstack ps -w` + +Agents should avoid blocking: use `-d`, timeouts, or background attach. When attach is needed, run it in the background by default (`nohup ...`), but describe it to the user simply as "attach" unless they ask for a live foreground session. Prefer `dstack ps -v` and poll in a loop if the user wants to watch status. + +**All other commands:** Use 10-60s timeout. Most complete within this range. **While waiting, monitor the output** - it may contain errors, warnings, or prompts requiring attention. + +**Confirmation handling:** +- `dstack apply`, `dstack stop`, `dstack fleet delete` require confirmation +- Use `-y` flag to auto-confirm when user has already approved +- For `dstack stop`, always use `-y` after the user confirms to avoid interactive prompts +- Use `echo "n" |` to preview `dstack apply` plan without executing (avoid `echo "y" |`, prefer `-y`) + +**Best practices:** +- Prefer modifying configuration files over passing parameters to `dstack apply` (unless it's an exception) +- When user confirms deletion/stop operations, use `-y` flag to skip confirmation prompts + +### Detached run follow-up (after `-d`) + +After submitting a run with `-d` (dev-environment, task, service), first determine whether submission failed. If the apply output shows errors (validation, no offers, etc.), stop and surface the error. 
+ +If the run was submitted, do a quick status check with `dstack ps -v`, then guide the user through relevant next steps: +If you need to prompt for next actions, be explicit about the dstack step and command (avoid vague questions). When speaking to the user, refer to the action as "attach" (not "background attach"). +- **Monitor status:** Report the current status (provisioning/pulling/running/finished) and offer to keep watching. Poll `dstack ps -v` every 10-20s if the user wants updates. +- **Attach when running:** For agents, run attach in the background by default so the session does not block. Use it to capture IDE links/SSH alias or enable port forwarding; when describing the action to the user, just say "attach". +- **Dev environments or tasks with ports:** Once `running`, attach to surface the IDE link/port forwarding/SSH alias, then ask whether to open the IDE link. Never open links without explicit approval. +- **Services:** Prefer using service endpoints. Attach only if the user explicitly needs port forwarding or full log replay. +- **Tasks without ports:** Default to `dstack logs` for progress; attach only if full log replay is required. + +### Attaching behavior (blocking vs non-blocking) + +`dstack attach` runs until interrupted and blocks the terminal. **Agents must avoid indefinite blocking.** If a brief attach is needed, use a timeout to capture initial output (IDE link, SSH alias) and then detach. + +Note: `dstack attach` writes SSH alias info under `~/.dstack/ssh/config` (and may update `~/.ssh/config`) to enable `ssh `, IDE connections, port forwarding, and real-time logs (`dstack attach --logs`). If the sandbox cannot write there, the alias will not be created. + +**Permissions guardrail:** If `dstack attach` fails due to sandbox permissions, request permission escalation to run it outside the sandbox. If escalation isn’t approved or attach still fails, ask the user to run `dstack attach` locally and share the IDE link/SSH alias output. 
+ +**Background attach (non-blocking default for agents):** +```bash +nohup dstack attach <run-name> --logs > /tmp/<run-name>.attach.log 2>&1 & echo $! > /tmp/<run-name>.attach.pid +``` +Then read the output: +```bash +tail -n 50 /tmp/<run-name>.attach.log +``` +Offer live follow only if asked: +```bash +tail -f /tmp/<run-name>.attach.log +``` +Stop the background attach (preferred): +```bash +kill "$(cat /tmp/<run-name>.attach.pid)" +``` +If the PID file is missing, fall back to a specific match (avoid killing all attaches): +```bash +pkill -f "dstack attach <run-name>" +``` +**Why this helps:** it keeps the attach session alive (including port forwarding) while the agent remains usable. IDE links and SSH instructions appear in the log file -- surface them and ask whether to open the link (`open "<url>"` on macOS, `xdg-open "<url>"` on Linux) only after explicit approval. + +If background attach fails in the sandbox (permissions writing `~/.dstack` or `~/.ssh`, timeouts), request escalation to run attach outside the sandbox. If not approved, ask the user to run attach locally and share the IDE link/SSH alias. + +### Interpreting user requests + +**"Run something":** When the user asks to run a workload (dev environment, task, service), use `dstack apply` with the appropriate configuration. Note: `dstack run` only supports `dstack run get <run-name> --json` for retrieving run details -- it cannot start workloads. + +**"Connect to" or "open" a dev environment:** If a dev environment is already running, use `dstack attach <run-name> --logs` (agent runs it in the background by default) to surface the IDE URL (`cursor://`, `vscode://`, etc.) and SSH alias. If sandboxed attach fails, request escalation or ask the user to run attach locally and share the link. + +## Configuration types + +`dstack` supports run configurations (dev environments, tasks, and services) and infrastructure configurations (fleets, volumes, and gateways). Configuration files can be named `<name>.dstack.yml` or simply `.dstack.yml`. 
+ +**Common parameters:** All run configurations (dev environments, tasks, services) support many parameters including: +- **Git integration:** Clone repos automatically (`repo`) or mount existing repos (`repos`) +- **File upload:** Upload local files (`files`; see concept docs for examples) +- **Docker support:** Use custom Docker images (`image`); use `docker: true` if you want to use Docker from inside the container (VM-based backends only) +- **Environment:** Set environment variables (`env`), often via `.envrc`. Secrets are supported but less common. +- **Storage:** Persistent network volumes (`volumes`), specify disk size +- **Resources:** Define GPU, CPU, memory, and disk requirements + +**Best practices:** +- Prefer giving configurations a `name` property for easier management +- When configurations need credentials (API keys, tokens), list only env var names in the `env` section (e.g., `- HF_TOKEN`), not values. Recommend storing actual values in a `.envrc` file alongside the configuration, applied via `source .envrc && dstack apply`. +- `python` and `image` are mutually exclusive in run configurations. If `image` is set, do not set `python`. + +### `files` and `repos` intent policy + +Use `files` and `repos` only when the user intends to use local/repo files inside the run. + +- If user asks to use project code/data/config in the run, then add `files` or `repos` as appropriate. +- If it is totally unclear whether files or repos must be mounted, ask one explicit clarification question or default to not mounting. + +`files` guidance: +- Relative paths are valid and preferred for local project files. +- A relative `files` path is placed under the run's `working_dir` (default or set by user). + +`repos` + image/working directory guidance: +- With non-default Docker images, prefer explicit absolute mount targets for `repos` (e.g., `.:/dstack/run`). +- When setting an explicit repo mount path, also set `working_dir` to the same path. 
+- Reason: custom images may have a different/non-empty default working directory, and mounting a repo into a non-empty path can fail. +- With `dstack` default images, the default `working_dir` is already `/dstack/run`. + +### 1. Dev environments +**Use for:** Interactive development with IDE integration (VS Code, Cursor, etc.). + +```yaml +type: dev-environment +name: cursor + +python: "3.12" +ide: vscode + +resources: + gpu: 80GB +``` + +[Concept documentation](https://dstack.ai/docs/concepts/dev-environments.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/dev-environment.md) + +### 2. Tasks +**Use for:** Batch jobs, training runs, fine-tuning, web applications, any executable workload. + +**Key features:** Distributed training (multi-node) and port forwarding for web apps. + +```yaml +type: task +name: train + +python: "3.12" +env: + - HUGGING_FACE_HUB_TOKEN +commands: + - uv pip install -r requirements.txt + - uv run python train.py +ports: + - 8501 # Optional: expose ports for web apps + +resources: + gpu: A100:40GB:2 +``` + +**Port forwarding:** When you specify `ports`, `dstack apply` forwards them to `localhost` while attached. Use `dstack attach ` to reconnect and restore port forwarding. The run name becomes an SSH alias (e.g., `ssh `) for direct access. + +**Distributed training:** Multi-node tasks are supported (e.g., via `nodes`) and require fleets that support inter-node communication (see `placement: cluster` in fleets). + +[Concept documentation](https://dstack.ai/docs/concepts/tasks.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/task.md) + +### 3. Services +**Use for:** Deploying models or web applications as production endpoints. + +**Key features:** OpenAI-compatible model serving, auto-scaling (RPS/queue), custom gateways with HTTPS. 
+ +```yaml +type: service +name: llama31 + +python: "3.12" +env: + - HF_TOKEN +commands: + - uv pip install vllm + - uv run vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct +port: 8000 +model: meta-llama/Meta-Llama-3.1-8B-Instruct + +resources: + gpu: 80GB + disk: 200GB +``` + +**Service endpoints:** +- Without gateway: `<dstack-server-url>/proxy/services/<project-name>/<run-name>/` +- With gateway: `https://<run-name>.<gateway-domain>/` +- Authentication: Unless `auth` is `false`, include `Authorization: Bearer <dstack-token>` on service requests. +- Model endpoint: If `model` is set, `service.model.base_url` from `dstack run get <run-name> --json` provides the model endpoint. For OpenAI-compatible models (the default, unless format is set otherwise), this will be `service.url` + `/v1`. +- Example (with gateway): + ```bash + curl -sS -X POST "https://<run-name>.<gateway-domain>/v1/chat/completions" \ + -H "Authorization: Bearer <dstack-token>" \ + -H "Content-Type: application/json" \ + -d '{"model":"<model-name>","messages":[{"role":"user","content":"Hello"}],"max_tokens":64}' + ``` + +[Concept documentation](https://dstack.ai/docs/concepts/services.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/service.md) + +### 4. Fleets +**Use for:** Pre-provisioning infrastructure for workloads, managing on-prem GPU servers, creating auto-scaling instance pools. + +```yaml +type: fleet +name: my-fleet +nodes: 0..2 + +resources: + gpu: 24GB.. + disk: 200GB + +spot_policy: auto # other values: spot, on-demand +idle_duration: 5m +``` + +**On-demand provisioning:** When `nodes` is a range (e.g., `0..2`), dstack creates a template and provisions instances on demand within the min/max. Use `idle_duration` to terminate idle instances. + +**Distributed workloads:** Use `placement: cluster` for fleets intended for multi-node tasks that require inter-node networking. 
+ +**SSH fleet (on-prem or pre-provisioned):** +```yaml +type: fleet +name: on-prem-fleet + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 192.168.1.10 + - 192.168.1.11 +``` + +[Concept documentation](https://dstack.ai/docs/concepts/fleets.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/fleet.md) + +### 5. Volumes +**Use for:** Persistent storage for datasets, model checkpoints, training artifacts. + +```yaml +type: volume +name: my-volume + +backend: aws +region: us-east-1 + +resources: + disk: 500GB +``` + +**Instance volumes (local, ephemeral, often optional):** +```yaml +type: dev-environment +# ... other config +volumes: + - instance_path: /dstack-cache/pip + path: /root/.cache/pip + optional: true + - instance_path: /dstack-cache/huggingface + path: /root/.cache/huggingface + optional: true +``` + +**Mounting volumes:** Use `volumes` in dev environments, tasks, and services. Network volumes persist independently; instance volumes are tied to the instance lifecycle. + +[Concept documentation](https://dstack.ai/docs/concepts/volumes.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/volume.md) + +### 6. Gateways +**Use for:** Gateways are optional for basic service endpoints. They are +required when a service uses auto-scaling or rate limits, needs HTTPS on a +custom domain, requires WebSockets, or cannot work with the server proxy path +prefix. 
+ +```yaml +type: gateway +name: my-gateway + +backend: aws +region: us-east-1 +domain: example.com +``` + +[Concept documentation](https://dstack.ai/docs/concepts/gateways.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/gateway.md) + +## Essential CLI commands + +### Apply configurations + +**Important behavior:** +- `dstack apply` shows a plan with estimated costs and may ask for confirmation +- In attached mode (default), the terminal blocks and shows output +- In detached mode (`-d`), it submits and exits without attaching + +**Workflow for applying run configurations (dev-environment, task, service):** + +1. **Show plan:** + ```bash + echo "n" | dstack apply -f config.dstack.yml + ``` + Display the FULL output including the offers table and cost estimate. **Do NOT summarize or reformat.** + +2. **Wait for user confirmation.** Do NOT proceed if: + - Output shows "No offers found" or similar errors + - Output shows validation errors + - User has not explicitly confirmed + +3. **Execute (only after user confirms):** + ```bash + dstack apply -f config.dstack.yml -y -d + ``` + +4. **Verify apply status:** + ```bash + dstack ps -v + ``` + +**Workflow for infrastructure (fleet, volume, gateway):** + +1. **Show plan:** + ```bash + echo "n" | dstack apply -f infra.dstack.yml + ``` + Display the FULL output. **Do NOT summarize or reformat.** + +2. **Wait for user confirmation.** + +3. **Execute:** + ```bash + dstack apply -f infra.dstack.yml -y + ``` + +4. **Verify:** Use `dstack fleet`, `dstack volume`, or `dstack gateway` respectively. 
+ +### Fleet management + +```bash +# Create/update fleet +dstack apply -f fleet.dstack.yml + +# List fleets +dstack fleet + +# Get fleet details +dstack fleet get my-fleet + +# Get fleet details as JSON (for troubleshooting) +dstack fleet get my-fleet --json + +# Delete entire fleet (use -y when user already confirmed) +dstack fleet delete my-fleet -y + +# Delete specific instance from fleet (use -y when user already confirmed) +dstack fleet delete my-fleet -i <instance-num> -y +``` + +### Monitor runs + +```bash +# List all runs +dstack ps + +# Verbose output with full details +dstack ps -v + +# JSON output (for troubleshooting/scripting) +dstack ps --json + +# Get specific run details as JSON +dstack run get my-run-name --json +``` + +### Attach to runs + +```bash +# Attach and replay logs from start (preferred, unless asked otherwise) +dstack attach my-run-name --logs + +# Attach without replaying logs (restores port forwarding + SSH only) +dstack attach my-run-name +``` + +### View logs + +```bash +# Stream logs (tail mode) +dstack logs my-run-name + +# Debug mode (includes additional runner logs) +dstack logs my-run-name -d + +# Fetch logs from specific replica (multi-node runs) +dstack logs my-run-name --replica 1 + +# Fetch logs from specific job +dstack logs my-run-name --job 0 +``` + +### Stop runs + +```bash +# Stop specific run (use -y after user confirms) +dstack stop my-run-name -y + +# Abort (force stop) +dstack stop my-run-name --abort +``` + +### List offers + +Offers represent available instance configurations that match resource +requirements. If `--fleet` is omitted, `dstack offer` checks all configured +backends. Listing offers does not create capacity; submitting a run still +requires at least one fleet that can provision or reuse matching instances. +Use `--fleet` to inspect offers available through specific fleets. 
+ +```bash +# Filter by specific backend +dstack offer --backend aws + +# Filter by GPU type +dstack offer --gpu A100 + +# Filter by GPU memory +dstack offer --gpu 24GB..80GB + +# Combine filters +dstack offer --backend aws --gpu A100:80GB + +# Limit to a specific fleet +dstack offer --fleet my-fleet + +# Combine offers from multiple fleets +dstack offer --fleet my-fleet --fleet other-fleet + +# JSON output (for troubleshooting/scripting) +dstack offer --json +``` + +With one `--fleet`, `dstack offer` shows offers available through that fleet. With multiple `--fleet`, it combines offers available through the selected fleets. Identical backend offers are shown once, while matching existing instances stay separate. + +**Max offers:** By default, `dstack offer` returns first N offers (output also +includes the total number). Use `--max-offers N` to increase the limit. + +**Grouping:** Prefer `--group-by gpu` for aggregated output across all offers, +not `--max-offers`. Other supported fields are `backend`, `region`, and +`count`; `region` requires `backend`. + +## Troubleshooting + +When diagnosing issues with dstack workloads or infrastructure: + +1. **Use JSON output for detailed inspection:** + ```bash + dstack fleet get my-fleet --json + dstack run get my-run --json + dstack ps -n 10 --json + dstack offer --json + ``` + +2. **Check verbose run status:** + ```bash + dstack ps -v + ``` + +3. **Examine logs with debug output:** + ```bash + dstack logs my-run -d + ``` + +4. 
**Attach with log replay:** + ```bash + dstack attach my-run --logs + ``` + +Common issues: +- **No offers:** Check `dstack offer`; if submitting a run, ensure at least one fleet can provision or reuse matching instances +- **No fleet:** Ensure at least one fleet is created +- **Configuration errors:** Validate YAML syntax; check `dstack apply` output for specific errors +- **Provisioning timeouts:** Use `dstack ps -v` to see provisioning status; consider spot vs on-demand +- **Connection issues:** Verify server status, check authentication, ensure network access to backends + +**When errors occur:** +1. Display the full error message unchanged +2. Do NOT retry the same command without addressing the error +3. Refer to the [Troubleshooting guide](https://dstack.ai/docs/guides/troubleshooting.md) for guidance + +## Additional resources + +**Core documentation:** +- [Overview](https://dstack.ai/docs/overview.md) +- [Installation](https://dstack.ai/docs/installation.md) +- [Quickstart](https://dstack.ai/docs/quickstart.md) + +**Additional concepts:** +- [Secrets](https://dstack.ai/docs/concepts/secrets.md) +- [Projects](https://dstack.ai/docs/concepts/projects.md) +- [Metrics](https://dstack.ai/docs/concepts/metrics.md) +- [Events](https://dstack.ai/docs/concepts/events.md) + +**Guides:** +- [CLI & API](https://dstack.ai/docs/guides/cli-api.md) +- [Server deployment](https://dstack.ai/docs/guides/server-deployment.md) + +**Accelerator-specific examples:** +- [AMD](https://dstack.ai/examples/accelerators/amd/index.md) +- [Google TPU](https://dstack.ai/examples/accelerators/tpu/index.md) +- 
[Tenstorrent](https://dstack.ai/examples/accelerators/tenstorrent/index.md) + +**Full documentation:** https://dstack.ai/llms-full.txt diff --git a/src/dstack/__init__.py b/src/dstack/__init__.py index e69de29bb2..a90955ea78 100644 --- a/src/dstack/__init__.py +++ b/src/dstack/__init__.py @@ -0,0 +1,4 @@ +import sys + +if sys.version_info >= (3, 14): + raise ImportError("dstack does not support Python 3.14 or later. Please use Python 3.10–3.13.") diff --git a/src/dstack/_internal/cli/commands/__init__.py b/src/dstack/_internal/cli/commands/__init__.py index 8279b08d5d..b1a56b92ed 100644 --- a/src/dstack/_internal/cli/commands/__init__.py +++ b/src/dstack/_internal/cli/commands/__init__.py @@ -1,20 +1,23 @@ import argparse import os +import shlex from abc import ABC, abstractmethod -from typing import List, Optional +from typing import ClassVar, Optional from rich_argparse import RichHelpFormatter +from dstack._internal.cli.services.completion import ProjectNameCompleter from dstack._internal.cli.utils.common import configure_logging -from dstack._internal.core.errors import CLIError, ConfigurationError +from dstack._internal.core.errors import CLIError from dstack.api import Client class BaseCommand(ABC): - NAME: str = "name the command" - DESCRIPTION: str = "describe the command" - DEFAULT_HELP: bool = True - ALIASES: Optional[List[str]] = None + NAME: ClassVar[str] = "name the command" + DESCRIPTION: ClassVar[str] = "describe the command" + DEFAULT_HELP: ClassVar[bool] = True + ALIASES: ClassVar[Optional[list[str]]] = None + ACCEPT_EXTRA_ARGS: ClassVar[bool] = False def __init__(self, parser: argparse.ArgumentParser): self._parser = parser @@ -50,11 +53,19 @@ def _register(self): @abstractmethod def _command(self, args: argparse.Namespace): - pass + self._configure_logging() + if not self.ACCEPT_EXTRA_ARGS and args.extra_args: + raise CLIError(f"Unrecognized arguments: {shlex.join(args.extra_args)}") + + def 
_configure_logging(self) -> None: + """ + Override this method to configure command-specific logging + """ + configure_logging() class APIBaseCommand(BaseCommand): - api: Client = None + api: Client def _register(self): self._parser.add_argument( @@ -62,11 +73,8 @@ def _register(self): help="The name of the project. Defaults to [code]$DSTACK_PROJECT[/]", metavar="NAME", default=os.getenv("DSTACK_PROJECT"), - ) + ).completer = ProjectNameCompleter() # type: ignore[attr-defined] def _command(self, args: argparse.Namespace): - configure_logging() - try: - self.api = Client.from_config(project_name=args.project) - except ConfigurationError as e: - raise CLIError(str(e)) + super()._command(args) + self.api = Client.from_config(project_name=args.project) diff --git a/src/dstack/_internal/cli/commands/apply.py b/src/dstack/_internal/cli/commands/apply.py index 58bf395ff1..c10f54f33e 100644 --- a/src/dstack/_internal/cli/commands/apply.py +++ b/src/dstack/_internal/cli/commands/apply.py @@ -1,28 +1,55 @@ import argparse -from pathlib import Path +import shlex + +from argcomplete import FilesCompleter # type: ignore[attr-defined] from dstack._internal.cli.commands import APIBaseCommand from dstack._internal.cli.services.configurators import ( + APPLY_STDIN_NAME, get_apply_configurator_class, load_apply_configuration, ) -from dstack._internal.cli.utils.common import cli_error -from dstack._internal.core.errors import ConfigurationError +from dstack._internal.cli.utils.common import console +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.configurations import ApplyConfigurationType + +NOTSET = object() class ApplyCommand(APIBaseCommand): NAME = "apply" - DESCRIPTION = "Apply dstack configuration" + DESCRIPTION = "Apply a configuration" + DEFAULT_HELP = False + ACCEPT_EXTRA_ARGS = True def _register(self): super()._register() + self._parser.add_argument( + "-h", + "--help", + nargs="?", + type=ApplyConfigurationType, + default=NOTSET, + 
help="Show this help message and exit.", + dest="help", + metavar="TYPE", + ) self._parser.add_argument( "-f", "--file", - type=Path, metavar="FILE", - help="The path to the configuration file. Defaults to [code]$PWD/.dstack.yml[/]", + help=( + "The path to the configuration file." + " Specify [code]-[/] to read configuration from stdin." + " Defaults to [code]$PWD/.dstack.yml[/]" + ), dest="configuration_file", + ).completer = FilesCompleter(allowednames=["*.yml", "*.yaml"]) # type: ignore[attr-defined] + self._parser.add_argument( + "-y", + "--yes", + help="Do not ask for confirmation", + action="store_true", ) self._parser.add_argument( "--force", @@ -30,18 +57,50 @@ def _register(self): action="store_true", ) self._parser.add_argument( - "-y", - "--yes", - help="Do not ask for confirmation", + "-d", + "--detach", + help="Exit immediately after submitting configuration", + action="store_true", + ) + self._parser.add_argument( + "-v", + "--verbose", + help="Show all plan properties including those with default values", action="store_true", ) def _command(self, args: argparse.Namespace): - super()._command(args) try: - configuration = load_apply_configuration(args.configuration_file) - except ConfigurationError as e: - raise cli_error(e) - configurator_class = get_apply_configurator_class(configuration.type) - configurator = configurator_class(api_client=self.api) - configurator.apply_configuration(conf=configuration, args=args) + if args.help is not NOTSET: + if args.help is not None: + configurator_class = get_apply_configurator_class( + ApplyConfigurationType(args.help) + ) + configurator_class.register_args(self._parser) + self._parser.print_help() + return + self._parser.print_help() + console.print( + "\nType `dstack apply -h CONFIGURATION_TYPE` to see configuration-specific options.\n" + ) + return + + super()._command(args) + if not args.yes and args.configuration_file == APPLY_STDIN_NAME: + raise CLIError("Cannot read configuration from stdin if -y/--yes 
is not specified") + configuration_path, configuration = load_apply_configuration(args.configuration_file) + configurator_class = get_apply_configurator_class(configuration.type) + configurator = configurator_class(api_client=self.api) + configurator_parser = configurator.get_parser() + configurator_args, unknown_args = configurator_parser.parse_known_args(args.extra_args) + if unknown_args: + raise CLIError(f"Unrecognized arguments: {shlex.join(unknown_args)}") + configurator.apply_configuration( + conf=configuration, + configuration_path=configuration_path, + command_args=args, + configurator_args=configurator_args, + ) + except KeyboardInterrupt: + console.print("\nOperation interrupted by user. Exiting...") + exit(0) diff --git a/src/dstack/_internal/cli/commands/attach.py b/src/dstack/_internal/cli/commands/attach.py new file mode 100644 index 0000000000..a22d63d37c --- /dev/null +++ b/src/dstack/_internal/cli/commands/attach.py @@ -0,0 +1,197 @@ +import argparse +import sys +import time +from pathlib import Path +from typing import Optional + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.args import port_mapping +from dstack._internal.cli.services.completion import RunNameCompleter +from dstack._internal.cli.services.configurators.run import ( + get_run_exit_code, + print_finished_message, +) +from dstack._internal.cli.utils.common import console, get_start_time +from dstack._internal.cli.utils.rich import MultiItemStatus +from dstack._internal.cli.utils.run import get_runs_table +from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.core.services.ssh.ports import PortUsedError +from dstack._internal.utils.common import get_or_error +from dstack.api._public.runs import Run + + +class AttachCommand(APIBaseCommand): + NAME = "attach" + DESCRIPTION = "Attach to the run" + + 
def _register(self): + super()._register() + self._parser.add_argument( + "--ssh-identity", + metavar="SSH_PRIVATE_KEY", + help="The private SSH key path for SSH tunneling", + type=Path, + dest="ssh_identity_file", + ) + self._parser.add_argument( + "--logs", + action="store_true", + help="Print run logs as they follow", + ) + self._parser.add_argument( + "--host", + help="Local address to bind. Defaults to [code]localhost[/].", + metavar="HOST", + ) + self._parser.add_argument( + "-p", + "--port", + type=port_mapping, + action="append", + help="Port mapping overrides", + dest="ports", + metavar="MAPPING", + ) + self._parser.add_argument( + "--replica", + help="The replica number. Defaults to any running replica.", + type=int, + ) + self._parser.add_argument( + "--job", + help="The job number inside the replica. Defaults to 0.", + type=int, + default=0, + ) + self._parser.add_argument( + "--since", + help=( + "Show only logs newer than the specified date." + " Can be a duration (e.g. 10s, 5m, 1d) or an RFC 3339 string (e.g. 2023-09-24T15:30:00Z)." 
+ ), + type=str, + ) + self._parser.add_argument("run_name").completer = RunNameCompleter() # type: ignore[attr-defined] + + def _command(self, args: argparse.Namespace): + super()._command(args) + run = self.api.runs.get(args.run_name) + if run is None: + raise CLIError(f"Run {args.run_name} not found") + + # Show live progress while waiting for the run to be ready + if _is_provisioning(run): + with MultiItemStatus(f"Attaching to [code]{run.name}[/]...", console=console) as live: + while _is_provisioning(run): + live.update(get_runs_table([run])) + time.sleep(5) + run.refresh() + console.print(get_runs_table([run], verbose=run.status == RunStatus.FAILED)) + console.print( + f"\nProvisioning [code]{run.name}[/] completed [secondary]({run.status.value})[/]" + ) + + if run.status.is_finished() and run.status != RunStatus.DONE: + raise CLIError(f"Run {args.run_name} is {run.status.value}") + + exit_code = 0 + try: + try: + attached = run.attach( + ssh_identity_file=args.ssh_identity_file, + bind_address=args.host, + ports_overrides=args.ports, + replica_num=args.replica, + job_num=args.job, + ) + except PortUsedError as e: + console.print( + f"[error]Failed to attach: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack attach[/code] to override the local" + f" port mapping, e.g. 
[code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) + if not attached: + raise CLIError(f"Failed to attach to run {args.run_name}") + _print_attached_message( + run=run, + bind_address=args.host, + replica_num=args.replica, + job_num=args.job, + ) + if args.logs: + start_time = get_start_time(args.since) + logs = run.logs( + start_time=start_time, + replica_num=args.replica, + job_num=args.job, + ) + for log in logs: + sys.stdout.buffer.write(log) + sys.stdout.buffer.flush() + _print_finished_message_when_available(run) + exit_code = get_run_exit_code(run) + else: + while True: + time.sleep(10) + except KeyboardInterrupt: + console.print("\nDetached") + finally: + run.detach() + # TODO: Handle run resubmissions similar to dstack apply + exit(exit_code) + + +def _print_finished_message_when_available(run: Run) -> None: + # After reading the logs, the run may not be marked as finished immediately. + # Give the run some time to transition to a finished state before exiting. + for _ in range(30): + run.refresh() + if run.status.is_finished(): + print_finished_message(run) + break + time.sleep(1) + else: + console.print( + "[error]Lost run connection. Timed out waiting for run final status." + " Check `dstack ps` to see if it's done or failed." 
+ ) + + +_IGNORED_PORTS = [DSTACK_RUNNER_HTTP_PORT] + + +def _print_attached_message( + run: Run, + bind_address: Optional[str], + replica_num: Optional[int], + job_num: int, +): + if bind_address is None: + bind_address = "localhost" + + job = get_or_error(run._find_job(replica_num=replica_num, job_num=job_num)) + replica_num = job.job_spec.replica_num + output = f"Attached to run [code]{run.name}[/] (replica={replica_num} job={job_num})\n" + name = run.name + if replica_num != 0 or job_num != 0: + name = job.job_spec.job_name + ports = get_or_error(run.ports) + ports = {k: v for k, v in ports.items() if k not in _IGNORED_PORTS} + if len(ports) > 0: + output += "Forwarded ports (local -> remote):\n" + for remote_port, local_port in ports.items(): + output += f" - {bind_address}:{local_port} -> {remote_port}\n" + output += f"To connect to the run via SSH, use `ssh {name}`.\n" + output += "Press Ctrl+C to detach..." + console.print(output) + + +def _is_provisioning(run: Run) -> bool: + return run.status in ( + RunStatus.SUBMITTED, + RunStatus.PENDING, + RunStatus.PROVISIONING, + ) diff --git a/src/dstack/_internal/cli/commands/completion.py b/src/dstack/_internal/cli/commands/completion.py new file mode 100644 index 0000000000..3bcdfbcfec --- /dev/null +++ b/src/dstack/_internal/cli/commands/completion.py @@ -0,0 +1,22 @@ +import argparse + +import argcomplete + +from dstack._internal.cli.commands import BaseCommand + + +class CompletionCommand(BaseCommand): + NAME = "completion" + DESCRIPTION = "Generate shell completion scripts" + + def _register(self): + super()._register() + self._parser.add_argument( + "shell", + help="The shell to generate the completion script for", + choices=["bash", "zsh"], + ) + + def _command(self, args: argparse.Namespace): + super()._command(args) + print(argcomplete.shellcode(["dstack"], shell=args.shell)) # type: ignore[attr-defined] diff --git a/src/dstack/_internal/cli/commands/config.py b/src/dstack/_internal/cli/commands/config.py 
deleted file mode 100644 index fffe5fd1f4..0000000000 --- a/src/dstack/_internal/cli/commands/config.py +++ /dev/null @@ -1,87 +0,0 @@ -import argparse - -from requests import HTTPError - -import dstack.api.server -from dstack._internal.cli.commands import BaseCommand -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.core.errors import CLIError -from dstack._internal.core.services.configs import ConfigManager -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class ConfigCommand(BaseCommand): - NAME = "config" - DESCRIPTION = "Configure projects" - - def _register(self): - super()._register() - self._parser.add_argument( - "--project", type=str, help="The name of the project to configure" - ) - self._parser.add_argument("--url", type=str, help="Server url") - self._parser.add_argument("--token", type=str, help="User token") - self._parser.add_argument( - "--default", - action="store_true", - help="Set the project as default. It will be used when --project is omitted in commands.", - default=False, - ) - self._parser.add_argument( - "--remove", action="store_true", help="Delete project configuration" - ) - self._parser.add_argument( - "--no-default", - help="Do not prompt to set the project as default", - action="store_true", - ) - - def _command(self, args: argparse.Namespace): - config_manager = ConfigManager() - if args.remove: - config_manager.delete_project(args.project) - config_manager.save() - console.print("[grey58]OK[/]") - return - - if not args.url: - console.print("Specify --url") - exit(1) - elif not args.token: - console.print("Specify --token") - exit(1) - api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token) - try: - api_client.projects.get(args.project) - except HTTPError as e: - if e.response.status_code == 403: - raise CLIError("Forbidden. 
Ensure the token is valid.") - elif e.response.status_code == 404: - raise CLIError(f"Project '{args.project}' not found.") - else: - raise e - default_project = config_manager.get_project_config() - if ( - default_project is None - or default_project.name != args.project - or default_project.url != args.url - or default_project.token != args.token - ): - set_it_as_default = ( - ( - args.default - or not default_project - or confirm_ask(f"Set '{args.project}' as your default project?") - ) - if not args.no_default - else False - ) - config_manager.configure_project( - name=args.project, url=args.url, token=args.token, default=set_it_as_default - ) - config_manager.save() - logger.info( - f"Configuration updated at {config_manager.config_filepath}", {"show_path": False} - ) diff --git a/src/dstack/_internal/cli/commands/delete.py b/src/dstack/_internal/cli/commands/delete.py index a68c97c119..6d95c0dae6 100644 --- a/src/dstack/_internal/cli/commands/delete.py +++ b/src/dstack/_internal/cli/commands/delete.py @@ -1,18 +1,18 @@ import argparse from pathlib import Path +from argcomplete import FilesCompleter # type: ignore[attr-defined] + from dstack._internal.cli.commands import APIBaseCommand from dstack._internal.cli.services.configurators import ( get_apply_configurator_class, load_apply_configuration, ) -from dstack._internal.cli.utils.common import cli_error -from dstack._internal.core.errors import ConfigurationError class DeleteCommand(APIBaseCommand): NAME = "delete" - DESCRIPTION = "Delete resources defined by dstack configuration" + DESCRIPTION = "Delete resources" ALIASES = ["destroy"] def _register(self): @@ -24,7 +24,7 @@ def _register(self): metavar="FILE", help="The path to the configuration file. 
Defaults to [code]$PWD/.dstack.yml[/]", dest="configuration_file", - ) + ).completer = FilesCompleter(allowednames=["*.yml", "*.yaml"]) # type: ignore[attr-defined] self._parser.add_argument( "-y", "--yes", @@ -34,10 +34,11 @@ def _register(self): def _command(self, args: argparse.Namespace): super()._command(args) - try: - configuration = load_apply_configuration(args.configuration_file) - except ConfigurationError as e: - raise cli_error(e) + configuration_path, configuration = load_apply_configuration(args.configuration_file) configurator_class = get_apply_configurator_class(configuration.type) configurator = configurator_class(api_client=self.api) - configurator.delete_configuration(conf=configuration, args=args) + configurator.delete_configuration( + conf=configuration, + configuration_path=configuration_path, + command_args=args, + ) diff --git a/src/dstack/_internal/cli/commands/event.py b/src/dstack/_internal/cli/commands/event.py new file mode 100644 index 0000000000..1b8a094ae8 --- /dev/null +++ b/src/dstack/_internal/cli/commands/event.py @@ -0,0 +1,193 @@ +import argparse +from dataclasses import asdict + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.events import ( + EventListFilters, + EventPaginator, + EventTracker, + print_event, +) +from dstack._internal.cli.utils.common import ( + get_start_time, +) +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.events import EventTargetType +from dstack._internal.server.schemas.events import LIST_EVENTS_DEFAULT_LIMIT +from dstack.api import Client + + +class EventCommand(APIBaseCommand): + NAME = "event" + DESCRIPTION = "View events" + + def _register(self): + super()._register() + self._parser.set_defaults(subfunc=self._list) + subparsers = self._parser.add_subparsers(dest="action") + + list_parser = subparsers.add_parser( + "list", + help="List events within the 
selected project", + formatter_class=self._parser.formatter_class, + ) + list_parser.set_defaults(subfunc=self._list) + + for parser in [self._parser, list_parser]: + parser.add_argument( + "-w", + "--watch", + help="Watch events in realtime", + action="store_true", + ) + parser.add_argument( + "--since", + help=( + "Only show events newer than the specified date." + " Can be a duration (e.g. 10s, 5m, 1d) or an RFC 3339 string (e.g. 2023-09-24T15:30:00Z)." + f" If not specified, show the last {LIST_EVENTS_DEFAULT_LIMIT} events." + ), + type=str, + ) + target_filters_group = parser.add_mutually_exclusive_group() + target_filters_group.add_argument( + "--target-fleet", + action="append", + metavar="NAME", + dest="target_fleets", + type=EntityReference.parse, + help="Only show events that target the specified fleets", + ) + target_filters_group.add_argument( + "--target-run", + action="append", + metavar="NAME", + dest="target_runs", + help="Only show events that target the specified runs", + ) + target_filters_group.add_argument( + "--target-volume", + action="append", + metavar="NAME", + dest="target_volumes", + help="Only show events that target the specified volumes", + ) + target_filters_group.add_argument( + "--target-gateway", + action="append", + metavar="NAME", + dest="target_gateways", + help="Only show events that target the specified gateways", + ) + target_filters_group.add_argument( + "--target-secret", + action="append", + metavar="NAME", + dest="target_secrets", + help="Only show events that target the specified secrets", + ) + within_filters_group = parser.add_mutually_exclusive_group() + within_filters_group.add_argument( + "--within-fleet", + action="append", + metavar="NAME", + dest="within_fleets", + type=EntityReference.parse, + help="Only show events that target the specified fleets or instances within those fleets", + ) + within_filters_group.add_argument( + "--within-run", + action="append", + metavar="NAME", + dest="within_runs", + help="Only 
show events that target the specified runs or jobs within those runs", + ) + parser.add_argument( + "--include-target-type", + action="append", + metavar="TYPE", + type=EventTargetType, + dest="include_target_types", + help="Only show events that target entities of the specified types", + ) + + def _command(self, args: argparse.Namespace): + super()._command(args) + args.subfunc(args) + + def _list(self, args: argparse.Namespace): + since = get_start_time(args.since) + filters = _build_filters(args, self.api) + + if args.watch: + events = EventTracker( + client=self.api.client.events, filters=filters, since=since + ).stream_forever() + elif since is not None: + events = EventPaginator(self.api.client.events).list( + filters=filters, since=since, ascending=True + ) + else: + events = reversed(self.api.client.events.list(ascending=False, **asdict(filters))) + try: + for event in events: + print_event(current_project=self.api.project, event=event) + except KeyboardInterrupt: + pass + + +def _build_filters(args: argparse.Namespace, api: Client) -> EventListFilters: + filters = EventListFilters() + + has_target_filters = True + if args.target_fleets: + filters.target_fleets = [ + api.client.fleets.get(ref.project or api.project, ref.name).id + for ref in args.target_fleets + ] + elif args.target_runs: + filters.target_runs = [ + api.client.runs.get(api.project, name).id for name in args.target_runs + ] + elif args.target_volumes: + filters.target_volumes = [ + api.client.volumes.get(project_name=api.project, name=name).id + for name in args.target_volumes + ] + elif args.target_gateways: + filters.target_gateways = [] + for name in args.target_gateways: + id = api.client.gateways.get(api.project, name).id + if id is None: + # TODO(0.21): Remove this check once `Gateway.id` is required. + raise CLIError( + "Cannot determine gateway ID, most likely due to an outdated dstack server." + " Update the server to 0.20.7 or higher or remove --target-gateway." 
+ ) + filters.target_gateways.append(id) + elif args.target_secrets: + filters.target_secrets = [ + api.client.secrets.get(api.project, name=name).id for name in args.target_secrets + ] + else: + has_target_filters = False + + if args.within_fleets: + filters.within_fleets = [ + api.client.fleets.get(ref.project or api.project, ref.name).id + for ref in args.within_fleets + ] + elif args.within_runs: + filters.within_runs = [ + api.client.runs.get(api.project, name).id for name in args.within_runs + ] + elif not has_target_filters: + # default - limit to current project, + # unless there are more specific filters (e.g., for imported entities) + filters.within_projects = [api.client.projects.get(api.project).project_id] + + if args.include_target_types: + filters.include_target_types = args.include_target_types + + return filters diff --git a/src/dstack/_internal/cli/commands/export.py b/src/dstack/_internal/cli/commands/export.py new file mode 100644 index 0000000000..b21b58cfe0 --- /dev/null +++ b/src/dstack/_internal/cli/commands/export.py @@ -0,0 +1,219 @@ +import argparse + +from rich.table import Table + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import ExportNameCompleter +from dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console +from dstack._internal.core.models.exports import Export + + +class ExportCommand(APIBaseCommand): + NAME = "export" + DESCRIPTION = "Manage exports" + + def _register(self): + super()._register() + self._parser.set_defaults(subfunc=self._list) + subparsers = self._parser.add_subparsers(dest="action") + + list_parser = subparsers.add_parser( + "list", help="List exports", formatter_class=self._parser.formatter_class + ) + list_parser.set_defaults(subfunc=self._list) + + create_parser = subparsers.add_parser( + "create", help="Create an export", formatter_class=self._parser.formatter_class + ) + create_parser.add_argument( + "name", + help="The 
name of the export", + ) + create_parser.add_argument( + "--importer", + action="append", + dest="importers", + help="Importer project name (can be specified multiple times)", + default=[], + ) + create_parser.add_argument( + "--fleet", + action="append", + dest="fleets", + help="Fleet name to export (can be specified multiple times)", + default=[], + ) + create_parser.add_argument( + "--gateway", + action="append", + dest="gateways", + help="Gateway name to export (can be specified multiple times)", + default=[], + ) + create_parser.add_argument( + "--global", + dest="is_global", + action="store_true", + help="Make this export global (automatically imported into all projects)", + default=False, + ) + create_parser.set_defaults(subfunc=self._create) + + update_parser = subparsers.add_parser( + "update", help="Update an export", formatter_class=self._parser.formatter_class + ) + update_parser.add_argument( + "name", + help="The name of the export", + ).completer = ExportNameCompleter() # type: ignore[attr-defined] + update_parser.add_argument( + "--add-importer", + action="append", + dest="add_importers", + help="Importer project name to add (can be specified multiple times)", + default=[], + ) + update_parser.add_argument( + "--remove-importer", + action="append", + dest="remove_importers", + help="Importer project name to remove (can be specified multiple times)", + default=[], + ) + update_parser.add_argument( + "--add-fleet", + action="append", + dest="add_fleets", + help="Fleet name to add (can be specified multiple times)", + default=[], + ) + update_parser.add_argument( + "--remove-fleet", + action="append", + dest="remove_fleets", + help="Fleet name to remove (can be specified multiple times)", + default=[], + ) + update_parser.add_argument( + "--add-gateway", + action="append", + dest="add_gateways", + help="Gateway name to add (can be specified multiple times)", + default=[], + ) + update_parser.add_argument( + "--remove-gateway", + action="append", + 
dest="remove_gateways", + help="Gateway name to remove (can be specified multiple times)", + default=[], + ) + global_group = update_parser.add_mutually_exclusive_group() + global_group.add_argument( + "--set-global", + dest="set_global", + action="store_true", + help="Make this export global (automatically imported into all projects)", + default=False, + ) + global_group.add_argument( + "--unset-global", + dest="unset_global", + action="store_true", + help="Remove the global flag from this export", + default=False, + ) + update_parser.set_defaults(subfunc=self._update) + + delete_parser = subparsers.add_parser( + "delete", help="Delete an export", formatter_class=self._parser.formatter_class + ) + delete_parser.add_argument( + "name", + help="The name of the export", + ).completer = ExportNameCompleter() # type: ignore[attr-defined] + delete_parser.add_argument( + "-y", "--yes", help="Don't ask for confirmation", action="store_true" + ) + delete_parser.set_defaults(subfunc=self._delete) + + def _command(self, args: argparse.Namespace): + super()._command(args) + args.subfunc(args) + + def _list(self, args: argparse.Namespace): + exports = self.api.client.exports.list(self.api.project) + print_exports_table(exports) + + def _create(self, args: argparse.Namespace): + with console.status("Creating export..."): + export = self.api.client.exports.create( + project_name=self.api.project, + name=args.name, + is_global=args.is_global, + importer_projects=args.importers, + exported_fleets=args.fleets, + exported_gateways=args.gateways, + ) + print_exports_table([export]) + + def _update(self, args: argparse.Namespace): + with console.status("Updating export..."): + export = self.api.client.exports.update( + project_name=self.api.project, + name=args.name, + set_global=args.set_global, + unset_global=args.unset_global, + add_importer_projects=args.add_importers, + remove_importer_projects=args.remove_importers, + add_exported_fleets=args.add_fleets, + 
remove_exported_fleets=args.remove_fleets, + add_exported_gateways=args.add_gateways, + remove_exported_gateways=args.remove_gateways, + ) + print_exports_table([export]) + + def _delete(self, args: argparse.Namespace): + if not args.yes and not confirm_ask(f"Delete the export [code]{args.name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting export..."): + self.api.client.exports.delete(project_name=self.api.project, name=args.name) + + console.print(f"Export [code]{args.name}[/] deleted") + + +def print_exports_table(exports: list[Export]): + table = Table(box=None) + table.add_column("NAME", no_wrap=True) + table.add_column("FLEETS") + table.add_column("GATEWAYS") + table.add_column("IMPORTERS") + + for export in exports: + fleets = ( + ", ".join([f.name for f in export.exported_fleets]) if export.exported_fleets else "-" + ) + gateways = ( + ", ".join([g.name for g in export.exported_gateways]) + if export.exported_gateways + else "-" + ) + if export.is_global: + importers = "*" + else: + importers = ( + ", ".join([i.project_name for i in export.imports]) if export.imports else "-" + ) + + row = { + "NAME": export.name, + "FLEETS": fleets, + "GATEWAYS": gateways, + "IMPORTERS": importers, + } + add_row_from_dict(table, row) + + console.print(table) + console.print() diff --git a/src/dstack/_internal/cli/commands/fleet.py b/src/dstack/_internal/cli/commands/fleet.py new file mode 100644 index 0000000000..c0b4c0e715 --- /dev/null +++ b/src/dstack/_internal/cli/commands/fleet.py @@ -0,0 +1,179 @@ +import argparse +import time +from uuid import UUID + +from rich.live import Live + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import FleetNameCompleter +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + LIVE_TABLE_REFRESH_RATE_PER_SEC, + confirm_ask, + console, +) +from dstack._internal.cli.utils.fleet import get_fleets_table, 
print_fleets_table +from dstack._internal.core.errors import CLIError, ResourceNotExistsError +from dstack._internal.core.models.common import EntityReference +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent + + +class FleetCommand(APIBaseCommand): + NAME = "fleet" + DESCRIPTION = "Manage fleets" + + def _register(self): + super()._register() + self._parser.set_defaults(subfunc=self._list) + subparsers = self._parser.add_subparsers(dest="action") + + list_parser = subparsers.add_parser( + "list", help="List fleets", formatter_class=self._parser.formatter_class + ) + list_parser.set_defaults(subfunc=self._list) + + for parser in [self._parser, list_parser]: + parser.add_argument( + "-w", + "--watch", + help="Update listing in realtime", + action="store_true", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Show more information" + ) + + delete_parser = subparsers.add_parser( + "delete", + help="Delete fleets and instances", + formatter_class=self._parser.formatter_class, + ) + delete_parser.add_argument( + "name", + type=EntityReference.parse, + help="The name of the fleet", + ).completer = FleetNameCompleter() # type: ignore[attr-defined] + delete_parser.add_argument( + "-i", + "--instance", + action="append", + metavar="INSTANCE_NUM", + dest="instances", + help="The instances to delete", + type=int, + ) + delete_parser.add_argument( + "-y", "--yes", help="Don't ask for confirmation", action="store_true" + ) + delete_parser.set_defaults(subfunc=self._delete) + + get_parser = subparsers.add_parser( + "get", help="Get a fleet", formatter_class=self._parser.formatter_class + ) + name_group = get_parser.add_mutually_exclusive_group(required=True) + name_group.add_argument( + "name", + nargs="?", + metavar="NAME", + type=EntityReference.parse, + help="The name of the fleet", + ).completer = FleetNameCompleter() # type: ignore[attr-defined] + name_group.add_argument( + "--id", + type=str, + help="The ID of the 
fleet (UUID)", + ) + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + + def _command(self, args: argparse.Namespace): + super()._command(args) + args.subfunc(args) + + def _list(self, args: argparse.Namespace): + fleets = self.api.client.fleets.list(self.api.project, include_imported=True) + if not args.watch: + print_fleets_table(fleets, current_project=self.api.project, verbose=args.verbose) + return + + try: + with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: + while True: + live.update( + get_fleets_table( + fleets, current_project=self.api.project, verbose=args.verbose + ) + ) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + fleets = self.api.client.fleets.list(self.api.project, include_imported=True) + except KeyboardInterrupt: + pass + + def _delete(self, args: argparse.Namespace): + if args.name.project is not None: + console.print( + "The [code]/[/] format is not supported for fleet names." + " Can only delete fleets or instances owned by the current project" + ) + exit(1) + name = args.name.name + + try: + self.api.client.fleets.get(project_name=self.api.project, name=name) + except ResourceNotExistsError: + console.print(f"Fleet [code]{name}[/] does not exist") + exit(1) + + if not args.instances: + if not args.yes and not confirm_ask(f"Delete the fleet [code]{name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting fleet..."): + self.api.client.fleets.delete(project_name=self.api.project, names=[name]) + + console.print(f"Fleet [code]{name}[/] deleted") + return + + if not args.yes and not confirm_ask( + f"Delete the fleet [code]{name}[/] instances [code]{args.instances}[/]?" 
+ ): + console.print("\nExiting...") + return + + with console.status("Deleting fleet instances..."): + self.api.client.fleets.delete_instances( + project_name=self.api.project, name=name, instance_nums=args.instances + ) + + console.print(f"Fleet [code]{name}[/] instances deleted") + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + fleet_id = None + if args.id is not None: + try: + fleet_id = UUID(args.id) + except ValueError: + raise CLIError(f"Invalid UUID format: {args.id}") + + try: + if args.id is not None: + fleet = self.api.client.fleets.get( + project_name=self.api.project, fleet_id=fleet_id + ) + else: + fleet = self.api.client.fleets.get( + project_name=args.name.project or self.api.project, + name=args.name.name, + ) + except ResourceNotExistsError: + console.print(f"Fleet [code]{args.name or args.id}[/] not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(fleet.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/gateway.py b/src/dstack/_internal/cli/commands/gateway.py index 3783f378ec..c226b3334b 100644 --- a/src/dstack/_internal/cli/commands/gateway.py +++ b/src/dstack/_internal/cli/commands/gateway.py @@ -1,9 +1,29 @@ import argparse +import time + +from rich.live import Live from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.cli.utils.gateway import print_gateways_table -from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.cli.services.completion import GatewayNameCompleter +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + LIVE_TABLE_REFRESH_RATE_PER_SEC, + confirm_ask, + console, +) +from dstack._internal.cli.utils.gateway import ( + get_gateway_relative_to_project, + get_gateways_table, + print_gateways_json, + print_gateways_table, +) +from dstack._internal.core.errors import CLIError +from 
dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.gateways import GatewayStatus +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) class GatewayCommand(APIBaseCommand): @@ -18,32 +38,39 @@ def _register(self): list_parser = subparsers.add_parser( "list", help="List gateways", formatter_class=self._parser.formatter_class ) - list_parser.add_argument( - "-v", "--verbose", action="store_true", help="Show more information" - ) list_parser.set_defaults(subfunc=self._list) - create_parser = subparsers.add_parser( - "create", help="Add a gateway", formatter_class=self._parser.formatter_class - ) - create_parser.set_defaults(subfunc=self._create) - create_parser.add_argument( - "--backend", choices=["aws", "azure", "gcp", "kubernetes"], required=True - ) - create_parser.add_argument("--region", required=True) - create_parser.add_argument( - "--set-default", action="store_true", help="Set as default gateway for the project" - ) - create_parser.add_argument("--name", help="Set a custom name for the gateway") - create_parser.add_argument( - "--domain", help="Set the domain for the gateway", required=True - ) + for parser in [self._parser, list_parser]: + parser.add_argument( + "-w", + "--watch", + help="Update listing in realtime", + action="store_true", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Show more information" + ) + parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format (default: plain)", + ) + parser.add_argument( + "--json", + action="store_const", + const="json", + dest="format", + help="Output in JSON format (equivalent to --format json)", + ) delete_parser = subparsers.add_parser( "delete", help="Delete a gateway", formatter_class=self._parser.formatter_class ) delete_parser.set_defaults(subfunc=self._delete) - 
delete_parser.add_argument("name", help="The name of the gateway") + delete_parser.add_argument( + "name", type=EntityReference.parse, help="The name of the gateway" + ).completer = GatewayNameCompleter() # type: ignore[attr-defined] delete_parser.add_argument( "-y", "--yes", action="store_true", help="Don't ask for confirmation" ) @@ -52,53 +79,127 @@ def _register(self): "update", help="Update a gateway", formatter_class=self._parser.formatter_class ) update_parser.set_defaults(subfunc=self._update) - update_parser.add_argument("name", help="The name of the gateway") + update_parser.add_argument( + "name", type=EntityReference.parse, help="The name of the gateway" + ).completer = GatewayNameCompleter() # type: ignore[attr-defined] update_parser.add_argument( "--set-default", action="store_true", help="Set it the default gateway for the project" ) update_parser.add_argument("--domain", help="Set the domain for the gateway") + get_parser = subparsers.add_parser( + "get", help="Get a gateway", formatter_class=self._parser.formatter_class + ) + get_parser.add_argument( + "name", metavar="NAME", type=EntityReference.parse, help="The name of the gateway" + ).completer = GatewayNameCompleter() # type: ignore[attr-defined] + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + def _command(self, args: argparse.Namespace): super()._command(args) # TODO handle errors args.subfunc(args) def _list(self, args: argparse.Namespace): - gateways = self.api.client.gateways.list(self.api.project) - print_gateways_table(gateways, verbose=getattr(args, "verbose", False)) + if args.watch and args.format == "json": + raise CLIError("JSON output is not supported together with --watch") - def _create(self, args: argparse.Namespace): - with console.status("Creating gateway..."): - gateway = self.api.client.gateways.create( - self.api.project, args.name, BackendType(args.backend), 
args.region + gateways = self.api.client.gateways.list(self.api.project, include_imported=True) + deprecated_router_gateways = [ + g.name + for g in gateways + if g.status != GatewayStatus.FAILED and g.configuration.router is not None + ] + if deprecated_router_gateways and args.format != "json": + logger.warning( + "Specifying `router` in gateway configurations is deprecated" + " and will be disallowed in a future release." + " Please migrate to replica-based routers:" + " https://dstack.ai/docs/concepts/services/#pd-disaggregation" + " (affected gateways: %s)", + ", ".join(deprecated_router_gateways), ) - if args.set_default: - self.api.client.gateways.set_default(self.api.project, gateway.name) - if args.domain: - self.api.client.gateways.set_wildcard_domain( - self.api.project, gateway.name, args.domain + if not args.watch: + if args.format == "json": + print_gateways_json(gateways, project=self.api.project) + else: + print_gateways_table( + gateways, current_project=self.api.project, verbose=args.verbose ) - gateway = self.api.client.gateways.get(self.api.project, gateway.name) - print_gateways_table([gateway]) + return + + try: + with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: + while True: + live.update( + get_gateways_table( + gateways, current_project=self.api.project, verbose=args.verbose + ) + ) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + gateways = self.api.client.gateways.list( + self.api.project, include_imported=True + ) + except KeyboardInterrupt: + pass def _delete(self, args: argparse.Namespace): - gateway = self.api.client.gateways.get(self.api.project, args.name) - print_gateways_table([gateway]) + if args.name.project is not None: + console.print( + "The [code]/[/] format is not supported for gateway names." 
+ " Can only delete gateways owned by the current project" + ) + exit(1) + name = args.name.name + gateway = self.api.client.gateways.get(self.api.project, name) + print_gateways_table([gateway], current_project=self.api.project) if args.yes or confirm_ask("Do you want to delete the gateway?"): with console.status("Deleting gateway..."): - self.api.client.gateways.delete(self.api.project, [args.name]) + self.api.client.gateways.delete(self.api.project, [name]) console.print("Gateway deleted") else: - console.print("Canceled") + console.print("Exiting...") return def _update(self, args: argparse.Namespace): with console.status("Updating gateway..."): - if args.set_default: - self.api.client.gateways.set_default(self.api.project, args.name) if args.domain: + if args.name.project is not None: + console.print( + "The [code]/[/] format is not supported for gateway names" + " when [code]--domain[/] is passed." + " Can only update gateways owned by the current project" + ) + exit(1) self.api.client.gateways.set_wildcard_domain( - self.api.project, args.name, args.domain + self.api.project, args.name.name, args.domain + ) + if args.set_default: + self.api.client.gateways.set_default( + self.api.project, + gateway_name=args.name.name, + gateway_project=args.name.project, ) - gateway = self.api.client.gateways.get(self.api.project, args.name) - print_gateways_table([gateway]) + gateway = get_gateway_relative_to_project( + client=self.api.client.gateways, + project=self.api.project, + gateway_project=args.name.project or self.api.project, + gateway_name=args.name.name, + ) + print_gateways_table([gateway], current_project=self.api.project) + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + gateway = get_gateway_relative_to_project( + client=self.api.client.gateways, + project=self.api.project, + gateway_project=args.name.project or self.api.project, + gateway_name=args.name.name, + ) + 
print(pydantic_orjson_dumps_with_indent(gateway.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/import_.py b/src/dstack/_internal/cli/commands/import_.py new file mode 100644 index 0000000000..155a42a789 --- /dev/null +++ b/src/dstack/_internal/cli/commands/import_.py @@ -0,0 +1,94 @@ +import argparse + +from rich.table import Table + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import ImportNameCompleter +from dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console +from dstack._internal.core.models.imports import Import + + +class ImportCommand(APIBaseCommand): + NAME = "import" + DESCRIPTION = "Manage imports" + + def _register(self): + super()._register() + self._parser.set_defaults(subfunc=self._list) + subparsers = self._parser.add_subparsers(dest="action") + + list_parser = subparsers.add_parser( + "list", help="List imports", formatter_class=self._parser.formatter_class + ) + list_parser.set_defaults(subfunc=self._list) + + delete_parser = subparsers.add_parser( + "delete", help="Delete an import", formatter_class=self._parser.formatter_class + ) + delete_parser.add_argument( + "name", + help="The import to delete, in `export-project/export-name` format", + ).completer = ImportNameCompleter() # type: ignore[attr-defined] + delete_parser.add_argument( + "-y", "--yes", help="Don't ask for confirmation", action="store_true" + ) + delete_parser.set_defaults(subfunc=self._delete) + + def _command(self, args: argparse.Namespace): + super()._command(args) + args.subfunc(args) + + def _list(self, args: argparse.Namespace): + imports = self.api.client.imports.list(self.api.project) + print_imports_table(imports) + + def _delete(self, args: argparse.Namespace): + parts = args.name.split("/") + if len(parts) != 2 or not parts[0] or not parts[1]: + self._parser.error( + f"Invalid format: {args.name!r}. 
Expected /" + ) + export_project_name, export_name = parts + + if not args.yes and not confirm_ask(f"Delete the import [code]{args.name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting import..."): + self.api.client.imports.delete( + project_name=self.api.project, + export_project_name=export_project_name, + export_name=export_name, + ) + + console.print(f"Import [code]{args.name}[/] deleted") + + +def print_imports_table(imports: list[Import]): + table = Table(box=None) + table.add_column("NAME", no_wrap=True) + table.add_column("FLEETS") + table.add_column("GATEWAYS") + + for imp in imports: + name = f"{imp.export.project_name}/{imp.export.name}" + fleets = ( + ", ".join([f.name for f in imp.export.exported_fleets]) + if imp.export.exported_fleets + else "-" + ) + gateways = ( + ", ".join([g.name for g in imp.export.exported_gateways]) + if imp.export.exported_gateways + else "-" + ) + + row = { + "NAME": name, + "FLEETS": fleets, + "GATEWAYS": gateways, + } + add_row_from_dict(table, row) + + console.print(table) + console.print() diff --git a/src/dstack/_internal/cli/commands/init.py b/src/dstack/_internal/cli/commands/init.py index e5f55f87d7..64c5a240ae 100644 --- a/src/dstack/_internal/cli/commands/init.py +++ b/src/dstack/_internal/cli/commands/init.py @@ -1,12 +1,18 @@ import argparse import os from pathlib import Path +from typing import Optional from dstack._internal.cli.commands import BaseCommand -from dstack._internal.cli.utils.common import cli_error, configure_logging, console -from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.models.repos.base import RepoType -from dstack._internal.core.services.configs import ConfigManager +from dstack._internal.cli.services.repos import ( + get_repo_from_dir, + get_repo_from_url, + is_git_repo_url, + register_init_repo_args, +) +from dstack._internal.cli.utils.common import console +from dstack._internal.core.errors import CLIError, 
RepoInvalidCredentialsError +from dstack._internal.core.services.repos import get_repo_creds_and_default_branch from dstack.api import Client @@ -21,53 +27,49 @@ def _register(self): default=os.getenv("DSTACK_PROJECT"), ) self._parser.add_argument( - "-t", - "--token", - metavar="OAUTH_TOKEN", - help="An authentication token for Git", - type=str, - dest="gh_token", - ) - self._parser.add_argument( - "--git-identity", - metavar="SSH_PRIVATE_KEY", - help="The private SSH key path to access the remote repo", - type=str, - dest="git_identity_file", - ) - self._parser.add_argument( - "--ssh-identity", - metavar="SSH_PRIVATE_KEY", - help="The private SSH key path for SSH tunneling", - type=Path, - dest="ssh_identity_file", - ) - self._parser.add_argument( - "--local", - action="store_true", - help="Do not use git", + "-P", + "--repo", + help=( + "The repo to initialize. Can be a local path or a Git repo URL." + " Defaults to the current working directory." + ), + dest="repo", ) + register_init_repo_args(self._parser) def _command(self, args: argparse.Namespace): - configure_logging() + super()._command(args) + + repo_path: Optional[Path] = None + repo_url: Optional[str] = None + repo_arg: Optional[str] = args.repo + if repo_arg is not None: + if is_git_repo_url(repo_arg): + repo_url = repo_arg + else: + repo_path = Path(repo_arg).expanduser().resolve() + else: + repo_path = Path.cwd() + + if repo_url is not None: + repo = get_repo_from_url(repo_url) + elif repo_path is not None: + repo = get_repo_from_dir(repo_path) + else: + assert False, "should not reach here" + try: - api = Client.from_config( - project_name=args.project, ssh_identity_file=args.ssh_identity_file - ) - repo = api.repos.load( - Path.cwd(), - local=args.local, - init=True, - git_identity_file=args.git_identity_file, + repo_creds, _ = get_repo_creds_and_default_branch( + repo_url=repo.repo_url, + identity_file=args.git_identity_file, oauth_token=args.gh_token, ) - if args.ssh_identity_file: - 
ConfigManager().save_repo_config( - repo.repo_dir, - repo.repo_id, - RepoType(repo.run_repo_data.repo_type), - args.ssh_identity_file, - ) - except ConfigurationError as e: - raise cli_error(e) + except RepoInvalidCredentialsError: + raise CLIError( + "No valid default Git credentials found. Pass valid `--token` or `--git-identity`." + ) + + api = Client.from_config(project_name=args.project) + api.repos.init(repo=repo, creds=repo_creds) + console.print("OK") diff --git a/src/dstack/_internal/cli/commands/login.py b/src/dstack/_internal/cli/commands/login.py new file mode 100644 index 0000000000..8431a76c61 --- /dev/null +++ b/src/dstack/_internal/cli/commands/login.py @@ -0,0 +1,352 @@ +import argparse +import queue +import sys +import threading +import urllib.parse +import webbrowser +from http.server import BaseHTTPRequestHandler, HTTPServer +from typing import Any, Optional + +import questionary +from rich.prompt import Prompt as RichPrompt +from rich.text import Text + +from dstack._internal.cli.commands import BaseCommand +from dstack._internal.cli.commands.project import select_default_project +from dstack._internal.cli.utils.common import console, resolve_url +from dstack._internal.core.errors import ClientError, CLIError +from dstack._internal.core.models.users import UserWithCreds +from dstack._internal.utils.logging import get_logger +from dstack.api._public.runs import ConfigManager +from dstack.api.server import APIClient + +logger = get_logger(__name__) + +is_project_menu_supported = sys.stdin.isatty() + + +class UrlPrompt(RichPrompt): + def render_default(self, default: Any) -> Text: + return Text(f"({default})", style="bold orange1") + + +class LoginCommand(BaseCommand): + NAME = "login" + DESCRIPTION = "Authorize the CLI using Single Sign-On" + + def _register(self): + super()._register() + self._parser.add_argument( + "--url", + help="The server URL, e.g. 
https://sky.dstack.ai", + required=not is_project_menu_supported, + ) + self._parser.add_argument( + "-p", + "--provider", + help=( + "The SSO provider name." + " Selected automatically if the server supports only one provider." + ), + ) + self._parser.add_argument( + "-y", + "--yes", + help="Don't ask for confirmation (e.g. set first project as default)", + action="store_true", + ) + self._parser.add_argument( + "-n", + "--no", + help="Don't ask for confirmation (e.g. do not change default project)", + action="store_true", + ) + + def _command(self, args: argparse.Namespace): + super()._command(args) + url = args.url + if url is None: + url = self._prompt_url() + base_url = _normalize_url_or_error(url) + api_client = APIClient(base_url=base_url) + provider = self._select_provider_or_error(api_client=api_client, provider=args.provider) + server = _LoginServer(api_client=api_client, provider=provider) + try: + server.start() + auth_resp = api_client.auth.authorize(provider=provider, local_port=server.port) + opened = webbrowser.open(auth_resp.authorization_url) + if opened: + console.print( + f"Your browser has been opened to log in with [code]{provider.title()}[/]:\n" + ) + else: + console.print(f"Open the URL to log in with [code]{provider.title()}[/]:\n") + print(f"{auth_resp.authorization_url}\n") + user = server.get_logged_in_user() + finally: + server.shutdown() + if user is None: + raise CLIError("CLI authentication failed") + console.print(f"Logged in as [code]{user.username}[/]") + api_client = APIClient(base_url=base_url, token=user.creds.token) + self._configure_projects(api_client=api_client, user=user, args=args) + + def _select_provider_or_error(self, api_client: APIClient, provider: Optional[str]) -> str: + providers = api_client.auth.list_providers() + available_providers = [p.name for p in providers if p.enabled] + if len(available_providers) == 0: + raise CLIError("No SSO providers configured on the server.") + if 
provider is None: + if len(available_providers) > 1: + if is_project_menu_supported: + return self._prompt_provider(available_providers) + raise CLIError( + "Specify -p/--provider to choose SSO provider" + f" Available providers: {', '.join(available_providers)}" + ) + return available_providers[0] + if provider not in available_providers: + raise CLIError( + f"Provider {provider} not configured on the server." + f" Available providers: {', '.join(available_providers)}" + ) + return provider + + def _prompt_url(self) -> str: + try: + url = UrlPrompt.ask( + "Enter the server URL", + default="https://sky.dstack.ai", + console=console, + ) + except KeyboardInterrupt: + console.print("\nCancelled by user") + raise SystemExit(1) + if url is None: + raise CLIError("URL is required") + return url + + def _prompt_provider(self, available_providers: list[str]) -> str: + choices = [ + questionary.Choice(title=provider, value=provider) for provider in available_providers + ] + selected_provider = questionary.select( + message="Select SSO provider:", + choices=choices, + qmark="", + instruction="(↑↓ Enter)", + ).ask() + if selected_provider is None: + raise SystemExit(1) + return selected_provider + + def _configure_projects( + self, api_client: APIClient, user: UserWithCreds, args: argparse.Namespace + ): + projects = api_client.projects.list(include_not_joined=False) + if len(projects) == 0: + console.print( + "No projects configured." + " Create your own project via the UI or contact a project manager to add you to the project." 
+ ) + return + config_manager = ConfigManager() + default_project = config_manager.get_project_config() + for project in projects: + config_manager.configure_project( + name=project.project_name, + url=api_client.base_url, + token=user.creds.token, + default=False, + ) + config_manager.save() + project_names = ", ".join(f"[code]{p.project_name}[/]" for p in projects) + console.print( + f"Added {project_names} project{'' if len(projects) == 1 else 's'} at {config_manager.config_filepath}" + ) + + project_configs = config_manager.list_project_configs() + + if args.no: + return + + if args.yes: + if len(projects) > 0: + first_project_from_server = projects[0] + first_project_config = next( + ( + pc + for pc in project_configs + if pc.name == first_project_from_server.project_name + ), + None, + ) + if first_project_config is not None: + config_manager.configure_project( + name=first_project_config.name, + url=first_project_config.url, + token=first_project_config.token, + default=True, + ) + config_manager.save() + console.print( + f"Set [code]{first_project_config.name}[/] project as default at {config_manager.config_filepath}" + ) + return + + if len(project_configs) == 1 or not is_project_menu_supported: + selected_project = None + if len(project_configs) == 1: + selected_project = project_configs[0] + else: + for i, project in enumerate(projects): + set_as_default = ( + default_project is None + and i == 0 + or default_project is not None + and default_project.name == project.project_name + ) + if set_as_default: + selected_project = next( + (pc for pc in project_configs if pc.name == project.project_name), + None, + ) + break + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print( + f"Set [code]{selected_project.name}[/] project as default at {config_manager.config_filepath}" + ) + else: + 
console.print() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + + +class _BadRequestError(Exception): + pass + + +class _LoginServer: + def __init__(self, api_client: APIClient, provider: str): + self._api_client = api_client + self._provider = provider + self._result_queue: queue.Queue[Optional[UserWithCreds]] = queue.Queue() + # Using built-in HTTP server to avoid extra deps. + callback_handler = self._make_callback_handler( + result_queue=self._result_queue, + api_client=api_client, + provider=provider, + ) + self._server = self._create_server(handler=callback_handler) + + def start(self): + self._thread = threading.Thread(target=self._server.serve_forever) + self._thread.start() + + def shutdown(self): + self._server.shutdown() + + def get_logged_in_user(self) -> Optional[UserWithCreds]: + return self._result_queue.get() + + @property + def port(self) -> int: + return self._server.server_port + + def _make_callback_handler( + self, + result_queue: queue.Queue[Optional[UserWithCreds]], + api_client: APIClient, + provider: str, + ) -> type[BaseHTTPRequestHandler]: + class _CallbackHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed_path = urllib.parse.urlparse(self.path) + if parsed_path.path != "/auth/callback": + self.send_response(404) + self.end_headers() + return + try: + self._handle_auth_callback(parsed_path) + except _BadRequestError as e: + self.send_error(400, e.args[0]) + result_queue.put(None) + + def log_message(self, format: str, *args): + # Do not log server requests. 
+ pass + + def _handle_auth_callback(self, parsed_path: urllib.parse.ParseResult): + try: + params = urllib.parse.parse_qs(parsed_path.query, strict_parsing=True) + except ValueError: + raise _BadRequestError("Bad query params") + code = params.get("code", [None])[0] + state = params.get("state", [None])[0] + if code is None or state is None: + raise _BadRequestError("Missing required params") + try: + user = api_client.auth.callback(provider=provider, code=code, state=state) + except ClientError: + raise _BadRequestError("Authentication failed") + self._send_success_html() + result_queue.put(user) + + def _send_success_html(self): + body = _SUCCESS_HTML.encode() + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + return _CallbackHandler + + def _create_server(self, handler: type[BaseHTTPRequestHandler]) -> HTTPServer: + server_address = ("127.0.0.1", 0) + server = HTTPServer(server_address, handler) + return server + + +def _normalize_url_or_error(url: str) -> str: + try: + # Validate the URL and determine the URL scheme. + # Need to resolve the scheme before making first POST request + # since for some redirect codes (301), clients change POST to GET. + url = resolve_url(url) + except ValueError as e: + raise CLIError(e.args[0]) + return url + + +_SUCCESS_HTML = """\ + + + + + CLI authenticated + + + +

    dstack CLI authenticated

    +

    You may close this page.

    + + +""" diff --git a/src/dstack/_internal/cli/commands/logs.py b/src/dstack/_internal/cli/commands/logs.py index 1a7d5aa501..78cde52f49 100644 --- a/src/dstack/_internal/cli/commands/logs.py +++ b/src/dstack/_internal/cli/commands/logs.py @@ -1,9 +1,13 @@ import argparse import sys -from pathlib import Path from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import RunNameCompleter +from dstack._internal.cli.utils.common import get_start_time from dstack._internal.core.errors import CLIError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) class LogsCommand(APIBaseCommand): @@ -12,23 +16,12 @@ class LogsCommand(APIBaseCommand): def _register(self): super()._register() - self._parser.add_argument("-d", "--diagnose", action="store_true") - self._parser.add_argument( - "-a", - "--attach", - action="store_true", - help="Set up an SSH tunnel, and print logs as they follow.", - ) self._parser.add_argument( - "--ssh-identity", - metavar="SSH_PRIVATE_KEY", - help="The private SSH key path for SSH tunneling", - type=Path, - dest="ssh_identity_file", + "-d", "--diagnose", action="store_true", help="Show run diagnostic logs" ) self._parser.add_argument( "--replica", - help="The relica number. Defaults to 0.", + help="The replica number. Defaults to 0.", type=int, default=0, ) @@ -38,19 +31,25 @@ def _register(self): type=int, default=0, ) - self._parser.add_argument("run_name") + self._parser.add_argument( + "--since", + help=( + "Show only logs newer than the specified date." + " Can be a duration (e.g. 10s, 5m, 1d) or an RFC 3339 string (e.g. 2023-09-24T15:30:00Z)." 
+ ), + type=str, + ) + self._parser.add_argument("run_name").completer = RunNameCompleter(all=True) # type: ignore[attr-defined] def _command(self, args: argparse.Namespace): super()._command(args) run = self.api.runs.get(args.run_name) if run is None: raise CLIError(f"Run {args.run_name} not found") - if not args.diagnose and args.attach: - if run.status.is_finished(): - raise CLIError(f"Run {args.run_name} is finished") - else: - run.attach(args.ssh_identity_file) + + start_time = get_start_time(args.since) logs = run.logs( + start_time=start_time, diagnose=args.diagnose, replica_num=args.replica, job_num=args.job, diff --git a/src/dstack/_internal/cli/commands/metrics.py b/src/dstack/_internal/cli/commands/metrics.py new file mode 100644 index 0000000000..16092748c8 --- /dev/null +++ b/src/dstack/_internal/cli/commands/metrics.py @@ -0,0 +1,153 @@ +import argparse +import time +from typing import Any, List, Optional + +from rich.live import Live +from rich.table import Table + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import RunNameCompleter +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + LIVE_TABLE_REFRESH_RATE_PER_SEC, + add_row_from_dict, + console, +) +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.instances import Resources +from dstack._internal.core.models.metrics import JobMetrics +from dstack.api._public import Client +from dstack.api._public.runs import Run + + +class MetricsCommand(APIBaseCommand): + NAME = "metrics" + DESCRIPTION = "Show run metrics" + + def _register(self): + super()._register() + self._parser.add_argument("run_name").completer = RunNameCompleter() # type: ignore[attr-defined] + self._parser.add_argument( + "-w", + "--watch", + help="Watch run metrics in realtime", + action="store_true", + ) + + def _command(self, args: argparse.Namespace): + super()._command(args) + run = 
self.api.runs.get(run_name=args.run_name) + if run is None: + raise CLIError(f"Run {args.run_name} not found") + metrics = _get_run_jobs_metrics(api=self.api, run=run) + + if not args.watch: + console.print(_get_metrics_table(run, metrics)) + return + + try: + with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: + while True: + live.update(_get_metrics_table(run, metrics)) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + run = self.api.runs.get(run_name=args.run_name) + if run is None: + raise CLIError(f"Run {args.run_name} not found") + metrics = _get_run_jobs_metrics(api=self.api, run=run) + except KeyboardInterrupt: + pass + + +def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]: + metrics = [] + for job in run._run.jobs: + job_metrics = api.client.metrics.get_job_metrics( + project_name=api.project, + run_name=run.name, + replica_num=job.job_spec.replica_num, + job_num=job.job_spec.job_num, + ) + metrics.append(job_metrics) + return metrics + + +def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table: + table = Table(box=None) + table.add_column("NAME", style="bold", no_wrap=True) + table.add_column("STATUS") + table.add_column("CPU") + table.add_column("MEMORY") + table.add_column("GPU") + + run_row = {"NAME": run.name, "STATUS": run.status.value} + if len(run._run.jobs) != 1: + add_row_from_dict(table, run_row) + + for job, job_metrics in zip(run._run.jobs, metrics): + jrd = job.job_submissions[-1].job_runtime_data + jpd = job.job_submissions[-1].job_provisioning_data + resources: Optional[Resources] = None + if jrd is not None and jrd.offer is not None: + resources = jrd.offer.instance.resources + elif jpd is not None: + resources = jpd.instance_type.resources + cpu_usage = _get_metric_value(job_metrics, "cpu_usage_percent") + if cpu_usage is not None: + if resources is not None: + cpu_usage = cpu_usage / resources.cpus + cpu_usage = f"{cpu_usage:.0f}%" + memory_usage = 
_get_metric_value(job_metrics, "memory_working_set_bytes") + if memory_usage is not None: + memory_usage = _format_memory(memory_usage, 2) + if resources is not None: + memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}" + gpu_metrics = "" + gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num") + if gpus_detected_num is not None: + for i in range(gpus_detected_num): + gpu_memory_usage = _get_metric_value(job_metrics, f"gpu_memory_usage_bytes_gpu{i}") + gpu_util_percent = _get_metric_value(job_metrics, f"gpu_util_percent_gpu{i}") + if gpu_memory_usage is not None: + if i != 0: + gpu_metrics += "\n" + gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}" + if resources is not None: + gpu_metrics += ( + f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}" + ) + gpu_metrics += f" util={gpu_util_percent}%" + + job_row = { + "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}", + "STATUS": job.job_submissions[-1].status.value, + "CPU": cpu_usage or "-", + "MEMORY": memory_usage or "-", + "GPU": gpu_metrics or "-", + } + if len(run._run.jobs) == 1: + job_row.update(run_row) + add_row_from_dict(table, job_row) + + return table + + +def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]: + for metric in job_metrics.metrics: + if metric.name == name: + return metric.values[-1] + return None + + +def _format_memory(memory_bytes: int, decimal_places: int) -> str: + """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples.""" + memory_mb = memory_bytes / 1024 / 1024 + if memory_mb >= 1024: + value = memory_mb / 1024 + unit = "GB" + else: + value = memory_mb + unit = "MB" + + if decimal_places == 0: + return f"{round(value)}{unit}" + return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit diff --git a/src/dstack/_internal/cli/commands/offer.py b/src/dstack/_internal/cli/commands/offer.py new file mode 100644 index 
0000000000..92157ad9b1 --- /dev/null +++ b/src/dstack/_internal/cli/commands/offer.py @@ -0,0 +1,153 @@ +import argparse +from pathlib import Path +from typing import List, Literal, cast + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.configurators.run import ( + BaseRunConfigurator, +) +from dstack._internal.cli.services.profile import register_profile_args +from dstack._internal.cli.services.resources import register_resources_args +from dstack._internal.cli.utils.common import console +from dstack._internal.cli.utils.gpu import print_gpu_json, print_gpu_table +from dstack._internal.cli.utils.run import print_offers_json, print_run_plan +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.configurations import ApplyConfigurationType, TaskConfiguration +from dstack._internal.core.models.gpus import GpuGroup +from dstack._internal.core.models.runs import RunSpec +from dstack.api.utils import load_profile + + +class OfferConfigurator(BaseRunConfigurator): + TYPE = ApplyConfigurationType.TASK + + @classmethod + def register_args(cls, parser: argparse.ArgumentParser): + configuration_group = parser.add_argument_group(f"{cls.TYPE.value} Options") + parser.add_argument( + "--group-by", + action="append", + help=( + "Group results by fields ([code]gpu[/code], [code]backend[/code], [code]region[/code], [code]count[/code]). " + "Optional, but if used, must include [code]gpu[/code]. " + "The use of [code]region[/code] also requires [code]backend[/code]. " + "Can be repeated or comma-separated (e.g. [code]--group-by gpu,backend[/code])." + ), + ) + configuration_group.add_argument( + "-n", + "--name", + dest="run_name", + help="The name of the run. 
If not specified, a random name is assigned", + ) + configuration_group.add_argument( + "--max-offers", + help="Number of offers to show in the run plan", + type=int, + default=50, + ) + cls.register_env_args(configuration_group) + register_resources_args(configuration_group) + register_profile_args(parser) + + +class OfferCommand(APIBaseCommand): + NAME = "offer" + DESCRIPTION = "List offers" + + def _register(self): + super()._register() + self._parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format (default: plain)", + ) + self._parser.add_argument( + "--json", + action="store_const", + const="json", + dest="format", + help="Output in JSON format (equivalent to --format json)", + ) + OfferConfigurator.register_args(self._parser) + + def _command(self, args: argparse.Namespace): + super()._command(args) + # Set image and user so that the server (a) does not default gpu.vendor + # to nvidia — `dstack offer` should show all vendors, and (b) does not + # attempt to pull image config from the Docker registry. 
+ conf = TaskConfiguration(commands=[":"], image="scratch", user="root") + + configurator = OfferConfigurator(api_client=self.api) + configurator.apply_args(conf, args) + profile = load_profile(Path.cwd(), profile_name=args.profile) + + run_spec = RunSpec( + configuration=conf, + profile=profile, + ) + + if args.group_by: + args.group_by = self._process_group_by_args(args.group_by) + + if args.group_by and "gpu" not in args.group_by: + group_values = ", ".join(args.group_by) + raise CLIError(f"Cannot group by '{group_values}' without also grouping by 'gpu'") + + if args.format == "plain": + with console.status("Getting offers..."): + if args.group_by: + gpus = self._list_gpus(args, run_spec) + print_gpu_table(gpus, run_spec, args.group_by, self.api.project) + else: + run_plan = self.api.client.runs.get_plan( + self.api.project, + run_spec, + max_offers=args.max_offers, + ) + print_run_plan( + run_plan, + include_run_properties=False, + show_offer_fleet_hint=run_spec.merged_profile.fleets is None, + ) + else: + if args.group_by: + gpus = self._list_gpus(args, run_spec) + print_gpu_json( + gpus, + run_spec, + cast(List[Literal["gpu", "backend", "region", "count"]], args.group_by), + self.api.project, + ) + else: + run_plan = self.api.client.runs.get_plan( + self.api.project, + run_spec, + max_offers=args.max_offers, + ) + print_offers_json(run_plan, run_spec) + + def _process_group_by_args(self, group_by_args: List[str]) -> List[str]: + valid_choices = {"gpu", "backend", "region", "count"} + processed = [] + + for arg in group_by_args: + values = [v.strip() for v in arg.split(",") if v.strip()] + for value in values: + if value in valid_choices: + processed.append(value) + else: + raise CLIError( + f"Invalid group-by value: '{value}'. 
Valid choices are: {', '.join(sorted(valid_choices))}" + ) + + return processed + + def _list_gpus(self, args: argparse.Namespace, run_spec: RunSpec) -> List[GpuGroup]: + group_by = [g for g in args.group_by if g != "gpu"] or None + return self.api.client.gpus.list_gpus( + self.api.project, + run_spec, + group_by=group_by, + ) diff --git a/src/dstack/_internal/cli/commands/pool.py b/src/dstack/_internal/cli/commands/pool.py deleted file mode 100644 index f6be01a2eb..0000000000 --- a/src/dstack/_internal/cli/commands/pool.py +++ /dev/null @@ -1,573 +0,0 @@ -import argparse -import getpass -import ipaddress -import time -import urllib.parse -from pathlib import Path -from typing import Optional, Sequence, Tuple - -from rich.console import Group -from rich.live import Live -from rich.table import Table - -from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.services.args import cpu_spec, disk_spec, gpu_spec, memory_spec -from dstack._internal.cli.services.profile import ( - apply_profile_args, - register_profile_args, -) -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.core.errors import CLIError, ServerClientError -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceOfferWithAvailability, - SSHKey, -) -from dstack._internal.core.models.pools import Instance, Pool -from dstack._internal.core.models.profiles import Profile, SpotPolicy, parse_duration -from dstack._internal.core.models.resources import DEFAULT_CPU_COUNT, DEFAULT_MEMORY_SIZE -from dstack._internal.core.models.runs import InstanceStatus, Requirements, get_policy_map -from dstack._internal.utils.common import pretty_date -from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.ssh import convert_pkcs8_to_pem, generate_public_key, rsa_pkey_from_str -from dstack.api._public.resources import Resources -from dstack.api.utils import load_profile - -REFRESH_RATE_PER_SEC = 5 
-LIVE_PROVISION_INTERVAL_SECS = 10 - -logger = get_logger(__name__) - - -class PoolCommand(APIBaseCommand): - NAME = "pool" - DESCRIPTION = "Pool management" - - def _register(self) -> None: - super()._register() - self._parser.set_defaults(subfunc=self._list) - - subparsers = self._parser.add_subparsers(dest="action") - - # list pools - list_parser = subparsers.add_parser( - "list", - help="List pools", - description="List available pools", - formatter_class=self._parser.formatter_class, - ) - list_parser.add_argument("-v", "--verbose", help="Show more information") - list_parser.set_defaults(subfunc=self._list) - - # create pool - create_parser = subparsers.add_parser( - "create", help="Create pool", formatter_class=self._parser.formatter_class - ) - create_parser.add_argument( - "-n", "--name", dest="pool_name", help="The name of the pool", required=True - ) - create_parser.set_defaults(subfunc=self._create) - - # delete pool - delete_parser = subparsers.add_parser( - "delete", help="Delete pool", formatter_class=self._parser.formatter_class - ) - delete_parser.add_argument( - "-n", "--name", dest="pool_name", help="The name of the pool", required=True - ) - # TODO: support --force - delete_parser.set_defaults(subfunc=self._delete) - - # show pool instances - ps_parser = subparsers.add_parser( - "ps", - help="Show pool instances", - description="Show instances in the pool", - formatter_class=self._parser.formatter_class, - ) - ps_parser.add_argument( - "--pool", - dest="pool_name", - help="The name of the pool. 
If not set, the default pool will be used", - ) - ps_parser.add_argument( - "-w", - "--watch", - help="Watch instances in realtime", - action="store_true", - ) - ps_parser.set_defaults(subfunc=self._ps) - - # add instance - add_parser = subparsers.add_parser( - "add", help="Add instance to pool", formatter_class=self._parser.formatter_class - ) - self._parser.add_argument( - "--max-offers", - help="Number of offers to show in the run plan", - type=int, - default=3, - ) - add_parser.add_argument( - "-y", "--yes", help="Don't ask for confirmation", action="store_true" - ) - register_profile_args(add_parser, pool_add=True) - register_resource_args(add_parser) - add_parser.set_defaults(subfunc=self._add) - - # remove instance - remove_parser = subparsers.add_parser( - "rm", - help="Remove instance from the pool", - formatter_class=self._parser.formatter_class, - aliases=["remove"], - ) - remove_parser.add_argument( - "instance_name", - help="The name of the instance", - ) - remove_parser.add_argument( - "--pool", - dest="pool_name", - help="The name of the pool. 
If not set, the default pool will be used", - ) - remove_parser.add_argument( - "--force", - action="store_true", - help="The name of the instance", - ) - remove_parser.add_argument( - "-y", "--yes", help="Don't ask for confirmation", action="store_true" - ) - remove_parser.set_defaults(subfunc=self._remove) - - # pool set-default - set_default_parser = subparsers.add_parser( - "set-default", - help="Set the project's default pool", - formatter_class=self._parser.formatter_class, - ) - set_default_parser.add_argument( - "--pool", dest="pool_name", help="The name of the pool", required=True - ) - set_default_parser.set_defaults(subfunc=self._set_default) - - # add-ssh - add_ssh = subparsers.add_parser( - "add-ssh", - help="Add remote instance to pool", - formatter_class=self._parser.formatter_class, - ) - add_ssh.add_argument("destination") - add_ssh.add_argument( - "-i", - metavar="SSH_PRIVATE_KEY", - help="The private SSH key path for SSH", - type=Path, - dest="ssh_identity_file", - required=True, - ) - add_ssh.add_argument("-p", help="SSH port to connect", dest="ssh_port", type=int) - add_ssh.add_argument("-l", help="User to login", dest="login_name") - add_ssh.add_argument("--region", help="Host region", dest="region") - add_ssh.add_argument("--pool", help="Pool name", dest="pool_name") - add_ssh.add_argument("--name", dest="instance_name", help="Set the name of the instance") - add_ssh.add_argument( - "--network", - dest="network", - help="Network address for multinode setup. 
Format /", - ) - add_ssh.set_defaults(subfunc=self._add_ssh) - - def _list(self, args: argparse.Namespace) -> None: - pools = self.api.client.pool.list(self.api.project) - print_pool_table(pools, verbose=getattr(args, "verbose", False)) - - def _create(self, args: argparse.Namespace) -> None: - self.api.client.pool.create(self.api.project, args.pool_name) - console.print(f"Pool {args.pool_name!r} created") - - def _delete(self, args: argparse.Namespace) -> None: - # TODO(egor-s): ask for confirmation - with console.status("Removing pool..."): - self.api.client.pool.delete(self.api.project, args.pool_name, False) - console.print(f"Pool {args.pool_name!r} removed") - - def _remove(self, args: argparse.Namespace) -> None: - pool = self.api.client.pool.show(self.api.project, args.pool_name) - pool.instances = [i for i in pool.instances if i.name == args.instance_name] - if not pool.instances: - raise CLIError(f"Instance {args.instance_name!r} not found in pool {pool.name!r}") - - console.print(f" [bold]Pool name[/] {pool.name}\n") - print_instance_table(pool.instances) - - if not args.force and any(i.status == InstanceStatus.BUSY for i in pool.instances): - # TODO(egor-s): implement this logic in the server too - raise CLIError("Can't remove busy instance. 
Use `--force` to remove anyway") - - if not args.yes and not confirm_ask(f"Remove instance {args.instance_name!r}?"): - console.print("\nExiting...") - return - - with console.status("Removing instance..."): - self.api.client.pool.remove( - self.api.project, pool.name, args.instance_name, args.force - ) - console.print(f"Instance {args.instance_name!r} removed") - - def _set_default(self, args: argparse.Namespace) -> None: - self.api.client.pool.set_default(self.api.project, args.pool_name) - - def _ps(self, args: argparse.Namespace) -> None: - pool_name_template = " [bold]Pool name[/] {}\n" - if not args.watch: - resp = self.api.client.pool.show(self.api.project, args.pool_name) - console.print(pool_name_template.format(resp.name)) - print_instance_table(resp.instances) - return - - try: - with Live(console=console, refresh_per_second=REFRESH_RATE_PER_SEC) as live: - while True: - resp = self.api.client.pool.show(self.api.project, args.pool_name) - group = Group( - pool_name_template.format(resp.name), get_instance_table(resp.instances) - ) - live.update(group) - time.sleep(LIVE_PROVISION_INTERVAL_SECS) - except KeyboardInterrupt: - pass - - def _add(self, args: argparse.Namespace) -> None: - super()._command(args) - - resources = Resources( - cpu=args.cpu, - memory=args.memory, - gpu=args.gpu, - shm_size=args.shared_memory, - disk=args.disk, - ) - - profile = load_profile(Path.cwd(), args.profile) - apply_profile_args(args, profile, pool_add=True) - - spot = get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND) - - requirements = Requirements( - resources=resources, - max_price=profile.max_price, - spot=spot, - ) - - with console.status("Getting instances..."): - pool_offers = self.api.runs.get_offers(profile, requirements) - - profile.pool_name = pool_offers.pool_name - - print_offers_table( - profile=profile, - requirements=requirements, - instance_offers=pool_offers.instances, - offers_limit=args.max_offers, - ) - if not pool_offers.instances: - 
console.print("\nThere are no offers with these criteria. Exiting...") - return - - if not args.yes and not confirm_ask("Continue?"): - console.print("\nExiting...") - return - - try: - with console.status("Creating instance..."): - # TODO: Instance name is not passed, so --instance does not work. - # There is profile.instance_name but it makes sense for `dstack run` only. - instance = self.api.runs.create_instance(profile, requirements) - except ServerClientError as e: - raise CLIError(e.msg) - console.print() - print_instance_table([instance]) - - def _add_ssh(self, args: argparse.Namespace) -> None: - super()._command(args) - - # validate network - if args.network is not None: - try: - network = ipaddress.IPv4Interface(args.network).network - except ValueError as e: - console.print( - f"[error]Can't parse network. The address must be in the format /, example `10.0.0.0/24`. Error: {e}[/]" - ) - return - if not network.is_private: - console.print( - f"[error]The network must be private network. 
The {network} is not private[/]" - ) - return - - ssh_keys = [] - if args.ssh_identity_file: - try: - private_key = convert_pkcs8_to_pem(args.ssh_identity_file.read_text()) - try: - pub_key = args.ssh_identity_file.with_suffix(".pub").read_text() - except FileNotFoundError: - pub_key = generate_public_key(rsa_pkey_from_str(private_key)) - ssh_key = SSHKey(public=pub_key, private=private_key) - ssh_keys.append(ssh_key) - except OSError: - console.print("[error]Unable to read the public key.[/]") - return - - login, ssh_host, port = parse_destination(args.destination) - - ssh_port = 22 - if port is not None: - ssh_port = port - if args.ssh_port is not None: - ssh_port = args.ssh_port - - ssh_user = args.login_name - if ssh_user is None: - ssh_user = login - if ssh_user is None: - try: - ssh_user = getpass.getuser() - except OSError: - console.print("[error]Set the user name with the `-l` parameter.[/]") - return - - result = self.api.client.pool.add_remote( - project_name=self.api.project, - pool_name=args.pool_name, - instance_name=args.instance_name, - instance_network=args.network, - region=args.region, - host=ssh_host, - port=ssh_port, - ssh_user=ssh_user, - ssh_keys=ssh_keys, - ) - if not result: - console.print(f"[error]Failed to add remote instance {args.instance_name!r}[/]") - return - console.print( - f"Remote instance [code]{result.name!r}[/] has been added with status [secondary]{result.status.upper()}[/]" - ) - - def _command(self, args: argparse.Namespace) -> None: - super()._command(args) - # TODO handle 404 and other errors - args.subfunc(args) - - -def print_pool_table(pools: Sequence[Pool], verbose: bool) -> None: - table = Table(box=None) - table.add_column("NAME") - table.add_column("DEFAULT") - table.add_column("INSTANCES") - if verbose: - table.add_column("CREATED") - - sorted_pools = sorted(pools, key=lambda r: r.name) - for pool in sorted_pools: - default_mark = "default" if pool.default else "" - style = "success" if pool.total_instances == 
pool.available_instances else "error" - health = f"[{style}]{pool.available_instances}/{pool.total_instances}[/]" - row = [pool.name, default_mark, health] - if verbose: - row.append(pretty_date(pool.created_at)) - table.add_row(*row) - - console.print(table) - console.print() - - -def print_instance_table(instances: Sequence[Instance]) -> None: - console.print(get_instance_table(instances)) - console.print() - - -def get_instance_table(instances: Sequence[Instance]) -> Table: - table = Table(box=None) - table.add_column("INSTANCE", no_wrap=True) - table.add_column("BACKEND") - table.add_column("REGION") - table.add_column("RESOURCES") - table.add_column("SPOT") - table.add_column("PRICE") - table.add_column("STATUS") - table.add_column("CREATED") - - for instance in instances: - resources = "" - spot = "" - if instance.instance_type is not None: - resources = instance.instance_type.resources.pretty_format() - spot = "yes" if instance.instance_type.resources.spot else "no" - - status = instance.status.value - if instance.status in [InstanceStatus.IDLE, InstanceStatus.BUSY] and instance.unreachable: - status += "\n(unreachable)" - - row = [ - instance.name, - (instance.backend or "").replace("remote", "ssh"), - instance.region or "", - resources, - spot, - f"${instance.price:.4}" if instance.price is not None else "", - status, - pretty_date(instance.created), - ] - table.add_row(*row) - - return table - - -def print_offers_table( - profile: Profile, - requirements: Requirements, - instance_offers: Sequence[InstanceOfferWithAvailability], - offers_limit: int, -) -> None: - pretty_req = requirements.pretty_format(resources_only=True) - max_price = f"${requirements.max_price:g}" if requirements.max_price else "-" - termination_policy = profile.termination_policy - termination_idle_time = f"{parse_duration(profile.termination_idle_time)}s" - - if requirements.spot is None: - spot_policy = "auto" - elif requirements.spot: - spot_policy = "spot" - else: - spot_policy = 
"on-demand" - - def th(s: str) -> str: - return f"[bold]{s}[/bold]" - - props = Table(box=None, show_header=False) - props.add_column(no_wrap=True) # key - props.add_column() # value - - props.add_row(th("Pool"), profile.pool_name) - props.add_row(th("Min resources"), pretty_req) - props.add_row(th("Max price"), max_price) - props.add_row(th("Spot policy"), spot_policy) - props.add_row(th("Termination policy"), termination_policy) - props.add_row(th("Termination idle time"), termination_idle_time) - - offers_table = Table(box=None) - offers_table.add_column("#") - offers_table.add_column("BACKEND") - offers_table.add_column("REGION") - offers_table.add_column("INSTANCE") - offers_table.add_column("RESOURCES") - offers_table.add_column("SPOT") - offers_table.add_column("PRICE") - offers_table.add_column() - - print_offers = instance_offers[:offers_limit] - - for index, offer in enumerate(print_offers, start=1): - resources = offer.instance.resources - - availability = "" - if offer.availability in { - InstanceAvailability.NOT_AVAILABLE, - InstanceAvailability.NO_QUOTA, - }: - availability = offer.availability.value.replace("_", " ").title() - offers_table.add_row( - f"{index}", - offer.backend.replace("remote", "ssh"), - offer.region, - offer.instance.name, - resources.pretty_format(), - "yes" if resources.spot else "no", - f"${offer.price:g}", - availability, - style=None if index == 1 else "secondary", - ) - if len(print_offers) > offers_limit: - offers_table.add_row("", "...", style="secondary") - - console.print(props) - console.print() - if len(print_offers) > 0: - console.print(offers_table) - console.print() - - -def register_resource_args(parser: argparse.ArgumentParser) -> None: - resources_group = parser.add_argument_group("Resources") - resources_group.add_argument( - "--cpu", - help=f"Request the CPU count. 
Default: {DEFAULT_CPU_COUNT}", - dest="cpu", - metavar="SPEC", - default=DEFAULT_CPU_COUNT, - type=cpu_spec, - ) - - resources_group.add_argument( - "--memory", - help="Request the size of RAM. " - f"The format is [code]SIZE[/]:[code]MB|GB|TB[/]. Default: {DEFAULT_MEMORY_SIZE}", - dest="memory", - metavar="SIZE", - default=DEFAULT_MEMORY_SIZE, - type=memory_spec, - ) - - resources_group.add_argument( - "--shared-memory", - help="Request the size of Shared Memory. The format is [code]SIZE[/]:[code]MB|GB|TB[/].", - dest="shared_memory", - default=None, - metavar="SIZE", - ) - - resources_group.add_argument( - "--gpu", - help="Request GPU for the run. " - "The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)", - dest="gpu", - default=None, - metavar="SPEC", - type=gpu_spec, - ) - - resources_group.add_argument( - "--disk", - help="Request the size of disk for the run. Example [code]--disk 100GB..[/].", - dest="disk", - metavar="SIZE", - default=None, - type=disk_spec, - ) - - -def parse_destination(destination: str) -> Tuple[Optional[str], str, Optional[int]]: - port = None - netloc = destination - - if destination.startswith("ssh://"): - parse_result = urllib.parse.urlparse(destination) - netloc, _, netloc_port = parse_result.netloc.partition(":") - try: - port = int(netloc_port) - except ValueError: - pass - - head, sep, tail = netloc.partition("@") - if sep == "@": - login = head - host = tail - else: - login = None - host = head - return login, host, port diff --git a/src/dstack/_internal/cli/commands/project.py b/src/dstack/_internal/cli/commands/project.py new file mode 100644 index 0000000000..867bd9686d --- /dev/null +++ b/src/dstack/_internal/cli/commands/project.py @@ -0,0 +1,259 @@ +import argparse +import sys +from typing import Optional + +import questionary +from requests import HTTPError +from rich.table import Table + +import dstack.api.server +from dstack._internal.cli.commands import BaseCommand +from 
dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console +from dstack._internal.core.errors import ClientError, CLIError +from dstack._internal.core.models.config import ProjectConfig +from dstack._internal.core.services.configs import ConfigManager +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +is_project_menu_supported = sys.stdin.isatty() + + +def select_default_project( + project_configs: list[ProjectConfig], default_project: Optional[ProjectConfig] +) -> Optional[ProjectConfig]: + """Show an interactive menu to select a default project. + + This method only prompts for selection and does not update the configuration. + Use `ConfigManager.configure_project()` and `ConfigManager.save()` to persist + the selected project as default. + + Args: + project_configs: Non-empty list of available project configurations. + default_project: Currently default project, if any. + + Returns: + Selected project configuration, or None if cancelled. + + Raises: + CLIError: If `is_project_menu_supported` is False or `project_configs` is empty. 
+ """ + if not is_project_menu_supported: + raise CLIError("Interactive menu is not supported on this platform") + + if len(project_configs) == 0: + raise CLIError("No projects configured") + + menu_entries = [] + default_index = None + for i, project_config in enumerate(project_configs): + is_default = project_config.name == default_project.name if default_project else False + entry = f"{project_config.name} ({project_config.url})" + if is_default: + default_index = i + menu_entries.append((entry, i)) + + choices = [questionary.Choice(title=entry, value=index) for entry, index in menu_entries] + default_value = default_index + selected_index = questionary.select( + message="Select the default project:", + choices=choices, + default=default_value, # pyright: ignore[reportArgumentType] + qmark="", + instruction="(↑↓ Enter)", + ).ask() + + if selected_index is not None and isinstance(selected_index, int): + return project_configs[selected_index] + return None + + +class ProjectCommand(BaseCommand): + NAME = "project" + DESCRIPTION = "Manage projects configs" + + def _register(self): + super()._register() + subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute") + + # Add subcommand + add_parser = subparsers.add_parser("add", help="Add or update a project config") + add_parser.add_argument( + "--name", type=str, help="The name of the project to configure", required=True + ) + add_parser.add_argument("--url", type=str, help="Server url", required=True) + add_parser.add_argument("--token", type=str, help="User token", required=True) + add_parser.add_argument( + "-y", + "--yes", + help="Don't ask for confirmation (e.g. update the config)", + action="store_true", + ) + add_parser.add_argument( + "-n", + "--no", + help="Don't ask for confirmation (e.g. 
do not update the config)", + action="store_true", + ) + add_parser.set_defaults(subfunc=self._add) + + # Delete subcommand + delete_parser = subparsers.add_parser("delete", help="Delete a project config") + delete_parser.add_argument( + "--name", type=str, help="The name of the project to delete", required=True + ) + delete_parser.add_argument( + "-y", + "--yes", + help="Don't ask for confirmation", + action="store_true", + ) + delete_parser.set_defaults(subfunc=self._delete) + + # List subcommand + list_parser = subparsers.add_parser("list", help="List configured projects") + list_parser.set_defaults(subfunc=self._list) + for parser in [self._parser, list_parser]: + parser.add_argument( + "-v", "--verbose", action="store_true", help="Show more information" + ) + + # Set default subcommand + set_default_parser = subparsers.add_parser("set-default", help="Set default project") + set_default_parser.add_argument( + "name", + type=str, + nargs="?" if is_project_menu_supported else None, + help="The name of the project to set as default", + ) + set_default_parser.set_defaults(subfunc=self._set_default) + + def _command(self, args: argparse.Namespace): + super()._command(args) + if not hasattr(args, "subfunc"): + args.subfunc = self._project + args.subfunc(args) + + def _add(self, args: argparse.Namespace): + config_manager = ConfigManager() + api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token) + try: + api_client.projects.get(args.name) + except HTTPError as e: + if e.response is not None and e.response.status_code == 403: + raise CLIError("Forbidden. 
Ensure the token is valid.") + elif e.response is not None and e.response.status_code == 404: + raise CLIError(f"Project '{args.name}' not found.") + else: + raise e + default_project = config_manager.get_project_config() + if ( + default_project is None + or default_project.name != args.name + or default_project.url != args.url + or default_project.token != args.token + ): + set_it_as_default = ( + ( + args.yes + or not default_project + or confirm_ask(f"Set '{args.name}' as your default project?") + ) + if not args.no + else False + ) + config_manager.configure_project( + name=args.name, url=args.url, token=args.token, default=set_it_as_default + ) + config_manager.save() + logger.info( + f"Configuration updated at {config_manager.config_filepath}", {"show_path": False} + ) + + def _delete(self, args: argparse.Namespace): + config_manager = ConfigManager() + if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"): + config_manager.delete_project(args.name) + config_manager.save() + console.print("[grey58]OK[/]") + + def _list(self, args: argparse.Namespace): + config_manager = ConfigManager() + default_project = config_manager.get_project_config() + + table = Table(box=None) + table.add_column("PROJECT", style="bold", no_wrap=True) + table.add_column("URL", style="grey58") + if args.verbose: + table.add_column("USER", style="grey58") + table.add_column("DEFAULT", justify="center") + + for project_config in config_manager.list_project_configs(): + project_name = project_config.name + is_default = project_name == default_project.name if default_project else False + row = { + "PROJECT": project_name, + "URL": project_config.url, + "DEFAULT": "✓" if is_default else "", + } + + if args.verbose: + # Get username from API + try: + api_client = dstack.api.server.APIClient( + base_url=project_config.url, token=project_config.token + ) + user_info = api_client.users.get_my_user() + username = user_info.username + except ClientError: + username 
= "(invalid token)" + row["USER"] = username + + add_row_from_dict(table, row, style="bold" if is_default else None) + + console.print(table) + + def _project(self, args: argparse.Namespace): + if is_project_menu_supported and not getattr(args, "verbose", False): + config_manager = ConfigManager() + project_configs = config_manager.list_project_configs() + default_project = config_manager.get_project_config() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print("[grey58]OK[/]") + else: + self._list(args) + + def _set_default(self, args: argparse.Namespace): + if args.name: + config_manager = ConfigManager() + project_config = config_manager.get_project_config(args.name) + if project_config is None: + raise CLIError(f"Project '{args.name}' not found") + + config_manager.configure_project( + name=args.name, url=project_config.url, token=project_config.token, default=True + ) + config_manager.save() + console.print("[grey58]OK[/]") + else: + config_manager = ConfigManager() + project_configs = config_manager.list_project_configs() + default_project = config_manager.get_project_config() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print("[grey58]OK[/]") diff --git a/src/dstack/_internal/cli/commands/ps.py b/src/dstack/_internal/cli/commands/ps.py index f7f78bf4e1..e254793a61 100644 --- a/src/dstack/_internal/cli/commands/ps.py +++ b/src/dstack/_internal/cli/commands/ps.py @@ -5,10 +5,16 @@ import dstack._internal.cli.utils.run as run_utils from dstack._internal.cli.commands import 
APIBaseCommand -from dstack._internal.cli.utils.common import console +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + LIVE_TABLE_REFRESH_RATE_PER_SEC, + console, +) +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.utils.logging import get_logger -REFRESH_RATE_PER_SEC = 3 -LIVE_PROVISION_INTERVAL_SECS = 2 +logger = get_logger(__name__) class PsCommand(APIBaseCommand): @@ -35,19 +41,62 @@ def _register(self): help="Watch statuses of runs in realtime", action="store_true", ) + self._parser.add_argument( + "-n", + "--last", + help="Show only the last N runs. Implies --all", + type=int, + default=None, + ) + self._parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format (default: plain)", + ) + self._parser.add_argument( + "--json", + action="store_const", + const="json", + dest="format", + help="Output in JSON format (equivalent to --format json)", + ) def _command(self, args: argparse.Namespace): super()._command(args) - runs = self.api.runs.list(all=args.all) + if args.watch and args.format == "json": + raise CLIError("JSON output is not supported together with --watch") + + runs = self.api.runs.list(all=args.all, limit=args.last) + deprecated_router_runs = [ + run._run.run_spec.run_name + for run in runs + if not run.status.is_finished() + and isinstance(run._run.run_spec.configuration, ServiceConfiguration) + and run._run.run_spec.configuration.router is not None + and run._run.run_spec.run_name is not None + ] + if deprecated_router_runs and args.format != "json": + logger.warning( + "Specifying `router` in service configurations is deprecated" + " and will be disallowed in a future release." 
+ " Please migrate to replica-based routers:" + " https://dstack.ai/docs/concepts/services/#pd-disaggregation" + " (affected runs: %s)", + ", ".join(deprecated_router_runs), + ) if not args.watch: - console.print(run_utils.generate_runs_table(runs, verbose=args.verbose)) + if args.format == "json": + run_utils.print_runs_json(self.api.project, runs) + else: + console.print(run_utils.get_runs_table(runs, verbose=args.verbose)) return try: - with Live(console=console, refresh_per_second=REFRESH_RATE_PER_SEC) as live: + with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: while True: - live.update(run_utils.generate_runs_table(runs, verbose=args.verbose)) - time.sleep(LIVE_PROVISION_INTERVAL_SECS) - runs = self.api.runs.list(all=args.all) + live.update(run_utils.get_runs_table(runs, verbose=args.verbose)) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + runs = self.api.runs.list(all=args.all, limit=args.last) except KeyboardInterrupt: pass diff --git a/src/dstack/_internal/cli/commands/run.py b/src/dstack/_internal/cli/commands/run.py index 3ed5ceda40..337b0a75cf 100644 --- a/src/dstack/_internal/cli/commands/run.py +++ b/src/dstack/_internal/cli/commands/run.py @@ -1,288 +1,69 @@ import argparse -import sys -import time -from pathlib import Path -from typing import Optional, Tuple +from uuid import UUID from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.services.configurators.run import ( - BaseRunConfigurator, - run_configurators_mapping, -) -from dstack._internal.cli.services.profile import ( - apply_profile_args, - register_profile_args, -) -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.cli.utils.run import print_run_plan -from dstack._internal.core.errors import CLIError, ConfigurationError, ServerClientError -from dstack._internal.core.models.configurations import RunConfigurationType -from dstack._internal.core.models.runs 
import JobSubmission, JobTerminationReason -from dstack._internal.core.services.configs import ConfigManager -from dstack._internal.utils.logging import get_logger -from dstack.api import RunStatus -from dstack.api._public.runs import Run -from dstack.api.utils import load_configuration, load_profile - -logger = get_logger(__name__) -NOTSET = object() +from dstack._internal.cli.services.completion import RunNameCompleter +from dstack._internal.cli.utils.common import console +from dstack._internal.core.errors import CLIError, ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent class RunCommand(APIBaseCommand): NAME = "run" - DESCRIPTION = "Run a configuration" - DEFAULT_HELP = False + DESCRIPTION = "Manage runs" def _register(self): super()._register() - self._parser.add_argument( - "-h", - "--help", - nargs="?", - type=RunConfigurationType, - default=NOTSET, - help="Show this help message and exit. TYPE is one of [code]task[/], [code]dev-environment[/], [code]service[/]", - dest="help", - metavar="TYPE", - ) - self._parser.add_argument("working_dir") - self._parser.add_argument( - "-f", - "--file", - type=Path, - metavar="FILE", - help="The path to the run configuration file. Defaults to [code]WORKING_DIR/.dstack.yml[/]", - dest="configuration_file", - ) - self._parser.add_argument( - "-n", - "--name", - dest="run_name", - help="The name of the run. 
If not specified, a random name is assigned", + subparsers = self._parser.add_subparsers(dest="action") + + # TODO: Add `list` subcommand and make `dstack ps` an alias to `dstack run list` + + get_parser = subparsers.add_parser( + "get", help="Get a run", formatter_class=self._parser.formatter_class ) - self._parser.add_argument( - "-d", - "--detach", - help="Do not poll logs and run status", - action="store_true", + name_group = get_parser.add_mutually_exclusive_group(required=True) + name_group.add_argument( + "name", + nargs="?", + metavar="NAME", + help="The name of the run", + ).completer = RunNameCompleter() # type: ignore[attr-defined] + name_group.add_argument( + "--id", + type=str, + help="The ID of the run (UUID)", ) - self._parser.add_argument( - "-y", - "--yes", - help="Do not ask for plan confirmation", + get_parser.add_argument( + "--json", action="store_true", + required=True, + help="Output in JSON format", ) - self._parser.add_argument( - "--max-offers", - help="Number of offers to show in the run plan", - type=int, - default=3, - ) - register_profile_args(self._parser) + get_parser.set_defaults(subfunc=self._get) def _command(self, args: argparse.Namespace): - if args.help is not NOTSET: - if args.help is not None: - run_configurators_mapping[RunConfigurationType(args.help)].register(self._parser) - else: - BaseRunConfigurator.register(self._parser) - self._parser.print_help() - return - super()._command(args) - try: - repo = self.api.repos.load(Path.cwd()) - self.api.ssh_identity_file = ( - ConfigManager().get_repo_config(repo.repo_dir).ssh_key_path - ) - profile = load_profile(Path.cwd(), args.profile) - configuration_path, conf = load_configuration( - Path.cwd(), args.working_dir, args.configuration_file - ) - apply_profile_args(args, conf) - logger.debug("Configuration loaded: %s", configuration_path) - parser = argparse.ArgumentParser() - configurator = run_configurators_mapping[RunConfigurationType(conf.type)] - configurator.register(parser) 
- known, unknown = parser.parse_known_args(args.unknown) - configurator.apply(known, unknown, conf) - - with console.status("Getting run plan..."): - run_plan = self.api.runs.get_plan( - configuration=conf, - repo=repo, - configuration_path=configuration_path, - backends=profile.backends, - regions=profile.regions, - instance_types=profile.instance_types, - spot_policy=profile.spot_policy, # pass profile piece by piece - retry_policy=profile.retry_policy, - max_duration=profile.max_duration, - max_price=profile.max_price, - working_dir=args.working_dir, - run_name=args.run_name, - pool_name=profile.pool_name, - instance_name=profile.instance_name, - creation_policy=profile.creation_policy, - termination_policy=profile.termination_policy, - termination_policy_idle=profile.termination_idle_time, - ) - except ConfigurationError as e: - raise CLIError(str(e)) - - print_run_plan(run_plan, offers_limit=args.max_offers) - if not args.yes and not confirm_ask("Continue?"): - console.print("\nExiting...") - return - - if args.run_name: - old_run = self.api.runs.get(run_name=args.run_name) - if old_run is not None: - if not args.yes and not confirm_ask( - f"Run [code]{args.run_name}[/] already exists. Override the run?" - ): - console.print("\nExiting...") - return - - try: - with console.status("Submitting run..."): - run = self.api.runs.exec_plan(run_plan, repo, reserve_ports=not args.detach) - except ServerClientError as e: - raise CLIError(e.msg) - - if args.detach: - console.print(f"Run [code]{run.name}[/] submitted, detaching...") - return - - abort_at_exit = False - try: - # We can attach to run multiple times if it goes from running to pending (retried). 
- while True: - with console.status(f"Launching [code]{run.name}[/]") as status: - while run.status in ( - RunStatus.SUBMITTED, - RunStatus.PENDING, - RunStatus.PROVISIONING, - ): - job_statuses = "\n".join( - f" - {job.job_spec.job_name} [secondary]({job.job_submissions[-1].status.value})[/]" - for job in run._run.jobs - ) - status.update( - f"Launching [code]{run.name}[/] [secondary]({run.status.value})[/]\n{job_statuses}" - ) - time.sleep(5) - run.refresh() - console.print( - f"[code]{run.name}[/] provisioning completed [secondary]({run.status.value})[/]" - ) - - current_job_submission = run._run.latest_job_submission - if run.status in (RunStatus.RUNNING, RunStatus.DONE): - if run._run.run_spec.configuration.type == RunConfigurationType.SERVICE.value: - console.print( - f"Service is published at [link={run.service_url}]{run.service_url}[/]\n" - ) - try: - if run.attach(): - for entry in run.logs(): - sys.stdout.buffer.write(entry) - sys.stdout.buffer.flush() - else: - console.print("[error]Failed to attach, exiting...[/]") - return - finally: - run.detach() + if hasattr(args, "subfunc"): + args.subfunc(args) + else: + self._parser.print_help() - # After reading the logs, the run may not be marked as finished immediately. - # Give the run some time to transit into a finished state before exiting. - reattach = False - for _ in range(30): - run.refresh() - if _run_resubmitted(run, current_job_submission): - # The run was resubmitted - reattach = True - break - if run.status.is_finished(): - _print_finished_message(run) - return - time.sleep(1) - if not reattach: - console.print( - "[error]Lost run connection. Timed out waiting for run final status." - " Check `dstack ps` to see if it's done or failed." 
- ) - return - except KeyboardInterrupt: + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + run_id = None + if args.id is not None: try: - if not confirm_ask("\nStop the run before detaching?"): - console.print("Detached") - abort_at_exit = False - return - # Gently stop the run and wait for it to finish - with console.status("Stopping..."): - run.stop(abort=False) - while not run.status.is_finished(): - time.sleep(2) - run.refresh() - console.print("Stopped") - except KeyboardInterrupt: - abort_at_exit = True - finally: - run.detach() - if abort_at_exit: - with console.status("Aborting..."): - run.stop(abort=True) - console.print("[error]Aborted[/]") - - -def _print_finished_message(run: Run): - if run.status == RunStatus.DONE: - console.print("[code]Done[/]") - return - - termination_reason, termination_reason_message = _get_run_termination_reason(run) - message = "Run failed due to unknown reason. Check CLI and server logs." - if run.status == RunStatus.TERMINATED: - message = "Run terminated due to unknown reason. Check CLI and server logs." - - if termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY: - message = ( - "All provisioning attempts failed. " - "This is likely due to cloud providers not having enough capacity. " - "Check CLI and server logs for more details." - ) - elif termination_reason == JobTerminationReason.CREATING_CONTAINER_ERROR: - message = ( - "Cannot create container.\n" - f"Error: {termination_reason_message}\n" - "Check CLI and server logs for more details." - ) - elif termination_reason == JobTerminationReason.EXECUTOR_ERROR: - message = ( - f"Error: {termination_reason_message}\nCheck CLI and server logs for more details." - ) - elif termination_reason is not None: - message = ( - f"Run failed with error code {termination_reason.name}.\n" - f"Error: {termination_reason_message}\n" - "Check CLI and server logs for more details." 
- ) - console.print(f"[error]{message}[/]") - - -def _get_run_termination_reason(run: Run) -> Tuple[Optional[JobTerminationReason], Optional[str]]: - job = run._run.jobs[0] - if len(job.job_submissions) == 0: - return None, None - job_submission = job.job_submissions[0] - return job_submission.termination_reason, job_submission.termination_reason_message + run_id = UUID(args.id) + except ValueError: + raise CLIError(f"Invalid UUID format: {args.id}") + try: + if args.id is not None: + run = self.api.client.runs.get(project_name=self.api.project, run_id=run_id) + else: + run = self.api.client.runs.get(project_name=self.api.project, run_name=args.name) + except ResourceNotExistsError: + console.print(f"Run [code]{args.name or args.id}[/] not found") + exit(1) -def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool: - if current_job_submission is None or run._run.latest_job_submission is None: - return False - return run.status == RunStatus.PENDING or ( - not run.status.is_finished() - and run._run.latest_job_submission.submitted_at > current_job_submission.submitted_at - ) + print(pydantic_orjson_dumps_with_indent(run.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/secrets.py b/src/dstack/_internal/cli/commands/secrets.py new file mode 100644 index 0000000000..64a678b2d5 --- /dev/null +++ b/src/dstack/_internal/cli/commands/secrets.py @@ -0,0 +1,92 @@ +import argparse + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import SecretNameCompleter +from dstack._internal.cli.utils.common import ( + confirm_ask, + console, +) +from dstack._internal.cli.utils.secrets import print_secrets_table + + +class SecretCommand(APIBaseCommand): + NAME = "secret" + DESCRIPTION = "Manage secrets" + + def _register(self): + super()._register() + self._parser.set_defaults(subfunc=self._list) + subparsers = self._parser.add_subparsers(dest="action") + + list_parser = 
subparsers.add_parser( + "list", help="List secrets", formatter_class=self._parser.formatter_class + ) + list_parser.set_defaults(subfunc=self._list) + + get_parser = subparsers.add_parser( + "get", help="Get secret value", formatter_class=self._parser.formatter_class + ) + get_parser.add_argument( + "name", + help="The name of the secret", + ).completer = SecretNameCompleter() # type: ignore[attr-defined] + get_parser.set_defaults(subfunc=self._get) + + set_parser = subparsers.add_parser( + "set", help="Set secret", formatter_class=self._parser.formatter_class + ) + set_parser.add_argument( + "name", + help="The name of the secret", + ) + set_parser.add_argument( + "value", + help="The value of the secret", + ) + set_parser.set_defaults(subfunc=self._set) + + delete_parser = subparsers.add_parser( + "delete", + help="Delete secrets", + formatter_class=self._parser.formatter_class, + ) + delete_parser.add_argument( + "name", + help="The name of the secret", + ).completer = SecretNameCompleter() # type: ignore[attr-defined] + delete_parser.add_argument( + "-y", "--yes", help="Don't ask for confirmation", action="store_true" + ) + delete_parser.set_defaults(subfunc=self._delete) + + def _command(self, args: argparse.Namespace): + super()._command(args) + args.subfunc(args) + + def _list(self, args: argparse.Namespace): + secrets = self.api.client.secrets.list(self.api.project) + print_secrets_table(secrets) + + def _get(self, args: argparse.Namespace): + secret = self.api.client.secrets.get(self.api.project, name=args.name) + print_secrets_table([secret]) + + def _set(self, args: argparse.Namespace): + self.api.client.secrets.create_or_update( + self.api.project, + name=args.name, + value=args.value, + ) + console.print("[grey58]OK[/]") + + def _delete(self, args: argparse.Namespace): + if not args.yes and not confirm_ask(f"Delete the secret [code]{args.name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting secret..."): + 
self.api.client.secrets.delete( + project_name=self.api.project, + names=[args.name], + ) + console.print("[grey58]OK[/]") diff --git a/src/dstack/_internal/cli/commands/server.py b/src/dstack/_internal/cli/commands/server.py index aa48cb6c7f..255f92e9a5 100644 --- a/src/dstack/_internal/cli/commands/server.py +++ b/src/dstack/_internal/cli/commands/server.py @@ -1,10 +1,17 @@ +import argparse import os -from argparse import Namespace +from pathlib import Path +from typing import Optional -import uvicorn - -from dstack import version +from dstack._internal import settings from dstack._internal.cli.commands import BaseCommand +from dstack._internal.core.errors import CLIError + +UVICORN_INSTALLED = True +try: + import uvicorn +except ImportError: + UVICORN_INSTALLED = False class ServerCommand(BaseCommand): @@ -27,7 +34,14 @@ def _register(self): help="Bind socket to this port. Defaults to 3000.", default=os.getenv("DSTACK_SERVER_PORT", 3000), ) - self._parser.add_argument( + group = self._parser.add_mutually_exclusive_group() + group.add_argument( + "-d", + "--debug", + help="Enable debug logging level (same as [code]-l debug[/code])", + action="store_true", + ) + group.add_argument( "-l", "--log-level", type=str, @@ -35,34 +49,63 @@ def _register(self): default=os.getenv("DSTACK_SERVER_LOG_LEVEL", "INFO"), ) self._parser.add_argument( - "--default", - help="Update the default project configuration", + "-y", + "--yes", + help="Don't ask for confirmation (e.g. update the config)", action="store_true", ) self._parser.add_argument( - "--no-default", - help="Do not update the default project configuration", + "-n", + "--no", + help="Don't ask for confirmation (e.g. 
do not update the config)", action="store_true", ) self._parser.add_argument("--token", type=str, help="The admin user token") - def _command(self, args: Namespace): + def _command(self, args: argparse.Namespace): super()._command(args) + if not UVICORN_INSTALLED: + raise CLIError( + "Failed to start dstack server due to missing server dependencies." + "\nInstall server dependencies with `pip install dstack[server]` or `pip install dstack[all]`." + ) + os.environ["DSTACK_SERVER_HOST"] = args.host os.environ["DSTACK_SERVER_PORT"] = str(args.port) - os.environ["DSTACK_SERVER_LOG_LEVEL"] = args.log_level - if args.default: + os.environ["DSTACK_SERVER_LOG_LEVEL"] = "DEBUG" if args.debug else args.log_level + if args.yes: os.environ["DSTACK_UPDATE_DEFAULT_PROJECT"] = "1" - if args.no_default: + if args.no: os.environ["DSTACK_DO_NOT_UPDATE_DEFAULT_PROJECT"] = "1" if args.token: os.environ["DSTACK_SERVER_ADMIN_TOKEN"] = args.token + # Hide noisy "Other threads are currently calling into gRPC, skipping fork() handlers" + # messages in server logs. Users can still change this with GRPC_VERBOSITY. + os.environ.setdefault("GRPC_VERBOSITY", "ERROR") uvicorn_log_level = os.getenv("DSTACK_SERVER_UVICORN_LOG_LEVEL", "ERROR").lower() - uvicorn.run( + reload_disabled = os.getenv("DSTACK_SERVER_RELOAD_DISABLED") is not None + + reload = settings.DSTACK_VERSION is None and not reload_disabled + reload_excludes: Optional[list[str]] = None + if reload: + # Don't reload on dstack._internal.cli package changes + for parent in Path(__file__).parents: + if parent.name == "cli": + reload_excludes = [str(parent)] + break + + uvicorn.run( # type: ignore[unbound-variable] "dstack._internal.server.main:app", host=args.host, port=args.port, - reload=version.__version__ is None, + reload=reload, + reload_excludes=reload_excludes, log_level=uvicorn_log_level, + workers=1, ) + + def _configure_logging(self) -> None: + # Server logging is configured in the FastAPI lifespan function. 
+ # No need to configure CLI logging. + pass diff --git a/src/dstack/_internal/cli/commands/stop.py b/src/dstack/_internal/cli/commands/stop.py index ad08228b30..2eaca90edc 100644 --- a/src/dstack/_internal/cli/commands/stop.py +++ b/src/dstack/_internal/cli/commands/stop.py @@ -1,6 +1,7 @@ import argparse from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import RunNameCompleter from dstack._internal.cli.utils.common import confirm_ask from dstack._internal.core.errors import CLIError @@ -13,7 +14,7 @@ def _register(self): super()._register() self._parser.add_argument("-x", "--abort", action="store_true") self._parser.add_argument("-y", "--yes", action="store_true") - self._parser.add_argument("run_name") + self._parser.add_argument("run_name").completer = RunNameCompleter() # type: ignore[attr-defined] def _command(self, args: argparse.Namespace): super()._command(args) diff --git a/src/dstack/_internal/cli/commands/volume.py b/src/dstack/_internal/cli/commands/volume.py index 36e660a171..e78ec352c6 100644 --- a/src/dstack/_internal/cli/commands/volume.py +++ b/src/dstack/_internal/cli/commands/volume.py @@ -1,7 +1,19 @@ import argparse +import time + +from rich.live import Live from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.utils.volume import print_volumes_table +from dstack._internal.cli.services.completion import VolumeNameCompleter +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + LIVE_TABLE_REFRESH_RATE_PER_SEC, + confirm_ask, + console, +) +from dstack._internal.cli.utils.volume import get_volumes_table, print_volumes_table +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent class VolumeCommand(APIBaseCommand): @@ -16,15 +28,90 @@ def _register(self): list_parser = subparsers.add_parser( "list", help="List volumes", 
formatter_class=self._parser.formatter_class ) - list_parser.add_argument( - "-v", "--verbose", action="store_true", help="Show more information" - ) list_parser.set_defaults(subfunc=self._list) + for parser in [self._parser, list_parser]: + parser.add_argument( + "-w", + "--watch", + help="Update listing in realtime", + action="store_true", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Show more information" + ) + + delete_parser = subparsers.add_parser( + "delete", + help="Delete volumes", + formatter_class=self._parser.formatter_class, + ) + delete_parser.add_argument( + "name", + help="The name of the volume", + ).completer = VolumeNameCompleter() # type: ignore[attr-defined] + delete_parser.add_argument( + "-y", "--yes", help="Don't ask for confirmation", action="store_true" + ) + delete_parser.set_defaults(subfunc=self._delete) + + get_parser = subparsers.add_parser( + "get", help="Get a volume", formatter_class=self._parser.formatter_class + ) + get_parser.add_argument( + "name", + metavar="NAME", + help="The name of the volume", + ).completer = VolumeNameCompleter() # type: ignore[attr-defined] + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + def _command(self, args: argparse.Namespace): super()._command(args) args.subfunc(args) def _list(self, args: argparse.Namespace): volumes = self.api.client.volumes.list(self.api.project) - print_volumes_table(volumes, verbose=getattr(args, "verbose", False)) + if not args.watch: + print_volumes_table(volumes, verbose=args.verbose) + return + + try: + with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: + while True: + live.update(get_volumes_table(volumes, verbose=args.verbose)) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + volumes = self.api.client.volumes.list(self.api.project) + except KeyboardInterrupt: + pass + + def _delete(self, 
args: argparse.Namespace): + try: + self.api.client.volumes.get(project_name=self.api.project, name=args.name) + except ResourceNotExistsError: + console.print(f"Volume [code]{args.name}[/] does not exist") + exit(1) + + if not args.yes and not confirm_ask(f"Delete the volume [code]{args.name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting volume..."): + self.api.client.volumes.delete(project_name=self.api.project, names=[args.name]) + + console.print(f"Volume [code]{args.name}[/] deleted") + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + try: + volume = self.api.client.volumes.get(project_name=self.api.project, name=args.name) + except ResourceNotExistsError: + console.print("Volume not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(volume.dict(), default=None)) diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index b0253e2e90..32f15a95f8 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -1,23 +1,34 @@ import argparse +import argcomplete from rich.markup import escape from rich_argparse import RichHelpFormatter from dstack._internal.cli.commands.apply import ApplyCommand -from dstack._internal.cli.commands.config import ConfigCommand +from dstack._internal.cli.commands.attach import AttachCommand +from dstack._internal.cli.commands.completion import CompletionCommand from dstack._internal.cli.commands.delete import DeleteCommand +from dstack._internal.cli.commands.event import EventCommand +from dstack._internal.cli.commands.export import ExportCommand +from dstack._internal.cli.commands.fleet import FleetCommand from dstack._internal.cli.commands.gateway import GatewayCommand +from dstack._internal.cli.commands.import_ import ImportCommand from dstack._internal.cli.commands.init import InitCommand +from dstack._internal.cli.commands.login import LoginCommand from dstack._internal.cli.commands.logs import 
LogsCommand -from dstack._internal.cli.commands.pool import PoolCommand +from dstack._internal.cli.commands.metrics import MetricsCommand +from dstack._internal.cli.commands.offer import OfferCommand +from dstack._internal.cli.commands.project import ProjectCommand from dstack._internal.cli.commands.ps import PsCommand from dstack._internal.cli.commands.run import RunCommand +from dstack._internal.cli.commands.secrets import SecretCommand from dstack._internal.cli.commands.server import ServerCommand from dstack._internal.cli.commands.stop import StopCommand from dstack._internal.cli.commands.volume import VolumeCommand from dstack._internal.cli.utils.common import _colors, console from dstack._internal.cli.utils.updates import check_for_updates -from dstack._internal.core.errors import ClientError, CLIError +from dstack._internal.core.errors import ClientError, CLIError, ConfigurationError, SSHError +from dstack._internal.core.services.ssh.client import get_ssh_client_info from dstack._internal.utils.logging import get_logger from dstack.version import __version__ as version @@ -33,8 +44,8 @@ def main(): parser = argparse.ArgumentParser( description=( - "Not sure where to start? Call [code]dstack init[/].\n" - "Define a [code].dstack.yml[/] configuration file and run it via [code]dstack run[/]\n" + "Not sure where to start?" 
+ " Define a [code].dstack.yml[/] configuration file and run it via [code]dstack apply[/]\n" ), formatter_class=RichHelpFormatter, epilog=( @@ -54,24 +65,37 @@ def main(): subparsers = parser.add_subparsers(metavar="COMMAND") ApplyCommand.register(subparsers) - ConfigCommand.register(subparsers) + AttachCommand.register(subparsers) DeleteCommand.register(subparsers) + EventCommand.register(subparsers) + ExportCommand.register(subparsers) + FleetCommand.register(subparsers) + ImportCommand.register(subparsers) GatewayCommand.register(subparsers) - PoolCommand.register(subparsers) InitCommand.register(subparsers) + OfferCommand.register(subparsers) + LoginCommand.register(subparsers) LogsCommand.register(subparsers) + MetricsCommand.register(subparsers) + ProjectCommand.register(subparsers) PsCommand.register(subparsers) RunCommand.register(subparsers) + SecretCommand.register(subparsers) ServerCommand.register(subparsers) StopCommand.register(subparsers) VolumeCommand.register(subparsers) + CompletionCommand.register(subparsers) + + argcomplete.autocomplete(parser, always_complete_options=False) args, unknown_args = parser.parse_known_args() - args.unknown = unknown_args + args.extra_args = unknown_args + try: check_for_updates() + get_ssh_client_info() args.func(args) - except (ClientError, CLIError) as e: + except (ClientError, CLIError, ConfigurationError, SSHError) as e: console.print(f"[error]{escape(str(e))}[/]") logger.debug(e, exc_info=True) exit(1) diff --git a/src/dstack/_internal/cli/models/__init__.py b/src/dstack/_internal/cli/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/cli/models/gateways.py b/src/dstack/_internal/cli/models/gateways.py new file mode 100644 index 0000000000..94dfa88982 --- /dev/null +++ b/src/dstack/_internal/cli/models/gateways.py @@ -0,0 +1,16 @@ +from typing import List + +from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model +from 
dstack._internal.core.models.gateways import Gateway +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent + + +class GatewayCommandOutputConfig(CoreConfig): + json_dumps = pydantic_orjson_dumps_with_indent + + +class GatewayCommandOutput(generate_dual_core_model(GatewayCommandOutputConfig)): + """JSON output model for `dstack gateway` command.""" + + project: str + gateways: List[Gateway] diff --git a/src/dstack/_internal/cli/models/offers.py b/src/dstack/_internal/cli/models/offers.py new file mode 100644 index 0000000000..56d5e21ea7 --- /dev/null +++ b/src/dstack/_internal/cli/models/offers.py @@ -0,0 +1,47 @@ +from typing import List, Literal, Optional + +from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model +from dstack._internal.core.models.gpus import GpuGroup +from dstack._internal.core.models.instances import InstanceOfferWithAvailability +from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent + + +class OfferRequirementsConfig(CoreConfig): + json_dumps = pydantic_orjson_dumps_with_indent + + +class OfferRequirements(generate_dual_core_model(OfferRequirementsConfig)): + """Profile/requirements output model for CLI commands.""" + + resources: ResourcesSpec + max_price: Optional[float] = None + spot: Optional[bool] = None + reservation: Optional[str] = None + + +class OfferCommandOutputConfig(CoreConfig): + json_dumps = pydantic_orjson_dumps_with_indent + + +class OfferCommandOutput(generate_dual_core_model(OfferCommandOutputConfig)): + """JSON output model for `dstack offer` command.""" + + project: str + user: str + requirements: OfferRequirements + offers: List[InstanceOfferWithAvailability] + total_offers: int + + +class OfferCommandGroupByGpuOutputConfig(CoreConfig): + json_dumps = pydantic_orjson_dumps_with_indent + + +class 
OfferCommandGroupByGpuOutput(generate_dual_core_model(OfferCommandGroupByGpuOutputConfig)): + """JSON output model for `dstack offer` command with GPU grouping.""" + + project: str + requirements: OfferRequirements + group_by: List[Literal["gpu", "backend", "region", "count"]] + gpus: List[GpuGroup] diff --git a/src/dstack/_internal/cli/models/runs.py b/src/dstack/_internal/cli/models/runs.py new file mode 100644 index 0000000000..db951b752d --- /dev/null +++ b/src/dstack/_internal/cli/models/runs.py @@ -0,0 +1,16 @@ +from typing import List + +from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model +from dstack._internal.core.models.runs import Run +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent + + +class PsCommandOutputConfig(CoreConfig): + json_dumps = pydantic_orjson_dumps_with_indent + + +class PsCommandOutput(generate_dual_core_model(PsCommandOutputConfig)): + """JSON output model for `dstack ps` command.""" + + project: str + runs: List[Run] diff --git a/src/dstack/_internal/cli/services/args.py b/src/dstack/_internal/cli/services/args.py index d0b509cb64..a189984500 100644 --- a/src/dstack/_internal/cli/services/args.py +++ b/src/dstack/_internal/cli/services/args.py @@ -1,34 +1,26 @@ -import re -from typing import Dict, Tuple, Union +from typing import Dict from pydantic import parse_obj_as from dstack._internal.core.models import resources as resources -from dstack._internal.core.models.configurations import EnvSentinel, PortMapping +from dstack._internal.core.models.configurations import PortMapping +from dstack._internal.core.models.envs import EnvVarTuple def gpu_spec(v: str) -> Dict: return resources.GPUSpec.parse(v) -def env_var(v: str) -> Tuple[str, Union[str, EnvSentinel]]: - r = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)(=.*$|$)", v) - if r is None: - raise ValueError(v) - if "=" in v: - key, value = v.split("=", 1) - else: - key = r.group(1) - value = EnvSentinel(key=key) - return key, 
value +def env_var(v: str) -> EnvVarTuple: + return EnvVarTuple.parse(v) def port_mapping(v: str) -> PortMapping: return PortMapping.parse(v) -def cpu_spec(v: str) -> resources.Range[int]: - return parse_obj_as(resources.Range[int], v) +def cpu_spec(v: str) -> dict: + return resources.CPUSpec.parse(v) def memory_spec(v: str) -> resources.Range[resources.Memory]: diff --git a/src/dstack/_internal/cli/services/completion.py b/src/dstack/_internal/cli/services/completion.py new file mode 100644 index 0000000000..4fa276945d --- /dev/null +++ b/src/dstack/_internal/cli/services/completion.py @@ -0,0 +1,104 @@ +import argparse +import os +from abc import ABC, abstractmethod +from typing import Iterable, List, Optional + +import argcomplete +from argcomplete.completers import BaseCompleter + +from dstack._internal.core.errors import ConfigurationError +from dstack._internal.core.services.configs import ConfigManager +from dstack.api import Client + + +class BaseAPINameCompleter(BaseCompleter, ABC): + """ + Base class for name completers that fetch resource names via the API. 
+ """ + + def __init__(self): + super().__init__() + + def get_api(self, parsed_args: argparse.Namespace) -> Optional[Client]: + argcomplete.debug(f"{self.__class__.__name__}: Retrieving API client") + project = getattr(parsed_args, "project", os.getenv("DSTACK_PROJECT")) + try: + return Client.from_config(project_name=project) + except ConfigurationError as e: + argcomplete.debug(f"{self.__class__.__name__}: Error initializing API client: {e}") + return None + + def __call__(self, prefix: str, parsed_args: argparse.Namespace, **kwargs) -> List[str]: + api = self.get_api(parsed_args) + if api is None: + return [] + + argcomplete.debug(f"{self.__class__.__name__}: Fetching completions") + try: + resource_names = self.fetch_resource_names(api) + return [name for name in resource_names if name.startswith(prefix)] + except Exception as e: + argcomplete.debug( + f"{self.__class__.__name__}: Error fetching resource completions: {e}" + ) + return [] + + @abstractmethod + def fetch_resource_names(self, api: Client) -> Iterable[str]: + """ + Returns an iterable of resource names. 
+ """ + pass + + +class RunNameCompleter(BaseAPINameCompleter): + def __init__(self, all: bool = False): + super().__init__() + self.all = all + + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.runs.list(self.all)] + + +class FleetNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.client.fleets.list(api.project)] + + +class VolumeNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.client.volumes.list(api.project)] + + +class GatewayNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.client.gateways.list(api.project)] + + +class SecretNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.client.secrets.list(api.project)] + + +class ExportNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [r.name for r in api.client.exports.list(api.project)] + + +class ImportNameCompleter(BaseAPINameCompleter): + def fetch_resource_names(self, api: Client) -> Iterable[str]: + return [ + f"{imp.export.project_name}/{imp.export.name}" + for imp in api.client.imports.list(api.project) + ] + + +class ProjectNameCompleter(BaseCompleter): + """ + Completer for local project names. 
+ """ + + def __call__(self, prefix: str, parsed_args: argparse.Namespace, **kwargs) -> List[str]: + argcomplete.debug(f"{self.__class__.__name__}: Listing projects from ConfigManager") + projects = ConfigManager().list_project_configs() + return [p.name for p in projects if p.name.startswith(prefix)] diff --git a/src/dstack/_internal/cli/services/configurators/__init__.py b/src/dstack/_internal/cli/services/configurators/__init__.py index 4deed5f3c1..91768bdcd3 100644 --- a/src/dstack/_internal/cli/services/configurators/__init__.py +++ b/src/dstack/_internal/cli/services/configurators/__init__.py @@ -1,10 +1,18 @@ +import sys from pathlib import Path -from typing import Dict, Optional, Type +from typing import Dict, Optional, Tuple, Type import yaml from dstack._internal.cli.services.configurators.base import BaseApplyConfigurator +from dstack._internal.cli.services.configurators.fleet import FleetConfigurator from dstack._internal.cli.services.configurators.gateway import GatewayConfigurator +from dstack._internal.cli.services.configurators.run import ( + BaseRunConfigurator, + DevEnvironmentConfigurator, + ServiceConfigurator, + TaskConfigurator, +) from dstack._internal.cli.services.configurators.volume import VolumeConfigurator from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.configurations import ( @@ -13,16 +21,47 @@ parse_apply_configuration, ) -apply_configurators_mapping: Dict[ApplyConfigurationType, Type[BaseApplyConfigurator]] = { - cls.TYPE: cls for cls in [GatewayConfigurator, VolumeConfigurator] +APPLY_STDIN_NAME = "-" + + +apply_configurators_mapping: Dict[ + ApplyConfigurationType, Type[BaseApplyConfigurator[AnyApplyConfiguration]] +] = { + cls.TYPE: cls + for cls in [ + DevEnvironmentConfigurator, + TaskConfigurator, + ServiceConfigurator, + FleetConfigurator, + GatewayConfigurator, + VolumeConfigurator, + ] +} + + +run_configurators_mapping: Dict[ApplyConfigurationType, Type[BaseRunConfigurator]] = { 
+ cls.TYPE: cls + for cls in [ + DevEnvironmentConfigurator, + TaskConfigurator, + ServiceConfigurator, + ] } -def get_apply_configurator_class(configurator_type: str) -> Type[BaseApplyConfigurator]: +def get_apply_configurator_class( + configurator_type: str, +) -> Type[BaseApplyConfigurator[AnyApplyConfiguration]]: return apply_configurators_mapping[ApplyConfigurationType(configurator_type)] -def load_apply_configuration(configuration_file: Optional[str]) -> AnyApplyConfiguration: +def get_run_configurator_class(configurator_type: str) -> Type[BaseRunConfigurator]: + return run_configurators_mapping[ApplyConfigurationType(configurator_type)] + + +def load_apply_configuration( + configuration_file: Optional[str], +) -> Tuple[str, AnyApplyConfiguration]: if configuration_file is None: configuration_path = Path.cwd() / ".dstack.yml" if not configuration_path.exists(): @@ -31,6 +70,8 @@ def load_apply_configuration(configuration_file: Optional[str]) -> AnyApplyConfi raise ConfigurationError( "No configuration file specified via `-f` and no default .dstack.yml configuration found" ) + elif configuration_file == APPLY_STDIN_NAME: + configuration_path = sys.stdin.fileno() else: configuration_path = Path(configuration_file) if not configuration_path.exists(): @@ -40,4 +81,6 @@ def load_apply_configuration(configuration_file: Optional[str]) -> AnyApplyConfi conf = parse_apply_configuration(yaml.safe_load(f)) except OSError: raise ConfigurationError(f"Failed to load configuration from {configuration_path}") - return conf + if isinstance(configuration_path, int): + return APPLY_STDIN_NAME, conf + return str(configuration_path.absolute().relative_to(Path.cwd())), conf diff --git a/src/dstack/_internal/cli/services/configurators/base.py b/src/dstack/_internal/cli/services/configurators/base.py index dc855a4f1c..c2d88b565a 100644 --- a/src/dstack/_internal/cli/services/configurators/base.py +++ b/src/dstack/_internal/cli/services/configurators/base.py @@ -1,32 +1,103 @@ import 
argparse +import os from abc import ABC, abstractmethod -from typing import List +from typing import ClassVar, Generic, List, TypeVar, Union, cast +from dstack._internal.cli.services.args import env_var +from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.configurations import ( AnyApplyConfiguration, ApplyConfigurationType, ) +from dstack._internal.core.models.envs import Env, EnvSentinel, EnvVarTuple from dstack.api._public import Client +ArgsParser = Union[argparse._ArgumentGroup, argparse.ArgumentParser] -class BaseApplyConfigurator(ABC): - TYPE: ApplyConfigurationType +ApplyConfigurationT = TypeVar("ApplyConfigurationT", bound=AnyApplyConfiguration) + + +class BaseApplyConfigurator(ABC, Generic[ApplyConfigurationT]): + TYPE: ClassVar[ApplyConfigurationType] def __init__(self, api_client: Client): - self.api_client = api_client + self.api = api_client @abstractmethod - def apply_configuration(self, conf: AnyApplyConfiguration, args: argparse.Namespace): + def apply_configuration( + self, + conf: ApplyConfigurationT, + configuration_path: str, + command_args: argparse.Namespace, + configurator_args: argparse.Namespace, + ): + """ + Implements `dstack apply` for a given configuration type. + + Args: + conf: The apply configuration. + configuration_path: The path to the configuration file. + command_args: The args parsed by `dstack apply`. + configurator_args: The known args parsed by `cls.get_parser()`. + """ pass @abstractmethod - def delete_configuration(self, conf: AnyApplyConfiguration, args: argparse.Namespace): - pass + def delete_configuration( + self, + conf: ApplyConfigurationT, + configuration_path: str, + command_args: argparse.Namespace, + ): + """ + Implements `dstack delete` for a given configuration type. - def register_args(self, parser: argparse.ArgumentParser): + Args: + conf: The apply configuration. + configuration_path: The path to the configuration file. 
+ command_args: The args parsed by `dstack delete`. + """ pass - def apply_args( - self, args: argparse.Namespace, unknown: List[str], conf: AnyApplyConfiguration - ): + @classmethod + def get_parser(cls) -> argparse.ArgumentParser: + """ + Returns a parser to parse configuration-specific args. + """ + parser = argparse.ArgumentParser() + cls.register_args(parser) + return parser + + @classmethod + def register_args(cls, parser: argparse.ArgumentParser): + """ + Adds configuration-specific args to `parser`. + This is separated from `cls.get_parser()` so that `dstack apply` can register + args with different parser to show unified help. + """ pass + + +class ApplyEnvVarsConfiguratorMixin: + @classmethod + def register_env_args(cls, parser: ArgsParser): + parser.add_argument( + "-e", + "--env", + type=env_var, + action="append", + help="Environment variables", + dest="env_vars", + default=[], + metavar="KEY[=VALUE]", + ) + + def apply_env_vars(self, env: Env, configurator_args: argparse.Namespace) -> None: + for k, v in cast(List[EnvVarTuple], configurator_args.env_vars): + env[k] = v + for k, v in env.items(): + if isinstance(v, EnvSentinel): + try: + env[k] = v.from_env(os.environ) + except ValueError as e: + raise ConfigurationError(*e.args) diff --git a/src/dstack/_internal/cli/services/configurators/fleet.py b/src/dstack/_internal/cli/services/configurators/fleet.py new file mode 100644 index 0000000000..0e3dbdd809 --- /dev/null +++ b/src/dstack/_internal/cli/services/configurators/fleet.py @@ -0,0 +1,530 @@ +import argparse +import time +from pathlib import Path +from typing import Optional + +from rich.table import Table + +from dstack._internal.cli.services.configurators.base import ( + ApplyEnvVarsConfiguratorMixin, + BaseApplyConfigurator, +) +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + NO_OFFERS_WARNING, + confirm_ask, + console, + format_instance_availability, +) +from dstack._internal.cli.utils.fleet import 
get_fleets_table +from dstack._internal.cli.utils.rich import MultiItemStatus +from dstack._internal.core.errors import ( + CLIError, + ConfigurationError, + ResourceNotExistsError, + ServerClientError, +) +from dstack._internal.core.models.common import ApplyAction +from dstack._internal.core.models.configurations import ApplyConfigurationType +from dstack._internal.core.models.fleets import ( + Fleet, + FleetConfiguration, + FleetPlan, + FleetSpec, + InstanceGroupPlacement, +) +from dstack._internal.core.models.instances import InstanceStatus, SSHKey +from dstack._internal.core.services.diff import copy_model, diff_models +from dstack._internal.utils.common import local_time +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.nested_list import NestedList, NestedListItem +from dstack._internal.utils.ssh import convert_ssh_key_to_pem, generate_public_key, pkey_from_str +from dstack.api.utils import load_profile + +logger = get_logger(__name__) + + +class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator[FleetConfiguration]): + TYPE = ApplyConfigurationType.FLEET + + def apply_configuration( + self, + conf: FleetConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + configurator_args: argparse.Namespace, + ): + self.apply_args(conf, configurator_args) + profile = load_profile(Path.cwd(), None) + spec = FleetSpec( + configuration=conf, + configuration_path=configuration_path, + profile=profile, + ) + _preprocess_spec(spec) + + with console.status("Getting apply plan..."): + plan = self.api.client.fleets.get_plan( + project_name=self.api.project, + spec=spec, + ) + _print_plan_header(plan) + if plan.action is not None: + self._apply_plan(plan, command_args) + else: + # Old servers don't support spec update + self._apply_plan_on_old_server(plan, command_args) + + def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace): + delete_fleet_name: Optional[str] = None + 
action_message = "" + confirm_message = "" + if plan.current_resource is None: + if plan.spec.configuration.name is not None: + action_message += ( + f"Fleet [code]{plan.spec.configuration.name}[/] does not exist yet." + ) + confirm_message += "Create the fleet?" + else: + effective_spec = plan.get_effective_spec() + diff = _render_fleet_spec_diff(plan.current_resource.spec, effective_spec) + action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]." + if plan.current_resource.spec == effective_spec: + if command_args.yes and not command_args.force: + # --force is required only with --yes, + # otherwise we may ask for force apply interactively. + console.print( + "No configuration changes detected. Use --force to apply anyway." + ) + return + delete_fleet_name = plan.current_resource.name + action_message += " No configuration changes detected." + confirm_message += "Re-create the fleet?" + elif plan.action == ApplyAction.CREATE: + delete_fleet_name = plan.current_resource.name + if diff is not None: + # TODO: Highlight only the fields that block in-place update instead of + # showing the full detected diff here. + action_message += ( + f" Detected changes that [error]cannot[/] be updated in-place:\n{diff}" + ) + else: + action_message += ( + " Configuration changes detected. Cannot update the fleet in-place." + ) + confirm_message += "Re-create the fleet?" + else: + if diff is not None: + action_message += ( + f" Detected changes that [code]can[/] be updated in-place:\n{diff}" + ) + else: + action_message += " Configuration changes detected." + confirm_message += "Update the fleet in-place?" + + console.print(action_message) + if not command_args.yes and not confirm_ask(confirm_message): + console.print("\nExiting...") + return + + if delete_fleet_name is not None: + with console.status("Deleting existing fleet..."): + self.api.client.fleets.delete( + project_name=self.api.project, names=[delete_fleet_name] + ) + # Fleet deletion is async. 
Wait for fleet to be deleted. + while True: + try: + self.api.client.fleets.get( + project_name=self.api.project, name=delete_fleet_name + ) + except ResourceNotExistsError: + break + else: + time.sleep(1) + + try: + with console.status("Applying plan..."): + fleet = self.api.client.fleets.apply_plan(project_name=self.api.project, plan=plan) + except ServerClientError as e: + raise CLIError(e.msg) + if command_args.detach: + console.print("Fleet configuration submitted. Exiting...") + return + try: + with MultiItemStatus( + f"Provisioning [code]{fleet.name}[/]...", console=console + ) as live: + while not _finished_provisioning(fleet): + table = get_fleets_table([fleet], current_project=self.api.project) + live.update(table) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + fleet = self.api.client.fleets.get(self.api.project, fleet.name) + except KeyboardInterrupt: + if not command_args.yes and confirm_ask("Delete the fleet before exiting?"): + with console.status("Deleting fleet..."): + self.api.client.fleets.delete( + project_name=self.api.project, names=[fleet.name] + ) + else: + console.print("Exiting... Fleet provisioning will continue in the background.") + return + console.print( + get_fleets_table( + [fleet], + verbose=_fleet_has_failed_instances(fleet), + format_date=local_time, + current_project=self.api.project, + ) + ) + if _fleet_has_failed_instances(fleet): + if _fleet_retrying(fleet): + console.print( + "\n[error]Some instances failed. Provisioning will be retried in the background.[/]" + ) + else: + console.print( + "\n[error]Some instances failed. Check the table above for errors.[/]" + ) + exit(1) + + def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Namespace): + action_message = "" + confirm_message = "" + if plan.current_resource is None: + if plan.spec.configuration.name is not None: + action_message += ( + f"Fleet [code]{plan.spec.configuration.name}[/] does not exist yet." 
+ ) + confirm_message += "Create the fleet?" + else: + action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]." + diff = diff_models( + old=plan.current_resource.spec.configuration, + new=plan.spec.configuration, + reset={ + "ssh_config": { + "ssh_key": True, + "proxy_jump": {"ssh_key"}, + "hosts": {"__all__": {"ssh_key": True, "proxy_jump": {"ssh_key"}}}, + } + }, + ) + if not diff: + if command_args.yes and not command_args.force: + # --force is required only with --yes, + # otherwise we may ask for force apply interactively. + console.print( + "No configuration changes detected. Use --force to apply anyway." + ) + return + action_message += " No configuration changes detected." + confirm_message += "Re-create the fleet?" + else: + action_message += " Configuration changes detected." + confirm_message += "Re-create the fleet?" + + console.print(action_message) + if not command_args.yes and not confirm_ask(confirm_message): + console.print("\nExiting...") + return + + if plan.current_resource is not None: + with console.status("Deleting existing fleet..."): + self.api.client.fleets.delete( + project_name=self.api.project, names=[plan.current_resource.name] + ) + # Fleet deletion is async. Wait for fleet to be deleted. + while True: + try: + self.api.client.fleets.get( + project_name=self.api.project, name=plan.current_resource.name + ) + except ResourceNotExistsError: + break + else: + time.sleep(1) + + try: + with console.status("Applying plan..."): + fleet = self.api.client.fleets.apply_plan(project_name=self.api.project, plan=plan) + except ServerClientError as e: + raise CLIError(e.msg) + if command_args.detach: + console.print("Fleet configuration submitted. 
Exiting...") + return + try: + with MultiItemStatus( + f"Provisioning [code]{fleet.name}[/]...", console=console + ) as live: + while not _finished_provisioning(fleet): + table = get_fleets_table([fleet], current_project=self.api.project) + live.update(table) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + fleet = self.api.client.fleets.get(self.api.project, fleet.name) + except KeyboardInterrupt: + if confirm_ask("Delete the fleet before exiting?"): + with console.status("Deleting fleet..."): + self.api.client.fleets.delete( + project_name=self.api.project, names=[fleet.name] + ) + else: + console.print("Exiting... Fleet provisioning will continue in the background.") + return + console.print( + get_fleets_table( + [fleet], + verbose=_fleet_has_failed_instances(fleet), + format_date=local_time, + current_project=self.api.project, + ) + ) + if _fleet_has_failed_instances(fleet): + console.print("\n[error]Some instances failed. Check the table above for errors.[/]") + exit(1) + + def delete_configuration( + self, + conf: FleetConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + ): + if conf.name is None: + console.print("[error]Configuration specifies no fleet to delete[/]") + exit(1) + + try: + self.api.client.fleets.get( + project_name=self.api.project, + name=conf.name, + ) + except ResourceNotExistsError: + console.print(f"Fleet [code]{conf.name}[/] does not exist") + exit(1) + + if not command_args.yes and not confirm_ask(f"Delete the fleet [code]{conf.name}[/]?"): + console.print("\nExiting...") + return + + with console.status("Deleting fleet..."): + self.api.client.fleets.delete(project_name=self.api.project, names=[conf.name]) + # Fleet deletion is async. Wait for fleet to be deleted. 
+ while True: + try: + self.api.client.fleets.get(project_name=self.api.project, name=conf.name) + except ResourceNotExistsError: + break + else: + time.sleep(1) + + console.print(f"Fleet [code]{conf.name}[/] deleted") + + @classmethod + def register_args(cls, parser: argparse.ArgumentParser): + configuration_group = parser.add_argument_group(f"{cls.TYPE.value} Options") + configuration_group.add_argument( + "-n", + "--name", + dest="name", + help="The fleet name", + ) + cls.register_env_args(configuration_group) + + def apply_args(self, conf: FleetConfiguration, args: argparse.Namespace): + if args.name: + conf.name = args.name + self.apply_env_vars(conf.env, args) + if conf.ssh_config is None and conf.env: + raise ConfigurationError("`env` is currently supported for SSH fleets only") + + +def _preprocess_spec(spec: FleetSpec): + ssh_config = spec.configuration.ssh_config + if ssh_config is not None: + ssh_config.ssh_key = _resolve_ssh_key(ssh_config.identity_file) + if ssh_config.proxy_jump is not None: + ssh_config.proxy_jump.ssh_key = _resolve_ssh_key(ssh_config.proxy_jump.identity_file) + for host in ssh_config.hosts: + if not isinstance(host, str): + host.ssh_key = _resolve_ssh_key(host.identity_file) + if host.proxy_jump is not None: + host.proxy_jump.ssh_key = _resolve_ssh_key(host.proxy_jump.identity_file) + + +def _resolve_ssh_key(ssh_key_path: Optional[str]) -> Optional[SSHKey]: + if ssh_key_path is None: + return None + ssh_key_path_obj = Path(ssh_key_path).expanduser() + try: + private_key = convert_ssh_key_to_pem(ssh_key_path_obj.read_text()) + try: + pub_key = ssh_key_path_obj.with_suffix(".pub").read_text() + except FileNotFoundError: + pub_key = generate_public_key(pkey_from_str(private_key)) + return SSHKey(public=pub_key, private=private_key) + except OSError as e: + logger.debug("Got OSError: %s", repr(e)) + console.print(f"[error]Unable to read the SSH key at {ssh_key_path}[/]") + exit() + except ValueError as e: + logger.debug("Key type is not 
supported", repr(e)) + console.print("[error]Key type is not supported[/]") + exit() + + +def _render_fleet_spec_diff(old_spec: FleetSpec, new_spec: FleetSpec) -> Optional[str]: + old_spec = copy_model(old_spec) + new_spec = copy_model(new_spec) + changed_spec_fields = list(diff_models(old_spec, new_spec)) + if not changed_spec_fields: + return None + + nested_list = NestedList() + for spec_field in changed_spec_fields: + if spec_field == "merged_profile": + continue + if spec_field == "configuration": + item = NestedListItem( + "Configuration properties:", + children=[ + NestedListItem(field) + for field in diff_models(old_spec.configuration, new_spec.configuration) + ], + ) + elif spec_field == "profile": + item = NestedListItem( + "Profile properties:", + children=[ + NestedListItem(field) + for field in diff_models(old_spec.profile, new_spec.profile) + ], + ) + elif spec_field == "configuration_path": + item = NestedListItem("Configuration path") + else: + item = NestedListItem(spec_field.replace("_", " ").capitalize()) + nested_list.children.append(item) + + if not nested_list.children: + return None + return nested_list.render() + + +def _print_plan_header(plan: FleetPlan): + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + spec = plan.get_effective_spec() + + configuration_table = Table(box=None, show_header=False) + configuration_table.add_column(no_wrap=True) # key + configuration_table.add_column() # value + + configuration_table.add_row(th("Project"), plan.project_name) + configuration_table.add_row(th("User"), plan.user) + configuration_table.add_row(th("Configuration"), spec.configuration_path or "?") + configuration_table.add_row(th("Type"), spec.configuration.type) + + fleet_type = "cloud" + nodes = spec.configuration.nodes or "-" + placement = spec.configuration.placement or InstanceGroupPlacement.ANY + reservation = spec.configuration.reservation + backends = None + if spec.configuration.backends is not None: + backends = ", ".join(b.value 
for b in spec.configuration.backends) + regions = None + if spec.configuration.regions is not None: + regions = ", ".join(spec.configuration.regions) + resources = None + if spec.configuration.resources is not None: + resources = spec.configuration.resources.pretty_format() + spot_policy = spec.merged_profile.spot_policy + if spec.configuration.ssh_config is not None: + fleet_type = "ssh" + nodes = len(spec.configuration.ssh_config.hosts) + resources = None + spot_policy = None + + configuration_table.add_row(th("Fleet type"), fleet_type) + configuration_table.add_row(th("Nodes"), str(nodes)) + configuration_table.add_row(th("Placement"), placement.value) + if backends is not None: + configuration_table.add_row(th("Backends"), str(backends)) + if regions is not None: + configuration_table.add_row(th("Regions"), str(regions)) + if resources is not None: + configuration_table.add_row(th("Resources"), resources) + if spot_policy is not None: + configuration_table.add_row(th("Spot policy"), spot_policy) + if reservation is not None: + configuration_table.add_row(th("Reservation"), reservation) + # TODO: [Andrey] Display "Idle duration" + + offers_table = Table(box=None) + offers_table.add_column("#") + offers_table.add_column("BACKEND") + offers_table.add_column("REGION") + offers_table.add_column("INSTANCE") + offers_table.add_column("RESOURCES") + offers_table.add_column("SPOT") + offers_table.add_column("PRICE") + offers_table.add_column() + + offers_limit = 3 + print_offers = plan.offers[:offers_limit] + + for index, offer in enumerate(print_offers, start=1): + resources = offer.instance.resources + + offers_table.add_row( + f"{index}", + offer.backend.replace("remote", "ssh"), + offer.region, + offer.instance.name, + resources.pretty_format(), + "yes" if resources.spot else "no", + f"${offer.price:3f}".rstrip("0").rstrip("."), + format_instance_availability(offer.availability), + style=None if index == 1 else "secondary", + ) + if len(plan.offers) > offers_limit: 
+ offers_table.add_row("", "...", style="secondary") + + console.print(configuration_table) + console.print() + + if len(print_offers) > 0: + console.print(offers_table) + if len(plan.offers) > offers_limit: + console.print( + f"[secondary] Shown {len(print_offers)} of {plan.total_offers} offers, " + f"${plan.max_offer_price:g} max[/]" + ) + console.print() + elif fleet_type == "cloud": + console.print(NO_OFFERS_WARNING) + + +def _finished_provisioning(fleet: Fleet) -> bool: + for instance in fleet.instances: + if instance.status in [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + InstanceStatus.TERMINATING, + ]: + return False + return True + + +def _fleet_has_failed_instances(fleet: Fleet) -> bool: + for instance in fleet.instances: + if instance.status == InstanceStatus.TERMINATED: + return True + return False + + +def _fleet_retrying(fleet: Fleet) -> bool: + if fleet.spec.configuration.nodes is None: + return False + active_instances = [i for i in fleet.instances if i.status.is_active()] + return len(active_instances) < fleet.spec.configuration.nodes.min diff --git a/src/dstack/_internal/cli/services/configurators/gateway.py b/src/dstack/_internal/cli/services/configurators/gateway.py index 0cdc553eed..9f8e6cd0d1 100644 --- a/src/dstack/_internal/cli/services/configurators/gateway.py +++ b/src/dstack/_internal/cli/services/configurators/gateway.py @@ -1,82 +1,248 @@ import argparse +import time + +from rich.table import Table from dstack._internal.cli.services.configurators.base import BaseApplyConfigurator -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.cli.utils.gateway import print_gateways_table +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + confirm_ask, + console, +) +from dstack._internal.cli.utils.gateway import get_gateways_table +from dstack._internal.cli.utils.rich import MultiItemStatus from dstack._internal.core.errors import ResourceNotExistsError from 
dstack._internal.core.models.configurations import ApplyConfigurationType -from dstack._internal.core.models.gateways import GatewayConfiguration +from dstack._internal.core.models.gateways import ( + Gateway, + GatewayConfiguration, + GatewayPlan, + GatewaySpec, + GatewayStatus, +) +from dstack._internal.core.services.diff import diff_models +from dstack._internal.utils.common import local_time +from dstack._internal.utils.logging import get_logger +from dstack.api._public import Client + +logger = get_logger(__name__) + +class GatewayConfigurator(BaseApplyConfigurator[GatewayConfiguration]): + TYPE = ApplyConfigurationType.GATEWAY -class GatewayConfigurator(BaseApplyConfigurator): - TYPE: ApplyConfigurationType = ApplyConfigurationType.GATEWAY + def apply_configuration( + self, + conf: GatewayConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + configurator_args: argparse.Namespace, + ): + self.apply_args(conf, configurator_args) + spec = GatewaySpec( + configuration=conf, + configuration_path=configuration_path, + ) + if spec.configuration.router is not None: + logger.warning( + "Specifying `router` in gateway configurations is deprecated" + " and will be disallowed in a future release." 
+ " Please migrate to replica-based routers:" + " https://dstack.ai/docs/concepts/services/#pd-disaggregation" + ) + with console.status("Getting apply plan..."): + plan = _get_plan(api=self.api, spec=spec) + _print_plan_header(plan) - def apply_configuration(self, conf: GatewayConfiguration, args: argparse.Namespace): - # TODO: Show apply plan - # TODO: Update gateway in-place when domain/default change - confirmed = False - if conf.name is not None: - try: - gateway = self.api_client.client.gateways.get( - project_name=self.api_client.project, gateway_name=conf.name + action_message = "" + confirm_message = "" + if plan.current_resource is None: + if plan.spec.configuration.name is not None: + action_message += ( + f"Gateway [code]{plan.spec.configuration.name}[/] does not exist yet." ) - except ResourceNotExistsError: - pass + confirm_message += "Create the gateway?" + else: + action_message += f"Found gateway [code]{plan.spec.configuration.name}[/]." + diff = diff_models( + plan.spec.configuration, + plan.current_resource.configuration, + ) + changed_fields = list(diff.keys()) + if ( + plan.current_resource.configuration == plan.spec.configuration + or changed_fields == ["default"] + ): + if command_args.yes and not command_args.force: + # --force is required only with --yes, + # otherwise we may ask for force apply interactively. + console.print( + "No configuration changes detected. Use --force to apply anyway." + ) + return + action_message += " No configuration changes detected." + confirm_message += "Re-create the gateway?" else: - if gateway.configuration == conf: - if not args.force: - console.print( - "Gateway configuration has not changed. Use --force to recreate the gateway." + action_message += " Configuration changes detected." + confirm_message += "Re-create the gateway?" 
+ + console.print(action_message) + if not command_args.yes and not confirm_ask(confirm_message): + console.print("\nExiting...") + return + + if plan.current_resource is not None: + with console.status("Deleting existing gateway..."): + self.api.client.gateways.delete( + project_name=self.api.project, + gateways_names=[plan.current_resource.name], + ) + # Gateway deletion is async. Wait for gateway to be deleted. + while True: + try: + self.api.client.gateways.get( + project_name=self.api.project, + gateway_name=plan.current_resource.name, ) - return - if not args.yes and not confirm_ask( - "Gateway configuration has not changed. Re-create the gateway?" - ): - console.print("\nExiting...") - return - elif not args.yes and not confirm_ask( - f"Gateway [code]{conf.name}[/] already exists. Re-create the gateway?" - ): - console.print("\nExiting...") - return - confirmed = True - with console.status("Deleting gateway..."): - self.api_client.client.gateways.delete( - project_name=self.api_client.project, gateways_names=[conf.name] - ) - if not confirmed and not args.yes: - if not confirm_ask( - f"Gateway [code]{conf.name}[/] does not exist yet. Create the gateway?" - ): - console.print("\nExiting...") - return + except ResourceNotExistsError: + break + else: + time.sleep(1) + with console.status("Creating gateway..."): - gateway = self.api_client.client.gateways.create( - project_name=self.api_client.project, + gateway = self.api.client.gateways.create( + project_name=self.api.project, configuration=conf, ) - print_gateways_table([gateway]) + if command_args.detach: + console.print("Gateway configuration submitted. 
Exiting...") + return + try: + with MultiItemStatus( + f"Provisioning [code]{gateway.name}[/]...", console=console + ) as live: + while not _finished_provisioning(gateway): + table = get_gateways_table( + [gateway], current_project=self.api.project, include_created=True + ) + live.update(table) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + gateway = self.api.client.gateways.get(self.api.project, gateway.name) + except KeyboardInterrupt: + if not command_args.yes and confirm_ask("Delete the gateway before exiting?"): + with console.status("Deleting gateway..."): + self.api.client.gateways.delete( + project_name=self.api.project, + gateways_names=[gateway.name], + ) + else: + console.print("Exiting... Gateway provisioning will continue in the background.") + return + console.print( + get_gateways_table( + [gateway], + current_project=self.api.project, + verbose=gateway.status == GatewayStatus.FAILED, + include_created=True, + format_date=local_time, + ) + ) + if gateway.status == GatewayStatus.FAILED: + console.print( + f"\n[error]Provisioning failed. 
Error: {gateway.status_message or 'unknown'}[/]" + ) + exit(1) - def delete_configuration(self, conf: GatewayConfiguration, args: argparse.Namespace): + def delete_configuration( + self, + conf: GatewayConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + ): if conf.name is None: console.print("[error]Configuration specifies no gateway to delete[/]") - return + exit(1) try: - self.api_client.client.gateways.get( - project_name=self.api_client.project, gateway_name=conf.name - ) + self.api.client.gateways.get(project_name=self.api.project, gateway_name=conf.name) except ResourceNotExistsError: console.print(f"Gateway [code]{conf.name}[/] does not exist") - return + exit(1) - if not args.yes and not confirm_ask(f"Delete the gateway [code]{conf.name}[/]?"): + if not command_args.yes and not confirm_ask(f"Delete the gateway [code]{conf.name}[/]?"): console.print("\nExiting...") return with console.status("Deleting gateway..."): - self.api_client.client.gateways.delete( - project_name=self.api_client.project, gateways_names=[conf.name] + self.api.client.gateways.delete( + project_name=self.api.project, gateways_names=[conf.name] ) console.print(f"Gateway [code]{conf.name}[/] deleted") + + @classmethod + def register_args(cls, parser: argparse.ArgumentParser): + configuration_group = parser.add_argument_group(f"{cls.TYPE.value} Options") + configuration_group.add_argument( + "-n", + "--name", + dest="name", + help="The gateway name", + ) + + def apply_args(self, conf: GatewayConfiguration, args: argparse.Namespace): + if args.name: + conf.name = args.name + + +def _get_plan(api: Client, spec: GatewaySpec) -> GatewayPlan: + # TODO: Implement server-side /get_plan with an offer included + user = api.client.users.get_my_user() + current_resource = None + if spec.configuration.name is not None: + try: + current_resource = api.client.gateways.get( + project_name=api.project, + gateway_name=spec.configuration.name, + ) + except ResourceNotExistsError: 
+ pass + return GatewayPlan( + project_name=api.project, + user=user.username, + spec=spec, + current_resource=current_resource, + ) + + +def _print_plan_header(plan: GatewayPlan): + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + configuration_table = Table(box=None, show_header=False) + configuration_table.add_column(no_wrap=True) # key + configuration_table.add_column() # value + + configuration_table.add_row(th("Project"), plan.project_name) + configuration_table.add_row(th("User"), plan.user) + configuration_table.add_row(th("Configuration"), plan.spec.configuration_path) + configuration_table.add_row(th("Type"), plan.spec.configuration.type) + + domain = "-" + if plan.spec.configuration.domain is not None: + domain = plan.spec.configuration.domain + + configuration_table.add_row(th("Backend"), plan.spec.configuration.backend.value) + configuration_table.add_row(th("Region"), plan.spec.configuration.region) + configuration_table.add_row(th("Domain"), domain) + + if plan.spec.configuration.replicas is not None: + assert isinstance(plan.spec.configuration.replicas, int) + configuration_table.add_row(th("Replicas"), str(plan.spec.configuration.replicas)) + + console.print(configuration_table) + console.print() + + +def _finished_provisioning(gateway: Gateway) -> bool: + return gateway.status in [GatewayStatus.RUNNING, GatewayStatus.FAILED] diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index 1dc1cd692a..16b0f0a87b 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -1,89 +1,610 @@ import argparse +import json import os +import shlex +import shutil import subprocess -from typing import Dict, List, Optional, Type - -import dstack._internal.core.models.resources as resources -from dstack._internal.cli.services.args import disk_spec, env_var, gpu_spec, port_mapping -from dstack._internal.cli.utils.common 
import console -from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.models.common import is_core_model_instance +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional, Set, TypeVar + +import gpuhunt +from pydantic import parse_obj_as + +from dstack._internal.cli.services.args import port_mapping +from dstack._internal.cli.services.configurators.base import ( + ApplyEnvVarsConfiguratorMixin, + BaseApplyConfigurator, +) +from dstack._internal.cli.services.profile import apply_profile_args, register_profile_args +from dstack._internal.cli.services.repos import ( + get_repo_from_dir, + get_repo_from_url, + init_default_virtual_repo, + is_git_repo_url, + register_init_repo_args, +) +from dstack._internal.cli.services.resources import apply_resources_args, register_resources_args +from dstack._internal.cli.utils.common import confirm_ask, console +from dstack._internal.cli.utils.rich import MultiItemStatus +from dstack._internal.cli.utils.run import ( + RunWaitStatus, + get_run_wait_status, + get_runs_table, + print_run_plan, +) +from dstack._internal.core.errors import ( + CLIError, + ConfigurationError, + RepoInvalidCredentialsError, + ResourceNotExistsError, + ServerClientError, +) +from dstack._internal.core.models.common import ApplyAction, RegistryAuth from dstack._internal.core.models.configurations import ( - BaseConfiguration, - BaseConfigurationWithPorts, + AnyRunConfiguration, + ApplyConfigurationType, + ConfigurationWithCommandsParams, + ConfigurationWithPortsParams, DevEnvironmentConfiguration, - EnvSentinel, PortMapping, RunConfigurationType, ServiceConfiguration, TaskConfiguration, ) -from dstack._internal.utils.interpolator import VariablesInterpolator +from dstack._internal.core.models.repos import RepoHeadWithCreds +from dstack._internal.core.models.repos.remote import RemoteRepo, RemoteRepoCreds +from dstack._internal.core.models.resources import CPUSpec +from 
dstack._internal.core.models.runs import JobStatus, JobSubmission, RunSpec, RunStatus +from dstack._internal.core.services.diff import diff_models +from dstack._internal.core.services.repos import get_repo_creds_and_default_branch +from dstack._internal.core.services.ssh.ports import PortUsedError +from dstack._internal.settings import FeatureFlags +from dstack._internal.utils.common import local_time +from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.nested_list import NestedList, NestedListItem +from dstack._internal.utils.path import is_absolute_posix_path +from dstack.api._public.runs import Run +from dstack.api.utils import load_profile + +_KNOWN_AMD_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_AMD_GPUS} +_KNOWN_NVIDIA_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_NVIDIA_GPUS} +_KNOWN_TPU_VERSIONS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_TPUS} +_KNOWN_TENSTORRENT_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_TENSTORRENT_ACCELERATORS} +_BIND_ADDRESS_ARG = "bind_address" + +logger = get_logger(__name__) + +RunConfigurationT = TypeVar("RunConfigurationT", bound=AnyRunConfiguration) + + +class BaseRunConfigurator( + ApplyEnvVarsConfiguratorMixin, + BaseApplyConfigurator[RunConfigurationT], +): + def apply_configuration( + self, + conf: RunConfigurationT, + configuration_path: str, + command_args: argparse.Namespace, + configurator_args: argparse.Namespace, + ): + if configurator_args.repo and configurator_args.no_repo: + raise CLIError("Either --repo or --no-repo can be specified") + + self.apply_args(conf, configurator_args) + self.validate_gpu_vendor_and_image(conf) + self.validate_cpu_arch_and_image(conf) + + if conf.working_dir is not None and not is_absolute_posix_path(conf.working_dir): + raise ConfigurationError("working_dir must be absolute") + + if isinstance(conf, ServiceConfiguration) and conf.router is not None: + 
logger.warning( + "Specifying `router` in service configurations is deprecated" + " and will be disallowed in a future release." + " Please migrate to replica-based routers:" + " https://dstack.ai/docs/concepts/services/#pd-disaggregation" + ) + + repo = self.get_repo(conf, configuration_path, configurator_args) + if repo is None: + repo = init_default_virtual_repo(api=self.api) + profile = load_profile(Path.cwd(), configurator_args.profile) + with console.status("Getting apply plan..."): + run_plan = self.api.runs.get_run_plan( + configuration=conf, + repo=repo, + configuration_path=configuration_path, + profile=profile, + ssh_identity_file=configurator_args.ssh_identity_file, + ) + + no_fleets = False + if len(run_plan.job_plans[0].offers) == 0: + if len(self.api.client.fleets.list(self.api.project, include_imported=True)) == 0: + no_fleets = True + + print_run_plan( + run_plan, + max_offers=configurator_args.max_offers, + no_fleets=no_fleets, + verbose=command_args.verbose, + ) + + confirm_message = "Submit a new run?" + if conf.name: + confirm_message = f"Submit the run [code]{conf.name}[/]?" + stop_run_name = None + if run_plan.current_resource is not None: + diff = render_run_spec_diff( + run_plan.get_effective_run_spec(), + run_plan.current_resource.run_spec, + ) + if run_plan.action == ApplyAction.UPDATE and diff is not None: + console.print( + f"Active run [code]{conf.name}[/] already exists." + f" Detected changes that [code]can[/] be updated in-place:\n{diff}" + ) + confirm_message = "Update the run?" + elif run_plan.action == ApplyAction.UPDATE and diff is None: + stop_run_name = run_plan.current_resource.run_spec.run_name + console.print( + f"Active run [code]{conf.name}[/] already exists. Detected no changes." + ) + if command_args.yes and not command_args.force: + console.print("Use --force to apply anyway.") + return + confirm_message = "Stop and override the run?" 
+ elif not run_plan.current_resource.status.is_finished(): + stop_run_name = run_plan.current_resource.run_spec.run_name + # TODO: Highlight only the fields that block in-place update instead of + # showing the full detected diff here. + console.print( + f"Active run [code]{conf.name}[/] already exists." + f" Detected changes that [error]cannot[/] be updated in-place:\n{diff}" + ) + confirm_message = "Stop and override the run?" + + if not command_args.yes and not confirm_ask(confirm_message): + console.print("\nExiting...") + return + + if stop_run_name is not None: + with console.status("Stopping run..."): + self.api.client.runs.stop(self.api.project, [stop_run_name], abort=False) + while True: + run = self.api.runs.get(stop_run_name) + if run is None or run.status.is_finished(): + break + time.sleep(1) + + try: + with console.status("Applying plan..."): + run = self.api.runs.apply_plan( + run_plan=run_plan, repo=repo, reserve_ports=not command_args.detach + ) + except ServerClientError as e: + raise CLIError(e.msg) + except PortUsedError as e: + console.print( + f"[error]Failed to submit: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack apply[/code] to override the local" + f" port mapping, e.g. [code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) + + if command_args.detach: + detach_message = f"Run [code]{run.name}[/] submitted, detaching..." + if run_plan.action == ApplyAction.UPDATE: + detach_message = f"Run [code]{run.name}[/] updated, detaching..." + console.print(detach_message) + return + + abort_at_exit = False + try: + # We can attach to run multiple times if it goes from running to pending (retried). 
+ while True: + with MultiItemStatus(_get_apply_status(run), console=console) as live: + ready_wait_attempt = 0 + while not _is_ready_to_attach(run): + table = get_runs_table([run]) + live.update( + table, + *_get_apply_wait_renderables(run), + status=_get_apply_status(run), + ) + time.sleep(_get_ready_wait_interval(ready_wait_attempt)) + ready_wait_attempt += 1 + run.refresh() + console.print( + get_runs_table( + [run], + verbose=run.status == RunStatus.FAILED, + format_date=local_time, + ) + ) -class BaseRunConfigurator: - TYPE: RunConfigurationType = None + console.print( + f"\n[code]{run.name}[/] provisioning completed [secondary]({run.status.value})[/]" + ) + + current_job_submission = run._run.latest_job_submission + if run.status in (RunStatus.RUNNING, RunStatus.DONE): + _print_service_urls(run) + _print_dev_environment_connection_info(run) + bind_address: Optional[str] = getattr( + configurator_args, _BIND_ADDRESS_ARG, None + ) + try: + try: + attached = run.attach(bind_address=bind_address) + except PortUsedError as e: + console.print( + f"[error]Failed to attach: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack attach[/code] to override the local" + f" port mapping, e.g. [code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) + if attached: + for entry in run.logs(): + sys.stdout.buffer.write(entry) + sys.stdout.buffer.flush() + else: + console.print("[error]Failed to attach, exiting...[/]") + exit(1) + finally: + run.detach() + + # After reading the logs, the run may not be marked as finished immediately. + # Give the run some time to transition to a finished state before exiting. + reattach = False + for _ in range(30): + run.refresh() + if _run_resubmitted(run, current_job_submission): + # The run was resubmitted + reattach = True + break + if run.status.is_finished(): + print_finished_message(run) + exit(get_run_exit_code(run)) + time.sleep(1) + if not reattach: + console.print( + "[error]Lost run connection. 
Timed out waiting for run final status." + " Check `dstack ps` to see if it's done or failed." + ) + exit(1) + except KeyboardInterrupt: + try: + if command_args.yes or not confirm_ask( + f"\nStop the run [code]{run.name}[/] before detaching?" + ): + console.print("Detached") + abort_at_exit = False + return + # Gently stop the run and wait for it to finish + with console.status("Stopping..."): + run.stop(abort=False) + while not run.status.is_finished(): + time.sleep(2) + run.refresh() + console.print("Stopped") + except KeyboardInterrupt: + abort_at_exit = True + finally: + run.detach() + if abort_at_exit: + with console.status("Aborting..."): + run.stop(abort=True) + console.print("[error]Aborted[/]") + + def delete_configuration( + self, + conf: RunConfigurationT, + configuration_path: str, + command_args: argparse.Namespace, + ): + if conf.name is None: + console.print("[error]Configuration specifies no run to delete[/]") + exit(1) + try: + self.api.client.runs.get( + project_name=self.api.project, + run_name=conf.name, + ) + except ResourceNotExistsError: + console.print(f"Run [code]{conf.name}[/] does not exist") + exit(1) + if not command_args.yes and not confirm_ask(f"Delete the run [code]{conf.name}[/]?"): + console.print("\nExiting...") + return + with console.status("Deleting run..."): + self.api.client.runs.delete( + project_name=self.api.project, + runs_names=[conf.name], + ) + console.print(f"Run [code]{conf.name}[/] deleted") @classmethod - def register(cls, parser: argparse.ArgumentParser): + def register_args(cls, parser: argparse.ArgumentParser): parser.add_argument( - "-e", - "--env", - type=env_var, - action="append", - help="Environment variables", - dest="envs", - metavar="KEY=VALUE", + "--ssh-identity", + metavar="SSH_PRIVATE_KEY", + help="The private SSH key path for SSH tunneling", + type=Path, + dest="ssh_identity_file", ) - parser.add_argument( - "--gpu", - type=gpu_spec, - help="Request GPU for the run. 
" - "The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)", - dest="gpu_spec", - metavar="SPEC", + configuration_group = parser.add_argument_group(f"{cls.TYPE.value} Options") + configuration_group.add_argument( + "-n", + "--name", + dest="run_name", + help="The name of the run. If not specified, a random name is assigned", ) - parser.add_argument( - "--disk", - type=disk_spec, - help="Request the size range of disk for the run. Example [code]--disk 100GB..[/].", - metavar="RANGE", - dest="disk_spec", + configuration_group.add_argument( + "--max-offers", + help="Number of offers to show in the run plan", + type=int, + default=3, ) - - @classmethod - def apply(cls, args: argparse.Namespace, unknown: List[str], conf: BaseConfiguration): - if args.envs: - for k, v in args.envs: - conf.env[k] = v - if args.gpu_spec: - gpu = (conf.resources.gpu or resources.GPUSpec()).dict() - gpu.update(args.gpu_spec) - conf.resources.gpu = resources.GPUSpec.parse_obj(gpu) - if args.disk_spec: - conf.resources.disk = args.disk_spec - - for k, v in conf.env.items(): - if is_core_model_instance(v, EnvSentinel): - try: - conf.env[k] = v.from_env(os.environ) - except ValueError as e: - raise ConfigurationError(*e.args) - - cls.interpolate_run_args(conf.setup, unknown) - - @classmethod - def interpolate_run_args(cls, value: List[str], unknown): - run_args = " ".join(unknown) - interpolator = VariablesInterpolator({"run": {"args": run_args}}, skip=["secrets"]) - for i in range(len(value)): - value[i] = interpolator.interpolate(value[i]) - - -class RunWithPortsConfigurator(BaseRunConfigurator): + cls.register_env_args(configuration_group) + register_resources_args(configuration_group) + register_profile_args(parser) + repo_group = parser.add_argument_group("Repo Options") + repo_group.add_argument( + "-P", + "--repo", + help=("The repo to use for the run. 
Can be a local path or a Git repo URL."), + dest="repo", + ) + repo_group.add_argument( + "--repo-branch", + help="The repo branch to use for the run", + dest="repo_branch", + ) + repo_group.add_argument( + "--repo-hash", + help="The hash of the repo commit to use for the run", + dest="repo_hash", + ) + repo_group.add_argument( + "--no-repo", + help="Do not use any repo for the run", + dest="no_repo", + action="store_true", + ) + register_init_repo_args(repo_group) + + def apply_args(self, conf: RunConfigurationT, args: argparse.Namespace): + apply_resources_args(args, conf) + apply_profile_args(args, conf) + if args.run_name: + conf.name = args.run_name + self.apply_env_vars(conf.env, args) + self.interpolate_env(conf) + + def interpolate_env(self, conf: RunConfigurationT): + env_dict = conf.env.as_dict() + interpolator = VariablesInterpolator({"env": env_dict}, skip=["secrets"]) + try: + if conf.registry_auth is not None: + conf.registry_auth = RegistryAuth( + username=interpolator.interpolate_or_error(conf.registry_auth.username), + password=interpolator.interpolate_or_error(conf.registry_auth.password), + ) + if isinstance(conf, ServiceConfiguration): + for probe in conf.probes or []: + for header in probe.headers: + header.value = interpolator.interpolate_or_error(header.value) + if probe.url: + probe.url = interpolator.interpolate_or_error(probe.url) + if probe.body: + probe.body = interpolator.interpolate_or_error(probe.body) + except InterpolatorError as e: + raise ConfigurationError(e.args[0]) + + def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None: + """ + Infers GPU vendor if not set. Defaults to Nvidia when using the default + CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent. + + When vendor is inferred from GPU name (e.g. A100 -> nvidia), it is written to + gpu_spec. 
When vendor is inferred from image context (no name, no vendor, default + CUDA image -> nvidia), it is NOT written to gpu_spec because 0.19.x servers + (gpuhunt <0.1.12) break on vendor=nvidia + min_gpu_count=0. The server applies + the same default in set_gpu_vendor_default(). + + TODO: This entire method should move to the server (set_resources_defaults) + so that defaults and validation are equal for CLI and API users. + """ + gpu_spec = conf.resources.gpu + if gpu_spec is None: + return + if gpu_spec.count.max == 0: + return + has_amd_gpu: bool + has_tt_gpu: bool + vendor = gpu_spec.vendor + if vendor is None: + names = gpu_spec.name + if names: + # None is a placeholder for an unknown vendor. + vendors: Set[Optional[gpuhunt.AcceleratorVendor]] = set() + for name in names: + name = name.lower() + if name in _KNOWN_NVIDIA_GPUS: + vendors.add(gpuhunt.AcceleratorVendor.NVIDIA) + elif name in _KNOWN_AMD_GPUS: + vendors.add(gpuhunt.AcceleratorVendor.AMD) + elif name in _KNOWN_TENSTORRENT_GPUS: + vendors.add(gpuhunt.AcceleratorVendor.TENSTORRENT) + else: + maybe_tpu_version, _, maybe_tpu_cores = name.partition("-") + if maybe_tpu_version in _KNOWN_TPU_VERSIONS and maybe_tpu_cores.isdigit(): + vendors.add(gpuhunt.AcceleratorVendor.GOOGLE) + else: + vendors.add(None) + if len(vendors) == 1: + # Only one vendor or all names are not known. + vendor = next(iter(vendors)) + else: + # More than one vendor or some names are not known; in either case, we + # cannot set the vendor to a specific value, will use only names for matching. + vendor = None + # If some names are unknown, let's assume they are _not_ AMD products, otherwise + # ConfigurationError message may be confusing. In worst-case scenario we'll try + # to execute a run on an instance with an AMD accelerator with a default + # CUDA image, not a big deal. 
+ has_amd_gpu = gpuhunt.AcceleratorVendor.AMD in vendors + has_tt_gpu = gpuhunt.AcceleratorVendor.TENSTORRENT in vendors + # Set vendor inferred from name on the spec (server needs it for filtering). + gpu_spec.vendor = vendor + else: + # No vendor or name specified. Default to Nvidia if using the + # default CUDA image, since it's only compatible with Nvidia GPUs. + if conf.image is None and conf.docker is not True: + vendor = gpuhunt.AcceleratorVendor.NVIDIA + has_amd_gpu = False + has_tt_gpu = False + else: + has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD + has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT + # When docker=True, the system uses Docker-in-Docker image, so no custom image is required + if has_amd_gpu and conf.image is None and conf.docker is not True: + raise ConfigurationError("`image` is required if `resources.gpu.vendor` is `amd`") + if has_tt_gpu and conf.image is None and conf.docker is not True: + raise ConfigurationError( + "`image` is required if `resources.gpu.vendor` is `tenstorrent`" + ) + + def validate_cpu_arch_and_image(self, conf: RunConfigurationT) -> None: + """ + Infers `resources.cpu.arch` if not set, requires `image` if the architecture is ARM. + """ + # TODO: Remove in 0.20. Use conf.resources.cpu directly + cpu_spec = parse_obj_as(CPUSpec, conf.resources.cpu) + arch = cpu_spec.arch + if arch is None: + gpu_spec = conf.resources.gpu + if ( + gpu_spec is not None + and gpu_spec.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA] + and gpu_spec.name + and any(map(gpuhunt.is_nvidia_superchip, gpu_spec.name)) + ): + arch = gpuhunt.CPUArchitecture.ARM + else: + arch = gpuhunt.CPUArchitecture.X86 + # NOTE: We don't set the inferred resources.cpu.arch for compatibility with older servers. + # Servers with ARM support set the arch using the same logic. 
+ if arch == gpuhunt.CPUArchitecture.ARM and conf.image is None: + raise ConfigurationError("`image` is required if `resources.cpu.arch` is `arm`") + + def get_repo( + self, + conf: RunConfigurationT, + configuration_path: str, + configurator_args: argparse.Namespace, + ) -> Optional[RemoteRepo]: + if configurator_args.no_repo: + return None + + repo: Optional[RemoteRepo] = None + repo_head: Optional[RepoHeadWithCreds] = None + repo_branch: Optional[str] = configurator_args.repo_branch + repo_hash: Optional[str] = configurator_args.repo_hash + repo_creds: Optional[RemoteRepoCreds] = None + git_identity_file: Optional[str] = configurator_args.git_identity_file + git_private_key: Optional[str] = None + oauth_token: Optional[str] = configurator_args.gh_token + # Should we (re)initialize the repo? + # If any Git credentials provided, we reinitialize the repo, as the user may have provided + # updated credentials. + init = git_identity_file is not None or oauth_token is not None + + url: Optional[str] = None + local_path: Optional[Path] = None + # dummy value, safe to join with any path + root_dir = Path(".") + if repo_arg := configurator_args.repo: + if is_git_repo_url(repo_arg): + url = repo_arg + else: + local_path = Path(repo_arg) + # rel paths in `--repo` are resolved relative to the current working dir + root_dir = Path.cwd() + elif conf.repos: + repo_spec = conf.repos[0] + if repo_spec.url: + url = repo_spec.url + elif repo_spec.local_path: + local_path = Path(repo_spec.local_path) + # rel paths in the conf are resolved relative to the conf's parent dir + root_dir = Path(configuration_path).resolve().parent + else: + assert False, f"should not reach here: {repo_spec}" + if repo_branch is None: + repo_branch = repo_spec.branch + if repo_hash is None: + repo_hash = repo_spec.hash + else: + return None + + if url: + repo = get_repo_from_url(url) + repo_head = self.api.repos.get(repo_id=repo.repo_id, with_creds=True) + elif local_path: + original_local_path = 
local_path + local_path = local_path.expanduser() + if not local_path.is_absolute(): + local_path = (root_dir / local_path).resolve() + if not local_path.exists(): + raise ConfigurationError( + f"Invalid repo path: {original_local_path} -> {local_path}" + ) + repo = get_repo_from_dir(local_path) + repo_head = self.api.repos.get(repo_id=repo.repo_id, with_creds=True) + repo_branch = repo.run_repo_data.repo_branch + repo_hash = repo.run_repo_data.repo_hash + else: + assert False, "should not reach here" + + if repo_head is not None and repo_head.repo_creds is not None: + if git_identity_file is None and oauth_token is None: + git_private_key = repo_head.repo_creds.private_key + oauth_token = repo_head.repo_creds.oauth_token + else: + init = True + + try: + repo_creds, _ = get_repo_creds_and_default_branch( + repo_url=repo.repo_url, + identity_file=git_identity_file, + private_key=git_private_key, + oauth_token=oauth_token, + ) + except RepoInvalidCredentialsError: + raise CLIError( + "No valid default Git credentials found. Pass valid `--token` or `--git-identity`." + ) + + repo.run_repo_data.repo_branch = repo_branch + if repo_hash is not None: + repo.run_repo_data.repo_hash = repo_hash + + if init: + self.api.repos.init(repo=repo, creds=repo_creds) + + return repo + + +class RunWithPortsConfiguratorMixin: @classmethod - def register(cls, parser: argparse.ArgumentParser): - super().register(parser) + def register_ports_args(cls, parser: argparse.ArgumentParser): parser.add_argument( "-p", "--port", @@ -93,32 +614,79 @@ def register(cls, parser: argparse.ArgumentParser): dest="ports", metavar="MAPPING", ) + parser.add_argument( + "--host", + help="Local address to bind. 
Defaults to [code]localhost[/]", + dest=_BIND_ADDRESS_ARG, + metavar="HOST", + ) - @classmethod - def apply(cls, args: argparse.Namespace, unknown: List[str], conf: BaseConfigurationWithPorts): - super().apply(args, unknown, conf) + def apply_ports_args( + self, + conf: ConfigurationWithPortsParams, + args: argparse.Namespace, + ): if args.ports: - conf.ports = list(merge_ports(conf.ports, args.ports).values()) + conf.ports = list(_merge_ports(conf.ports, args.ports).values()) + + +class RunWithCommandsConfiguratorMixin: + @classmethod + def register_commands_args(cls, parser: argparse.ArgumentParser): + parser.add_argument( + "run_args", + help=( + "Run arguments. Available in the configuration [code]commands[/code] as" + " [code]${{ run.args }}[/code]." + " Use [code]--[/code] to separate run options from [code]dstack[/code] options" + ), + nargs="*", + metavar="RUN_ARGS", + ) + + def apply_commands_args( + self, + conf: ConfigurationWithCommandsParams, + args: argparse.Namespace, + ): + commands = conf.commands + run_args = shlex.join(args.run_args) + interpolator = VariablesInterpolator({"run": {"args": run_args}}, skip=["secrets"]) + try: + for i, command in enumerate(commands): + commands[i] = interpolator.interpolate_or_error(command) + except InterpolatorError as e: + raise ConfigurationError(e.args[0]) -class TaskRunConfigurator(RunWithPortsConfigurator): - TYPE = RunConfigurationType.TASK +class TaskConfigurator( + RunWithPortsConfiguratorMixin, RunWithCommandsConfiguratorMixin, BaseRunConfigurator +): + TYPE = ApplyConfigurationType.TASK @classmethod - def apply(cls, args: argparse.Namespace, unknown: List[str], conf: TaskConfiguration): - super().apply(args, unknown, conf) + def register_args(cls, parser: argparse.ArgumentParser): + super().register_args(parser) + cls.register_ports_args(parser) + cls.register_commands_args(parser) - cls.interpolate_run_args(conf.commands, unknown) + def apply_args(self, conf: TaskConfiguration, args: 
argparse.Namespace): + super().apply_args(conf, args) + self.apply_ports_args(conf, args) + self.apply_commands_args(conf, args) -class DevEnvironmentRunConfigurator(RunWithPortsConfigurator): - TYPE = RunConfigurationType.DEV_ENVIRONMENT +class DevEnvironmentConfigurator(RunWithPortsConfiguratorMixin, BaseRunConfigurator): + TYPE = ApplyConfigurationType.DEV_ENVIRONMENT @classmethod - def apply( - cls, args: argparse.Namespace, unknown: List[str], conf: DevEnvironmentConfiguration - ): - super().apply(args, unknown, conf) + def register_args(cls, parser: argparse.ArgumentParser): + super().register_args(parser) + cls.register_ports_args(parser) + + def apply_args(self, conf: DevEnvironmentConfiguration, args: argparse.Namespace): + super().apply_args(conf, args) + self.apply_ports_args(conf, args) if conf.ide == "vscode" and conf.version is None: conf.version = _detect_vscode_version() if conf.version is None: @@ -127,31 +695,54 @@ def apply( "Fix by opening [code]Command Palette[/code], executing [code]Shell Command: " "Install 'code' command in PATH[/code], and restarting terminal.[/]\n" ) + if conf.ide == "cursor" and conf.version is None: + conf.version = _detect_cursor_version() + if conf.version is None: + console.print( + "[secondary]Unable to detect the Cursor version and pre-install extensions. " + "Fix by opening [code]Command Palette[/code], executing [code]Shell Command: " + "Install 'cursor' command in PATH[/code], and restarting terminal.[/]\n" + ) + if conf.ide == "windsurf" and conf.version is None: + conf.version = _detect_windsurf_version() + if conf.version is None: + console.print( + "[secondary]Unable to detect the Windsurf version and pre-install extensions. 
" + "Fix by opening [code]Command Palette[/code], executing [code]Shell Command: " + "Install 'surf' command in PATH[/code], and restarting terminal.[/]\n" + ) -class ServiceRunConfigurator(BaseRunConfigurator): - TYPE = RunConfigurationType.SERVICE +class ServiceConfigurator(RunWithCommandsConfiguratorMixin, BaseRunConfigurator): + TYPE = ApplyConfigurationType.SERVICE @classmethod - def apply(cls, args: argparse.Namespace, unknown: List[str], conf: ServiceConfiguration): - super().apply(args, unknown, conf) + def register_args(cls, parser: argparse.ArgumentParser): + super().register_args(parser) + cls.register_commands_args(parser) + + def apply_args(self, conf: TaskConfiguration, args: argparse.Namespace): + super().apply_args(conf, args) + self.apply_commands_args(conf, args) - cls.interpolate_run_args(conf.commands, unknown) +def _get_ready_wait_interval(attempt: int) -> float: + if attempt < 5: + return 1 + return 5 -def merge_ports(conf: List[PortMapping], args: List[PortMapping]) -> Dict[int, PortMapping]: - unique_ports_constraint([pm.container_port for pm in conf]) - unique_ports_constraint([pm.container_port for pm in args]) +def _merge_ports(conf: List[PortMapping], args: List[PortMapping]) -> Dict[int, PortMapping]: + _unique_ports_constraint([pm.container_port for pm in conf]) + _unique_ports_constraint([pm.container_port for pm in args]) ports = {pm.container_port: pm for pm in conf} for pm in args: # override conf ports[pm.container_port] = pm - - unique_ports_constraint([pm.local_port for pm in ports.values() if pm.local_port is not None]) + _unique_ports_constraint([pm.local_port for pm in ports.values() if pm.local_port is not None]) return ports -def unique_ports_constraint(ports: List[int]): +def _unique_ports_constraint(ports: List[int]): used_ports = set() for i in ports: if i in used_ports: @@ -169,11 +760,228 @@ def _detect_vscode_version(exe: str = "code") -> Optional[str]: return None -run_configurators_mapping: 
Dict[RunConfigurationType, Type[BaseRunConfigurator]] = { - cls.TYPE: cls - for cls in [ - TaskRunConfigurator, - DevEnvironmentRunConfigurator, - ServiceRunConfigurator, - ] -} +def _detect_cursor_version(exe: str = "cursor") -> Optional[str]: + try: + run = subprocess.run([exe, "--version"], capture_output=True) + except FileNotFoundError: + return None + if run.returncode == 0: + return run.stdout.decode().split("\n")[1].strip() + return None + + +def _detect_windsurf_version(exe: str = "windsurf") -> Optional[str]: + """ + Detects the installed Windsurf product version and commit hash. + Returns string in format 'version@commit' (e.g., '1.13.5@97d7a...') or None. + """ + # 1. Locate executable in PATH + cmd_path = shutil.which(exe) + if not cmd_path: + return None + + try: + # 2. Resolve symlinks to find the actual installation directory + current_dir = os.path.dirname(os.path.realpath(cmd_path)) + + # 3. Walk up directory tree to find 'resources/app/product.json' + # Covers Linux (/opt/...), macOS (Contents/Resources/...), and Windows + for _ in range(6): + # Check standard lowercase and macOS TitleCase + for resource_folder in ["resources", "Resources"]: + json_path = os.path.join(current_dir, resource_folder, "app", "product.json") + + if os.path.exists(json_path): + try: + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + # Key 'windsurfVersion' is the product version (1.13.5) + # Key 'version' is the base VS Code version (1.9x) + ver = data.get("windsurfVersion") + commit = data.get("commit") + + if ver and commit: + return f"{ver}@{commit}" + except (OSError, json.JSONDecodeError): + continue + + # Move up one directory level + parent = os.path.dirname(current_dir) + if parent == current_dir: # Reached filesystem root + break + current_dir = parent + + except Exception: + return None + + return None + + +def _print_service_urls(run: Run) -> None: + if run._run.run_spec.configuration.type != RunConfigurationType.SERVICE.value: + 
return + console.print(_get_service_url_renderable(run)) + if model := run.service_model: + console.print( + f"Model [code]{model.name}[/] is published at:\n [link={model.url}]{model.url}[/]" + ) + console.print() + + +def _get_apply_status(run: Run) -> str: + wait_status = get_run_wait_status(run._run) + if wait_status is None: + return f"Launching [code]{run.name}[/]..." + return f"[code]{run.name}[/] is {wait_status.value}..." + + +def _get_apply_wait_renderables(run: Run) -> list[str]: + wait_status = get_run_wait_status(run._run) + if wait_status is RunWaitStatus.WAITING_FOR_REQUESTS and run._run.service is not None: + return [_get_service_url_renderable(run)] + if ( + wait_status is RunWaitStatus.WAITING_FOR_SCHEDULE + and run._run.next_triggered_at is not None + ): + next_run = run._run.next_triggered_at.astimezone().strftime("%Y-%m-%d %H:%M %Z") + return [f"Next run: {next_run}"] + return [] + + +def _get_service_url_renderable(run: Run) -> str: + return f"Service is published at:\n [link={run.service_url}]{run.service_url}[/]" + + +def _print_dev_environment_connection_info(run: Run) -> None: + if not FeatureFlags.CLI_PRINT_JOB_CONNECTION_INFO: + return + if run._run.run_spec.configuration.type != RunConfigurationType.DEV_ENVIRONMENT.value: + return + jci = run._run.jobs[0].job_connection_info + if jci is None: + return + if jci.ide_name: + urls = [u for u in (jci.attached_ide_url, jci.proxied_ide_url) if u] + if urls: + console.print( + f"To open in {jci.ide_name}, use link{'s' if len(urls) > 1 else ''} below:\n" + ) + for link in urls: + console.print(f" [link={link}]{link}[/]\n") + ssh_commands = [" ".join(c) for c in (jci.attached_ssh_command, jci.proxied_ssh_command) if c] + if ssh_commands: + console.print( + f"To connect via SSH, use: {' or '.join(f'[code]{c}[/]' for c in ssh_commands)}\n" + ) + console.print() + + +def print_finished_message(run: Run): + status_message = ( + run._run.latest_job_submission.status_message + if 
run._run.latest_job_submission + else run._run.status_message + ) + error = ( + run._run.latest_job_submission.error if run._run.latest_job_submission else run._run.error + ) + termination_reason = ( + run._run.latest_job_submission.termination_reason + if run._run.latest_job_submission + else None + ) + termination_reason_message = ( + run._run.latest_job_submission.termination_reason_message + if run._run.latest_job_submission + else None + ) + if run.status == RunStatus.DONE: + console.print(f"[code]{status_message.capitalize()}[/code]") + return + else: + str = f"[error]{status_message.capitalize()}[/error]" + if error: + str += f" ([error]{error.capitalize()}[/error])" + console.print(str) + + if termination_reason_message: + console.print(f"[error]{termination_reason_message}[/error]") + + if termination_reason: + console.print(f"Check [code]dstack logs -d {run.name}[/code] for more details.") + + +def get_run_exit_code(run: Run) -> int: + if run.status == RunStatus.DONE: + return 0 + return 1 + + +def _is_ready_to_attach(run: Run) -> bool: + return not ( + run.status + in [ + RunStatus.SUBMITTED, + RunStatus.PENDING, + RunStatus.PROVISIONING, + RunStatus.TERMINATING, + ] + or run._run.jobs[0].job_submissions[-1].status + in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING] + or run._run.is_deployment_in_progress() + ) + + +def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool: + if current_job_submission is None or run._run.latest_job_submission is None: + return False + return run.status == RunStatus.PENDING or ( + not run.status.is_finished() + and run._run.latest_job_submission.submitted_at > current_job_submission.submitted_at + ) + + +def render_run_spec_diff(old_spec: RunSpec, new_spec: RunSpec) -> Optional[str]: + changed_spec_fields = list(diff_models(old_spec, new_spec)) + if not changed_spec_fields: + return None + friendly_spec_field_names = { + "repo_id": "Repo ID", + "repo_code_hash": "Repo 
files", + "repo_data": "Repo state (branch, commit, or other)", + "ssh_key_pub": "Public SSH key", + } + nested_list = NestedList() + for spec_field in changed_spec_fields: + if spec_field == "merged_profile": + continue + elif spec_field == "configuration": + if type(old_spec.configuration) is not type(new_spec.configuration): + item = NestedListItem("Configuration type") + else: + item = NestedListItem( + "Configuration properties:", + children=[ + NestedListItem(field) + for field in diff_models(old_spec.configuration, new_spec.configuration) + ], + ) + elif spec_field == "profile": + if type(old_spec.profile) is not type(new_spec.profile): + item = NestedListItem("Profile") + else: + assert old_spec.profile is not None + assert new_spec.profile is not None + item = NestedListItem( + "Profile properties:", + children=[ + NestedListItem(field) + for field in diff_models(old_spec.profile, new_spec.profile) + ], + ) + elif spec_field in friendly_spec_field_names: + item = NestedListItem(friendly_spec_field_names[spec_field]) + else: + item = NestedListItem(spec_field.replace("_", " ").capitalize()) + nested_list.children.append(item) + return nested_list.render() diff --git a/src/dstack/_internal/cli/services/configurators/volume.py b/src/dstack/_internal/cli/services/configurators/volume.py index da4ead1b5b..2449de0da7 100644 --- a/src/dstack/_internal/cli/services/configurators/volume.py +++ b/src/dstack/_internal/cli/services/configurators/volume.py @@ -1,83 +1,224 @@ import argparse +import time + +from rich.table import Table from dstack._internal.cli.services.configurators.base import BaseApplyConfigurator -from dstack._internal.cli.utils.common import confirm_ask, console -from dstack._internal.cli.utils.volume import print_volumes_table +from dstack._internal.cli.utils.common import ( + LIVE_TABLE_PROVISION_INTERVAL_SECS, + confirm_ask, + console, +) +from dstack._internal.cli.utils.rich import MultiItemStatus +from dstack._internal.cli.utils.volume import 
get_volumes_table from dstack._internal.core.errors import ResourceNotExistsError from dstack._internal.core.models.configurations import ApplyConfigurationType -from dstack._internal.core.models.volumes import VolumeConfiguration +from dstack._internal.core.models.volumes import ( + AnyVolumeConfiguration, + Volume, + VolumeConfigurationWithRegion, + VolumePlan, + VolumeSpec, + VolumeStatus, +) +from dstack._internal.utils.common import local_time +from dstack.api._public import Client + +class VolumeConfigurator(BaseApplyConfigurator[AnyVolumeConfiguration]): + TYPE = ApplyConfigurationType.VOLUME -class VolumeConfigurator(BaseApplyConfigurator): - TYPE: ApplyConfigurationType = ApplyConfigurationType.VOLUME + def apply_configuration( + self, + conf: AnyVolumeConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + configurator_args: argparse.Namespace, + ): + self.apply_args(conf, configurator_args) + spec = VolumeSpec( + configuration=conf, + configuration_path=configuration_path, + ) + with console.status("Getting apply plan..."): + plan = _get_plan(api=self.api, spec=spec) + _print_plan_header(plan) - def apply_configuration(self, conf: VolumeConfiguration, args: argparse.Namespace): - # TODO: Show apply plan - confirmed = False - if conf.name is not None: - try: - volume = self.api_client.client.volumes.get( - project_name=self.api_client.project, - name=conf.name, + action_message = "" + confirm_message = "" + if plan.current_resource is None: + if plan.spec.configuration.name is not None: + action_message += ( + f"Volume [code]{plan.spec.configuration.name}[/] does not exist yet." ) - except ResourceNotExistsError: - pass + confirm_message += "Create the volume?" + else: + action_message += f"Found volume [code]{plan.spec.configuration.name}[/]." 
+ if plan.current_resource.configuration == plan.spec.configuration: + if command_args.yes and not command_args.force: + # --force is required only with --yes, + # otherwise we may ask for force apply interactively. + console.print( + "No configuration changes detected. Use --force to apply anyway." + ) + return + action_message += " No configuration changes detected." + confirm_message += "Re-create the volume?" else: - if volume.configuration == conf: - if not args.force: - console.print( - "Volume configuration has not changed. Use --force to recreate the volume." + action_message += " Configuration changes detected." + confirm_message += "Re-create the volume?" + + console.print(action_message) + if not command_args.yes and not confirm_ask(confirm_message): + console.print("\nExiting...") + return + + if plan.current_resource is not None: + with console.status("Deleting existing volume..."): + self.api.client.volumes.delete( + project_name=self.api.project, names=[plan.current_resource.name] + ) + # Volume deletion is async. Wait for volume to be deleted. + while True: + try: + self.api.client.volumes.get( + project_name=self.api.project, name=plan.current_resource.name ) - return - if not args.yes and not confirm_ask( - "Volume configuration has not changed. Re-create the volume?" - ): - console.print("\nExiting...") - return - elif not args.yes and not confirm_ask( - f"Volume [code]{conf.name}[/] already exists. Re-create the volume?" - ): - console.print("\nExiting...") - return - confirmed = True - with console.status("Deleting volume..."): - self.api_client.client.volumes.delete( - project_name=self.api_client.project, names=[conf.name] - ) - if not confirmed and not args.yes: - if not confirm_ask( - f"Volume [code]{conf.name}[/] does not exist yet. Create the volume?" 
- ): - console.print("\nExiting...") - return + except ResourceNotExistsError: + break + else: + time.sleep(1) + with console.status("Creating volume..."): - volume = self.api_client.client.volumes.create( - project_name=self.api_client.project, + volume = self.api.client.volumes.create( + project_name=self.api.project, configuration=conf, ) - print_volumes_table([volume]) + if command_args.detach: + console.print("Volume configuration submitted. Exiting...") + return + try: + with MultiItemStatus( + f"Provisioning [code]{volume.name}[/]...", console=console + ) as live: + while not _finished_provisioning(volume): + table = get_volumes_table([volume]) + live.update(table) + time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) + volume = self.api.client.volumes.get(self.api.project, volume.name) + except KeyboardInterrupt: + if not command_args.yes and confirm_ask("Delete the volume before exiting?"): + with console.status("Deleting volume..."): + self.api.client.volumes.delete( + project_name=self.api.project, names=[volume.name] + ) + else: + console.print("Exiting... Volume provisioning will continue in the background.") + return + console.print( + get_volumes_table( + [volume], + verbose=volume.status == VolumeStatus.FAILED, + format_date=local_time, + ) + ) + if volume.status == VolumeStatus.FAILED: + console.print( + f"\n[error]Provisioning failed. 
Error: {volume.status_message or 'unknown'}[/]" + ) + exit(1) - def delete_configuration(self, conf: VolumeConfiguration, args: argparse.Namespace): + def delete_configuration( + self, + conf: AnyVolumeConfiguration, + configuration_path: str, + command_args: argparse.Namespace, + ): if conf.name is None: console.print("[error]Configuration specifies no volume to delete[/]") - return + exit(1) try: - self.api_client.client.volumes.get( - project_name=self.api_client.project, + self.api.client.volumes.get( + project_name=self.api.project, name=conf.name, ) except ResourceNotExistsError: console.print(f"Volume [code]{conf.name}[/] does not exist") - return + exit(1) - if not args.yes and not confirm_ask(f"Delete the volume [code]{conf.name}[/]?"): + if not command_args.yes and not confirm_ask(f"Delete the volume [code]{conf.name}[/]?"): console.print("\nExiting...") return with console.status("Deleting volume..."): - self.api_client.client.volumes.delete( - project_name=self.api_client.project, names=[conf.name] - ) + self.api.client.volumes.delete(project_name=self.api.project, names=[conf.name]) console.print(f"Volume [code]{conf.name}[/] deleted") + + @classmethod + def register_args(cls, parser: argparse.ArgumentParser): + configuration_group = parser.add_argument_group(f"{cls.TYPE.value} Options") + configuration_group.add_argument( + "-n", + "--name", + dest="name", + help="The volume name", + ) + + def apply_args(self, conf: AnyVolumeConfiguration, args: argparse.Namespace): + if args.name: + conf.name = args.name + + +def _get_plan(api: Client, spec: VolumeSpec) -> VolumePlan: + # TODO: Implement server-side /get_plan with an offer included + user = api.client.users.get_my_user() + current_resource = None + if spec.configuration.name is not None: + try: + current_resource = api.client.volumes.get( + project_name=api.project, name=spec.configuration.name + ) + except ResourceNotExistsError: + pass + return VolumePlan( + project_name=api.project, + 
user=user.username, + spec=spec, + current_resource=current_resource, + ) + + +def _print_plan_header(plan: VolumePlan): + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + configuration_table = Table(box=None, show_header=False) + configuration_table.add_column(no_wrap=True) # key + configuration_table.add_column() # value + + configuration_table.add_row(th("Project"), plan.project_name) + configuration_table.add_row(th("User"), plan.user) + configuration_table.add_row(th("Configuration"), plan.spec.configuration_path) + configuration_table.add_row(th("Type"), plan.spec.configuration.type) + + volume_type = "managed" + size = "-" + if plan.spec.configuration.size is not None: + size = str(plan.spec.configuration.size) + if plan.spec.configuration.is_external: + volume_type = "external" + + configuration_table.add_row(th("Volume type"), volume_type) + configuration_table.add_row(th("Backend"), plan.spec.configuration.backend.value) + if isinstance(plan.spec.configuration, VolumeConfigurationWithRegion): + configuration_table.add_row(th("Region"), plan.spec.configuration.region) + configuration_table.add_row(th("Size"), size) + + console.print(configuration_table) + console.print() + + +def _finished_provisioning(volume: Volume) -> bool: + return volume.status in [VolumeStatus.ACTIVE, VolumeStatus.FAILED] diff --git a/src/dstack/_internal/cli/services/events.py b/src/dstack/_internal/cli/services/events.py new file mode 100644 index 0000000000..2efe82fb52 --- /dev/null +++ b/src/dstack/_internal/cli/services/events.py @@ -0,0 +1,156 @@ +import time +import uuid +from collections.abc import Iterator +from dataclasses import asdict, dataclass +from datetime import datetime, timedelta +from typing import Optional + +from rich.text import Text + +from dstack._internal.cli.utils.common import LIVE_TABLE_PROVISION_INTERVAL_SECS, console +from dstack._internal.core.models.events import Event, EventTarget, EventTargetType +from dstack._internal.server.schemas.events 
import LIST_EVENTS_DEFAULT_LIMIT +from dstack.api.server._events import EventsAPIClient + + +@dataclass +class EventListFilters: + target_fleets: Optional[list[uuid.UUID]] = None + target_runs: Optional[list[uuid.UUID]] = None + target_volumes: Optional[list[uuid.UUID]] = None + target_gateways: Optional[list[uuid.UUID]] = None + target_secrets: Optional[list[uuid.UUID]] = None + within_projects: Optional[list[uuid.UUID]] = None + within_fleets: Optional[list[uuid.UUID]] = None + within_runs: Optional[list[uuid.UUID]] = None + include_target_types: Optional[list[EventTargetType]] = None + + +class EventPaginator: + def __init__(self, client: EventsAPIClient) -> None: + self._client = client + + def list( + self, filters: EventListFilters, since: Optional[datetime], ascending: bool + ) -> Iterator[Event]: + prev_id = None + prev_recorded_at = since + while True: + events = self._client.list( + prev_id=prev_id, + prev_recorded_at=prev_recorded_at, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ascending=ascending, + **asdict(filters), + ) + for event in events: + yield event + if len(events) < LIST_EVENTS_DEFAULT_LIMIT: + break + prev_id = events[-1].id + prev_recorded_at = events[-1].recorded_at + + +class EventTracker: + """ + Tracks new events from the server. Implements a sliding window mechanism to avoid + missing events that are commited with a delay. + """ + + def __init__( + self, + client: EventsAPIClient, + filters: EventListFilters, + since: Optional[datetime], + event_delay_tolerance: timedelta = timedelta(seconds=20), + ) -> None: + self._client = client + self._filters = filters + self._since = since + self._event_delay_tolerance = event_delay_tolerance + self._seen_events: dict[uuid.UUID, _SeenEvent] = {} + self._latest_event: Optional[Event] = None + + def poll(self) -> Iterator[Event]: + """ + Fetches the next batch of events from the server. 
+ """ + + if self._since is None and self._latest_event is None: + # First batch without `since` - fetch some recent events + event_stream = reversed(self._client.list(ascending=False, **asdict(self._filters))) + else: + configured_since = self._since or datetime.fromtimestamp(0) + latest_event_recorded_at = ( + self._latest_event.recorded_at + if self._latest_event is not None + else datetime.fromtimestamp(0) + ) + since = max( + configured_since.astimezone(), + latest_event_recorded_at.astimezone() - self._event_delay_tolerance, + ) + self._cleanup_seen_events(before=since) + event_stream = EventPaginator(self._client).list(self._filters, since, ascending=True) + + for event in event_stream: + if event.id not in self._seen_events: + self._seen_events[event.id] = _SeenEvent(recorded_at=event.recorded_at) + yield event + self._latest_event = event + + def stream_forever( + self, + update_interval: timedelta = timedelta(seconds=LIVE_TABLE_PROVISION_INTERVAL_SECS), + ) -> Iterator[Event]: + """ + Yields events as they are received from the server. 
+ """ + + while True: + for event in self.poll(): + yield event + time.sleep(update_interval.total_seconds()) + + def _cleanup_seen_events(self, before: datetime) -> None: + ids_to_delete = { + event_id + for event_id, seen_event in self._seen_events.items() + if seen_event.recorded_at.astimezone() < before.astimezone() + } + for event_id in ids_to_delete: + del self._seen_events[event_id] + + +@dataclass +class _SeenEvent: + recorded_at: datetime + + +def print_event(current_project: str, event: Event) -> None: + recorded_at = event.recorded_at.astimezone().strftime("%Y-%m-%d %H:%M:%S") + targets = ", ".join(_format_target(current_project, target) for target in event.targets) + message = [ + Text(f"[{recorded_at}]", style="log.time"), + ] + if event.actor_user: + message.append(Text(f"[👤{event.actor_user}]", style="secondary")) + message += [ + Text(f"[{targets}]", style="secondary"), + Text(event.message, style="log.message"), + ] + console.print( + *message, + soft_wrap=True, # Strictly one line per event. 
Allows for grepping + ) + + +def _format_target(current_project: str, target: EventTarget) -> str: + name = target.name + if ( + target.project_name is not None + and target.type != EventTargetType.PROJECT + and target.project_name != current_project + ): + name = f"{target.project_name}/{name}" + return f"{target.type} {name}" diff --git a/src/dstack/_internal/cli/services/profile.py b/src/dstack/_internal/cli/services/profile.py index dab533ed09..e8086bb43b 100644 --- a/src/dstack/_internal/cli/services/profile.py +++ b/src/dstack/_internal/cli/services/profile.py @@ -4,21 +4,18 @@ from dstack._internal.core.models.configurations import AnyRunConfiguration from dstack._internal.core.models.profiles import ( - DEFAULT_INSTANCE_RETRY_DURATION, - DEFAULT_POOL_TERMINATION_IDLE_TIME, CreationPolicy, Profile, - ProfileRetryPolicy, + ProfileRetry, SpotPolicy, - TerminationPolicy, parse_duration, parse_max_duration, ) -def register_profile_args(parser: argparse.ArgumentParser, pool_add: bool = False): +def register_profile_args(parser: argparse.ArgumentParser): """ - Registers `parser` with `dstack run` and `dstack pool add` + Registers `parser` with `dstack apply` run configuration CLI arguments that override `profiles.yml` settings. 
""" profile_group = parser.add_argument_group("Profile") @@ -36,14 +33,13 @@ def register_profile_args(parser: argparse.ArgumentParser, pool_add: bool = Fals help="The maximum price per hour, in dollars", dest="max_price", ) - if not pool_add: - profile_group.add_argument( - "--max-duration", - type=max_duration, - dest="max_duration", - help="The maximum duration of the run", - metavar="DURATION", - ) + profile_group.add_argument( + "--max-duration", + type=max_duration, + dest="max_duration", + help="The maximum duration of the run", + metavar="DURATION", + ) profile_group.add_argument( "-b", "--backend", @@ -67,41 +63,35 @@ def register_profile_args(parser: argparse.ArgumentParser, pool_add: bool = Fals dest="instance_types", help="The cloud-specific instance types that will be tried for provisioning", ) - if pool_add: - pools_group_exc = parser - else: - pools_group = parser.add_argument_group("Pools") - pools_group_exc = pools_group.add_mutually_exclusive_group() - pools_group_exc.add_argument( - "--pool", - dest="pool_name", - help="The name of the pool. 
If not set, the default pool will be used", + + fleets_group = parser.add_argument_group("Fleets") + fleets_group.add_argument( + "--fleet", + action="append", + metavar="NAME", + dest="fleets", + help="Consider only the specified fleet(s)", ) - pools_group_exc.add_argument( + fleets_group_exc = fleets_group.add_mutually_exclusive_group() + fleets_group_exc.add_argument( + "-R", "--reuse", dest="creation_policy_reuse", action="store_true", - help="Reuse instance from pool", + help="Reuse an existing instance from fleet (do not provision a new one)", ) - pools_group_exc.add_argument( + fleets_group_exc.add_argument( "--dont-destroy", dest="dont_destroy", action="store_true", - help="Do not destroy instance after the run is finished", + help="Do not destroy instance after the run is finished (if the run provisions a new instance)", ) - pools_group_exc.add_argument( + fleets_group_exc.add_argument( "--idle-duration", dest="idle_duration", type=str, - help="Time to wait before destroying the idle instance", + help="Time to wait before destroying the idle instance (if the run provisions a new instance)", ) - if not pool_add: - pools_group_exc.add_argument( - "--instance", - dest="instance_name", - metavar="NAME", - help="Reuse instance from pool with name [code]NAME[/]", - ) spot_group = parser.add_argument_group("Spot policy") spot_group_exc = spot_group.add_mutually_exclusive_group() @@ -136,10 +126,8 @@ def register_profile_args(parser: argparse.ArgumentParser, pool_add: bool = Fals retry_group = parser.add_argument_group("Retry policy") retry_group_exc = retry_group.add_mutually_exclusive_group() - retry_group_exc.add_argument("--retry", action="store_const", dest="retry_policy", const=True) - retry_group_exc.add_argument( - "--no-retry", action="store_const", dest="retry_policy", const=False - ) + retry_group_exc.add_argument("--retry", action="store_const", dest="retry", const=True) + retry_group_exc.add_argument("--no-retry", action="store_const", dest="retry", 
const=False) retry_group_exc.add_argument( "--retry-duration", type=retry_duration, dest="retry_duration", metavar="DURATION" ) @@ -148,7 +136,6 @@ def register_profile_args(parser: argparse.ArgumentParser, pool_add: bool = Fals def apply_profile_args( args: argparse.Namespace, profile_settings: Union[Profile, AnyRunConfiguration], - pool_add: bool = False, ): """ Overrides `profile_settings` settings with arguments registered by `register_profile_args()`. @@ -164,53 +151,27 @@ def apply_profile_args( profile_settings.instance_types = args.instance_types if args.max_price is not None: profile_settings.max_price = args.max_price - if not pool_add: - if args.max_duration is not None: - profile_settings.max_duration = args.max_duration - - if args.pool_name: - profile_settings.pool_name = args.pool_name + if args.max_duration is not None: + profile_settings.max_duration = args.max_duration + if args.fleets: + profile_settings.fleets = args.fleets if args.idle_duration is not None: - profile_settings.termination_idle_time = args.idle_duration - if pool_add and args.idle_duration is None: - profile_settings.termination_idle_time = DEFAULT_POOL_TERMINATION_IDLE_TIME - - if args.dont_destroy: - profile_settings.termination_policy = TerminationPolicy.DONT_DESTROY - if not pool_add: - if args.instance_name: - profile_settings.instance_name = args.instance_name - if args.creation_policy_reuse: - profile_settings.creation_policy = CreationPolicy.REUSE + profile_settings.idle_duration = args.idle_duration + elif args.dont_destroy: + profile_settings.idle_duration = -1 + if args.creation_policy_reuse: + profile_settings.creation_policy = CreationPolicy.REUSE if args.spot_policy is not None: profile_settings.spot_policy = args.spot_policy - if pool_add and args.spot_policy is None: # ONDEMAND by default for `dstack pool add` - profile_settings.spot_policy = SpotPolicy.ONDEMAND - if not pool_add: - if args.retry_policy is not None: - if not profile_settings.retry_policy: - 
profile_settings.retry_policy = ProfileRetryPolicy() - profile_settings.retry_policy.retry = args.retry_policy - elif args.retry_duration is not None: - if not profile_settings.retry_policy: - profile_settings.retry_policy = ProfileRetryPolicy() - profile_settings.retry_policy.retry = True - profile_settings.retry_policy.duration = args.retry_duration - else: - if args.retry_policy is not None: - if not profile_settings.retry_policy: - profile_settings.retry_policy = ProfileRetryPolicy() - profile_settings.retry_policy.retry = args.retry_policy - if profile_settings.retry_policy.retry: - profile_settings.retry_policy.duration = DEFAULT_INSTANCE_RETRY_DURATION - elif args.retry_duration is not None: - if not profile_settings.retry_policy: - profile_settings.retry_policy = ProfileRetryPolicy() - profile_settings.retry_policy.retry = True - profile_settings.retry_policy.duration = args.retry_duration # --retry-duration + if args.retry is not None: + profile_settings.retry = args.retry + elif args.retry_duration is not None: + profile_settings.retry = ProfileRetry( + duration=args.retry_duration, + ) def max_duration(v: str) -> int: diff --git a/src/dstack/_internal/cli/services/repos.py b/src/dstack/_internal/cli/services/repos.py new file mode 100644 index 0000000000..b9a48e295a --- /dev/null +++ b/src/dstack/_internal/cli/services/repos.py @@ -0,0 +1,72 @@ +from pathlib import Path + +from dstack._internal.cli.services.configurators.base import ArgsParser +from dstack._internal.core.errors import ( + CLIError, + RepoDetachedHeadError, + RepoError, + RepoInvalidGitRepositoryError, +) +from dstack._internal.core.models.repos.remote import GitRepoURL, RemoteRepo +from dstack._internal.core.models.repos.virtual import VirtualRepo +from dstack._internal.utils.path import PathLike +from dstack.api._public import Client + + +def register_init_repo_args(parser: ArgsParser): + parser.add_argument( + "-t", + "--token", + metavar="OAUTH_TOKEN", + help="An authentication token 
to access a private Git repo", + type=str, + dest="gh_token", + ) + parser.add_argument( + "--git-identity", + metavar="SSH_PRIVATE_KEY", + help="The private SSH key path to access a private Git repo", + type=str, + dest="git_identity_file", + ) + + +def init_default_virtual_repo(api: Client) -> VirtualRepo: + repo = VirtualRepo() + api.repos.init(repo) + return repo + + +def get_repo_from_dir(repo_dir: PathLike) -> RemoteRepo: + repo_dir = Path(repo_dir) + if not repo_dir.exists(): + raise CLIError(f"Path does not exist: {repo_dir}") + if not repo_dir.is_dir(): + raise CLIError(f"Path is not a directory: {repo_dir}") + try: + return RemoteRepo.from_dir(repo_dir) + except RepoInvalidGitRepositoryError: + raise CLIError( + f"Git repo not found: {repo_dir}\n" + "Use `files` to mount an arbitrary directory:" + " https://dstack.ai/docs/concepts/tasks/#files" + ) + except RepoDetachedHeadError: + raise CLIError(f"Git repo in 'detached HEAD' state: {repo_dir}\nCheck out to a branch") + except RepoError as e: + raise CLIError(str(e)) from e + + +def get_repo_from_url(repo_url: str) -> RemoteRepo: + try: + return RemoteRepo.from_url(repo_url) + except RepoError as e: + raise CLIError(str(e)) from e + + +def is_git_repo_url(value: str) -> bool: + try: + GitRepoURL.parse(value) + except RepoError: + return False + return True diff --git a/src/dstack/_internal/cli/services/resources.py b/src/dstack/_internal/cli/services/resources.py new file mode 100644 index 0000000000..e81b6078db --- /dev/null +++ b/src/dstack/_internal/cli/services/resources.py @@ -0,0 +1,54 @@ +import argparse + +from dstack._internal.cli.services.args import cpu_spec, disk_spec, gpu_spec, memory_spec +from dstack._internal.cli.services.configurators.base import ArgsParser +from dstack._internal.core.models import resources +from dstack._internal.core.models.configurations import AnyRunConfiguration + + +def register_resources_args(parser: ArgsParser) -> None: + 
parser.add_argument( + "--cpu", + type=cpu_spec, + help=( + "Request CPU for the run." + " The format is [code]ARCH[/]:[code]COUNT[/] (all parts are optional)" + ), + dest="cpu_spec", + metavar="SPEC", + ) + parser.add_argument( + "--gpu", + type=gpu_spec, + help=( + "Request GPU for the run." + " The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)" + ), + dest="gpu_spec", + metavar="SPEC", + ) + parser.add_argument( + "--memory", + type=memory_spec, + help="Request the size range of RAM for the run. Example [code]--memory 128GB..256GB[/]", + dest="memory_spec", + metavar="RANGE", + ) + parser.add_argument( + "--disk", + type=disk_spec, + help="Request the size range of disk for the run. Example [code]--disk 100GB..[/]", + dest="disk_spec", + metavar="RANGE", + ) + + +def apply_resources_args(args: argparse.Namespace, conf: AnyRunConfiguration) -> None: + if args.cpu_spec: + conf.resources.cpu = resources.CPUSpec.parse_obj(args.cpu_spec) + if args.gpu_spec: + conf.resources.gpu = resources.GPUSpec.parse_obj(args.gpu_spec) + if args.memory_spec: + conf.resources.memory = args.memory_spec + if args.disk_spec: + conf.resources.disk = args.disk_spec diff --git a/src/dstack/_internal/cli/utils/common.py b/src/dstack/_internal/cli/utils/common.py index dfc948eb8f..b437a3c2cc 100644 --- a/src/dstack/_internal/cli/utils/common.py +++ b/src/dstack/_internal/cli/utils/common.py @@ -1,14 +1,20 @@ import logging -import os -from typing import Any, Dict, Union +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional +import requests from rich.console import Console from rich.prompt import Confirm from rich.table import Table from rich.theme import Theme +from dstack._internal import settings from dstack._internal.cli.utils.rich import DstackRichHandler from dstack._internal.core.errors import CLIError, DstackError +from dstack._internal.core.models.backends.base import BackendType +from 
dstack._internal.core.models.instances import InstanceAvailability +from dstack._internal.utils.common import get_dstack_dir, parse_since _colors = { "secondary": "grey58", @@ -18,35 +24,153 @@ "code": "bold sea_green3", } -console = Console(theme=Theme(_colors)) +console = Console( + theme=Theme(_colors), + force_terminal=settings.CLI_RICH_FORCE_TERMINAL, +) + + +LIVE_TABLE_REFRESH_RATE_PER_SEC = 1 +LIVE_TABLE_PROVISION_INTERVAL_SECS = 2 +NO_OFFERS_WARNING = ( + "[warning]" + "No matching instance offers available. Possible reasons:" + " [link]https://dstack.ai/docs/guides/troubleshooting/#no-offers[/link]" + "[/]\n" +) +NO_FLEETS_WARNING = ( + "[error]" + "The project has no fleets. Create one before submitting a run.\n" + "See [link]https://dstack.ai/docs/guides/troubleshooting/#no-fleets[/link]" + "[/]\n" +) def cli_error(e: DstackError) -> CLIError: return CLIError(*e.args) +def _get_cli_log_file() -> Path: + """Get the CLI log file path, rotating the previous log if needed.""" + log_dir = get_dstack_dir() / "logs" / "cli" + log_file = log_dir / "latest.log" + + if log_file.exists(): + file_mtime = datetime.fromtimestamp(log_file.stat().st_mtime, tz=timezone.utc) + current_date = datetime.now(timezone.utc).date() + + if file_mtime.date() < current_date: + date_str = file_mtime.strftime("%Y-%m-%d") + rotated_file = log_dir / f"{date_str}.log" + + counter = 1 + while rotated_file.exists(): + rotated_file = log_dir / f"{date_str}-{counter}.log" + counter += 1 + + log_file.rename(rotated_file) + + log_dir.mkdir(parents=True, exist_ok=True) + return log_file + + def configure_logging(): dstack_logger = logging.getLogger("dstack") - dstack_logger.setLevel(os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper()) - handler = DstackRichHandler(console=console) - handler.setFormatter(logging.Formatter(fmt="%(message)s", datefmt="[%X]")) - dstack_logger.addHandler(handler) + dstack_logger.handlers.clear() + + stdout_handler 
= DstackRichHandler(console=console) + stdout_handler.setFormatter(logging.Formatter(fmt="%(message)s", datefmt="[%X]")) + stdout_handler.setLevel(settings.CLI_LOG_LEVEL) + dstack_logger.addHandler(stdout_handler) + + log_file = get_dstack_dir() / "logs" / "cli" / "latest.log" + try: + log_file = _get_cli_log_file() + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter( + logging.Formatter( + fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + file_handler.setLevel(settings.CLI_FILE_LOG_LEVEL) + dstack_logger.addHandler(file_handler) + except PermissionError: + console.print(f"[warning]Couldn't write to {log_file} due to a permissions problem.[/]") + + # the logger allows all messages, filtering is done by the handlers + dstack_logger.setLevel(logging.DEBUG) def confirm_ask(prompt, **kwargs) -> bool: kwargs["console"] = console - return Confirm.ask(prompt=prompt, **kwargs) + try: + return Confirm.ask(prompt=prompt, **kwargs) + except KeyboardInterrupt: + console.print("\nCancelled by user") + raise SystemExit(1) -def add_row_from_dict(table: Table, data: Dict[Union[str, int], Any], **kwargs): - """Maps dict keys to a table columns. `data` key is a column name or index. Missing keys are ignored.""" +def add_row_from_dict(table: Table, data: dict[str, Any], **kwargs): + """Maps dict keys to table columns. `data` key is the column name. 
Missing keys are ignored.""" row = [] - for i, col in enumerate(table.columns): - # TODO(egor-s): clear header style - if col.header in data: - row.append(data[col.header]) - elif i in data: - row.append(data[i]) - else: - row.append("") + for col in table.columns: + row.append(data.get(str(col.header), "")) table.add_row(*row, **kwargs) + + +def warn(message: str): + if not message.endswith("\n"): + # Additional blank line for better visibility if there are more than one warning + message = f"{message}\n" + console.print(f"[warning][bold]{message}[/]") + + +def get_start_time(since: Optional[str]) -> Optional[datetime]: + if since is None: + return None + try: + return parse_since(since) + except ValueError as e: + raise CLIError(e.args[0]) + + +def resolve_url(url: str, timeout: float = 5.0) -> str: + """ + Starts with http:// and follows redirects. Returns the final URL (including scheme). + """ + if not url.startswith("https://") and not url.startswith("https://"): + url = "https://" + url + try: + response = requests.get( + url, + allow_redirects=True, + timeout=timeout, + ) + except requests.exceptions.ConnectionError as e: + raise ValueError(f"Failed to resolve url {url}") from e + return response.url + + +def format_entity_reference(name: str, project: str, current_project: str) -> str: + if current_project == project: + return name + else: + return f"{project}/{name}" + + +def format_instance_availability(v: InstanceAvailability) -> str: + if v in (InstanceAvailability.UNKNOWN, InstanceAvailability.AVAILABLE): + return "" + return v.value.replace("_", " ").lower() + + +def format_backend(backend: Optional[BackendType], region: Optional[str]) -> str: + if backend is None: + return "-" + backend_str = backend.value + if backend == BackendType.REMOTE: + backend_str = "ssh" + if region: + backend_str += f" ({region})" + return backend_str diff --git 
a/src/dstack/_internal/cli/utils/fleet.py b/src/dstack/_internal/cli/utils/fleet.py new file mode 100644 index 0000000000..ccb2400857 --- /dev/null +++ b/src/dstack/_internal/cli/utils/fleet.py @@ -0,0 +1,281 @@ +from typing import Any, List, Optional + +from rich.table import Table + +from dstack._internal.cli.utils.common import ( + add_row_from_dict, + console, + format_backend, + format_entity_reference, +) +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import Fleet, FleetNodesSpec, FleetStatus +from dstack._internal.core.models.instances import Instance, InstanceStatus +from dstack._internal.core.models.resources import GPUSpec, ResourcesSpec +from dstack._internal.utils.common import DateFormatter, pretty_date + + +def print_fleets_table(fleets: List[Fleet], current_project: str, verbose: bool = False) -> None: + console.print(get_fleets_table(fleets, current_project=current_project, verbose=verbose)) + console.print() + + +def get_fleets_table( + fleets: List[Fleet], + current_project: str, + verbose: bool = False, + format_date: DateFormatter = pretty_date, +) -> Table: + table = Table(box=None) + + # Columns + table.add_column("NAME", style="bold", no_wrap=True) + table.add_column("NODES") + if verbose: + table.add_column("RESOURCES") + else: + table.add_column("GPU") + table.add_column("SPOT") + table.add_column("BACKEND") + table.add_column("PRICE") + table.add_column("STATUS", no_wrap=True) + table.add_column("CREATED", no_wrap=True) + if verbose: + table.add_column("ERROR") + + for fleet in fleets: + # Fleet row + config = fleet.spec.configuration + merged_profile = fleet.spec.merged_profile + + # Detect SSH fleet vs backend fleet + if config.ssh_config is not None: + # SSH fleet: fixed number of hosts, no cloud billing + nodes = str(len(config.ssh_config.hosts)) + resources = "-" + gpu = "-" + backend = "ssh" + spot_policy = "-" + max_price = "-" + else: + # Backend fleet: dynamic nodes, 
cloud billing + nodes = _format_nodes(config.nodes) + resources = config.resources.pretty_format() if config.resources else "-" + gpu = _format_fleet_gpu(config.resources) + backend = _format_backends(config.backends) + spot_policy = "-" + if merged_profile and merged_profile.spot_policy: + spot_policy = merged_profile.spot_policy.value + # Format as "$0..$X.XX" range, or "-" if not set + if merged_profile and merged_profile.max_price is not None: + max_price = f"$0..{_format_price(merged_profile.max_price)}" + else: + max_price = "-" + + # In verbose mode, append placement to nodes if cluster + if verbose and config.placement and config.placement.value == "cluster": + nodes = f"{nodes} (cluster)" + + fleet_row = { + "NAME": format_entity_reference(fleet.name, fleet.project_name, current_project), + "NODES": nodes, + "RESOURCES": resources, + "GPU": gpu, + "BACKEND": backend, + "PRICE": max_price, + "SPOT": spot_policy, + "STATUS": _format_fleet_status(fleet), + "CREATED": format_date(fleet.created_at), + } + + add_row_from_dict(table, fleet_row) + + # Instance rows (indented) + for instance in fleet.instances: + # Check if this is an SSH instance + is_ssh_instance = instance.backend == BackendType.REMOTE + + # Format backend with region (and AZ in verbose mode) + if verbose and instance.availability_zone: + # In verbose mode, show AZ instead of region (AZ is more specific) + backend_with_region = format_backend(instance.backend, instance.availability_zone) + else: + backend_with_region = format_backend(instance.backend, instance.region) + + # Get spot info from instance resources (not applicable to SSH) + if is_ssh_instance: + instance_spot = "-" + instance_price = "-" + else: + instance_spot = "-" + if ( + instance.instance_type is not None + and instance.instance_type.resources is not None + ): + instance_spot = ( + "spot" if instance.instance_type.resources.spot else "on-demand" + ) + instance_price = _format_price(instance.price) + + instance_row = { + "NAME": 
f" instance={instance.instance_num}", + "NODES": "", + "RESOURCES": _format_instance_resources(instance), + "GPU": _format_instance_gpu(instance), + "BACKEND": backend_with_region, + "PRICE": instance_price, + "SPOT": instance_spot, + "STATUS": _format_instance_status(instance), + "CREATED": format_date(instance.created), + } + + if instance.status == InstanceStatus.TERMINATED and instance.termination_reason: + instance_row["ERROR"] = instance.termination_reason + + add_row_from_dict(table, instance_row, style="secondary") + + return table + + +def _format_nodes(nodes: Optional[FleetNodesSpec]) -> str: + """Format nodes spec as '0..1', '3', '2..10', etc.""" + if nodes is None: + return "-" + if nodes.min == nodes.max: + return str(nodes.min) + if nodes.max is None: + return f"{nodes.min}.." + return f"{nodes.min}..{nodes.max}" + + +def _format_backends(backends: Optional[List[BackendType]]) -> str: + if backends is None or len(backends) == 0: + return "*" + return ", ".join(b.value.replace("remote", "ssh") for b in backends) + + +def _format_range(min_val: Optional[Any], max_val: Optional[Any]) -> str: + if min_val is None and max_val is None: + return "" + if min_val == max_val: + return str(min_val) + if max_val is None: + return f"{min_val}.." 
+ if min_val is None: + return f"..{max_val}" + return f"{min_val}..{max_val}" + + +def _format_fleet_gpu(resources: Optional[ResourcesSpec]) -> str: + """Extract GPU-only info from fleet requirements, handling ranges.""" + if resources is None or resources.gpu is None: + return "-" + + gpu: GPUSpec = resources.gpu + + # Check if there's actually a GPU requirement + count = gpu.count + if count is None or (count.min == 0 and (count.max is None or count.max == 0)): + return "-" + + parts = [] + + # GPU name(s) + if gpu.name: + parts.append(",".join(gpu.name)) + else: + parts.append("gpu") + + # GPU memory (range) + if gpu.memory is not None: + mem_str = _format_range(gpu.memory.min, gpu.memory.max) + if mem_str: + parts.append(mem_str) + + # GPU count (range) + count_str = _format_range(count.min, count.max) + if count_str: + parts.append(count_str) + + return ":".join(parts) + + +def _format_fleet_status(fleet: Fleet) -> str: + status = fleet.status + status_text = status.value + + color_map = { + FleetStatus.SUBMITTED: "grey", + FleetStatus.ACTIVE: "white", + FleetStatus.TERMINATING: "deep_sky_blue1", + FleetStatus.TERMINATED: "grey", + FleetStatus.FAILED: "indian_red1", + } + color = color_map.get(status, "white") + is_finished = status in [FleetStatus.TERMINATED, FleetStatus.FAILED] + status_style = f"bold {color}" if not is_finished else color + return f"[{status_style}]{status_text}[/]" + + +def _format_instance_status(instance: Instance) -> str: + """Format instance status with colors and health info.""" + status = instance.status + status_text = status.value + + total_blocks = instance.total_blocks + busy_blocks = instance.busy_blocks + if ( + status in [InstanceStatus.IDLE, InstanceStatus.BUSY] + and total_blocks is not None + and total_blocks > 1 + ): + status_text = f"{busy_blocks}/{total_blocks} {InstanceStatus.BUSY.value}" + + # Add health status + health_suffix = "" + if status in [InstanceStatus.IDLE, InstanceStatus.BUSY]: + if instance.unreachable: + 
health_suffix = " (unreachable)" + elif not instance.health_status.is_healthy(): + health_suffix = f" ({instance.health_status.value})" + + color_map = { + InstanceStatus.PENDING: "deep_sky_blue1", + InstanceStatus.PROVISIONING: "deep_sky_blue1", + InstanceStatus.IDLE: "sea_green3", + InstanceStatus.BUSY: "white", + InstanceStatus.TERMINATING: "deep_sky_blue1", + InstanceStatus.TERMINATED: "grey", + } + color = color_map.get(status, "white") + is_finished = status == InstanceStatus.TERMINATED + status_style = f"bold {color}" if not is_finished else color + return f"[{status_style}]{status_text}{health_suffix}[/]" + + +def _format_price(price: Optional[float]) -> str: + if price is None: + return "-" + return f"${price:.4f}".rstrip("0").rstrip(".") + + +def _format_instance_gpu(instance: Instance) -> str: + if instance.instance_type is None: + return "-" + if instance.backend == BackendType.REMOTE and instance.status in [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + ]: + return "-" + return instance.instance_type.resources.pretty_format(gpu_only=True, include_spot=False) or "-" + + +def _format_instance_resources(instance: Instance) -> str: + if instance.instance_type is None: + return "-" + if instance.backend == BackendType.REMOTE and instance.status in [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + ]: + return "-" + return instance.instance_type.resources.pretty_format(include_spot=False) diff --git a/src/dstack/_internal/cli/utils/gateway.py b/src/dstack/_internal/cli/utils/gateway.py index 3f9dec8c0c..0d873a9a5d 100644 --- a/src/dstack/_internal/cli/utils/gateway.py +++ b/src/dstack/_internal/cli/utils/gateway.py @@ -1,43 +1,133 @@ -import itertools from typing import List from rich.table import Table -from dstack._internal.cli.utils.common import console +from dstack._internal.cli.models.gateways import GatewayCommandOutput +from dstack._internal.cli.utils.common import ( + add_row_from_dict, + console, + format_backend, + 
format_entity_reference, +) +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.common import EntityReference from dstack._internal.core.models.gateways import Gateway -from dstack._internal.utils.common import pretty_date +from dstack._internal.utils.common import DateFormatter, interpolate_gateway_domain, pretty_date +from dstack.api.server._gateways import GatewaysAPIClient -def print_gateways_table(gateways: List[Gateway], verbose: bool = False): +def get_gateway_relative_to_project( + client: GatewaysAPIClient, project: str, gateway_project: str, gateway_name: str +) -> Gateway: + """ + Retrieves a single gateway, ensuring that `Gateway.default` is resolved relative to + `project` rather than relative to the gateway's host project. + """ + if project == gateway_project: + return client.get(project, gateway_name) + + # For imported gateways, use `list`. + # `get` would resolve `Gateway.default` relative to the gateway's host project + gateways = client.list(project, include_imported=True) + for gateway in gateways: + if gateway.name == gateway_name and ( + gateway_project == gateway.project_name + # Compatibility with pre-0.20.20 servers: + # gateway.project_name is None means the gateway is in the current `project` + or (gateway.project_name is None and gateway_project == project) + ): + return gateway + ref = EntityReference(name=gateway_name, project=gateway_project) + raise ResourceNotExistsError(msg=f"Gateway {ref.format()!r} not found in project {project!r}") + + +def print_gateways_table(gateways: List[Gateway], current_project: str, verbose: bool = False): + table = get_gateways_table(gateways, current_project, verbose=verbose) + console.print(table) + console.print() + + +def print_gateways_json(gateways: List[Gateway], project: str) -> None: + """Print gateways information in JSON format.""" + output = GatewayCommandOutput( + project=project, + gateways=gateways, + ) + print(output.json()) + + +def 
get_gateways_table( + gateways: List[Gateway], + current_project: str, + verbose: bool = False, + include_created: bool = False, + format_date: DateFormatter = pretty_date, +) -> Table: table = Table(box=None) - table.add_column("BACKEND") - table.add_column("REGION") table.add_column("NAME", no_wrap=True) + table.add_column("BACKEND") table.add_column("HOSTNAME", no_wrap=True) - table.add_column("DOMAIN") + table.add_column("DOMAIN", no_wrap=True) table.add_column("DEFAULT") table.add_column("STATUS") + if verbose or include_created: + table.add_column("CREATED") if verbose: table.add_column("ERROR") - table.add_column("CREATED") - table.add_column("INSTANCE_ID") - - gateways = sorted(gateways, key=lambda g: g.backend) - for backend, backend_gateways in itertools.groupby(gateways, key=lambda g: g.backend): - for i, gateway in enumerate(backend_gateways): - renderables = [ - backend.value if i == 0 else "", - gateway.region, - gateway.name, - gateway.hostname, - gateway.wildcard_domain, - "✓" if gateway.default else "", - gateway.status, - ] - if verbose: - renderables.append(gateway.status_message) - renderables.append(pretty_date(gateway.created_at)) - renderables.append(gateway.instance_id) - table.add_row(*renderables) - console.print(table) - console.print() + + for gateway in gateways: + name = format_entity_reference( + gateway.name, + # project_name == None means pre-0.20.20 server, which means no gateway exports support, + # which means the gateway is from the current project + gateway.project_name if gateway.project_name is not None else current_project, + current_project, + ) + domain = gateway.wildcard_domain + if ( + gateway.project_name is not None + and gateway.project_name != current_project + and domain is not None + ): + domain = interpolate_gateway_domain( + domain=domain, + run_project_name=current_project, + # Ignore errors in case future server versions introduce more interpolation variables + exception_type=None, + ) + + gateway_row = { + 
"NAME": name, + "DOMAIN": domain, + "DEFAULT": "✓" if gateway.default else "", + "STATUS": gateway.status, + "CREATED": format_date(gateway.created_at), + "ERROR": gateway.status_message, + } + if gateway.hostname is not None: + gateway_row["HOSTNAME"] = gateway.hostname + if len(gateway.replicas) == 0: + # replicas not yet created, or it's a pre-0.20.25 server without replica support + gateway_row["BACKEND"] = format_backend( + gateway.configuration.backend, gateway.configuration.region + ) + gateway_row["HOSTNAME"] = gateway_row.get("HOSTNAME", gateway.ip_address) + if len(gateway.replicas) == 1: + # compact display for single-replica gateway + gateway_row["BACKEND"] = format_backend( + gateway.replicas[0].backend, gateway.replicas[0].region + ) + gateway_row["HOSTNAME"] = gateway_row.get("HOSTNAME", gateway.replicas[0].hostname) + add_row_from_dict(table, gateway_row) + + if len(gateway.replicas) > 1: + for replica in gateway.replicas: + replica_row = { + "NAME": f" replica={replica.replica_num}", + "BACKEND": format_backend(replica.backend, replica.region), + "HOSTNAME": replica.hostname, + "CREATED": format_date(replica.created_at), + } + add_row_from_dict(table, replica_row, style="secondary") + + return table diff --git a/src/dstack/_internal/cli/utils/gpu.py b/src/dstack/_internal/cli/utils/gpu.py new file mode 100644 index 0000000000..3d19b173ba --- /dev/null +++ b/src/dstack/_internal/cli/utils/gpu.py @@ -0,0 +1,178 @@ +import shutil +from typing import List, Literal + +from rich.table import Table + +from dstack._internal.cli.models.offers import OfferCommandGroupByGpuOutput, OfferRequirements +from dstack._internal.cli.utils.common import console, format_instance_availability +from dstack._internal.core.models.gpus import GpuGroup +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map + + +def print_gpu_json( + gpus: List[GpuGroup], + run_spec: RunSpec, + 
group_by: List[Literal["gpu", "backend", "region", "count"]], + project: str, +): + """Print GPU information in JSON format.""" + req = OfferRequirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.configuration.reservation, + ) + + output = OfferCommandGroupByGpuOutput( + project=project, + requirements=req, + group_by=group_by, + gpus=gpus, + ) + + print(output.json()) + + +def print_gpu_table(gpus: List[GpuGroup], run_spec: RunSpec, group_by: List[str], project: str): + """Print GPU information in a formatted table.""" + print_filter_info(run_spec, group_by, project) + + has_single_backend = any(gpu_group.backend for gpu_group in gpus) + has_single_region = any(gpu_group.region for gpu_group in gpus) + has_multiple_regions = any(gpu_group.regions for gpu_group in gpus) + + if has_single_backend and has_single_region: + backend_column = "BACKEND" + region_column = "REGION" + elif has_single_backend and has_multiple_regions: + backend_column = "BACKEND" + region_column = "REGIONS" + else: + backend_column = "BACKENDS" + region_column = None + + table = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) + table.add_column("#") + table.add_column("GPU", no_wrap=True, ratio=2) + table.add_column("SPOT", style="grey58", ratio=1) + table.add_column("$/GPU", style="grey58", ratio=1) + table.add_column(backend_column, style="grey58", ratio=2) + if region_column: + table.add_column(region_column, style="grey58", ratio=2) + table.add_column() + + for i, gpu_group in enumerate(gpus, start=1): + backend_text = "" + if gpu_group.backend: + backend_text = gpu_group.backend.value + elif gpu_group.backends: + backend_text = ", ".join(b.value for b in gpu_group.backends) + + region_text = "" + if gpu_group.region: + region_text = gpu_group.region + elif gpu_group.regions: + if 
len(gpu_group.regions) <= 3: + region_text = ", ".join(gpu_group.regions) + else: + region_text = f"{len(gpu_group.regions)} regions" + + if not region_column: + if gpu_group.regions and len(gpu_group.regions) > 3: + shortened_region_text = f"{len(gpu_group.regions)} regions" + backends_display = ( + f"{backend_text} ({shortened_region_text})" + if shortened_region_text + else backend_text + ) + else: + backends_display = ( + f"{backend_text} ({region_text})" if region_text else backend_text + ) + else: + backends_display = backend_text + + memory_gb = f"{gpu_group.memory_mib // 1024}GB" + if gpu_group.count.min == gpu_group.count.max: + count_range = str(gpu_group.count.min) + else: + count_range = f"{gpu_group.count.min}..{gpu_group.count.max}" + + gpu_spec = f"{gpu_group.name}:{memory_gb}:{count_range}" + + spot_types = [] + if "spot" in gpu_group.spot: + spot_types.append("spot") + if "on-demand" in gpu_group.spot: + spot_types.append("on-demand") + spot_display = ", ".join(spot_types) + + if gpu_group.price.min == gpu_group.price.max: + price_display = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + else: + min_formatted = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + max_formatted = f"{gpu_group.price.max:.4f}".rstrip("0").rstrip(".") + price_display = f"{min_formatted}..{max_formatted}" + + availability = "" + has_available = any(av.is_available() for av in gpu_group.availability) + if not has_available: + availability = ", ".join( + map(format_instance_availability, set(gpu_group.availability)) + ) + + secondary_style = "grey58" + row_data = [ + f"[{secondary_style}]{i}[/]", + gpu_spec, + f"[{secondary_style}]{spot_display}[/]", + f"[{secondary_style}]{price_display}[/]", + f"[{secondary_style}]{backends_display}[/]", + ] + if region_column: + row_data.append(f"[{secondary_style}]{region_text}[/]") + row_data.append(f"[{secondary_style}]{availability}[/]") + + table.add_row(*row_data) + + console.print(table) + + +def 
print_filter_info(run_spec: RunSpec, group_by: List[str], project: str): + """Print filter information for GPU display.""" + props = Table(box=None, show_header=False) + props.add_column(no_wrap=True) + props.add_column() + + req = Requirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.merged_profile.reservation, + ) + + pretty_req = req.pretty_format(resources_only=True) + max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "-" + + if req.spot is None: + spot_policy = "auto" + elif req.spot: + spot_policy = "spot" + else: + spot_policy = "on-demand" + + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + props.add_row(th("Project"), project) + # TODO: Show user name + props.add_row(th("Resources"), pretty_req) + props.add_row(th("Spot policy"), spot_policy) + props.add_row(th("Max price"), max_price) + props.add_row(th("Reservation"), run_spec.configuration.reservation or "-") + if group_by: + props.add_row(th("Group by"), ", ".join(group_by)) + + console.print(props) + console.print() diff --git a/src/dstack/_internal/cli/utils/rich.py b/src/dstack/_internal/cli/utils/rich.py index 891eef4d74..1a89059202 100644 --- a/src/dstack/_internal/cli/utils/rich.py +++ b/src/dstack/_internal/cli/utils/rich.py @@ -1,8 +1,12 @@ import logging from datetime import datetime +from types import TracebackType from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union +from rich.console import Group +from rich.live import Live from rich.logging import RichHandler +from rich.spinner import Spinner from rich.text import Text from rich.traceback import Traceback @@ -122,3 +126,35 @@ def prepend_path(self, message, record): path = path + "[/link]" message = f"[log.path]{path}[/] " + message return message + + +class MultiItemStatus: + """An alternative to rich.status.Status that 
allows extra renderables below the spinner""" + + def __init__(self, status: "RenderableType", *, console: Optional["Console"] = None) -> None: + self._spinner = Spinner("dots", text=status, style="status.spinner") + self._live = Live( + renderable=self._spinner, + console=console, + refresh_per_second=12.5, + transient=True, + ) + + def update( + self, *renderables: "RenderableType", status: Optional["RenderableType"] = None + ) -> None: + if status is not None: + self._spinner.update(text=status) + self._live.update(renderable=Group(self._spinner, *renderables)) + + def __enter__(self) -> "MultiItemStatus": + self._live.start() + return self + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: + self._live.stop() diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index 939d4ff2e8..42f6388ab3 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -1,22 +1,107 @@ -from typing import List, Optional +import shutil +from enum import Enum +from typing import Any, Dict, List, Optional from rich.markup import escape from rich.table import Table -from dstack._internal.cli.utils.common import add_row_from_dict, console -from dstack._internal.core.models.instances import InstanceAvailability -from dstack._internal.core.models.profiles import TerminationPolicy +from dstack._internal.cli.models.offers import OfferCommandOutput, OfferRequirements +from dstack._internal.cli.models.runs import PsCommandOutput +from dstack._internal.cli.utils.common import ( + NO_FLEETS_WARNING, + NO_OFFERS_WARNING, + add_row_from_dict, + console, + format_backend, + format_instance_availability, +) +from dstack._internal.core.models.configurations import DevEnvironmentConfiguration +from dstack._internal.core.models.instances import ( + InstanceOfferWithAvailability, + InstanceType, +) +from 
dstack._internal.core.models.profiles import ( + DEFAULT_RUN_TERMINATION_IDLE_TIME, + CreationPolicy, + SpotPolicy, + TerminationPolicy, +) from dstack._internal.core.models.runs import ( + ImagePullProgress, Job, - JobTerminationReason, + JobStatus, + JobSubmission, + Probe, + ProbeSpec, RunPlan, - RunTerminationReason, + RunStatus, + get_policy_map, +) +from dstack._internal.core.models.runs import ( + Run as CoreRun, +) +from dstack._internal.core.services.profiles import get_termination +from dstack._internal.utils.common import ( + DateFormatter, + batched, + format_duration_multiunit, + format_pretty_duration, + pretty_date, ) -from dstack._internal.utils.common import format_pretty_duration, pretty_date from dstack.api import Run -def print_run_plan(run_plan: RunPlan, offers_limit: int = 3): +class RunWaitStatus(str, Enum): + WAITING_FOR_REQUESTS = "waiting for requests" + WAITING_FOR_SCHEDULE = "waiting for schedule" + + +_OFFER_FLEET_HINT = ( + "Hint: Existing fleets are ignored, and all available offers are shown." + " To filter by fleet, pass --fleet NAME." 
+) + + +def print_offers_json(run_plan: RunPlan, run_spec): + """Print offers information in JSON format.""" + job_plan = run_plan.job_plans[0] + + requirements = OfferRequirements( + resources=job_plan.job_spec.requirements.resources, + max_price=job_plan.job_spec.requirements.max_price, + spot=get_policy_map(run_spec.configuration.spot_policy, default=SpotPolicy.AUTO), + reservation=run_plan.run_spec.configuration.reservation, + ) + + output = OfferCommandOutput( + project=run_plan.project_name, + user=run_plan.user, + requirements=requirements, + offers=job_plan.offers, + total_offers=job_plan.total_offers, + ) + + print(output.json()) + + +def print_runs_json(project: str, runs: List[Run]) -> None: + """Print runs information in JSON format.""" + output = PsCommandOutput( + project=project, + runs=[r._run for r in runs], + ) + print(output.json()) + + +def print_run_plan( + run_plan: RunPlan, + max_offers: Optional[int] = None, + include_run_properties: bool = True, + no_fleets: bool = False, + verbose: bool = False, + show_offer_fleet_hint: bool = False, +): + run_spec = run_plan.get_effective_run_spec() job_plan = run_plan.job_plans[0] props = Table(box=None, show_header=False) @@ -25,22 +110,35 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3): req = job_plan.job_spec.requirements pretty_req = req.pretty_format(resources_only=True) - max_price = f"${req.max_price:g}" if req.max_price else "-" + max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "off" max_duration = ( - f"{job_plan.job_spec.max_duration / 3600:g}h" if job_plan.job_spec.max_duration else "-" + format_pretty_duration(job_plan.job_spec.max_duration) + if job_plan.job_spec.max_duration + else "off" ) + inactivity_duration = None + if isinstance(run_spec.configuration, DevEnvironmentConfiguration): + inactivity_duration = "off" + if isinstance(run_spec.configuration.inactivity_duration, int): + inactivity_duration = format_pretty_duration( + 
run_spec.configuration.inactivity_duration + ) if job_plan.job_spec.retry is None: - retry = "no" + retry = "off" else: retry = escape(job_plan.job_spec.retry.pretty_format()) - profile = run_plan.run_spec.merged_profile + profile = run_spec.merged_profile creation_policy = profile.creation_policy - termination_policy = profile.termination_policy + # FIXME: This assumes the default idle_duration is the same for client and server. + # If the server changes idle_duration, old clients will see incorrect value. + termination_policy, termination_idle_time = get_termination( + profile, DEFAULT_RUN_TERMINATION_IDLE_TIME + ) if termination_policy == TerminationPolicy.DONT_DESTROY: - termination_idle_time = "-" + idle_duration = "-" else: - termination_idle_time = format_pretty_duration(profile.termination_idle_time) + idle_duration = format_pretty_duration(termination_idle_time) if req.spot is None: spot_policy = "auto" @@ -52,154 +150,445 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3): def th(s: str) -> str: return f"[bold]{s}[/bold]" - props.add_row(th("Configuration"), run_plan.run_spec.configuration_path) props.add_row(th("Project"), run_plan.project_name) props.add_row(th("User"), run_plan.user) - props.add_row(th("Pool"), profile.pool_name) - props.add_row(th("Min resources"), pretty_req) - props.add_row(th("Max price"), max_price) - props.add_row(th("Max duration"), max_duration) + if include_run_properties: + configuration_type = run_spec.configuration.type + if run_spec.configuration.type == "task": + configuration_type += f" (nodes={run_spec.configuration.nodes})" + props.add_row(th("Type"), configuration_type) + props.add_row(th("Resources"), pretty_req) props.add_row(th("Spot policy"), spot_policy) - props.add_row(th("Retry policy"), retry) - props.add_row(th("Creation policy"), creation_policy) - props.add_row(th("Termination policy"), termination_policy) - props.add_row(th("Termination idle time"), termination_idle_time) + 
props.add_row(th("Max price"), max_price) + if include_run_properties: + props.add_row(th("Retry policy"), retry) + if verbose or creation_policy != CreationPolicy.REUSE_OR_CREATE: + props.add_row(th("Creation policy"), creation_policy) + props.add_row(th("Idle duration"), idle_duration) + props.add_row(th("Max duration"), max_duration) + if inactivity_duration is not None: # only set for dev-environment + props.add_row(th("Inactivity duration"), inactivity_duration) + if verbose or run_spec.configuration.reservation: + props.add_row(th("Reservation"), run_spec.configuration.reservation or "no") - offers = Table(box=None) + offers = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) offers.add_column("#") - offers.add_column("BACKEND") - offers.add_column("REGION") - offers.add_column("INSTANCE") - offers.add_column("RESOURCES") - offers.add_column("SPOT") - offers.add_column("PRICE") + offers.add_column("BACKEND", style="grey58", ratio=2) + offers.add_column("RESOURCES", ratio=4) + offers.add_column("INSTANCE TYPE", style="grey58", no_wrap=True, ratio=2) + offers.add_column("PRICE", style="grey58", ratio=1) offers.add_column() - job_plan.offers = job_plan.offers[:offers_limit] + displayed_offers = job_plan.offers[:max_offers] if max_offers else job_plan.offers - for i, offer in enumerate(job_plan.offers, start=1): + for i, offer in enumerate(displayed_offers, start=1): r = offer.instance.resources - availability = "" - if offer.availability in { - InstanceAvailability.NOT_AVAILABLE, - InstanceAvailability.NO_QUOTA, - InstanceAvailability.IDLE, - InstanceAvailability.BUSY, - }: - availability = offer.availability.value.replace("_", " ").lower() + instance = offer.instance.name + if offer.total_blocks > 1: + instance += f" ({offer.blocks}/{offer.total_blocks})" offers.add_row( f"{i}", - offer.backend.replace("remote", "ssh"), - offer.region, - offer.instance.name, - r.pretty_format(), - "yes" if r.spot else "no", - 
f"${offer.price:g}", - availability, - style=None if i == 1 else "secondary", + format_backend(offer.backend, offer.region), + r.pretty_format(include_spot=True), + instance, + f"${offer.price:.4f}".rstrip("0").rstrip("."), + format_instance_availability(offer.availability), + style=None if i == 1 or not include_run_properties else "secondary", ) - if job_plan.total_offers > len(job_plan.offers): + if job_plan.total_offers > len(displayed_offers): offers.add_row("", "...", style="secondary") console.print(props) console.print() - if len(job_plan.offers) > 0: + if len(displayed_offers) > 0: + show_offer_fleet_hint_before_table = ( + show_offer_fleet_hint + and job_plan.total_offers <= len(displayed_offers) + and len(displayed_offers) < 3 + ) + show_offer_fleet_hint_after_table = ( + show_offer_fleet_hint and not show_offer_fleet_hint_before_table + ) + if show_offer_fleet_hint_before_table: + console.print(f"[secondary]{_OFFER_FLEET_HINT}[/]") + console.print() console.print(offers) - if job_plan.total_offers > len(job_plan.offers): + if job_plan.total_offers > len(displayed_offers): console.print( - f"[secondary] Shown {len(job_plan.offers)} of {job_plan.total_offers} offers, " - f"${job_plan.max_price:g} max[/]" + f"[secondary] Shown {len(displayed_offers)} of {job_plan.total_offers} offers, " + f"${job_plan.max_price:3f}".rstrip("0").rstrip(".") + + "max[/]" ) + if show_offer_fleet_hint_after_table: + console.print(f"[secondary]{_OFFER_FLEET_HINT}[/]") console.print() + else: + console.print(NO_FLEETS_WARNING if no_fleets else NO_OFFERS_WARNING) + + +def get_run_wait_status(run: CoreRun) -> Optional[RunWaitStatus]: + # Only synthesize a CLI-specific waiting state when the server did not provide + # a more specific run-level message such as "retrying". 
+ if run.status_message not in ("", run.status.value): + return None + + if run.status == RunStatus.PENDING and run.next_triggered_at is not None: + return RunWaitStatus.WAITING_FOR_SCHEDULE + + if _is_waiting_for_requests(run): + return RunWaitStatus.WAITING_FOR_REQUESTS + + return None + + +def _is_waiting_for_requests(run: CoreRun) -> bool: + if run.run_spec.configuration.type != "service": + return False + if run.service is None or run.next_triggered_at is not None: + return False + if run.status not in (RunStatus.SUBMITTED, RunStatus.PENDING): + return False + return not any(_is_job_active(job.job_submissions[-1].status) for job in run.jobs) + + +def _is_job_active(status: JobStatus) -> bool: + return status in ( + JobStatus.SUBMITTED, + JobStatus.PROVISIONING, + JobStatus.PULLING, + JobStatus.RUNNING, + ) + + +def _format_run_status(run) -> str: + status_text = ( + run.latest_job_submission.status_message + if run.status.is_finished() and run.latest_job_submission + else run.status_message + ) + # Inline of _get_run_status_style + color_map = { + RunStatus.PENDING: "white", + RunStatus.SUBMITTED: "grey", + RunStatus.PROVISIONING: "deep_sky_blue1", + RunStatus.RUNNING: "sea_green3", + RunStatus.TERMINATING: "deep_sky_blue1", + RunStatus.TERMINATED: "grey", + RunStatus.FAILED: "indian_red1", + RunStatus.DONE: "grey", + } + if status_text in ("no offers", "interrupted"): + color = "gold1" + elif status_text == "no fleets": + color = "indian_red1" + elif status_text == "pulling": + color = "sea_green3" + else: + color = color_map.get(run.status, "white") + status_style = f"bold {color}" if not run.status.is_finished() else color + return f"[{status_style}]{status_text}[/]" + + +def _format_job_submission_status(job_submission: JobSubmission, verbose: bool) -> str: + status_message = job_submission.status_message + job_status = job_submission.status + if status_message in ("no offers", "interrupted"): + color = "gold1" + elif status_message == "no fleets": + color 
= "indian_red1" + elif status_message == "stopped": + color = "grey" + else: + color_map = { + JobStatus.SUBMITTED: "grey", + JobStatus.PROVISIONING: "deep_sky_blue1", + JobStatus.PULLING: "sea_green3", + JobStatus.RUNNING: "sea_green3", + JobStatus.TERMINATING: "deep_sky_blue1", + JobStatus.TERMINATED: "grey", + JobStatus.ABORTED: "gold1", + JobStatus.FAILED: "indian_red1", + JobStatus.DONE: "grey", + } + color = color_map.get(job_status, "white") + status_style = f"bold {color}" if not job_status.is_finished() else color + formatted_status_message = f"[{status_style}]{status_message}[/]" + if job_status == JobStatus.PULLING and job_submission.image_pull_progress is not None: + formatted_status_message += ( + f" [secondary]{_format_pull_progress(job_submission.image_pull_progress)}[/]" + ) + if verbose and job_submission.inactivity_secs: + inactive_for = format_duration_multiunit(job_submission.inactivity_secs) + formatted_status_message += f" (inactive for {inactive_for})" + return formatted_status_message + + +def _format_pull_progress(progress: ImagePullProgress) -> str: + if progress.total_bytes >= 2**30: # 1GB + unit = "GB" + + def f(x: int) -> str: + return f"{x / 2**30:.2f}" + else: + unit = "MB" + + def f(x: int) -> str: + return f"{x / 2**20:.0f}" + + # NOTE: The format is documented in protips.md. Keep in sync. 
+ total_sign = "≥" if not progress.is_total_bytes_final else "" + return f"{f(progress.extracted_bytes)}/{f(progress.downloaded_bytes)}/{total_sign}{f(progress.total_bytes)}{unit}" + + +def _get_show_deployment_replica_job(run: CoreRun, verbose: bool) -> tuple[bool, bool, bool]: + show_deployment_num = ( + verbose and run.run_spec.configuration.type == "service" + ) or run.is_deployment_in_progress() + + replica_nums = {job.job_spec.replica_num for job in run.jobs} + show_replica = len(replica_nums) > 1 + + jobs_by_replica: Dict[int, List[Any]] = {} + for job in run.jobs: + replica_num = job.job_spec.replica_num + if replica_num not in jobs_by_replica: + jobs_by_replica[replica_num] = [] + jobs_by_replica[replica_num].append(job) + + show_job = any( + len({j.job_spec.job_num for j in jobs}) > 1 for jobs in jobs_by_replica.values() + ) + + return show_deployment_num, show_replica, show_job + + +def _format_job_name( + job: Job, + latest_job_submission: JobSubmission, + show_deployment_num: bool, + show_replica: bool, + show_job: bool, + group_index: Optional[int] = None, + last_shown_group_index: Optional[int] = None, +) -> str: + name_parts = [] + prefix = "" + if show_replica: + # Show group information if replica groups are used + if group_index is not None: + # Show group=X replica=Y when group changes, or just replica=Y when same group + if group_index != last_shown_group_index: + # First job in group: use 3 spaces indent + prefix = " " + name_parts.append(f"group={group_index} replica={job.job_spec.replica_num}") + else: + # Subsequent job in same group: align "replica=" with first job's "replica=" + # Calculate padding: width of " group={last_shown_group_index} " + padding_width = 3 + len(f"group={last_shown_group_index}") + 1 + prefix = " " * padding_width + name_parts.append(f"replica={job.job_spec.replica_num}") + else: + # Legacy behavior: no replica groups + prefix = " " + name_parts.append(f"replica={job.job_spec.replica_num}") + else: + prefix = " " + 
+ if show_job: + name_parts.append(f"job={job.job_spec.job_num}") + name_suffix = ( + f" deployment={latest_job_submission.deployment_num}" if show_deployment_num else "" + ) + name_value = prefix + (" ".join(name_parts) if name_parts else "") + name_value += name_suffix + return name_value + + +def _format_price(price: float, is_spot: bool) -> str: + price_str = f"${price:.4f}".rstrip("0").rstrip(".") + if is_spot: + price_str += " (spot)" + return price_str -def generate_runs_table( - runs: List[Run], include_configuration: bool = False, verbose: bool = False +def _format_instance_type( + instance_type: InstanceType, + shared_offer: Optional[InstanceOfferWithAvailability], + reservation: Optional[str], +) -> str: + instance_type_str = instance_type.name + if shared_offer is not None: + instance_type_str += f" ({shared_offer.blocks}/{shared_offer.total_blocks})" + if reservation is not None: + instance_type_str += f" ({reservation})" + return instance_type_str + + +def _format_run_name(run: CoreRun, show_deployment_num: bool) -> str: + parts: List[str] = [run.run_spec.run_name] + if show_deployment_num: + parts.append(f" [secondary]deployment={run.deployment_num}[/]") + return "".join(parts) + + +def get_runs_table( + runs: List[Run], verbose: bool = False, format_date: DateFormatter = pretty_date ) -> Table: - table = Table(box=None) - table.add_column("NAME", style="bold", no_wrap=True) - if include_configuration: - table.add_column("CONFIGURATION", style="grey58") - table.add_column("BACKEND", style="grey58", no_wrap=True, max_width=16) - table.add_column("REGION", style="grey58") + table = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) + table.add_column("NAME", style="bold", no_wrap=True, ratio=2) + table.add_column("BACKEND", style="grey58", ratio=2) if verbose: - table.add_column("INSTANCE", no_wrap=True) - table.add_column("RESOURCES") - table.add_column("SPOT") - table.add_column("PRICE", no_wrap=True) - 
table.add_column("STATUS", no_wrap=True) - table.add_column("SUBMITTED", style="grey58", no_wrap=True) + table.add_column("RESOURCES", style="grey58", ratio=3) + table.add_column("INSTANCE TYPE", style="grey58", no_wrap=True, ratio=1) + else: + table.add_column("GPU", ratio=2) + table.add_column("PRICE", style="grey58", ratio=1) + table.add_column("STATUS", ratio=1) + if verbose or any( + run._run.is_deployment_in_progress() + and any(job.job_submissions[-1].probes for job in run._run.jobs) + for run in runs + ): + table.add_column("PROBES", ratio=1) + table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1) if verbose: - table.add_column("ERROR", no_wrap=True) + table.add_column("ERROR", no_wrap=True, ratio=2) for run in runs: - run_error = _get_run_error(run) run = run._run # TODO(egor-s): make public attribute + show_deployment_num, show_replica, show_job = _get_show_deployment_replica_job( + run, verbose + ) + merge_job_rows = len(run.jobs) == 1 and not show_deployment_num + + group_name_to_index: Dict[str, int] = {} + if run.run_spec.configuration.type == "service" and hasattr( + run.run_spec.configuration, "replica_groups" + ): + replica_groups = run.run_spec.configuration.replica_groups + if replica_groups: + for idx, group in enumerate(replica_groups): + assert group.name is not None, "Group name is always set" + group_name = group.name + group_name_to_index[group_name] = idx run_row = { - "NAME": run.run_spec.run_name, - "CONFIGURATION": run.run_spec.configuration_path, - "STATUS": run.status, - "SUBMITTED": pretty_date(run.submitted_at), - "ERROR": run_error, + "NAME": _format_run_name(run, show_deployment_num), + "SUBMITTED": format_date(run.submitted_at), + "STATUS": _format_run_status(run), + "RESOURCES": "-", + "GPU": "-", + "PRICE": "-", } - if len(run.jobs) != 1: + if run.error: + run_row["ERROR"] = run.error + if not merge_job_rows: add_row_from_dict(table, run_row) - for job in run.jobs: + # Sort jobs by group index first, then by 
replica_num within each group + def get_job_sort_key(job: Job) -> tuple: + group_index = None + if group_name_to_index: + group_index = group_name_to_index.get(job.job_spec.replica_group) + # Use a large number for jobs without groups to put them at the end + return (group_index if group_index is not None else 999999, job.job_spec.replica_num) + + sorted_jobs = sorted(run.jobs, key=get_job_sort_key) + + last_shown_group_index: Optional[int] = None + for job in sorted_jobs: + latest_job_submission = job.job_submissions[-1] + status_formatted = _format_job_submission_status(latest_job_submission, verbose) + + # Get group index for this job + group_index: Optional[int] = None + if group_name_to_index: + group_index = group_name_to_index.get(job.job_spec.replica_group) + job_row = { - "NAME": f" replica {job.job_spec.replica_num}\n job_num {job.job_spec.job_num}", - "STATUS": job.job_submissions[-1].status, - "SUBMITTED": pretty_date(job.job_submissions[-1].submitted_at), - "ERROR": _get_job_error(job), + "NAME": _format_job_name( + job, + latest_job_submission, + show_deployment_num, + show_replica, + show_job, + group_index=group_index, + last_shown_group_index=last_shown_group_index, + ), + "STATUS": status_formatted, + "PROBES": _format_job_probes( + job.job_spec.probes, latest_job_submission.probes, latest_job_submission.status + ), + "SUBMITTED": format_date(latest_job_submission.submitted_at), + "ERROR": latest_job_submission.error, + "RESOURCES": "-", + "GPU": "-", + "PRICE": "-", } - jpd = job.job_submissions[-1].job_provisioning_data + # Update last shown group index for next iteration + if group_index is not None: + last_shown_group_index = group_index + jpd = latest_job_submission.job_provisioning_data if jpd is not None: + shared_offer: Optional[InstanceOfferWithAvailability] = None + instance_type = jpd.instance_type + price = jpd.price + jrd = latest_job_submission.job_runtime_data + if jrd is not None and jrd.offer is not None and jrd.offer.total_blocks 
> 1: + # We only use offer data from jrd if the job is/was running on a shared + # instance (the instance blocks feature). In that case, jpd contains the full + # instance offer data, while jrd contains the shared offer (a fraction of + # the full offer). Although jrd always contains the offer, we don't use it in + # other cases, as, unlike jpd offer data, jrd offer is not updated after + # Compute.update_provisioning_data() call, but some backends, namely + # Kubernetes, may update offer data via that method. + # As long as we don't have a backend which both supports the blocks feature + # and may update offer data in update_provisioning_data(), this logic is fine. + shared_offer = jrd.offer + instance_type = shared_offer.instance + price = shared_offer.price + resources = instance_type.resources job_row.update( { - "BACKEND": jpd.backend.value.replace("remote", "ssh"), - "REGION": jpd.region, - "INSTANCE": jpd.instance_type.name, - "RESOURCES": jpd.instance_type.resources.pretty_format(), - "SPOT": "yes" if jpd.instance_type.resources.spot else "no", - "PRICE": f"${jpd.price:.4}", + "BACKEND": format_backend(jpd.backend, jpd.region), + "RESOURCES": resources.pretty_format(include_spot=False), + "GPU": resources.pretty_format(gpu_only=True, include_spot=False), + "INSTANCE TYPE": _format_instance_type( + instance_type, shared_offer, jpd.reservation + ), + "PRICE": _format_price(price, resources.spot), } ) - if len(run.jobs) == 1: - # merge rows + if merge_job_rows: + _status = job_row["STATUS"] + _resources = job_row["RESOURCES"] + _gpu = job_row["GPU"] + _price = job_row["PRICE"] job_row.update(run_row) + job_row["RESOURCES"] = _resources + job_row["GPU"] = _gpu + job_row["PRICE"] = _price + job_row["STATUS"] = _status add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None) return table -def _get_run_error(run: Run) -> str: - if run._run.termination_reason is None: - return "" - if len(run._run.jobs) > 1: - return 
run._run.termination_reason.name - run_job_termination_reason = _get_run_job_termination_reason(run) - # For failed runs, also show termination reason to provide more context. - # For other run statuses, the job termination reason will duplicate run status. - if run_job_termination_reason is not None and run._run.termination_reason in [ - RunTerminationReason.JOB_FAILED, - RunTerminationReason.SERVER_ERROR, - RunTerminationReason.RETRY_LIMIT_EXCEEDED, - ]: - return f"{run._run.termination_reason.name}\n({run_job_termination_reason.name})" - return run._run.termination_reason.name - - -def _get_run_job_termination_reason(run: Run) -> Optional[JobTerminationReason]: - for job in run._run.jobs: - if len(job.job_submissions) > 0: - if job.job_submissions[-1].termination_reason is not None: - return job.job_submissions[-1].termination_reason - return None - - -def _get_job_error(job: Job) -> str: - if job.job_submissions[-1].termination_reason is None: +def _format_job_probes( + probe_specs: list[ProbeSpec], probes: list[Probe], job_status: JobStatus +) -> str: + if not probes or job_status != JobStatus.RUNNING: return "" - return job.job_submissions[-1].termination_reason.name + statuses = [] + for probe_spec, probe in zip(probe_specs, probes): + # NOTE: the symbols are documented in concepts/services.md, keep in sync. 
+ if probe.success_streak >= probe_spec.ready_after: + status = "[code]✓[/]" + elif probe.success_streak > 0: + status = "[warning]~[/]" + else: + status = "[error]×[/]" + statuses.append(status) + # split into whitespace-delimited batches to allow column wrapping + return " ".join("".join(batch) for batch in batched(statuses, 5)) diff --git a/src/dstack/_internal/cli/utils/secrets.py b/src/dstack/_internal/cli/utils/secrets.py new file mode 100644 index 0000000000..5fcbb5a99a --- /dev/null +++ b/src/dstack/_internal/cli/utils/secrets.py @@ -0,0 +1,25 @@ +from typing import List + +from rich.table import Table + +from dstack._internal.cli.utils.common import add_row_from_dict, console +from dstack._internal.core.models.secrets import Secret + + +def print_secrets_table(secrets: List[Secret]) -> None: + console.print(get_secrets_table(secrets)) + console.print() + + +def get_secrets_table(secrets: List[Secret]) -> Table: + table = Table(box=None) + table.add_column("NAME", no_wrap=True) + table.add_column("VALUE") + + for secret in secrets: + row = { + "NAME": secret.name, + "VALUE": secret.value or "*" * 6, + } + add_row_from_dict(table, row) + return table diff --git a/src/dstack/_internal/cli/utils/updates.py b/src/dstack/_internal/cli/utils/updates.py index 66f2363c65..418cefb3b4 100644 --- a/src/dstack/_internal/cli/utils/updates.py +++ b/src/dstack/_internal/cli/utils/updates.py @@ -57,10 +57,22 @@ def _is_last_check_time_outdated() -> bool: ) +def is_update_available(current_version: str, latest_version: str) -> bool: + """ + Return True if latest_version is newer than current_version. + Pre-releases are only considered if the current version is also a pre-release. 
+ """ + _current_version = pkg_version.parse(str(current_version)) + _latest_version = pkg_version.parse(str(latest_version)) + return _current_version < _latest_version and ( + not _latest_version.is_prerelease or _current_version.is_prerelease + ) + + def _check_version(): latest_version = get_latest_version() if latest_version is not None: - if pkg_version.parse(str(version.__version__)) < pkg_version.parse(latest_version): + if is_update_available(version.__version__, latest_version): console.print(f"A new version of dstack is available: [code]{latest_version}[/]\n") @@ -79,8 +91,7 @@ def _get_last_check_path() -> Path: def check_for_updates(): - current_version = version.__version__ - if current_version: + if version.__is_release__: if _is_last_check_time_outdated(): logger.debug("Checking for updates...") _check_version() diff --git a/src/dstack/_internal/cli/utils/volume.py b/src/dstack/_internal/cli/utils/volume.py index 757c6eb01e..d9cec1e57f 100644 --- a/src/dstack/_internal/cli/utils/volume.py +++ b/src/dstack/_internal/cli/utils/volume.py @@ -2,28 +2,58 @@ from rich.table import Table -from dstack._internal.cli.utils.common import console +from dstack._internal.cli.utils.common import add_row_from_dict, console from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.common import pretty_date +from dstack._internal.utils.common import DateFormatter, pretty_date def print_volumes_table(volumes: List[Volume], verbose: bool = False): + table = get_volumes_table(volumes, verbose=verbose) + console.print(table) + console.print() + + +def get_volumes_table( + volumes: List[Volume], verbose: bool = False, format_date: DateFormatter = pretty_date +) -> Table: table = Table(box=None) table.add_column("NAME", no_wrap=True) table.add_column("BACKEND") - table.add_column("REGION") + if verbose: + table.add_column("REGION") table.add_column("STATUS") + if verbose: + table.add_column("ATTACHED") table.add_column("CREATED") + if verbose: + 
table.add_column("ERROR") for volume in volumes: - renderables = [ - volume.name, - volume.configuration.backend, - volume.configuration.region, - volume.status, - pretty_date(volume.created_at), - ] - table.add_row(*renderables) - - console.print(table) - console.print() + backend = volume.get_backend().value + region = volume.get_region() + if verbose: + # In verbose mode, BACKEND displays `backend` only, and REGION displays nothing or + # `region` or `region (az)` + if availability_zone := volume.get_availability_zone(): + region = f"{region} ({availability_zone})" + elif region: + # In non-verbose mode, BACKEND displays `backend` or `backend (region)`, and REGION + # is hidden + backend = f"{backend} ({region})" + attached = "-" + if volume.attachments is not None: + attached = ", ".join( + {va.instance.fleet_name for va in volume.attachments if va.instance.fleet_name} + ) + attached = attached or "-" + row = { + "NAME": volume.name, + "BACKEND": backend, + "REGION": region, + "STATUS": volume.status, + "ATTACHED": attached, + "CREATED": format_date(volume.created_at), + "ERROR": volume.status_message, + } + add_row_from_dict(table, row) + return table diff --git a/src/dstack/_internal/compat.py b/src/dstack/_internal/compat.py new file mode 100644 index 0000000000..4e17099eb7 --- /dev/null +++ b/src/dstack/_internal/compat.py @@ -0,0 +1,3 @@ +import os + +IS_WINDOWS = os.name == "nt" diff --git a/src/dstack/_internal/core/backends/__init__.py b/src/dstack/_internal/core/backends/__init__.py index 1dfeb5f159..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/__init__.py +++ b/src/dstack/_internal/core/backends/__init__.py @@ -1,28 +0,0 @@ -from dstack._internal.core.models.backends.base import BackendType - -BACKENDS_WITH_MULTINODE_SUPPORT = [ - BackendType.AWS, - BackendType.AZURE, - BackendType.GCP, - BackendType.REMOTE, - BackendType.OCI, -] -BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = [ - BackendType.AWS, - BackendType.DSTACK, - BackendType.AZURE, - 
BackendType.CUDO, - BackendType.DATACRUNCH, - BackendType.GCP, - BackendType.LAMBDA, - BackendType.OCI, - BackendType.TENSORDOCK, -] -BACKENDS_WITH_GATEWAY_SUPPORT = [ - BackendType.AWS, - BackendType.AZURE, - BackendType.GCP, - BackendType.KUBERNETES, -] -BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = [BackendType.AWS] -BACKENDS_WITH_VOLUMES_SUPPORT = [BackendType.AWS, BackendType.LOCAL] diff --git a/src/dstack/_internal/core/backends/amddevcloud/__init__.py b/src/dstack/_internal/core/backends/amddevcloud/__init__.py new file mode 100644 index 0000000000..16e553969f --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/__init__.py @@ -0,0 +1 @@ +# This package contains the implementation for the AMDDevCloud backend. diff --git a/src/dstack/_internal/core/backends/amddevcloud/backend.py b/src/dstack/_internal/core/backends/amddevcloud/backend.py new file mode 100644 index 0000000000..9a0477d760 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.amddevcloud.compute import AMDDevCloudCompute +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType + + +class AMDDevCloudBackend(BaseDigitalOceanBackend): + TYPE = BackendType.AMDDEVCLOUD + COMPUTE_CLASS = AMDDevCloudCompute + + def __init__(self, config: BaseDigitalOceanConfig, api_url: str): + self.config = config + self._compute = AMDDevCloudCompute(self.config, api_url=api_url, type=self.TYPE) + + def compute(self) -> AMDDevCloudCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/amddevcloud/compute.py b/src/dstack/_internal/core/backends/amddevcloud/compute.py new file mode 100644 index 0000000000..945eb63f93 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/compute.py @@ -0,0 +1,5 @@ +from 
dstack._internal.core.backends.digitalocean_base.compute import BaseDigitalOceanCompute + + +class AMDDevCloudCompute(BaseDigitalOceanCompute): + pass diff --git a/src/dstack/_internal/core/backends/amddevcloud/configurator.py b/src/dstack/_internal/core/backends/amddevcloud/configurator.py new file mode 100644 index 0000000000..2f00f359eb --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/configurator.py @@ -0,0 +1,29 @@ +from typing import Optional + +from dstack._internal.core.backends.amddevcloud.backend import AMDDevCloudBackend +from dstack._internal.core.backends.base.configurator import BackendRecord +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.configurator import ( + BaseDigitalOceanConfigurator, +) +from dstack._internal.core.backends.digitalocean_base.models import AnyBaseDigitalOceanCreds +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class AMDDevCloudConfigurator(BaseDigitalOceanConfigurator): + TYPE = BackendType.AMDDEVCLOUD + BACKEND_CLASS = AMDDevCloudBackend + API_URL = "https://api-amd.digitalocean.com" + + def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: + config = self._get_config(record) + return AMDDevCloudBackend(config=config, api_url=self.API_URL) + + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): + api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) + api_client.validate_api_key() + if project_name: + api_client.validate_project_name(project_name) diff --git a/src/dstack/_internal/core/backends/aws/__init__.py b/src/dstack/_internal/core/backends/aws/__init__.py index badc10bf3e..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/aws/__init__.py +++ 
b/src/dstack/_internal/core/backends/aws/__init__.py @@ -1,25 +0,0 @@ -import botocore.exceptions - -from dstack._internal.core.backends.aws.compute import AWSCompute -from dstack._internal.core.backends.aws.config import AWSConfig -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.errors import BackendInvalidCredentialsError -from dstack._internal.core.models.backends.base import BackendType - - -class AWSBackend(Backend): - TYPE: BackendType = BackendType.AWS - - def __init__(self, config: AWSConfig): - self.config = config - self._compute = AWSCompute(self.config) - self._check_credentials() - - def compute(self) -> AWSCompute: - return self._compute - - def _check_credentials(self): - try: - pass - except (botocore.exceptions.ClientError, botocore.exceptions.NoCredentialsError): - raise BackendInvalidCredentialsError() diff --git a/src/dstack/_internal/core/backends/aws/auth.py b/src/dstack/_internal/core/backends/aws/auth.py index 1aa8944546..e4e56d4dcb 100644 --- a/src/dstack/_internal/core/backends/aws/auth.py +++ b/src/dstack/_internal/core/backends/aws/auth.py @@ -2,9 +2,8 @@ import botocore.exceptions from boto3.session import Session +from dstack._internal.core.backends.aws.models import AnyAWSCreds, AWSAccessKeyCreds from dstack._internal.core.errors import BackendAuthError -from dstack._internal.core.models.backends.aws import AnyAWSCreds, AWSAccessKeyCreds -from dstack._internal.core.models.common import is_core_model_instance def authenticate(creds: AnyAWSCreds, region: str) -> Session: @@ -14,7 +13,7 @@ def authenticate(creds: AnyAWSCreds, region: str) -> Session: def get_session(creds: AnyAWSCreds, region: str) -> Session: - if is_core_model_instance(creds, AWSAccessKeyCreds): + if isinstance(creds, AWSAccessKeyCreds): return boto3.session.Session( region_name=region, aws_access_key_id=creds.access_key, @@ -29,12 +28,3 @@ def validate_credentials(session: Session): sts.get_caller_identity() except 
(botocore.exceptions.ClientError, botocore.exceptions.NoCredentialsError): raise BackendAuthError() - - -def default_creds_available() -> bool: - session = boto3.session.Session() - try: - validate_credentials(session) - except BackendAuthError: - return False - return True diff --git a/src/dstack/_internal/core/backends/aws/backend.py b/src/dstack/_internal/core/backends/aws/backend.py new file mode 100644 index 0000000000..1169227cc7 --- /dev/null +++ b/src/dstack/_internal/core/backends/aws/backend.py @@ -0,0 +1,31 @@ +from typing import Optional + +import botocore.exceptions + +from dstack._internal.core.backends.aws.compute import AWSCompute +from dstack._internal.core.backends.aws.models import AWSConfig +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.errors import BackendInvalidCredentialsError +from dstack._internal.core.models.backends.base import BackendType + + +class AWSBackend(Backend): + TYPE = BackendType.AWS + COMPUTE_CLASS = AWSCompute + + def __init__(self, config: AWSConfig, compute: Optional[AWSCompute] = None): + self.config = config + if compute is not None: + self._compute = compute + else: + self._compute = AWSCompute(self.config) + self._check_credentials() + + def compute(self) -> AWSCompute: + return self._compute + + def _check_credentials(self): + try: + pass + except (botocore.exceptions.ClientError, botocore.exceptions.NoCredentialsError): + raise BackendInvalidCredentialsError() diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index e160dd1224..cd072c1f5f 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -1,25 +1,59 @@ +import threading +from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Any, Dict, List, Optional, Tuple +from dataclasses import dataclass, field +from typing import Any, 
Callable, Dict, List, Optional, Tuple import boto3 import botocore.client import botocore.exceptions +from cachetools import Cache, TTLCache, cachedmethod +from cachetools.keys import hashkey from pydantic import ValidationError import dstack._internal.core.backends.aws.resources as aws_resources from dstack._internal import settings -from dstack._internal.core.backends.aws.config import AWSConfig +from dstack._internal.core.backends.aws.models import ( + AWSAccessKeyCreds, + AWSConfig, + AWSOSImageConfig, +) from dstack._internal.core.backends.base.compute import ( Compute, + ComputeCache, + ComputeTTLCache, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivateGatewaySupport, + ComputeWithPrivilegedSupport, + ComputeWithReservationSupport, + ComputeWithVolumeSupport, + generate_unique_gateway_instance_name, + generate_unique_instance_name, + generate_unique_short_backend_name, + generate_unique_volume_name, get_gateway_user_data, - get_instance_name, get_user_data, + merge_tags, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.errors import ( + ComputeError, + NoCapacityError, + PlacementGroupInUseError, + PlacementGroupNotSupportedError, + ProvisioningError, ) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.errors import ComputeError, NoCapacityError -from dstack._internal.core.models.backends.aws import AWSAccessKeyCreds from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel, is_core_model_instance +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, @@ 
-29,23 +63,34 @@ InstanceConfiguration, InstanceOffer, InstanceOfferWithAvailability, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run +from dstack._internal.core.models.placement import ( + PlacementGroup, + PlacementGroupProvisioningData, + PlacementStrategy, +) +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements from dstack._internal.core.models.volumes import ( + AWSVolumeConfiguration, Volume, VolumeAttachmentData, VolumeProvisioningData, ) +from dstack._internal.utils.common import get_or_error from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) +# gp2 volumes can be 1GB-16TB, dstack AMIs are 100GB +CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("100GB"), max=Memory.parse("16TB")) +DEFAULT_GATEWAY_INSTANCE_TYPE = "t3.micro" class AWSGatewayBackendData(CoreModel): lb_arn: str tg_arn: str listener_arn: str + http_listener_arn: Optional[str] = None # None for old gateways class AWSVolumeBackendData(CoreModel): @@ -53,57 +98,130 @@ class AWSVolumeBackendData(CoreModel): iops: int -class AWSCompute(Compute): - def __init__(self, config: AWSConfig): +class AWSInstanceBackendData(CoreModel): + eip_allocation_id: Optional[str] = None + """Elastic IP allocated for multi-ENI instances launched with `public_ips: true`. 
+ """ + + +def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs): + return hashkey(*args, **kwargs) + + +@dataclass +class AWSQuotasCache(ComputeTTLCache): + execution_lock: threading.Lock = field(default_factory=threading.Lock) + + +class AWSCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithReservationSupport, + ComputeWithPlacementGroupSupport, + ComputeWithGatewaySupport, + ComputeWithPrivateGatewaySupport, + ComputeWithVolumeSupport, + Compute, +): + def __init__( + self, + config: AWSConfig, + quotas_cache: Optional[ComputeTTLCache] = None, + zones_cache: Optional[ComputeCache] = None, + ): + super().__init__() self.config = config - if is_core_model_instance(config.creds, AWSAccessKeyCreds): + if isinstance(config.creds, AWSAccessKeyCreds): self.session = boto3.Session( aws_access_key_id=config.creds.access_key, aws_secret_access_key=config.creds.secret_key, ) else: # default creds self.session = boto3.Session() + # Caches to avoid redundant API calls when provisioning many instances + # get_offers is already cached but we still cache its sub-functions + # with more aggressive/longer caches. 
+ self._offers_post_filter_cache = ComputeTTLCache(cache=TTLCache(maxsize=10, ttl=180)) + if quotas_cache is None: + quotas_cache = ComputeTTLCache(cache=TTLCache(maxsize=10, ttl=600)) + self._regions_to_quotas_cache = quotas_cache + if zones_cache is None: + zones_cache = ComputeCache(cache=Cache(maxsize=10)) + self._regions_to_zones_cache = zones_cache + self._vpc_id_subnets_ids_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) + self._maximum_efa_interfaces_cache = ComputeCache(cache=Cache(maxsize=100)) + self._subnets_availability_zones_cache = ComputeCache(cache=Cache(maxsize=100)) + self._security_group_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) + self._image_id_and_username_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.AWS, locations=self.config.regions, - requirements=requirements, extra_filter=_supported_instances, ) - regions = set(i.region for i in offers) - - def get_quotas(client: botocore.client.BaseClient) -> Dict[str, int]: - region_quotas = {} - for page in client.get_paginator("list_service_quotas").paginate(ServiceCode="ec2"): - for q in page["Quotas"]: - if "On-Demand" in q["QuotaName"]: - region_quotas[q["UsageMetric"]["MetricDimensions"]["Class"]] = q["Value"] - return region_quotas - - quotas = {} - with ThreadPoolExecutor(max_workers=8) as executor: - future_to_region = {} - for region in regions: - future = executor.submit( - get_quotas, self.session.client("service-quotas", region_name=region) - ) - future_to_region[future] = region - for future in as_completed(future_to_region): - quotas[future_to_region[future]] = future.result() + regions = list(set(i.region for i in offers)) + regions_to_quotas = self._get_regions_to_quotas(self.session, regions) 
+ regions_to_zones = self._get_regions_to_zones(self.session, regions) availability_offers = [] for offer in offers: availability = InstanceAvailability.UNKNOWN - if not _has_quota(quotas[offer.region], offer.instance.name): + quota = _has_quota(regions_to_quotas[offer.region], offer.instance.name) + if quota is not None and not quota: availability = InstanceAvailability.NO_QUOTA availability_offers.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) + offer.with_availability( + availability=availability, + availability_zones=regions_to_zones[offer.region], + ) ) return availability_offers + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] + + def _get_offers_cached_key(self, requirements: Requirements) -> int: + # Requirements is not hashable, so we use a hack to get arguments hash + return hash(requirements.json()) + + @cachedmethod( + cache=lambda self: self._offers_post_filter_cache.cache, + key=_get_offers_cached_key, + lock=lambda self: self._offers_post_filter_cache.lock, + ) + def get_offers_post_filter( + self, requirements: Requirements + ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]: + if requirements.reservation: + region_to_reservation = {} + for region in get_or_error(self.config.regions): + reservation = aws_resources.get_reservation( + ec2_client=self.session.client("ec2", region_name=region), + reservation_id=requirements.reservation, + instance_count=1, + ) + if reservation is not None: + region_to_reservation[region] = reservation + + def reservation_filter(offer: InstanceOfferWithAvailability) -> bool: + # Filter: Spot instances can't be used with reservations + if offer.instance.resources.spot: + return False + region = offer.region + reservation = region_to_reservation.get(region) + # Filter: only instance types matching the capacity reservation + if not bool(reservation and 
offer.instance.name == reservation["InstanceType"]): + return False + return True + + return reservation_filter + + return None + def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None ) -> None: @@ -112,118 +230,278 @@ def terminate_instance( ec2_client.terminate_instances(InstanceIds=[instance_id]) except botocore.exceptions.ClientError as e: if e.response["Error"]["Code"] == "InvalidInstanceID.NotFound": - pass + logger.debug("Skipping instance %s termination. Instance not found.", instance_id) else: raise e + instance_backend_data = _parse_instance_backend_data(backend_data) + if instance_backend_data.eip_allocation_id is not None: + _release_eip( + ec2_client=ec2_client, + allocation_id=instance_backend_data.eip_allocation_id, + ) def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: project_name = instance_config.project_name ec2_resource = self.session.resource("ec2", region_name=instance_offer.region) ec2_client = self.session.client("ec2", region_name=instance_offer.region) allocate_public_ip = self.config.allocate_public_ips - availability_zones = None - if instance_config.availability_zone is not None: - availability_zones = [instance_config.availability_zone] - - tags = [ - {"Key": "Name", "Value": instance_config.instance_name}, - {"Key": "owner", "Value": "dstack"}, - {"Key": "dstack_project", "Value": project_name}, - {"Key": "dstack_user", "Value": instance_config.user}, - ] + zones = instance_offer.availability_zones + if zones is not None and len(zones) == 0: + raise NoCapacityError("No eligible availability zones") + + instance_name = generate_unique_instance_name(instance_config) + base_tags = { + "Name": instance_name, + "owner": "dstack", + "dstack_project": project_name, + "dstack_name": instance_config.instance_name, + "dstack_user": instance_config.user, + } + tags = 
merge_tags( + base_tags=base_tags, + backend_tags=self.config.tags, + resource_tags=instance_config.tags, + ) + tags = aws_resources.filter_invalid_tags(tags) + + disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) + max_efa_interfaces = self._get_maximum_efa_interfaces( + ec2_client=ec2_client, + region=instance_offer.region, + instance_type=instance_offer.instance.name, + ) + enable_efa = max_efa_interfaces > 0 + is_capacity_block = False try: - vpc_id, subnets_ids = get_vpc_id_subnet_id_or_error( + vpc_id, subnets_ids = self._get_vpc_id_subnets_ids_or_error( ec2_client=ec2_client, config=self.config, region=instance_offer.region, allocate_public_ip=allocate_public_ip, - availability_zones=availability_zones, + availability_zones=zones, ) - subnet_id = subnets_ids[0] - availability_zone = aws_resources.get_availability_zone_by_subnet_id( + subnet_id_to_az_map = self._get_subnets_availability_zones( ec2_client=ec2_client, - subnet_id=subnet_id, + region=instance_offer.region, + subnets_ids=subnets_ids, ) - disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - response = ec2_resource.create_instances( - **aws_resources.create_instances_struct( - disk_size=disk_size, - image_id=aws_resources.get_image_id( - ec2_client=ec2_client, - cuda=len(instance_offer.instance.resources.gpus) > 0, - ), - instance_type=instance_offer.instance.name, - iam_instance_profile_arn=None, - user_data=get_user_data(authorized_keys=instance_config.get_public_keys()), - tags=tags, - security_group_id=aws_resources.create_security_group( - ec2_client=ec2_client, - project_id=project_name, - vpc_id=vpc_id, - ), - spot=instance_offer.instance.resources.spot, - subnet_id=subnet_id, - allocate_public_ip=allocate_public_ip, + if instance_config.reservation: + reservation = aws_resources.get_reservation( + ec2_client=ec2_client, + reservation_id=instance_config.reservation, + instance_count=1, ) + if reservation is not None: + # Filter out az different 
from capacity reservation + subnet_id_to_az_map = { + k: v + for k, v in subnet_id_to_az_map.items() + if v == reservation["AvailabilityZone"] + } + if reservation.get("ReservationType") == "capacity-block": + is_capacity_block = True + except botocore.exceptions.ClientError as e: + logger.warning("Got botocore.exceptions.ClientError: %s", e) + raise NoCapacityError() + + tried_zones = set() + for subnet_id, az in subnet_id_to_az_map.items(): + if az in tried_zones: + continue + tried_zones.add(az) + logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az) + image_id, username = self._get_image_id_and_username( + ec2_client=ec2_client, + region=instance_offer.region, + gpu_name=( + instance_offer.instance.resources.gpus[0].name + if len(instance_offer.instance.resources.gpus) > 0 + else None + ), + instance_type=instance_offer.instance.name, + image_config=self.config.os_images, ) + security_group_id = self._create_security_group( + ec2_client=ec2_client, + region=instance_offer.region, + project_id=project_name, + vpc_id=vpc_id, + ) + try: + response = ec2_resource.create_instances( # pyright: ignore[reportAttributeAccessIssue] + **aws_resources.create_instances_struct( + disk_size=disk_size, + image_id=image_id, + instance_type=instance_offer.instance.name, + iam_instance_profile=self.config.iam_instance_profile, + user_data=get_user_data( + authorized_keys=instance_config.get_public_keys(), + # Custom OS images may lack ufw, so don't attempt to set up the firewall. + # Rely on security groups and the image's built-in firewall rules instead. 
+ skip_firewall_setup=self.config.os_images is not None, + ), + tags=aws_resources.make_tags(tags), + security_group_id=security_group_id, + spot=instance_offer.instance.resources.spot, + subnet_id=subnet_id, + allocate_public_ip=allocate_public_ip, + placement_group_name=placement_group.name if placement_group else None, + enable_efa=enable_efa, + max_efa_interfaces=max_efa_interfaces, + reservation_id=instance_config.reservation, + is_capacity_block=is_capacity_block, + ) + ) + except botocore.exceptions.ClientError as e: + logger.warning("Got botocore.exceptions.ClientError: %s", e) + if e.response["Error"]["Code"] == "InvalidParameterValue": + msg = e.response["Error"].get("Message", "") + raise ComputeError(f"Invalid AWS request: {msg}") + continue instance = response[0] + # wait_until_running() is only needed so that instance is immediately ready for volume attach. + # TODO: Drop wait_until_running() once attach readiness is checked outside. instance.wait_until_running() - instance.reload() # populate instance.public_ip_address - if instance_offer.instance.resources.spot: # it will not terminate the instance - ec2_client.cancel_spot_instance_requests( - SpotInstanceRequestIds=[instance.spot_instance_request_id] - ) - hostname = _get_instance_ip(instance, allocate_public_ip) + if instance_offer.instance.resources.spot: + # it will not terminate the instance + try: + ec2_client.cancel_spot_instance_requests( + SpotInstanceRequestIds=[instance.spot_instance_request_id] + ) + except Exception: + logger.exception( + "Failed to cancel spot instance request. The instance will be terminated." 
+ ) + self.terminate_instance( + instance_id=instance.instance_id, region=instance_offer.region + ) + raise NoCapacityError() return JobProvisioningData( backend=instance_offer.backend, instance_type=instance_offer.instance, instance_id=instance.instance_id, public_ip_enabled=allocate_public_ip, - hostname=hostname, - internal_ip=instance.private_ip_address, + hostname=None, + internal_ip=None, region=instance_offer.region, - availability_zone=availability_zone, + availability_zone=az, + reservation=instance.capacity_reservation_id, price=instance_offer.price, - username="ubuntu", - ssh_port=22, - dockerized=True, # because `dstack-shim docker` is used + username=username, + ssh_port=None, + dockerized=True, # because `dstack-shim` is used ssh_proxy=None, backend_data=None, ) - except botocore.exceptions.ClientError as e: - logger.warning("Got botocore.exceptions.ClientError: %s", e) - raise NoCapacityError() + raise NoCapacityError() - def run_job( + def update_provisioning_data( self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, + provisioning_data: JobProvisioningData, project_ssh_public_key: str, project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - if len(volumes) > 0: - volume = volumes[0] - if ( - volume.provisioning_data is not None - and volume.provisioning_data.availability_zone is not None - ): - instance_config.availability_zone = volume.provisioning_data.availability_zone - return self.create_instance(instance_offer, instance_config) + ): + ec2_resource = self.session.resource("ec2", region_name=provisioning_data.region) + ec2_client = self.session.client("ec2", region_name=provisioning_data.region) + instance = 
ec2_resource.Instance(provisioning_data.instance_id) # pyright: ignore[reportAttributeAccessIssue] + try: + instance.load() + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "InvalidInstanceID.NotFound": + logger.debug( + "Instance %s not found. Waiting for the instance to appear" + " or to timeout if the instance is manually deleted.", + provisioning_data.instance_id, + ) + # Instance may be created but not yet visible to due AWS eventual consistency, + # so we wait instead of failing immediately. + return + raise e + + state = instance.state.get("Name") + if state == "pending": + return + if state in [None, "shutting-down", "terminated", "stopping", "stopped"]: + raise ProvisioningError( + f"Failed to get instance IP address. Instance state is {state}." + ) + if state != "running": + raise ProvisioningError( + f"Failed to get instance IP address. Unknown instance state {state}." + ) + + if self.config.allocate_public_ips and instance.public_ip_address is None: + # AWS can't auto-assign a public IPv4 to multi-ENI instances (multi-EFA instances). + # When `public_ips: true` and no public IP is present after launch, attach an Elastic IP to the primary ENI. + # The check relies on running instances always having IP assigned if ever. 
+ public_ip, allocation_id = _allocate_and_associate_eip( + ec2_client=ec2_client, + instance=instance, + project_name=_get_project_name_from_instance_tags(instance), + backend_tags=self.config.tags, + ) + provisioning_data.backend_data = AWSInstanceBackendData( + eip_allocation_id=allocation_id + ).json() + provisioning_data.hostname = public_ip + else: + provisioning_data.hostname = _get_instance_ip( + instance, self.config.allocate_public_ips + ) + provisioning_data.internal_ip = instance.private_ip_address + provisioning_data.ssh_port = 22 + + def create_placement_group( + self, + placement_group: PlacementGroup, + master_instance_offer: InstanceOffer, + ) -> PlacementGroupProvisioningData: + if not _offer_supports_placement_group(master_instance_offer, placement_group): + raise PlacementGroupNotSupportedError() + ec2_client = self.session.client("ec2", region_name=placement_group.configuration.region) + logger.debug("Creating placement group %s...", placement_group.name) + ec2_client.create_placement_group( + GroupName=placement_group.name, + Strategy=placement_group.configuration.placement_strategy.value, + ) + logger.debug("Created placement group %s", placement_group.name) + return PlacementGroupProvisioningData( + backend=BackendType.AWS, + backend_data=None, + ) + + def delete_placement_group( + self, + placement_group: PlacementGroup, + ): + ec2_client = self.session.client("ec2", region_name=placement_group.configuration.region) + logger.debug("Deleting placement group %s...", placement_group.name) + try: + ec2_client.delete_placement_group(GroupName=placement_group.name) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "InvalidPlacementGroup.Unknown": + logger.debug("Placement group %s not found", placement_group.name) + return + elif e.response["Error"]["Code"] == "InvalidPlacementGroup.InUse": + logger.debug("Placement group %s is in use", placement_group.name) + raise PlacementGroupInUseError() + else: + raise e + 
logger.debug("Deleted placement group %s", placement_group.name) + + def is_suitable_placement_group( + self, + placement_group: PlacementGroup, + instance_offer: InstanceOffer, + ) -> bool: + if not _offer_supports_placement_group(instance_offer, placement_group): + return False + return placement_group.configuration.region == instance_offer.region def create_gateway( self, @@ -232,15 +510,24 @@ def create_gateway( ec2_resource = self.session.resource("ec2", region_name=configuration.region) ec2_client = self.session.client("ec2", region_name=configuration.region) - tags = [ - {"Key": "Name", "Value": configuration.instance_name}, - {"Key": "owner", "Value": "dstack"}, - {"Key": "dstack_project", "Value": configuration.project_name}, - ] + instance_name = generate_unique_gateway_instance_name(configuration) + base_tags = { + "Name": instance_name, + "owner": "dstack", + "dstack_project": configuration.project_name, + "dstack_name": configuration.instance_name, + } if settings.DSTACK_VERSION is not None: - tags.append({"Key": "dstack_version", "Value": settings.DSTACK_VERSION}) + base_tags["dstack_version"] = settings.DSTACK_VERSION + tags = merge_tags( + base_tags=base_tags, + backend_tags=self.config.tags, + resource_tags=configuration.tags, + ) + tags = aws_resources.filter_invalid_tags(tags) + tags = aws_resources.make_tags(tags) - vpc_id, subnets_ids = get_vpc_id_subnet_id_or_error( + vpc_id, subnets_ids = self._get_vpc_id_subnets_ids_or_error( ec2_client=ec2_client, config=self.config, region=configuration.region, @@ -256,20 +543,27 @@ def create_gateway( project_id=configuration.project_name, vpc_id=vpc_id, ) - response = ec2_resource.create_instances( - **aws_resources.create_instances_struct( - disk_size=10, - image_id=aws_resources.get_gateway_image_id(ec2_client), - instance_type="t2.micro", - iam_instance_profile_arn=None, - user_data=get_gateway_user_data(configuration.ssh_key_pub), - tags=tags, - security_group_id=security_group_id, - spot=False, - 
subnet_id=subnet_id, - allocate_public_ip=configuration.public_ip, - ) + instance_struct = aws_resources.create_instances_struct( + disk_size=10, + image_id=aws_resources.get_gateway_image_id(ec2_client), + instance_type=configuration.instance_type or DEFAULT_GATEWAY_INSTANCE_TYPE, + iam_instance_profile=None, + user_data=get_gateway_user_data( + configuration.ssh_key_pub, router=configuration.router + ), + tags=tags, + security_group_id=security_group_id, + spot=False, + subnet_id=subnet_id, + allocate_public_ip=configuration.public_ip, ) + try: + response = ec2_resource.create_instances(**instance_struct) # pyright: ignore[reportAttributeAccessIssue] + except botocore.exceptions.ClientError as e: + msg = f"AWS Error: {e.response['Error']['Code']}" + if e.response["Error"].get("Message"): + msg += f": {e.response['Error']['Message']}" + raise ComputeError(msg) instance = response[0] instance.wait_until_running() instance.reload() # populate instance.public_ip_address @@ -284,15 +578,21 @@ def create_gateway( elb_client = self.session.client("elbv2", region_name=configuration.region) - if len(subnets_ids) < 2: + lb_subnets_ids = self._get_gateway_lb_subnets_ids( + ec2_client=ec2_client, region=configuration.region, subnets_ids=subnets_ids + ) + if len(lb_subnets_ids) < 2: raise ComputeError( "Deploying gateway with ACM certificate requires at least two subnets in different AZs" ) + # Using short names as LB and target groups have length limit of 32. 
+ resources_name_prefix = generate_unique_short_backend_name() + logger.debug("Creating ALB for gateway %s...", configuration.instance_name) response = elb_client.create_load_balancer( - Name=f"{configuration.instance_name}-lb", - Subnets=subnets_ids, + Name=f"{resources_name_prefix}-lb", + Subnets=lb_subnets_ids, SecurityGroups=[security_group_id], Scheme="internet-facing" if configuration.public_ip else "internal", Tags=tags, @@ -306,7 +606,7 @@ def create_gateway( logger.debug("Creating Target Group for gateway %s...", configuration.instance_name) response = elb_client.create_target_group( - Name=f"{configuration.instance_name}-tg", + Name=f"{resources_name_prefix}-tg", Protocol="HTTP", Port=80, VpcId=vpc_id, @@ -324,7 +624,7 @@ def create_gateway( ) logger.debug("Registered ALB target for gateway %s", configuration.instance_name) - logger.debug("Creating ALB Listener for gateway %s...", configuration.instance_name) + logger.debug("Creating HTTPS ALB listener for gateway %s...", configuration.instance_name) response = elb_client.create_listener( LoadBalancerArn=lb_arn, Protocol="HTTPS", @@ -341,7 +641,26 @@ def create_gateway( ], ) listener_arn = response["Listeners"][0]["ListenerArn"] - logger.debug("Created ALB Listener for gateway %s", configuration.instance_name) + logger.debug("Created HTTPS ALB listener for gateway %s", configuration.instance_name) + + logger.debug("Creating HTTP ALB listener for gateway %s...", configuration.instance_name) + response = elb_client.create_listener( + LoadBalancerArn=lb_arn, + Protocol="HTTP", + Port=80, + DefaultActions=[ + { + "Type": "redirect", + "RedirectConfig": { + "Protocol": "HTTPS", + "Port": "443", + "StatusCode": "HTTP_301", + }, + } + ], + ) + http_listener_arn = response["Listeners"][0]["ListenerArn"] + logger.debug("Created HTTP ALB listener for gateway %s", configuration.instance_name) ip_address = _get_instance_ip(instance, configuration.public_ip) return GatewayProvisioningData( @@ -353,12 +672,13 @@ def 
create_gateway( lb_arn=lb_arn, tg_arn=tg_arn, listener_arn=listener_arn, + http_listener_arn=http_listener_arn, ).json(), ) def terminate_gateway( self, - instance_id, + instance_id: str, configuration: GatewayComputeConfiguration, backend_data: Optional[str] = None, ): @@ -384,16 +704,20 @@ def terminate_gateway( "Failed to terminate all gateway %s resources. backend_data parsing error.", configuration.instance_name, ) + return elb_client = self.session.client("elbv2", region_name=configuration.region) logger.debug("Deleting ALB resources for gateway %s...", configuration.instance_name) + if backend_data_parsed.http_listener_arn is not None: + elb_client.delete_listener(ListenerArn=backend_data_parsed.http_listener_arn) elb_client.delete_listener(ListenerArn=backend_data_parsed.listener_arn) elb_client.delete_target_group(TargetGroupArn=backend_data_parsed.tg_arn) elb_client.delete_load_balancer(LoadBalancerArn=backend_data_parsed.lb_arn) logger.debug("Deleted ALB resources for gateway %s", configuration.instance_name) def register_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, AWSVolumeConfiguration) ec2_client = self.session.client("ec2", region_name=volume.configuration.region) logger.debug("Requesting EBS volume %s", volume.configuration.volume_id) @@ -421,33 +745,45 @@ def register_volume(self, volume: Volume) -> VolumeProvisioningData: ) def create_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, AWSVolumeConfiguration) ec2_client = self.session.client("ec2", region_name=volume.configuration.region) - tags = [ - {"Key": "Name", "Value": volume.configuration.name}, - {"Key": "owner", "Value": "dstack"}, - {"Key": "dstack_project", "Value": volume.project_name}, - ] + volume_name = generate_unique_volume_name(volume) + base_tags = { + "Name": volume_name, + "owner": "dstack", + "dstack_project": volume.project_name, + "dstack_name": volume.name, + "dstack_user": 
volume.user, + } + tags = merge_tags( + base_tags=base_tags, + backend_tags=self.config.tags, + resource_tags=volume.configuration.tags, + ) + tags = aws_resources.filter_invalid_tags(tags) - zone = aws_resources.get_availability_zone( + zones = aws_resources.get_availability_zones( ec2_client=ec2_client, region=volume.configuration.region ) - if zone is None: + if volume.configuration.availability_zone is not None: + zones = [z for z in zones if z == volume.configuration.availability_zone] + if len(zones) == 0: raise ComputeError( f"Failed to find availability zone in region {volume.configuration.region}" ) - + zone = zones[0] volume_type = "gp3" logger.debug("Creating EBS volume %s", volume.configuration.name) response = ec2_client.create_volume( - Size=int(volume.configuration.size), + Size=volume.configuration.size_gb, AvailabilityZone=zone, VolumeType=volume_type, TagSpecifications=[ { "ResourceType": "volume", - "Tags": tags, + "Tags": aws_resources.make_tags(tags), } ], ) @@ -455,7 +791,6 @@ def create_volume(self, volume: Volume) -> VolumeProvisioningData: size = response["Size"] iops = response["Iops"] - return VolumeProvisioningData( backend=BackendType.AWS, volume_id=response["VolumeId"], @@ -469,6 +804,7 @@ def create_volume(self, volume: Volume) -> VolumeProvisioningData: ) def delete_volume(self, volume: Volume): + assert isinstance(volume.configuration, AWSVolumeConfiguration) ec2_client = self.session.client("ec2", region_name=volume.configuration.region) logger.debug("Deleting EBS volume %s", volume.configuration.name) @@ -481,9 +817,13 @@ def delete_volume(self, volume: Volume): raise e logger.debug("Deleted EBS volume %s", volume.configuration.name) - def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData: + def attach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData + ) -> VolumeAttachmentData: + assert isinstance(volume.configuration, AWSVolumeConfiguration) ec2_client = 
self.session.client("ec2", region_name=volume.configuration.region) + instance_id = provisioning_data.instance_id device_names = aws_resources.list_available_device_names( ec2_client=ec2_client, instance_id=instance_id ) @@ -499,9 +839,9 @@ def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentDat if e.response["Error"]["Code"] == "VolumeInUse": raise ComputeError(f"Failed to attach volume in use: {volume.volume_id}") if e.response["Error"]["Code"] == "InvalidVolume.ZoneMismatch": - raise ComputeError( - f"Failed to attach volume {volume.volume_id}. Volume zone is different from instance zone." - ) + raise ComputeError("Volume zone is different from instance zone") + if e.response["Error"]["Code"] == "InvalidVolume.NotFound": + raise ComputeError("Volume not found") if ( e.response["Error"]["Code"] == "InvalidParameterValue" and f"Invalid value '{device_name}' for unixDevice" @@ -516,19 +856,234 @@ def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentDat logger.debug("Attached EBS volume %s to instance %s", volume.volume_id, instance_id) return VolumeAttachmentData(device_name=device_name) - def detach_volume(self, volume: Volume, instance_id: str): + def detach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False + ): + assert isinstance(volume.configuration, AWSVolumeConfiguration) ec2_client = self.session.client("ec2", region_name=volume.configuration.region) + instance_id = provisioning_data.instance_id logger.debug("Detaching EBS volume %s from instance %s", volume.volume_id, instance_id) - ec2_client.detach_volume( - VolumeId=volume.volume_id, - InstanceId=instance_id, - Device=volume.attachment_data.device_name, - ) + attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id)) + try: + ec2_client.detach_volume( + VolumeId=volume.volume_id, + InstanceId=instance_id, + Device=attachment_data.device_name, + Force=force, + ) + except 
botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "IncorrectState": + logger.info( + "Skipping EBS volume %s detach since it's already detached", volume.volume_id + ) + return + raise e logger.debug("Detached EBS volume %s from instance %s", volume.volume_id, instance_id) + def is_volume_detached(self, volume: Volume, provisioning_data: JobProvisioningData) -> bool: + assert isinstance(volume.configuration, AWSVolumeConfiguration) + ec2_client = self.session.client("ec2", region_name=volume.configuration.region) + + instance_id = provisioning_data.instance_id + logger.debug("Getting EBS volume %s status", volume.volume_id) + response = ec2_client.describe_volumes(VolumeIds=[volume.volume_id]) + volumes_infos = response.get("Volumes") + if len(volumes_infos) == 0: + logger.debug( + "Failed to check EBS volume %s status. Volume not found.", volume.volume_id + ) + return True + volume_info = volumes_infos[0] + for attachment in volume_info["Attachments"]: + if attachment["InstanceId"] != instance_id: + continue + if attachment["State"] != "detached": + return False + return True + return True + + def _get_regions_to_quotas_key( + self, + session: boto3.Session, + regions: List[str], + ) -> tuple: + return hashkey(tuple(regions)) + + @cachedmethod( + cache=lambda self: self._regions_to_quotas_cache.cache, + key=_get_regions_to_quotas_key, + lock=lambda self: self._regions_to_quotas_cache.lock, + ) + def _get_regions_to_quotas( + self, + session: boto3.Session, + regions: List[str], + ) -> Dict[str, Dict[str, int]]: + return _get_regions_to_quotas(session=session, regions=regions) + + def _get_regions_to_zones_key( + self, + session: boto3.Session, + regions: List[str], + ) -> tuple: + return hashkey(tuple(regions)) + + @cachedmethod( + cache=lambda self: self._regions_to_zones_cache.cache, + key=_get_regions_to_zones_key, + lock=lambda self: self._regions_to_zones_cache.lock, + ) + def _get_regions_to_zones( + self, + session: boto3.Session, + 
regions: List[str], + ) -> Dict[str, List[str]]: + return _get_regions_to_zones(session=session, regions=regions) + + def _get_vpc_id_subnets_ids_or_error_cache_key( + self, + ec2_client: botocore.client.BaseClient, + config: AWSConfig, + region: str, + allocate_public_ip: bool, + availability_zones: Optional[List[str]] = None, + ) -> tuple: + return hashkey( + region, allocate_public_ip, tuple(availability_zones) if availability_zones else None + ) + + @cachedmethod( + cache=lambda self: self._vpc_id_subnets_ids_cache.cache, + key=_get_vpc_id_subnets_ids_or_error_cache_key, + lock=lambda self: self._vpc_id_subnets_ids_cache.lock, + ) + def _get_vpc_id_subnets_ids_or_error( + self, + ec2_client: botocore.client.BaseClient, + config: AWSConfig, + region: str, + allocate_public_ip: bool, + availability_zones: Optional[List[str]] = None, + ) -> Tuple[str, List[str]]: + return get_vpc_id_subnets_ids_or_error( + ec2_client=ec2_client, + config=config, + region=region, + allocate_public_ip=allocate_public_ip, + availability_zones=availability_zones, + ) + + @cachedmethod( + cache=lambda self: self._maximum_efa_interfaces_cache.cache, + key=_ec2client_cache_methodkey, + lock=lambda self: self._maximum_efa_interfaces_cache.lock, + ) + def _get_maximum_efa_interfaces( + self, + ec2_client: botocore.client.BaseClient, + region: str, + instance_type: str, + ) -> int: + return _get_maximum_efa_interfaces( + ec2_client=ec2_client, + instance_type=instance_type, + ) + + def _get_subnets_availability_zones_key( + self, + ec2_client: botocore.client.BaseClient, + region: str, + subnets_ids: List[str], + ) -> tuple: + return hashkey(region, tuple(subnets_ids)) + + @cachedmethod( + cache=lambda self: self._subnets_availability_zones_cache.cache, + key=_get_subnets_availability_zones_key, + lock=lambda self: self._subnets_availability_zones_cache.lock, + ) + def _get_subnets_availability_zones( + self, + ec2_client: botocore.client.BaseClient, + region: str, + subnets_ids: List[str], 
+ ) -> Dict[str, str]: + return aws_resources.get_subnets_availability_zones( + ec2_client=ec2_client, + subnets_ids=subnets_ids, + ) -def get_vpc_id_subnet_id_or_error( + @cachedmethod( + cache=lambda self: self._security_group_cache.cache, + key=_ec2client_cache_methodkey, + lock=lambda self: self._security_group_cache.lock, + ) + def _create_security_group( + self, + ec2_client: botocore.client.BaseClient, + region: str, + project_id: str, + vpc_id: Optional[str], + ) -> str: + return aws_resources.create_security_group( + ec2_client=ec2_client, + project_id=project_id, + vpc_id=vpc_id, + ) + + def _get_image_id_and_username_cache_key( + self, + ec2_client: botocore.client.BaseClient, + region: str, + gpu_name: Optional[str], + instance_type: str, + image_config: Optional[AWSOSImageConfig] = None, + ) -> tuple: + return hashkey( + region, gpu_name, instance_type, image_config.json() if image_config else None + ) + + @cachedmethod( + cache=lambda self: self._image_id_and_username_cache.cache, + key=_get_image_id_and_username_cache_key, + lock=lambda self: self._image_id_and_username_cache.lock, + ) + def _get_image_id_and_username( + self, + ec2_client: botocore.client.BaseClient, + region: str, + gpu_name: Optional[str], + instance_type: str, + image_config: Optional[AWSOSImageConfig] = None, + ) -> tuple[str, str]: + return aws_resources.get_image_id_and_username( + ec2_client=ec2_client, + gpu_name=gpu_name, + instance_type=instance_type, + image_config=image_config, + ) + + def _get_gateway_lb_subnets_ids( + self, + ec2_client: botocore.client.BaseClient, + region: str, + subnets_ids: List[str], + ) -> List[str]: + """ + Returns subnet IDs to be used for gateway Load Balancer among `subnets_ids`. + Filters out subnets from the same AZ since Load Balancer requires all subnets to be in different AZ. 
+ """ + subnet_id_to_az_map = self._get_subnets_availability_zones( + ec2_client=ec2_client, + region=region, + subnets_ids=subnets_ids, + ) + az_to_subnet_id_map = {az: subnet_id for subnet_id, az in subnet_id_to_az_map.items()} + return list(az_to_subnet_id_map.values()) + + +def get_vpc_id_subnets_ids_or_error( ec2_client: botocore.client.BaseClient, config: AWSConfig, region: str, @@ -551,11 +1106,15 @@ def get_vpc_id_subnet_id_or_error( return vpc_id, subnets_ids if allocate_public_ip: raise ComputeError(f"Failed to find public subnets for VPC {vpc_id}") - raise ComputeError(f"Failed to find private subnets for VPC {vpc_id}") + raise ComputeError( + f"Failed to find private subnets for VPC {vpc_id} with outbound internet access. " + "Ensure you've setup NAT Gateway, Transit Gateway, or other mechanism " + "to provide outbound internet access from private subnets." + ) if not config.use_default_vpcs: raise ComputeError(f"No VPC ID configured for region {region}") - return _get_vpc_id_subnet_id_by_vpc_name_or_error( + return _get_vpc_id_subnets_ids_by_vpc_name_or_error( ec2_client=ec2_client, vpc_name=config.vpc_name, region=region, @@ -564,7 +1123,7 @@ def get_vpc_id_subnet_id_or_error( ) -def _get_vpc_id_subnet_id_by_vpc_name_or_error( +def _get_vpc_id_subnets_ids_by_vpc_name_or_error( ec2_client: botocore.client.BaseClient, vpc_name: Optional[str], region: str, @@ -605,33 +1164,126 @@ def _get_vpc_id_subnet_id_by_vpc_name_or_error( ) -def _has_quota(quotas: Dict[str, int], instance_name: str) -> bool: +_ON_DEMAND_QUOTA_CODES = { + "L-1216C47A": "Standard/OnDemand", + "L-417A185B": "P/OnDemand", + "L-DB2E81BA": "G/OnDemand", +} + + +def _get_regions_to_quotas( + session: boto3.Session, regions: List[str] +) -> Dict[str, Dict[str, int]]: + def get_region_quotas(region_name: str, client: botocore.client.BaseClient) -> Dict[str, int]: + region_quotas = {} + for quota_code, quota_class in _ON_DEMAND_QUOTA_CODES.items(): + try: + resp = 
client.get_service_quota(ServiceCode="ec2", QuotaCode=quota_code) + region_quotas[quota_class] = resp["Quota"]["Value"] + except botocore.exceptions.ClientError as e: + if "TooManyRequestsException" in str(e): + logger.warning( + "Failed to get quota %s in %s due to rate limits", + quota_code, + region_name, + ) + else: + logger.exception(e) + return region_quotas + + regions_to_quotas = {} + with ThreadPoolExecutor(max_workers=12) as executor: + future_to_region = {} + for region in regions: + future = executor.submit( + get_region_quotas, region, session.client("service-quotas", region_name=region) + ) + future_to_region[future] = region + for future in as_completed(future_to_region): + regions_to_quotas[future_to_region[future]] = future.result() + return regions_to_quotas + + +def _has_quota(quotas: Dict[str, int], instance_name: str) -> Optional[bool]: + quota = quotas.get("Standard/OnDemand") if instance_name.startswith("p"): - return quotas.get("P/OnDemand", 0) > 0 + quota = quotas.get("P/OnDemand") if instance_name.startswith("g"): - return quotas.get("G/OnDemand", 0) > 0 - return quotas.get("Standard/OnDemand", 0) > 0 + quota = quotas.get("G/OnDemand") + if quota is None: + return None + return quota > 0 + + +def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[str, List[str]]: + regions_to_zones = {} + with ThreadPoolExecutor(max_workers=12) as executor: + future_to_region = {} + for region in regions: + future = executor.submit( + aws_resources.get_availability_zones, + session.client("ec2", region_name=region), + region, + ) + future_to_region[future] = region + for future in as_completed(future_to_region): + regions_to_zones[future_to_region[future]] = future.result() + return regions_to_zones def _supported_instances(offer: InstanceOffer) -> bool: for family in [ + "m7i.", + "c7i.", + "r7i.", + "t3.", "t2.small", "c5.", "m5.", - "g4dn.", - "g5.", - "g6.", - "gr6.", - "p3.", + "p6-b300.", + "p6-b200.", + "p5.", + "p5e.", 
"p4d.", "p4de.", - "p5.", + "g7e.", + "g6.", + "g6e.", + "gr6.", + "g5.", + "g4dn.", ]: if offer.instance.name.startswith(family): return True return False +def _offer_supports_placement_group(offer: InstanceOffer, placement_group: PlacementGroup) -> bool: + if placement_group.configuration.placement_strategy != PlacementStrategy.CLUSTER: + return True + for family in ["t3.", "t2."]: + if offer.instance.name.startswith(family): + return False + return True + + +def _get_maximum_efa_interfaces(ec2_client: botocore.client.BaseClient, instance_type: str) -> int: + try: + response = ec2_client.describe_instance_types( + InstanceTypes=[instance_type], + Filters=[{"Name": "network-info.efa-supported", "Values": ["true"]}], + ) + except botocore.exceptions.ClientError as e: + if e.response.get("Error", {}).get("Code") == "InvalidInstanceType": + # "The following supplied instance types do not exist: []" + return 0 + raise + instance_types = response["InstanceTypes"] + if not instance_types: + return 0 + return instance_types[0]["NetworkInfo"]["EfaInfo"]["MaximumEfaInterfaces"] + + def _get_instance_ip(instance: Any, public_ip: bool) -> str: if public_ip: return instance.public_ip_address @@ -641,3 +1293,139 @@ def _get_instance_ip(instance: Any, public_ip: bool) -> str: def _get_volume_price(size: int, iops: int) -> float: # https://aws.amazon.com/ebs/pricing/ return size * 0.08 + (iops - 3000) * 0.005 + + +def _parse_instance_backend_data(backend_data: Optional[str]) -> "AWSInstanceBackendData": + if backend_data is None: + return AWSInstanceBackendData() + try: + return AWSInstanceBackendData.parse_raw(backend_data) + except ValidationError: + logger.exception("Failed to parse AWS instance backend_data; treating as empty") + return AWSInstanceBackendData() + + +def _get_project_name_from_instance_tags(instance: Any) -> Optional[str]: + for tag in instance.tags or []: + if tag.get("Key") == "dstack_project": + return tag.get("Value") + return 
None + + +def _allocate_and_associate_eip( + ec2_client: botocore.client.BaseClient, + instance: Any, + project_name: Optional[str], + backend_tags: Optional[Dict[str, str]], +) -> Tuple[str, str]: + """ + Allocates an Elastic IP and associates it with the primary ENI of `instance`. + Returns `(public_ip, allocation_id)`. + """ + primary_nic_id = _get_primary_network_interface_id(instance) + tags = { + "owner": "dstack", + "dstack_instance": instance.instance_id, + } + if project_name is not None: + tags["dstack_project"] = project_name + if backend_tags: + for k, v in backend_tags.items(): + tags.setdefault(k, v) + tags = aws_resources.filter_invalid_tags(tags) + + try: + allocate_response = ec2_client.allocate_address( + Domain="vpc", + TagSpecifications=[ + { + "ResourceType": "elastic-ip", + "Tags": aws_resources.make_tags(tags), + } + ], + ) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + region = ec2_client.meta.region_name + if code == "AddressLimitExceeded": + raise ProvisioningError( + f"Elastic IP quota exceeded in {region}. " + "Raise the EC2 'EC2-VPC Elastic IPs' quota in Service Quotas, " + "or reduce concurrent multi-EFA instances." + ) + raise ProvisioningError(f"Failed to allocate Elastic IP in {region}: {e}") + + allocation_id = allocate_response["AllocationId"] + public_ip = allocate_response["PublicIp"] + try: + ec2_client.associate_address( + AllocationId=allocation_id, + NetworkInterfaceId=primary_nic_id, + AllowReassociation=False, + ) + except botocore.exceptions.ClientError as e: + # Best-effort release; on failure the EIP leaks until manually released. 
+ logger.warning( + "Failed to associate EIP %s to instance %s; releasing.", + allocation_id, + instance.instance_id, + ) + try: + ec2_client.release_address(AllocationId=allocation_id) + except botocore.exceptions.ClientError: + logger.exception( + "Failed to release just-allocated EIP %s; release it manually.", + allocation_id, + ) + raise ProvisioningError( + f"Failed to associate Elastic IP {allocation_id} to instance " + f"{instance.instance_id}: {e}" + ) + return public_ip, allocation_id + + +def _get_primary_network_interface_id(instance: Any) -> str: + for nic in instance.network_interfaces_attribute or []: + attachment = nic.get("Attachment") or {} + if attachment.get("DeviceIndex") == 0: + return nic["NetworkInterfaceId"] + raise ProvisioningError( + f"Instance {instance.instance_id} has no primary network interface (DeviceIndex=0)" + ) + + +def _release_eip(ec2_client: botocore.client.BaseClient, allocation_id: str) -> None: + """ + Releases an Elastic IP by allocation ID. Disassociates first if the EIP is still + bound to an instance — `TerminateInstances` only initiates shutdown, and AWS + auto-disassociates only once the instance reaches `terminated`. Releasing + explicitly avoids the `InvalidIPAddress.InUse` race and the retry loop. + """ + try: + response = ec2_client.describe_addresses(AllocationIds=[allocation_id]) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + if code in ("InvalidAllocationID.NotFound", "InvalidAddress.NotFound"): + logger.debug("Skipping EIP %s release. 
Already released.", allocation_id) + return + raise + addresses = response.get("Addresses", []) + if not addresses: + return + association_id = addresses[0].get("AssociationId") + if association_id is not None: + try: + ec2_client.disassociate_address(AssociationId=association_id) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + # AWS may have auto-disassociated between our Describe and Disassociate + # if the instance just reached `terminated`. Tolerated. + if code != "InvalidAssociationID.NotFound": + raise + try: + ec2_client.release_address(AllocationId=allocation_id) + except botocore.exceptions.ClientError as e: + code = e.response.get("Error", {}).get("Code", "") + if code in ("InvalidAllocationID.NotFound", "InvalidAddress.NotFound"): + return + raise diff --git a/src/dstack/_internal/core/backends/aws/config.py b/src/dstack/_internal/core/backends/aws/config.py deleted file mode 100644 index 7bef93c13a..0000000000 --- a/src/dstack/_internal/core/backends/aws/config.py +++ /dev/null @@ -1,18 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.aws import AnyAWSCreds, AWSStoredConfig - - -class AWSConfig(AWSStoredConfig, BackendConfig): - creds: AnyAWSCreds - - @property - def allocate_public_ips(self) -> bool: - if self.public_ips is not None: - return self.public_ips - return True - - @property - def use_default_vpcs(self) -> bool: - if self.default_vpcs is not None: - return self.default_vpcs - return True diff --git a/src/dstack/_internal/core/backends/aws/configurator.py b/src/dstack/_internal/core/backends/aws/configurator.py new file mode 100644 index 0000000000..8d6c8afe14 --- /dev/null +++ b/src/dstack/_internal/core/backends/aws/configurator.py @@ -0,0 +1,191 @@ +import concurrent.futures +import json + +import botocore.exceptions +from boto3.session import Session + +from dstack._internal.core.backends.aws import auth, compute, 
resources +from dstack._internal.core.backends.aws.backend import AWSBackend +from dstack._internal.core.backends.aws.models import ( + AWSAccessKeyCreds, + AWSBackendConfig, + AWSBackendConfigWithCreds, + AWSConfig, + AWSCreds, + AWSDefaultCreds, + AWSStoredConfig, +) +from dstack._internal.core.backends.base.configurator import ( + TAGS_MAX_NUM, + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.errors import ( + BackendError, + ServerClientError, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# where dstack OS images are published +REGIONS = [ + ("US East, N. Virginia", "us-east-1"), + ("US East, Ohio", "us-east-2"), + ("US West, N. California", "us-west-1"), + ("US West, Oregon", "us-west-2"), + ("Asia Pacific, Singapore", "ap-southeast-1"), + ("Canada, Central", "ca-central-1"), + ("Europe, Frankfurt", "eu-central-1"), + ("Europe, Ireland", "eu-west-1"), + ("Europe, London", "eu-west-2"), + ("Europe, Paris", "eu-west-3"), + ("Europe, Stockholm", "eu-north-1"), +] +REGION_VALUES = [r[1] for r in REGIONS] +DEFAULT_REGIONS = REGION_VALUES +MAIN_REGION = "us-east-1" + + +class AWSConfigurator( + Configurator[ + AWSBackendConfig, + AWSBackendConfigWithCreds, + ] +): + TYPE = BackendType.AWS + BACKEND_CLASS = AWSBackend + + def validate_config(self, config: AWSBackendConfigWithCreds, default_creds_enabled: bool): + if isinstance(config.creds, AWSDefaultCreds) and not default_creds_enabled: + raise_invalid_credentials_error(fields=[["creds"]]) + try: + session = auth.authenticate(creds=config.creds, region=MAIN_REGION) + except Exception: + if isinstance(config.creds, AWSAccessKeyCreds): + raise_invalid_credentials_error( + fields=[ + ["creds", "access_key"], + ["creds", "secret_key"], + ] + ) + else: + raise_invalid_credentials_error(fields=[["creds"]]) + self._check_config_tags(config) + 
self._check_config_iam_instance_profile(session, config) + self._check_config_vpc(session, config) + + def create_backend( + self, project_name: str, config: AWSBackendConfigWithCreds + ) -> BackendRecord: + if config.regions is None: + config.regions = DEFAULT_REGIONS + return BackendRecord( + config=AWSStoredConfig( + **AWSBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=AWSCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> AWSBackendConfigWithCreds: + config = self._get_config(record) + return AWSBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> AWSBackendConfig: + config = self._get_config(record) + return AWSBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> AWSBackend: + config = self._get_config(record) + return AWSBackend(config=config) + + def _get_config(self, record: BackendRecord) -> AWSConfig: + return AWSConfig.__response__( + **json.loads(record.config), + creds=AWSCreds.parse_raw(record.auth).__root__, + ) + + def _check_config_tags(self, config: AWSBackendConfigWithCreds): + if not config.tags: + return + if len(config.tags) > TAGS_MAX_NUM: + raise ServerClientError( + f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed." 
+ ) + try: + resources.validate_tags(config.tags) + except BackendError as e: + raise ServerClientError(e.args[0]) + + def _check_config_iam_instance_profile( + self, session: Session, config: AWSBackendConfigWithCreds + ): + if config.iam_instance_profile is None: + return + try: + iam_client = session.client("iam") + iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "NoSuchEntity": + raise ServerClientError( + f"IAM instance profile {config.iam_instance_profile} not found" + ) + logger.exception( + "Got botocore.exceptions.ClientError when checking iam_instance_profile" + ) + raise ServerClientError( + f"Failed to check IAM instance profile {config.iam_instance_profile}" + ) + except Exception: + logger.exception("Got exception when checking iam_instance_profile") + raise ServerClientError( + f"Failed to check IAM instance profile {config.iam_instance_profile}" + ) + + def _check_config_vpc(self, session: Session, config: AWSBackendConfigWithCreds): + allocate_public_ip = config.public_ips if config.public_ips is not None else True + use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True + if config.vpc_name is not None and config.vpc_ids is not None: + raise ServerClientError(msg="Only one of `vpc_name` and `vpc_ids` can be specified") + if not use_default_vpcs and config.vpc_name is None and config.vpc_ids is None: + raise ServerClientError( + msg="`vpc_name` or `vpc_ids` must be specified if `default_vpcs: false`." 
+ ) + regions = config.regions + if regions is None: + regions = DEFAULT_REGIONS + if config.vpc_ids is not None and not use_default_vpcs: + vpc_ids_regions = list(config.vpc_ids.keys()) + not_configured_regions = [r for r in regions if r not in vpc_ids_regions] + if len(not_configured_regions) > 0: + if config.regions is None: + raise ServerClientError( + f"`vpc_ids` not configured for regions {not_configured_regions}. " + "Configure `vpc_ids` for all regions or specify `regions`." + ) + raise ServerClientError( + f"`vpc_ids` not configured for regions {not_configured_regions}. " + "Configure `vpc_ids` for all regions specified in `regions`." + ) + # The number of workers should be >= the number of regions + with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor: + futures = [] + for region in regions: + ec2_client = session.client("ec2", region_name=region) + future = executor.submit( + compute.get_vpc_id_subnets_ids_or_error, + ec2_client=ec2_client, + config=AWSConfig.parse_obj(config), + region=region, + allocate_public_ip=allocate_public_ip, + ) + futures.append(future) + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except BackendError as e: + raise ServerClientError(e.args[0]) diff --git a/src/dstack/_internal/core/backends/aws/models.py b/src/dstack/_internal/core/backends/aws/models.py new file mode 100644 index 0000000000..d76a4c9ab3 --- /dev/null +++ b/src/dstack/_internal/core/backends/aws/models.py @@ -0,0 +1,135 @@ +from typing import Annotated, Dict, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class AWSOSImage(CoreModel): + name: Annotated[str, Field(description="The AMI name")] + owner: Annotated[ + str, + Field(regex=r"^(\d{12}|self)$", description="The AMI owner, account ID or `self`"), + ] = "self" + user: Annotated[str, Field(description="The OS user for provisioning")] + + +class AWSOSImageConfig(CoreModel): + 
cpu: Annotated[Optional[AWSOSImage], Field(description="The AMI used for CPU instances")] = ( + None + ) + nvidia: Annotated[ + Optional[AWSOSImage], Field(description="The AMI used for NVIDIA GPU instances") + ] = None + + +class AWSAccessKeyCreds(CoreModel): + type: Annotated[Literal["access_key"], Field(description="The type of credentials")] = ( + "access_key" + ) + access_key: Annotated[str, Field(description="The access key")] + secret_key: Annotated[str, Field(description="The secret key")] + + +class AWSDefaultCreds(CoreModel): + type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" + + +AnyAWSCreds = Union[AWSAccessKeyCreds, AWSDefaultCreds] + + +class AWSCreds(CoreModel): + __root__: AnyAWSCreds = Field(..., discriminator="type") + + +class AWSBackendConfig(CoreModel): + type: Annotated[Literal["aws"], Field(description="The type of the backend")] = "aws" + regions: Annotated[ + Optional[List[str]], Field(description="The list of AWS regions. Omit to use all regions") + ] = None + vpc_name: Annotated[ + Optional[str], + Field( + description=( + "The name of custom VPCs. All configured regions must have a VPC with this name." + " If your custom VPCs don't have names or have different names in different regions, use `vpc_ids` instead." + ) + ), + ] = None + vpc_ids: Annotated[ + Optional[Dict[str, str]], + Field( + description=( + "The mapping from AWS regions to VPC IDs." + " If `default_vpcs: true`, omitted regions will use default VPCs" + ) + ), + ] = None + default_vpcs: Annotated[ + Optional[bool], + Field( + description=( + "A flag to enable/disable using default VPCs in regions not configured by `vpc_ids`." + " Set to `false` if default VPCs should never be used." + " Defaults to `true`" + ) + ), + ] = None + public_ips: Annotated[ + Optional[bool], + Field( + description=( + "A flag to enable/disable public IP assigning on instances." 
+ " `public_ips: false` requires at least one private subnet with outbound internet connectivity" + " provided by a NAT Gateway or a Transit Gateway." + " Defaults to `true`" + ) + ), + ] = None + iam_instance_profile: Annotated[ + Optional[str], + Field( + description=( + "The name of the IAM instance profile to associate with EC2 instances." + " You can also specify the IAM role name for roles created via the AWS console." + " AWS automatically creates an instance profile and gives it the same name as the role" + ) + ), + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field(description="The tags that will be assigned to resources created by `dstack`"), + ] = None + os_images: Annotated[ + Optional[AWSOSImageConfig], + Field( + description="The mapping of instance categories (CPU, NVIDIA GPU) to AMI configurations" + ), + ] = None + + +class AWSBackendConfigWithCreds(AWSBackendConfig): + creds: AnyAWSCreds = Field(..., description="The credentials", discriminator="type") + + +AnyAWSBackendConfig = Union[AWSBackendConfig, AWSBackendConfigWithCreds] + + +class AWSStoredConfig(AWSBackendConfig): + pass + + +class AWSConfig(AWSStoredConfig): + creds: AnyAWSCreds + + @property + def allocate_public_ips(self) -> bool: + if self.public_ips is not None: + return self.public_ips + return True + + @property + def use_default_vpcs(self) -> bool: + if self.default_vpcs is not None: + return self.default_vpcs + return True diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 83bcd74829..ae79675c6c 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -1,26 +1,59 @@ +import re from typing import Any, Dict, List, Optional import botocore.client import botocore.exceptions -import dstack.version as version -from dstack._internal.core.errors import ComputeResourceNotFoundError +from dstack._internal import settings +from 
dstack._internal.core.backends.aws.models import AWSOSImageConfig +from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError +from dstack._internal.utils.logging import get_logger +logger = get_logger(__name__) + +DSTACK_ACCOUNT_ID = "142421590066" +DLAMI_OWNER_ACCOUNT_ID = "898082745236" -def get_image_id(ec2_client: botocore.client.BaseClient, cuda: bool) -> str: - image_name = ( - f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}" - ) - response = ec2_client.describe_images(Filters=[{"Name": "name", "Values": [image_name]}]) +def get_image_id_and_username( + ec2_client: botocore.client.BaseClient, + gpu_name: Optional[str], + instance_type: str, + image_config: Optional[AWSOSImageConfig] = None, +) -> tuple[str, str]: + if image_config is not None: + image = image_config.nvidia if gpu_name else image_config.cpu + if image is None: + logger.warning("%s image not configured", "nvidia" if gpu_name else "cpu") + raise ComputeResourceNotFoundError() + image_name = image.name + image_owner = image.owner + username = image.user + elif gpu_name is not None: + # AWS Deep Learning AMIs (DLAMI) support all GPU instance types currently supported by dstack. + # dstack's cuda AMI is still built but not used. + # It may be used again in case some instance types are not supported by DLAMI. 
+ image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *" + image_owner = DLAMI_OWNER_ACCOUNT_ID + username = "ubuntu" + else: + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) + image_owner = DSTACK_ACCOUNT_ID + username = "ubuntu" + response = ec2_client.describe_images( + Filters=[{"Name": "name", "Values": [image_name]}], Owners=[image_owner] + ) images = sorted( (i for i in response["Images"] if i["State"] == "available"), key=lambda i: i["CreationDate"], reverse=True, ) if not images: + logger.warning("image '%s' not found", image_name) raise ComputeResourceNotFoundError() - return images[0]["ImageId"] + return images[0]["ImageId"], username def create_security_group( @@ -91,6 +124,15 @@ def create_security_group( security_group_id=security_group_id, rule={"IpProtocol": "-1"}, ) + _add_egress_security_group_rule_if_missing( + ec2_client=ec2_client, + security_group=security_group, + security_group_id=security_group_id, + rule={ + "IpProtocol": "-1", + "UserIdGroupPairs": [{"GroupId": security_group_id}], + }, + ) return security_group_id @@ -98,15 +140,20 @@ def create_instances_struct( disk_size: int, image_id: str, instance_type: str, - iam_instance_profile_arn: Optional[str], + iam_instance_profile: Optional[str], user_data: str, tags: List[Dict[str, str]], security_group_id: str, spot: bool, subnet_id: Optional[str] = None, allocate_public_ip: bool = True, + placement_group_name: Optional[str] = None, + enable_efa: bool = False, + max_efa_interfaces: int = 0, + reservation_id: Optional[str] = None, + is_capacity_block: bool = False, ) -> Dict[str, Any]: - struct = dict( + struct: Dict[str, Any] = dict( BlockDeviceMappings=[ { "DeviceName": "/dev/sda1", @@ -128,8 +175,8 @@ def create_instances_struct( }, ], ) - if iam_instance_profile_arn: - struct["IamInstanceProfile"] = {"Arn": iam_instance_profile_arn} + if iam_instance_profile: + struct["IamInstanceProfile"] = {"Name": 
iam_instance_profile} if spot: struct["InstanceMarketOptions"] = { "MarketType": "spot", @@ -138,19 +185,34 @@ def create_instances_struct( "InstanceInterruptionBehavior": "terminate", }, } + + if is_capacity_block: + struct["InstanceMarketOptions"] = {"MarketType": "capacity-block"} + if enable_efa and not subnet_id: + raise ComputeError("EFA requires subnet") # AWS allows specifying either NetworkInterfaces for specific subnet_id # or instance-level SecurityGroupIds in case of no specific subnet_id, not both. if subnet_id is not None: - struct["NetworkInterfaces"] = [ - { - "AssociatePublicIpAddress": allocate_public_ip, - "DeviceIndex": 0, - "SubnetId": subnet_id, - "Groups": [security_group_id], - }, - ] + struct["NetworkInterfaces"] = _create_network_interfaces_struct( + instance_type=instance_type, + subnet_id=subnet_id, + security_group_id=security_group_id, + allocate_public_ip=allocate_public_ip, + max_efa_interfaces=max_efa_interfaces, + ) else: struct["SecurityGroupIds"] = [security_group_id] + + if placement_group_name is not None: + struct["Placement"] = { + "GroupName": placement_group_name, + } + + if reservation_id is not None: + struct["CapacityReservationSpecification"] = { + "CapacityReservationTarget": {"CapacityReservationId": reservation_id} + } + return struct @@ -277,6 +339,7 @@ def get_subnets_ids_for_vpc( """ If `allocate_public_ip` is True, returns public subnets found in the VPC. If `allocate_public_ip` is False, returns subnets with NAT found in the VPC. 
+ Returns """ subnets = _get_subnets_by_vpc_id( ec2_client=ec2_client, @@ -295,24 +358,15 @@ def get_subnets_ids_for_vpc( if is_public_subnet: subnets_ids.append(subnet_id) else: - subnet_behind_nat = _is_subnet_behind_nat( + is_eligible_private_subnet = _is_private_subnet_with_internet_egress( ec2_client=ec2_client, vpc_id=vpc_id, subnet_id=subnet_id, ) - if subnet_behind_nat: + if is_eligible_private_subnet: subnets_ids.append(subnet_id) - return subnets_ids - -def get_availability_zone(ec2_client: botocore.client.BaseClient, region: str) -> Optional[str]: - zone_names = get_availability_zones( - ec2_client=ec2_client, - region=region, - ) - if len(zone_names) == 0: - return None - return zone_names[0] + return subnets_ids def get_availability_zones(ec2_client: botocore.client.BaseClient, region: str) -> List[str]: @@ -335,6 +389,16 @@ def get_availability_zone_by_subnet_id( return response["Subnets"][0]["AvailabilityZone"] +def get_subnets_availability_zones( + ec2_client: botocore.client.BaseClient, subnets_ids: List[str] +) -> Dict[str, str]: + response = ec2_client.describe_subnets(SubnetIds=subnets_ids) + subnet_id_to_az_map = { + subnet["SubnetId"]: subnet["AvailabilityZone"] for subnet in response["Subnets"] + } + return subnet_id_to_az_map + + def list_available_device_names( ec2_client: botocore.client.BaseClient, instance_id: str ) -> List[str]: @@ -356,6 +420,52 @@ def list_instance_device_names( return device_names +def make_tags(tags: Dict[str, str]) -> List[Dict[str, str]]: + tags_list = [] + for k, v in tags.items(): + tags_list.append({"Key": k, "Value": v}) + return tags_list + + +def filter_invalid_tags(tags: Dict[str, str]) -> Dict[str, str]: + filtered_tags = {} + for k, v in tags.items(): + if not _is_valid_tag(k, v): + logger.warning("Skipping invalid tag '%s: %s'", k, v) + continue + filtered_tags[k] = v + return filtered_tags + + +def validate_tags(tags: Dict[str, str]): + for k, v in tags.items(): + if not _is_valid_tag(k, v): + raise 
BackendError( + "Invalid resource tags. " + "See tags restrictions: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html#tag-restrictions" + ) + + +def _is_valid_tag(key: str, value: str) -> bool: + return _is_valid_tag_key(key) and _is_valid_tag_value(value) + + +TAG_KEY_PATTERN = re.compile(r"^[\w .:/=\-+@]{1,128}$") +TAG_VALUE_PATTERN = re.compile(r"^[\w .:/=\-+@]{0,256}$") + + +def _is_valid_tag_key(key: str) -> bool: + if key.startswith("aws:"): + return False + match = re.match(TAG_KEY_PATTERN, key) + return match is not None + + +def _is_valid_tag_value(value: str) -> bool: + match = re.match(TAG_VALUE_PATTERN, value) + return match is not None + + def _list_possible_device_names() -> List[str]: suffixes = ["f", "g", "h", "i", "j", "k", "l", "m", "n"] return [f"/dev/sd{s}" for s in suffixes] @@ -458,7 +568,10 @@ def _is_public_subnet( return False -def _is_subnet_behind_nat( +_PRIVATE_SUBNET_EGRESS_ROUTE_KEYS = ["NatGatewayId", "TransitGatewayId", "VpcPeeringConnectionId"] + + +def _is_private_subnet_with_internet_egress( ec2_client: botocore.client.BaseClient, vpc_id: str, subnet_id: str, @@ -469,8 +582,9 @@ def _is_subnet_behind_nat( ) for route_table in response["RouteTables"]: for route in route_table["Routes"]: - if "NatGatewayId" in route and route["NatGatewayId"].startswith("nat-"): - return True + if route.get("DestinationCidrBlock") == "0.0.0.0/0": + if any(route.get(k) for k in _PRIVATE_SUBNET_EGRESS_ROUTE_KEYS): + return True # Main route table controls the routing of all subnetes # that are not explicitly associated with any other route table. 
@@ -486,7 +600,107 @@ def _is_subnet_behind_nat( ) for route_table in response["RouteTables"]: for route in route_table["Routes"]: - if "NatGatewayId" in route and route["NatGatewayId"].startswith("nat-"): - return True + if route.get("DestinationCidrBlock") == "0.0.0.0/0": + if any(route.get(k) for k in _PRIVATE_SUBNET_EGRESS_ROUTE_KEYS): + return True return False + + +def _create_network_interfaces_struct( + instance_type: str, + subnet_id: str, + security_group_id: str, + allocate_public_ip: bool, + max_efa_interfaces: int, +) -> List[Dict[str, Any]]: + # AWS does not auto-assign a public IPv4 to instances launched with multiple network + # interfaces ("AssociatePublicIpAddress [...] You cannot specify more than one network + # interface in the request"). For multi-EFA instance types (e.g. p4d, p5, p6, trn1), we + # therefore launch all EFA NICs without `AssociatePublicIpAddress` and, when + # `public_ips: true`, attach an Elastic IP after launch in `update_provisioning_data`. + multi_eni = max_efa_interfaces > 1 + primary_supports_efa = _primary_nic_supports_efa(instance_type) + network_interfaces: List[Dict[str, Any]] = [ + { + "AssociatePublicIpAddress": allocate_public_ip and not multi_eni, + "DeviceIndex": 0, + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": "efa" + if max_efa_interfaces > 0 and primary_supports_efa + else "interface", + }, + ] + + if multi_eni: + last_card_index = max_efa_interfaces + if not primary_supports_efa: + last_card_index += 1 + for i in range(1, last_card_index): + # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication + interface_type = "efa-only" + if instance_type == "p5.48xlarge": + # EFA configuration for P5 instances: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 + interface_type = "efa" if i % 4 == 0 else "efa-only" + network_interfaces.append( + { + "AssociatePublicIpAddress": False, + 
"NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": interface_type, + } + ) + return network_interfaces + + +def _primary_nic_supports_efa(instance_type: str) -> bool: + """For most EFA-supported instance types, primary network card (index 0) supports + attaching both ENA and EFA. But some may support only one interface (ENA), + and all EFA interfaces are placed on the secondary network cards (1..max_efa_interfaces). + """ + return instance_type not in {"p6-b300.48xlarge"} + + +def get_reservation( + ec2_client: botocore.client.BaseClient, + reservation_id: str, + instance_count: int = 0, + instance_types: Optional[List[str]] = None, + is_capacity_block: bool = False, +) -> Optional[Dict[str, Any]]: + filters = [{"Name": "state", "Values": ["active"]}] + if instance_types: + filters.append({"Name": "instance-type", "Values": instance_types}) + try: + response = ec2_client.describe_capacity_reservations( + CapacityReservationIds=[reservation_id], Filters=filters + ) + except botocore.exceptions.ParamValidationError as e: + logger.debug( + "Skipping reservation %s. Parameter validation error: %s", reservation_id, repr(e) + ) + return None + except botocore.exceptions.ClientError as e: + error_code = e.response.get("Error", {}).get("Code") + if error_code == "InvalidCapacityReservationId.Malformed": + logger.debug("Skipping reservation %s. Malformed ID.", reservation_id) + return None + if error_code == "InvalidCapacityReservationId.NotFound": + logger.debug( + "Skipping reservation %s. 
Capacity Reservation not found.", reservation_id + ) + return None + raise + reservation = response["CapacityReservations"][0] + + if instance_count > 0 and reservation["AvailableInstanceCount"] < instance_count: + return None + + if is_capacity_block and reservation["ReservationType"] != "capacity-block": + return None + + return reservation diff --git a/src/dstack/_internal/core/backends/azure/__init__.py b/src/dstack/_internal/core/backends/azure/__init__.py index 49607da4f8..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/azure/__init__.py +++ b/src/dstack/_internal/core/backends/azure/__init__.py @@ -1,20 +0,0 @@ -from dstack._internal.core.backends.azure import auth -from dstack._internal.core.backends.azure.compute import AzureCompute -from dstack._internal.core.backends.azure.config import AzureConfig -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.models.backends.base import BackendType - - -class AzureBackend(Backend): - TYPE: BackendType = BackendType.AZURE - - def __init__(self, config: AzureConfig): - self.config = config - self.credential, _ = auth.authenticate(self.config.creds) - self._compute = AzureCompute( - config=self.config, - credential=self.credential, - ) - - def compute(self) -> AzureCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/azure/auth.py b/src/dstack/_internal/core/backends/azure/auth.py index 2001e33b64..e81105d530 100644 --- a/src/dstack/_internal/core/backends/azure/auth.py +++ b/src/dstack/_internal/core/backends/azure/auth.py @@ -4,13 +4,11 @@ from azure.identity import ClientSecretCredential, DefaultAzureCredential from azure.mgmt.subscription import SubscriptionClient -from dstack._internal.core.errors import BackendAuthError -from dstack._internal.core.models.backends.azure import ( +from dstack._internal.core.backends.azure.models import ( AnyAzureCreds, AzureClientCreds, - AzureDefaultCreds, ) -from dstack._internal.core.models.common import 
is_core_model_instance +from dstack._internal.core.errors import BackendAuthError AzureCredential = Union[ClientSecretCredential, DefaultAzureCredential] @@ -22,7 +20,7 @@ def authenticate(creds: AnyAzureCreds) -> Tuple[AzureCredential, str]: def get_credential(creds: AnyAzureCreds) -> Tuple[AzureCredential, str]: - if is_core_model_instance(creds, AzureClientCreds): + if isinstance(creds, AzureClientCreds): credential = ClientSecretCredential( tenant_id=creds.tenant_id, client_id=creds.client_id, @@ -39,11 +37,3 @@ def check_credential(credential: AzureCredential): list(client.subscriptions.list()) except ClientAuthenticationError: raise BackendAuthError() - - -def default_creds_available() -> bool: - try: - authenticate(AzureDefaultCreds()) - except BackendAuthError: - return False - return True diff --git a/src/dstack/_internal/core/backends/azure/backend.py b/src/dstack/_internal/core/backends/azure/backend.py new file mode 100644 index 0000000000..ec61b80530 --- /dev/null +++ b/src/dstack/_internal/core/backends/azure/backend.py @@ -0,0 +1,21 @@ +from dstack._internal.core.backends.azure import auth +from dstack._internal.core.backends.azure.compute import AzureCompute +from dstack._internal.core.backends.azure.models import AzureConfig +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.models.backends.base import BackendType + + +class AzureBackend(Backend): + TYPE = BackendType.AZURE + COMPUTE_CLASS = AzureCompute + + def __init__(self, config: AzureConfig): + self.config = config + self.credential, _ = auth.authenticate(self.config.creds) + self._compute = AzureCompute( + config=self.config, + credential=self.credential, + ) + + def compute(self) -> AzureCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/azure/compute.py b/src/dstack/_internal/core/backends/azure/compute.py index 66b251c7ec..d2843c7f22 100644 --- a/src/dstack/_internal/core/backends/azure/compute.py +++ 
b/src/dstack/_internal/core/backends/azure/compute.py @@ -1,10 +1,12 @@ import base64 import enum import re -from typing import List, Optional, Tuple +from collections.abc import Iterable +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Dict, List, Optional, Tuple from azure.core.credentials import TokenCredential -from azure.core.exceptions import ResourceExistsError +from azure.core.exceptions import HttpResponseError, ResourceExistsError, ResourceNotFoundError from azure.mgmt import compute as compute_mgmt from azure.mgmt import network as network_mgmt from azure.mgmt.compute.models import ( @@ -31,17 +33,32 @@ VirtualMachinePublicIPAddressConfiguration, ) -from dstack import version +from dstack._internal import settings +from dstack._internal.core.backends.azure import resources as azure_resources from dstack._internal.core.backends.azure import utils as azure_utils -from dstack._internal.core.backends.azure.config import AzureConfig +from dstack._internal.core.backends.azure.models import AzureConfig from dstack._internal.core.backends.base.compute import ( Compute, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPrivilegedSupport, + generate_unique_gateway_instance_name, + generate_unique_instance_name, get_gateway_user_data, - get_instance_name, get_user_data, + merge_tags, + requires_nvidia_proprietary_kernel_modules, ) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.errors import NoCapacityError +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES +from dstack._internal.core.errors import ComputeError, NoCapacityError from dstack._internal.core.models.backends.base import 
BackendType from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, @@ -53,17 +70,29 @@ InstanceOffer, InstanceOfferWithAvailability, InstanceType, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) - - -class AzureCompute(Compute): +# OS disks can be 1GB-4095GB, dstack images are 30GB +CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("30GB"), max=Memory.parse("4095GB")) +DEFAULT_GATEWAY_INSTANCE_TYPE = "Standard_B1ms" + + +class AzureCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithGatewaySupport, + Compute, +): def __init__(self, config: AzureConfig, credential: TokenCredential): + super().__init__() self.config = config self.credential = credential self._compute_client = compute_mgmt.ComputeManagementClient( @@ -73,27 +102,31 @@ def __init__(self, config: AzureConfig, credential: TokenCredential): credential=credential, subscription_id=config.subscription_id ) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.AZURE, - locations=self.config.locations, - requirements=requirements, + locations=self.config.regions, extra_filter=_supported_instances, ) offers_with_availability = _get_offers_with_availability( compute_client=self._compute_client, - config_locations=self.config.locations, + 
config_locations=self.config.regions, offers=offers, ) return offers_with_availability + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] + def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=azure_resources.MAX_RESOURCE_NAME_LEN + ) location = instance_offer.region logger.info( "Requesting %s %s instance in %s...", @@ -103,38 +136,65 @@ def create_instance( ) ssh_pub_keys = instance_config.get_public_keys() disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - vm = _launch_instance( + + allocate_public_ip = self.config.allocate_public_ips + network_resource_group, network, subnet = get_resource_group_network_subnet_or_error( + network_client=self._network_client, + resource_group=self.config.resource_group, + vpc_ids=self.config.vpc_ids, + subnet_ids=self.config.subnet_ids, + location=location, + allocate_public_ip=allocate_public_ip, + ) + network_security_group = azure_utils.get_default_network_security_group_name( + resource_group=self.config.resource_group, + location=location, + ) + + managed_identity_resource_group, managed_identity_name = parse_vm_managed_identity( + self.config.vm_managed_identity + ) + + base_tags = { + "owner": "dstack", + "dstack_project": instance_config.project_name, + "dstack_name": instance_config.instance_name, + "dstack_user": instance_config.user, + } + tags = merge_tags( + base_tags=base_tags, + backend_tags=self.config.tags, + resource_tags=instance_config.tags, + ) + tags = azure_resources.filter_invalid_tags(tags) + + # TODO: Support custom availability_zones. + # Currently, VMs are regional, which means they don't have zone info. 
+ vm = _create_instance_and_wait( compute_client=self._compute_client, subscription_id=self.config.subscription_id, location=location, resource_group=self.config.resource_group, - network_security_group=azure_utils.get_default_network_security_group_name( - resource_group=self.config.resource_group, - location=location, - ), - network=azure_utils.get_default_network_name( - resource_group=self.config.resource_group, - location=location, - ), - subnet=azure_utils.get_default_subnet_name( - resource_group=self.config.resource_group, - location=location, - ), - managed_identity=None, + network_security_group=network_security_group, + network=network, + subnet=subnet, + managed_identity_name=managed_identity_name, + managed_identity_resource_group=managed_identity_resource_group, image_reference=_get_image_ref( compute_client=self._compute_client, location=location, variant=VMImageVariant.from_instance_type(instance_offer.instance), ), vm_size=instance_offer.instance.name, - # instance_name includes region because Azure may create an instance resource - # even when provisioning fails. 
- instance_name=f"{instance_config.instance_name}-{instance_offer.region}", + instance_name=instance_name, user_data=get_user_data(authorized_keys=ssh_pub_keys), ssh_pub_keys=ssh_pub_keys, spot=instance_offer.instance.resources.spot, disk_size=disk_size, computer_name="runnervm", + allocate_public_ip=allocate_public_ip, + network_resource_group=network_resource_group, + tags=tags, ) logger.info("Request succeeded") public_ip, private_ip = _get_vm_public_private_ips( @@ -142,11 +202,14 @@ def create_instance( resource_group=self.config.resource_group, vm=vm, ) + hostname = public_ip + if not allocate_public_ip: + hostname = private_ip return JobProvisioningData( backend=instance_offer.backend, instance_type=instance_offer.instance, instance_id=vm.name, - hostname=public_ip, + hostname=hostname, internal_ip=private_ip, region=location, price=instance_offer.price, @@ -157,26 +220,6 @@ def create_instance( backend_data=None, ) - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) - def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None ): @@ -190,37 +233,70 @@ def create_gateway( self, configuration: GatewayComputeConfiguration, ) -> GatewayProvisioningData: + if configuration.instance_type is not None: + # TODO: support instance_type. 
Requires selecting a VM image to avoid errors like this: + # > The selected VM size 'Standard_E4s_v6' cannot boot Hypervisor Generation '1' + raise ComputeError( + "The `azure` backend does not support the `instance_type`" + " gateway configuration property" + ) logger.info( "Launching %s gateway instance in %s...", configuration.instance_name, configuration.region, ) - vm = _launch_instance( + instance_name = generate_unique_gateway_instance_name( + configuration, max_length=azure_resources.MAX_RESOURCE_NAME_LEN + ) + network_resource_group, network, subnet = get_resource_group_network_subnet_or_error( + network_client=self._network_client, + resource_group=self.config.resource_group, + vpc_ids=self.config.vpc_ids, + subnet_ids=self.config.subnet_ids, + location=configuration.region, + allocate_public_ip=True, + ) + network_security_group = azure_utils.get_default_network_security_group_name( + resource_group=self.config.resource_group, + location=configuration.region, + ) + + base_tags = { + "owner": "dstack", + "dstack_project": configuration.project_name, + "dstack_name": configuration.instance_name, + } + if settings.DSTACK_VERSION is not None: + base_tags["dstack_version"] = settings.DSTACK_VERSION + tags = merge_tags( + base_tags=base_tags, + backend_tags=self.config.tags, + resource_tags=configuration.tags, + ) + tags = azure_resources.filter_invalid_tags(tags) + + vm = _create_instance_and_wait( compute_client=self._compute_client, subscription_id=self.config.subscription_id, location=configuration.region, resource_group=self.config.resource_group, - network_security_group=azure_utils.get_gateway_network_security_group_name( - resource_group=self.config.resource_group, - location=configuration.region, - ), - network=azure_utils.get_default_network_name( - resource_group=self.config.resource_group, - location=configuration.region, - ), - subnet=azure_utils.get_default_subnet_name( - resource_group=self.config.resource_group, - location=configuration.region, 
- ), - managed_identity=None, + network_security_group=network_security_group, + network=network, + subnet=subnet, + managed_identity_name=None, + managed_identity_resource_group=None, image_reference=_get_gateway_image_ref(), - vm_size="Standard_B1s", - instance_name=configuration.instance_name, - user_data=get_gateway_user_data(configuration.ssh_key_pub), + vm_size=DEFAULT_GATEWAY_INSTANCE_TYPE, + instance_name=instance_name, + user_data=get_gateway_user_data( + configuration.ssh_key_pub, router=configuration.router + ), ssh_pub_keys=[configuration.ssh_key_pub], spot=False, disk_size=30, computer_name="gatewayvm", + network_resource_group=network_resource_group, + tags=tags, ) logger.info("Request succeeded") public_ip, _ = _get_vm_public_private_ips( @@ -247,9 +323,110 @@ def terminate_gateway( ) +def get_resource_group_network_subnet_or_error( + network_client: network_mgmt.NetworkManagementClient, + resource_group: Optional[str], + vpc_ids: Optional[Dict[str, str]], + subnet_ids: Optional[Dict[str, str]], + location: str, + allocate_public_ip: bool, +) -> Tuple[str, str, str]: + if subnet_ids is not None and location in subnet_ids: + subnet_id = subnet_ids[location] + try: + net_resource_group, network_name, subnet_name = _parse_config_subnet_id(subnet_id) + except Exception: + raise ComputeError( + "Subnet specified in incorrect format." 
+ " Supported format for `subnet_ids` values: 'networkResourceGroupName/networkName/subnetName'" + ) + try: + subnet = network_client.subnets.get(net_resource_group, network_name, subnet_name) + except ResourceNotFoundError: + raise ComputeError( + f"Subnet {subnet_name} not found in network {network_name}" + f" in resource group {net_resource_group}" + ) + if not allocate_public_ip and not azure_resources.is_eligible_private_subnet( + network_client=network_client, + resource_group=net_resource_group, + network_name=network_name, + subnet=subnet, + ): + raise ComputeError( + f"Subnet {subnet_name} in network {network_name} does not have outbound internet connectivity." + " Ensure a NAT Gateway is attached or VNet peering is configured." + ) + return net_resource_group, network_name, subnet_name + + if vpc_ids is not None: + vpc_id = vpc_ids.get(location) + if vpc_id is None: + raise ComputeError(f"Network not configured for location {location}") + try: + resource_group, network_name = _parse_config_vpc_id(vpc_id) + except Exception: + raise ComputeError( + "Network specified in incorrect format." 
+ " Supported format for `vpc_ids` values: 'networkResourceGroupName/networkName'" + ) + elif resource_group is not None: + network_name = azure_utils.get_default_network_name(resource_group, location) + else: + raise ComputeError("`resource_group` or `vpc_ids` must be specified") + + try: + subnets = azure_resources.get_network_subnets( + network_client=network_client, + resource_group=resource_group, + network_name=network_name, + private=not allocate_public_ip, + ) + except ResourceNotFoundError: + raise ComputeError( + f"Network {network_name} not found in location {location} in resource group {resource_group}" + ) + + if len(subnets) == 0: + if not allocate_public_ip: + raise ComputeError( + f"Failed to find private subnets with outbound internet connectivity in network {network_name}" + ) + raise ComputeError(f"Failed to find subnets in network {network_name}") + + subnet_name = subnets[0] + return resource_group, network_name, subnet_name + + +def parse_vm_managed_identity( + vm_managed_identity: Optional[str], +) -> Tuple[Optional[str], Optional[str]]: + if vm_managed_identity is None: + return None, None + try: + resource_group, managed_identity = vm_managed_identity.split("/") + return resource_group, managed_identity + except Exception: + raise ComputeError( + "`vm_managed_identity` specified in incorrect format." 
+ " Supported format: 'managedIdentityResourceGroup/managedIdentityName'" + ) + + +def _parse_config_vpc_id(vpc_id: str) -> Tuple[str, str]: + resource_group, network_name = vpc_id.split("/") + return resource_group, network_name + + +def _parse_config_subnet_id(subnet_id: str) -> Tuple[str, str, str]: + resource_group, network_name, subnet_name = subnet_id.split("/") + return resource_group, network_name, subnet_name + + class VMImageVariant(enum.Enum): GRID = enum.auto() CUDA = enum.auto() + CUDA_WITH_PROPRIETARY_KERNEL_MODULES = enum.auto() STANDARD = enum.auto() @classmethod @@ -257,31 +434,40 @@ def from_instance_type(cls, instance: InstanceType) -> "VMImageVariant": if "_A10_v5" in instance.name: return cls.GRID elif len(instance.resources.gpus) > 0: - return cls.CUDA + if not requires_nvidia_proprietary_kernel_modules(instance.resources.gpus[0].name): + return cls.CUDA + else: + return cls.CUDA_WITH_PROPRIETARY_KERNEL_MODULES else: return cls.STANDARD def get_image_name(self) -> str: - name = "dstack-" + prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX if self is self.GRID: - name += "grid-" + return f"{prefix}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" elif self is self.CUDA: - name += "cuda-" - name += version.base_image - return name + return f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + elif self is self.CUDA_WITH_PROPRIETARY_KERNEL_MODULES: + return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" + elif self is self.STANDARD: + return f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + else: + raise ValueError(f"Unexpected image variant {self!r}") _SUPPORTED_VM_SERIES_PATTERNS = [ - r"D(\d+)s_v3", # Dsv3-series - r"E(\d+)i?s_v4", # Esv4-series - r"E(\d+)-(\d+)s_v4", # Esv4-series (constrained vCPU) + r"D(\d+)s_v6", # Dsv6-series (general purpose) + r"E(\d+)i?s_v6", # Esv6-series (memory optimized) + r"F(\d+)s_v2", # Fsv2-series (compute optimized) r"NC(\d+)s_v3", # NCv3-series [V100 16GB] 
r"NC(\d+)as_T4_v3", # NCasT4_v3-series [T4] r"ND(\d+)rs_v2", # NDv2-series [8xV100 32GB] r"NV(\d+)adm?s_A10_v5", # NVadsA10 v5-series [A10] r"NC(\d+)ads_A100_v4", # NC A100 v4-series [A100 80GB] + r"NC(\d+)adi?s_H100_v5", # NC H100 v5-series [H100 NVL 94GB] r"ND(\d+)asr_v4", # ND A100 v4-series [8xA100 40GB] r"ND(\d+)amsr_A100_v4", # NDm A100 v4-series [8xA100 80GB] + r"ND(\d+)isr_H200_v5", # ND H200 v5-series [8xH200 141GB] ] _SUPPORTED_VM_SERIES_PATTERN = ( "^Standard_(" + "|".join(f"({s})" for s in _SUPPORTED_VM_SERIES_PATTERNS) + ")$" @@ -301,22 +487,29 @@ def _get_offers_with_availability( offers = [offer for offer in offers if offer.region in config_locations] locations = set(offer.region for offer in offers) - has_quota = set() - for location in locations: + def get_location_quotas(location: str) -> List[str]: + quotas = [] resources = compute_client.resource_skus.list(filter=f"location eq '{location}'") for resource in resources: if resource.resource_type != "virtualMachines" or not _vm_type_available(resource): continue - has_quota.add((resource.name, location)) + quotas.append((resource.name, location)) + return quotas + + has_quota = set() + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [] + for location in locations: + futures.append(executor.submit(get_location_quotas, location)) + for future in as_completed(futures): + has_quota.update(future.result()) offers_with_availability = [] for offer in offers: availability = InstanceAvailability.NO_QUOTA if (offer.instance.name, offer.region) in has_quota: availability = InstanceAvailability.UNKNOWN - offers_with_availability.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) - ) + offers_with_availability.append(offer.with_availability(availability=availability)) return offers_with_availability @@ -336,6 +529,13 @@ def _get_image_ref( location: str, variant: VMImageVariant, ) -> ImageReference: + if settings.DSTACK_VM_BASE_IMAGE_PREFIX: + # Staging images are 
not published to the community gallery, so reference directly. + image = compute_client.images.get( + resource_group_name="dstack-resources-westeurope", + image_name=variant.get_image_name(), + ) + return ImageReference(id=image.id) image = compute_client.community_gallery_images.get( location=location, public_gallery_name="dstack-ebac134d-04b9-4c2b-8b6c-ad3e73904aa7", # Gen2 @@ -353,7 +553,7 @@ def _get_gateway_image_ref() -> ImageReference: ) -def _launch_instance( +def _begin_create_instance( compute_client: compute_mgmt.ComputeManagementClient, subscription_id: str, location: str, @@ -361,7 +561,8 @@ def _launch_instance( network_security_group: str, network: str, subnet: str, - managed_identity: Optional[str], + managed_identity_name: Optional[str], + managed_identity_resource_group: Optional[str], image_reference: ImageReference, vm_size: str, instance_name: str, @@ -370,7 +571,34 @@ def _launch_instance( spot: bool, disk_size: int, computer_name: str, -) -> VirtualMachine: + allocate_public_ip: bool = True, + network_resource_group: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, +): + """Starts VM creation and returns immediately. 
The VM is created asynchronously.""" + if tags is None: + tags = {} + if network_resource_group is None: + network_resource_group = resource_group + public_ip_address_configuration = None + if allocate_public_ip: + public_ip_address_configuration = VirtualMachinePublicIPAddressConfiguration( + name="public_ip_config", + ) + managed_identity = None + if managed_identity_name is not None: + if managed_identity_resource_group is None: + managed_identity_resource_group = resource_group + managed_identity = VirtualMachineIdentity( + type=ResourceIdentityType.USER_ASSIGNED, + user_assigned_identities={ + azure_utils.get_managed_identity_id( + subscription_id, + managed_identity_resource_group, + managed_identity_name, + ): UserAssignedIdentitiesValue(), + }, + ) try: poller = compute_client.virtual_machines.begin_create_or_update( resource_group, @@ -422,14 +650,12 @@ def _launch_instance( subnet=SubResource( id=azure_utils.get_subnet_id( subscription_id, - resource_group, + network_resource_group, network, subnet, ) ), - public_ip_address_configuration=VirtualMachinePublicIPAddressConfiguration( - name="public_ip_config", - ), + public_ip_address_configuration=public_ip_address_configuration, ) ], ) @@ -437,17 +663,9 @@ def _launch_instance( ), priority="Spot" if spot else "Regular", eviction_policy="Delete" if spot else None, - identity=None - if managed_identity is None - else VirtualMachineIdentity( - type=ResourceIdentityType.USER_ASSIGNED, - user_assigned_identities={ - azure_utils.get_managed_identity_id( - subscription_id, resource_group, managed_identity - ): UserAssignedIdentitiesValue() - }, - ), + identity=managed_identity, user_data=base64.b64encode(user_data.encode()).decode(), + tags=tags, ), ) except ResourceExistsError as e: @@ -456,7 +674,93 @@ def _launch_instance( message = e.error.message if e.error.message is not None else "" raise NoCapacityError(message) raise e - vm = poller.result() + return poller + + +def _create_instance_and_wait( + 
compute_client: compute_mgmt.ComputeManagementClient, + subscription_id: str, + location: str, + resource_group: str, + network_security_group: str, + network: str, + subnet: str, + managed_identity_name: Optional[str], + managed_identity_resource_group: Optional[str], + image_reference: ImageReference, + vm_size: str, + instance_name: str, + user_data: str, + ssh_pub_keys: List[str], + spot: bool, + disk_size: int, + computer_name: str, + allocate_public_ip: bool = True, + network_resource_group: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, +) -> VirtualMachine: + """Blocking version used for gateway provisioning where IP is needed immediately.""" + poller = _begin_create_instance( + compute_client=compute_client, + subscription_id=subscription_id, + location=location, + resource_group=resource_group, + network_security_group=network_security_group, + network=network, + subnet=subnet, + managed_identity_name=managed_identity_name, + managed_identity_resource_group=managed_identity_resource_group, + image_reference=image_reference, + vm_size=vm_size, + instance_name=instance_name, + user_data=user_data, + ssh_pub_keys=ssh_pub_keys, + spot=spot, + disk_size=disk_size, + computer_name=computer_name, + allocate_public_ip=allocate_public_ip, + network_resource_group=network_resource_group, + tags=tags, + ) + try: + vm = poller.result(timeout=600) + except HttpResponseError as e: + # Azure may create a VM resource even when provisioning fails (e.g., AllocationFailed). + # Clean it up to avoid orphan VMs. + logger.warning( + "Instance %s provisioning failed: %s. 
Cleaning up.", + instance_name, + repr(e), + ) + _terminate_instance( + compute_client=compute_client, + resource_group=resource_group, + instance_name=instance_name, + ) + if e.error is not None and e.error.code in ( + "AllocationFailed", + "OverconstrainedAllocationRequest", + ): + raise NoCapacityError(e.error.message or str(e)) + raise + if not poller.done(): + logger.error( + "Timed out waiting for instance %s launch. The instance will be terminated.", + instance_name, + ) + _terminate_instance( + compute_client=compute_client, + resource_group=resource_group, + instance_name=instance_name, + ) + raise ComputeError(f"Timed out waiting for instance {instance_name} launch") + if (vm.provisioning_state or "").lower() == "failed": + _terminate_instance( + compute_client=compute_client, + resource_group=resource_group, + instance_name=instance_name, + ) + raise NoCapacityError(f"VM {instance_name} provisioning failed") return vm @@ -464,18 +768,21 @@ def _get_vm_public_private_ips( network_client: network_mgmt.NetworkManagementClient, resource_group: str, vm: VirtualMachine, -) -> Tuple[str, str]: +) -> Tuple[Optional[str], str]: nic_id = vm.network_profile.network_interfaces[0].id nic_name = azure_utils.get_resource_name_from_resource_id(nic_id) nic = network_client.network_interfaces.get( resource_group_name=resource_group, network_interface_name=nic_name, ) + + private_ip = nic.ip_configurations[0].private_ip_address + if nic.ip_configurations[0].public_ip_address is None: + return None, private_ip + public_ip_id = nic.ip_configurations[0].public_ip_address.id public_ip_name = azure_utils.get_resource_name_from_resource_id(public_ip_id) public_ip = network_client.public_ip_addresses.get(resource_group, public_ip_name) - - private_ip = nic.ip_configurations[0].private_ip_address return public_ip.ip_address, private_ip diff --git a/src/dstack/_internal/core/backends/azure/config.py b/src/dstack/_internal/core/backends/azure/config.py deleted file mode 100644 index 
4e7cff268f..0000000000 --- a/src/dstack/_internal/core/backends/azure/config.py +++ /dev/null @@ -1,6 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.azure import AnyAzureCreds, AzureStoredConfig - - -class AzureConfig(AzureStoredConfig, BackendConfig): - creds: AnyAzureCreds diff --git a/src/dstack/_internal/core/backends/azure/configurator.py b/src/dstack/_internal/core/backends/azure/configurator.py new file mode 100644 index 0000000000..d3cf0649d0 --- /dev/null +++ b/src/dstack/_internal/core/backends/azure/configurator.py @@ -0,0 +1,488 @@ +import json +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Optional, Tuple + +import azure.core.exceptions +from azure.core.credentials import TokenCredential +from azure.mgmt import msi as msi_mgmt +from azure.mgmt import network as network_mgmt +from azure.mgmt import resource as resource_mgmt +from azure.mgmt import subscription as subscription_mgmt +from azure.mgmt.network.models import ( + AddressSpace, + NetworkSecurityGroup, + SecurityRule, + SecurityRuleAccess, + SecurityRuleDirection, + SecurityRuleProtocol, + Subnet, + VirtualNetwork, +) +from azure.mgmt.resource.resources.models import ResourceGroup + +from dstack._internal.core.backends.azure import auth, compute, resources +from dstack._internal.core.backends.azure import utils as azure_utils +from dstack._internal.core.backends.azure.backend import AzureBackend +from dstack._internal.core.backends.azure.models import ( + AzureBackendConfig, + AzureBackendConfigWithCreds, + AzureClientCreds, + AzureConfig, + AzureCreds, + AzureDefaultCreds, + AzureStoredConfig, +) +from dstack._internal.core.backends.base.configurator import ( + TAGS_MAX_NUM, + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.errors import ( + BackendAuthError, + BackendError, + ServerClientError, +) +from 
dstack._internal.core.models.backends.base import ( + BackendType, +) + +LOCATIONS = [ + ("(US) Central US", "centralus"), + ("(US) East US, Virginia", "eastus"), + ("(US) East US 2, Virginia", "eastus2"), + ("(US) South Central US, Texas", "southcentralus"), + ("(US) West US 2, Washington", "westus2"), + ("(US) West US 3, Phoenix", "westus3"), + ("(Canada) Canada Central, Toronto", "canadacentral"), + ("(Europe) France Central, Paris", "francecentral"), + ("(Europe) Germany West Central, Frankfurt", "germanywestcentral"), + ("(Europe) North Europe, Ireland", "northeurope"), + ("(Europe) Sweden Central, Gävle", "swedencentral"), + ("(Europe) UK South, London", "uksouth"), + ("(Europe) West Europe", "westeurope"), + ("(Asia Pacific) Southeast Asia, Singapore", "southeastasia"), + ("(Asia Pacific) East Asia", "eastasia"), + ("(South America) Brazil South", "brazilsouth"), +] +LOCATION_VALUES = [loc[1] for loc in LOCATIONS] +DEFAULT_LOCATIONS = LOCATION_VALUES +MAIN_LOCATION = "eastus" + + +class AzureConfigurator( + Configurator[ + AzureBackendConfig, + AzureBackendConfigWithCreds, + ] +): + TYPE = BackendType.AZURE + BACKEND_CLASS = AzureBackend + + def validate_config(self, config: AzureBackendConfigWithCreds, default_creds_enabled: bool): + if isinstance(config.creds, AzureDefaultCreds) and not default_creds_enabled: + raise_invalid_credentials_error(fields=[["creds"]]) + if isinstance(config.creds, AzureClientCreds): + self._set_client_creds_tenant_id(config.creds, config.tenant_id) + try: + credential, _ = auth.authenticate(config.creds) + except BackendAuthError: + if isinstance(config.creds, AzureClientCreds): + raise_invalid_credentials_error( + fields=[ + ["creds", "tenant_id"], + ["creds", "client_id"], + ["creds", "client_secret"], + ] + ) + else: + raise_invalid_credentials_error(fields=[["creds"]]) + self._check_config_tenant_id(config=config, credential=credential) + self._check_config_subscription_id(config=config, credential=credential) + 
self._check_config_locations(config) + self._check_config_tags(config) + self._check_config_resource_group(config=config, credential=credential) + self._check_config_vm_managed_identity(config=config, credential=credential) + self._check_config_vpc(config=config, credential=credential) + + def create_backend( + self, project_name: str, config: AzureBackendConfigWithCreds + ) -> BackendRecord: + if config.regions is None: + config.regions = DEFAULT_LOCATIONS + if isinstance(config.creds, AzureClientCreds): + self._set_client_creds_tenant_id(config.creds, config.tenant_id) + credential, _ = auth.authenticate(config.creds) + if config.resource_group is None: + config.resource_group = self._create_resource_group( + credential=credential, + subscription_id=config.subscription_id, + location=MAIN_LOCATION, + project_name=project_name, + ) + self._create_network_resources( + credential=credential, + subscription_id=config.subscription_id, + resource_group=config.resource_group, + locations=config.regions, + create_default_network=config.vpc_ids is None and config.subnet_ids is None, + ) + return BackendRecord( + config=AzureStoredConfig( + **AzureBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=AzureCreds.parse_obj(config.creds).__root__.json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> AzureBackendConfigWithCreds: + config = self._get_config(record) + return AzureBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> AzureBackendConfig: + config = self._get_config(record) + return AzureBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> AzureBackend: + config = self._get_config(record) + return AzureBackend(config=config) + + def _get_config(self, record: BackendRecord) -> AzureConfig: + config_dict = json.loads(record.config) + regions = config_dict.pop("regions", None) + if regions is None: + # Legacy 
config stores regions as locations + regions = config_dict.pop("locations") + return AzureConfig.__response__( + **config_dict, + regions=regions, + creds=AzureCreds.parse_raw(record.auth).__root__, + ) + + def _check_config_tenant_id( + self, config: AzureBackendConfigWithCreds, credential: auth.AzureCredential + ): + subscription_client = subscription_mgmt.SubscriptionClient(credential=credential) + tenant_ids = [] + for tenant in subscription_client.tenants.list(): + tenant_ids.append(tenant.tenant_id) + if config.tenant_id not in tenant_ids: + raise ServerClientError( + "Invalid tenant_id", + fields=[["tenant_id"]], + ) + + def _check_config_subscription_id( + self, config: AzureBackendConfigWithCreds, credential: auth.AzureCredential + ): + subscription_client = subscription_mgmt.SubscriptionClient(credential=credential) + subscription_ids = [] + for subscription in subscription_client.subscriptions.list(): + subscription_ids.append(subscription.subscription_id) + if config.subscription_id not in subscription_ids: + raise ServerClientError( + "Invalid subscription_id", + fields=[["subscription_id"]], + ) + if len(subscription_ids) == 0: + # Credentials without granted roles don't see any subscriptions + raise ServerClientError( + msg="No Azure subscriptions found for provided credentials. Ensure the account has enough permissions.", + ) + + def _check_config_locations(self, config: AzureBackendConfigWithCreds): + if config.regions is None: + return + for location in config.regions: + if location not in LOCATION_VALUES: + raise ServerClientError(f"Unknown Azure location {location}") + + def _check_config_tags(self, config: AzureBackendConfigWithCreds): + if not config.tags: + return + if len(config.tags) > TAGS_MAX_NUM: + raise ServerClientError( + f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed." 
+ ) + try: + resources.validate_tags(config.tags) + except BackendError as e: + raise ServerClientError(e.args[0]) + + def _check_config_resource_group( + self, config: AzureBackendConfigWithCreds, credential: auth.AzureCredential + ): + if config.resource_group is None: + return + resource_manager = ResourceManager( + credential=credential, + subscription_id=config.subscription_id, + ) + if not resource_manager.resource_group_exists(config.resource_group): + raise ServerClientError(f"Resource group {config.resource_group} not found") + + def _check_config_vpc( + self, config: AzureBackendConfigWithCreds, credential: auth.AzureCredential + ): + if config.subscription_id is None: + return None + allocate_public_ip = config.public_ips if config.public_ips is not None else True + if config.public_ips is False and config.vpc_ids is None and config.subnet_ids is None: + raise ServerClientError( + msg="`vpc_ids` or `subnet_ids` must be specified if `public_ips: false`." + ) + if config.vpc_ids is not None and config.subnet_ids is not None: + overlap = sorted(set(config.vpc_ids.keys()) & set(config.subnet_ids.keys())) + if overlap: + raise ServerClientError( + f"Regions {overlap} are configured in both `vpc_ids` and `subnet_ids`." + " Each region must be specified in only one of them." + ) + locations = config.regions + if locations is None: + locations = DEFAULT_LOCATIONS + if config.vpc_ids is not None or config.subnet_ids is not None: + configured_locations = set() + if config.vpc_ids is not None: + configured_locations |= set(config.vpc_ids.keys()) + if config.subnet_ids is not None: + configured_locations |= set(config.subnet_ids.keys()) + not_configured_locations = [ + loc for loc in locations if loc not in configured_locations + ] + if len(not_configured_locations) > 0: + if config.regions is None: + raise ServerClientError( + f"Networking not configured for regions {not_configured_locations}. 
" + "Configure either `vpc_ids` or `subnet_ids` for all regions or specify `regions`." + ) + raise ServerClientError( + f"Networking not configured for regions {not_configured_locations}. " + "Configure either `vpc_ids` or `subnet_ids` for all regions specified in `regions`." + ) + network_client = network_mgmt.NetworkManagementClient( + credential=credential, + subscription_id=config.subscription_id, + ) + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [] + for location in locations: + future = executor.submit( + compute.get_resource_group_network_subnet_or_error, + network_client=network_client, + resource_group=None, + vpc_ids=config.vpc_ids, + subnet_ids=config.subnet_ids, + location=location, + allocate_public_ip=allocate_public_ip, + ) + futures.append(future) + for future in as_completed(futures): + try: + future.result() + except BackendError as e: + raise ServerClientError(e.args[0]) + + def _check_config_vm_managed_identity( + self, config: AzureBackendConfigWithCreds, credential: auth.AzureCredential + ): + try: + resource_group, identity_name = compute.parse_vm_managed_identity( + config.vm_managed_identity + ) + except BackendError as e: + raise ServerClientError(e.args[0]) + if resource_group is None or identity_name is None: + return + msi_client = msi_mgmt.ManagedServiceIdentityClient(credential, config.subscription_id) + try: + msi_client.user_assigned_identities.get(resource_group, identity_name) + except azure.core.exceptions.ResourceNotFoundError: + raise ServerClientError( + f"Managed identity {identity_name} not found in resource group {resource_group}" + ) + + def _set_client_creds_tenant_id( + self, + creds: AzureClientCreds, + tenant_id: Optional[str], + ): + if creds.tenant_id is not None: + return + if tenant_id is None: + raise_invalid_credentials_error( + fields=[ + ["creds", "tenant_id"], + ["tenant_id"], + ] + ) + creds.tenant_id = tenant_id + + def _create_resource_group( + self, + credential: auth.AzureCredential, + 
subscription_id: str, + location: str, + project_name: str, + ) -> str: + resource_manager = ResourceManager( + credential=credential, + subscription_id=subscription_id, + ) + return resource_manager.create_resource_group( + name=_get_resource_group_name(project_name), + location=location, + ) + + def _create_network_resources( + self, + credential: auth.AzureCredential, + subscription_id: str, + resource_group: str, + locations: List[str], + create_default_network: bool, + ): + def func(location: str): + network_manager = NetworkManager( + credential=credential, subscription_id=subscription_id + ) + if create_default_network: + network_manager.create_virtual_network( + resource_group=resource_group, + location=location, + name=azure_utils.get_default_network_name(resource_group, location), + subnet_name=azure_utils.get_default_subnet_name(resource_group, location), + ) + network_manager.create_network_security_group( + resource_group=resource_group, + location=location, + name=azure_utils.get_default_network_security_group_name(resource_group, location), + ) + network_manager.create_gateway_network_security_group( + resource_group=resource_group, + location=location, + name=azure_utils.get_gateway_network_security_group_name(resource_group, location), + ) + + with ThreadPoolExecutor(max_workers=8) as executor: + for location in locations: + executor.submit(func, location) + + +def _get_resource_group_name(project_name: str) -> str: + return f"dstack-{project_name}" + + +class ResourceManager: + def __init__(self, credential: TokenCredential, subscription_id: str): + self.resource_client = resource_mgmt.ResourceManagementClient( + credential=credential, subscription_id=subscription_id + ) + + def create_resource_group( + self, + name: str, + location: str, + ) -> str: + resource_group: ResourceGroup = self.resource_client.resource_groups.create_or_update( + resource_group_name=name, + parameters=ResourceGroup( + location=location, + ), + ) + return 
resource_group.name + + def resource_group_exists( + self, + name: str, + ) -> bool: + try: + self.resource_client.resource_groups.get( + resource_group_name=name, + ) + except azure.core.exceptions.ResourceNotFoundError: + return False + return True + + +class NetworkManager: + def __init__(self, credential: TokenCredential, subscription_id: str): + self.network_client = network_mgmt.NetworkManagementClient( + credential=credential, subscription_id=subscription_id + ) + + def create_virtual_network( + self, + resource_group: str, + name: str, + subnet_name: str, + location: str, + ) -> Tuple[str, str]: + network: VirtualNetwork = self.network_client.virtual_networks.begin_create_or_update( + resource_group_name=resource_group, + virtual_network_name=name, + parameters=VirtualNetwork( + location=location, + address_space=AddressSpace(address_prefixes=["10.0.0.0/16"]), + subnets=[ + Subnet( + name=subnet_name, + address_prefix="10.0.0.0/20", + ) + ], + ), + ).result() + return network.name, subnet_name + + def create_network_security_group( + self, + resource_group: str, + location: str, + name: str, + ): + self.network_client.network_security_groups.begin_create_or_update( + resource_group_name=resource_group, + network_security_group_name=name, + parameters=NetworkSecurityGroup( + location=location, + security_rules=[ + SecurityRule( + name="runner_ssh", + protocol=SecurityRuleProtocol.TCP, + source_address_prefix="Internet", + source_port_range="*", + destination_address_prefix="*", + destination_port_range="22", + access=SecurityRuleAccess.ALLOW, + priority=100, + direction=SecurityRuleDirection.INBOUND, + ), + ], + ), + ).result() + + def create_gateway_network_security_group( + self, + resource_group: str, + location: str, + name: str, + ): + self.network_client.network_security_groups.begin_create_or_update( + resource_group_name=resource_group, + network_security_group_name=name, + parameters=NetworkSecurityGroup( + location=location, + security_rules=[ + 
SecurityRule( + name="gateway_all", + protocol=SecurityRuleProtocol.TCP, + source_address_prefix="Internet", + source_port_range="*", + destination_address_prefix="*", + destination_port_ranges=["22", "80", "443"], + access=SecurityRuleAccess.ALLOW, + priority=101, + direction=SecurityRuleDirection.INBOUND, + ) + ], + ), + ).result() diff --git a/src/dstack/_internal/core/backends/azure/models.py b/src/dstack/_internal/core/backends/azure/models.py new file mode 100644 index 0000000000..9881edcaea --- /dev/null +++ b/src/dstack/_internal/core/backends/azure/models.py @@ -0,0 +1,108 @@ +from typing import Annotated, Dict, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class AzureClientCreds(CoreModel): + type: Annotated[Literal["client"], Field(description="The type of credentials")] = "client" + client_id: Annotated[str, Field(description="The client ID")] + client_secret: Annotated[str, Field(description="The client secret")] + # if tenant_id is missing, it will be populated from config info + tenant_id: Optional[str] + + +class AzureDefaultCreds(CoreModel): + type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" + + +AnyAzureCreds = Union[AzureClientCreds, AzureDefaultCreds] + + +class AzureCreds(CoreModel): + __root__: AnyAzureCreds = Field(..., discriminator="type") + + +class AzureBackendConfig(CoreModel): + type: Annotated[Literal["azure"], Field(description="The type of the backend")] = "azure" + tenant_id: Annotated[str, Field(description="The tenant ID")] + subscription_id: Annotated[str, Field(description="The subscription ID")] + resource_group: Annotated[ + Optional[str], + Field( + description=( + "The resource group for resources created by `dstack`." 
+ " If not specified, `dstack` will create a new resource group" + ) + ), + ] = None + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Azure regions (locations). Omit to use all regions"), + ] = None + vpc_ids: Annotated[ + Optional[Dict[str, str]], + Field( + description=( + "The mapping from configured Azure locations to network IDs." + " A network ID must have a format `networkResourceGroup/networkName`" + " If not specified, `dstack` will create a new network for every configured region" + ) + ), + ] = None + subnet_ids: Annotated[ + Optional[Dict[str, str]], + Field( + description=( + "The mapping from configured Azure locations to subnet IDs." + " A subnet ID must have a format `networkResourceGroup/networkName/subnetName`." + " Cannot be configured for the same region as `vpc_ids`" + ) + ), + ] = None + public_ips: Annotated[ + Optional[bool], + Field( + description=( + "A flag to enable/disable public IP assigning on instances." + " `public_ips: false` requires `vpc_ids` or `subnet_ids` that specifies custom networks" + " with outbound internet connectivity provided by NAT Gateway or other mechanism." + " Defaults to `true`" + ) + ), + ] = None + vm_managed_identity: Annotated[ + Optional[str], + Field( + description=( + "The managed identity to associate with provisioned VMs." 
+ " Must have a format `managedIdentityResourceGroup/managedIdentityName`" + ) + ), + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field(description="The tags that will be assigned to resources created by `dstack`"), + ] = None + + +class AzureBackendConfigWithCreds(AzureBackendConfig): + creds: AnyAzureCreds = Field(..., description="The credentials", discriminator="type") + + +AnyAzureBackendConfig = Union[AzureBackendConfig, AzureBackendConfigWithCreds] + + +class AzureStoredConfig(AzureBackendConfig): + resource_group: str = "" + + +class AzureConfig(AzureStoredConfig): + creds: AnyAzureCreds + + @property + def allocate_public_ips(self) -> bool: + if self.public_ips is not None: + return self.public_ips + return True diff --git a/src/dstack/_internal/core/backends/azure/resources.py b/src/dstack/_internal/core/backends/azure/resources.py new file mode 100644 index 0000000000..7e34e8c4ea --- /dev/null +++ b/src/dstack/_internal/core/backends/azure/resources.py @@ -0,0 +1,116 @@ +import re +from typing import Dict, List + +from azure.mgmt import network as network_mgmt +from azure.mgmt.network.models import Subnet + +from dstack._internal.core.errors import BackendError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +MAX_RESOURCE_NAME_LEN = 64 + + +def get_network_subnets( + network_client: network_mgmt.NetworkManagementClient, + resource_group: str, + network_name: str, + private: bool, +) -> List[str]: + res = [] + subnets = network_client.subnets.list( + resource_group_name=resource_group, virtual_network_name=network_name + ) + for subnet in subnets: + if private: + if is_eligible_private_subnet( + network_client=network_client, + resource_group=resource_group, + network_name=network_name, + subnet=subnet, + ): + res.append(subnet.name) + else: + if _is_eligible_public_subnet( + network_client=network_client, + resource_group=resource_group, + network_name=network_name, + subnet=subnet, + ): + 
res.append(subnet.name) + return res + + +def _is_eligible_public_subnet( + network_client: network_mgmt.NetworkManagementClient, + resource_group: str, + network_name: str, + subnet: Subnet, +) -> bool: + # Apparently, in Azure practically any subnet can be used + # to provision instances with public IPs + return True + + +def is_eligible_private_subnet( + network_client: network_mgmt.NetworkManagementClient, + resource_group: str, + network_name: str, + subnet: Subnet, +) -> bool: + # Azure provides default outbound connectivity but it's deprecated + # and does not work with Flexible orchestration used in dstack, + # so we require an explicit outbound method such as NAT Gateway. + + if subnet.nat_gateway is not None: + return True + + vnet_peerings = list( + network_client.virtual_network_peerings.list( + resource_group_name=resource_group, + virtual_network_name=network_name, + ) + ) + if len(vnet_peerings) > 0: + # We currently assume that any peering can provide outbound connectivity. + # There can be a more elaborate check of the peering configuration. + return True + + return False + + +def filter_invalid_tags(tags: Dict[str, str]) -> Dict[str, str]: + filtered_tags = {} + for k, v in tags.items(): + if not _is_valid_tag(k, v): + logger.warning("Skipping invalid tag '%s: %s'", k, v) + continue + filtered_tags[k] = v + return filtered_tags + + +def validate_tags(tags: Dict[str, str]): + for k, v in tags.items(): + if not _is_valid_tag(k, v): + raise BackendError( + "Invalid Azure resource tags. 
" + "See tags restrictions: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/tag-resources#limitations" + ) + + +def _is_valid_tag(key: str, value: str) -> bool: + return _is_valid_tag_key(key) and _is_valid_tag_value(value) + + +TAG_KEY_PATTERN = re.compile(r"^(?!.*[<>&\\%?\/]).{1,512}$") +TAG_VALUE_PATTERN = re.compile(r".{0,256}$") + + +def _is_valid_tag_key(key: str) -> bool: + return TAG_KEY_PATTERN.match(key) is not None + + +def _is_valid_tag_value(value: str) -> bool: + return TAG_VALUE_PATTERN.match(value) is not None diff --git a/src/dstack/_internal/core/backends/base/__init__.py b/src/dstack/_internal/core/backends/base/__init__.py index 42b779169f..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/base/__init__.py +++ b/src/dstack/_internal/core/backends/base/__init__.py @@ -1,12 +0,0 @@ -from abc import abstractmethod - -from dstack._internal.core.backends.base.compute import Compute -from dstack._internal.core.models.backends.base import BackendType - - -class Backend: - TYPE: BackendType - - @abstractmethod - def compute(self) -> Compute: - pass diff --git a/src/dstack/_internal/core/backends/base/backend.py b/src/dstack/_internal/core/backends/base/backend.py new file mode 100644 index 0000000000..ee19fd9897 --- /dev/null +++ b/src/dstack/_internal/core/backends/base/backend.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from typing import ClassVar + +from dstack._internal.core.backends.base.compute import Compute +from dstack._internal.core.models.backends.base import BackendType + + +class Backend(ABC): + TYPE: ClassVar[BackendType] + # `COMPUTE_CLASS` is used to introspect compute features without initializing it. + COMPUTE_CLASS: ClassVar[type[Compute]] + + @abstractmethod + def compute(self) -> Compute: + """ + Returns Compute instance. 
+ """ + pass diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index 5db45aa18e..8d57f73d6a 100644 --- a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -1,40 +1,121 @@ import os +import random import re +import shlex +import string +import threading from abc import ABC, abstractmethod +from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field +from enum import Enum from functools import lru_cache -from typing import Dict, List, Optional +from pathlib import Path +from typing import Callable, Dict, List, Optional import git import requests import yaml +from cachetools import Cache, TTLCache, cachedmethod +from gpuhunt import CPUArchitecture from dstack._internal import settings +from dstack._internal.core.backends.base.models import JobConfiguration +from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements +from dstack._internal.core.consts import ( + DSTACK_RUNNER_HTTP_PORT, + DSTACK_RUNNER_SSH_PORT, + DSTACK_SHIM_HTTP_PORT, +) +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, ) from dstack._internal.core.models.instances import ( InstanceConfiguration, + InstanceOffer, InstanceOfferWithAvailability, + SSHKey, ) +from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData +from dstack._internal.core.models.routers import AnyGatewayRouterConfig from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run from dstack._internal.core.models.volumes import ( Volume, VolumeAttachmentData, VolumeProvisioningData, ) +from dstack._internal.core.services import 
is_valid_dstack_resource_name from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import PathLike logger = get_logger(__name__) -DSTACK_WORKING_DIR = "/root/.dstack" +DSTACK_SHIM_BINARY_NAME = "dstack-shim" +DSTACK_SHIM_RESTART_INTERVAL_SECONDS = 3 +DSTACK_RUNNER_BINARY_NAME = "dstack-runner" +DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16") +NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES = frozenset( + # All NVIDIA architectures prior to Turing do not support Open Kernel Modules and require + # proprietary modules. This list is incomplete, update when necessary. + [ + "v100", + "p100", + "p40", + "p4", + "m60", + "m40", + "m4", + "k80", + "k40", + "k20", + ] +) + + +class GoArchType(str, Enum): + """ + A subset of GOARCH values + """ + + AMD64 = "amd64" + ARM64 = "arm64" + + def to_cpu_architecture(self) -> CPUArchitecture: + if self == self.AMD64: + return CPUArchitecture.X86 + if self == self.ARM64: + return CPUArchitecture.ARM + assert False, self + + +@dataclass +class ComputeCache: + cache: Cache + lock: threading.Lock = field(default_factory=threading.Lock) + + +@dataclass +class ComputeTTLCache: + cache: TTLCache + lock: threading.Lock = field(default_factory=threading.Lock) class Compute(ABC): + """ + A base class for all compute implementations with minimal features. + If a compute supports additional features, it must also subclass `ComputeWith*` classes. + """ + @abstractmethod - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_offers(self, requirements: Requirements) -> Iterator[InstanceOfferWithAvailability]: + """ + Returns offers with availability matching `requirements`. + If the provider is added to gpuhunt, typically gets offers using + `base.offers.get_catalog_offers()` and extends them with availability info. + It is called from async code in executor. It can block on call but not between yields. 
+ """ pass @abstractmethod @@ -46,6 +127,7 @@ def run_job( project_ssh_public_key: str, project_ssh_private_key: str, volumes: List[Volume], + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: """ Launches a new instance for the job. It should return `JobProvisioningData` ASAP. @@ -62,39 +144,334 @@ def terminate_instance( backend_data: Optional[str] = None, ) -> None: """ - Terminates an instance by `instance_id`. If instance does not exist, - it should not raise errors but return silently. + Terminates an instance by `instance_id`. + If the instance does not exist, it should not raise errors but return silently. + + Should return ASAP. If required to wait for some operation, raise `NotYetTerminated`. + In this case, the method will be called again after a few seconds. + """ + pass + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + """ + This method is called if `JobProvisioningData` returned from `run_job()`/`create_instance()` + is not complete, e.g. missing `hostname` or `ssh_port`. + It can be used if getting complete provisioning data takes a long of time. + It should not wait but return immediately. + If it raises `ProvisioningError`, there will be no further attempts to update the provisioning data, + and the run will be terminated. + """ + pass + + +class ComputeWithAllOffersCached(ABC): + """ + Provides common `get_offers()` implementation for backends + whose offers do not depend on requirements. + It caches all offers with availability and post-filters by requirements. + """ + + def __init__(self) -> None: + super().__init__() + self._offers_cache_lock = threading.Lock() + self._offers_cache_execution_lock = threading.Lock() + self._offers_cache = TTLCache(maxsize=1, ttl=180) + + @abstractmethod + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + """ + Returns all backend offers with availability. 
+ """ + pass + + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + """ + Returns functions that modify offers before they are filtered by requirements. + A modifier function can return `None` to exclude the offer. + E.g. can be used to set appropriate disk size based on requirements. + """ + return [] + + def get_offers_post_filter( + self, requirements: Requirements + ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]: + """ + Returns a filter function to apply to offers based on requirements. + This allows backends to implement custom post-filtering logic for specific requirements. + """ + return None + + def get_offers(self, requirements: Requirements) -> Iterator[InstanceOfferWithAvailability]: + with self._offers_cache_execution_lock: + # Cache lock does not prevent concurrent execution. + # We use a separate lock to avoid requesting offers in parallel, re-doing the work and hitting rate limits. + cached_offers = self._get_all_offers_with_availability_cached() + offers = self.__apply_modifiers(cached_offers, self.get_offers_modifiers(requirements)) + offers = filter_offers_by_requirements(offers, requirements) + post_filter = self.get_offers_post_filter(requirements) + if post_filter is not None: + offers = (o for o in offers if post_filter(o)) + return offers + + @cachedmethod( + cache=lambda self: self._offers_cache, + lock=lambda self: self._offers_cache_lock, + ) + def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]: + return self.get_all_offers_with_availability() + + @staticmethod + def __apply_modifiers( + offers: Iterable[InstanceOfferWithAvailability], modifiers: Iterable[OfferModifier] + ) -> Iterator[InstanceOfferWithAvailability]: + for offer in offers: + for modifier in modifiers: + offer = modifier(offer) + if offer is None: + break + else: + yield offer + + +class ComputeWithFilteredOffersCached(ABC): + """ + Provides common `get_offers()` implementation for 
backends + whose offers depend on requirements. + It caches offers using requirements as key. + """ + + def __init__(self) -> None: + super().__init__() + self._offers_cache_lock = threading.Lock() + self._offers_cache = TTLCache(maxsize=10, ttl=180) + + @abstractmethod + def get_offers_by_requirements( + self, requirements: Requirements + ) -> List[InstanceOfferWithAvailability]: + """ + Returns backend offers with availability matching requirements. """ pass + def get_offers(self, requirements: Requirements) -> Iterator[InstanceOfferWithAvailability]: + return iter(self._get_offers_cached(requirements)) + + def _get_offers_cached_key(self, requirements: Requirements) -> int: + # Requirements is not hashable, so we use a hack to get arguments hash + return hash(requirements.json()) + + @cachedmethod( + cache=lambda self: self._offers_cache, + key=_get_offers_cached_key, + lock=lambda self: self._offers_cache_lock, + ) + def _get_offers_cached( + self, requirements: Requirements + ) -> List[InstanceOfferWithAvailability]: + return self.get_offers_by_requirements(requirements) + + +class ComputeWithCreateInstanceSupport(ABC): + """ + Must be subclassed and implemented to support fleets (instance creation without running a job). + Typically, a compute that runs VMs would implement it, + and a compute that runs containers would not. + """ + + @abstractmethod def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: """ Launches a new instance. It should return `JobProvisioningData` ASAP. If required to wait to get the IP address or SSH port, return partially filled `JobProvisioningData` and implement `update_provisioning_data()`. 
""" - raise NotImplementedError() + pass - def update_provisioning_data( + def run_job( self, - provisioning_data: JobProvisioningData, + run: Run, + job: Job, + instance_offer: InstanceOfferWithAvailability, project_ssh_public_key: str, project_ssh_private_key: str, + volumes: List[Volume], + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + """ + The default `run_job()` implementation for all backends that support `create_instance()`. + Override only if custom `run_job()` behavior is required. + """ + instance_config = InstanceConfiguration( + project_name=run.project_name, + instance_name=get_job_instance_name(run, job), + user=run.user, + ssh_keys=[SSHKey(public=project_ssh_public_key.strip())], + volumes=volumes, + reservation=job.job_spec.requirements.reservation, + tags=run.run_spec.merged_profile.tags, + ) + instance_offer = instance_offer.copy() + self._restrict_instance_offer_az_to_volumes_az(instance_offer, volumes) + return self.create_instance( + instance_offer, instance_config, placement_group=placement_group + ) + + def _restrict_instance_offer_az_to_volumes_az( + self, + instance_offer: InstanceOfferWithAvailability, + volumes: List[Volume], ): + if len(volumes) == 0: + return + volume = volumes[0] + if ( + volume.provisioning_data is not None + and volume.provisioning_data.availability_zone is not None + ): + if instance_offer.availability_zones is None: + instance_offer.availability_zones = [volume.provisioning_data.availability_zone] + instance_offer.availability_zones = [ + z + for z in instance_offer.availability_zones + if z == volume.provisioning_data.availability_zone + ] + + +class ComputeWithGroupProvisioningSupport(ABC): + @abstractmethod + def run_jobs( + self, + run: Run, + job_configurations: List[JobConfiguration], + instance_offer: InstanceOfferWithAvailability, + project_ssh_public_key: str, + project_ssh_private_key: str, + placement_group: Optional[PlacementGroup], + ) -> ComputeGroupProvisioningData: + pass 
+ + @abstractmethod + def terminate_compute_group(self, compute_group: ComputeGroup): + pass + + +class ComputeWithPrivilegedSupport: + """ + Must be subclassed to support runs with `privileged: true`. + All VM-based Computes (that is, Computes that use the shim) should subclass this mixin. + """ + + pass + + +class ComputeWithInstanceVolumesSupport: + """ + Must be subclassed to support runs with `/host/path:/container/path` volumes. + All VM-based Computes (that is, Computes that use the shim) should subclass this mixin. + """ + + pass + + +class ComputeWithMultinodeSupport: + """ + Must be subclassed to support multinode tasks and cluster fleets. + Instances provisioned in the same project/region must be interconnected. + """ + + pass + + +class ComputeWithReservationSupport: + """ + Must be subclassed to support provisioning from reservations. + + The following is expected from a backend that supports reservations: + + - `get_offers` respects `Requirements.reservation` if set, and only returns + offers that can be provisioned in the configured reservation. It can + adjust some offer properties such as `availability` and + `availability_zones` if necessary. + - `create_instance` respects `InstanceConfig.reservation` if set, and + provisions the instance in the configured reservation. + """ + + pass + + +class ComputeWithPlacementGroupSupport(ABC): + """ + Must be subclassed and implemented to support placement groups. + """ + + @abstractmethod + def create_placement_group( + self, + placement_group: PlacementGroup, + master_instance_offer: InstanceOffer, + ) -> PlacementGroupProvisioningData: """ - This method is called if `JobProvisioningData` returned from `run_job()`/`create_instance()` - is not complete, e.g. missing `hostname` or `ssh_port`. - It can be used if getting complete provisioning data takes a long of time. - It should not wait but return immediately. 
- If it raises `ProvisioningError`, there will be no further attempts to update the provisioning data, - and the run will be terminated. + Creates a placement group. + + Args: + placement_group: details about the placement group to be created + master_instance_offer: the first instance dstack will attempt to add + to the placement group + """ + pass + + @abstractmethod + def delete_placement_group( + self, + placement_group: PlacementGroup, + ): + """ + Deletes a placement group. + If the group does not exist, it should not raise errors but return silently. + """ + pass + + @abstractmethod + def is_suitable_placement_group( + self, + placement_group: PlacementGroup, + instance_offer: InstanceOffer, + ) -> bool: + """ + Checks if the instance offer can be provisioned in the placement group. + + Should return immediately, without performing API calls. """ pass + def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool: + """ + Whether placement groups can be used for instances provisioned in reservations. + + Arguments: + backend_type: matches the backend type of this compute, unless this compute is a proxy + for other backends (dstack Sky) + """ + return True + + +class ComputeWithGatewaySupport(ABC): + """ + Must be subclassed and implemented to support gateways. + """ + + @abstractmethod def create_gateway( self, configuration: GatewayComputeConfiguration, @@ -102,8 +479,9 @@ def create_gateway( """ Creates a gateway instance. """ - raise NotImplementedError() + pass + @abstractmethod def terminate_gateway( self, instance_id: str, @@ -114,48 +492,247 @@ def terminate_gateway( Terminates a gateway instance. Generally, it passes the call to `terminate_instance()`, but may perform additional work such as deleting a load balancer when a gateway has one. """ - raise NotImplementedError() + pass + + +class ComputeWithPrivateGatewaySupport: + """ + Must be subclassed to support private gateways. 
+ `create_gateway()` must be able to create private gateways. + """ + pass + + +class ComputeWithVolumeSupport(ABC): + """ + Must be subclassed and implemented to support volumes. + """ + + @abstractmethod def register_volume(self, volume: Volume) -> VolumeProvisioningData: """ Returns VolumeProvisioningData for an existing volume. Used to add external volumes to dstack. """ - raise NotImplementedError() + pass + @abstractmethod def create_volume(self, volume: Volume) -> VolumeProvisioningData: """ Creates a new volume. """ raise NotImplementedError() + @abstractmethod def delete_volume(self, volume: Volume): """ Deletes a volume. """ raise NotImplementedError() - def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData: + def attach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData + ) -> VolumeAttachmentData: """ Attaches a volume to the instance. + If the volume is not found, it should raise `ComputeError()`. + Implement only if compute may return `VolumeProvisioningData.attachable`. + Otherwise, volumes should be attached by `run_job()`. """ raise NotImplementedError() - def detach_volume(self, volume: Volume, instance_id: str): + def detach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False + ): """ Detaches a volume from the instance. + Implement only if compute may return `VolumeProvisioningData.detachable`. + Otherwise, volumes should be detached on instance termination. """ raise NotImplementedError() + def is_volume_detached(self, volume: Volume, provisioning_data: JobProvisioningData) -> bool: + """ + Checks if a volume was detached from the instance. + If `detach_volume()` may fail to detach volume, + this method should be overridden to check the volume status. + The caller will trigger force detach if the volume gets stuck detaching. 
+ """ + return True + + +def get_dstack_working_dir(base_path: Optional[PathLike] = None) -> str: + if base_path is None: + base_path = "/root" + return str(Path(base_path, ".dstack")) + + +def get_dstack_shim_binary_path(bin_path: Optional[PathLike] = None) -> str: + if bin_path is None: + bin_path = "/usr/local/bin" + return str(Path(bin_path, DSTACK_SHIM_BINARY_NAME)) -def get_instance_name(run: Run, job: Job) -> str: - return f"{run.project_name.lower()}-{job.job_spec.job_name}" + +def get_dstack_runner_binary_path(bin_path: Optional[PathLike] = None) -> str: + if bin_path is None: + bin_path = "/usr/local/bin" + return str(Path(bin_path, DSTACK_RUNNER_BINARY_NAME)) + + +def get_job_instance_name(run: Run, job: Job) -> str: + return job.job_spec.job_name + + +_DEFAULT_MAX_RESOURCE_NAME_LEN = 60 +_CLOUD_RESOURCE_SUFFIX_LEN = 8 + + +def generate_unique_instance_name( + instance_configuration: InstanceConfiguration, + max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN, +) -> str: + """ + Generates a unique instance name valid across all backends. + """ + return generate_unique_backend_name( + resource_name=instance_configuration.instance_name, + project_name=instance_configuration.project_name, + max_length=max_length, + ) + + +def generate_unique_instance_name_for_job( + run: Run, + job: Job, + max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN, +) -> str: + """ + Generates a unique instance name for a job valid across all backends. + """ + return generate_unique_backend_name( + resource_name=get_job_instance_name(run, job), + project_name=run.project_name, + max_length=max_length, + ) + + +def generate_unique_gateway_instance_name( + gateway_compute_configuration: GatewayComputeConfiguration, + max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN, +) -> str: + """ + Generates a unique gateway instance name valid across all backends. 
+ """ + return generate_unique_backend_name( + resource_name=gateway_compute_configuration.instance_name, + project_name=gateway_compute_configuration.project_name, + max_length=max_length, + ) + + +def generate_unique_volume_name( + volume: Volume, + max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN, +) -> str: + """ + Generates a unique volume name valid across all backends. + """ + return generate_unique_backend_name( + resource_name=volume.name, + project_name=volume.project_name, + max_length=max_length, + ) + + +def generate_unique_placement_group_name( + project_name: str, + fleet_name: str, + max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN, +) -> str: + """ + Generates a unique placement group name valid across all backends. + """ + return generate_unique_backend_name( + resource_name=fleet_name, + project_name=project_name, + max_length=max_length, + ) + + +def generate_unique_backend_name( + resource_name: str, + project_name: Optional[str], + max_length: int, +) -> str: + """ + Generates a unique resource name valid across all backends. + Backend resource names must be unique on every provisioning so that + resource re-submission/re-creation doesn't lead to conflicts + on backends that require unique names (e.g. Azure, GCP). + """ + # resource_name is guaranteed to be valid in all backends + prefix = f"dstack-{resource_name}" + if project_name is not None and is_valid_dstack_resource_name(project_name): + # project_name is not guaranteed to be valid in all backends, + # so we add it only if it passes the validation + prefix = f"dstack-{project_name}-{resource_name}" + return generate_unique_name( + prefix=prefix, + max_length=max_length, + ) + + +def generate_unique_short_backend_name() -> str: + """ + Generates a unique 15-char resource name of the form "dstack-12345xyz". + Can be used for resources that have a very small length limit like AWS LBs. 
+ """ + return generate_unique_name(prefix="dstack") + + +def generate_unique_name( + *, + prefix: Optional[str] = None, + suffix_length: Optional[int] = None, + max_length: Optional[int] = None, +) -> str: + if suffix_length is None: + suffix_length = _CLOUD_RESOURCE_SUFFIX_LEN + if max_length is not None: + assert max_length >= suffix_length + if prefix is not None: + prefix_len = max_length - suffix_length - 1 + assert prefix_len > 0 + prefix = prefix[:prefix_len] + suffix = "".join( + random.choice(string.ascii_lowercase + string.digits) for _ in range(suffix_length) + ) + if prefix is None: + return suffix + return f"{prefix}-{suffix}" + + +def get_cloud_config(**config) -> str: + return "#cloud-config\n" + yaml.dump(config, default_flow_style=False) def get_user_data( - authorized_keys: List[str], backend_specific_commands: Optional[List[str]] = None + authorized_keys: List[str], + backend_specific_commands: Optional[List[str]] = None, + base_path: Optional[PathLike] = None, + bin_path: Optional[PathLike] = None, + backend_shim_env: Optional[Dict[str, str]] = None, + skip_firewall_setup: bool = False, + firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS, ) -> str: - shim_commands = get_shim_commands(authorized_keys) + shim_commands = get_shim_commands( + base_path=base_path, + bin_path=bin_path, + backend_shim_env=backend_shim_env, + skip_firewall_setup=skip_firewall_setup, + firewall_allow_from_subnets=firewall_allow_from_subnets, + ) commands = (backend_specific_commands or []) + shim_commands return get_cloud_config( runcmd=[["sh", "-c", " && ".join(commands)]], @@ -163,70 +740,224 @@ def get_user_data( ) -def get_shim_env(build: str, authorized_keys: List[str]) -> Dict[str, str]: - build = get_dstack_runner_version() +def get_shim_env( + base_path: Optional[PathLike] = None, + bin_path: Optional[PathLike] = None, + backend_shim_env: Optional[Dict[str, str]] = None, + arch: Optional[str] = None, +) -> Dict[str, str]: + log_level = "5" # 
Debug envs = { - "DSTACK_RUNNER_LOG_LEVEL": "6", - "DSTACK_RUNNER_VERSION": build, - "DSTACK_PUBLIC_SSH_KEY": "\n".join(authorized_keys), - "DSTACK_HOME": DSTACK_WORKING_DIR, + "DSTACK_SHIM_HOME": get_dstack_working_dir(base_path), + "DSTACK_SHIM_HTTP_PORT": str(DSTACK_SHIM_HTTP_PORT), + "DSTACK_SHIM_LOG_LEVEL": log_level, + "DSTACK_RUNNER_DOWNLOAD_URL": get_dstack_runner_download_url(arch), + "DSTACK_RUNNER_BINARY_PATH": get_dstack_runner_binary_path(bin_path), + "DSTACK_RUNNER_HTTP_PORT": str(DSTACK_RUNNER_HTTP_PORT), + "DSTACK_RUNNER_SSH_PORT": str(DSTACK_RUNNER_SSH_PORT), + "DSTACK_RUNNER_LOG_LEVEL": log_level, } + if backend_shim_env is not None: + envs |= backend_shim_env return envs def get_shim_commands( - authorized_keys: List[str], *, is_privileged: bool = False, pjrt_device: Optional[str] = None + *, + is_privileged: bool = False, + pjrt_device: Optional[str] = None, + base_path: Optional[PathLike] = None, + bin_path: Optional[PathLike] = None, + backend_shim_env: Optional[Dict[str, str]] = None, + arch: Optional[str] = None, + skip_firewall_setup: bool = False, + firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS, ) -> List[str]: - build = get_dstack_runner_version() - commands = get_shim_pre_start_commands( - build, + commands = get_setup_cloud_instance_commands( + skip_firewall_setup=skip_firewall_setup, + firewall_allow_from_subnets=firewall_allow_from_subnets, + ) + commands += get_shim_pre_start_commands( + base_path=base_path, + bin_path=bin_path, + arch=arch, + ) + shim_env = get_shim_env( + base_path=base_path, + bin_path=bin_path, + backend_shim_env=backend_shim_env, + arch=arch, ) - for k, v in get_shim_env(build, authorized_keys).items(): + for k, v in shim_env.items(): commands += [f'export "{k}={v}"'] - commands += get_run_shim_script(is_privileged, pjrt_device) + commands += get_run_shim_script( + is_privileged=is_privileged, + pjrt_device=pjrt_device, + bin_path=bin_path, + ) return commands -def 
get_dstack_runner_version() -> str: - if settings.DSTACK_VERSION is not None: - return settings.DSTACK_VERSION - version = os.environ.get("DSTACK_RUNNER_VERSION", None) - if version is None and settings.DSTACK_USE_LATEST_FROM_BRANCH: - version = get_latest_runner_build() - return version or "latest" +def get_dstack_runner_version() -> Optional[str]: + if version := settings.DSTACK_VERSION: + return version + if version := settings.DSTACK_RUNNER_VERSION: + return version + if version_url := settings.DSTACK_RUNNER_VERSION_URL: + return _fetch_version(version_url) + if settings.DSTACK_USE_LATEST_FROM_BRANCH: + return get_latest_runner_build() + return None -def get_cloud_config(**config) -> str: - return "#cloud-config\n" + yaml.dump(config, default_flow_style=False) +def get_dstack_shim_version() -> Optional[str]: + if version := settings.DSTACK_VERSION: + return version + if version := settings.DSTACK_SHIM_VERSION: + return version + if version := settings.DSTACK_RUNNER_VERSION: + logger.warning( + "DSTACK_SHIM_VERSION is not set, using DSTACK_RUNNER_VERSION." + " Future versions will not fall back to DSTACK_RUNNER_VERSION." + " Set DSTACK_SHIM_VERSION to supress this warning." + ) + return version + if version_url := settings.DSTACK_SHIM_VERSION_URL: + return _fetch_version(version_url) + if settings.DSTACK_USE_LATEST_FROM_BRANCH: + return get_latest_runner_build() + return None + + +def normalize_arch(arch: Optional[str] = None) -> GoArchType: + """ + Converts the given free-form architecture string to the Go GOARCH format. + Only 64-bit x86 and ARM are supported. If the word size is not specified (e.g., `x86`, `arm`), + 64-bit is implied. + If the arch is not specified, falls back to `amd64`. 
+ """ + if not arch: + return GoArchType.AMD64 + arch_lower = arch.lower() + if "32" in arch_lower or arch_lower in ["i386", "i686"]: + raise ValueError(f"32-bit architectures are not supported: {arch}") + if arch_lower.startswith("x86") or arch_lower.startswith("amd"): + return GoArchType.AMD64 + if arch_lower.startswith("arm") or arch_lower.startswith("aarch"): + return GoArchType.ARM64 + raise ValueError(f"Unsupported architecture: {arch}") -def get_shim_pre_start_commands(build: str) -> List[str]: - bucket = "dstack-runner-downloads-stgn" - if settings.DSTACK_VERSION is not None: - bucket = "dstack-runner-downloads" +def get_dstack_runner_download_url( + arch: Optional[str] = None, version: Optional[str] = None +) -> str: + url_template = settings.DSTACK_RUNNER_DOWNLOAD_URL + if not url_template: + if settings.DSTACK_VERSION is not None: + bucket = "dstack-runner-downloads" + else: + bucket = "dstack-runner-downloads-stgn" + url_template = ( + f"https://{bucket}.s3.eu-west-1.amazonaws.com" + "/{version}/binaries/dstack-runner-linux-{arch}" + ) + if version is None: + version = get_dstack_runner_version() or "latest" + return _format_download_url(url_template, version, arch) + - url = f"https://{bucket}.s3.eu-west-1.amazonaws.com/{build}/binaries/dstack-shim-linux-amd64" +def get_dstack_shim_download_url(arch: Optional[str] = None, version: Optional[str] = None) -> str: + url_template = settings.DSTACK_SHIM_DOWNLOAD_URL + if not url_template: + if settings.DSTACK_VERSION is not None: + bucket = "dstack-runner-downloads" + else: + bucket = "dstack-runner-downloads-stgn" + url_template = ( + f"https://{bucket}.s3.eu-west-1.amazonaws.com" + "/{version}/binaries/dstack-shim-linux-{arch}" + ) + if version is None: + version = get_dstack_shim_version() or "latest" + return _format_download_url(url_template, version, arch) - dstack_shim_binary_path = "/usr/local/bin/dstack-shim" +def get_setup_cloud_instance_commands( + skip_firewall_setup: bool, + 
firewall_allow_from_subnets: Iterable[str], +) -> list[str]: + commands = [ + # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + # Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have. + ( + "/bin/sh -c '" # wrap in /bin/sh to avoid interfering with other cloud init commands + " grep -q nvidia /etc/docker/daemon.json" + " && ! grep -q native.cgroupdriver /etc/docker/daemon.json" + " && jq '\\''.\"exec-opts\" = ((.\"exec-opts\" // []) + [\"native.cgroupdriver=cgroupfs\"])'\\'' /etc/docker/daemon.json > /tmp/daemon.json" + " && sudo mv /tmp/daemon.json /etc/docker/daemon.json" + " && sudo service docker restart" + " || true" + "'" + ), + ] + if not skip_firewall_setup: + commands += [ + "ufw --force reset", # Some OS images have default rules like `allow 80`. Delete them + "ufw default deny incoming", + "ufw default allow outgoing", + "ufw allow ssh", + ] + for subnet in firewall_allow_from_subnets: + commands.append(f"ufw allow from {subnet}") + commands += [ + "ufw --force enable", + ] + return commands + + +def get_shim_pre_start_commands( + base_path: Optional[PathLike] = None, + bin_path: Optional[PathLike] = None, + arch: Optional[str] = None, +) -> List[str]: + url = get_dstack_shim_download_url(arch) + dstack_shim_binary_path = get_dstack_shim_binary_path(bin_path) + dstack_working_dir = get_dstack_working_dir(base_path) return [ - f'sudo curl -s --compressed --connect-timeout 60 --max-time 240 --retry 1 --output {dstack_shim_binary_path} "{url}"', + f"dlpath=$(sudo mktemp -t {DSTACK_SHIM_BINARY_NAME}.XXXXXXXXXX)", + # -sS -- disable progress meter and warnings, but still show errors (unlike bare -s) + f'sudo curl -sS --compressed --connect-timeout 60 --max-time 240 --retry 1 --output "$dlpath" "{url}"', + f'sudo mv "$dlpath" {dstack_shim_binary_path}', f"sudo chmod +x {dstack_shim_binary_path}", - f"sudo mkdir {DSTACK_WORKING_DIR} -p", + f"{{ sudo chcon 
system_u:object_r:bin_t:s0 {dstack_shim_binary_path} 2>/dev/null || true; }}", + f"sudo mkdir {dstack_working_dir} -p", ] -def get_run_shim_script(is_privileged: bool, pjrt_device: Optional[str]) -> List[str]: - dev_flag = "" if settings.DSTACK_VERSION is not None else "--dev" +def get_run_shim_script( + is_privileged: bool, + pjrt_device: Optional[str], + bin_path: Optional[PathLike] = None, +) -> List[str]: + dstack_shim_binary_path = get_dstack_shim_binary_path(bin_path) privileged_flag = "--privileged" if is_privileged else "" pjrt_device_env = f"--pjrt-device={pjrt_device}" if pjrt_device else "" - + # TODO: Use a proper process supervisor? return [ - f"nohup dstack-shim {dev_flag} docker --keep-container {privileged_flag} {pjrt_device_env} >{DSTACK_WORKING_DIR}/shim.log 2>&1 &", + f""" + nohup sh -c ' + while true; do + {dstack_shim_binary_path} {privileged_flag} {pjrt_device_env} + sleep {DSTACK_SHIM_RESTART_INTERVAL_SECONDS} + done + ' & + """, ] -def get_gateway_user_data(authorized_key: str) -> str: +def get_gateway_user_data( + authorized_key: str, router: Optional[AnyGatewayRouterConfig] = None +) -> str: return get_cloud_config( package_update=True, packages=[ @@ -242,55 +973,57 @@ def get_gateway_user_data(authorized_key: str) -> str: "s/# server_names_hash_bucket_size 64;/server_names_hash_bucket_size 128;/", "/etc/nginx/nginx.conf", ], - ["su", "ubuntu", "-c", " && ".join(get_dstack_gateway_commands())], + ["su", "ubuntu", "-c", " && ".join(get_dstack_gateway_commands(router))], ], ssh_authorized_keys=[authorized_key], ) def get_docker_commands( - authorized_keys: List[str], fix_path_in_dot_profile: bool = True -) -> List[str]: - authorized_keys_content = "\n".join(authorized_keys).strip() + authorized_keys: list[str], + bin_path: Optional[PathLike] = None, +) -> list[str]: + dstack_runner_binary_path = get_dstack_runner_binary_path(bin_path) commands = [ - # note: &> redirection doesn't work in /bin/sh + "( :", + # See 
https://github.com/dstackai/dstack/issues/1769 + "unset LD_LIBRARY_PATH && unset LD_PRELOAD", + # common functions + 'exists() { command -v "$1" > /dev/null 2>&1; }', + # package manager detection/abstraction + "install_pkg() { NAME=Distribution; test -f /etc/os-release && . /etc/os-release; echo $NAME not supported; exit 11; }", + 'if exists apt-get; then install_pkg() { apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y "$1"; }; fi', + 'if exists yum; then install_pkg() { yum install -y "$1"; }; fi', + 'if exists apk; then install_pkg() { apk add -U "$1"; }; fi', # check in sshd is here, install if not - "if ! command -v sshd >/dev/null 2>&1; then apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server || yum install -y openssh-server; fi", + "if ! exists sshd; then install_pkg openssh-server; fi", # install curl if necessary - "if ! command -v curl >/dev/null 2>&1; then apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl || yum install -y curl; fi", - # prohibit password authentication - 'sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config', - # create ssh dirs and add public key - "mkdir -p /run/sshd ~/.ssh", - "chmod 700 ~/.ssh", - f"echo '{authorized_keys_content}' > ~/.ssh/authorized_keys", - "chmod 600 ~/.ssh/authorized_keys", - # preserve environment variables for SSH clients - "env >> ~/.ssh/environment", - "sed -ie '1s@^@export PATH=\"'\"$PATH\"':$PATH\"\\n\\n@' ~/.profile" - if fix_path_in_dot_profile - else ":", - # regenerate host keys - "rm -rf /etc/ssh/ssh_host_*", - "ssh-keygen -A > /dev/null", - # start sshd - "/usr/sbin/sshd -p 10022 -o PermitUserEnvironment=yes", + "if ! 
exists curl; then install_pkg curl; fi", + ": )", ] - runner = "/usr/local/bin/dstack-runner" - - build = get_dstack_runner_version() - bucket = "dstack-runner-downloads-stgn" - if settings.DSTACK_VERSION is not None: - bucket = "dstack-runner-downloads" - - url = f"https://{bucket}.s3.eu-west-1.amazonaws.com/{build}/binaries/dstack-runner-linux-amd64" + runner_command = [ + dstack_runner_binary_path, + "--log-level", + "6", + "start", + "--temp-dir", + "/tmp/runner", + "--http-port", + str(DSTACK_RUNNER_HTTP_PORT), + "--ssh-port", + str(DSTACK_RUNNER_SSH_PORT), + ] + for authorized_key in authorized_keys: + runner_command += ["--ssh-authorized-key", authorized_key] + url = get_dstack_runner_download_url() commands += [ - f"curl --connect-timeout 60 --max-time 240 --retry 1 --output {runner} {url}", - f"chmod +x {runner}", - f"{runner} --log-level 6 start --http-port 10999 --temp-dir /tmp/runner --home-dir /root --working-dir /workflow", + f"curl --connect-timeout 60 --max-time 240 --retry 1 --output {dstack_runner_binary_path} {url}", + f"chmod +x {dstack_runner_binary_path}", + shlex.join(runner_command), ] + return commands @@ -336,23 +1069,66 @@ def get_latest_runner_build() -> Optional[str]: return None -def get_dstack_gateway_wheel(build: str) -> str: +def get_dstack_gateway_wheel(build: str, router: Optional[AnyGatewayRouterConfig] = None) -> str: channel = "release" if settings.DSTACK_RELEASE else "stgn" base_url = f"https://dstack-gateway-downloads.s3.amazonaws.com/{channel}" if build == "latest": - r = requests.get(f"{base_url}/latest-version", timeout=5) - r.raise_for_status() - build = r.text.strip() + build = _fetch_version(f"{base_url}/latest-version") or "latest" logger.debug("Found the latest gateway build: %s", build) - return f"{base_url}/dstack_gateway-{build}-py3-none-any.whl" + wheel = f"{base_url}/dstack_gateway-{build}-py3-none-any.whl" + # Build package spec with extras if router is specified + if router: + return 
f"dstack-gateway[{router.type}] @ {wheel}" + return f"dstack-gateway @ {wheel}" -def get_dstack_gateway_commands() -> List[str]: - build = get_dstack_runner_version() +def get_dstack_gateway_commands(router: Optional[AnyGatewayRouterConfig] = None) -> List[str]: + build = get_dstack_runner_version() or "latest" + gateway_package = get_dstack_gateway_wheel(build, router) return [ "mkdir -p /home/ubuntu/dstack", "python3 -m venv /home/ubuntu/dstack/blue", "python3 -m venv /home/ubuntu/dstack/green", - f"/home/ubuntu/dstack/blue/bin/pip install {get_dstack_gateway_wheel(build)}", + f"/home/ubuntu/dstack/blue/bin/pip install '{gateway_package}'", "sudo /home/ubuntu/dstack/blue/bin/python -m dstack.gateway.systemd install --run", ] + + +def merge_tags( + base_tags: Dict[str, str], + backend_tags: Optional[Dict[str, str]] = None, + resource_tags: Optional[Dict[str, str]] = None, +) -> Dict[str, str]: + res = base_tags.copy() + # backend_tags have priority over resource_tags + # so that regular users do not override the tags set by admins + if backend_tags is not None: + for k, v in backend_tags.items(): + res.setdefault(k, v) + if resource_tags is not None: + for k, v in resource_tags.items(): + res.setdefault(k, v) + return res + + +def requires_nvidia_proprietary_kernel_modules(gpu_name: str) -> bool: + """ + Returns: + Whether this NVIDIA GPU requires NVIDIA proprietary kernel modules + instead of open kernel modules. 
+ """ + return gpu_name.lower() in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES + + +def _fetch_version(url: str) -> Optional[str]: + r = requests.get(url, timeout=5) + r.raise_for_status() + version = r.text.strip() + if not version: + logger.warning("Empty version response from URL: %s", url) + return None + return version + + +def _format_download_url(template: str, version: str, arch: Optional[str]) -> str: + return template.format(version=version, arch=normalize_arch(arch).value) diff --git a/src/dstack/_internal/core/backends/base/config.py b/src/dstack/_internal/core/backends/base/config.py deleted file mode 100644 index fd66ee5b21..0000000000 --- a/src/dstack/_internal/core/backends/base/config.py +++ /dev/null @@ -1,5 +0,0 @@ -from pydantic import BaseModel - - -class BackendConfig(BaseModel): - pass diff --git a/src/dstack/_internal/core/backends/base/configurator.py b/src/dstack/_internal/core/backends/base/configurator.py new file mode 100644 index 0000000000..5c9fd3d898 --- /dev/null +++ b/src/dstack/_internal/core/backends/base/configurator.py @@ -0,0 +1,119 @@ +from abc import ABC, abstractmethod +from typing import Any, ClassVar, Generic, List, NoReturn, Optional, TypeVar +from uuid import UUID + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.models import ( + AnyBackendConfigWithCreds, + AnyBackendConfigWithoutCreds, +) +from dstack._internal.core.errors import BackendInvalidCredentialsError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel + +# Most clouds allow ~ 40-60 tags/labels per resource. +# We'll introduce our own base limit that can be customized per backend if required. 
+TAGS_MAX_NUM = 25 + +BackendConfigWithoutCredsT = TypeVar( + "BackendConfigWithoutCredsT", bound=AnyBackendConfigWithoutCreds +) +BackendConfigWithCredsT = TypeVar("BackendConfigWithCredsT", bound=AnyBackendConfigWithCreds) + + +class BackendRecord(CoreModel): + """ + This model includes backend parameters to store in the DB. + """ + + config: str + """`config` stores text-encoded non-sensitive backend config parameters (e.g. json) + """ + auth: str + """`auth` stores text-encoded sensitive backend config parameters (e.g. json). + `Configurator` should not encrypt/decrypt it. This is done by the caller. + """ + + +class StoredBackendRecord(BackendRecord): + """ + This model includes backend parameters stored in the DB. + """ + + # IDs of DB models. + # Can be used by externally-registered Configurator to work with the DB directly. + project_id: UUID + backend_id: UUID + + +class Configurator(ABC, Generic[BackendConfigWithoutCredsT, BackendConfigWithCredsT]): + """ + `Configurator` is responsible for configuring backends + and initializing `Backend` instances from backend configs. + Every backend must implement `Configurator` and register it + in `dstack._internal.core.backends.configurators`. + """ + + TYPE: ClassVar[BackendType] + BACKEND_CLASS: ClassVar[type[Backend]] + """`BACKEND_CLASS` is used to introspect backend features without initializing it.""" + + @abstractmethod + def validate_config(self, config: BackendConfigWithCredsT, default_creds_enabled: bool): + """ + Validates backend config including backend creds and other parameters. + Raises `ServerClientError` or its subclass if config is invalid. + If the backend supports default creds and not `default_creds_enabled`, should raise an error. + """ + pass + + @abstractmethod + def create_backend(self, project_name: str, config: BackendConfigWithCredsT) -> BackendRecord: + """ + Sets up backend given backend config and returns + text-encoded config and creds to be stored in the DB. 
+ It may perform backend initialization, create + cloud resources such as networks and managed identities, and + save additional configuration parameters. + It does not need to duplicate validation done by `validate_config()` + since the caller guarantees to call `validate_config()` first. + It may perform additional validation not possible in `validate_config()` + and raise `ServerClientError` or its subclass if config is invalid. + """ + pass + + @abstractmethod + def get_backend_config_with_creds( + self, record: StoredBackendRecord + ) -> BackendConfigWithCredsT: + """ + Constructs `BackendConfig` with credentials included. + Used internally and when project admins need to see backend's creds. + """ + pass + + @abstractmethod + def get_backend_config_without_creds( + self, record: StoredBackendRecord + ) -> BackendConfigWithoutCredsT: + """ + Constructs `BackendConfig` without sensitive information. + Used for API responses where creds should not be exposed. + """ + pass + + @abstractmethod + def get_backend(self, record: StoredBackendRecord) -> Backend: + """ + Returns `Backend` instance from config and creds stored in `record`. 
+ """ + pass + + +def raise_invalid_credentials_error( + fields: Optional[List[List[str]]] = None, details: Optional[Any] = None +) -> NoReturn: + msg = BackendInvalidCredentialsError.msg + if details: + msg += f": {details}" + raise BackendInvalidCredentialsError(fields=fields, msg=msg) diff --git a/src/dstack/_internal/core/backends/base/models.py b/src/dstack/_internal/core/backends/base/models.py new file mode 100644 index 0000000000..b65024c1bb --- /dev/null +++ b/src/dstack/_internal/core/backends/base/models.py @@ -0,0 +1,24 @@ +from pathlib import Path +from typing import List + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.runs import Job +from dstack._internal.core.models.volumes import Volume + + +class JobConfiguration(CoreModel): + job: Job + volumes: List[Volume] + + +def fill_data(values: dict, filename_field: str = "filename", data_field: str = "data") -> dict: + if values.get(data_field) is not None: + return values + if (filename := values.get(filename_field)) is None: + raise ValueError(f"Either `{filename_field}` or `{data_field}` must be specified") + try: + with open(Path(filename).expanduser()) as f: + values[data_field] = f.read() + except OSError: + raise ValueError(f"No such file {filename}") + return values diff --git a/src/dstack/_internal/core/backends/base/offers.py b/src/dstack/_internal/core/backends/base/offers.py index 1dbb33d796..a7e0239c82 100644 --- a/src/dstack/_internal/core/backends/base/offers.py +++ b/src/dstack/_internal/core/backends/base/offers.py @@ -1,29 +1,58 @@ +from collections.abc import Iterable, Iterator from dataclasses import asdict -from typing import Callable, List, Optional +from typing import Callable, List, Optional, TypeVar import gpuhunt +from pydantic import parse_obj_as from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.instances import ( Disk, Gpu, InstanceOffer, + InstanceOfferWithAvailability, 
InstanceType, Resources, ) +from dstack._internal.core.models.resources import DEFAULT_DISK, CPUSpec, Memory, Range from dstack._internal.core.models.runs import Requirements +from dstack._internal.utils.common import get_or_error + +# Offers not supported by all dstack versions are hidden behind one or more flags. +# This list enables the flags that are currently supported. +SUPPORTED_GPUHUNT_FLAGS = [ + "oci-spot", + "lambda-arm", + "gcp-a4", + "gcp-g4", + "gcp-dws-calendar-mode", + "runpod-cpu", + "runpod-cluster", +] def get_catalog_offers( backend: BackendType, locations: Optional[List[str]] = None, requirements: Optional[Requirements] = None, + configurable_disk_size: Range[Memory] = Range[Memory](min=Memory.parse("1GB"), max=None), extra_filter: Optional[Callable[[InstanceOffer], bool]] = None, catalog: Optional[gpuhunt.Catalog] = None, + catalog_item_filter: Optional[Callable[[gpuhunt.CatalogItem], bool]] = None, ) -> List[InstanceOffer]: + """ + Args: + catalog_item_filter: applied to raw catalog items before the conversion to + `InstanceOffer` models. Use it for filtering that can be done on raw catalog fields + to avoid expensive model construction for items that will be discarded. 
+ """ provider = backend.value + if backend == BackendType.DATACRUNCH: + provider = BackendType.VERDA.value # Backward compatibility if backend == BackendType.LAMBDA: provider = "lambdalabs" + if backend == BackendType.AMDDEVCLOUD: + provider = "digitalocean" q = requirements_to_query_filter(requirements) q.provider = [provider] offers = [] @@ -32,7 +61,11 @@ def get_catalog_offers( for item in catalog.query(**asdict(q)): if locations is not None and item.location not in locations: continue - offer = catalog_item_to_offer(backend, item, requirements) + if catalog_item_filter is not None and not catalog_item_filter(item): + continue + offer = catalog_item_to_offer(backend, item, requirements, configurable_disk_size) + if offer is None: + continue if extra_filter is not None and not extra_filter(offer): continue offers.append(offer) @@ -40,43 +73,59 @@ def get_catalog_offers( def catalog_item_to_offer( - backend: BackendType, item: gpuhunt.CatalogItem, requirements: Optional[Requirements] -) -> InstanceOffer: + backend: BackendType, + item: gpuhunt.CatalogItem, + requirements: Optional[Requirements], + configurable_disk_size: Range[Memory], +) -> Optional[InstanceOffer]: + # Gpu() keeps validation for vendor normalization. + # The rest use construct() to skip redundant validation — data comes from already validated CatalogItem. 
gpus = [] if item.gpu_count > 0: - gpus = [Gpu(name=item.gpu_name, memory_mib=round(item.gpu_memory * 1024))] * item.gpu_count - disk_size_mib = round( - item.disk_size * 1024 - if item.disk_size - else requirements.resources.disk.size.min * 1024 + gpu = Gpu( + vendor=item.gpu_vendor, name=item.gpu_name, memory_mib=round(item.gpu_memory * 1024) + ) + gpus = [gpu] * item.gpu_count + disk_size_mib = choose_disk_size_mib( + catalog_item_disk_size_gib=item.disk_size, + requirements_disk_size=requirements.resources.disk.size if requirements and requirements.resources.disk - else 102400 # TODO: Make requirements' fields required + else None, + configurable_disk_size=configurable_disk_size, ) - resources = Resources( + if disk_size_mib is None: + return None + resources = Resources.construct( + cpu_arch=item.cpu_arch, cpus=item.cpu, memory_mib=round(item.memory * 1024), gpus=gpus, spot=item.spot, - disk=Disk(size_mib=disk_size_mib), + disk=Disk.construct(size_mib=disk_size_mib), ) - resources.description = resources.pretty_format() - return InstanceOffer( + return InstanceOffer.construct( backend=backend, - instance=InstanceType( + instance=InstanceType.construct( name=item.instance_name, resources=resources, ), region=item.location, price=item.price, + backend_data=item.provider_data, ) def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem: + cpu_arch = offer.instance.resources.cpu_arch + if cpu_arch is None: + cpu_arch = gpuhunt.CPUArchitecture.X86 gpu_count = len(offer.instance.resources.gpus) + gpu_vendor = None gpu_name = None gpu_memory = None if gpu_count > 0: gpu = offer.instance.resources.gpus[0] + gpu_vendor = gpu.vendor gpu_name = gpu.name gpu_memory = gpu.memory_mib / 1024 return gpuhunt.CatalogItem( @@ -84,18 +133,20 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem: instance_name=offer.instance.name, location=offer.region, price=offer.price, + cpu_arch=cpu_arch, cpu=offer.instance.resources.cpus, 
memory=offer.instance.resources.memory_mib / 1024, gpu_count=gpu_count, + gpu_vendor=gpu_vendor, gpu_name=gpu_name, gpu_memory=gpu_memory, spot=offer.instance.resources.spot, - disk_size=offer.instance.resources.disk.size_mib, + disk_size=offer.instance.resources.disk.size_mib / 1024, ) def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFilter: - q = gpuhunt.QueryFilter() + q = gpuhunt.QueryFilter(allowed_flags=SUPPORTED_GPUHUNT_FLAGS) if req is None: return q @@ -104,8 +155,11 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi res = req.resources if res.cpu: - q.min_cpu = res.cpu.min - q.max_cpu = res.cpu.max + # TODO: Remove in 0.20. Use res.cpu directly + cpu = parse_obj_as(CPUSpec, res.cpu) + q.cpu_arch = cpu.arch + q.min_cpu = cpu.count.min + q.max_cpu = cpu.count.max if res.memory: q.min_memory = res.memory.min q.max_memory = res.memory.max @@ -114,6 +168,7 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi q.max_disk_size = res.disk.size.max if res.gpu: + q.gpu_vendor = res.gpu.vendor q.gpu_name = res.gpu.name if res.gpu.memory: q.min_gpu_memory = res.gpu.memory.min @@ -130,13 +185,59 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi return q -def match_requirements( - offers: List[InstanceOffer], requirements: Optional[Requirements] -) -> List[InstanceOffer]: +InstanceOfferT = TypeVar("InstanceOfferT", InstanceOffer, InstanceOfferWithAvailability) + + +def filter_offers_by_requirements( + offers: Iterable[InstanceOfferT], + requirements: Optional[Requirements], +) -> Iterator[InstanceOfferT]: query_filter = requirements_to_query_filter(requirements) - filtered_offers = [] for offer in offers: catalog_item = offer_to_catalog_item(offer) if gpuhunt.matches(catalog_item, q=query_filter): - filtered_offers.append(offer) - return filtered_offers + yield offer + + +def choose_disk_size_mib( + catalog_item_disk_size_gib: Optional[float], + 
requirements_disk_size: Optional[Range[Memory]], + configurable_disk_size: Range[Memory], +) -> Optional[int]: + if catalog_item_disk_size_gib: + disk_size_gib = catalog_item_disk_size_gib + else: + disk_size_range = requirements_disk_size or DEFAULT_DISK.size + disk_size_range = disk_size_range.intersect(configurable_disk_size) + if disk_size_range is None: + return None + disk_size_gib = disk_size_range.min + + return round(disk_size_gib * 1024) + + +OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]] + + +def get_offers_disk_modifier( + configurable_disk_size: Range[Memory], requirements: Requirements +) -> OfferModifier: + """ + Returns a function that sets an offer's disk size to the minimum value that satisfies both + `configurable_disk_size` and `requirements`. + """ + + def modifier(offer: InstanceOfferWithAvailability) -> Optional[InstanceOfferWithAvailability]: + requirements_disk_range = DEFAULT_DISK.size + if requirements.resources.disk is not None: + requirements_disk_range = requirements.resources.disk.size + disk_size_range = requirements_disk_range.intersect(configurable_disk_size) + if disk_size_range is None: + return None + offer_copy = offer.copy(deep=True) + offer_copy.instance.resources.disk = Disk( + size_mib=get_or_error(disk_size_range.min) * 1024 + ) + return offer_copy + + return modifier diff --git a/src/dstack/_internal/core/backends/base/profile_options.py b/src/dstack/_internal/core/backends/base/profile_options.py new file mode 100644 index 0000000000..04fd496c91 --- /dev/null +++ b/src/dstack/_internal/core/backends/base/profile_options.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod +from typing import Generic, Optional, Sequence, Type, TypeVar + +from dstack._internal.core.models.common import CoreModel + +T = TypeVar("T", bound="BackendProfileOptions") + + +class BackendProfileOptions(CoreModel, ABC, Generic[T]): + @abstractmethod + def combine(self, other: T) -> T: ...
+ + +_OptionsT = TypeVar("_OptionsT", bound="BackendProfileOptions") + + +def get_backend_profile_options( + options: Optional[Sequence[BackendProfileOptions]], + options_type: Type[_OptionsT], +) -> Optional[_OptionsT]: + if not options: + return None + return next((opt for opt in options if isinstance(opt, options_type)), None) diff --git a/src/dstack/_internal/core/backends/cloudrift/__init__.py b/src/dstack/_internal/core/backends/cloudrift/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/cloudrift/api_client.py b/src/dstack/_internal/core/backends/cloudrift/api_client.py new file mode 100644 index 0000000000..d3bb425e9b --- /dev/null +++ b/src/dstack/_internal/core/backends/cloudrift/api_client.py @@ -0,0 +1,229 @@ +import os +import re +from typing import Any, Dict, List, Mapping, Optional, Union + +import requests +from packaging import version +from requests import Response + +from dstack._internal.core.errors import BackendError, BackendInvalidCredentialsError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +CLOUDRIFT_SERVER_ADDRESS = "https://api.cloudrift.ai" +CLOUDRIFT_API_VERSION = "2025-05-29" + + +class RiftClient: + def __init__(self, api_key: Optional[str] = None): + self.public_api_root = os.path.join(CLOUDRIFT_SERVER_ADDRESS, "api/v1") + self.api_key = api_key + + def validate_api_key(self) -> bool: + """ + Validates the API key by making a request to the server. + Returns True if the API key is valid, False otherwise. 
+ """ + try: + response = self._make_request("auth/me") + if isinstance(response, dict): + return "email" in response + return False + except BackendInvalidCredentialsError: + return False + except Exception as e: + logger.error(f"Error validating API key: {e}") + return False + + def get_instance_types(self) -> List[Dict]: + request_data = {"selector": {"ByServiceAndLocation": {"services": ["vm"]}}} + response_data = self._make_request("instance-types/list", request_data) + if isinstance(response_data, dict): + return response_data.get("instance_types", []) + return [] + + def list_recipes(self) -> List[Dict]: + request_data = {} + response_data = self._make_request("recipes/list", request_data) + if isinstance(response_data, dict): + return response_data.get("groups", []) + return [] + + def get_vm_recipies(self) -> List[Dict]: + """ + Retrieves a list of VM recipes from the CloudRift API. + Returns a list of dictionaries containing recipe information. + """ + recipe_group = self.list_recipes() + vm_recipes = [] + for group in recipe_group: + tags = group.get("tags", []) + has_vm = "vm" in map(str.lower, tags) + if group.get("name", "").lower() != "linux" or not has_vm: + continue + + recipes = group.get("recipes", []) + for recipe in recipes: + details = recipe.get("details", {}) + if details.get("VirtualMachine", False): + vm_recipes.append(recipe) + + return vm_recipes + + def get_vm_image_url(self, gpu_vendor: Optional[str] = None) -> Optional[str]: + recipes = self.get_vm_recipies() + if gpu_vendor == "amd": + driver_tag = "amd-driver" + else: + driver_tag = "nvidia-driver" + + ubuntu_images = [] + for recipe in recipes: + if driver_tag not in recipe.get("tags", []): + continue + + recipe_name = recipe.get("name", "") + if "Ubuntu" not in recipe_name: + continue + + url = recipe["details"].get("VirtualMachine", {}).get("image_url", None) + version_match = re.search(r".* (\d+\.\d+)", recipe_name) + if url and version_match and version_match.group(1): + 
ubuntu_version = version.parse(version_match.group(1)) + ubuntu_images.append((ubuntu_version, url)) + + ubuntu_images.sort(key=lambda x: x[0]) # Sort by version + if ubuntu_images: + return ubuntu_images[-1][1] + + return None + + def deploy_instance( + self, + instance_type: str, + region: str, + ssh_keys: List[str], + cmd: str, + gpu_vendor: Optional[str] = None, + ) -> List[str]: + image_url = self.get_vm_image_url(gpu_vendor=gpu_vendor) + if not image_url: + raise BackendError("No suitable VM image found.") + + request_data = { + "config": { + "VirtualMachine": { + "cloudinit_commands": cmd, + "image_url": image_url, + "ssh_key": {"PublicKeys": ssh_keys}, + } + }, + "selector": { + "ByInstanceTypeAndLocation": { + "datacenters": [region], + "instance_type": instance_type, + } + }, + "with_public_ip": True, + } + logger.debug("Deploying instance with request data: %s", request_data) + + response_data = self._make_request("instances/rent", request_data) + if isinstance(response_data, dict): + return response_data.get("instance_ids", []) + return [] + + def list_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]: + request_data = { + "selector": { + "ByStatus": ["Initializing", "Active", "Deactivating"], + } + } + logger.debug("Listing instances with request data: %s", request_data) + response_data = self._make_request("instances/list", request_data) + if isinstance(response_data, dict): + return response_data.get("instances", []) + + return [] + + def get_instance_by_id(self, instance_id: str) -> Optional[Dict]: + request_data = {"selector": {"ById": [instance_id]}} + logger.debug("Getting instance with request data: %s", request_data) + response_data = self._make_request("instances/list", request_data) + if isinstance(response_data, dict): + instances = response_data.get("instances", []) + if isinstance(instances, list) and len(instances) > 0: + return instances[0] + + return None + + def terminate_instance(self, instance_id: str) -> bool: 
+ request_data = {"selector": {"ById": [instance_id]}} + logger.debug("Terminating instance with request data: %s", request_data) + response_data = self._make_request("instances/terminate", request_data) + if isinstance(response_data, dict): + logger.debug("Terminating instance with response: %s", response_data) + info = response_data.get("terminated", []) + is_terminated = len(info) > 0 + if not is_terminated: + # check if the instance is already terminated + instance_info = self.get_instance_by_id(instance_id) + is_terminated = instance_info is None or instance_info.get("status") == "Inactive" + logger.debug( + "Instance %s is already terminated: %s response: %s", + instance_id, + is_terminated, + instance_info, + ) + return is_terminated + + return False + + def _make_request( + self, + endpoint: str, + data: Optional[Mapping[str, Any]] = None, + method: str = "POST", + **kwargs, + ) -> Union[Mapping[str, Any], str, Response]: + headers = {} + if self.api_key is not None: + headers["X-API-Key"] = self.api_key + + version = CLOUDRIFT_API_VERSION + full_url = f"{self.public_api_root}/{endpoint}" + + try: + response = requests.request( + method, + full_url, + headers=headers, + json={"version": version, "data": data}, + timeout=15, + **kwargs, + ) + + if not response.ok: + response.raise_for_status() + try: + response_json = response.json() + if isinstance(response_json, str): + return response_json + if version is not None and version < response_json["version"]: + logger.warning( + "The API version %s is lower than the server version %s. 
", + version, + response_json["version"], + ) + return response_json["data"] + except requests.exceptions.JSONDecodeError: + return response + except requests.HTTPError as e: + if e.response is not None and e.response.status_code in ( + requests.codes.forbidden, + requests.codes.unauthorized, + ): + raise BackendInvalidCredentialsError(e.response.text) + raise diff --git a/src/dstack/_internal/core/backends/cloudrift/backend.py b/src/dstack/_internal/core/backends/cloudrift/backend.py new file mode 100644 index 0000000000..cca0c620c6 --- /dev/null +++ b/src/dstack/_internal/core/backends/cloudrift/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.cloudrift.compute import CloudRiftCompute +from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig +from dstack._internal.core.models.backends.base import BackendType + + +class CloudRiftBackend(Backend): + TYPE = BackendType.CLOUDRIFT + COMPUTE_CLASS = CloudRiftCompute + + def __init__(self, config: CloudRiftConfig): + self.config = config + self._compute = CloudRiftCompute(self.config) + + def compute(self) -> CloudRiftCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/cloudrift/compute.py b/src/dstack/_internal/core/backends/cloudrift/compute.py new file mode 100644 index 0000000000..4316d47ec5 --- /dev/null +++ b/src/dstack/_internal/core/backends/cloudrift/compute.py @@ -0,0 +1,152 @@ +from typing import Dict, List, Optional + +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.cloudrift.api_client import RiftClient +from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig +from 
dstack._internal.core.errors import ComputeError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class CloudRiftCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): + def __init__(self, config: CloudRiftConfig): + super().__init__() + self.config = config + self.client = RiftClient(self.config.creds.api_key) + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.CLOUDRIFT, + locations=self.config.regions or None, + ) + offers_with_availabilities = self._get_offers_with_availability(offers) + return offers_with_availabilities + + def _get_offers_with_availability( + self, offers: List[InstanceOffer] + ) -> List[InstanceOfferWithAvailability]: + instance_types_with_availabilities: List[Dict] = self.client.get_instance_types() + + region_availabilities = {} + for instance_type in instance_types_with_availabilities: + for variant in instance_type["variants"]: + for dc, count in variant["available_nodes_per_dc"].items(): + if count > 0: + key = (variant["name"], dc) + region_availabilities[key] = InstanceAvailability.AVAILABLE + + availability_offers = [] + for offer in offers: + key = (offer.instance.name, offer.region) + availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE) + availability_offers.append(offer.with_availability(availability=availability)) + + return availability_offers + + def create_instance( + self, + instance_offer: 
InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + # TODO: Remove once CloudRift fixes their VM RTC clock. + # Wrong RTC + NTP backward jump breaks Docker container lifecycle. + ntp_sync_commands = [ + ( + "timeout 60 bash -c '" + "while ! timedatectl show -p NTPSynchronized --value | grep -q yes;" + " do sleep 1; done' || true" + ), + ] + commands = ntp_sync_commands + get_shim_commands() + startup_script = " ".join([" && ".join(commands)]) + logger.debug( + f"Creating instance for offer {instance_offer.instance.name} in region {instance_offer.region} with commands: {startup_script}" + ) + + gpu_vendor = None + if instance_offer.instance.resources.gpus: + gpu_vendor = instance_offer.instance.resources.gpus[0].vendor.value + + instance_ids = self.client.deploy_instance( + instance_type=instance_offer.instance.name, + region=instance_offer.region, + ssh_keys=instance_config.get_public_keys(), + cmd=startup_script, + gpu_vendor=gpu_vendor, + ) + + if len(instance_ids) == 0: + raise ComputeError( + f"Failed to create instance for offer {instance_offer.instance.name} in region {instance_offer.region}." 
+ ) + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=instance_ids[0], + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="riftuser", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=None, + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + instance_info = self.client.get_instance_by_id(provisioning_data.instance_id) + + if not instance_info: + return + + instance_mode = instance_info.get("node_mode", "") + + if not instance_mode or instance_mode != "VirtualMachine": + return + + vms = instance_info.get("virtual_machines", []) + if len(vms) == 0: + return + + vm_ready = vms[0].get("ready", False) + if vm_ready: + provisioning_data.hostname = instance_info.get("host_address", None) + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + terminated = self.client.terminate_instance(instance_id=instance_id) + if not terminated: + raise ComputeError(f"Failed to terminate instance {instance_id} in region {region}.") diff --git a/src/dstack/_internal/core/backends/cloudrift/configurator.py b/src/dstack/_internal/core/backends/cloudrift/configurator.py new file mode 100644 index 0000000000..b6097d1654 --- /dev/null +++ b/src/dstack/_internal/core/backends/cloudrift/configurator.py @@ -0,0 +1,72 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.cloudrift.api_client import RiftClient +from dstack._internal.core.backends.cloudrift.backend import CloudRiftBackend +from dstack._internal.core.backends.cloudrift.models import ( + AnyCloudRiftCreds, + CloudRiftBackendConfig, + CloudRiftBackendConfigWithCreds, + CloudRiftConfig, + 
CloudRiftCreds, + CloudRiftStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class CloudRiftConfigurator( + Configurator[ + CloudRiftBackendConfig, + CloudRiftBackendConfigWithCreds, + ] +): + TYPE = BackendType.CLOUDRIFT + BACKEND_CLASS = CloudRiftBackend + + def validate_config( + self, config: CloudRiftBackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_creds(config.creds) + + def create_backend( + self, project_name: str, config: CloudRiftBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=CloudRiftStoredConfig( + **CloudRiftBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=CloudRiftCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds( + self, record: BackendRecord + ) -> CloudRiftBackendConfigWithCreds: + config = self._get_config(record) + return CloudRiftBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> CloudRiftBackendConfig: + config = self._get_config(record) + return CloudRiftBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> CloudRiftBackend: + config = self._get_config(record) + return CloudRiftBackend(config=config) + + def _get_config(self, record: BackendRecord) -> CloudRiftConfig: + return CloudRiftConfig.__response__( + **json.loads(record.config), + creds=CloudRiftCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyCloudRiftCreds): + if not isinstance(creds, CloudRiftCreds): + raise_invalid_credentials_error(fields=[["creds"]]) + client = RiftClient(creds.api_key) + if not client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/cloudrift/models.py b/src/dstack/_internal/core/backends/cloudrift/models.py new file mode 100644 index 0000000000..62a6726f9a --- /dev/null +++ 
b/src/dstack/_internal/core/backends/cloudrift/models.py @@ -0,0 +1,40 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class CloudRiftAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyCloudRiftCreds = CloudRiftAPIKeyCreds +CloudRiftCreds = AnyCloudRiftCreds + + +class CloudRiftBackendConfig(CoreModel): + type: Annotated[ + Literal["cloudrift"], + Field(description="The type of backend"), + ] = "cloudrift" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of CloudRift regions. Omit to use all regions"), + ] = None + + +class CloudRiftBackendConfigWithCreds(CloudRiftBackendConfig): + creds: Annotated[AnyCloudRiftCreds, Field(description="The credentials")] + + +AnyCloudRiftBackendConfig = Union[CloudRiftBackendConfig, CloudRiftBackendConfigWithCreds] + + +class CloudRiftStoredConfig(CloudRiftBackendConfig): + pass + + +class CloudRiftConfig(CloudRiftStoredConfig): + creds: AnyCloudRiftCreds diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py new file mode 100644 index 0000000000..cdeac7f608 --- /dev/null +++ b/src/dstack/_internal/core/backends/configurators.py @@ -0,0 +1,199 @@ +from typing import List, Optional, Type, Union + +from dstack._internal.core.backends.base.configurator import Configurator +from dstack._internal.core.models.backends.base import BackendType + +_CONFIGURATOR_CLASSES: List[Type[Configurator]] = [] + +try: + from dstack._internal.core.backends.amddevcloud.configurator import AMDDevCloudConfigurator + + _CONFIGURATOR_CLASSES.append(AMDDevCloudConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.aws.configurator import AWSConfigurator + + 
_CONFIGURATOR_CLASSES.append(AWSConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.azure.configurator import AzureConfigurator + + _CONFIGURATOR_CLASSES.append(AzureConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.cloudrift.configurator import ( + CloudRiftConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(CloudRiftConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.crusoe.configurator import ( + CrusoeConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(CrusoeConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.cudo.configurator import ( + CudoConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(CudoConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.datacrunch.configurator import ( + DataCrunchConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(DataCrunchConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.digitalocean.configurator import ( + DigitalOceanConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(DigitalOceanConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.gcp.configurator import GCPConfigurator + + _CONFIGURATOR_CLASSES.append(GCPConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.hotaisle.configurator import ( + HotAisleConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(HotAisleConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.jarvislabs.configurator import ( + JarvisLabsConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(JarvisLabsConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.kubernetes.configurator import ( + KubernetesConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(KubernetesConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.lambdalabs.configurator 
import ( + LambdaConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(LambdaConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.nebius.configurator import ( + NebiusConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(NebiusConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.oci.configurator import OCIConfigurator + + _CONFIGURATOR_CLASSES.append(OCIConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.runpod.configurator import RunpodConfigurator + + _CONFIGURATOR_CLASSES.append(RunpodConfigurator) +except ImportError: + pass + + +try: + from dstack._internal.core.backends.vastai.configurator import VastAIConfigurator + + _CONFIGURATOR_CLASSES.append(VastAIConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(VerdaConfigurator) +except ImportError: + pass + +try: + from dstack._internal.core.backends.vultr.configurator import VultrConfigurator + + _CONFIGURATOR_CLASSES.append(VultrConfigurator) +except ImportError: + pass + + +_BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP = {c.TYPE: c for c in _CONFIGURATOR_CLASSES} +_BACKEND_TYPES = [c.TYPE for c in _CONFIGURATOR_CLASSES] + + +def get_configurator(backend_type: Union[BackendType, str]) -> Optional[Configurator]: + """ + Returns an available `Configurator` for a given `backend_type`. + """ + backend_type = BackendType(backend_type) + configurator_class = _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP.get(backend_type) + if configurator_class is None: + return None + return configurator_class() + + +def list_available_backend_types() -> List[BackendType]: + """ + Lists all backend types available on the server. + """ + return _BACKEND_TYPES + + +def list_available_configurator_classes() -> List[type[Configurator]]: + """ + Lists all backend configurator classes available on the server. 
+ """ + return _CONFIGURATOR_CLASSES + + +def register_configurator(configurator: Type[Configurator]): + """ + A hook to for registering new configurators without importing them. + Can be used to extend dstack functionality. + """ + _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP[configurator.TYPE] = configurator diff --git a/src/dstack/_internal/core/backends/crusoe/backend.py b/src/dstack/_internal/core/backends/crusoe/backend.py new file mode 100644 index 0000000000..9f81f136d1 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.crusoe.compute import CrusoeCompute +from dstack._internal.core.backends.crusoe.models import CrusoeConfig +from dstack._internal.core.models.backends.base import BackendType + + +class CrusoeBackend(Backend): + TYPE = BackendType.CRUSOE + COMPUTE_CLASS = CrusoeCompute + + def __init__(self, config: CrusoeConfig): + self.config = config + self._compute = CrusoeCompute(self.config) + + def compute(self) -> CrusoeCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/crusoe/compute.py b/src/dstack/_internal/core/backends/crusoe/compute.py new file mode 100644 index 0000000000..fe1411fe7c --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/compute.py @@ -0,0 +1,432 @@ +from collections.abc import Iterable +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.crusoe import CrusoeProvider + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + 
get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.crusoe.models import CrusoeConfig +from dstack._internal.core.backends.crusoe.resources import CrusoeClient +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import ( + PlacementGroup, + PlacementGroupProvisioningData, + PlacementStrategy, +) +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# Range for the persistent data disk created for instance types without ephemeral NVMe. +CONFIGURABLE_DISK_SIZE = Range[Memory]( + min=Memory.parse("50GB"), + max=Memory.parse("5000GB"), +) +WAIT_FOR_DISK_TIMEOUT = 30 +WAIT_FOR_VM_TIMEOUT = 120 + +SETUP_COMMANDS = [ + 'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config', + "service ssh restart", +] + +# Set up storage on the best available disk and move containerd there. +# Docker on Crusoe images delegates image storage to containerd's native snapshotter, +# so /var/lib/containerd is what determines container disk space. +# Handles: /dev/vdb (persistent data disk we create) or /dev/nvme* (ephemeral NVMe). +# For multiple NVMe drives, uses mdadm RAID-0 for maximum space. 
# Shell snippet that picks a data device, formats it, mounts it at /data and
# relocates /var/lib/containerd onto it via a bind mount.
#
# Device selection:
#   * /dev/vdb if present,
#   * otherwise a single /dev/nvme*n1 device,
#   * otherwise an mdadm RAID-0 array (/dev/md0) built from all /dev/nvme*n1 devices.
# If no device is found, DISK stays empty and the second `if` is a no-op.
#
# NOTE: in shell lists `&&` and `||` have equal precedence and associate to the left,
# so a bare `a && b || true && c` means `((a && b) || true) && c` and hides a failure
# of `a` or `b`. The best-effort `systemctl ... || true` steps are therefore wrapped in
# `{ ...; }` groups: only the systemctl call itself may fail silently, while a failed
# mkfs/mount/rsync/bind mount aborts the chain instead of silently continuing.
STORAGE_SETUP_COMMANDS = [
    (
        "DISK='' && "
        "if [ -b /dev/vdb ]; then DISK=/dev/vdb; "
        "elif ls /dev/nvme*n1 >/dev/null 2>&1; then"
        " NVME_DEVS=$(ls /dev/nvme*n1 2>/dev/null);"
        " NVME_COUNT=$(echo $NVME_DEVS | wc -w);"
        " if [ $NVME_COUNT -eq 1 ]; then DISK=$NVME_DEVS;"
        " elif [ $NVME_COUNT -gt 1 ]; then"
        " apt-get install -y -qq mdadm >/dev/null 2>&1 || true;"
        " mdadm --create /dev/md0 --level=0 --raid-devices=$NVME_COUNT $NVME_DEVS --force --run;"
        " DISK=/dev/md0;"
        " fi;"
        "fi && "
        'if [ -n "$DISK" ]; then'
        " mkfs.ext4 -q -F $DISK"
        " && mkdir -p /data"
        " && mount $DISK /data"
        " && service docker stop"
        # Best-effort: grouped so that `|| true` applies to this command only.
        " && { systemctl stop containerd || true; }"
        " && mkdir -p /data/containerd"
        " && rsync -a /var/lib/containerd/ /data/containerd/"
        " && mount --bind /data/containerd /var/lib/containerd"
        # Best-effort: grouped so that `|| true` applies to this command only.
        " && { systemctl start containerd || true; }"
        " && service docker start"
        "; fi"
    ),
]
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
    """
    Returns catalog offers for the configured regions, annotated with quota-based availability.

    Each offer's instance family (the instance name without its size suffix, see
    `_get_instance_family`) is matched against the quota names returned by
    `_get_quota_map`:

    * an exact name match always wins;
    * otherwise, among quota names that are a prefix of the family (or vice versa),
      the one closest in length to the family is used, with ties broken by name so
      the outcome does not depend on the order in which quotas were returned;
    * quotas with an empty name are ignored, since an empty string is a prefix of
      every family and would match all offers.

    Availability is `AVAILABLE` if the matched quota is positive, `NO_QUOTA` if it is
    not, and `UNKNOWN` if no quota matches.
    """
    offers = get_catalog_offers(
        backend=BackendType.CRUSOE,
        locations=self.config.regions or None,
        catalog=self._catalog,
    )
    quota_map = {name: count for name, count in self._get_quota_map().items() if name}

    def find_quota(family: str) -> Optional[int]:
        # Exact match first: "a100-80gb-sxm-ib" must not be resolved via "a100".
        if family in quota_map:
            return quota_map[family]
        candidates = [
            name
            for name in quota_map
            if family.startswith(name) or name.startswith(family)
        ]
        if not candidates:
            return None
        # Pick the most specific related name deterministically.
        best = min(candidates, key=lambda name: (abs(len(name) - len(family)), name))
        return quota_map[best]

    result = []
    for offer in offers:
        available = find_quota(_get_instance_family(offer.instance.name))
        if available is None:
            availability = InstanceAvailability.UNKNOWN
        elif available > 0:
            availability = InstanceAvailability.AVAILABLE
        else:
            availability = InstanceAvailability.NO_QUOTA
        result.append(offer.with_availability(availability=availability))
    return result
fetch Crusoe quotas, availability will be UNKNOWN") + return {} + result = {} + for q in quotas: + prog_name = q.get("programmatic_name", "") + available = q.get("available", 0) + category = q.get("category", "") + if "Instance" in category: + result[prog_name] = available + return result + + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + # Only adjust disk size for types without ephemeral NVMe (disk_gb == 0). + # Types with ephemeral NVMe already have their disk_size set by gpuhunt. + base_modifier = get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements) + + def modifier( + offer: InstanceOfferWithAvailability, + ) -> Optional[InstanceOfferWithAvailability]: + if _has_ephemeral_disk(offer): + return offer + return base_modifier(offer) + + return [modifier] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name(instance_config) + region = instance_offer.region + + ib_partition_id = None + if placement_group: + assert placement_group.provisioning_data is not None + pg_data = CrusoePlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + ib_partition_id = pg_data.ib_partition_id + + gpus = instance_offer.instance.resources.gpus + gpu_type = gpus[0].name if gpus else "" + instance_type_name = instance_offer.instance.name + image = _get_image(instance_type_name, gpu_type) + + needs_data_disk = not _has_ephemeral_disk(instance_offer) + # Always include storage setup: it auto-detects /dev/vdb (data disk) or + # /dev/nvme* (ephemeral NVMe) and moves containerd storage there. 
+ commands = SETUP_COMMANDS + STORAGE_SETUP_COMMANDS + get_shim_commands(is_privileged=True) + startup_script = "#!/bin/bash\nset -e\n" + " && ".join(commands) + + data_disk_id = None + create_op = None + try: + if needs_data_disk: + disk_size_mib = instance_offer.instance.resources.disk.size_mib + disk_size_gib = max(disk_size_mib // 1024, 1) + disk_op = self._client.create_disk( + name=f"{instance_name}-data", + size=f"{disk_size_gib}GiB", + location=region, + ) + data_disk_id = disk_op["metadata"]["id"] + self._client.wait_for_disk_operation( + disk_op["operation_id"], timeout=WAIT_FOR_DISK_TIMEOUT + ) + + disks = None + if data_disk_id: + disks = [ + {"disk_id": data_disk_id, "mode": "read-write", "attachment_type": "data"} + ] + + host_channel_adapters = None + if ib_partition_id: + host_channel_adapters = [{"ib_partition_id": ib_partition_id}] + + create_op = self._client.create_vm( + name=instance_name, + vm_type=instance_type_name, + location=region, + ssh_public_key=instance_config.get_public_keys()[0], + image=image, + startup_script=startup_script, + disks=disks, + host_channel_adapters=host_channel_adapters, + ) + vm_id = create_op["metadata"]["id"] + self._client.wait_for_vm_operation( + create_op["operation_id"], timeout=WAIT_FOR_VM_TIMEOUT + ) + except BaseException: + if create_op is not None: + vm_id_to_delete = create_op.get("metadata", {}).get("id") + if vm_id_to_delete: + try: + self._client.delete_vm(vm_id_to_delete) + except Exception as e: + logger.exception("Could not delete VM %s: %s", vm_id_to_delete, e) + if data_disk_id: + try: + self._client.delete_disk(data_disk_id) + except Exception as e: + logger.exception("Could not delete disk %s: %s", data_disk_id, e) + raise + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=vm_id, + hostname=None, + region=region, + price=instance_offer.price, + ssh_port=22, + username="ubuntu", + dockerized=True, + 
def terminate_instance(
    self, instance_id: str, region: str, backend_data: Optional[str] = None
):
    """
    Deletes the VM and, once the VM is gone, its separately created data disk.

    The method is meant to be called repeatedly: it raises `NotYetTerminated` while
    the VM still exists (or while the data disk could not be deleted yet) and returns
    normally only when there is nothing left to clean up.

    Args:
        instance_id: The Crusoe VM ID.
        region: Unused here; the VM is addressed by ID only.
        backend_data: JSON produced by `CrusoeInstanceBackendData`, may be `None`.

    Raises:
        NotYetTerminated: The VM is still present or the data disk is not deleted yet.
        BackendError: The VM state could not be determined (any API error except 404).
    """

    def is_not_found(error: BackendError) -> bool:
        # The API client reports HTTP 404 as BackendError("Resource not found: ...").
        # NOTE(review): relies on str(error) returning that message -- confirm.
        return str(error).startswith("Resource not found")

    backend_data_parsed = CrusoeInstanceBackendData.load(backend_data)
    try:
        vm = self._client.get_vm(instance_id)
    except BackendError as e:
        if not is_not_found(e):
            # A transient/auth/server error says nothing about the VM.
            # Treating it as "already deleted" would leave a running VM behind.
            raise
        vm = None

    if vm is not None:
        state = vm.get("state", "")
        if state not in ("STATE_DELETING", "STATE_DELETED"):
            try:
                self._client.delete_vm(instance_id)
            except BackendError:
                # Best-effort: NotYetTerminated below makes the caller try again.
                pass
            raise NotYetTerminated(f"Requested VM deletion. State was: {state}")
        else:
            raise NotYetTerminated(f"Waiting for VM deletion. State: {state}")

    # OS disk is auto-deleted with the VM. Data disk must be deleted separately.
    if backend_data_parsed.data_disk_id:
        try:
            self._client.delete_disk(backend_data_parsed.data_disk_id)
        except BackendError as e:
            if not is_not_found(e):
                # Do not report success while the disk still exists; ask for a retry.
                raise NotYetTerminated(
                    f"Failed to delete data disk {backend_data_parsed.data_disk_id}: {e}"
                )
instance_offer.region: + return False + assert placement_group.provisioning_data is not None + pg_data = CrusoePlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + if pg_data.ib_partition_id is None: + return not _is_ib_type(instance_offer.instance.name) + return _is_ib_type(instance_offer.instance.name) + + +class CrusoeInstanceBackendData(CoreModel): + data_disk_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "CrusoeInstanceBackendData": + if raw is None: + return cls() + return cls.__response__.parse_raw(raw) + + +class CrusoePlacementGroupBackendData(CoreModel): + ib_partition_id: Optional[str] = None + ib_network_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "CrusoePlacementGroupBackendData": + if raw is None: + return cls() + return cls.__response__.parse_raw(raw) diff --git a/src/dstack/_internal/core/backends/crusoe/configurator.py b/src/dstack/_internal/core/backends/crusoe/configurator.py new file mode 100644 index 0000000000..95f805458e --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/configurator.py @@ -0,0 +1,78 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.crusoe.backend import CrusoeBackend +from dstack._internal.core.backends.crusoe.models import ( + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + CrusoeConfig, + CrusoeCreds, + CrusoeStoredConfig, +) +from dstack._internal.core.backends.crusoe.resources import CrusoeClient +from dstack._internal.core.models.backends.base import BackendType + + +class CrusoeConfigurator( + Configurator[ + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + ] +): + TYPE = BackendType.CRUSOE + BACKEND_CLASS = CrusoeBackend + + def validate_config(self, config: CrusoeBackendConfigWithCreds, default_creds_enabled: bool): + try: + client = 
CrusoeClient(config.creds, config.project_id) + client.list_quotas() + except Exception as e: + raise_invalid_credentials_error( + fields=[["creds"]], + details=str(e), + ) + if config.regions: + try: + available = set(client.list_locations()) + except Exception: + return + invalid = set(config.regions) - available + if invalid: + raise_invalid_credentials_error( + fields=[["regions"]], + details=( + f"Unknown regions: {sorted(invalid)}. Valid regions: {sorted(available)}" + ), + ) + + def create_backend( + self, project_name: str, config: CrusoeBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=CrusoeStoredConfig( + **CrusoeBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=CrusoeCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> CrusoeBackendConfigWithCreds: + config = self._get_config(record) + return CrusoeBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> CrusoeBackendConfig: + config = self._get_config(record) + return CrusoeBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> CrusoeBackend: + config = self._get_config(record) + return CrusoeBackend(config=config) + + def _get_config(self, record: BackendRecord) -> CrusoeConfig: + return CrusoeConfig.__response__( + **json.loads(record.config), + creds=CrusoeCreds.parse_raw(record.auth), + ) diff --git a/src/dstack/_internal/core/backends/crusoe/models.py b/src/dstack/_internal/core/backends/crusoe/models.py new file mode 100644 index 0000000000..d867301c0d --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/models.py @@ -0,0 +1,48 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class CrusoeAccessKeyCreds(CoreModel): + type: Annotated[Literal["access_key"], 
Field(description="The type of credentials")] = ( + "access_key" + ) + access_key: Annotated[str, Field(description="The Crusoe API access key")] + secret_key: Annotated[str, Field(description="The Crusoe API secret key")] + + +AnyCrusoeCreds = CrusoeAccessKeyCreds +CrusoeCreds = AnyCrusoeCreds + + +class CrusoeBackendConfig(CoreModel): + type: Annotated[ + Literal["crusoe"], + Field(description="The type of backend"), + ] = "crusoe" + project_id: Annotated[str, Field(description="The Crusoe project ID")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of allowed Crusoe regions. Omit to use all regions"), + ] = None + + +class CrusoeBackendConfigWithCreds(CrusoeBackendConfig): + creds: Annotated[AnyCrusoeCreds, Field(description="The credentials")] + + +AnyCrusoeBackendConfig = Union[CrusoeBackendConfig, CrusoeBackendConfigWithCreds] + + +class CrusoeBackendFileConfigWithCreds(CrusoeBackendConfig): + creds: Annotated[AnyCrusoeCreds, Field(description="The credentials")] + + +class CrusoeStoredConfig(CrusoeBackendConfig): + pass + + +class CrusoeConfig(CrusoeStoredConfig): + creds: AnyCrusoeCreds diff --git a/src/dstack/_internal/core/backends/crusoe/resources.py b/src/dstack/_internal/core/backends/crusoe/resources.py new file mode 100644 index 0000000000..1f84ff4019 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/resources.py @@ -0,0 +1,198 @@ +import base64 +import datetime +import hashlib +import hmac +import time +from typing import Any, Dict, List, Optional + +import requests + +from dstack._internal.core.backends.crusoe.models import CrusoeAccessKeyCreds +from dstack._internal.core.errors import BackendError, NoCapacityError, ProvisioningError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +API_URL = "https://api.crusoecloud.com" +API_VERSION = "/v1alpha5" +SIGNATURE_VERSION = "1.0" +REQUEST_TIMEOUT = 30 + + +class CrusoeClient: + def __init__(self, 
creds: CrusoeAccessKeyCreds, project_id: str): + self.access_key = creds.access_key + self.secret_key = creds.secret_key + self.project_id = project_id + + def _request( + self, + method: str, + path: str, + params: Optional[dict] = None, + body: Optional[dict] = None, + ) -> requests.Response: + dt = str(datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)) + dt = dt.replace(" ", "T") + + query_string = "" + if params: + query_string = "&".join(f"{k}={v}" for k, v in sorted(params.items())) + + payload = f"{API_VERSION}{path}\n{query_string}\n{method}\n{dt}\n" + + decoded_secret = base64.urlsafe_b64decode( + self.secret_key + "=" * (-len(self.secret_key) % 4) + ) + sig = hmac.new(decoded_secret, msg=payload.encode("ascii"), digestmod=hashlib.sha256) + encoded_sig = base64.urlsafe_b64encode(sig.digest()).decode("ascii").rstrip("=") + + headers = { + "X-Crusoe-Timestamp": dt, + "Authorization": f"Bearer {SIGNATURE_VERSION}:{self.access_key}:{encoded_sig}", + } + if body is not None: + headers["Content-Type"] = "application/json" + + url = f"{API_URL}{API_VERSION}{path}" + resp = requests.request( + method, url, headers=headers, params=params, json=body, timeout=REQUEST_TIMEOUT + ) + if resp.status_code >= 400: + _raise_api_error(resp) + return resp + + def _project_path(self, path: str) -> str: + return f"/projects/{self.project_id}{path}" + + # --- VM operations --- + + def create_vm( + self, + name: str, + vm_type: str, + location: str, + ssh_public_key: str, + image: str, + startup_script: str, + disks: Optional[List[Dict[str, str]]] = None, + host_channel_adapters: Optional[List[Dict[str, str]]] = None, + ) -> dict: + body: Dict[str, Any] = { + "name": name, + "type": vm_type, + "location": location, + "ssh_public_key": ssh_public_key, + "image": image, + "startup_script": startup_script, + } + if disks: + body["disks"] = disks + if host_channel_adapters: + body["host_channel_adapters"] = host_channel_adapters + resp = self._request("POST", 
self._project_path("/compute/vms/instances"), body=body) + return resp.json()["operation"] + + def get_vm(self, vm_id: str) -> dict: + resp = self._request("GET", self._project_path(f"/compute/vms/instances/{vm_id}")) + return resp.json() + + def delete_vm(self, vm_id: str) -> dict: + resp = self._request("DELETE", self._project_path(f"/compute/vms/instances/{vm_id}")) + return resp.json()["operation"] + + def get_vm_operation(self, operation_id: str) -> dict: + resp = self._request( + "GET", self._project_path(f"/compute/vms/instances/operations/{operation_id}") + ) + return resp.json() + + # --- Disk operations --- + + def create_disk(self, name: str, size: str, location: str) -> dict: + body = { + "name": name, + "size": size, + "location": location, + "type": "persistent-ssd", + "block_size": 4096, + } + resp = self._request("POST", self._project_path("/storage/disks"), body=body) + return resp.json()["operation"] + + def delete_disk(self, disk_id: str) -> dict: + resp = self._request("DELETE", self._project_path(f"/storage/disks/{disk_id}")) + return resp.json()["operation"] + + def get_disk_operation(self, operation_id: str) -> dict: + resp = self._request( + "GET", self._project_path(f"/storage/disks/operations/{operation_id}") + ) + return resp.json() + + # --- Quota operations --- + + def list_quotas(self) -> List[dict]: + resp = self._request("GET", self._project_path("/quotas")) + return resp.json().get("quotas", []) + + # --- Location operations --- + + def list_locations(self) -> List[str]: + resp = self._request("GET", "/locations") + return resp.json().get("items", []) + + # --- IB operations --- + + def list_ib_networks(self) -> List[dict]: + resp = self._request("GET", self._project_path("/networking/ib-networks")) + return resp.json().get("items", []) + + def create_ib_partition(self, name: str, ib_network_id: str) -> dict: + body = {"name": name, "ib_network_id": ib_network_id} + resp = self._request("POST", 
self._project_path("/networking/ib-partitions"), body=body) + return resp.json() + + def delete_ib_partition(self, partition_id: str) -> None: + self._request("DELETE", self._project_path(f"/networking/ib-partitions/{partition_id}")) + + # --- Operation polling --- + + def wait_for_vm_operation( + self, operation_id: str, timeout: float = 120, interval: float = 5 + ) -> dict: + return self._wait_for_operation(operation_id, self.get_vm_operation, timeout, interval) + + def wait_for_disk_operation( + self, operation_id: str, timeout: float = 30, interval: float = 2 + ) -> dict: + return self._wait_for_operation(operation_id, self.get_disk_operation, timeout, interval) + + def _wait_for_operation(self, operation_id, get_fn, timeout, interval) -> dict: + deadline = time.monotonic() + timeout + while True: + op = get_fn(operation_id) + state = op.get("state", op.get("operation", {}).get("state")) + if state == "SUCCEEDED": + return op + if state == "FAILED": + result = op.get("result", {}) + code = result.get("code", "") + message = result.get("message", str(result)) + if code == "out_of_stock": + raise NoCapacityError(message) + raise ProvisioningError(f"Operation {operation_id} failed: {message}") + if time.monotonic() + interval > deadline: + raise BackendError(f"Operation {operation_id} timed out (state: {state})") + time.sleep(interval) + + +def _raise_api_error(resp: requests.Response) -> None: + try: + data = resp.json() + message = data.get("message", data.get("error", str(data))) + except Exception: + message = resp.text[:500] + if resp.status_code == 404: + raise BackendError(f"Resource not found: {message}") + raise BackendError(f"Crusoe API error ({resp.status_code}): {message}") diff --git a/src/dstack/_internal/core/backends/cudo/__init__.py b/src/dstack/_internal/core/backends/cudo/__init__.py index 2e606f50fe..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/cudo/__init__.py +++ b/src/dstack/_internal/core/backends/cudo/__init__.py @@ -1,15 
+0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.cudo.compute import CudoCompute -from dstack._internal.core.backends.cudo.config import CudoConfig -from dstack._internal.core.models.backends.base import BackendType - - -class CudoBackend(Backend): - TYPE: BackendType = BackendType.CUDO - - def __init__(self, config: CudoConfig): - self.config = config - self._compute = CudoCompute(self.config) - - def compute(self) -> CudoCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/cudo/backend.py b/src/dstack/_internal/core/backends/cudo/backend.py new file mode 100644 index 0000000000..b7e724058f --- /dev/null +++ b/src/dstack/_internal/core/backends/cudo/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.cudo.compute import CudoCompute +from dstack._internal.core.backends.cudo.models import CudoConfig +from dstack._internal.core.models.backends.base import BackendType + + +class CudoBackend(Backend): + TYPE = BackendType.CUDO + COMPUTE_CLASS = CudoCompute + + def __init__(self, config: CudoConfig): + self.config = config + self._compute = CudoCompute(self.config) + + def compute(self) -> CudoCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/cudo/compute.py b/src/dstack/_internal/core/backends/cudo/compute.py index 250057b50f..0dbddcba0e 100644 --- a/src/dstack/_internal/core/backends/cudo/compute.py +++ b/src/dstack/_internal/core/backends/cudo/compute.py @@ -2,92 +2,87 @@ import requests -from dstack._internal.core.backends.base import Compute +from dstack._internal.core.backends.base.backend import Compute from dstack._internal.core.backends.base.compute import ( - get_instance_name, + ComputeWithCreateInstanceSupport, + ComputeWithFilteredOffersCached, + ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, get_shim_commands, ) from 
dstack._internal.core.backends.base.offers import get_catalog_offers from dstack._internal.core.backends.cudo.api_client import CudoApiClient -from dstack._internal.core.backends.cudo.config import CudoConfig +from dstack._internal.core.backends.cudo.models import CudoConfig from dstack._internal.core.errors import BackendError, NoCapacityError, ProvisioningError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, InstanceOfferWithAvailability, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData, Requirements from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) -class CudoCompute(Compute): +MAX_RESOURCE_NAME_LEN = 30 + + +class CudoCompute( + ComputeWithFilteredOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): def __init__(self, config: CudoConfig): + super().__init__() self.config = config self.api_client = CudoApiClient(config.creds.api_key) - def get_offers( - self, requirements: Optional[Requirements] = None + def get_offers_by_requirements( + self, requirements: Requirements ) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.CUDO, + locations=self.config.regions, requirements=requirements, ) offers = [ - InstanceOfferWithAvailability( - **offer.dict(), availability=InstanceAvailability.AVAILABLE - ) + offer.with_availability(availability=InstanceAvailability.AVAILABLE) for offer in offers + # in-hyderabad-1 is known to have provisioning issues if offer.region not in ["in-hyderabad-1"] ] return offers - def run_job( - self, - run: Run, - job: Job, - 
instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), - ssh_keys=[ - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) - def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: + vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN) public_keys = instance_config.get_public_keys() memory_size = round(instance_offer.instance.resources.memory_mib / 1024) disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - commands = get_shim_commands(authorized_keys=public_keys) gpus_no = len(instance_offer.instance.resources.gpus) - shim_commands = " ".join([" && ".join(commands)]) - startup_script = ( - shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}" - ) + if gpus_no > 0: + # we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands() + commands = install_jq_commands() + else: + commands = install_docker_commands() + commands += get_shim_commands() - vm_id = f"{instance_config.instance_name}-{instance_offer.region}" try: resp_data = self.api_client.create_virtual_machine( project_id=self.config.project_id, boot_disk_storage_class="STORAGE_CLASS_NETWORK", boot_disk_size_gib=disk_size, - book_disk_id=f"{instance_config.instance_name}_{instance_offer.region}_disk_id", + book_disk_id=f"{vm_id}_disk_id", boot_disk_image_id=_get_image_id(gpus_no > 0), data_center_id=instance_offer.region, gpus=gpus_no, @@ -95,7 +90,7 @@ def create_instance( memory_gib=memory_size, vcpus=instance_offer.instance.resources.cpus, 
vm_id=vm_id, - start_script=startup_script, + start_script=" && ".join(commands), password=None, customSshKeys=public_keys, ) @@ -138,10 +133,10 @@ def terminate_instance( try: self.api_client.terminate_virtual_machine(instance_id, self.config.project_id) except requests.HTTPError as e: - if e.response.status_code == requests.codes.not_found: + if e.response is not None and e.response.status_code == requests.codes.not_found: logger.debug("The instance with name %s not found", instance_id) return - raise BackendError(e.response.text) + raise BackendError(e.response.text if e.response is not None else str(e)) def update_provisioning_data( self, @@ -157,10 +152,23 @@ def update_provisioning_data( def _get_image_id(cuda: bool) -> str: - image_name = "ubuntu-2204-nvidia-535-docker-v20240214" if cuda else "ubuntu-2204" + image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204" return image_name -def install_docker_script(): - commands = 'export DEBIAN_FRONTEND="noninteractive" && mkdir -p /etc/apt/keyrings && curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && apt-get update && apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin' - return commands +def install_jq_commands(): + return [ + "export DEBIAN_FRONTEND=noninteractive", + "apt-get --assume-yes install jq", + ] + + +def install_docker_commands(): + return [ + "export DEBIAN_FRONTEND=noninteractive", + "mkdir -p /etc/apt/keyrings", + "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg", + 'echo "deb [arch=$(dpkg --print-architecture) 
signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null', + "apt-get update", + "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin", + ] diff --git a/src/dstack/_internal/core/backends/cudo/config.py b/src/dstack/_internal/core/backends/cudo/config.py deleted file mode 100644 index 65dad034c1..0000000000 --- a/src/dstack/_internal/core/backends/cudo/config.py +++ /dev/null @@ -1,9 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.cudo import ( - AnyCudoCreds, - CudoStoredConfig, -) - - -class CudoConfig(CudoStoredConfig, BackendConfig): - creds: AnyCudoCreds diff --git a/src/dstack/_internal/core/backends/cudo/configurator.py b/src/dstack/_internal/core/backends/cudo/configurator.py new file mode 100644 index 0000000000..a4d5b7d94d --- /dev/null +++ b/src/dstack/_internal/core/backends/cudo/configurator.py @@ -0,0 +1,63 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.cudo import api_client +from dstack._internal.core.backends.cudo.backend import CudoBackend +from dstack._internal.core.backends.cudo.models import ( + CudoBackendConfig, + CudoBackendConfigWithCreds, + CudoConfig, + CudoCreds, + CudoStoredConfig, +) +from dstack._internal.core.models.backends.base import BackendType + + +class CudoConfigurator( + Configurator[ + CudoBackendConfig, + CudoBackendConfigWithCreds, + ] +): + TYPE = BackendType.CUDO + BACKEND_CLASS = CudoBackend + + def validate_config(self, config: CudoBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_cudo_api_key(config.creds.api_key) + + def create_backend( + self, project_name: str, config: CudoBackendConfigWithCreds + ) -> 
BackendRecord: + return BackendRecord( + config=CudoStoredConfig( + **CudoBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=CudoCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> CudoBackendConfigWithCreds: + config = self._get_config(record) + return CudoBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> CudoBackendConfig: + config = self._get_config(record) + return CudoBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> CudoBackend: + config = self._get_config(record) + return CudoBackend(config=config) + + def _get_config(self, record: BackendRecord) -> CudoConfig: + return CudoConfig.__response__( + **json.loads(record.config), + creds=CudoCreds.parse_raw(record.auth), + ) + + def _validate_cudo_api_key(self, api_key: str): + client = api_client.CudoApiClient(api_key=api_key) + if not client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/cudo/models.py b/src/dstack/_internal/core/backends/cudo/models.py new file mode 100644 index 0000000000..e22d9d49af --- /dev/null +++ b/src/dstack/_internal/core/backends/cudo/models.py @@ -0,0 +1,37 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class CudoAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyCudoCreds = CudoAPIKeyCreds +CudoCreds = AnyCudoCreds + + +class CudoBackendConfig(CoreModel): + type: Annotated[Literal["cudo"], Field(description="The type of backend")] = "cudo" + regions: Annotated[ + Optional[List[str]], Field(description="The list of Cudo regions. 
Omit to use all regions") + ] = None + project_id: Annotated[str, Field(description="The project ID")] + + +class CudoBackendConfigWithCreds(CudoBackendConfig): + creds: Annotated[AnyCudoCreds, Field(description="The credentials")] + + +AnyCudoBackendConfig = Union[CudoBackendConfig, CudoBackendConfigWithCreds] + + +class CudoStoredConfig(CudoBackendConfig): + pass + + +class CudoConfig(CudoStoredConfig): + creds: AnyCudoCreds diff --git a/src/dstack/_internal/core/backends/datacrunch/__init__.py b/src/dstack/_internal/core/backends/datacrunch/__init__.py index 7545c87b10..ca4773c861 100644 --- a/src/dstack/_internal/core/backends/datacrunch/__init__.py +++ b/src/dstack/_internal/core/backends/datacrunch/__init__.py @@ -1,15 +1 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.datacrunch.compute import DataCrunchCompute -from dstack._internal.core.backends.datacrunch.config import DataCrunchConfig -from dstack._internal.core.models.backends.base import BackendType - - -class DataCrunchBackend(Backend): - TYPE: BackendType = BackendType.DATACRUNCH - - def __init__(self, config: DataCrunchConfig): - self.config = config - self._compute = DataCrunchCompute(self.config) - - def compute(self) -> DataCrunchCompute: - return self._compute +# DataCrunch backend for backward compatibility diff --git a/src/dstack/_internal/core/backends/datacrunch/api_client.py b/src/dstack/_internal/core/backends/datacrunch/api_client.py deleted file mode 100644 index 74ddeb5630..0000000000 --- a/src/dstack/_internal/core/backends/datacrunch/api_client.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Optional - -from datacrunch import DataCrunchClient -from datacrunch.exceptions import APIException -from datacrunch.instances.instances import Instance - -from dstack._internal.core.errors import NoCapacityError -from dstack._internal.utils.ssh import get_public_key_fingerprint - - -class DataCrunchAPIClient: - def __init__(self, client_id: 
str, client_secret: str): - self.client = DataCrunchClient(client_id, client_secret) - - def delete_instance(self, instance_id: str) -> None: - try: - self.client.instances.action(id_list=[instance_id], action="delete") - except APIException: - pass - - def get_or_create_ssh_key(self, name: str, public_key: str) -> str: - fingerprint = get_public_key_fingerprint(public_key) - keys = self.client.ssh_keys.get() - found_keys = [ - key for key in keys if fingerprint == get_public_key_fingerprint(key.public_key) - ] - if found_keys: - key = found_keys[0] - return key.id - - key = self.client.ssh_keys.create(name, public_key) - return key.id - - def get_or_create_startup_scrpit(self, name: str, script: str) -> str: - scripts = self.client.startup_scripts.get() - found_scripts = [startup_script for startup_script in scripts if script == startup_script] - if found_scripts: - startup_script = found_scripts[0] - return startup_script.id - - startup_script = self.client.startup_scripts.create(name, script) - return startup_script.id - - def get_instance_by_id(self, instance_id: str) -> Optional[Instance]: - try: - return self.client.instances.get_by_id(instance_id) - except APIException: - return None - - def deploy_instance( - self, - instance_type, - image, - ssh_key_ids, - hostname, - description, - startup_script_id, - disk_size, - is_spot=True, - location="FIN-01", - ) -> Instance: - try: - instance = self.client.instances.create( - instance_type=instance_type, - image=image, - ssh_key_ids=ssh_key_ids, - hostname=hostname, - description=description, - startup_script_id=startup_script_id, - is_spot=is_spot, - location=location, - os_volume={"name": "OS volume", "size": disk_size}, - ) - except APIException: - raise NoCapacityError() - - return instance diff --git a/src/dstack/_internal/core/backends/datacrunch/backend.py b/src/dstack/_internal/core/backends/datacrunch/backend.py new file mode 100644 index 0000000000..1ce1c97c42 --- /dev/null +++ 
b/src/dstack/_internal/core/backends/datacrunch/backend.py @@ -0,0 +1,18 @@ +from dstack._internal.core.backends.datacrunch.compute import DataCrunchCompute +from dstack._internal.core.backends.verda.backend import VerdaBackend +from dstack._internal.core.backends.verda.models import VerdaConfig +from dstack._internal.core.models.backends.base import BackendType + + +# Deprecated +# TODO: Remove in 0.21 +class DataCrunchBackend(VerdaBackend): + TYPE = BackendType.DATACRUNCH + COMPUTE_CLASS = DataCrunchCompute + + def __init__(self, config: VerdaConfig): + self.config = config + self._compute = DataCrunchCompute(self.config, self.TYPE) + + def compute(self) -> DataCrunchCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/datacrunch/compute.py b/src/dstack/_internal/core/backends/datacrunch/compute.py index d7750e1d7f..906c9ea2e5 100644 --- a/src/dstack/_internal/core/backends/datacrunch/compute.py +++ b/src/dstack/_internal/core/backends/datacrunch/compute.py @@ -1,172 +1,8 @@ -from typing import Dict, List, Optional - -from dstack._internal.core.backends.base import Compute -from dstack._internal.core.backends.base.compute import ( - get_shim_commands, -) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.datacrunch.api_client import DataCrunchAPIClient -from dstack._internal.core.backends.datacrunch.config import DataCrunchConfig +from dstack._internal.core.backends.verda.compute import VerdaCompute +from dstack._internal.core.backends.verda.models import VerdaConfig from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOffer, - InstanceOfferWithAvailability, - SSHKey, -) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.logging 
import get_logger - -logger = get_logger("datacrunch.compute") - - -class DataCrunchCompute(Compute): - def __init__(self, config: DataCrunchConfig): - self.config = config - self.api_client = DataCrunchAPIClient(config.creds.client_id, config.creds.client_secret) - - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - offers = get_catalog_offers( - backend=BackendType.DATACRUNCH, - locations=self.config.regions, - requirements=requirements, - ) - offers_with_availability = self._get_offers_with_availability(offers) - return offers_with_availability - - def _get_offers_with_availability( - self, offers: List[InstanceOffer] - ) -> List[InstanceOfferWithAvailability]: - raw_availabilities: List[Dict] = self.api_client.client.instances.get_availabilities() - - region_availabilities = {} - for location in raw_availabilities: - location_code = location["location_code"] - availabilities = location["availabilities"] - if location_code not in self.config.regions: - continue - for name in availabilities: - key = (name, location_code) - region_availabilities[key] = InstanceAvailability.AVAILABLE - - availability_offers = [] - for offer in offers: - key = (offer.instance.name, offer.region) - availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE) - availability_offers.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) - ) - - return availability_offers - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - ) -> JobProvisioningData: - public_keys = instance_config.get_public_keys() - ssh_ids = [] - for ssh_public_key in public_keys: - ssh_ids.append( - # datacrunch allows you to use the same name - self.api_client.get_or_create_ssh_key( - name=f"dstack-{instance_config.instance_name}.key", - public_key=ssh_public_key, - ) - ) - - commands = get_shim_commands(authorized_keys=public_keys) - 
- startup_script = " ".join([" && ".join(commands)]) - script_name = f"dstack-{instance_config.instance_name}.sh" - - logger.debug("startup script:", startup_script) - - startup_script_ids = self.api_client.get_or_create_startup_scrpit( - name=script_name, script=startup_script - ) - - # Id of image "Ubuntu 22.04 + CUDA 12.0 + Docker" - # from API https://datacrunch.stoplight.io/docs/datacrunch-public/c46ab45dbc508-get-all-image-types - image_name = "2088da25-bb0d-41cc-a191-dccae45d96fd" - - disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - - instance = self.api_client.deploy_instance( - instance_type=instance_offer.instance.name, - ssh_key_ids=ssh_ids, - startup_script_id=startup_script_ids, - hostname=instance_config.instance_name, - description=instance_config.instance_name, - image=image_name, - disk_size=disk_size, - location=instance_offer.region, - ) - - logger.debug( - "deploy_instance", - { - "instance_type": instance_offer.instance.name, - "ssh_key_ids": ssh_ids, - "startup_script_id": startup_script_ids, - "hostname": instance_config.instance_name, - "description": instance_config.instance_name, - "image": image_name, - "disk_size": disk_size, - "location": instance_offer.region, - }, - ) - - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=instance.id, - hostname=None, - internal_ip=None, - region=instance.location, - price=instance_offer.price, - username="root", - ssh_port=22, - dockerized=True, - ssh_proxy=None, - backend_data=None, - ) - - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=job.job_spec.job_name, # TODO: generate name - ssh_keys=[ - 
SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ) -> None: - self.api_client.delete_instance(instance_id) - def update_provisioning_data( - self, - provisioning_data: JobProvisioningData, - project_ssh_public_key: str, - project_ssh_private_key: str, - ): - instance = self.api_client.get_instance_by_id(provisioning_data.instance_id) - if instance is not None and instance.status == "running": - provisioning_data.hostname = instance.ip +class DataCrunchCompute(VerdaCompute): + def __init__(self, config: VerdaConfig, backend_type: BackendType): + super().__init__(config, backend_type) diff --git a/src/dstack/_internal/core/backends/datacrunch/config.py b/src/dstack/_internal/core/backends/datacrunch/config.py deleted file mode 100644 index 39369ff748..0000000000 --- a/src/dstack/_internal/core/backends/datacrunch/config.py +++ /dev/null @@ -1,9 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.datacrunch import ( - AnyDataCrunchCreds, - DataCrunchStoredConfig, -) - - -class DataCrunchConfig(DataCrunchStoredConfig, BackendConfig): - creds: AnyDataCrunchCreds diff --git a/src/dstack/_internal/core/backends/datacrunch/configurator.py b/src/dstack/_internal/core/backends/datacrunch/configurator.py new file mode 100644 index 0000000000..944f8657d3 --- /dev/null +++ b/src/dstack/_internal/core/backends/datacrunch/configurator.py @@ -0,0 +1,17 @@ +from dstack._internal.core.backends.base.configurator import BackendRecord +from dstack._internal.core.backends.datacrunch.backend import DataCrunchBackend +from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class 
DataCrunchConfigurator(VerdaConfigurator): + TYPE = BackendType.DATACRUNCH + BACKEND_CLASS = DataCrunchBackend + + def get_backend(self, record: BackendRecord) -> DataCrunchBackend: + config = self._get_config(record) + return DataCrunchBackend(config=config) diff --git a/src/dstack/_internal/core/backends/digitalocean/__init__.py b/src/dstack/_internal/core/backends/digitalocean/__init__.py new file mode 100644 index 0000000000..0f0092fd9f --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/__init__.py @@ -0,0 +1 @@ +# DigitalOcean backend for dstack diff --git a/src/dstack/_internal/core/backends/digitalocean/backend.py b/src/dstack/_internal/core/backends/digitalocean/backend.py new file mode 100644 index 0000000000..fc09b4c03d --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.digitalocean.compute import DigitalOceanCompute +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType + + +class DigitalOceanBackend(BaseDigitalOceanBackend): + TYPE = BackendType.DIGITALOCEAN + COMPUTE_CLASS = DigitalOceanCompute + + def __init__(self, config: BaseDigitalOceanConfig, api_url: str): + self.config = config + self._compute = DigitalOceanCompute(self.config, api_url=api_url, type=self.TYPE) + + def compute(self) -> DigitalOceanCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/digitalocean/compute.py b/src/dstack/_internal/core/backends/digitalocean/compute.py new file mode 100644 index 0000000000..e3b26d0261 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/compute.py @@ -0,0 +1,5 @@ +from ..digitalocean_base.compute import BaseDigitalOceanCompute + + +class DigitalOceanCompute(BaseDigitalOceanCompute): + pass diff --git 
a/src/dstack/_internal/core/backends/digitalocean/configurator.py b/src/dstack/_internal/core/backends/digitalocean/configurator.py new file mode 100644 index 0000000000..0453723128 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/configurator.py @@ -0,0 +1,31 @@ +from typing import Optional + +from dstack._internal.core.backends.base.configurator import BackendRecord +from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.configurator import ( + BaseDigitalOceanConfigurator, +) +from dstack._internal.core.backends.digitalocean_base.models import ( + AnyBaseDigitalOceanCreds, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class DigitalOceanConfigurator(BaseDigitalOceanConfigurator): + TYPE = BackendType.DIGITALOCEAN + BACKEND_CLASS = DigitalOceanBackend + API_URL = "https://api.digitalocean.com" + + def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: + config = self._get_config(record) + return DigitalOceanBackend(config=config, api_url=self.API_URL) + + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): + api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) + api_client.validate_api_key() + if project_name: + api_client.validate_project_name(project_name) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/__init__.py b/src/dstack/_internal/core/backends/digitalocean_base/__init__.py new file mode 100644 index 0000000000..cc8247e940 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/__init__.py @@ -0,0 +1 @@ +# This package contains the base classes for DigitalOcean and AMDDevCloud backends. 
diff --git a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py new file mode 100644 index 0000000000..fa901fc0d9 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, List, Optional + +import requests + +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error +from dstack._internal.core.errors import NoCapacityError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class DigitalOceanAPIClient: + def __init__(self, api_key: str, api_url: str): + self.api_key = api_key + self.base_url = api_url + + def validate_api_key(self) -> bool: + try: + response = self._make_request("GET", "/v2/account") + response.raise_for_status() + return True + except requests.HTTPError as e: + if e.response is not None and e.response.status_code == 401: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], details="Invaild API key" + ) + raise e + + def validate_project_name(self, project_name: str) -> bool: + if self.get_project_id(project_name) is None: + raise_invalid_credentials_error( + fields=[["project_name"]], + details=f"Project with name '{project_name}' does not exist", + ) + return True + + def list_ssh_keys(self) -> List[Dict[str, Any]]: + response = self._make_request("GET", "/v2/account/keys") + response.raise_for_status() + return response.json()["ssh_keys"] + + def list_projects(self) -> List[Dict[str, Any]]: + response = self._make_request("GET", "/v2/projects") + response.raise_for_status() + return response.json()["projects"] + + def get_project_id(self, project_name: str) -> Optional[str]: + projects = self.list_projects() + for project in projects: + if project["name"] == project_name: + return project["id"] + return None + + def create_ssh_key(self, name: str, public_key: str) -> Dict[str, Any]: + payload = 
{"name": name, "public_key": public_key} + response = self._make_request("POST", "/v2/account/keys", json=payload) + response.raise_for_status() + return response.json()["ssh_key"] + + def get_or_create_ssh_key(self, name: str, public_key: str) -> int: + ssh_keys = self.list_ssh_keys() + for ssh_key in ssh_keys: + if ssh_key["public_key"].strip() == public_key.strip(): + return ssh_key["id"] + + ssh_key = self.create_ssh_key(name, public_key) + return ssh_key["id"] + + def create_droplet(self, droplet_config: Dict[str, Any]) -> Dict[str, Any]: + response = self._make_request("POST", "/v2/droplets", json=droplet_config) + if response.status_code == 422: + raise NoCapacityError(response.json()["message"]) + response.raise_for_status() + return response.json()["droplet"] + + def get_droplet(self, droplet_id: str) -> Dict[str, Any]: + response = self._make_request("GET", f"/v2/droplets/{droplet_id}") + response.raise_for_status() + return response.json()["droplet"] + + def delete_droplet(self, droplet_id: str) -> None: + response = self._make_request("DELETE", f"/v2/droplets/{droplet_id}") + if response.status_code == 404: + logger.debug("DigitalOcean droplet %s not found", droplet_id) + return + response.raise_for_status() + + def _make_request( + self, method: str, endpoint: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 + ) -> requests.Response: + url = f"{self.base_url}{endpoint}" + headers = { + "Authorization": f"Bearer {self.api_key}", + } + + response = requests.request( + method=method, + url=url, + headers=headers, + json=json, + timeout=timeout, + ) + return response diff --git a/src/dstack/_internal/core/backends/digitalocean_base/backend.py b/src/dstack/_internal/core/backends/digitalocean_base/backend.py new file mode 100644 index 0000000000..42884b3072 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/backend.py @@ -0,0 +1,5 @@ +from dstack._internal.core.backends.base.backend import Backend + + +class 
BaseDigitalOceanBackend(Backend): + pass diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py new file mode 100644 index 0000000000..1128cc5306 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -0,0 +1,173 @@ +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.digitalocean import DigitalOceanProvider + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_user_data, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +MAX_INSTANCE_NAME_LEN = 60 +DOCKER_INSTALL_COMMANDS = [ + "export DEBIAN_FRONTEND=noninteractive", + "mkdir -p /etc/apt/keyrings", + "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg", + 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee 
/etc/apt/sources.list.d/docker.list > /dev/null', + "apt-get update", + "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin", +] + + +class BaseDigitalOceanCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): + def __init__(self, config: BaseDigitalOceanConfig, api_url: str, type: BackendType): + super().__init__() + self.config = config + self.api_client = DigitalOceanAPIClient(config.creds.api_key, api_url) + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.BACKEND_TYPE = type + self.catalog.add_provider( + DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url) + ) + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=self.BACKEND_TYPE, + locations=self.config.regions, + catalog=self.catalog, + ) + return [ + offer.with_availability(availability=InstanceAvailability.AVAILABLE) + for offer in offers + ] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + + project_ssh_key = instance_config.ssh_keys[0] + ssh_key_id = self.api_client.get_or_create_ssh_key( + name=f"dstack-{instance_config.project_name}", + public_key=project_ssh_key.public, + ) + size_slug = instance_offer.instance.name + + if not instance_offer.instance.resources.gpus: + backend_specific_commands = DOCKER_INSTALL_COMMANDS + else: + backend_specific_commands = None + + project_id = None + if self.config.project_name: + project_id = self.api_client.get_project_id(self.config.project_name) + if project_id is None: + raise BackendError(f"Project {self.config.project_name} does not exist") + 
droplet_config = { + "name": instance_name, + "region": instance_offer.region, + "size": size_slug, + "image": self._get_image_for_instance(instance_offer), + "ssh_keys": [ssh_key_id], + "backups": False, + "ipv6": False, + "monitoring": False, + "tags": [], + "user_data": get_user_data( + authorized_keys=instance_config.get_public_keys(), + backend_specific_commands=backend_specific_commands, + ), + **({"project_id": project_id} if project_id is not None else {}), + } + + droplet = self.api_client.create_droplet(droplet_config) + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=str(droplet["id"]), + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="root", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=None, + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + droplet = self.api_client.get_droplet(provisioning_data.instance_id) + if droplet["status"] == "active": + for network in droplet["networks"]["v4"]: + if network["type"] == "public": + provisioning_data.hostname = network["ip_address"] + break + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + self.api_client.delete_droplet(instance_id) + + def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str: + if not instance_offer.instance.resources.gpus: + # No GPUs, use CPU image + return "ubuntu-24-04-x64" + + gpu_count = len(instance_offer.instance.resources.gpus) + gpu_vendor = instance_offer.instance.resources.gpus[0].vendor + + if gpu_vendor == gpuhunt.AcceleratorVendor.AMD: + # AMD GPU + return "digitaloceanai-rocmjupyter" + else: + # NVIDIA GPUs - DO only supports 1 and 8 GPU configurations. + # DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". 
DO does not provide guidance for x8 GPUs so assuming the same applies. + # See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image) + if gpu_count == 8: + return "gpu-h100x8-base" + elif gpu_count == 1: + return "gpu-h100x1-base" + else: + # For Unsupported GPU count - use single GPU image and log warning + logger.warning( + f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image" + ) + return "gpu-h100x1-base" diff --git a/src/dstack/_internal/core/backends/digitalocean_base/configurator.py b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py new file mode 100644 index 0000000000..f44c5d2d0f --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py @@ -0,0 +1,57 @@ +import json +from typing import Optional + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, +) +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import ( + AnyBaseDigitalOceanCreds, + BaseDigitalOceanBackendConfig, + BaseDigitalOceanBackendConfigWithCreds, + BaseDigitalOceanConfig, + BaseDigitalOceanCreds, + BaseDigitalOceanStoredConfig, +) + + +class BaseDigitalOceanConfigurator(Configurator): + def validate_config( + self, config: BaseDigitalOceanBackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_creds(config.creds, config.project_name) + + def create_backend( + self, project_name: str, config: BaseDigitalOceanBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=BaseDigitalOceanStoredConfig( + **BaseDigitalOceanBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=BaseDigitalOceanCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds( + self, record: BackendRecord + ) -> BaseDigitalOceanBackendConfigWithCreds: + config = 
self._get_config(record) + return BaseDigitalOceanBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds( + self, record: BackendRecord + ) -> BaseDigitalOceanBackendConfig: + config = self._get_config(record) + return BaseDigitalOceanBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: + raise NotImplementedError("Subclasses must implement get_backend") + + def _get_config(self, record: BackendRecord) -> BaseDigitalOceanConfig: + return BaseDigitalOceanConfig.__response__( + **json.loads(record.config), + creds=BaseDigitalOceanCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): + pass diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py new file mode 100644 index 0000000000..e3d179fcc3 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -0,0 +1,43 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class BaseDigitalOceanAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyBaseDigitalOceanCreds = BaseDigitalOceanAPIKeyCreds +BaseDigitalOceanCreds = AnyBaseDigitalOceanCreds + + +class BaseDigitalOceanBackendConfig(CoreModel): + type: Annotated[ + Literal["amddevcloud", "digitalocean"], + Field(description="The type of backend"), + ] + project_name: Annotated[Optional[str], Field(description="The name of the project")] = None + regions: Annotated[ + Optional[List[str]], + Field(description="The list of regions. 
Omit to use all regions"), + ] = None + + +class BaseDigitalOceanBackendConfigWithCreds(BaseDigitalOceanBackendConfig): + creds: Annotated[AnyBaseDigitalOceanCreds, Field(description="The credentials")] + + +AnyBaseDigitalOceanBackendConfig = Union[ + BaseDigitalOceanBackendConfig, BaseDigitalOceanBackendConfigWithCreds +] + + +class BaseDigitalOceanStoredConfig(BaseDigitalOceanBackendConfig): + pass + + +class BaseDigitalOceanConfig(BaseDigitalOceanStoredConfig): + creds: AnyBaseDigitalOceanCreds diff --git a/src/dstack/_internal/core/backends/dstack/__init__.py b/src/dstack/_internal/core/backends/dstack/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/dstack/models.py b/src/dstack/_internal/core/backends/dstack/models.py new file mode 100644 index 0000000000..cfc30dc249 --- /dev/null +++ b/src/dstack/_internal/core/backends/dstack/models.py @@ -0,0 +1,26 @@ +from typing import Annotated, List, Literal + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + +# The OSS is currently aware of some of the DstackBackend internals (DstackBackendConfig) to be able to +# show DstackBackend base backends as regular backends. +# Consider designing an API that would allow DstackBackend to do the same without exposing its internals. + + +class DstackBackendConfig(CoreModel): + """ + This is a config model of DstackBackend stored in BackendModel.config and used by DstackConfigurator. 
+ """ + + type: Literal["dstack"] = "dstack" + base_backends: List[str] + + +class DstackBaseBackendConfig(CoreModel): + type: str + + +class DstackConfig(CoreModel): + type: Annotated[Literal["dstack"], Field(description="The type of backend")] = "dstack" diff --git a/src/dstack/_internal/core/backends/features.py b/src/dstack/_internal/core/backends/features.py new file mode 100644 index 0000000000..d5c28728dc --- /dev/null +++ b/src/dstack/_internal/core/backends/features.py @@ -0,0 +1,75 @@ +from dstack._internal.core.backends.base.compute import ( + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithGroupProvisioningSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivateGatewaySupport, + ComputeWithPrivilegedSupport, + ComputeWithReservationSupport, + ComputeWithVolumeSupport, +) +from dstack._internal.core.backends.base.configurator import Configurator +from dstack._internal.core.backends.configurators import list_available_configurator_classes +from dstack._internal.core.models.backends.base import BackendType + +_configurator_classes = list_available_configurator_classes() + + +def _get_backends_with_compute_feature( + configurator_classes: list[type[Configurator]], + compute_feature_class: type, +) -> list[BackendType]: + backend_types_and_computes = [ + (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS) + for configurator_class in configurator_classes + ] + backend_types = [] + for backend_type, compute_class in backend_types_and_computes: + if issubclass(compute_class, compute_feature_class): + backend_types.append(backend_type) + return backend_types + + +# The following backend lists do not include unavailable backends (i.e. backends missing deps). 
+BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithCreateInstanceSupport, +) +BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithGroupProvisioningSupport, +) +BACKENDS_WITH_PRIVILEGED_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithPrivilegedSupport, +) +BACKENDS_WITH_INSTANCE_VOLUMES_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithInstanceVolumesSupport, +) +BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithMultinodeSupport, +) +BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithPlacementGroupSupport, +) +BACKENDS_WITH_RESERVATION_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithReservationSupport, +) +BACKENDS_WITH_GATEWAY_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithGatewaySupport, +) +BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithPrivateGatewaySupport, +) +BACKENDS_WITH_VOLUMES_SUPPORT = _get_backends_with_compute_feature( + configurator_classes=_configurator_classes, + compute_feature_class=ComputeWithVolumeSupport, +) diff --git a/src/dstack/_internal/core/backends/gcp/__init__.py b/src/dstack/_internal/core/backends/gcp/__init__.py index 74efe2aa98..e69de29bb2 100644 --- 
a/src/dstack/_internal/core/backends/gcp/__init__.py +++ b/src/dstack/_internal/core/backends/gcp/__init__.py @@ -1,16 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.gcp.compute import GCPCompute -from dstack._internal.core.backends.gcp.config import GCPConfig -from dstack._internal.core.models.backends.base import BackendType - - -class GCPBackend(Backend): - TYPE: BackendType = BackendType.GCP - - def __init__(self, config: GCPConfig): - self.config = config - self._compute = GCPCompute(self.config) - # self._check_credentials() - - def compute(self) -> GCPCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/gcp/auth.py b/src/dstack/_internal/core/backends/gcp/auth.py index c80471df3b..7ded04612a 100644 --- a/src/dstack/_internal/core/backends/gcp/auth.py +++ b/src/dstack/_internal/core/backends/gcp/auth.py @@ -1,33 +1,35 @@ import json from typing import Optional, Tuple +import google.api_core.exceptions import google.auth +import google.cloud.compute_v1 as compute_v1 from google.auth.credentials import Credentials from google.auth.exceptions import DefaultCredentialsError -from google.cloud import storage from google.oauth2 import service_account -from dstack._internal.core.errors import BackendAuthError -from dstack._internal.core.models.backends.gcp import ( +from dstack._internal.core.backends.gcp.models import ( AnyGCPCreds, - GCPDefaultCreds, GCPServiceAccountCreds, ) -from dstack._internal.core.models.common import is_core_model_instance +from dstack._internal.core.errors import BackendAuthError -def authenticate(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]: - """ - :raises BackendAuthError: - :return: GCP credentials and project_id - """ - credentials, project_id = get_credentials(creds) - validate_credentials(credentials) +def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[Credentials, str]: + credentials, credentials_project_id = 
get_credentials(creds) + if project_id is None: + # If project_id is not specified explicitly, try using credentials' project_id. + # Explicit project_id takes precedence because credentials' project_id may be irrelevant. + # For example, with Workload Identity Federation for GKE, it's cluster project_id. + project_id = credentials_project_id + if project_id is None: + raise BackendAuthError("Credentials require project_id to be specified") + validate_credentials(credentials, project_id) return credentials, project_id def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]: - if is_core_model_instance(creds, GCPServiceAccountCreds): + if isinstance(creds, GCPServiceAccountCreds): try: service_account_info = json.loads(creds.data) credentials = service_account.Credentials.from_service_account_info( @@ -40,22 +42,16 @@ def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]: try: default_credentials, project_id = google.auth.default() except DefaultCredentialsError: - raise BackendAuthError() + raise BackendAuthError("Failed to find default credentials") return default_credentials, project_id -def validate_credentials(credentials: Credentials): - try: - storage_client = storage.Client(credentials=credentials) - storage_client.list_buckets(max_results=1) - except Exception: - raise BackendAuthError() - - -def default_creds_available() -> bool: +def validate_credentials(credentials: Credentials, project_id: str): try: - authenticate(GCPDefaultCreds()) - except BackendAuthError: - return False - return True + client = compute_v1.ProjectsClient(credentials=credentials) + client.get(project=project_id) + except google.api_core.exceptions.NotFound: + raise BackendAuthError(f"project_id {project_id} not found") + except Exception as e: + raise BackendAuthError(f"Insufficient permissions: {e}") diff --git a/src/dstack/_internal/core/backends/gcp/backend.py b/src/dstack/_internal/core/backends/gcp/backend.py new file mode 100644 index 
0000000000..15c9eb7c3f --- /dev/null +++ b/src/dstack/_internal/core/backends/gcp/backend.py @@ -0,0 +1,17 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.gcp.compute import GCPCompute +from dstack._internal.core.backends.gcp.models import GCPConfig +from dstack._internal.core.models.backends.base import BackendType + + +class GCPBackend(Backend): + TYPE = BackendType.GCP + COMPUTE_CLASS = GCPCompute + + def __init__(self, config: GCPConfig): + self.config = config + self._compute = GCPCompute(self.config) + # self._check_credentials() + + def compute(self) -> GCPCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/gcp/compute.py b/src/dstack/_internal/core/backends/gcp/compute.py index 51bd07ae8b..86544b112f 100644 --- a/src/dstack/_internal/core/backends/gcp/compute.py +++ b/src/dstack/_internal/core/backends/gcp/compute.py @@ -1,30 +1,61 @@ import concurrent.futures import json +import re from collections import defaultdict -from typing import Callable, Dict, List, Optional +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Callable, Dict, List, Literal, Optional, Tuple import google.api_core.exceptions import google.cloud.compute_v1 as compute_v1 +import gpuhunt +from cachetools import TTLCache, cachedmethod from google.cloud import tpu_v2 +from google.cloud.compute_v1.types.compute import Instance +from gpuhunt import KNOWN_TPUS import dstack._internal.core.backends.gcp.auth as auth import dstack._internal.core.backends.gcp.resources as gcp_resources +from dstack._internal import settings from dstack._internal.core.backends.base.compute import ( Compute, + ComputeTTLCache, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivateGatewaySupport, + ComputeWithPrivilegedSupport, + 
ComputeWithReservationSupport, + ComputeWithVolumeSupport, + generate_unique_gateway_instance_name, + generate_unique_instance_name, + generate_unique_volume_name, get_gateway_user_data, - get_instance_name, get_shim_commands, get_user_data, + merge_tags, + requires_nvidia_proprietary_kernel_modules, ) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.gcp.config import GCPConfig +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features +from dstack._internal.core.backends.gcp.models import GCPConfig +from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES from dstack._internal.core.errors import ( ComputeError, ComputeResourceNotFoundError, NoCapacityError, + PlacementGroupInUseError, ProvisioningError, ) from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, @@ -34,60 +65,161 @@ InstanceConfiguration, InstanceOffer, InstanceOfferWithAvailability, - InstanceType, Resources, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.core.models.volumes import ( + GCPVolumeConfiguration, + Volume, + VolumeAttachmentData, + VolumeProvisioningData, +) +from dstack._internal.utils.common import get_or_error from dstack._internal.utils.logging import get_logger logger = 
get_logger(__name__) +# pd-balanced disks can be 10GB-64TB, but dstack images are 20GB and cannot grow larger +# than 32TB because of filesystem settings +CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("20GB"), max=Memory.parse("32TB")) +# Pattern from https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_a_specific_reservation +RESERVATION_PATTERN = re.compile( + r"projects/(?P[a-z0-9-]+)/reservations/(?P[a-z0-9-]+)" +) +RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+") +TPU_VERSIONS = [tpu.name for tpu in KNOWN_TPUS] +DEFAULT_GATEWAY_INSTANCE_TYPE = "e2-medium" + + +class GCPOfferBackendData(CoreModel): + is_dws_calendar_mode: bool = False + + +class GCPVolumeDiskBackendData(CoreModel): + type: Literal["disk"] = "disk" + disk_type: str + -class GCPCompute(Compute): +class GCPCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithReservationSupport, + ComputeWithPlacementGroupSupport, + ComputeWithGatewaySupport, + ComputeWithPrivateGatewaySupport, + ComputeWithVolumeSupport, + Compute, +): def __init__(self, config: GCPConfig): + super().__init__() self.config = config - self.credentials, self.project_id = auth.authenticate(config.creds) + self.credentials, _ = auth.authenticate(config.creds, self.config.project_id) self.instances_client = compute_v1.InstancesClient(credentials=self.credentials) self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials) self.regions_client = compute_v1.RegionsClient(credentials=self.credentials) self.subnetworks_client = compute_v1.SubnetworksClient(credentials=self.credentials) self.routers_client = compute_v1.RoutersClient(credentials=self.credentials) self.tpu_client = tpu_v2.TpuClient(credentials=self.credentials) + self.disk_client = compute_v1.DisksClient(credentials=self.credentials) + 
self.resource_policies_client = compute_v1.ResourcePoliciesClient( + credentials=self.credentials + ) + self.reservations_client = compute_v1.ReservationsClient(credentials=self.credentials) + self._usable_subnets_cache = ComputeTTLCache(cache=TTLCache(maxsize=1, ttl=120)) + # Smaller TTL since we check the reservation's in_use_count, which can change often + self._reservation_cache = ComputeTTLCache(cache=TTLCache(maxsize=8, ttl=20)) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + regions = get_or_error(self.config.regions) + zones_by_key: Dict[Tuple, List[str]] = {} + catalog_item_filter = _make_catalog_item_filter(regions, zones_by_key) offers = get_catalog_offers( backend=BackendType.GCP, - requirements=requirements, - extra_filter=_supported_instances_and_zones(self.config.regions), + catalog_item_filter=catalog_item_filter, ) quotas: Dict[str, Dict[str, float]] = defaultdict(dict) for region in self.regions_client.list(project=self.config.project_id): for quota in region.quotas: quotas[region.name][quota.metric] = quota.limit - quota.usage - seen_region_offers = set() offers_with_availability = [] for offer in offers: region = offer.region[:-2] # strip zone - key = (_unique_instance_name(offer.instance), region) - if key in seen_region_offers: - continue - seen_region_offers.add(key) + gpu_name = ( + offer.instance.resources.gpus[0].name if offer.instance.resources.gpus else None + ) + key = _offer_dedup_key( + offer.instance.name, offer.instance.resources.spot, gpu_name, region + ) availability = InstanceAvailability.NO_QUOTA if _has_gpu_quota(quotas[region], offer.instance.resources): availability = InstanceAvailability.UNKNOWN # todo quotas: cpu, memory, global gpu, tpu - offers_with_availability.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) + offer_with_availability = 
offer.with_availability( + availability=availability, + availability_zones=zones_by_key.get(key, []), ) - offers_with_availability[-1].region = region - + offers_with_availability.append(offer_with_availability) + offer_with_availability.region = region return offers_with_availability + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + modifiers = [] + + if requirements.reservation: + zone_to_reservation = self._find_reservation(requirements.reservation) + + def reservation_modifier( + offer: InstanceOfferWithAvailability, + ) -> Optional[InstanceOfferWithAvailability]: + if offer.instance.resources.spot: + return None + assert offer.availability_zones is not None + matching_zones = [] + zones_with_capacity = [] + for zone in offer.availability_zones: + reservation = zone_to_reservation.get(zone) + if reservation is not None and _offer_matches_reservation(offer, reservation): + matching_zones.append(zone) + if _reservation_has_capacity(reservation): + zones_with_capacity.append(zone) + if not matching_zones: + return None + offer = offer.copy(deep=True) + if zones_with_capacity: + offer.availability_zones = zones_with_capacity + else: + offer.availability_zones = matching_zones + offer.availability = InstanceAvailability.NOT_AVAILABLE + return offer + + modifiers.append(reservation_modifier) + + modifiers.append(get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)) + return modifiers + + def get_offers_post_filter( + self, requirements: Requirements + ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]: + if requirements.reservation is None: + + def reserved_offers_filter(offer: InstanceOfferWithAvailability) -> bool: + """Remove reserved-only offers""" + if GCPOfferBackendData.__response__.parse_obj( + offer.backend_data + ).is_dws_calendar_mode: + return False + return True + + return reserved_offers_filter + + return None + def terminate_instance( self, instance_id: str, region: str, backend_data: 
Optional[str] = None ) -> None: @@ -101,14 +233,14 @@ def terminate_instance( is_tpu = backend_data_dict.get("is_tpu", False) try: if is_tpu: - name = f"projects/{self.project_id}/locations/{zone}/nodes/{instance_id}" - delete_request = tpu_v2.DeleteNodeRequest( - name=name, - ) + name = f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}" + delete_request = tpu_v2.DeleteNodeRequest(name=name) self.tpu_client.delete_node(request=delete_request) else: self.instances_client.delete( - project=self.config.project_id, zone=zone, instance=instance_id + project=self.config.project_id, + zone=zone, + instance=instance_id, ) except google.api_core.exceptions.NotFound: pass @@ -117,21 +249,18 @@ def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: - instance_name = instance_config.instance_name + instance_name = generate_unique_instance_name( + instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN + ) allocate_public_ip = self.config.allocate_public_ips - if not gcp_resources.is_valid_resource_name(instance_name): - # In a rare case the instance name is invalid in GCP, - # we better use a random instance name than fail provisioning. - instance_name = gcp_resources.generate_random_resource_name() - logger.warning( - "Invalid GCP instance name: %s. 
A new valid name is generated: %s", - instance_config.instance_name, - instance_name, - ) - authorized_keys = instance_config.get_public_keys() + # get_offers always fills instance_offer.availability_zones + zones = get_or_error(instance_offer.availability_zones) + if len(zones) == 0: + raise NoCapacityError("No eligible availability zones") # If a shared VPC is not used, we can create firewall rules for user if self.config.vpc_project_id is None: gcp_resources.create_runner_firewall_rules( @@ -142,34 +271,58 @@ def create_instance( disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) # Choose any usable subnet in a VPC. # Configuring a specific subnet per region is not supported yet. - subnetwork = _get_vpc_subnet( - subnetworks_client=self.subnetworks_client, - config=self.config, + subnetwork = self._get_vpc_subnet(instance_offer.region) + extra_subnets = self._get_extra_subnets( + region=instance_offer.region, + instance_type_name=instance_offer.instance.name, + ) + roce_subnets = self._get_roce_subnets( region=instance_offer.region, + instance_type_name=instance_offer.instance.name, ) + placement_policy = None + if placement_group is not None: + placement_policy = gcp_resources.get_placement_policy_resource_name( + project_id=self.config.project_id, + region=instance_offer.region, + placement_policy=placement_group.name, + ) labels = { "owner": "dstack", "dstack_project": instance_config.project_name.lower(), + "dstack_name": instance_config.instance_name, "dstack_user": instance_config.user.lower(), } - labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)} - tpu = ( + labels = merge_tags( + base_tags=labels, + backend_tags=self.config.tags, + resource_tags=instance_config.tags, + ) + labels = gcp_resources.filter_invalid_labels(labels) + is_tpu = ( _is_tpu(instance_offer.instance.resources.gpus[0].name) if instance_offer.instance.resources.gpus else False ) - if tpu: - instance_id = 
f"tpu-{instance_config.instance_name}" - startup_script = _get_tpu_startup_script(authorized_keys) - for zone in _get_instance_zones(instance_offer): + if is_tpu: + instance_id = instance_name + startup_script = _get_tpu_startup_script() + # GCP does not allow attaching disks while TPUs is creating, + # so we need to attach the disks on creation. + data_disks = _get_tpu_data_disks(self.config.project_id, instance_config.volumes) + for zone in zones: tpu_node = gcp_resources.create_tpu_node_struct( instance_name=instance_offer.instance.name, startup_script=startup_script, authorized_keys=authorized_keys, spot=instance_offer.instance.resources.spot, labels=labels, + runtime_version=_get_tpu_runtime_version(instance_offer.instance.name), + network=self.config.vpc_resource_name, subnetwork=subnetwork, allocate_public_ip=allocate_public_ip, + service_account=self.config.vm_service_account, + data_disks=data_disks, ) create_node_request = tpu_v2.CreateNodeRequest( parent=f"projects/{self.config.project_id}/locations/{zone}", @@ -204,31 +357,36 @@ def create_instance( username="ubuntu", ssh_proxy=None, dockerized=True, - backend_data=json.dumps({"is_tpu": tpu, "zone": zone}), + backend_data=json.dumps({"is_tpu": is_tpu, "zone": zone}), ) raise NoCapacityError() - if not allocate_public_ip and not gcp_resources.has_vpc_nat_access( - routers_client=self.routers_client, - project_id=self.config.vpc_project_id or self.config.project_id, - vpc_name=self.config.vpc_resource_name, - region=instance_offer.region, - ): - raise ComputeError( - "VPC does not have access to the external internet through Cloud NAT. " - f"Region: {instance_offer.region}, VPC name: {self.config.vpc_resource_name}, " - f"Project ID: {self.config.vpc_project_id or self.config.project_id}." 
- ) + image = _get_image( + instance_type_name=instance_offer.instance.name, + gpu_name=( + instance_offer.instance.resources.gpus[0].name + if len(instance_offer.instance.resources.gpus) > 0 + else None + ), + ) - for zone in _get_instance_zones(instance_offer): + for zone in zones: + reservation = None + if instance_config.reservation: + reservation = self._find_reservation(instance_config.reservation).get(zone) + if reservation is None: + logger.warning( + "Reservation %s no longer exists in zone %s", + instance_config.reservation, + zone, + ) + continue request = compute_v1.InsertInstanceRequest() request.zone = zone request.project = self.config.project_id request.instance_resource = gcp_resources.create_instance_struct( disk_size=disk_size, - image_id=gcp_resources.get_image_id( - len(instance_offer.instance.resources.gpus) > 0, - ), + image_id=image.id, machine_type=instance_offer.instance.name, accelerators=gcp_resources.get_accelerators( project_id=self.config.project_id, @@ -236,15 +394,24 @@ def create_instance( gpus=instance_offer.instance.resources.gpus, ), spot=instance_offer.instance.resources.spot, - user_data=get_user_data(authorized_keys), + user_data=_get_user_data( + authorized_keys=authorized_keys, + instance_type_name=instance_offer.instance.name, + is_ufw_installed=image.is_ufw_installed, + ), authorized_keys=authorized_keys, labels=labels, tags=[gcp_resources.DSTACK_INSTANCE_TAG], instance_name=instance_name, zone=zone, + service_account=self.config.vm_service_account, network=self.config.vpc_resource_name, subnetwork=subnetwork, + extra_subnetworks=extra_subnets, + roce_subnetworks=roce_subnets, allocate_public_ip=allocate_public_ip, + placement_policy=placement_policy, + reservation=reservation, ) try: # GCP needs some time to return an error in case of no capacity (< 30s). @@ -252,6 +419,13 @@ def create_instance( # If the request succeeds, we'll probably timeout and update_provisioning_data() will get hostname. 
operation = self.instances_client.insert(request=request) gcp_resources.wait_for_extended_operation(operation, timeout=30) + except google.api_core.exceptions.BadRequest as e: + if "Network profile only allows resource creation in location" in e.message: + # A hack to find the correct RoCE VPC zone by trial and error. + # Could be better to find it via the API. + logger.debug("Got GCP error when provisioning a VM: %s", e) + continue + raise except ( google.api_core.exceptions.ServiceUnavailable, google.api_core.exceptions.NotFound, @@ -294,7 +468,7 @@ def update_provisioning_data( if is_tpu: node_request = tpu_v2.GetNodeRequest( - name=f"projects/dstack/locations/{zone}/nodes/{provisioning_data.instance_id}", + name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{provisioning_data.instance_id}", ) try: instance = self.tpu_client.get_node(request=node_request) @@ -326,36 +500,62 @@ def update_provisioning_data( if instance.status in ["PROVISIONING", "STAGING"]: return if instance.status == "RUNNING": - if allocate_public_ip: - hostname = instance.network_interfaces[0].access_configs[0].nat_i_p - else: - hostname = instance.network_interfaces[0].network_i_p - provisioning_data.hostname = hostname + provisioning_data.hostname = _get_instance_ip(instance, allocate_public_ip) provisioning_data.internal_ip = instance.network_interfaces[0].network_i_p return raise ProvisioningError( f"Failed to get instance IP address. 
Instance status: {instance.status}" ) - def run_job( + def create_placement_group( self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, + placement_group: PlacementGroup, + master_instance_offer: InstanceOffer, + ) -> PlacementGroupProvisioningData: + policy = compute_v1.ResourcePolicy( + name=placement_group.name, + region=placement_group.configuration.region, + group_placement_policy=compute_v1.ResourcePolicyGroupPlacementPolicy( + availability_domain_count=1, + collocation="COLLOCATED", + ), + ) + self.resource_policies_client.insert( + project=self.config.project_id, + region=placement_group.configuration.region, + resource_policy_resource=policy, ) - return self.create_instance(instance_offer, instance_config) + return PlacementGroupProvisioningData(backend=BackendType.GCP) + + def delete_placement_group( + self, + placement_group: PlacementGroup, + ): + try: + operation = self.resource_policies_client.delete( + project=self.config.project_id, + region=placement_group.configuration.region, + resource_policy=placement_group.name, + ) + operation.result() # Wait for operation to complete + except google.api_core.exceptions.NotFound: + logger.debug("Placement group %s not found", placement_group.name) + except google.api_core.exceptions.BadRequest as e: + if "is already being used by" in e.message: + raise PlacementGroupInUseError() + raise + + def is_suitable_placement_group( + self, + placement_group: PlacementGroup, + instance_offer: InstanceOffer, + ) -> bool: + return placement_group.configuration.region == instance_offer.region + + def 
are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool: + # Cannot use our own placement policies when provisioning in a reservation. + # Instead, we use the placement policy defined in reservation settings. + return False def create_gateway( self, @@ -374,46 +574,63 @@ def create_gateway( else: raise ComputeResourceNotFoundError() + instance_name = generate_unique_gateway_instance_name( + configuration, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN + ) # Choose any usable subnet in a VPC. # Configuring a specific subnet per region is not supported yet. - subnetwork = _get_vpc_subnet( - subnetworks_client=self.subnetworks_client, - config=self.config, - region=configuration.region, + subnetwork = self._get_vpc_subnet(configuration.region) + + labels = { + "owner": "dstack", + "dstack_project": configuration.project_name.lower(), + "dstack_name": configuration.instance_name, + } + labels = merge_tags( + base_tags=labels, + backend_tags=self.config.tags, + resource_tags=configuration.tags, ) + labels = gcp_resources.filter_invalid_labels(labels) request = compute_v1.InsertInstanceRequest() request.zone = zone request.project = self.config.project_id request.instance_resource = gcp_resources.create_instance_struct( disk_size=10, - image_id=gcp_resources.get_gateway_image_id(), - machine_type="e2-small", + image_id=_get_gateway_image_id(), + machine_type=configuration.instance_type or DEFAULT_GATEWAY_INSTANCE_TYPE, accelerators=[], spot=False, - user_data=get_gateway_user_data(configuration.ssh_key_pub), + user_data=get_gateway_user_data( + configuration.ssh_key_pub, router=configuration.router + ), authorized_keys=[configuration.ssh_key_pub], - labels={ - "owner": "dstack", - "dstack_project": configuration.project_name, - }, + labels=labels, tags=[gcp_resources.DSTACK_GATEWAY_TAG], - instance_name=configuration.instance_name, + instance_name=instance_name, zone=zone, - service_account=None, + 
service_account=self.config.vm_service_account, network=self.config.vpc_resource_name, subnetwork=subnetwork, + allocate_public_ip=configuration.public_ip, ) - operation = self.instances_client.insert(request=request) - gcp_resources.wait_for_extended_operation(operation, "instance creation") + try: + operation = self.instances_client.insert(request=request) + gcp_resources.wait_for_extended_operation(operation, "instance creation") + except ( + google.api_core.exceptions.ServiceUnavailable, + google.api_core.exceptions.ClientError, + ) as e: + raise ComputeError(f"GCP error: {e.message}") instance = self.instances_client.get( - project=self.config.project_id, zone=zone, instance=configuration.instance_name + project=self.config.project_id, zone=zone, instance=instance_name ) return GatewayProvisioningData( - instance_id=configuration.instance_name, + instance_id=instance_name, region=configuration.region, # used for instance termination availability_zone=zone, - ip_address=instance.network_interfaces[0].access_configs[0].nat_i_p, + ip_address=_get_instance_ip(instance, configuration.public_ip), backend_data=json.dumps({"zone": zone}), ) @@ -429,48 +646,409 @@ def terminate_gateway( backend_data=backend_data, ) + def register_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, GCPVolumeConfiguration) + logger.debug("Requesting persistent disk %s", volume.configuration.volume_id) + zones = gcp_resources.get_availability_zones( + regions_client=self.regions_client, + project_id=self.config.project_id, + region=volume.configuration.region, + ) + for zone in zones: + try: + disk = self.disk_client.get( + project=self.config.project_id, + zone=zone, + disk=volume.configuration.volume_id, + ) + except google.api_core.exceptions.NotFound: + pass + else: + logger.debug("Found persistent disk %s", volume.configuration.volume_id) + return VolumeProvisioningData( + backend=BackendType.GCP, + volume_id=disk.name, + 
size_gb=disk.size_gb, + availability_zone=zone, + attachable=True, + detachable=True, + backend_data=GCPVolumeDiskBackendData( + disk_type=gcp_resources.full_resource_name_to_name(disk.type_), + ).json(), + ) + raise ComputeError(f"Persistent disk {volume.configuration.volume_id} not found") -def _get_vpc_subnet( - subnetworks_client: compute_v1.SubnetworksClient, - config: GCPConfig, - region: str, -) -> Optional[str]: - if config.vpc_name is None: - return None - return gcp_resources.get_vpc_subnet_or_error( - subnetworks_client=subnetworks_client, - vpc_project_id=config.vpc_project_id or config.project_id, - vpc_name=config.vpc_name, - region=region, + def create_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, GCPVolumeConfiguration) + zones = gcp_resources.get_availability_zones( + regions_client=self.regions_client, + project_id=self.config.project_id, + region=volume.configuration.region, + ) + if volume.configuration.availability_zone is not None: + zones = [z for z in zones if z == volume.configuration.availability_zone] + if len(zones) == 0: + raise ComputeError( + f"Failed to find availability zone in region {volume.configuration.region}" + ) + zone = zones[0] + + disk_name = generate_unique_volume_name( + volume, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN + ) + + labels = { + "owner": "dstack", + "dstack_project": volume.project_name.lower(), + "dstack_name": volume.name, + "dstack_user": volume.user, + } + labels = merge_tags( + base_tags=labels, + backend_tags=self.config.tags, + resource_tags=volume.configuration.tags, + ) + labels = gcp_resources.filter_invalid_labels(labels) + + disk = compute_v1.Disk() + disk.name = disk_name + disk.size_gb = volume.configuration.size_gb + disk.type_ = f"zones/{zone}/diskTypes/pd-balanced" + disk.labels = labels + + logger.debug("Creating persistent disk for volume %s", volume.name) + try: + operation = self.disk_client.insert( + project=self.config.project_id, 
+ zone=zone, + disk_resource=disk, + ) + gcp_resources.wait_for_extended_operation(operation, "persistent disk creation") + except google.api_core.exceptions.Conflict: + raise ComputeError(f"Volume {volume.name} already exists") + created_disk = self.disk_client.get( + project=self.config.project_id, + zone=zone, + disk=disk_name, + ) + logger.debug("Created persistent disk for volume %s", volume.name) + return VolumeProvisioningData( + backend=BackendType.GCP, + volume_id=created_disk.name, + size_gb=created_disk.size_gb, + availability_zone=zone, + price=_get_volume_price(created_disk.size_gb), + attachable=True, + detachable=True, + backend_data=GCPVolumeDiskBackendData( + disk_type=gcp_resources.full_resource_name_to_name(disk.type_), + ).json(), + ) + + def delete_volume(self, volume: Volume): + logger.debug("Deleting persistent disk for volume %s", volume.name) + try: + operation = self.disk_client.delete( + project=self.config.project_id, + zone=get_or_error(volume.provisioning_data).availability_zone, + disk=volume.volume_id, + ) + gcp_resources.wait_for_extended_operation(operation, "persistent disk deletion") + except google.api_core.exceptions.NotFound: + logger.debug("Failed to find persistent disk for volume %s", volume.name) + return + logger.debug("Deleted persistent disk for volume %s", volume.name) + + def attach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData + ) -> VolumeAttachmentData: + instance_id = provisioning_data.instance_id + logger.debug( + "Attaching persistent disk for volume %s to instance %s", + volume.volume_id, + instance_id, + ) + if not gcp_resources.instance_type_supports_persistent_disk( + provisioning_data.instance_type.name + ): + raise ComputeError( + f"Instance type {provisioning_data.instance_type.name} does not support Persistent disk volumes" + ) + + zone = get_or_error(volume.provisioning_data).availability_zone + is_tpu = _is_tpu_provisioning_data(provisioning_data) + try: + disk =
self.disk_client.get( + project=self.config.project_id, + zone=zone, + disk=volume.volume_id, + ) + disk_url = disk.self_link + except google.api_core.exceptions.NotFound: + raise ComputeError("Persistent disk not found") + + try: + if is_tpu: + get_node_request = tpu_v2.GetNodeRequest( + name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}", + ) + tpu_node = self.tpu_client.get_node(get_node_request) + + # Python API to attach a disk to a TPU is not documented, + # so we follow the code from the gcloud CLI: + # https://github.com/twistedpair/google-cloud-sdk/blob/26ab5a281d56b384cc25750f3279a27afe5b499f/google-cloud-sdk/lib/googlecloudsdk/command_lib/compute/tpus/tpu_vm/util.py#L113 + source_disk = ( + f"projects/{self.config.project_id}/zones/{zone}/disks/{volume.volume_id}" + ) + # create_instance() has already attached the disks + # if the TPU is provisioned on the run submission via run_job() + for i, disk in enumerate(tpu_node.data_disks, start=1): + if disk.source_disk == source_disk: + device_name = f"persistent-disk-{i}" + logger.debug( + "Persistent disk for volume %s is already attached to instance %s", + volume.volume_id, + instance_id, + ) + return VolumeAttachmentData(device_name=device_name) + attached_disk = tpu_v2.AttachedDisk( + source_disk=source_disk, + mode=tpu_v2.AttachedDisk.DiskMode.READ_WRITE, + ) + tpu_node.data_disks.append(attached_disk) + # Cannot set device name for TPUs, so use default naming + device_name = f"persistent-disk-{len(tpu_node.data_disks)}" + update_node_request = tpu_v2.UpdateNodeRequest( + node=tpu_node, + update_mask="dataDisks", + ) + operation = self.tpu_client.update_node(update_node_request) + gcp_resources.wait_for_operation(operation, "persistent disk attachment") + else: + attached_disk = compute_v1.AttachedDisk() + attached_disk.source = disk_url + attached_disk.auto_delete = False + attached_disk.device_name = f"pd-{volume.volume_id}" + device_name =
attached_disk.device_name + operation = self.instances_client.attach_disk( + project=self.config.project_id, + zone=zone, + instance=instance_id, + attached_disk_resource=attached_disk, + ) + gcp_resources.wait_for_extended_operation(operation, "persistent disk attachment") + except google.api_core.exceptions.NotFound: + raise ComputeError("Disk or instance not found") + logger.debug( + "Attached persistent disk for volume %s to instance %s", volume.volume_id, instance_id + ) + return VolumeAttachmentData(device_name=device_name) + + def detach_volume( + self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False + ): + instance_id = provisioning_data.instance_id + logger.debug( + "Detaching persistent disk for volume %s from instance %s", + volume.volume_id, + instance_id, + ) + zone = get_or_error(volume.provisioning_data).availability_zone + attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id)) + is_tpu = _is_tpu_provisioning_data(provisioning_data) + if is_tpu: + try: + get_node_request = tpu_v2.GetNodeRequest( + name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}", + ) + tpu_node = self.tpu_client.get_node(get_node_request) + except google.api_core.exceptions.NotFound: + raise ComputeError("Instance not found") + + source_disk = ( + f"projects/{self.config.project_id}/zones/{zone}/disks/{volume.volume_id}" + ) + tpu_node.data_disks = [ + disk for disk in tpu_node.data_disks if disk.source_disk != source_disk + ] + update_node_request = tpu_v2.UpdateNodeRequest( + node=tpu_node, + update_mask="dataDisks", + ) + operation = self.tpu_client.update_node(update_node_request) + gcp_resources.wait_for_operation(operation, "persistent disk detachment") + else: + operation = self.instances_client.detach_disk( + project=self.config.project_id, + zone=get_or_error(volume.provisioning_data).availability_zone, + instance=instance_id, + device_name=attachment_data.device_name, + ) + 
gcp_resources.wait_for_extended_operation(operation, "persistent disk detachment") + logger.debug( + "Detached persistent disk for volume %s from instance %s", + volume.volume_id, + instance_id, + ) + + def _get_extra_subnets( + self, + region: str, + instance_type_name: str, + ) -> List[Tuple[str, str]]: + if self.config.extra_vpcs is None: + return [] + if instance_type_name == "a3-megagpu-8g": + subnets_num = 8 + elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]: + subnets_num = 4 + elif instance_type_name == "a4-highgpu-8g": + subnets_num = 1 # 1 main + 1 extra + 8 RoCE + else: + return [] + extra_subnets = [] + for vpc_name in self.config.extra_vpcs[:subnets_num]: + subnet = gcp_resources.get_vpc_subnet_or_error( + vpc_name=vpc_name, + region=region, + usable_subnets=self._list_usable_subnets(), + ) + vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name( + project_id=self.config.vpc_project_id or self.config.project_id, + vpc_name=vpc_name, + ) + extra_subnets.append((vpc_resource_name, subnet)) + return extra_subnets + + def _get_roce_subnets( + self, + region: str, + instance_type_name: str, + ) -> List[Tuple[str, str]]: + if not self.config.roce_vpcs: + return [] + if instance_type_name == "a4-highgpu-8g": + nics_num = 8 + else: + return [] + roce_vpc = self.config.roce_vpcs[0] # roce_vpcs is validated to have at most 1 item + subnets = gcp_resources.get_vpc_subnets( + vpc_name=roce_vpc, + region=region, + usable_subnets=self._list_usable_subnets(), + ) + if len(subnets) < nics_num: + raise ComputeError( + f"{instance_type_name} requires {nics_num} RoCE subnets," + f" but only {len(subnets)} are available in VPC {roce_vpc}" + ) + vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name( + project_id=self.config.vpc_project_id or self.config.project_id, + vpc_name=roce_vpc, + ) + nic_subnets = [] + for subnet in subnets[:nics_num]: + nic_subnets.append((vpc_resource_name, subnet)) + return nic_subnets + + @cachedmethod( + 
cache=lambda self: self._usable_subnets_cache.cache, + lock=lambda self: self._usable_subnets_cache.lock, ) + def _list_usable_subnets(self) -> list[compute_v1.UsableSubnetwork]: + # To avoid hitting the `ListUsable requests per minute` system limit, we fetch all subnets + # at once and cache them + return gcp_resources.list_project_usable_subnets( + subnetworks_client=self.subnetworks_client, + project_id=self.config.vpc_project_id or self.config.project_id, + ) + + def _get_vpc_subnet(self, region: str) -> Optional[str]: + if self.config.vpc_name is None: + return None + return gcp_resources.get_vpc_subnet_or_error( + vpc_name=self.config.vpc_name, + region=region, + usable_subnets=self._list_usable_subnets(), + ) + + @cachedmethod( + cache=lambda self: self._reservation_cache.cache, + lock=lambda self: self._reservation_cache.lock, + ) + def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]: + if match := RESERVATION_PATTERN.fullmatch(configured_name): + project_id = match.group("project_id") + name = match.group("reservation_name") + elif RESOURCE_NAME_PATTERN.fullmatch(configured_name): + project_id = self.config.project_id + name = configured_name + else: + # misconfigured or non-GCP + return {} + return gcp_resources.find_reservation( + reservations_client=self.reservations_client, + project_id=project_id, + name=name, + ) + + +def _is_supported_gcp_instance(instance_name: str, gpu_name: Optional[str]) -> bool: + """Check if the instance is supported by dstack.""" + if _is_tpu(instance_name) and not _is_single_host_tpu(instance_name): + return False + for family in [ + "m4-", + "c4-", + "n4-", + "h3-", + "n2-", + "e2-medium", + "e2-standard-", + "e2-highmem-", + "e2-highcpu-", + "m1-", + "a2-", + "a3-", + "g2-", + ]: + if instance_name.startswith(family): + return True + if gpu_name is not None and gpu_name not in {"K80", "P4"}: + return True + return False + + +def _offer_dedup_key( + instance_name: str, spot: bool, 
gpu_name: Optional[str], region: str +) -> Tuple[str, bool, Optional[str], str]: + """Key for deduplicating GCP per-zone items into per-region offers.""" + return (instance_name, spot, gpu_name, region) -def _supported_instances_and_zones( +def _make_catalog_item_filter( regions: List[str], -) -> Optional[Callable[[InstanceOffer], bool]]: - def _filter(offer: InstanceOffer) -> bool: - # strip zone - if offer.region[:-2] not in regions: + zones_by_key: Dict[Tuple, List[str]], +) -> Callable[[gpuhunt.CatalogItem], bool]: + """ + Returns a filter that checks region, instance support, and deduplicates + per-zone items into per-region offers. Zones are collected in `zones_by_key` + so the caller can attach them to offers later. + """ + seen: set = set() + + def _filter(item: gpuhunt.CatalogItem) -> bool: + region = item.location[:-2] + if region not in regions: return False - # remove TPU Pod for initial release - if _is_tpu(f"tpu-{offer.instance.name}") and _is_pod(offer.instance.name): + if not _is_supported_gcp_instance(item.instance_name, item.gpu_name): return False - for family in [ - "e2-medium", - "e2-standard-", - "e2-highmem-", - "e2-highcpu-", - "m1-", - "a2-", - "a3-", - "g2-", - ]: - if offer.instance.name.startswith(family): - return True - if offer.instance.resources.gpus: - if offer.instance.resources.gpus[0].name not in {"K80", "P4"}: - return True - return False + key = _offer_dedup_key(item.instance_name, item.spot, item.gpu_name, region) + zones_by_key.setdefault(key, []).append(item.location) + if key in seen: + return False + seen.add(key) + return True return _filter @@ -481,8 +1059,8 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool: gpu = resources.gpus[0] if _is_tpu(gpu.name): return True - if gpu.name == "H100": - # H100 and H100_MEGA quotas are not returned by `regions_client.list` + if gpu.name in ["B200", "H100", "RTXPRO6000"]: + # B200, H100, H100_MEGA, and RTXPRO6000 quotas are not returned by 
`regions_client.list` return True quota_name = f"NVIDIA_{gpu.name}_GPUS" if gpu.name == "A100" and gpu.memory_mib == 80 * 1024: @@ -492,60 +1070,213 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool: return len(resources.gpus) <= quotas.get(quota_name, 0) -def _unique_instance_name(instance: InstanceType) -> str: - if instance.resources.spot: - name = f"{instance.name}-spot" +def _offer_matches_reservation( + offer: InstanceOfferWithAvailability, reservation: compute_v1.Reservation +) -> bool: + if ( + reservation.specific_reservation is None + or reservation.specific_reservation.instance_properties is None + ): + return False + properties = reservation.specific_reservation.instance_properties + if properties.machine_type != offer.instance.name: + return False + accelerators = properties.guest_accelerators or [] + if not accelerators and offer.instance.resources.gpus: + return False + if len(accelerators) > 1: + logger.warning( + "Expected 0 or 1 accelerator types per instance," + f" but {properties.machine_type} has {len(accelerators)}." 
+ f" Ignoring reservation {reservation.self_link}" + ) + return False + if accelerators: + if accelerators[0].accelerator_count != len(offer.instance.resources.gpus): + return False + if ( + offer.instance.resources.gpus + and gcp_resources.find_accelerator_name( + offer.instance.resources.gpus[0].name, + offer.instance.resources.gpus[0].memory_mib, + ) + != accelerators[0].accelerator_type + ): + return False + return True + + +def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool: + return ( + reservation.specific_reservation is not None + and reservation.specific_reservation.in_use_count is not None + and reservation.specific_reservation.assured_count is not None + and reservation.specific_reservation.in_use_count + < reservation.specific_reservation.assured_count + ) + + +@dataclass +class GCPImage: + id: str + is_ufw_installed: bool + + +def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage: + if instance_type_name == "a3-megagpu-8g": + image_name = "dstack-a3mega-5" + is_ufw_installed = False + elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]: + return GCPImage( + id="projects/cos-cloud/global/images/cos-105-17412-535-78", + is_ufw_installed=False, + ) + elif gpu_name is not None: + if not requires_nvidia_proprietary_kernel_modules(gpu_name): + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}" + f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) + else: + image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" + is_ufw_installed = True else: - name = instance.name - if not instance.resources.gpus: - return name - gpu = instance.resources.gpus[0] - return f"{name}-{gpu.name}-{gpu.memory_mib}" - - -def _get_instance_zones(instance_offer: InstanceOffer) -> List[str]: - zones = [] - for offer in get_catalog_offers(backend=BackendType.GCP): - if _unique_instance_name(instance_offer.instance) != _unique_instance_name(offer.instance): - continue - if 
offer.region[:-2] != instance_offer.region: - continue - zones.append(offer.region) - return zones - - -def _get_tpu_startup_script(authorized_keys: List[str]) -> str: - commands = get_shim_commands( - authorized_keys=authorized_keys, is_privileged=True, pjrt_device="TPU" + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) + is_ufw_installed = True + image_name = image_name.replace(".", "-") + return GCPImage( + id=f"projects/dstack/global/images/{image_name}", + is_ufw_installed=is_ufw_installed, + ) + + +def _get_gateway_image_id() -> str: + return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714" + + +def _get_user_data( + authorized_keys: List[str], instance_type_name: str, is_ufw_installed: bool +) -> str: + base_path = None + bin_path = None + backend_shim_env = None + if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]: + # In the COS image the / file system is not writable. + # /home and /var are writable but not executable. + # Only /etc is both writable and executable, so use it for shim/runner binaries. + # See: https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem + base_path = bin_path = "/etc" + backend_shim_env = { + # In COS nvidia binaries are not installed on PATH by default. + # Set so that shim can run nvidia-smi. + "PATH": "/var/lib/nvidia/bin:$PATH", + } + return get_user_data( + authorized_keys=authorized_keys, + backend_specific_commands=_get_backend_specific_commands( + instance_type_name=instance_type_name, + ), + base_path=base_path, + bin_path=bin_path, + backend_shim_env=backend_shim_env, + # Instance-level firewall is optional on GCP. The main protection comes from GCP firewalls. + # So only set up instance-level firewall as an additional measure if ufw is available. 
+ skip_firewall_setup=not is_ufw_installed, ) + + +def _get_backend_specific_commands(instance_type_name: str) -> List[str]: + if instance_type_name == "a3-megagpu-8g": + return tcpx_features.get_backend_specific_commands_tcpxo() + if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]: + return tcpx_features.get_backend_specific_commands_tcpx() + return [] + + +def _get_volume_price(size: int) -> float: + # https://cloud.google.com/compute/disks-image-pricing#persistentdisk + # The price is different in different regions. Take max across supported regions. + return size * 0.12 + + +def _get_tpu_startup_script() -> str: + commands = get_shim_commands(is_privileged=True, pjrt_device="TPU") startup_script = " ".join([" && ".join(commands)]) startup_script = "#! /bin/bash\n" + startup_script return startup_script -def _is_tpu(name: str) -> bool: - tpu_versions = ["tpu-v2", "tpu-v3", "tpu-v4", "tpu-v5p", "tpu-v5litepod"] - parts = name.split("-") - if len(parts) == 3: - version = f"{parts[0]}-{parts[1]}" - cores = parts[2] - if version in tpu_versions and cores.isdigit(): +def _is_tpu(instance_name: str) -> bool: + parts = instance_name.split("-") + if len(parts) == 2: + version, cores = parts + if version in TPU_VERSIONS and cores.isdigit(): return True return False -def _is_pod(instance_name: str) -> bool: +def _get_tpu_runtime_version(instance_name: str) -> str: + tpu_version = _get_tpu_version(instance_name) + if tpu_version == "v6e": + return "v2-alpha-tpuv6e" + elif tpu_version == "v5litepod": + return "v2-alpha-tpuv5-lite" + return "tpu-ubuntu2204-base" + + +def _get_tpu_version(instance_name: str) -> str: + return instance_name.split("-")[0] + + +def _is_single_host_tpu(instance_name: str) -> bool: parts = instance_name.split("-") if len(parts) != 2: - raise ValueError(f"Invalid tpu type: {instance_name}") - version, tensor_cores = parts + logger.info("Skipping unknown TPU: %s", instance_name) + return False + tpu_version, 
tensor_cores = parts try: tensor_cores = int(tensor_cores) except ValueError: - raise ValueError(f"Invalid number in tpu tensor cores: {tensor_cores}") - if version in ["v2", "v3", "v5p", "v5litepod"]: - return tensor_cores > 8 - elif version == "v4": - return True + logger.info("Skipping TPU due to invalid number of tensor cores: %s", tensor_cores) + return False + if tpu_version in ["v2", "v3", "v5p", "v5litepod", "v6e"]: + return tensor_cores <= 8 + elif tpu_version == "v4": + return False else: - raise ValueError(f"Unknown TPU version: {version}") + logger.info("Skipping unknown TPU: %s", instance_name) + return False + + +def _get_tpu_data_disks( + project_id: str, volumes: Optional[List[Volume]] +) -> List[tpu_v2.AttachedDisk]: + if volumes is None: + return [] + return [_get_tpu_data_disk_for_volume(project_id, volume) for volume in volumes] + + +def _get_tpu_data_disk_for_volume(project_id: str, volume: Volume) -> tpu_v2.AttachedDisk: + zone = get_or_error(volume.provisioning_data).availability_zone + source_disk = f"projects/{project_id}/zones/{zone}/disks/{volume.volume_id}" + attached_disk = tpu_v2.AttachedDisk( + source_disk=source_disk, + mode=tpu_v2.AttachedDisk.DiskMode.READ_WRITE, + ) + return attached_disk + + +def _is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool: + is_tpu = False + if provisioning_data.backend_data: + backend_data_dict = json.loads(provisioning_data.backend_data) + is_tpu = backend_data_dict.get("is_tpu", False) + return is_tpu + + +def _get_instance_ip(instance: Instance, public_ip: bool) -> str: + if public_ip: + return instance.network_interfaces[0].access_configs[0].nat_i_p + return instance.network_interfaces[0].network_i_p diff --git a/src/dstack/_internal/core/backends/gcp/config.py b/src/dstack/_internal/core/backends/gcp/config.py deleted file mode 100644 index a93e31b5b9..0000000000 --- a/src/dstack/_internal/core/backends/gcp/config.py +++ /dev/null @@ -1,22 +0,0 @@ -from 
dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.gcp import AnyGCPCreds, GCPStoredConfig - - -class GCPConfig(GCPStoredConfig, BackendConfig): - creds: AnyGCPCreds - - @property - def allocate_public_ips(self) -> bool: - if self.public_ips is not None: - return self.public_ips - return True - - @property - def vpc_resource_name(self) -> str: - vpc_name = self.vpc_name - if vpc_name is None: - vpc_name = "default" - project_id = self.project_id - if self.vpc_project_id is not None: - project_id = self.vpc_project_id - return f"projects/{project_id}/global/networks/{vpc_name}" diff --git a/src/dstack/_internal/core/backends/gcp/configurator.py b/src/dstack/_internal/core/backends/gcp/configurator.py new file mode 100644 index 0000000000..c40aa2b7d3 --- /dev/null +++ b/src/dstack/_internal/core/backends/gcp/configurator.py @@ -0,0 +1,206 @@ +import json + +import google.cloud.compute_v1 as compute_v1 + +from dstack._internal.core.backends.base.configurator import ( + TAGS_MAX_NUM, + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.gcp import auth, resources +from dstack._internal.core.backends.gcp.backend import GCPBackend +from dstack._internal.core.backends.gcp.models import ( + GCPBackendConfig, + GCPBackendConfigWithCreds, + GCPConfig, + GCPCreds, + GCPDefaultCreds, + GCPServiceAccountCreds, + GCPStoredConfig, +) +from dstack._internal.core.errors import BackendAuthError, BackendError, ServerClientError +from dstack._internal.core.models.backends.base import ( + BackendType, +) + +LOCATIONS = [ + { + "name": "North America", + "regions": [ + "northamerica-northeast1", + "northamerica-northeast2", + "us-central1", + "us-east1", + "us-east4", + "us-east5", + "us-south1", + "us-west1", + "us-west2", + "us-west3", + "us-west4", + ], + "default_region": "us-west1", + "default_zone": "us-west1-b", + }, + { + "name": "South America", + "regions": [ + 
"southamerica-east1", + "southamerica-west1", + ], + "default_region": "southamerica-east1", + "default_zone": "southamerica-east1-b", + }, + { + "name": "Europe", + "regions": [ + "europe-central2", + "europe-north1", + "europe-southwest1", + "europe-west1", + "europe-west2", + "europe-west3", + "europe-west4", + "europe-west6", + "europe-west8", + "europe-west9", + ], + "default_region": "europe-west4", + "default_zone": "europe-west4-a", + }, + { + "name": "Asia", + "regions": [ + "asia-east1", + "asia-east2", + "asia-northeast1", + "asia-northeast2", + "asia-northeast3", + "asia-south1", + "asia-south2", + "asia-southeast1", + "asia-southeast2", + ], + "default_region": "asia-southeast1", + "default_zone": "asia-southeast1-b", + }, + { + "name": "Middle East", + "regions": [ + "me-west1", + ], + "default_region": "me-west1", + "default_zone": "me-west1-b", + }, + { + "name": "Australia", + "regions": [ + "australia-southeast1", + "australia-southeast2", + ], + "default_region": "australia-southeast1", + "default_zone": "australia-southeast1-c", + }, +] +REGIONS = [r for loc in LOCATIONS for r in loc["regions"]] +DEFAULT_REGIONS = REGIONS +MAIN_REGION = "us-east1" + + +class GCPConfigurator( + Configurator[ + GCPBackendConfig, + GCPBackendConfigWithCreds, + ] +): + TYPE = BackendType.GCP + BACKEND_CLASS = GCPBackend + + def validate_config(self, config: GCPBackendConfigWithCreds, default_creds_enabled: bool): + if isinstance(config.creds, GCPDefaultCreds) and not default_creds_enabled: + raise_invalid_credentials_error(fields=[["creds"]]) + try: + credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id) + except BackendAuthError as e: + details = None + if len(e.args) > 0: + details = e.args[0] + if isinstance(config.creds, GCPServiceAccountCreds): + raise_invalid_credentials_error(fields=[["creds", "data"]], details=details) + else: + raise_invalid_credentials_error(fields=[["creds"]], details=details) + subnetworks_client = 
compute_v1.SubnetworksClient(credentials=credentials) + routers_client = compute_v1.RoutersClient(credentials=credentials) + self._check_config_tags(config) + self._check_config_vpc( + subnetworks_client=subnetworks_client, + routers_client=routers_client, + config=config, + ) + + def create_backend( + self, project_name: str, config: GCPBackendConfigWithCreds + ) -> BackendRecord: + if config.regions is None: + config.regions = DEFAULT_REGIONS + return BackendRecord( + config=GCPStoredConfig( + **GCPBackendConfig.__response__.parse_obj(config).dict(), + ).json(), + auth=GCPCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> GCPBackendConfigWithCreds: + config = self._get_config(record) + return GCPBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> GCPBackendConfig: + config = self._get_config(record) + return GCPBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> GCPBackend: + config = self._get_config(record) + return GCPBackend(config=config) + + def _get_config(self, record: BackendRecord) -> GCPConfig: + return GCPConfig.__response__( + **json.loads(record.config), + creds=GCPCreds.parse_raw(record.auth).__root__, + ) + + def _check_config_tags(self, config: GCPBackendConfigWithCreds): + if not config.tags: + return + if len(config.tags) > TAGS_MAX_NUM: + raise ServerClientError( + f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed." 
+ ) + try: + resources.validate_labels(config.tags) + except BackendError as e: + raise ServerClientError(e.args[0]) + + def _check_config_vpc( + self, + config: GCPBackendConfigWithCreds, + subnetworks_client: compute_v1.SubnetworksClient, + routers_client: compute_v1.RoutersClient, + ): + allocate_public_ip = config.public_ips if config.public_ips is not None else True + nat_check = config.nat_check if config.nat_check is not None else True + try: + resources.check_vpc( + subnetworks_client=subnetworks_client, + routers_client=routers_client, + project_id=config.project_id, + regions=config.regions or DEFAULT_REGIONS, + vpc_name=config.vpc_name, + shared_vpc_project_id=config.vpc_project_id, + allocate_public_ip=allocate_public_ip, + nat_check=nat_check, + ) + except BackendError as e: + raise ServerClientError(e.args[0]) + # Not checking config.extra_vpcs and config.roce_vpcs so that users are not required to configure subnets for all regions + # but only for regions they intend to use. Validation will be done on provisioning. diff --git a/src/dstack/_internal/core/backends/gcp/features/__init__.py b/src/dstack/_internal/core/backends/gcp/features/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/gcp/features/tcpx.py b/src/dstack/_internal/core/backends/gcp/features/tcpx.py new file mode 100644 index 0000000000..2d1c34013d --- /dev/null +++ b/src/dstack/_internal/core/backends/gcp/features/tcpx.py @@ -0,0 +1,65 @@ +from typing import List + + +def get_backend_specific_commands_tcpxo() -> List[str]: + return [ + "modprobe import-helper", + "gcloud -q auth configure-docker us-docker.pkg.dev", + # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/. 
+ ( + "docker run --rm " + "--name nccl-installer " + "--pull=never " + "--network=host " + "--volume /var/lib:/var/lib " + "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 " + "install --install-nccl" + ), + # Start FasTrak receive-datapath-manager + ( + "docker run " + "--name receive-datapath-manager " + "--detach " + "--pull=never " + "--cap-add=NET_ADMIN " + "--network=host " + "--privileged " + "--gpus all " + "--volume /usr/lib32:/usr/local/nvidia/lib64 " + "--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper " + "--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu " + "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14 " + "--num_hops=2 --num_nics=8 --uid= --alsologtostderr" + ), + ] + + +def get_backend_specific_commands_tcpx() -> List[str]: + return [ + "cos-extensions install gpu -- --version=latest", + "sudo mount --bind /var/lib/nvidia /var/lib/nvidia", + "sudo mount -o remount,exec /var/lib/nvidia", + ( + "docker run " + "--detach " + "--pull=always " + "--name receive-datapath-manager " + "--privileged " + "--cap-add=NET_ADMIN --network=host " + "--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 " + "--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 " + "--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 " + "--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 " + "--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 " + "--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl " + "--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 " + "--volume /run/tcpx:/run/tcpx " + "--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd " + "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd " + '--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0"' + ), + "sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT", + "docker run --rm -v /var/lib:/var/lib 
us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl", + "sudo mount --bind /var/lib/tcpx /var/lib/tcpx", + "sudo mount -o remount,exec /var/lib/tcpx", + ] diff --git a/src/dstack/_internal/core/backends/gcp/models.py b/src/dstack/_internal/core/backends/gcp/models.py new file mode 100644 index 0000000000..4d06144ee8 --- /dev/null +++ b/src/dstack/_internal/core/backends/gcp/models.py @@ -0,0 +1,160 @@ +from typing import Annotated, Dict, List, Literal, Optional, Union + +from pydantic import Field, root_validator + +from dstack._internal.core.backends.base.models import fill_data +from dstack._internal.core.models.common import CoreModel + + +class GCPServiceAccountCreds(CoreModel): + type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( + "service_account" + ) + filename: Annotated[ + Optional[str], Field(description="The path to the service account file") + ] = "" + data: Annotated[str, Field(description="The contents of the service account file")] + + +class GCPDefaultCreds(CoreModel): + type: Literal["default"] = "default" + + +AnyGCPCreds = Union[GCPServiceAccountCreds, GCPDefaultCreds] + + +class GCPCreds(CoreModel): + __root__: AnyGCPCreds = Field(..., discriminator="type") + + +class GCPBackendConfig(CoreModel): + type: Annotated[Literal["gcp"], Field(description="The type of backend")] = "gcp" + project_id: Annotated[str, Field(description="The project ID")] + regions: Annotated[ + Optional[List[str]], Field(description="The list of GCP regions. Omit to use all regions") + ] = None + vpc_name: Annotated[ + Optional[str], + Field(description="The name of a custom VPC. If not specified, the default VPC is used"), + ] = None + extra_vpcs: Annotated[ + Optional[List[str]], + Field( + description=( + "The names of additional VPCs used for multi-NIC instances, such as those that support GPUDirect." + " Specify eight VPCs to maximize bandwidth in clusters with eight-GPU instances." 
+ " Each VPC must have a subnet and a firewall rule allowing internal traffic across all subnets" + ) + ), + ] = None + roce_vpcs: Annotated[ + Optional[List[str]], + Field( + description=( + "The names of additional VPCs with the RoCE network profile." + " Used for RDMA on GPU instances that support the MRDMA interface type." + " A VPC should have eight subnets to maximize the bandwidth in clusters" + " with eight-GPU instances." + ), + max_items=1, # The currently supported instance types only need one VPC with eight subnets. + ), + ] = None + vpc_project_id: Annotated[ + Optional[str], + Field(description="The shared VPC hosted project ID. Required for shared VPC only"), + ] = None + public_ips: Annotated[ + Optional[bool], + Field( + description="A flag to enable/disable public IP assigning on instances. Defaults to `true`" + ), + ] = None + nat_check: Annotated[ + Optional[bool], + Field( + description=( + "A flag to enable/disable a check that Cloud NAT is configured for the VPC." + " This should be set to `false` when `public_ips: false` and outbound internet connectivity" + " is provided by a mechanism other than Cloud NAT such as a third-party NAT appliance." + " Defaults to `true`" + ) + ), + ] = None + vm_service_account: Annotated[ + Optional[str], Field(description="The service account to associate with provisioned VMs") + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field( + description="The tags (labels) that will be assigned to resources created by `dstack`" + ), + ] = None + preview_features: Annotated[ + Optional[List[Literal["g4"]]], + Field( + description=( + "The list of preview GCP features to enable." 
+ " There are currently no preview features" + ), + max_items=1, + ), + ] = None + + +class GCPBackendConfigWithCreds(GCPBackendConfig): + creds: AnyGCPCreds = Field(..., description="The credentials", discriminator="type") + + +class GCPServiceAccountFileCreds(CoreModel): + type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( + "service_account" + ) + filename: Annotated[str, Field(description="The path to the service account file")] + data: Annotated[ + Optional[str], + Field( + description=( + "The contents of the service account file." + " When configuring via `server/config.yml`, it's automatically filled from `filename`." + " When configuring via UI, it has to be specified explicitly" + ) + ), + ] = None + + @root_validator + def fill_data(cls, values): + return fill_data(values) + + +AnyGCPFileCreds = Union[GCPServiceAccountFileCreds, GCPDefaultCreds] + + +class GCPBackendFileConfigWithCreds(GCPBackendConfig): + creds: AnyGCPFileCreds = Field(..., description="The credentials", discriminator="type") + + +AnyGCPBackendConfig = Union[GCPBackendConfig, GCPBackendConfigWithCreds] + + +class GCPStoredConfig(GCPBackendConfig): + pass + + +class GCPConfig(GCPStoredConfig): + creds: AnyGCPCreds + + @property + def allocate_public_ips(self) -> bool: + if self.public_ips is not None: + return self.public_ips + return True + + @property + def vpc_resource_name(self) -> str: + vpc_name = self.vpc_name + if vpc_name is None: + vpc_name = "default" + project_id = self.project_id + if self.vpc_project_id is not None: + project_id = self.vpc_project_id + return f"projects/{project_id}/global/networks/{vpc_name}" diff --git a/src/dstack/_internal/core/backends/gcp/resources.py b/src/dstack/_internal/core/backends/gcp/resources.py index 7cf79f5fbc..ffbbda3158 100644 --- a/src/dstack/_internal/core/backends/gcp/resources.py +++ b/src/dstack/_internal/core/backends/gcp/resources.py @@ -1,8 +1,6 @@ import concurrent.futures -import random 
import re -import string -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import google.api_core.exceptions import google.cloud.compute_v1 as compute_v1 @@ -10,8 +8,7 @@ from google.api_core.operation import Operation from google.cloud import tpu_v2 -import dstack.version as version -from dstack._internal.core.errors import ComputeError +from dstack._internal.core.errors import BackendError, ComputeError from dstack._internal.core.models.instances import Gpu from dstack._internal.utils.common import remove_prefix from dstack._internal.utils.logging import get_logger @@ -22,23 +19,60 @@ DSTACK_GATEWAY_TAG = "dstack-gateway-instance" supported_accelerators = [ + {"accelerator_name": "nvidia-b200", "gpu_name": "B200", "memory_mb": 1024 * 180}, {"accelerator_name": "nvidia-a100-80gb", "gpu_name": "A100", "memory_mb": 1024 * 80}, {"accelerator_name": "nvidia-tesla-a100", "gpu_name": "A100", "memory_mb": 1024 * 40}, {"accelerator_name": "nvidia-l4", "gpu_name": "L4", "memory_mb": 1024 * 24}, {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16}, {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16}, {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16}, + {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96}, ] +def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]: + for acc in supported_accelerators: + if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]: + return acc["accelerator_name"] + return None + + +def sanitize_filter_value(value: str) -> str: + """ + Escape characters that could break the Compute Engine API filter string. 
+ """ + return value.replace("\\", "\\\\").replace('"', '\\"') + + +def get_resource_project(resource_url: str) -> str: + """ + Extract the project ID from a URL like + https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name + """ + matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url) + if not matches: + raise BackendError(f"Invalid resource URL {resource_url}") + return matches[0] + + +def get_availability_zones( + regions_client: compute_v1.RegionsClient, + project_id: str, + region: str, +) -> List[str]: + region_info = regions_client.get(project=project_id, region=region) + return [full_resource_name_to_name(z) for z in region_info.zones] + + def check_vpc( - network_client: compute_v1.NetworksClient, + subnetworks_client: compute_v1.SubnetworksClient, routers_client: compute_v1.RoutersClient, project_id: str, regions: List[str], allocate_public_ip: bool, vpc_name: Optional[str] = None, shared_vpc_project_id: Optional[str] = None, + nat_check: bool = True, ): if vpc_name is None: vpc_name = "default" @@ -46,22 +80,35 @@ def check_vpc( if shared_vpc_project_id: vpc_project_id = shared_vpc_project_id try: - network_client.get(project=vpc_project_id, network=vpc_name) + usable_subnets = list_project_usable_subnets( + subnetworks_client=subnetworks_client, project_id=vpc_project_id + ) + for region in regions: + get_vpc_subnet_or_error( + vpc_name=vpc_name, + region=region, + usable_subnets=usable_subnets, + ) except google.api_core.exceptions.NotFound: - raise ComputeError(f"Failed to find VPC {vpc_name} in project {vpc_project_id}") + raise ComputeError(f"Failed to find VPC project {vpc_project_id}") if allocate_public_ip: return - regions_without_nat = [] - for region in regions: - if not has_vpc_nat_access(routers_client, vpc_project_id, vpc_name, region): - regions_without_nat.append(region) - - if regions_without_nat: - raise ComputeError( - f"VPC {vpc_name} in project 
{vpc_project_id} does not have Cloud NAT configured for external internet access in regions: {regions_without_nat}" - ) + # We may have no permissions to check NAT in a shared VPC + if nat_check and shared_vpc_project_id is None: + regions_without_nat = [] + for region in regions: + if not has_vpc_nat_access(routers_client, vpc_project_id, vpc_name, region): + regions_without_nat.append(region) + + if regions_without_nat: + raise ComputeError( + f"VPC {vpc_name} in project {vpc_project_id} does not have Cloud NAT configured" + f" for outbound internet connectivity in regions: {regions_without_nat}." + " Specify `nat_check: false` if you use a different mechanism" + " for outbound internet connectivity such as a third-party NAT appliance." + ) def has_vpc_nat_access( @@ -98,26 +145,22 @@ def create_instance_struct( service_account: Optional[str] = None, network: str = "global/networks/default", subnetwork: Optional[str] = None, + extra_subnetworks: Optional[List[Tuple[str, str]]] = None, + roce_subnetworks: Optional[List[Tuple[str, str]]] = None, allocate_public_ip: bool = True, + placement_policy: Optional[str] = None, + reservation: Optional[compute_v1.Reservation] = None, ) -> compute_v1.Instance: - network_interface = compute_v1.NetworkInterface() - network_interface.network = network - if subnetwork is not None: - network_interface.subnetwork = subnetwork - - if allocate_public_ip: - access = compute_v1.AccessConfig() - access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name - access.name = "External NAT" - access.network_tier = access.NetworkTier.PREMIUM.name - network_interface.access_configs = [access] - else: - network_interface.access_configs = [] - instance = compute_v1.Instance() - instance.network_interfaces = [network_interface] instance.name = instance_name instance.machine_type = f"zones/{zone}/machineTypes/{machine_type}" + instance.network_interfaces = _get_network_interfaces( + network=network, + subnetwork=subnetwork, + 
allocate_public_ip=allocate_public_ip, + extra_subnetworks=extra_subnetworks, + roce_subnetworks=roce_subnetworks, + ) disk = compute_v1.AttachedDisk() disk.auto_delete = True @@ -125,9 +168,31 @@ def create_instance_struct( initialize_params = compute_v1.AttachedDiskInitializeParams() initialize_params.source_image = image_id initialize_params.disk_size_gb = disk_size - initialize_params.disk_type = f"zones/{zone}/diskTypes/pd-balanced" + if instance_type_supports_persistent_disk(machine_type): + initialize_params.disk_type = f"zones/{zone}/diskTypes/pd-balanced" + else: + initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced" disk.initialize_params = initialize_params instance.disks = [disk] + if ( + reservation is not None + and reservation.specific_reservation is not None + and reservation.specific_reservation.instance_properties is not None + and reservation.specific_reservation.instance_properties.local_ssds is not None + ): + for local_ssd in reservation.specific_reservation.instance_properties.local_ssds: + instance.disks.append( + compute_v1.AttachedDisk( + auto_delete=True, + boot=False, + type_="SCRATCH", + initialize_params=compute_v1.AttachedDiskInitializeParams( + disk_type=f"zones/{zone}/diskTypes/local-ssd", + disk_size_gb=local_ssd.disk_size_gb, + ), + interface=local_ssd.interface, + ) + ) if accelerators: instance.guest_accelerators = accelerators @@ -141,6 +206,11 @@ def create_instance_struct( # Attachable GPUs, H100, A100, and L4 instance.scheduling.on_host_maintenance = "TERMINATE" + if placement_policy is not None: + instance.resource_policies = [placement_policy] + elif reservation is not None and "placement" in reservation.resource_policies: + instance.resource_policies = [reservation.resource_policies["placement"]] + if spot: instance.scheduling = compute_v1.Scheduling() instance.scheduling.provisioning_model = compute_v1.Scheduling.ProvisioningModel.SPOT.name @@ -165,44 +235,108 @@ def create_instance_struct( ) ] + if 
reservation is not None: + reservation_project = get_resource_project(reservation.self_link) + instance.reservation_affinity = compute_v1.ReservationAffinity() + instance.reservation_affinity.consume_reservation_type = ( + compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name + ) + instance.reservation_affinity.key = "compute.googleapis.com/reservation-name" + instance.reservation_affinity.values = [ + f"projects/{reservation_project}/reservations/{reservation.name}" + ] + return instance -def get_image_id(cuda: bool) -> str: - if not cuda: - image_name = f"dstack-{version.base_image}" +def _get_network_interfaces( + network: str, + subnetwork: Optional[str], + allocate_public_ip: bool, + extra_subnetworks: Optional[List[Tuple[str, str]]], + roce_subnetworks: Optional[List[Tuple[str, str]]], +) -> List[compute_v1.NetworkInterface]: + network_interface = compute_v1.NetworkInterface() + network_interface.network = network + if subnetwork is not None: + network_interface.subnetwork = subnetwork + if allocate_public_ip: + access = compute_v1.AccessConfig() + access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name + access.name = "External NAT" + access.network_tier = access.NetworkTier.PREMIUM.name + network_interface.access_configs = [access] else: - image_name = f"dstack-cuda-{version.base_image}" - image_name = image_name.replace(".", "-") + network_interface.access_configs = [] - return f"projects/dstack/global/images/{image_name}" + if extra_subnetworks: + # Multiple interfaces are set only for GPU VM that require gVNIC for best performance + network_interface.nic_type = compute_v1.NetworkInterface.NicType.GVNIC.name + + network_interfaces = [network_interface] + for network, subnetwork in extra_subnetworks or []: + network_interfaces.append( + compute_v1.NetworkInterface( + network=network, + subnetwork=subnetwork, + nic_type=compute_v1.NetworkInterface.NicType.GVNIC.name, + ) + ) + for network, subnetwork in roce_subnetworks or 
[]: + network_interfaces.append( + compute_v1.NetworkInterface( + network=network, + subnetwork=subnetwork, + nic_type=compute_v1.NetworkInterface.NicType.MRDMA.name, + ) + ) + return network_interfaces -def get_gateway_image_id() -> str: - return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714" +def list_project_usable_subnets( + subnetworks_client: compute_v1.SubnetworksClient, + project_id: str, +) -> List[compute_v1.UsableSubnetwork]: + request = compute_v1.ListUsableSubnetworksRequest(project=project_id) + return [s for s in subnetworks_client.list_usable(request=request)] def get_vpc_subnet_or_error( - subnetworks_client: compute_v1.SubnetworksClient, - vpc_project_id: str, vpc_name: str, region: str, + usable_subnets: list[compute_v1.UsableSubnetwork], ) -> str: """ Returns resource name of any usable subnet in a given VPC (e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet") """ - request = compute_v1.ListUsableSubnetworksRequest(project=vpc_project_id) - for subnet in subnetworks_client.list_usable(request=request): + vpc_subnets = get_vpc_subnets(vpc_name, region, usable_subnets) + if vpc_subnets: + return vpc_subnets[0] + raise ComputeError( + f"No usable subnetwork found in region {region} for VPC {vpc_name}." + f" Ensure that VPC {vpc_name} exists and has usable subnetworks." + ) + + +def get_vpc_subnets( + vpc_name: str, + region: str, + usable_subnets: list[compute_v1.UsableSubnetwork], +) -> list[str]: + """ + Returns resource names of all usable subnets in a given VPC + (e.g. 
["projects/example-project/regions/europe-west4/subnetworks/example-subnet"]) + """ + result = [] + for subnet in usable_subnets: network_name = subnet.network.split("/")[-1] subnet_url = subnet.subnetwork subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/") subnet_region = subnet_resource_name.split("/")[3] if network_name == vpc_name and subnet_region == region: - return subnet_resource_name - raise ComputeError( - f"No usable subnetwork found in region {region} for VPC {vpc_name} in project {vpc_project_id}" - ) + result.append(subnet_resource_name) + return result def create_runner_firewall_rules( @@ -275,11 +409,8 @@ def get_accelerators( return [] accelerator_config = compute_v1.AcceleratorConfig() accelerator_config.accelerator_count = len(gpus) - for acc in supported_accelerators: - if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]: - accelerator_name = acc["accelerator_name"] - break - else: + accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib) + if accelerator_name is None: raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB") accelerator_config.accelerator_type = ( f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}" @@ -287,13 +418,61 @@ def get_accelerators( return [accelerator_config] -NAME_PATTERN = re.compile(r"^[a-z]([-a-z0-9]*[a-z0-9])?$") +def find_reservation( + reservations_client: compute_v1.ReservationsClient, + project_id: str, + name: str, +) -> dict[str, compute_v1.Reservation]: + request = compute_v1.AggregatedListReservationsRequest( + project=project_id, + filter=( + f'(name = "{sanitize_filter_value(name)}")' + ' AND (status = "READY")' + " AND (specificReservationRequired = true)" + ), + ) + try: + aggregated_reservations = reservations_client.aggregated_list(request=request) + except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e: + 
logger.warning("Could not find reservation: %s", e) + return {} + zone_to_reservation = {} + for zone, zone_reservations in aggregated_reservations: + if zone_reservations.reservations: + zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0] + return zone_to_reservation + + +def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]: + filtered_labels = {} + for k, v in labels.items(): + if not _is_valid_label(k, v): + logger.warning("Skipping invalid label '%s: %s'", k, v) + continue + filtered_labels[k] = v + return filtered_labels + + +def validate_labels(labels: Dict[str, str]): + for k, v in labels.items(): + if not _is_valid_label(k, v): + raise BackendError( + "Invalid resource labels. " + "See labels restrictions: https://cloud.google.com/compute/docs/labeling-resources#requirements" + ) + + +def _is_valid_label(key: str, value: str) -> bool: + return is_valid_resource_name(key) and is_valid_label_value(value) + -LABEL_VALUE_PATTERN = re.compile(r"^[-a-z0-9]{0,63}$") +MAX_RESOURCE_NAME_LEN = 63 +NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$") +LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$") def is_valid_resource_name(name: str) -> bool: - if len(name) < 1 or len(name) > 63: + if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN: return False match = re.match(NAME_PATTERN, name) return match is not None @@ -304,34 +483,40 @@ def is_valid_label_value(value: str) -> bool: return match is not None -def generate_random_resource_name(length: int = 40) -> str: - return random.choice(string.ascii_lowercase) + "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(length) - ) - - def create_tpu_node_struct( instance_name: str, startup_script: str, authorized_keys: List[str], spot: bool, labels: Dict[str, str], + runtime_version: str = "tpu-ubuntu2204-base", + network: str = "global/networks/default", subnetwork: Optional[str] = None, allocate_public_ip: bool = True, 
+ service_account: Optional[str] = None, + data_disks: Optional[List[tpu_v2.AttachedDisk]] = None, ) -> tpu_v2.Node: node = tpu_v2.Node() if spot: node.scheduling_config = tpu_v2.SchedulingConfig(preemptible=True) node.accelerator_type = instance_name - node.runtime_version = "tpu-ubuntu2204-base" - # subnetwork determines the network, so network shouldn't be specified + node.runtime_version = runtime_version node.network_config = tpu_v2.NetworkConfig( enable_external_ips=allocate_public_ip, + network=network, subnetwork=subnetwork, ) ssh_keys = "\n".join(f"ubuntu:{key}" for key in authorized_keys) node.metadata = {"ssh-keys": ssh_keys, "startup-script": startup_script} node.labels = labels + if service_account is not None: + node.service_account = tpu_v2.ServiceAccount( + email=service_account, + scope=["https://www.googleapis.com/auth/cloud-platform"], + ) + if data_disks is not None: + for disk in data_disks: + node.data_disks.append(disk) return node @@ -342,7 +527,7 @@ def wait_for_extended_operation( if operation.error_code: # Write only debug logs here. - # The unexpected errors will be propagated and logged appropriatly by the caller. + # The unexpected errors will be propagated and logged appropriately by the caller. logger.debug( "Error during %s: [Code: %s]: %s", verbose_name, @@ -363,7 +548,38 @@ def wait_for_operation(operation: Operation, verbose_name: str = "operation", ti raise except Exception as e: # Write only debug logs here. - # The unexpected errors will be propagated and logged appropriatly by the caller. + # The unexpected errors will be propagated and logged appropriately by the caller. 
logger.debug("Error during %s: %s", verbose_name, e) raise operation.exception() or e return result + + +def full_resource_name_to_name(full_resource_name: str) -> str: + return full_resource_name.split("/")[-1] + + +def vpc_name_to_vpc_resource_name(project_id: str, vpc_name: str) -> str: + return f"projects/{project_id}/global/networks/{vpc_name}" + + +def get_placement_policy_resource_name( + project_id: str, + region: str, + placement_policy: str, +) -> str: + return f"projects/{project_id}/regions/{region}/resourcePolicies/{placement_policy}" + + +def instance_type_supports_persistent_disk(instance_type_name: str) -> bool: + return not any( + instance_type_name.startswith(series) + for series in [ + "m4-", + "c4-", + "n4-", + "h3-", + "v6e", + "a4-", + "g4-", + ] + ) diff --git a/src/dstack/_internal/core/backends/hotaisle/__init__.py b/src/dstack/_internal/core/backends/hotaisle/__init__.py new file mode 100644 index 0000000000..9c665d1498 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/__init__.py @@ -0,0 +1 @@ +# Hotaisle backend for dstack diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py new file mode 100644 index 0000000000..a3cc355fcd --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -0,0 +1,113 @@ +from typing import Any, Dict, Optional + +import requests + +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error +from dstack._internal.utils.logging import get_logger + +API_URL = "https://admin.hotaisle.app/api" + +logger = get_logger(__name__) + + +class HotAisleAPIClient: + def __init__(self, api_key: str, team_handle: str): + self.api_key = api_key + self.team_handle = team_handle + + def validate_api_key(self) -> bool: + url = f"{API_URL}/user/" + try: + response = self._make_request("GET", url) + response.raise_for_status() + except requests.HTTPError as e: + 
if e.response is not None: + if e.response.status_code == 401: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], details="Invalid API key" + ) + if e.response.status_code == 403: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Authenticated user does not have required permissions", + ) + raise + + user_data = response.json() + teams = user_data["teams"] + if not teams: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Valid API key but no teams found for this user", + ) + + available_teams = [team["handle"] for team in teams] + if self.team_handle not in available_teams: + raise_invalid_credentials_error( + fields=[["team_handle"]], + details=f"Team handle '{self.team_handle}' not found", + ) + return True + + def upload_ssh_key(self, public_key: str) -> bool: + url = f"{API_URL}/user/ssh_keys/" + payload = {"authorized_key": public_key} + + response = self._make_request("POST", url, json=payload) + + if response.status_code == 409: + return True # Key already exists - success + response.raise_for_status() + return True + + def create_virtual_machine(self, vm_payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" + response = self._make_request("POST", url, json=vm_payload) + response.raise_for_status() + vm_data = response.json() + return vm_data + + def get_vm_state(self, vm_name: str) -> str: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/" + response = self._make_request("GET", url) + response.raise_for_status() + state_data = response.json() + return state_data["state"] + + def terminate_virtual_machine(self, vm_name: str) -> None: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" + response = self._make_request( + "DELETE", + url, + params={ + "force": "true", # delete even if min reservation time not met + }, + ) + if response.status_code == 404: + logger.debug("Hot Aisle 
virtual machine %s not found", vm_name) + return + response.raise_for_status() + + def _make_request( + self, + method: str, + url: str, + json: Optional[dict[str, Any]] = None, + params: Optional[dict[str, str]] = None, + timeout: int = 30, + ) -> requests.Response: + headers = { + "accept": "application/json", + "Authorization": f"Token {self.api_key}", + } + if json is not None: + headers["Content-Type"] = "application/json" + + return requests.request( + method=method, + url=url, + headers=headers, + json=json, + params=params, + timeout=timeout, + ) diff --git a/src/dstack/_internal/core/backends/hotaisle/backend.py b/src/dstack/_internal/core/backends/hotaisle/backend.py new file mode 100644 index 0000000000..cb568f5258 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig +from dstack._internal.core.models.backends.base import BackendType + + +class HotAisleBackend(Backend): + TYPE = BackendType.HOTAISLE + COMPUTE_CLASS = HotAisleCompute + + def __init__(self, config: HotAisleConfig): + self.config = config + self._compute = HotAisleCompute(self.config) + + def compute(self) -> HotAisleCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py new file mode 100644 index 0000000000..b206a71d47 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -0,0 +1,187 @@ +import shlex +import subprocess +import tempfile +from threading import Thread +from typing import Any, List, Optional + +import gpuhunt +from gpuhunt.providers.hotaisle import HotAisleProvider + +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + 
ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +SUPPORTED_GPUS = ["MI300X"] + + +class HotAisleCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): + def __init__(self, config: HotAisleConfig): + super().__init__() + self.config = config + self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle) + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.catalog.add_provider( + HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle) + ) + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.HOTAISLE, + locations=self.config.regions or None, + catalog=self.catalog, + extra_filter=_supported_instances, + ) + return [ + offer.with_availability(availability=InstanceAvailability.AVAILABLE) + for offer in offers + ] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + project_ssh_key = 
instance_config.ssh_keys[0] + self.api_client.upload_ssh_key(project_ssh_key.public) + offer_backend_data: HotAisleOfferBackendData = ( + HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data) + ) + vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs) + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=vm_data["name"], + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="hotaisle", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=HotAisleInstanceBackendData(ip_address=vm_data["ip_address"]).json(), + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + vm_state = self.api_client.get_vm_state(provisioning_data.instance_id) + if vm_state == "running": + if provisioning_data.hostname is None and provisioning_data.backend_data: + backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data) + provisioning_data.hostname = backend_data.ip_address + commands = get_shim_commands(arch=provisioning_data.instance_type.resources.cpu_arch) + launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands)) + thread = Thread( + target=_start_runner, + kwargs={ + "hostname": provisioning_data.hostname, + "project_ssh_private_key": project_ssh_private_key, + "launch_command": launch_command, + }, + daemon=True, + ) + thread.start() + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + vm_name = instance_id + self.api_client.terminate_virtual_machine(vm_name) + + +def _start_runner( + hostname: str, + project_ssh_private_key: str, + launch_command: str, +): + _launch_runner( + hostname=hostname, + ssh_private_key=project_ssh_private_key, + launch_command=launch_command, + ) + + +def _launch_runner( + hostname: str, + 
ssh_private_key: str, + launch_command: str, +): + daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown" + _run_ssh_command( + hostname=hostname, + ssh_private_key=ssh_private_key, + command=daemonized_command, + ) + + +def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): + with tempfile.NamedTemporaryFile("w+", 0o600) as f: + f.write(ssh_private_key) + f.flush() + subprocess.run( + [ + "ssh", + "-F", + "none", + "-o", + "StrictHostKeyChecking=no", + "-i", + f.name, + f"hotaisle@{hostname}", + command, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + +def _supported_instances(offer: InstanceOffer) -> bool: + return len(offer.instance.resources.gpus) > 0 and all( + gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus + ) + + +class HotAisleInstanceBackendData(CoreModel): + ip_address: str + + @classmethod + def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData": + assert raw is not None + return cls.__response__.parse_raw(raw) + + +class HotAisleOfferBackendData(CoreModel): + vm_specs: dict[str, Any] diff --git a/src/dstack/_internal/core/backends/hotaisle/configurator.py b/src/dstack/_internal/core/backends/hotaisle/configurator.py new file mode 100644 index 0000000000..8f7a6f537f --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/configurator.py @@ -0,0 +1,66 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, +) +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend +from dstack._internal.core.backends.hotaisle.models import ( + AnyHotAisleCreds, + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + HotAisleConfig, + HotAisleCreds, + HotAisleStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class HotAisleConfigurator( + 
Configurator[ + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + ] +): + TYPE = BackendType.HOTAISLE + BACKEND_CLASS = HotAisleBackend + + def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_creds(config.creds, config.team_handle) + + def create_backend( + self, project_name: str, config: HotAisleBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=HotAisleStoredConfig( + **HotAisleBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=HotAisleCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds( + self, record: BackendRecord + ) -> HotAisleBackendConfigWithCreds: + config = self._get_config(record) + return HotAisleBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> HotAisleBackendConfig: + config = self._get_config(record) + return HotAisleBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> HotAisleBackend: + config = self._get_config(record) + return HotAisleBackend(config=config) + + def _get_config(self, record: BackendRecord) -> HotAisleConfig: + return HotAisleConfig.__response__( + **json.loads(record.config), + creds=HotAisleCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str): + api_client = HotAisleAPIClient(creds.api_key, team_handle) + api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/hotaisle/models.py b/src/dstack/_internal/core/backends/hotaisle/models.py new file mode 100644 index 0000000000..efee6b4e93 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/models.py @@ -0,0 +1,45 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class HotAisleAPIKeyCreds(CoreModel): + type: 
Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The Hot Aisle API key")] + + +AnyHotAisleCreds = HotAisleAPIKeyCreds +HotAisleCreds = AnyHotAisleCreds + + +class HotAisleBackendConfig(CoreModel): + type: Annotated[ + Literal["hotaisle"], + Field(description="The type of backend"), + ] = "hotaisle" + team_handle: Annotated[str, Field(description="The Hot Aisle team handle")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Hot Aisle regions. Omit to use all regions"), + ] = None + + +class HotAisleBackendConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] + + +AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds] + + +class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] + + +class HotAisleStoredConfig(HotAisleBackendConfig): + pass + + +class HotAisleConfig(HotAisleStoredConfig): + creds: AnyHotAisleCreds diff --git a/src/dstack/_internal/core/backends/jarvislabs/__init__.py b/src/dstack/_internal/core/backends/jarvislabs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/jarvislabs/api_client.py b/src/dstack/_internal/core/backends/jarvislabs/api_client.py new file mode 100644 index 0000000000..0a9a3b68d2 --- /dev/null +++ b/src/dstack/_internal/core/backends/jarvislabs/api_client.py @@ -0,0 +1,327 @@ +import hashlib +from typing import Any, Dict, List, Optional + +import requests +from gpuhunt.providers.jarvislabs import API_URL, JARVISLABS_REGION_URLS + +from dstack._internal.core.errors import ( + BackendError, + BackendInvalidCredentialsError, + NoCapacityError, +) + +TIMEOUT = 120 + + +class JarvisLabsNotFoundError(BackendError): + pass + + +class JarvisLabsAPIClient: + def __init__(self, api_key: str): + 
self.api_key = api_key + + def validate_api_key(self) -> bool: + try: + self.get_user_info() + except BackendInvalidCredentialsError: + return False + return True + + def get_user_info(self) -> Dict[str, Any]: + resp = self._make_request("GET", "users/user_info") + if not isinstance(resp, dict): + raise BackendError("Unexpected JarvisLabs user_info response") + return resp + + def list_ssh_keys(self) -> List[Dict[str, Any]]: + resp = self._make_request("GET", "ssh/") + if isinstance(resp, list): + return resp + raise BackendError("Unexpected JarvisLabs SSH key list response") + + def add_ssh_key(self, public_key: str, key_name: str) -> None: + resp = self._make_request( + "POST", + "ssh/", + json={ + "ssh_key": public_key, + "key_name": key_name, + }, + ) + _raise_if_unsuccessful(resp, "Failed to add JarvisLabs SSH key") + + def create_ssh_key(self, public_key: str, key_name: str) -> str: + self.add_ssh_key(public_key=public_key, key_name=key_name) + key_id = self.find_ssh_key_id(public_key=public_key, key_name=key_name) + if key_id is None: + raise BackendError("Failed to find created JarvisLabs SSH key") + return key_id + + def find_ssh_key_id(self, public_key: str, key_name: str) -> Optional[str]: + normalized_key = _normalize_public_key(public_key) + for ssh_key in self.list_ssh_keys(): + if str(ssh_key.get("key_name", "")) != key_name: + continue + if _normalize_public_key(str(ssh_key.get("ssh_key", ""))) != normalized_key: + continue + key_id = ssh_key.get("key_id") + if key_id is not None: + return str(key_id) + return None + + def delete_ssh_key(self, key_id: str) -> None: + try: + resp = self._make_request("DELETE", f"ssh/{key_id}") + except JarvisLabsNotFoundError: + return + _raise_if_unsuccessful(resp, "Failed to delete JarvisLabs SSH key") + + def add_ssh_key_if_needed(self, public_key: str) -> None: + normalized_key = _normalize_public_key(public_key) + for ssh_key in self.list_ssh_keys(): + if _normalize_public_key(str(ssh_key.get("ssh_key", ""))) == 
normalized_key: + return + key_name = _get_ssh_key_name(normalized_key) + self.add_ssh_key(public_key=public_key, key_name=key_name) + + def create_gpu_vm( + self, + *, + gpu_type: str, + num_gpus: int, + is_spot: bool, + storage: int, + region: str, + name: str, + ) -> str: + resp = self._make_request( + "POST", + "templates/vm/create", + region=region, + json={ + "gpu_type": gpu_type, + "num_gpus": num_gpus, + "hdd": storage, + "region": region, + "name": name, + "is_spot": is_spot, + "duration": "hour", + "disk_type": "ssd", + "http_ports": "", + # JarvisLabs accepts script_id for VM creates, but live CPU/GPU VM tests + # showed it is not injected into cloud-init user-data/runcmd. + "script_id": None, + "script_args": "", + "fs_id": None, + "arguments": "", + }, + ) + return _get_created_machine_id(resp, "GPU VM creation") + + def create_cpu_vm( + self, + *, + vcpus: int, + ram_gb: int, + storage: int, + region: str, + name: str, + ) -> str: + resp = self._make_request( + "POST", + "templates/vm/cpu/create", + region=region, + json={ + "num_cpus": 1, + "vcpus": vcpus, + "ram_gb": ram_gb, + "hdd": storage, + "region": region, + "name": name, + "duration": "hour", + "disk_type": "ssd", + # Do not pass script_id here either; CPU VM create accepts it but ignores it. 
+ }, + ) + return _get_created_machine_id(resp, "CPU VM creation") + + def get_instance(self, machine_id: str) -> Optional[Dict[str, Any]]: + try: + resp = self._make_request("GET", f"users/fetch/{machine_id}") + except JarvisLabsNotFoundError: + return None + if not _is_successful(resp): + return None + if isinstance(resp, dict): + instance = resp.get("instance") + if isinstance(instance, dict): + return instance + return None + + def get_instance_status(self, *, machine_id: str, region: str) -> Optional[Dict[str, Any]]: + try: + resp = self._make_request( + "GET", + "misc/status", + region=region, + params={"machine_id": machine_id}, + ) + except JarvisLabsNotFoundError: + return None + if isinstance(resp, dict): + return resp + return None + + def destroy_instance(self, *, machine_id: str, region: str) -> None: + instance = self.get_instance(machine_id) + if instance is None: + return + endpoint = "templates/vm/destroy" + if is_cpu_vm(instance): + endpoint = "templates/vm/cpu/destroy" + elif _instance_template(instance) != "vm": + endpoint = "misc/destroy" + + try: + resp = self._make_request( + "POST", + endpoint, + region=instance.get("region") or region, + params={"machine_id": machine_id}, + ) + except JarvisLabsNotFoundError: + return + _raise_if_unsuccessful(resp, "Failed to destroy JarvisLabs instance") + + def _make_request( + self, + method: str, + path: str, + *, + json: Optional[Dict[str, Any]] = None, + params: Optional[Dict[str, Any]] = None, + region: Optional[str] = None, + ) -> Any: + try: + response = requests.request( + method=method, + url=self._url(path=path, region=region), + headers={"Authorization": f"Bearer {self.api_key}"}, + json=json, + params=params, + timeout=TIMEOUT, + ) + except requests.RequestException as e: + raise BackendError(f"JarvisLabs request failed: {e}") from e + if response.ok: + if not response.content: + return {} + try: + return response.json() + except ValueError as e: + raise BackendError("Unexpected non-JSON 
JarvisLabs response") from e + message = _get_response_error(response) + if response.status_code in [401, 403]: + raise BackendInvalidCredentialsError(fields=[["creds", "api_key"]]) + if response.status_code == 404: + raise JarvisLabsNotFoundError(message) + if response.status_code in [400, 409] and _looks_like_no_capacity(message): + raise NoCapacityError(message) + raise BackendError(message) + + def _url(self, *, path: str, region: Optional[str] = None) -> str: + if region is None: + base_url = API_URL + else: + # gpuhunt owns this allowlist because it filters JarvisLabs offers. Do not + # fall back for unknown regions: regional VM APIs use separate hosts and + # JarvisLabs does not expose endpoint discovery in server_meta. + base_url = JARVISLABS_REGION_URLS.get(region) + if base_url is None: + raise BackendError( + f"Unsupported JarvisLabs region {region!r}. " + "JarvisLabs does not expose provisioning endpoint discovery." + ) + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def is_cpu_vm(instance: Dict[str, Any]) -> bool: + return _instance_template(instance) == "vm" and str(instance.get("gpu_type")).upper() == "CPU" + + +def _instance_template(instance: Dict[str, Any]) -> str: + return str(instance.get("template") or instance.get("framework") or "").lower() + + +def _get_created_machine_id(resp: Any, operation: str) -> str: + _raise_if_unsuccessful(resp, f"JarvisLabs {operation} failed") + if isinstance(resp, dict): + machine_id = resp.get("machine_id") + if machine_id is not None: + return str(machine_id) + raise BackendError(f"JarvisLabs {operation} failed: missing machine_id") + + +def _raise_if_unsuccessful(resp: Any, message: str) -> None: + if _is_successful(resp): + return + backend_message = _backend_message(resp) + if _looks_like_no_capacity(backend_message): + raise NoCapacityError(backend_message) + raise BackendError(f"{message}: {backend_message}") + + +def _is_successful(resp: Any) -> bool: + if not isinstance(resp, dict): + return 
True + if "success" in resp: + return _coerce_bool(resp["success"]) + if "sucess" in resp: + return _coerce_bool(resp["sucess"]) + return True + + +def _coerce_bool(value: Any) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "success"} + return bool(value) + + +def _get_response_error(response: requests.Response) -> str: + try: + data = response.json() + except ValueError: + return response.text or f"HTTP {response.status_code}" + message = _backend_message(data) + return message or f"HTTP {response.status_code}" + + +def _backend_message(resp: Any) -> str: + if isinstance(resp, dict): + detail = resp.get("detail") + if isinstance(detail, list): + return "; ".join(str(item.get("msg", item)) for item in detail) + return str( + resp.get("message") + or resp.get("error") + or resp.get("detail") + or resp.get("msg") + or resp + ) + return str(resp) + + +def _looks_like_no_capacity(message: str) -> bool: + message = message.lower() + return "capacity" in message or "available" in message or "stock" in message + + +def _normalize_public_key(public_key: str) -> str: + return " ".join(public_key.strip().split()[:2]) + + +def _get_ssh_key_name(public_key: str) -> str: + return "dstack-" + hashlib.sha1(public_key.encode()).hexdigest()[:16] diff --git a/src/dstack/_internal/core/backends/jarvislabs/backend.py b/src/dstack/_internal/core/backends/jarvislabs/backend.py new file mode 100644 index 0000000000..ac47171bd6 --- /dev/null +++ b/src/dstack/_internal/core/backends/jarvislabs/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.jarvislabs.compute import JarvisLabsCompute +from dstack._internal.core.backends.jarvislabs.models import JarvisLabsConfig +from dstack._internal.core.models.backends.base import BackendType + + +class JarvisLabsBackend(Backend): + TYPE = BackendType.JARVISLABS + COMPUTE_CLASS = 
JarvisLabsCompute + + def __init__(self, config: JarvisLabsConfig): + self.config = config + self._compute = JarvisLabsCompute(self.config) + + def compute(self) -> JarvisLabsCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/jarvislabs/compute.py b/src/dstack/_internal/core/backends/jarvislabs/compute.py new file mode 100644 index 0000000000..24e13f8501 --- /dev/null +++ b/src/dstack/_internal/core/backends/jarvislabs/compute.py @@ -0,0 +1,388 @@ +import shlex +import subprocess +import tempfile +from collections.abc import Iterable +from typing import List, Optional, cast + +import gpuhunt +from gpuhunt.providers.jarvislabs import JarvisLabsProvider +from typing_extensions import NotRequired, TypedDict + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.jarvislabs.api_client import JarvisLabsAPIClient +from dstack._internal.core.backends.jarvislabs.models import JarvisLabsConfig +from dstack._internal.core.errors import ProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + 
+MAX_INSTANCE_NAME_LEN = 40 +# JarvisLabs VM storage is configurable through the `hdd` create parameter. +MIN_DISK_SIZE = Memory.parse("100GB") +CONFIGURABLE_DISK_SIZE = Range[Memory](min=MIN_DISK_SIZE, max=None) +DEFAULT_USERNAME = "ubuntu" +SSH_CONNECT_TIMEOUT_SECONDS = 10 +SSH_SETUP_TIMEOUT_SECONDS = 240 +SSH_LAUNCH_TIMEOUT_SECONDS = 60 + + +class JarvisLabsOfferBackendData(TypedDict): + # Set by gpuhunt when normalized GPU identity differs from the JarvisLabs VM + # create token, e.g. "RTX-PRO6000" normalized to "RTXPRO6000". + gpu_type: NotRequired[str] + + +class JarvisLabsInstanceBackendData(CoreModel): + ssh_key_ids: Optional[List[str]] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "JarvisLabsInstanceBackendData": + if raw is None: + return cls() + return cls.__response__.parse_raw(raw) + + +class JarvisLabsCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): + def __init__(self, config: JarvisLabsConfig): + super().__init__() + self.config = config + self.api_client = JarvisLabsAPIClient(config.creds.api_key) + self._catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self._catalog.add_provider(JarvisLabsProvider(api_key=self.config.creds.api_key)) + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.JARVISLABS, + locations=self.config.regions or None, + catalog=self._catalog, + configurable_disk_size=CONFIGURABLE_DISK_SIZE, + ) + return [ + offer.with_availability(availability=InstanceAvailability.AVAILABLE) + for offer in offers + ] + + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + 
placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + ssh_key_ids: List[str] = [] + instance_id = None + try: + # TODO: JarvisLabs has a default 10 SSH key limit. Consider project-level + # key reuse if per-instance keys become a bottleneck. + for idx, ssh_public_key in enumerate(instance_config.get_public_keys()): + ssh_key_ids.append( + _create_ssh_key( + client=self.api_client, + name=f"{instance_name}-{idx}.key", + public_key=ssh_public_key, + ) + ) + if instance_offer.instance.resources.gpus: + instance_id = self.api_client.create_gpu_vm( + gpu_type=_get_jarvislabs_gpu_type(instance_offer), + num_gpus=len(instance_offer.instance.resources.gpus), + is_spot=instance_offer.instance.resources.spot, + storage=_get_disk_size_gb(instance_offer), + region=instance_offer.region, + name=instance_name, + ) + else: + instance_id = self.api_client.create_cpu_vm( + vcpus=instance_offer.instance.resources.cpus, + ram_gb=round(instance_offer.instance.resources.memory_mib / 1024), + storage=_get_disk_size_gb(instance_offer), + region=instance_offer.region, + name=instance_name, + ) + + except BaseException: + if instance_id is not None: + try: + self.api_client.destroy_instance( + machine_id=instance_id, + region=instance_offer.region, + ) + except Exception: + logger.exception( + "Could not destroy failed JarvisLabs instance %s", instance_id + ) + try: + _delete_ssh_keys(self.api_client, ssh_key_ids) + except Exception: + logger.exception( + "Could not delete JarvisLabs SSH keys %s after provisioning failure", + ssh_key_ids, + ) + raise + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=instance_id, + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username=DEFAULT_USERNAME, + ssh_port=22, + dockerized=True, + ssh_proxy=None, + 
backend_data=JarvisLabsInstanceBackendData(ssh_key_ids=ssh_key_ids).json(), + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + instance = self.api_client.get_instance(provisioning_data.instance_id) + if instance is None: + status = self.api_client.get_instance_status( + machine_id=provisioning_data.instance_id, + region=provisioning_data.region, + ) + if status is not None and str(status.get("status")).lower() == "failed": + _raise_failed_status(status) + return + + status = str(instance.get("status")).lower() + if status == "failed": + _raise_failed_status(instance) + if status != "running": + return + + hostname = instance.get("public_ip") + if not hostname: + return + username = _get_ssh_username(instance) + if not _start_runner( + hostname=hostname, + username=username, + project_ssh_private_key=project_ssh_private_key, + arch=provisioning_data.instance_type.resources.cpu_arch, + ): + return + provisioning_data.hostname = hostname + provisioning_data.username = username + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + backend_data_parsed = JarvisLabsInstanceBackendData.load(backend_data) + self.api_client.destroy_instance(machine_id=instance_id, region=region) + _delete_ssh_keys(self.api_client, backend_data_parsed.ssh_key_ids) + + +def _create_ssh_key(client: JarvisLabsAPIClient, name: str, public_key: str) -> str: + return client.create_ssh_key(public_key=public_key, key_name=name) + + +def _delete_ssh_keys(client: JarvisLabsAPIClient, ssh_key_ids: Optional[List[str]]) -> None: + if not ssh_key_ids: + return + for ssh_key_id in ssh_key_ids: + client.delete_ssh_key(ssh_key_id) + + +def _get_jarvislabs_gpu_type(instance_offer: InstanceOfferWithAvailability) -> str: + gpu_type = _get_jarvislabs_gpu_type_from_backend_data(instance_offer.backend_data) + if gpu_type is not None: + return gpu_type + + 
gpu = instance_offer.instance.resources.gpus[0] + return gpu.name + + +def _get_jarvislabs_gpu_type_from_backend_data(backend_data: dict) -> Optional[str]: + offer_backend_data = cast(JarvisLabsOfferBackendData, backend_data) + gpu_type = offer_backend_data.get("gpu_type") + if not isinstance(gpu_type, str) or not gpu_type: + return None + return gpu_type + + +def _get_disk_size_gb(instance_offer: InstanceOfferWithAvailability) -> int: + disk_size_gb = round(instance_offer.instance.resources.disk.size_mib / 1024) + return max(round(MIN_DISK_SIZE), disk_size_gb) + + +def _format_failed_status(status: dict) -> str: + message = status.get("error") or "unknown error" + code = status.get("code") + if code is not None: + return f"JarvisLabs instance creation failed: {message} (code={code})" + return f"JarvisLabs instance creation failed: {message}" + + +def _raise_failed_status(status: dict) -> None: + raise ProvisioningError(_format_failed_status(status), status) + + +def _get_ssh_username(instance: dict) -> str: + ssh_command = instance.get("ssh_str") or instance.get("ssh_command") + if not isinstance(ssh_command, str): + return DEFAULT_USERNAME + try: + parts = shlex.split(ssh_command) + except ValueError: + return DEFAULT_USERNAME + for part in parts[1:]: + if part.startswith("-") or "@" not in part: + continue + return part.rsplit("@", 1)[0] + return DEFAULT_USERNAME + + +def _start_runner( + hostname: str, + username: str, + project_ssh_private_key: str, + arch: Optional[str], +) -> bool: + commands = get_shim_commands(arch=arch) + launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands)) + try: + if not _setup_instance( + hostname=hostname, + username=username, + ssh_private_key=project_ssh_private_key, + ): + return False + return _launch_runner( + hostname=hostname, + username=username, + ssh_private_key=project_ssh_private_key, + launch_command=launch_command, + ) + except Exception: + logger.exception("Failed to start dstack shim on JarvisLabs 
instance %s", hostname) + return False + + +def _setup_instance( + hostname: str, + username: str, + ssh_private_key: str, +) -> bool: + setup_commands = [ + "mkdir -p ~/.dstack", + "if ! command -v curl >/dev/null 2>&1 || ! command -v docker >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then sudo apt-get update; fi", + "if ! command -v curl >/dev/null 2>&1; then sudo DEBIAN_FRONTEND=noninteractive apt-get install -y curl; fi", + "if ! command -v docker >/dev/null 2>&1; then sudo apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y docker.io; fi", + "if ! command -v jq >/dev/null 2>&1; then sudo apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y jq; fi", + "sudo systemctl enable --now docker || sudo service docker start || true", + ] + return _run_ssh_command( + hostname=hostname, + username=username, + ssh_private_key=ssh_private_key, + command=" && ".join(setup_commands), + timeout=SSH_SETUP_TIMEOUT_SECONDS, + ) + + +def _launch_runner( + hostname: str, + username: str, + ssh_private_key: str, + launch_command: str, +) -> bool: + daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown" + return _run_ssh_command( + hostname=hostname, + username=username, + ssh_private_key=ssh_private_key, + command=daemonized_command, + timeout=SSH_LAUNCH_TIMEOUT_SECONDS, + ) + + +def _run_ssh_command( + hostname: str, + username: str, + ssh_private_key: str, + command: str, + timeout: int, +) -> bool: + with tempfile.NamedTemporaryFile("w+") as f: + f.write(ssh_private_key) + f.flush() + try: + proc = subprocess.run( + [ + "ssh", + "-F", + "none", + "-o", + "BatchMode=yes", + "-o", + f"ConnectTimeout={SSH_CONNECT_TIMEOUT_SECONDS}", + "-o", + "ConnectionAttempts=1", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "LogLevel=ERROR", + "-i", + f.name, + f"{username}@{hostname}", + command, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + 
timeout=timeout, + ) + except subprocess.TimeoutExpired: + logger.debug("Timed out running SSH command on JarvisLabs instance %s", hostname) + return False + if proc.returncode != 0: + logger.debug( + "SSH command failed on JarvisLabs instance %s: exit_code=%s stderr=%r", + hostname, + proc.returncode, + proc.stderr[-1000:], + ) + return False + return True diff --git a/src/dstack/_internal/core/backends/jarvislabs/configurator.py b/src/dstack/_internal/core/backends/jarvislabs/configurator.py new file mode 100644 index 0000000000..ceaebeedf9 --- /dev/null +++ b/src/dstack/_internal/core/backends/jarvislabs/configurator.py @@ -0,0 +1,85 @@ +import json + +from gpuhunt.providers.jarvislabs import JARVISLABS_REGION_URLS + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.jarvislabs import api_client +from dstack._internal.core.backends.jarvislabs.backend import JarvisLabsBackend +from dstack._internal.core.backends.jarvislabs.models import ( + JarvisLabsBackendConfig, + JarvisLabsBackendConfigWithCreds, + JarvisLabsConfig, + JarvisLabsCreds, + JarvisLabsStoredConfig, +) +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType + + +class JarvisLabsConfigurator( + Configurator[ + JarvisLabsBackendConfig, + JarvisLabsBackendConfigWithCreds, + ] +): + TYPE = BackendType.JARVISLABS + BACKEND_CLASS = JarvisLabsBackend + + def validate_config( + self, config: JarvisLabsBackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_api_key(config.creds.api_key) + self._validate_regions(config.regions) + + def create_backend( + self, project_name: str, config: JarvisLabsBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=JarvisLabsStoredConfig( + **JarvisLabsBackendConfig.__response__.parse_obj(config).dict() + ).json(), + 
auth=JarvisLabsCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds( + self, record: BackendRecord + ) -> JarvisLabsBackendConfigWithCreds: + config = self._get_config(record) + return JarvisLabsBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> JarvisLabsBackendConfig: + config = self._get_config(record) + return JarvisLabsBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> JarvisLabsBackend: + config = self._get_config(record) + return JarvisLabsBackend(config=config) + + def _get_config(self, record: BackendRecord) -> JarvisLabsConfig: + return JarvisLabsConfig.__response__( + **json.loads(record.config), + creds=JarvisLabsCreds.parse_raw(record.auth), + ) + + def _validate_api_key(self, api_key: str): + client = api_client.JarvisLabsAPIClient(api_key=api_key) + if not client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) + + def _validate_regions(self, regions: list[str] | None): + if not regions: + return + invalid_regions = sorted(set(regions) - set(JARVISLABS_REGION_URLS)) + if invalid_regions: + raise ServerClientError( + msg=( + f"Unsupported JarvisLabs regions: {invalid_regions}. " + f"Supported regions: {sorted(JARVISLABS_REGION_URLS)}. " + "JarvisLabs does not expose provisioning endpoint discovery." 
+ ), + fields=[["regions"]], + ) diff --git a/src/dstack/_internal/core/backends/jarvislabs/models.py b/src/dstack/_internal/core/backends/jarvislabs/models.py new file mode 100644 index 0000000000..dae710089e --- /dev/null +++ b/src/dstack/_internal/core/backends/jarvislabs/models.py @@ -0,0 +1,47 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class JarvisLabsAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The JarvisLabs API key")] + + +AnyJarvisLabsCreds = JarvisLabsAPIKeyCreds +JarvisLabsCreds = AnyJarvisLabsCreds + + +class JarvisLabsBackendConfig(CoreModel): + type: Annotated[ + Literal["jarvislabs"], + Field(description="The type of backend"), + ] = "jarvislabs" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of JarvisLabs regions. 
Omit to use all regions"), + ] = None + + +class JarvisLabsBackendConfigWithCreds(JarvisLabsBackendConfig): + creds: Annotated[AnyJarvisLabsCreds, Field(description="The credentials")] + + +AnyJarvisLabsBackendConfig = Union[ + JarvisLabsBackendConfig, + JarvisLabsBackendConfigWithCreds, +] + + +class JarvisLabsBackendFileConfigWithCreds(JarvisLabsBackendConfig): + creds: Annotated[AnyJarvisLabsCreds, Field(description="The credentials")] + + +class JarvisLabsStoredConfig(JarvisLabsBackendConfig): + pass + + +class JarvisLabsConfig(JarvisLabsStoredConfig): + creds: AnyJarvisLabsCreds diff --git a/src/dstack/_internal/core/backends/kubernetes/__init__.py b/src/dstack/_internal/core/backends/kubernetes/__init__.py index e184fafaa4..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/kubernetes/__init__.py +++ b/src/dstack/_internal/core/backends/kubernetes/__init__.py @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.kubernetes.compute import KubernetesCompute -from dstack._internal.core.backends.kubernetes.config import KubernetesConfig -from dstack._internal.core.models.backends.base import BackendType - - -class KubernetesBackend(Backend): - TYPE: BackendType = BackendType.KUBERNETES - - def __init__(self, config: KubernetesConfig): - self.config = config - self._compute = KubernetesCompute(self.config) - - def compute(self) -> KubernetesCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/kubernetes/api_client.py b/src/dstack/_internal/core/backends/kubernetes/api_client.py new file mode 100644 index 0000000000..29a915d646 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/api_client.py @@ -0,0 +1,46 @@ +from typing import Optional + +from kubernetes.client.api_client import ApiClient as _BaseApiClient +from kubernetes.client.configuration import Configuration as _ClientConfiguration +from kubernetes.client.exceptions import ApiException +from 
kubernetes.config import load_kube_config_from_dict +from urllib3.exceptions import HTTPError + +# 30 * 2 (original request + 1 retry) = 60 seconds total +DEFAULT_REQUEST_TIMEOUT = 30 +DEFAULT_RETRIES = 1 + + +API_CLIENT_EXCEPTIONS: tuple[type[Exception], ...] = (HTTPError, ApiException) + + +class ApiClient(_BaseApiClient): + def __init__(self, *, configuration: _ClientConfiguration, request_timeout: int) -> None: + self.__request_timeout = request_timeout + super().__init__(configuration=configuration) + + def request(self, *args, **kwargs): + if kwargs.get("_request_timeout") is None: + kwargs["_request_timeout"] = self.__request_timeout + return super().request(*args, **kwargs) # pyright: ignore[reportAttributeAccessIssue] + + +def get_api_client_from_kubeconfig_dict( + kubeconfig_dict: dict, + *, + context: str, + request_timeout: Optional[int] = None, + retries: Optional[int] = None, +) -> ApiClient: + if request_timeout is None: + request_timeout = DEFAULT_REQUEST_TIMEOUT + if retries is None: + retries = DEFAULT_RETRIES + client_configuration = _ClientConfiguration() + client_configuration.retries = retries # pyright: ignore[reportAttributeAccessIssue] + load_kube_config_from_dict( + config_dict=kubeconfig_dict, + context=context, + client_configuration=client_configuration, + ) + return ApiClient(configuration=client_configuration, request_timeout=request_timeout) diff --git a/src/dstack/_internal/core/backends/kubernetes/backend.py b/src/dstack/_internal/core/backends/kubernetes/backend.py new file mode 100644 index 0000000000..dabdf18470 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.kubernetes.compute import KubernetesCompute +from dstack._internal.core.backends.kubernetes.models import KubernetesConfig +from dstack._internal.core.models.backends.base import BackendType + + +class 
KubernetesBackend(Backend): + TYPE = BackendType.KUBERNETES + COMPUTE_CLASS = KubernetesCompute + + def __init__(self, config: KubernetesConfig): + self.config = config + self._compute = KubernetesCompute(self.config) + + def compute(self) -> KubernetesCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 8ada3830f5..6cc7c08a0a 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -1,93 +1,170 @@ +import concurrent.futures +import random +import shlex import subprocess import tempfile -import threading import time -from typing import Dict, List, Optional +from contextlib import ExitStack +from decimal import Decimal +from enum import Enum +from typing import List, Optional -from gpuhunt import KNOWN_GPUS +from gpuhunt import AcceleratorVendor from kubernetes import client +from typing_extensions import Self from dstack._internal.core.backends.base.compute import ( Compute, + ComputeWithFilteredOffersCached, + ComputeWithGatewaySupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPrivilegedSupport, + ComputeWithVolumeSupport, + generate_unique_gateway_instance_name, + generate_unique_instance_name_for_job, + generate_unique_name, + generate_unique_volume_name, get_docker_commands, get_dstack_gateway_commands, - get_instance_name, + merge_tags, +) +from dstack._internal.core.backends.kubernetes.api_client import API_CLIENT_EXCEPTIONS +from dstack._internal.core.backends.kubernetes.models import KubernetesConfig +from dstack._internal.core.backends.kubernetes.resources import ( + AMD_GPU_DEVICE_ID_LABEL_PREFIX, + AMD_GPU_NAME_TO_DEVICE_IDS, + AMD_GPU_NODE_TAINT, + AMD_GPU_RESOURCE, + LABEL_VALUE_MAX_LENGTH, + NVIDIA_GPU_NAME_TO_GPU_INFO, + NVIDIA_GPU_NODE_TAINT, + NVIDIA_GPU_PRODUCT_LABEL, + NVIDIA_GPU_RESOURCE, + OBJECT_NAME_MAX_LENGTH, + 
PodPhase, + TaintEffect, + build_base_labels, + build_dockerconfigjson, + filter_invalid_labels, + format_memory, + get_amd_gpu_from_node_labels, + get_gpu_request_from_gpu_spec, + get_instance_offer_from_node, + get_instance_offers, + get_node_labels, + get_node_name, + get_nvidia_gpu_from_node_labels, + is_hard_taint, + is_taint_tolerated, + parse_quantity, ) -from dstack._internal.core.backends.base.offers import match_requirements -from dstack._internal.core.backends.kubernetes.config import KubernetesConfig from dstack._internal.core.backends.kubernetes.utils import ( - get_api_from_config_data, - get_cluster_public_ip, + LEGACY_CURRENT_CONTEXT_REGION, + Cluster, + SkipOfferCache, + call_api_method, + get_clusters_from_backend_config, + try_delete_object_if_exists, + watch_events, ) -from dstack._internal.core.errors import ComputeError +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT +from dstack._internal.core.errors import ComputeError, ProvisioningError, SkipOffer from dstack._internal.core.models.backends.base import BackendType - -# TODO: update import as KNOWN_GPUS becomes public +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, ) from dstack._internal.core.models.instances import ( - Disk, Gpu, - InstanceAvailability, InstanceOfferWithAvailability, - InstanceRuntime, - InstanceType, - Resources, SSHConnectionParams, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.common import parse_memory +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import CPUSpec, GPUSpec +from dstack._internal.core.models.routers import AnyGatewayRouterConfig +from dstack._internal.core.models.runs import ( + Job, + JobProvisioningData, + JobSpec, + 
Requirements, + Run, + RunSpec, +) +from dstack._internal.core.models.volumes import ( + InstanceMountPoint, + KubernetesVolumeConfiguration, + Volume, + VolumeMountPoint, + VolumeProvisioningData, +) +from dstack._internal.utils.common import get_or_error from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) -RUNNER_SSH_PORT = 10022 +JUMP_POD_IMAGE = "testcontainers/sshd:1.3.0@sha256:c50c0f59554dcdb2d9e5e705112144428ae9d04ac0af6322b365a18e24213a6a" JUMP_POD_SSH_PORT = 22 -DEFAULT_NAMESPACE = "default" +JUMP_POD_USER = "root" + +JOB_POD_SCHEDULING_TIMEOUT = 10 + -GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_GPUS} -GPU_NAMES = GPU_NAME_TO_GPU_INFO.keys() +class Operator(str, Enum): + EXISTS = "Exists" + IN = "In" -class KubernetesCompute(Compute): +class KubernetesBackendData(CoreModel): + jump_pod_name: str + jump_pod_service_name: str + user_ssh_public_key: str + + @classmethod + def load(cls, raw: str) -> Self: + return cls.__response__.parse_raw(raw) + + +class KubernetesCompute( + ComputeWithFilteredOffersCached, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithVolumeSupport, + ComputeWithGatewaySupport, + ComputeWithMultinodeSupport, + Compute, +): def __init__(self, config: KubernetesConfig): - self.config = config - self.api = get_api_from_config_data(config.kubeconfig.data) - - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - nodes = self.api.list_node() - instance_offers = [] - for node in nodes.items: - instance_offer = InstanceOfferWithAvailability( - backend=BackendType.KUBERNETES, - instance=InstanceType( - name=node.metadata.name, - resources=Resources( - cpus=node.status.capacity["cpu"], - memory_mib=int(parse_memory(node.status.capacity["memory"], as_untis="M")), - gpus=_get_gpus_from_node_labels(node.metadata.labels), - spot=False, - disk=Disk( - size_mib=int( - parse_memory( - 
node.status.capacity["ephemeral-storage"], as_untis="M" - ) - ) - ), - ), - ), - price=0, - region="-", - availability=InstanceAvailability.AVAILABLE, - instance_runtime=InstanceRuntime.RUNNER, - ) - instance_offers.extend(match_requirements([instance_offer], requirements)) - return instance_offers + super().__init__() + self.region_cluster_map = {c.region: c for c in get_clusters_from_backend_config(config)} + self.skip_offer_cache = SkipOfferCache(ttl=60) + + def get_offers_by_requirements( + self, requirements: Requirements + ) -> list[InstanceOfferWithAvailability]: + offers: list[InstanceOfferWithAvailability] = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + future_cluster_map: dict[ + concurrent.futures.Future[list[InstanceOfferWithAvailability]], Cluster + ] = {} + for region, cluster in self.region_cluster_map.items(): + api = client.CoreV1Api(cluster.api_client) + future = executor.submit(get_instance_offers, api, region, requirements) + future_cluster_map[future] = cluster + for future in concurrent.futures.as_completed(future_cluster_map): + try: + cluster_offers = future.result() + except API_CLIENT_EXCEPTIONS as e: + logger.warning( + "Failed to get offers from cluster %s: %s: %s", + future_cluster_map[future], + e.__class__.__name__, + e, + ) + continue + offers.extend(cluster_offers) + return offers def run_job( self, @@ -96,208 +173,402 @@ def run_job( instance_offer: InstanceOfferWithAvailability, project_ssh_public_key: str, project_ssh_private_key: str, - volumes: List[Volume], + volumes: list[Volume], + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: - instance_name = get_instance_name(run, job) - commands = get_docker_commands( - [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()] - ) - # Before running a job, ensure a jump pod service is running. 
- # There is a one jump pod per Kubernetes backend that is used - # as an ssh proxy jump to connect to all other services in Kubernetes. - # Setup jump pod in a separate thread to avoid long-running run_job. - # In case the thread fails, the job will be failed and resubmitted. - jump_pod_hostname = self.config.networking.ssh_host - if jump_pod_hostname is None: - jump_pod_hostname = get_cluster_public_ip(self.api) - if jump_pod_hostname is None: - raise ComputeError( - "Failed to acquire an IP for jump pod automatically. " - "Specify ssh_host for Kubernetes backend." - ) - jump_pod_port = _create_jump_pod_service_if_not_exists( - api=self.api, + cluster = self.region_cluster_map.get(instance_offer.region) + if cluster is None: + raise ComputeError(f"Unknown region: {instance_offer.region!r}") + if self.skip_offer_cache.check(run, job, instance_offer): + raise SkipOffer(f"cluster {cluster} has recently failed to schedule a similar job") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + + # There is one jump pod per project that is used as an ssh proxy jump to connect + # to all job pods of the same project. 
+ # The service is created here and configured later in update_provisioning_data() + jump_pod_name = f"dstack-{run.project_name}-ssh-jump-pod" + jump_pod_service_name = _get_pod_service_name(jump_pod_name) + _create_jump_pod_service_if_not_exists( + api=api, + namespace=namespace, project_name=run.project_name, + jump_pod_name=jump_pod_name, + jump_pod_service_name=jump_pod_service_name, + jump_pod_port=cluster.proxy_jump.port, project_ssh_public_key=project_ssh_public_key.strip(), - jump_pod_port=self.config.networking.ssh_port, - ) - threading.Thread( - target=_continue_setup_jump_pod, - kwargs={ - "api": self.api, - "project_name": run.project_name, - "project_ssh_private_key": project_ssh_private_key.strip(), - "user_ssh_public_key": run.run_spec.ssh_key_pub.strip(), - "jump_pod_host": jump_pod_hostname, - "jump_pod_port": jump_pod_port, - }, - ).start() - self.api.create_namespaced_pod( - namespace=DEFAULT_NAMESPACE, - body=client.V1Pod( - metadata=client.V1ObjectMeta( - name=instance_name, - labels={"app.kubernetes.io/name": instance_name}, - ), - spec=client.V1PodSpec( - containers=[ - client.V1Container( - name=f"{instance_name}-container", - image=job.job_spec.image_name, - command=["/bin/sh"], - args=["-c", " && ".join(commands)], - ports=[ - client.V1ContainerPort( - container_port=RUNNER_SSH_PORT, - ) - ], - # TODO: Pass cpu, memory, gpu as requests. - # Beware that node capacity != allocatable, so - # if the node has 2xCPU – then cpu=2 request will probably fail. 
- resources=client.V1ResourceRequirements(requests={}), - ) - ] - ), - ), ) - service_response = self.api.create_namespaced_service( - namespace=DEFAULT_NAMESPACE, - body=client.V1Service( - metadata=client.V1ObjectMeta(name=_get_pod_service_name(instance_name)), - spec=client.V1ServiceSpec( - type="ClusterIP", - selector={"app.kubernetes.io/name": instance_name}, - ports=[client.V1ServicePort(port=RUNNER_SSH_PORT)], + + pod_name = generate_unique_instance_name_for_job( + run, job, max_length=LABEL_VALUE_MAX_LENGTH + ) + + base_labels = build_base_labels( + component="job", + unique_name=pod_name, + project=run.project_name, + name=job.job_spec.job_name, + user=run.user, + ) + labels = merge_tags( + base_tags=base_labels, + resource_tags=run.run_spec.configuration.tags, + ) + labels = filter_invalid_labels(labels) + + registry_auth_secret_name: Optional[str] = None + with ExitStack() as exit_stack: + if job.job_spec.registry_auth is not None: + registry_auth_secret_name = _get_registry_auth_secret_name(pod_name) + _create_registry_auth_secret( + api=api, + namespace=namespace, + labels=labels, + secret_name=registry_auth_secret_name, + image_name=job.job_spec.image_name, + username=job.job_spec.registry_auth.username, + password=job.job_spec.registry_auth.password, + ) + exit_stack.callback( + try_delete_object_if_exists, + api.delete_namespaced_secret, + namespace=namespace, + name=registry_auth_secret_name, + description="registry auth secret", + should_delete_manually_if_failed=True, + ) + + assert run.run_spec.ssh_key_pub is not None + authorized_keys = [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()] + _create_job_pod( + api=api, + namespace=namespace, + labels=labels, + pod_name=pod_name, + registry_auth_secret_name=registry_auth_secret_name, + run_spec=run.run_spec, + job_spec=job.job_spec, + volumes=volumes, + authorized_keys=authorized_keys, + ) + exit_stack.callback( + try_delete_object_if_exists, + api.delete_namespaced_pod, + 
namespace=namespace, + name=pod_name, + description="pod", + should_delete_manually_if_failed=True, + ) + is_pod_scheduled_or_finished, pod_phase = _wait_for_pod_scheduled_or_finished( + api=api, + namespace=namespace, + pod_name=pod_name, + timeout_seconds=JOB_POD_SCHEDULING_TIMEOUT, + ) + if not is_pod_scheduled_or_finished: + self.skip_offer_cache.add(run, job, instance_offer) + reason, message = _get_unscheduled_pod_reason_message( + api=api, + namespace=namespace, + pod_name=pod_name, + ) + raise ComputeError( + f"Pod {pod_name} was not scheduled:" + f" {reason or 'unknown reason'}: {message or 'no message'}" + ) + if pod_phase is not None and pod_phase.is_finished(): + # It's not clear if we should add an entry to the SkipOfferCache in this case. + raise ComputeError(f"Pod {pod_name} already finished: {pod_phase}") + + pod_service_name = _get_pod_service_name(pod_name) + api.create_namespaced_service( + namespace=namespace, + body=client.V1Service( + metadata=client.V1ObjectMeta( + name=pod_service_name, + labels=labels, + ), + spec=client.V1ServiceSpec( + type="ClusterIP", + selector=_build_service_selector_from_labels(base_labels), + ports=[client.V1ServicePort(port=DSTACK_RUNNER_SSH_PORT)], + ), ), - ), + ) + exit_stack.callback( + try_delete_object_if_exists, + api.delete_namespaced_service, + namespace=namespace, + name=pod_service_name, + description="pod service", + should_delete_manually_if_failed=True, + ) + + # Cancel all cleanup callbacks + exit_stack.pop_all() + + backend_data = KubernetesBackendData( + jump_pod_name=jump_pod_name, + jump_pod_service_name=jump_pod_service_name, + user_ssh_public_key=run.run_spec.ssh_key_pub.strip(), ) - service_ip = service_response.spec.cluster_ip + return JobProvisioningData( backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=instance_name, - hostname=service_ip, - internal_ip=None, - region="local", + instance_id=pod_name, + region=instance_offer.region, 
price=instance_offer.price, username="root", - ssh_port=RUNNER_SSH_PORT, + ssh_port=DSTACK_RUNNER_SSH_PORT, dockerized=False, - ssh_proxy=SSHConnectionParams( - hostname=jump_pod_hostname, - username="root", - port=jump_pod_port, - ), - backend_data=None, + # Although we can already get Service's ClusterIP from the `V1Service` object returned + # by the `create_namespaced_service` method, we still need: + # - updated instance offer + # - job pod's PodIP for multinode runs + # - jump pod node's ExternalIP and jump pod service's NodePort for ssh_proxy + # We'll update all these fields once both the jump pod and the job pod are assigned + # to the nodes. + hostname=None, + instance_type=instance_offer.instance, + internal_ip=None, + ssh_proxy=None, + backend_data=backend_data.json(), ) + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + cluster = self.region_cluster_map.get(provisioning_data.region) + if cluster is None: + raise ProvisioningError(f"Unknown region: {provisioning_data.region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + + if provisioning_data.backend_data is not None: + # Before running a job, ensure the jump pod is running and has user's public SSH key. + backend_data = KubernetesBackendData.load(provisioning_data.backend_data) + ssh_proxy = _check_and_configure_jump_pod_service( + api=api, + namespace=namespace, + jump_pod_name=backend_data.jump_pod_name, + jump_pod_service_name=backend_data.jump_pod_service_name, + jump_pod_hostname=cluster.proxy_jump.hostname, + project_ssh_private_key=project_ssh_private_key, + user_ssh_public_key=backend_data.user_ssh_public_key, + ) + if ssh_proxy is None: + # Jump pod is not ready yet + return + provisioning_data.ssh_proxy = ssh_proxy + # Remove backend data to save space in DB and skip this step + # in case update_provisioning_data() is called again. 
+ provisioning_data.backend_data = None + + pod = api.read_namespaced_pod( + name=provisioning_data.instance_id, + namespace=namespace, + ) + if pod.status is None: + return + pod_ip = pod.status.pod_ip + if not pod_ip: + return + provisioning_data.internal_ip = pod_ip + service = api.read_namespaced_service( + name=_get_pod_service_name(provisioning_data.instance_id), + namespace=namespace, + ) + service_spec = get_or_error(service.spec) + provisioning_data.hostname = get_or_error(service_spec.cluster_ip) + pod_spec = get_or_error(pod.spec) + node = api.read_node(name=get_or_error(pod_spec.node_name)) + # In the original offer, the resources have already been adjusted according to + # the run configuration resource requirements, see get_offers_by_requirements() + original_resources = provisioning_data.instance_type.resources + instance_offer = get_instance_offer_from_node( + node=node, + region=cluster.region, + cpu_request=original_resources.cpus, + memory_mib_request=original_resources.memory_mib, + gpu_request=len(original_resources.gpus), + disk_mib_request=original_resources.disk.size_mib, + ) + if instance_offer is not None: + provisioning_data.instance_type = instance_offer.instance + provisioning_data.price = instance_offer.price + def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None ): - try: - self.api.delete_namespaced_service( + cluster = self.region_cluster_map.get(region) + if cluster is None and region == "-": + # legacy DUMMY_REGION + cluster = self.region_cluster_map.get(LEGACY_CURRENT_CONTEXT_REGION) + if cluster is not None: + logger.warning( + ( + "Terminating instance %s in unknown region %s." 
+ " Assuming it was created before multi-cluster support was added" + " and is located in cluster %s" + ), + instance_id, + repr(region), + cluster, + ) + if cluster is None: + raise ComputeError(f"Unknown region: {region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + deleted = [ + try_delete_object_if_exists( + api.delete_namespaced_service, + namespace=namespace, name=_get_pod_service_name(instance_id), - namespace=DEFAULT_NAMESPACE, - body=client.V1DeleteOptions(), - ) - except client.ApiException as e: - if e.status != 404: - raise - try: - self.api.delete_namespaced_pod( - name=instance_id, namespace=DEFAULT_NAMESPACE, body=client.V1DeleteOptions() - ) - except client.ApiException as e: - if e.status != 404: - raise + description="pod service", + ), + try_delete_object_if_exists( + api.delete_namespaced_pod, + namespace=namespace, + name=instance_id, + description="pod", + ), + try_delete_object_if_exists( + api.delete_namespaced_secret, + namespace=namespace, + name=_get_registry_auth_secret_name(instance_id), + description="registry auth secret", + ), + ] + if not all(deleted): + raise ComputeError("Not all objects were deleted, check logs") def create_gateway( self, configuration: GatewayComputeConfiguration, ) -> GatewayProvisioningData: + cluster = self.region_cluster_map.get(configuration.region) + if cluster is None: + raise ComputeError(f"Unknown region: {configuration.region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + # Gateway creation is currently limited to Kubernetes with Load Balancer support. # If the cluster does not support Load Balancer, the service will be provisioned but # the external IP/hostname will never be allocated. - # TODO: This implementation is only tested on EKS. Test other managed Kubernetes. - # TODO: By default EKS creates a Classic Load Balancer for Load Balancer services. # Consider deploying an NLB. 
It seems it requires some extra configuration on the cluster: # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html - instance_name = configuration.instance_name - commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub]) - self.api.create_namespaced_pod( - namespace=DEFAULT_NAMESPACE, - body=client.V1Pod( - metadata=client.V1ObjectMeta( - name=instance_name, - labels={"app.kubernetes.io/name": instance_name}, - ), - spec=client.V1PodSpec( - containers=[ - client.V1Container( - name=f"{instance_name}-container", - image="ubuntu:22.04", - command=["/bin/sh"], - args=["-c", " && ".join(commands)], - ports=[ - client.V1ContainerPort( - container_port=22, - ), - client.V1ContainerPort( - container_port=80, - ), - client.V1ContainerPort( - container_port=443, - ), - ], - ) - ] - ), - ), + if configuration.instance_type is not None: + raise ComputeError( + "The `kubernetes` backend does not support the `instance_type`" + " gateway configuration property" + ) + + instance_name = generate_unique_gateway_instance_name( + configuration, max_length=LABEL_VALUE_MAX_LENGTH ) - self.api.create_namespaced_service( - namespace=DEFAULT_NAMESPACE, - body=client.V1Service( - metadata=client.V1ObjectMeta( - name=_get_pod_service_name(instance_name), - ), - spec=client.V1ServiceSpec( - type="LoadBalancer", - selector={"app.kubernetes.io/name": instance_name}, - ports=[ - client.V1ServicePort( - name="ssh", - port=22, - target_port=22, - ), - client.V1ServicePort( - name="http", - port=80, - target_port=80, - ), - client.V1ServicePort( - name="https", - port=443, - target_port=443, + + base_labels = build_base_labels( + component="gateway", + unique_name=instance_name, + project=configuration.project_name, + name=configuration.instance_name, + ) + labels = merge_tags( + base_tags=base_labels, + resource_tags=configuration.tags, + ) + labels = filter_invalid_labels(labels) + + commands = _get_gateway_commands( + 
authorized_keys=[configuration.ssh_key_pub], router=configuration.router + ) + pod = client.V1Pod( + metadata=client.V1ObjectMeta( + name=instance_name, + labels=labels, + ), + spec=client.V1PodSpec( + containers=[ + client.V1Container( + name=f"{instance_name}-container", + image="ubuntu:22.04", + command=["/bin/sh"], + args=["-c", " && ".join(commands)], + ports=[ + client.V1ContainerPort( + container_port=22, + ), + client.V1ContainerPort( + container_port=80, + ), + client.V1ContainerPort( + container_port=443, + ), + ], + security_context=client.V1SecurityContext( + run_as_user=0, + run_as_group=0, ), - ], - ), + ) + ] ), ) - hostname = _wait_for_load_balancer_hostname( - api=self.api, service_name=_get_pod_service_name(instance_name) + api.create_namespaced_pod( + namespace=namespace, + body=pod, + ) + service = client.V1Service( + metadata=client.V1ObjectMeta( + name=_get_pod_service_name(instance_name), + labels=labels, + ), + spec=client.V1ServiceSpec( + type="LoadBalancer", + selector=_build_service_selector_from_labels(base_labels), + ports=[ + client.V1ServicePort( + name="ssh", + port=22, + target_port=22, + ), + client.V1ServicePort( + name="http", + port=80, + target_port=80, + ), + client.V1ServicePort( + name="https", + port=443, + target_port=443, + ), + ], + ), + ) + api.create_namespaced_service( + namespace=namespace, + body=service, + ) + # address is eiher a domain name or an IP address + address = _wait_for_load_balancer_address( + api=api, + namespace=namespace, + service_name=_get_pod_service_name(instance_name), ) - if hostname is None: - self.terminate_instance(instance_name, region="-") + if address is None: + self.terminate_instance(instance_name, region=configuration.region) raise ComputeError( "Failed to get gateway hostname. " "Ensure the Kubernetes cluster supports Load Balancer services." 
) return GatewayProvisioningData( instance_id=instance_name, - ip_address=hostname, - region="-", + ip_address=address, + region=cluster.region, ) def terminate_gateway( @@ -306,143 +577,498 @@ def terminate_gateway( configuration: GatewayComputeConfiguration, backend_data: Optional[str] = None, ): + region = configuration.region + cluster = self.region_cluster_map.get(region) + if cluster is None: + # It may be a legacy configuration with the region set to an arbitrary value + cluster = self.region_cluster_map.get(LEGACY_CURRENT_CONTEXT_REGION) + if cluster is not None: + logger.warning( + ( + "Terminating gateway %s in unknown region %s." + " Assuming it was created before multi-cluster support was added" + " and is located in cluster %s" + ), + instance_id, + repr(region), + cluster, + ) + region = LEGACY_CURRENT_CONTEXT_REGION + else: + raise ComputeError(f"Unknown region: {region!r}") self.terminate_instance( instance_id=instance_id, - region=configuration.region, + region=region, backend_data=backend_data, ) + def register_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, KubernetesVolumeConfiguration) -def _get_gpus_from_node_labels(labels: Dict) -> List[Gpu]: - # We rely on https://github.com/NVIDIA/gpu-feature-discovery to detect gpus. - # Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or "A100" but a product name - # from nvidia-smi like "Tesla-T4" or "A100-SXM4-40GB". - # Thus, we convert the product name to a known gpu name. 
- gpu_count = labels.get("nvidia.com/gpu.count") - gpu_product = labels.get("nvidia.com/gpu.product") - if gpu_count is None or gpu_product is None: - return [] - gpu_count = int(gpu_count) - gpu_name = None - for known_gpu_name in GPU_NAMES: - if known_gpu_name.lower() in gpu_product.lower().split("-"): - gpu_name = known_gpu_name - break - if gpu_name is None: - return [] - gpu_info = GPU_NAME_TO_GPU_INFO[gpu_name] - gpu_memory = gpu_info.memory * 1024 - # A100 may come in two variants - if "40GB" in gpu_product: - gpu_memory = 40 * 1024 - return [Gpu(name=gpu_name, memory_mib=gpu_memory) for _ in range(gpu_count)] + region = volume.configuration.region + cluster = self.region_cluster_map.get(region) + if cluster is None: + if region == "": + raise ComputeError("region is not set") + raise ComputeError(f"Unknown region: {region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + pvc_name = volume.configuration.claim_name + assert pvc_name is not None -def _continue_setup_jump_pod( - api: client.CoreV1Api, - project_name: str, - project_ssh_private_key: str, - user_ssh_public_key: str, - jump_pod_host: str, - jump_pod_port: int, -): - _wait_for_pod_ready( - api=api, - pod_name=_get_jump_pod_name(project_name), + pvc = call_api_method( + api.read_namespaced_persistent_volume_claim, + expected=404, + namespace=namespace, + name=pvc_name, + ) + if pvc is None: + raise ComputeError(f"PersistentVolumeClaim {pvc_name} not found") + + capacity_bytes: Optional[Decimal] = None + if pvc.status is not None: + actual_capacity_qty = (pvc.status.capacity or {}).get("storage") + if actual_capacity_qty is not None: + capacity_bytes = parse_quantity(actual_capacity_qty) + if capacity_bytes is None and pvc.spec is not None and pvc.spec.resources is not None: + requested_capacity_qty = (pvc.spec.resources.requests or {}).get("storage") + if requested_capacity_qty is not None: + capacity_bytes = parse_quantity(requested_capacity_qty) + if 
capacity_bytes is None: + raise ComputeError(f"Failed to detect PersistentVolumeClaim {pvc_name} capacity") + + return VolumeProvisioningData( + backend=BackendType.KUBERNETES, + volume_id=pvc_name, + size_gb=int(capacity_bytes // 2**30), + attachable=False, + detachable=False, + ) + + def create_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, KubernetesVolumeConfiguration) + assert volume.configuration.size is not None + + region = volume.configuration.region + cluster = self.region_cluster_map.get(region) + if cluster is None: + if region == "": + raise ComputeError("region is not set") + raise ComputeError(f"Unknown region: {region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + + pvc_name = generate_unique_volume_name(volume, max_length=LABEL_VALUE_MAX_LENGTH) + + base_labels = build_base_labels( + component="volume", + unique_name=pvc_name, + project=volume.project_name, + name=volume.name, + user=volume.user, + ) + labels = merge_tags( + base_tags=base_labels, + resource_tags=volume.configuration.tags, + ) + labels = filter_invalid_labels(labels) + + pvc = client.V1PersistentVolumeClaim( + metadata=client.V1ObjectMeta( + name=pvc_name, + labels=labels, + ), + spec=client.V1PersistentVolumeClaimSpec( + access_modes=volume.configuration.access_modes, + storage_class_name=volume.configuration.storage_class_name, + resources=client.V1VolumeResourceRequirements( + requests={ + "storage": format_memory(volume.configuration.size), + }, + ), + ), + ) + api.create_namespaced_persistent_volume_claim( + namespace=namespace, + body=pvc, + ) + logger.debug("Created PVC %s for volume %s", pvc_name, volume.name) + + return VolumeProvisioningData( + backend=BackendType.KUBERNETES, + volume_id=pvc_name, + size_gb=volume.configuration.size_gb, + attachable=False, + detachable=False, + ) + + def delete_volume(self, volume: Volume): + assert isinstance(volume.configuration, 
KubernetesVolumeConfiguration) + + region = volume.configuration.region + cluster = self.region_cluster_map.get(region) + if cluster is None: + raise ComputeError(f"Unknown region: {region!r}") + api = client.CoreV1Api(cluster.api_client) + namespace = cluster.namespace + + pvc_name = volume.volume_id + assert pvc_name is not None + + pvc = call_api_method( + api.delete_namespaced_persistent_volume_claim, + expected=404, + namespace=namespace, + name=pvc_name, + ) + if pvc is None: + logger.debug("PVC %s for volume %s not found", pvc_name, volume.name) + else: + logger.debug("Deleted PVC %s for volume %s", pvc_name, volume.name) + + +def _get_pod_spec_parameters_for_gpu( + api: client.CoreV1Api, gpu_spec: GPUSpec +) -> tuple[str, client.V1NodeAffinity, str]: + nodes = api.list_node().items + gpu_vendor = gpu_spec.vendor + # If no vendor specified, we assume it's NVIDIA. Technically, it's possible to request either + # NVIDIA or AMD in the run configuration using only GPU names (e.g.,`gpu: H100,MI300X:8`), + # but we ignore such configurations as it's hard to translate them to K8s request. 
+ if gpu_vendor is None or gpu_vendor == AcceleratorVendor.NVIDIA: + node_affinity = _get_nvidia_gpu_node_affinity(gpu_spec, nodes) + return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT + if gpu_vendor == AcceleratorVendor.AMD: + node_affinity = _get_amd_gpu_node_affinity(gpu_spec, nodes) + return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT + raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}") + + +def _get_nvidia_gpu_node_affinity( + gpu_spec: GPUSpec, nodes: list[client.V1Node] +) -> client.V1NodeAffinity: + matching_gpu_label_values: set[str] = set() + for node in nodes: + labels = get_node_labels(node) + gpu = get_nvidia_gpu_from_node_labels(labels) + if gpu is not None and _gpu_matches_gpu_spec(gpu, gpu_spec): + matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL]) + if not matching_gpu_label_values: + raise ComputeError( + f"NVIDIA GPU is requested but no matching GPU labels found: {gpu_spec=}" + ) + logger.debug( + "Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu_spec.name ) - _add_authorized_key_to_jump_pod( - jump_pod_host=jump_pod_host, - jump_pod_port=jump_pod_port, - ssh_private_key=project_ssh_private_key, - ssh_authorized_key=user_ssh_public_key, + return client.V1NodeAffinity( + required_during_scheduling_ignored_during_execution=client.V1NodeSelector( + node_selector_terms=[ + client.V1NodeSelectorTerm( + match_expressions=[ + client.V1NodeSelectorRequirement( + key=NVIDIA_GPU_PRODUCT_LABEL, + operator=Operator.IN, + values=list(matching_gpu_label_values), + ), + ], + ), + ], + ), ) +def _get_amd_gpu_node_affinity( + gpu_spec: GPUSpec, nodes: list[client.V1Node] +) -> client.V1NodeAffinity: + matching_device_ids: set[int] = set() + for node in nodes: + labels = get_node_labels(node) + gpu = get_amd_gpu_from_node_labels(labels) + if gpu is not None and _gpu_matches_gpu_spec(gpu, gpu_spec): + matching_device_ids.update(AMD_GPU_NAME_TO_DEVICE_IDS[gpu.name]) + return 
client.V1NodeAffinity( + required_during_scheduling_ignored_during_execution=client.V1NodeSelector( + node_selector_terms=[ + client.V1NodeSelectorTerm( + match_expressions=[ + client.V1NodeSelectorRequirement( + key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}", + operator=Operator.EXISTS, + ), + ], + ) + for device_id in matching_device_ids + ], + ), + ) + + +def _gpu_matches_gpu_spec(gpu: Gpu, gpu_spec: GPUSpec) -> bool: + if gpu_spec.vendor is not None and gpu.vendor != gpu_spec.vendor: + return False + if gpu_spec.name is not None and gpu.name.lower() not in map(str.lower, gpu_spec.name): + return False + if gpu_spec.memory is not None: + min_memory_gib = gpu_spec.memory.min + if min_memory_gib is not None and gpu.memory_mib < min_memory_gib * 1024: + return False + max_memory_gib = gpu_spec.memory.max + if max_memory_gib is not None and gpu.memory_mib > max_memory_gib * 1024: + return False + if gpu_spec.compute_capability is not None: + if gpu.vendor != AcceleratorVendor.NVIDIA: + return False + gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO.get(gpu.name) + if gpu_info is None: + return False + if gpu_info.compute_capability < gpu_spec.compute_capability: + return False + return True + + def _create_jump_pod_service_if_not_exists( api: client.CoreV1Api, + namespace: str, project_name: str, - project_ssh_public_key: str, + jump_pod_name: str, + jump_pod_service_name: str, jump_pod_port: Optional[int], -) -> int: - try: - service = api.read_namespaced_service( - name=_get_jump_pod_service_name(project_name), - namespace=DEFAULT_NAMESPACE, + project_ssh_public_key: str, +) -> None: + base_labels = build_base_labels( + component="ssh-proxy", + unique_name=jump_pod_name, + project=project_name, + ) + labels = filter_invalid_labels(base_labels) + + service: Optional[client.V1Service] = None + pod: Optional[client.V1Pod] = None + _namespace = call_api_method( + api.read_namespace, + expected=404, + name=namespace, + ) + if _namespace is None: + _namespace = 
client.V1Namespace( + metadata=client.V1ObjectMeta( + name=namespace, + ), + ) + api.create_namespace(body=_namespace) + else: + service = call_api_method( + api.read_namespaced_service, + expected=404, + name=jump_pod_service_name, + namespace=namespace, + ) + pod = call_api_method( + api.read_namespaced_pod, + expected=404, + name=jump_pod_name, + namespace=namespace, ) - except client.ApiException as e: - if e.status == 404: - service = _create_jump_pod_service( - api=api, - project_name=project_name, - project_ssh_public_key=project_ssh_public_key, - jump_pod_port=jump_pod_port, - ) - else: - raise - return service.spec.ports[0].node_port + # The service may exist without the pod if the node on which the jump pod was running + # has been deleted. + if service is not None and pod is not None: + return -def _create_jump_pod_service( - api: client.CoreV1Api, - project_name: str, - project_ssh_public_key: str, - jump_pod_port: Optional[int], -) -> client.V1Service: - # TODO use restricted ssh-forwarding-only user for jump pod instead of root. + call_api_method( + api.delete_namespaced_pod, + expected=404, + namespace=namespace, + name=jump_pod_name, + ) + # False if we found at least one node without any "hard" taint, that is, if we don't need to + # specify the toleration. + toleration_required = True + # (key, effect) pairs. + tolerated_taints: set[tuple[str, str]] = set() + for node in api.list_node().items: + if (node_spec := node.spec) is None: + continue + # True if the node has at least one NoExecute or NoSchedule taint. + has_hard_taint = False + taints = node_spec.taints or [] + for taint in taints: + # A "soft" taint, ignore. 
+ if not is_hard_taint(taint): + continue + has_hard_taint = True + if is_taint_tolerated(taint): + tolerated_taints.add((taint.key, taint.effect)) + if not has_hard_taint: + toleration_required = False + break + tolerations: list[client.V1Toleration] = [] + if toleration_required: + for key, effect in tolerated_taints: + tolerations.append( + client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect) + ) + if not tolerations: + logger.warning("No appropriate node found, the jump pod may never be scheduled") commands = _get_jump_pod_commands(authorized_keys=[project_ssh_public_key]) - pod_name = _get_jump_pod_name(project_name) - api.create_namespaced_pod( - namespace=DEFAULT_NAMESPACE, - body=client.V1Pod( - metadata=client.V1ObjectMeta( - name=pod_name, - labels={"app.kubernetes.io/name": pod_name}, - ), - spec=client.V1PodSpec( - containers=[ - client.V1Container( - name=f"{pod_name}-container", - # TODO: Choose appropriate image for jump pod - image="dstackai/base:py3.11-0.4rc4", - command=["/bin/sh"], - args=["-c", " && ".join(commands)], - ports=[ - client.V1ContainerPort( - container_port=JUMP_POD_SSH_PORT, - ) - ], - ) - ] - ), + pod = client.V1Pod( + metadata=client.V1ObjectMeta( + name=jump_pod_name, + labels=labels, + ), + spec=client.V1PodSpec( + containers=[ + client.V1Container( + name=f"{jump_pod_name}-container", + image=JUMP_POD_IMAGE, + command=["/bin/sh"], + args=["-c", " && ".join(commands)], + ports=[ + client.V1ContainerPort( + container_port=JUMP_POD_SSH_PORT, + ) + ], + ) + ], + tolerations=tolerations, ), ) - service_response = api.create_namespaced_service( - namespace=DEFAULT_NAMESPACE, - body=client.V1Service( - metadata=client.V1ObjectMeta(name=_get_jump_pod_service_name(project_name)), - spec=client.V1ServiceSpec( - type="NodePort", - selector={"app.kubernetes.io/name": pod_name}, - ports=[ - client.V1ServicePort( - port=JUMP_POD_SSH_PORT, - target_port=JUMP_POD_SSH_PORT, - node_port=jump_pod_port, - ) - ], - ), + 
api.create_namespaced_pod( + namespace=namespace, + body=pod, + ) + call_api_method( + api.delete_namespaced_service, + expected=404, + namespace=namespace, + name=jump_pod_service_name, + ) + service = client.V1Service( + metadata=client.V1ObjectMeta( + name=jump_pod_service_name, + labels=labels, + ), + spec=client.V1ServiceSpec( + type="NodePort", + selector=_build_service_selector_from_labels(base_labels), + ports=[ + client.V1ServicePort( + port=JUMP_POD_SSH_PORT, + target_port=JUMP_POD_SSH_PORT, + node_port=jump_pod_port, + ) + ], ), ) - return service_response + api.create_namespaced_service( + namespace=namespace, + body=service, + ) -def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]: +def _check_and_configure_jump_pod_service( + api: client.CoreV1Api, + namespace: str, + jump_pod_name: str, + jump_pod_service_name: str, + jump_pod_hostname: Optional[str], + project_ssh_private_key: str, + user_ssh_public_key: str, +) -> Optional[SSHConnectionParams]: + jump_pod = api.read_namespaced_pod( + namespace=namespace, + name=jump_pod_name, + ) + jump_pod_phase = PodPhase(get_or_error(get_or_error(jump_pod.status).phase)) + if jump_pod_phase.is_finished(): + raise ProvisioningError(f"Jump pod {jump_pod_name} is unexpectedly finished") + if not jump_pod_phase.is_running(): + logger.debug("Jump pod %s is not running yet", jump_pod_name) + return None + + if jump_pod_hostname is None: + jump_pod_node_name = get_or_error(get_or_error(jump_pod.spec).node_name) + cluster_external_ips: list[str] = [] + for node in api.list_node().items: + node_external_ips = [ + node_address.address + for node_address in get_or_error(get_or_error(node.status).addresses) + if node_address.type == "ExternalIP" + ] + if node_external_ips: + if get_node_name(node) == jump_pod_node_name: + jump_pod_hostname = node_external_ips[0] + break + cluster_external_ips.extend(node_external_ips) + if jump_pod_hostname is None: + if not cluster_external_ips: + raise ProvisioningError( + 
"Failed to acquire an IP for jump pod automatically." + " Specify proxy_jump.hostname for Kubernetes backend." + ) + jump_pod_hostname = random.choice(cluster_external_ips) + logger.info( + ( + "Jump pod %s is running on node %s which has no external IP," + " picking a random external IP: %s" + ), + jump_pod_name, + jump_pod_node_name, + jump_pod_hostname, + ) + + jump_pod_service = api.read_namespaced_service( + name=jump_pod_service_name, + namespace=namespace, + ) + jump_pod_service_ports = get_or_error(jump_pod_service.spec).ports + if not jump_pod_service_ports: + raise ProvisioningError("Jump pod service %s ports are empty", jump_pod_service_name) + if (jump_pod_port := jump_pod_service_ports[0].node_port) is None: + raise ProvisioningError("Jump pod service %s port is not set", jump_pod_service_name) + + ssh_exit_status, ssh_output = _run_ssh_command( + hostname=jump_pod_hostname, + port=jump_pod_port, + username=JUMP_POD_USER, + ssh_private_key=project_ssh_private_key, + # command= in authorized_keys is equivalent to ForceCommand in sshd_config + # By forcing the /bin/false command we only allow proxy jumping, no shell access + command=f""" + if grep -qvF '{user_ssh_public_key}' ~/.ssh/authorized_keys; then + echo 'command="/bin/false" {user_ssh_public_key}' >> ~/.ssh/authorized_keys + fi + """, + ) + if ssh_exit_status != 0: + logger.debug( + "Jump pod %s @ %s:%d, SSH command failed, exit status: %d, output: %s", + jump_pod_name, + jump_pod_hostname, + jump_pod_port, + ssh_exit_status, + ssh_output, + ) + return None + + logger.debug( + "Jump pod %s is available @ %s:%d", + jump_pod_name, + jump_pod_hostname, + jump_pod_port, + ) + return SSHConnectionParams( + hostname=jump_pod_hostname, + port=jump_pod_port, + username=JUMP_POD_USER, + ) + + +def _get_jump_pod_commands(authorized_keys: list[str]) -> list[str]: authorized_keys_content = "\n".join(authorized_keys).strip() commands = [ - # prohibit password authentication - 'sed -i 
"s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config', - # create ssh dirs and add public key - "mkdir -p /run/sshd ~/.ssh", + "mkdir -p ~/.ssh", "chmod 700 ~/.ssh", f"echo '{authorized_keys_content}' > ~/.ssh/authorized_keys", "chmod 600 ~/.ssh/authorized_keys", @@ -450,51 +1076,305 @@ def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]: "rm -rf /etc/ssh/ssh_host_*", "ssh-keygen -A > /dev/null", # start sshd - f"/usr/sbin/sshd -p {JUMP_POD_SSH_PORT} -o PermitUserEnvironment=yes", - "sleep infinity", + ( + f"/usr/sbin/sshd -D -e -p {JUMP_POD_SSH_PORT}" + " -o LogLevel=ERROR" + " -o PasswordAuthentication=no" + " -o AllowTcpForwarding=local" + ), ] return commands -def _wait_for_pod_ready( +def _create_registry_auth_secret( + api: client.CoreV1Api, + namespace: str, + labels: dict[str, str], + secret_name: str, + image_name: str, + username: str, + password: str, +) -> None: + dockerconfigjson = build_dockerconfigjson( + image_name=image_name, + username=username, + password=password, + ) + secret = client.V1Secret( + metadata=client.V1ObjectMeta( + name=secret_name, + labels=labels, + ), + type="kubernetes.io/dockerconfigjson", + string_data={".dockerconfigjson": dockerconfigjson}, + ) + api.create_namespaced_secret( + namespace=namespace, + body=secret, + ) + + +def _create_job_pod( api: client.CoreV1Api, + namespace: str, + labels: dict[str, str], pod_name: str, - timeout_seconds: int = 300, -): - start_time = time.time() - while True: - try: - pod = api.read_namespaced_pod(name=pod_name, namespace=DEFAULT_NAMESPACE) - except client.ApiException as e: - if e.status != 404: - raise + registry_auth_secret_name: Optional[str], + run_spec: RunSpec, + job_spec: JobSpec, + volumes: list[Volume], + authorized_keys: list[str], +) -> None: + resources_requests: dict[str, str] = {} + resources_limits: dict[str, str] = {} + node_affinity: Optional[client.V1NodeAffinity] = None + tolerations: list[client.V1Toleration] = [] + 
volumes_: list[client.V1Volume] = [] + volume_mounts: list[client.V1VolumeMount] = [] + + resources_spec = job_spec.requirements.resources + assert isinstance(resources_spec.cpu, CPUSpec) + if (cpu_min := resources_spec.cpu.count.min) is not None: + resources_requests["cpu"] = str(cpu_min) + if (cpu_max := resources_spec.cpu.count.max) is not None: + resources_limits["cpu"] = str(cpu_max) + if (gpu_spec := resources_spec.gpu) is not None: + if (gpu_request := get_gpu_request_from_gpu_spec(gpu_spec)) > 0: + gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu( + api, gpu_spec + ) + logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_request) + resources_requests[gpu_resource] = str(gpu_request) + # Limit must be set (GPU resources cannot be overcommitted) + # and must be equal to request. + resources_limits[gpu_resource] = str(gpu_request) + # It should be NoSchedule, but we also add NoExecute toleration just in case. + for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]: + tolerations.append( + client.V1Toleration(key=node_taint, operator=Operator.EXISTS, effect=effect) + ) + if (memory_min := resources_spec.memory.min) is not None: + resources_requests["memory"] = format_memory(memory_min) + if (memory_max := resources_spec.memory.max) is not None: + resources_limits["memory"] = format_memory(memory_max) + if (disk_spec := resources_spec.disk) is not None: + if (disk_min := disk_spec.size.min) is not None: + resources_requests["ephemeral-storage"] = format_memory(disk_min) + if (disk_max := disk_spec.size.max) is not None: + resources_limits["ephemeral-storage"] = format_memory(disk_max) + if (shm_size := resources_spec.shm_size) is not None: + shm_volume_name = "dev-shm" + volumes_.append( + client.V1Volume( + name=shm_volume_name, + empty_dir=client.V1EmptyDirVolumeSource( + medium="Memory", + size_limit=format_memory(shm_size), + ), + ) + ) + volume_mounts.append( + client.V1VolumeMount( + name=shm_volume_name, + 
mount_path="/dev/shm", + ) + ) + + volume_name_path_map: dict[str, str] = {} + mount_points = job_spec.volumes + if mount_points is None: + # Legacy JobSpec without volumes + mount_points = run_spec.configuration.volumes + for mount_point in mount_points: + if isinstance(mount_point, VolumeMountPoint): + if isinstance(mount_point.name, str): + volume_names = [mount_point.name] + else: + volume_names = mount_point.name + for volume_name in volume_names: + volume_name_path_map[volume_name] = mount_point.path + elif isinstance(mount_point, InstanceMountPoint): + # "Must be a DNS_LABEL and unique within the pod" + volume_name = generate_unique_name( + prefix="host-path", max_length=OBJECT_NAME_MAX_LENGTH + ) + volumes_.append( + client.V1Volume( + name=volume_name, + host_path=client.V1HostPathVolumeSource( + path=mount_point.instance_path, + type="DirectoryOrCreate", + ), + ), + ) + volume_mounts.append( + client.V1VolumeMount( + name=volume_name, + mount_path=mount_point.path, + ) + ) else: - if pod.status.phase == "Running" and all( - container_status.ready for container_status in pod.status.container_statuses - ): - return True - elapsed_time = time.time() - start_time - if elapsed_time >= timeout_seconds: - logger.warning("Timeout waiting for pod %s to be ready", pod_name) - return False - time.sleep(1) + assert False, f"unexpected mount point: {mount_point!r}" + for volume in volumes: + assert isinstance(volume.configuration, KubernetesVolumeConfiguration) + pvc_name = volume.volume_id + assert pvc_name is not None, f"missing PVC name: {volume!r}" + mount_path = volume_name_path_map.get(volume.name) + assert mount_path is not None, f"missing mount path: {volume!r}" + volume_name = generate_unique_name(prefix="pvc", max_length=OBJECT_NAME_MAX_LENGTH) + volumes_.append( + client.V1Volume( + name=volume_name, + persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( + claim_name=pvc_name, + ), + ), + ) + volume_mounts.append( + client.V1VolumeMount( + 
name=volume_name, + mount_path=mount_path, + read_only=volume.configuration.read_only, + recursive_read_only="IfPossible" if volume.configuration.read_only else None, + ) + ) + + pod = client.V1Pod( + metadata=client.V1ObjectMeta( + name=pod_name, + labels=labels, + ), + spec=client.V1PodSpec( + containers=[ + client.V1Container( + name=f"{pod_name}-container", + image=job_spec.image_name, + command=["/bin/sh"], + args=["-c", " && ".join(get_docker_commands(authorized_keys))], + ports=[ + client.V1ContainerPort( + container_port=DSTACK_RUNNER_SSH_PORT, + ) + ], + security_context=client.V1SecurityContext( + run_as_user=0, + run_as_group=0, + privileged=job_spec.privileged, + capabilities=client.V1Capabilities( + add=[ + # Allow to increase hard resource limits, see getrlimit(2) + "SYS_RESOURCE", + ], + ), + ), + resources=client.V1ResourceRequirements( + requests=resources_requests, + limits=resources_limits, + ), + volume_mounts=volume_mounts, + ) + ], + image_pull_secrets=( + [client.V1LocalObjectReference(name=registry_auth_secret_name)] + if registry_auth_secret_name is not None + else None + ), + affinity=client.V1Affinity( + node_affinity=node_affinity, + ), + tolerations=tolerations, + volumes=volumes_, + ), + ) + api.create_namespaced_pod( + namespace=namespace, + body=pod, + ) + + +def _wait_for_pod_scheduled_or_finished( + api: client.CoreV1Api, + namespace: str, + pod_name: str, + timeout_seconds: int, +) -> tuple[bool, Optional[PodPhase]]: + # We wait until container_statuses is populated rather than checking the PodScheduled + # condition or spec.node_name. container_statuses is set by the kubelet only after it + # has accepted the bound pod and started creating containers, so it implies both that + # the scheduler confirmed capacity and that the assigned node is actually Ready and + # working on the pod. 
+ pod_phase: Optional[PodPhase] = None + # Ensure that API's timeoutSeconds fires earlier than the network timeout, which defaults to + # our custom ApiClient's constructor parameter, see DEFAULT_REQUEST_TIMEOUT + request_timeout = timeout_seconds + 5 + with watch_events( + api.list_namespaced_pod, + namespace=namespace, + field_selector=f"metadata.name={pod_name}", + timeout_seconds=timeout_seconds, + _request_timeout=request_timeout, + ) as event_iter: + for _, pod in event_iter: + pod_status = pod.status + if pod_status is None: + continue + if pod_status.phase is not None: + pod_phase = PodPhase(pod_status.phase) + else: + pod_phase = None + if pod_status.container_statuses is not None: + return True, pod_phase + if pod_phase is not None and pod_phase is not PodPhase.PENDING: + return True, pod_phase + return False, pod_phase + + +def _get_unscheduled_pod_reason_message( + api: client.CoreV1Api, + namespace: str, + pod_name: str, +) -> tuple[Optional[str], Optional[str]]: + pod = call_api_method( + api.read_namespaced_pod, + expected=404, + name=pod_name, + namespace=namespace, + ) + if pod is not None and pod.status is not None and pod.status.conditions: + for cond in pod.status.conditions: + if cond.type == "PodScheduled" and cond.status == "False": + return cond.reason, cond.message + return None, None -def _wait_for_load_balancer_hostname( +def _wait_for_load_balancer_address( api: client.CoreV1Api, + namespace: str, service_name: str, timeout_seconds: int = 120, ) -> Optional[str]: start_time = time.time() while True: - try: - service = api.read_namespaced_service(name=service_name, namespace=DEFAULT_NAMESPACE) - except client.ApiException as e: - if e.status != 404: - raise - else: - if service.status.load_balancer.ingress is not None: - return service.status.load_balancer.ingress[0].hostname + service = call_api_method( + api.read_namespaced_service, + expected=404, + name=service_name, + namespace=namespace, + ) + if ( + service is not None + and 
(service_status := service.status) is not None + and (lb_status := service_status.load_balancer) is not None + and (ingress_points := lb_status.ingress) + ): + ingress_point = ingress_points[0] + # > Hostname is set for load-balancer ingress points that are DNS based (typically + # > AWS load-balancers) + # > IP is set for load-balancer ingress points that are IP based (typically GCE or + # > OpenStack load-balancers) + address = ingress_point.hostname or ingress_point.ip + if address is not None: + return address elapsed_time = time.time() - start_time if elapsed_time >= timeout_seconds: logger.warning("Timeout waiting for load balancer %s to get ip", service_name) @@ -502,33 +1382,19 @@ def _wait_for_load_balancer_hostname( time.sleep(1) -def _add_authorized_key_to_jump_pod( - jump_pod_host: str, - jump_pod_port: int, - ssh_private_key: str, - ssh_authorized_key: str, -): - _run_ssh_command( - hostname=jump_pod_host, - port=jump_pod_port, - ssh_private_key=ssh_private_key, - command=( - f'if grep -qvF "{ssh_authorized_key}" ~/.ssh/authorized_keys; then ' - f"echo {ssh_authorized_key} >> ~/.ssh/authorized_keys; " - "fi" - ), - ) - - -def _get_gateway_commands(authorized_keys: List[str]) -> List[str]: +def _get_gateway_commands( + authorized_keys: List[str], router: Optional[AnyGatewayRouterConfig] = None +) -> List[str]: authorized_keys_content = "\n".join(authorized_keys).strip() - gateway_commands = " && ".join(get_dstack_gateway_commands()) + gateway_commands = " && ".join(get_dstack_gateway_commands(router=router)) + quoted_gateway_commands = shlex.quote(gateway_commands) + commands = [ # install packages "apt-get update && apt-get install -y sudo wget openssh-server nginx python3.10-venv libaugeas0", # install docker-systemctl-replacement "wget https://raw.githubusercontent.com/gdraheim/docker-systemctl-replacement/b18d67e521f0d1cf1d705dbb8e0416bef23e377c/files/docker/systemctl3.py -O /usr/bin/systemctl", - "chmod + 
/usr/bin/systemctl", + "chmod a+rx /usr/bin/systemctl", # install certbot "python3 -m venv /root/certbotvenv/", "/root/certbotvenv/bin/pip install certbot-nginx", @@ -536,8 +1402,7 @@ def _get_gateway_commands(authorized_keys: List[str]) -> List[str]: # prohibit password authentication 'sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config', # set up ubuntu user - "adduser ubuntu", - "usermod -aG sudo ubuntu", + "useradd -mUG sudo ubuntu", "echo 'ubuntu ALL=(ALL:ALL) NOPASSWD: ALL' | tee /etc/sudoers.d/ubuntu", # create ssh dirs and add public key "mkdir -p /run/sshd /home/ubuntu/.ssh", @@ -548,45 +1413,56 @@ def _get_gateway_commands(authorized_keys: List[str]) -> List[str]: # regenerate host keys "rm -rf /etc/ssh/ssh_host_*", "ssh-keygen -A > /dev/null", - # start sshd - "/usr/sbin/sshd -p 22 -o PermitUserEnvironment=yes", - # run gateway - f"su ubuntu -c '{gateway_commands}'", - "sleep infinity", + # install gateway + f"su ubuntu -c {quoted_gateway_commands}", + # start docker-systemctl-replacement as an init replacement (PID 1), which + # - starts and supervises enabled services (sshd, nginx, dstack.gateway) + # - stops running services on SIGTERM (graceful shutdown) + # - reaps orphan processes + # See: https://github.com/gdraheim/docker-systemctl-replacement/blob/b18d67e521f0d1cf1d705dbb8e0416bef23e377c/INIT-DAEMON.md + "exec systemctl default", ] return commands -def _run_ssh_command(hostname: str, port: int, ssh_private_key: str, command: str): +def _run_ssh_command( + hostname: str, port: int, username: str, ssh_private_key: str, command: str +) -> tuple[int, bytes]: with tempfile.NamedTemporaryFile("w+", 0o600) as f: f.write(ssh_private_key) f.flush() - subprocess.run( + proc = subprocess.run( [ "ssh", "-F", "none", "-o", "StrictHostKeyChecking=no", + "-o", + # The same timeout as in core.services.ssh.tunnel.SSH_DEFAULT_OPTIONS, + # which is used, for example, by 
server.services.runner.ssh.runner_ssh_tunnel() + "ConnectTimeout=3", "-i", f.name, "-p", str(port), - f"root@{hostname}", + f"{username}@{hostname}", command, ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, ) + return proc.returncode, proc.stdout -def _get_jump_pod_name(project_name: str) -> str: - return f"{project_name}-ssh-jump-pod" - - -def _get_jump_pod_service_name(project_name: str) -> str: - return f"{project_name}-ssh-jump-pod-service" +def _build_service_selector_from_labels(labels: dict[str, str]) -> dict[str, str]: + label_key = "app.kubernetes.io/instance" + return {label_key: labels[label_key]} def _get_pod_service_name(pod_name: str) -> str: return f"{pod_name}-service" + + +def _get_registry_auth_secret_name(pod_name: str) -> str: + return f"{pod_name}-registry-auth" diff --git a/src/dstack/_internal/core/backends/kubernetes/config.py b/src/dstack/_internal/core/backends/kubernetes/config.py deleted file mode 100644 index 7cf6fd2a04..0000000000 --- a/src/dstack/_internal/core/backends/kubernetes/config.py +++ /dev/null @@ -1,6 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.kubernetes import KubernetesStoredConfig - - -class KubernetesConfig(KubernetesStoredConfig, BackendConfig): - pass diff --git a/src/dstack/_internal/core/backends/kubernetes/configurator.py b/src/dstack/_internal/core/backends/kubernetes/configurator.py new file mode 100644 index 0000000000..b8872f0211 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/configurator.py @@ -0,0 +1,91 @@ +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.kubernetes.backend import KubernetesBackend +from dstack._internal.core.backends.kubernetes.models import ( + KubernetesBackendConfig, + KubernetesBackendConfigWithCreds, + 
KubernetesConfig, + KubernetesStoredConfig, +) +from dstack._internal.core.backends.kubernetes.utils import ( + check_cluster, + get_clusters_from_backend_config, +) +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class KubernetesConfigurator( + Configurator[ + KubernetesBackendConfig, + KubernetesBackendConfigWithCreds, + ] +): + TYPE = BackendType.KUBERNETES + BACKEND_CLASS = KubernetesBackend + + def validate_config( + self, config: KubernetesBackendConfigWithCreds, default_creds_enabled: bool + ): + self._check_config_contexts(config) + try: + clusters = get_clusters_from_backend_config(config, request_timeout=10, retries=0) + except Exception as e: + raise ServerClientError(str(e)) + for cluster in clusters: + if not check_cluster(cluster): + raise_invalid_credentials_error( + fields=[["kubeconfig"]], + details=f"Failed to validate cluster {cluster}", + ) + + def create_backend( + self, project_name: str, config: KubernetesBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=KubernetesStoredConfig.__response__.parse_obj(config).json(), + auth="", + ) + + def get_backend_config_with_creds( + self, record: BackendRecord + ) -> KubernetesBackendConfigWithCreds: + config = self._get_config(record) + return KubernetesBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> KubernetesBackendConfig: + config = self._get_config(record) + return KubernetesBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> KubernetesBackend: + return KubernetesBackend(self._get_config(record)) + + def _get_config(self, record: BackendRecord) -> KubernetesConfig: + return KubernetesConfig.__response__.parse_raw(record.config) + + def _check_config_contexts(self, config: 
KubernetesBackendConfig): + if config.contexts is None: + return + if config.proxy_jump is not None: + raise ServerClientError("proxy_jump must not be set if contexts is set") + if config.namespace is not None: + raise ServerClientError("namespace must not be set if contexts is set") + seen: set[str] = set() + duplicates: set[str] = set() + for context in config.contexts: + if isinstance(context, str): + name = context + else: + name = context.name + if name in seen: + duplicates.add(name) + else: + seen.add(name) + if duplicates: + raise ServerClientError(f"duplicate contexts: {', '.join(sorted(duplicates))}") diff --git a/src/dstack/_internal/core/backends/kubernetes/models.py b/src/dstack/_internal/core/backends/kubernetes/models.py new file mode 100644 index 0000000000..eb92982e45 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/models.py @@ -0,0 +1,111 @@ +from typing import Annotated, Literal, Optional, Union + +from pydantic import Field, root_validator + +from dstack._internal.core.backends.base.models import fill_data +from dstack._internal.core.models.common import CoreModel + + +class KubernetesProxyJumpConfig(CoreModel): + hostname: Annotated[ + Optional[str], Field(description="The external IP address or hostname of any node") + ] = None + port: Annotated[ + Optional[int], Field(description="Any port accessible outside of the cluster") + ] = None + + +class KubernetesContextConfig(CoreModel): + name: Annotated[str, Field(description="The name of the context")] + proxy_jump: Annotated[ + Optional[KubernetesProxyJumpConfig], Field(description="The SSH proxy jump configuration") + ] = None + + +class KubeconfigConfig(CoreModel): + filename: Annotated[str, Field(description="The path to the kubeconfig file")] = "" + data: Annotated[str, Field(description="The contents of the kubeconfig file")] + + +class KubernetesBackendConfig(CoreModel): + type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes" + 
contexts: Annotated[ + Optional[list[Union[KubernetesContextConfig, str]]], + Field( + description=( + "Enabled contexts (clusters). Each context should map to a separate cluster." + " The context name becomes the region name." + " If `contexts` is set, top-level `proxy_jump` and `namespace` must not be set." + " `proxy_jump`, if necessary, should be configured per-context;" + " `namespace` is taken from the corresponding kubeconfig context's property." + " If `contexts` is not set (not recommended), the kubeconfig's `current-context`" + " is used as the only context, with an empty string as the region name" + ), + ), + ] = None + proxy_jump: Annotated[ + Optional[KubernetesProxyJumpConfig], + Field( + description=( + "Only used if `contexts` is not set; must not be set otherwise." + " The SSH proxy jump configuration" + ), + ), + ] = None + namespace: Annotated[ + Optional[str], + Field( + description=( + "Only used if `contexts` is not set; must not be set otherwise." + " The namespace for resources managed by `dstack`." + " If `contexts` is not set, overrides the namespace set in the kubeconfig," + " even if not set. Defaults to `default`." + " Deprecated; will eventually be removed in future versions," + " but in the current version must be set if `contexts` is not set and the value" + " is not equal to `default`." + " Future versions will use the namespace from the kubeconfig instead." + " To prepare for future versions, set the same value in the kubeconfig" + ) + ), + ] = None + """`namespace` is formally deprecated since 0.20.20 but still used. 
Future versions will switch + to namespace from kubeconfig context, which is currently ignored""" + + +class KubernetesBackendConfigWithCreds(KubernetesBackendConfig): + kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")] + + +class KubeconfigFileConfig(CoreModel): + filename: Annotated[str, Field(description="The path to the kubeconfig file")] = "" + data: Annotated[ + Optional[str], + Field( + description=( + "The contents of the kubeconfig file." + " When configuring via `server/config.yml`, it's automatically filled from `filename`." + " When configuring via UI, it has to be specified explicitly" + ) + ), + ] = None + + @root_validator + def fill_data(cls, values: dict) -> dict: + if values.get("filename") == "" and values.get("data") is None: + raise ValueError("filename or data must be specified") + return fill_data(values) + + +class KubernetesBackendFileConfigWithCreds(KubernetesBackendConfig): + kubeconfig: Annotated[KubeconfigFileConfig, Field(description="The kubeconfig configuration")] + + +AnyKubernetesBackendConfig = Union[KubernetesBackendConfig, KubernetesBackendConfigWithCreds] + + +class KubernetesStoredConfig(KubernetesBackendConfigWithCreds): + pass + + +class KubernetesConfig(KubernetesStoredConfig): + pass diff --git a/src/dstack/_internal/core/backends/kubernetes/resources.py b/src/dstack/_internal/core/backends/kubernetes/resources.py new file mode 100644 index 0000000000..2d3ff02de3 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/resources.py @@ -0,0 +1,463 @@ +import base64 +import dataclasses +import json +import re +from collections.abc import Mapping +from decimal import Decimal +from enum import Enum +from typing import Callable, Literal, Optional, Union, cast + +from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor + +# XXX: kubernetes.utils is missing in the stubs package +from kubernetes import utils as _kubernetes_utils # pyright: 
ignore[reportAttributeAccessIssue] +from kubernetes.client import CoreV1Api, V1Node, V1Taint +from typing_extensions import Self + +from dstack._internal.core.backends.base.compute import normalize_arch +from dstack._internal.core.backends.base.offers import filter_offers_by_requirements +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceRuntime, + InstanceType, + Resources, +) +from dstack._internal.core.models.resources import CPUSpec, GPUSpec, Memory +from dstack._internal.core.models.runs import Requirements +from dstack._internal.utils import docker as docker_utils +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names +OBJECT_NAME_MAX_LENGTH = 253 + +# https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set +LABEL_KEY_PREFIX_MAX_LENGTH = 253 +LABEL_KEY_PREFIX_REGEX = re.compile( + r"^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$" +) +LABEL_KEY_NAME_MAX_LENGTH = 63 +LABEL_KEY_NAME_REGEX = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9_.-]*[A-Za-z0-9])?$") +LABEL_VALUE_MAX_LENGTH = 63 +LABEL_VALUE_REGEX = re.compile(r"^(?:[A-Za-z0-9](?:[A-Za-z0-9_.-]*[A-Za-z0-9])?)?$") + +NVIDIA_GPU_RESOURCE = "nvidia.com/gpu" +NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE +NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product" + +AMD_GPU_RESOURCE = "amd.com/gpu" +AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE +# The oldest but still supported label format, the safest option, see the commit message: +# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48 +# E.g., 
beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs +# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue: +# https://github.com/ROCm/k8s-device-plugin/issues/112 +AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id." + +NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS} +NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys() + +AMD_GPU_DEVICE_ID_TO_GPU_INFO = { + device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids +} +AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS} + + +class PodPhase(str, Enum): + PENDING = "Pending" + RUNNING = "Running" + SUCCEEDED = "Succeeded" + FAILED = "Failed" + UNKNOWN = "Unknown" # Deprecated: It isn't being set since 2015 + + def is_finished(self): + return self in [self.SUCCEEDED, self.FAILED] + + def is_running(self): + return self == self.RUNNING + + +class TaintEffect(str, Enum): + NO_EXECUTE = "NoExecute" + NO_SCHEDULE = "NoSchedule" + PREFER_NO_SCHEDULE = "PreferNoSchedule" + + +class KubernetesResource(str, Enum): + CPU = "cpu" + MEMORY = "memory" + EPHEMERAL_STORAGE = "ephemeral-storage" + NVIDIA_GPU = NVIDIA_GPU_RESOURCE + AMD_GPU = AMD_GPU_RESOURCE + + +@dataclasses.dataclass +class KubernetesResources: + cpu: Decimal = Decimal("0") + memory: Decimal = Decimal("0") + ephemeral_storage: Decimal = Decimal("0") + nvidia_gpu: Decimal = Decimal("0") + amd_gpu: Decimal = Decimal("0") + + @classmethod + def from_kubernetes_map(cls, map_: Mapping[str, str]) -> Self: + dct: dict[str, Decimal] = {} + for resource in KubernetesResource: + if (qty := map_.get(resource.value)) is not None: + dct[resource.name.lower()] = parse_quantity(qty) + return cls(**dct) + + def __getitem__(self, key: str) -> Decimal: + try: + resource = KubernetesResource(key) + except ValueError: + raise KeyError(key) + return getattr(self, 
resource.name.lower()) + + def __add__(self, other: Self) -> Self: + dct: dict[str, Decimal] = dataclasses.asdict(self) + qty: Decimal + for field, qty in dataclasses.asdict(other).items(): + dct[field] += qty + return type(self)(**dct) + + def __sub__(self, other: Self) -> Self: + dct: dict[str, Decimal] = dataclasses.asdict(self) + qty: Decimal + for field, qty in dataclasses.asdict(other).items(): + dct[field] -= qty + return type(self)(**dct) + + +def build_base_labels( + *, + component: Literal["ssh-proxy", "job", "gateway", "volume"], + unique_name: str, + project: str, + name: Optional[str] = None, + user: Optional[str] = None, +) -> dict[str, str]: + labels = { + "app.kubernetes.io/name": f"dstack-{component}", + # app.kubernetes.io/component would be redundant as app.kubernetes.io/name already includes + # it with dstack- prefix + "app.kubernetes.io/instance": unique_name, + "app.kubernetes.io/managed-by": "dstack", + "k8s.dstack.ai/project": project, + } + if name is not None: + labels["k8s.dstack.ai/name"] = name + if user is not None: + labels["k8s.dstack.ai/user"] = user + return labels + + +def filter_invalid_labels(labels: dict[str, str]) -> dict[str, str]: + filtered_labels: dict[str, str] = {} + for k, v in labels.items(): + try: + validate_label_key(k) + validate_label_value(v) + except ValueError as e: + logger.warning("Skipping invalid label %s=%s: %s", k, v, e) + continue + filtered_labels[k] = v + return filtered_labels + + +def validate_label_key(key: str) -> None: + parts = key.split("/") + if len(parts) > 2: + raise ValueError("Too many segments") + name: str + if len(parts) == 2: + prefix, name = parts + if not prefix: + raise ValueError("Empty prefix") + if len(prefix) > LABEL_KEY_PREFIX_MAX_LENGTH: + raise ValueError("Prefix too long") + if LABEL_KEY_PREFIX_REGEX.fullmatch(prefix) is None: + raise ValueError("Invalid prefix") + else: + name = parts[0] + if not name: + raise ValueError("Empty name") + if len(name) > 
LABEL_KEY_NAME_MAX_LENGTH: + raise ValueError("Name too long") + if LABEL_KEY_NAME_REGEX.fullmatch(name) is None: + raise ValueError("Invalid name") + + +def validate_label_value(value: str) -> None: + if len(value) > LABEL_VALUE_MAX_LENGTH: + raise ValueError("Value too long") + if LABEL_VALUE_REGEX.fullmatch(value) is None: + raise ValueError("Invalid value") + + +def build_dockerconfigjson(image_name: str, username: str, password: str) -> str: + registry = docker_utils.parse_image_name(image_name).registry + if registry is None or docker_utils.is_default_registry(registry): + # https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + # > Use https://index.docker.io/v1/ for DockerHub + registry = "https://index.docker.io/v1/" + auth = base64.b64encode(f"{username}:{password}".encode()).decode() + entry = { + "username": username, + "password": password, + "auth": auth, + } + return json.dumps({"auths": {registry: entry}}) + + +parse_quantity = cast( + Callable[[Union[str, int, float, Decimal]], Decimal], _kubernetes_utils.parse_quantity +) + + +def format_memory(memory: Memory) -> str: + return f"{float(memory)}Gi" + + +def get_gpu_request_from_gpu_spec(gpu_spec: GPUSpec) -> int: + return gpu_spec.count.min or 0 + + +def get_node_name(node: V1Node) -> Optional[str]: + if (metadata := node.metadata) is None: + return None + return metadata.name + + +def get_node_labels(node: V1Node) -> dict[str, str]: + if (metadata := node.metadata) is None: + return {} + if (labels := metadata.labels) is None: + return {} + return labels + + +def is_hard_taint(taint: V1Taint) -> bool: + try: + taint_effect = TaintEffect(taint.effect) + except ValueError: + logger.warning( + "Unexpected taint %s=%s effect: %s", taint.key, taint.value or "", taint.effect + ) + return True + return taint_effect is not TaintEffect.PREFER_NO_SCHEDULE + + +def is_taint_tolerated(taint: 
V1Taint) -> bool: + return taint.key in (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT) + + +def get_instance_offers( + api: CoreV1Api, region: str, requirements: Requirements +) -> list[InstanceOfferWithAvailability]: + resources_spec = requirements.resources + assert isinstance(resources_spec.cpu, CPUSpec) + cpu_request = resources_spec.cpu.count.min or 0 + memory_mib_request = round((resources_spec.memory.min or 0) * 1024) + gpu_request = 0 + if (gpu_spec := resources_spec.gpu) is not None: + gpu_request = get_gpu_request_from_gpu_spec(gpu_spec) + disk_mib_request = 0 + if (disk_spec := resources_spec.disk) is not None: + disk_mib_request = round((disk_spec.size.min or 0) * 1024) + + nodes_allocated_resources = _get_nodes_allocated_resources(api) + offers: list[InstanceOfferWithAvailability] = [] + for node in api.list_node().items: + if (node_name := get_node_name(node)) is None: + continue + offer = _get_instance_offer_from_node( + node=node, + node_name=node_name, + node_allocated_resources=nodes_allocated_resources.get(node_name), + region=region, + cpu_request=cpu_request, + memory_mib_request=memory_mib_request, + gpu_request=gpu_request, + disk_mib_request=disk_mib_request, + ) + if offer is not None: + offers.extend(filter_offers_by_requirements([offer], requirements)) + return offers + + +def get_instance_offer_from_node( + node: V1Node, + *, + region: str, + cpu_request: int, + memory_mib_request: int, + gpu_request: int, + disk_mib_request: int, +) -> Optional[InstanceOfferWithAvailability]: + node_name = get_node_name(node) + if node_name is None: + return None + return _get_instance_offer_from_node( + node=node, + node_name=node_name, + node_allocated_resources=None, + region=region, + cpu_request=cpu_request, + memory_mib_request=memory_mib_request, + gpu_request=gpu_request, + disk_mib_request=disk_mib_request, + ) + + +def get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: + # We rely on 
https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery + # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or + # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB". + # Thus, we convert the product name to a known gpu name. + gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL) + if gpu_product is None: + return None + gpu_product = gpu_product.replace("RTX-", "RTX") + for gpu_name in NVIDIA_GPU_NAMES: + if gpu_name.lower() in gpu_product.lower().split("-"): + break + else: + return None + gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name] + gpu_memory = gpu_info.memory * 1024 + # A100 may come in two variants + if "40GB" in gpu_product: + gpu_memory = 40 * 1024 + return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory) + + +def get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: + # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs + gpus: set[tuple[str, int]] = set() + for label in labels: + if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX): + continue + _, _, _device_id = label.rpartition(".") + device_id = int(_device_id, 16) + gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id) + if gpu_info is None: + logger.warning("Unknown AMD GPU device id: %X", device_id) + continue + gpus.add((gpu_info.name, gpu_info.memory)) + if not gpus: + return None + if len(gpus) == 1: + gpu_name, gpu_memory_gib = next(iter(gpus)) + return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024) + logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus) + return None + + +def _get_instance_offer_from_node( + node: V1Node, + node_name: str, + node_allocated_resources: Optional[KubernetesResources], + region: str, + cpu_request: int, + memory_mib_request: int, + gpu_request: int, + disk_mib_request: int, +) -> Optional[InstanceOfferWithAvailability]: + try: + node_spec = 
get_or_error(node.spec) + if any(is_hard_taint(t) and not is_taint_tolerated(t) for t in node_spec.taints or []): + logger.debug("Node %s: untolerated taint(s) found, skipping", node_name) + return None + node_status = get_or_error(node.status) + allocatable = get_or_error(node_status.allocatable) + _cpu_arch: Optional[str] = None + if node_status.node_info is not None: + _cpu_arch = node_status.node_info.architecture + cpu_arch = normalize_arch(_cpu_arch).to_cpu_architecture() + except ValueError as e: + logger.exception("Failed to process node %s: %s: %s", node_name, type(e).__name__, e) + return None + + node_resources = KubernetesResources.from_kubernetes_map(allocatable) + if node_allocated_resources is not None: + node_resources = node_resources - node_allocated_resources + cpu = max(0, int(node_resources.cpu)) + memory_mib = max(0, int(node_resources.memory / 2**20)) + disk_mib = max(0, int(node_resources.ephemeral_storage / 2**20)) + gpus = _get_gpus_from_node(node, node_name, node_resources) + + return InstanceOfferWithAvailability( + backend=BackendType.KUBERNETES, + instance=InstanceType( + name=node_name, + resources=Resources( + cpus=min(cpu_request, cpu), + cpu_arch=cpu_arch, + memory_mib=min(memory_mib_request, memory_mib), + gpus=gpus[:gpu_request], + disk=Disk(size_mib=min(disk_mib_request, disk_mib)), + spot=False, + ), + ), + price=0, + region=region, + availability=InstanceAvailability.AVAILABLE, + instance_runtime=InstanceRuntime.RUNNER, + ) + + +def _get_gpus_from_node( + node: V1Node, node_name: str, node_resources: KubernetesResources +) -> list[Gpu]: + labels = get_node_labels(node) + for gpu_resource, gpu_getter in ( + (NVIDIA_GPU_RESOURCE, get_nvidia_gpu_from_node_labels), + (AMD_GPU_RESOURCE, get_amd_gpu_from_node_labels), + ): + gpu_count = int(node_resources[gpu_resource]) + if gpu_count < 1: + continue + gpu = gpu_getter(labels) + if gpu is None: + logger.warning( + "Node %s: GPU resource found, but failed to detect its model: %s=%d", 
+ node_name, + gpu_resource, + gpu_count, + ) + return [] + return [gpu] * gpu_count + logger.debug("Node %s: no available GPU resource found", node_name) + return [] + + +def _get_nodes_allocated_resources(api: CoreV1Api) -> dict[str, KubernetesResources]: + nodes_allocated_resources: dict[str, KubernetesResources] = {} + for pod in api.list_pod_for_all_namespaces().items: + pod_status = get_or_error(pod.status) + pod_phase = PodPhase(get_or_error(pod_status.phase)) + if pod_phase.is_finished(): + continue + pod_spec = get_or_error(pod.spec) + node_name = pod_spec.node_name + if node_name is None: + continue + pod_requests = KubernetesResources() + # TODO: Should we also check PodSpec.resources? As of 2026-01-21, it's in alpha + for container in pod_spec.containers: + if container.resources is not None and container.resources.requests: + pod_requests += KubernetesResources.from_kubernetes_map( + container.resources.requests + ) + try: + nodes_allocated_resources[node_name] += pod_requests + except KeyError: + nodes_allocated_resources[node_name] = pod_requests + return nodes_allocated_resources diff --git a/src/dstack/_internal/core/backends/kubernetes/utils.py b/src/dstack/_internal/core/backends/kubernetes/utils.py index b4a19fd448..a90f750f1b 100644 --- a/src/dstack/_internal/core/backends/kubernetes/utils.py +++ b/src/dstack/_internal/core/backends/kubernetes/utils.py @@ -1,40 +1,355 @@ -from typing import Dict, List, Optional +from collections.abc import Generator +from contextlib import contextmanager +from dataclasses import dataclass +from typing import ( + Annotated, + Any, + Callable, + Generic, + Literal, + Optional, + Protocol, + TypeVar, + Union, + cast, +) +from uuid import UUID -import kubernetes import yaml +from cachetools import TTLCache +from kubernetes.client import V1Status, VersionApi +from kubernetes.client.exceptions import ApiException +from kubernetes.watch import Watch +from pydantic import Field +from typing_extensions import ParamSpec, 
TypedDict +from dstack._internal.core.backends.kubernetes.api_client import ( + API_CLIENT_EXCEPTIONS, + ApiClient, + get_api_client_from_kubeconfig_dict, +) +from dstack._internal.core.backends.kubernetes.models import ( + KubernetesBackendConfigWithCreds, + KubernetesProxyJumpConfig, +) +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import InstanceOffer +from dstack._internal.core.models.runs import Job, Run +from dstack._internal.utils.logging import get_logger -def get_api_from_config_data(kubeconfig_data: str) -> kubernetes.client.CoreV1Api: - config_dict = yaml.load(kubeconfig_data, yaml.FullLoader) - return get_api_from_config_dict(config_dict) +logger = get_logger(__name__) +T = TypeVar("T") +P = ParamSpec("P") -def get_api_from_config_dict(kubeconfig: Dict) -> kubernetes.client.CoreV1Api: - api_client = kubernetes.config.new_client_from_config_dict(config_dict=kubeconfig) - return kubernetes.client.CoreV1Api(api_client=api_client) +LEGACY_CURRENT_CONTEXT_REGION = "" -def get_cluster_public_ip(api_client: kubernetes.client.CoreV1Api) -> Optional[str]: + +@dataclass +class Cluster: + context_name: str + region: str + api_client: ApiClient + namespace: str + proxy_jump: KubernetesProxyJumpConfig + + def __str__(self) -> str: + parts: list[str] = [] + parts.append(f"context={self.context_name!r}") + if self.context_name != self.region: + parts.append(f"region={self.region!r}") + return f"({' '.join(parts)})" + + def __repr__(self) -> str: + return f"{self.__class__.__name__}{self}" + + +def check_cluster(cluster: Cluster) -> bool: + version_api = VersionApi(cluster.api_client) + try: + version_info = version_api.get_code() + except API_CLIENT_EXCEPTIONS as e: + logger.debug("cluster %s check failed: %s: %s", cluster, e.__class__.__name__, e) + return False + logger.debug("cluster %s gitVersion: %s", cluster, version_info.git_version) + return True + + +def get_clusters_from_backend_config( + config: 
KubernetesBackendConfigWithCreds, + *, + request_timeout: Optional[int] = None, + retries: Optional[int] = None, +) -> list[Cluster]: + clusters: list[Cluster] = [] + kubeconfig_dict = kubeconfig_data_to_kubeconfig_dict(config.kubeconfig.data) + kubeconfig = kubeconfig_dict_to_kubeconfig(kubeconfig_dict) + if config.contexts is not None: + for context in config.contexts: + if isinstance(context, str): + context_name = context + proxy_jump = None + else: + context_name = context.name + proxy_jump = context.proxy_jump + kubeconfig_context = kubeconfig.get_context(context_name) + api_client = get_api_client_from_kubeconfig_dict( + kubeconfig_dict, + context=context_name, + request_timeout=request_timeout, + retries=retries, + ) + namespace = kubeconfig_context.namespace + if proxy_jump is None: + proxy_jump = KubernetesProxyJumpConfig() + clusters.append( + Cluster( + context_name=context_name, + region=context_name, + api_client=api_client, + namespace=namespace, + proxy_jump=proxy_jump, + ) + ) + else: + current_kubeconfig_context = kubeconfig.get_context() + context_name = kubeconfig.current_context + # Already checked by Kubeconfig.get_context() + assert context_name is not None + api_client = get_api_client_from_kubeconfig_dict( + kubeconfig_dict, + context=context_name, + request_timeout=request_timeout, + retries=retries, + ) + config_namespace = config.namespace + if config_namespace is None: + config_namespace = "default" + context_namespace = current_kubeconfig_context.namespace + if context_namespace != config_namespace: + logger.warning( + ( + "Namespace mismatch: kubeconfig -> '%s', backend config -> '%s'." + " The current dstack version ignores kubeconfig" + " and uses deprecated namespace property from backend config." + " Future versions will use namespace from kubeconfig." 
+ " To keep using '%s' namespace in future versions and suppress this warning," + " set namespace to '%s' in kubeconfig context '%s'" + ), + context_namespace, + config_namespace, + config_namespace, + config_namespace, + context_name, + ) + proxy_jump = config.proxy_jump + if proxy_jump is None: + proxy_jump = KubernetesProxyJumpConfig() + clusters.append( + Cluster( + context_name=context_name, + region=LEGACY_CURRENT_CONTEXT_REGION, + api_client=api_client, + # TODO: switch to context_namespace + namespace=config_namespace, + proxy_jump=proxy_jump, + ) + ) + return clusters + + +class KubeconfigContext(CoreModel): + namespace: str = "default" + + +class KubeconfigNamedContext(CoreModel): + name: str + context: KubeconfigContext + + +class Kubeconfig(CoreModel): + """ + `Kubeconfig` model only includes fields used by `dstack`. + Reference: https://kubernetes.io/docs/reference/config-api/kubeconfig.v1/ + """ + + contexts: list[KubeconfigNamedContext] = [] + current_context: Annotated[Optional[str], Field(alias="current-context")] = None + + def get_context(self, name: Optional[str] = None) -> KubeconfigContext: + if name is None: + name = self.current_context + if name is None: + raise ValueError("current-context is not set") + for named_context in self.contexts: + if named_context.name == name: + return named_context.context + raise ValueError(f"context {name} not found") + + +def kubeconfig_data_to_kubeconfig_dict(kubeconfig_data: str) -> dict: + kubeconfig_dict = yaml.load(kubeconfig_data, yaml.FullLoader) + if not isinstance(kubeconfig_dict, dict): + raise TypeError(f"Unexpected kubeconfig_data type: {kubeconfig_dict.__class__.__name__}") + return kubeconfig_dict + + +def kubeconfig_dict_to_kubeconfig(kubeconfig_dict: dict) -> Kubeconfig: + return Kubeconfig.__response__.parse_obj(kubeconfig_dict) + + +class SkipOfferCache: """ - Returns public IP of any cluster node. 
+ `SkipOfferCache` is used to track (run/job, offer) pairs that failed to provision. + + The current implementation tracks _any_ job of the specific run (identified by `Run.id`) + on the specific cluster (identified by `InstanceOffer.region`, that is, a kubeconfig context). """ - public_ips = get_cluster_public_ips(api_client) - if len(public_ips) == 0: - return None - return public_ips[0] + def __init__(self, *, ttl: int, maxsize: int = 1000) -> None: + self._cache = TTLCache[tuple[UUID, str], Literal[True]](maxsize=maxsize, ttl=ttl) + + def add(self, run: Run, job: Job, offer: InstanceOffer) -> None: + self._cache[self._build_key(run, job, offer)] = True + + def check(self, run: Run, job: Job, offer: InstanceOffer) -> bool: + return self._build_key(run, job, offer) in self._cache -def get_cluster_public_ips(api_client: kubernetes.client.CoreV1Api) -> List[str]: + def _build_key(self, run: Run, job: Job, offer: InstanceOffer) -> tuple[UUID, str]: + # The current implementation uses only Run.id ignoring the job/job spec. + # A more sophisticated implementation could use some parts of the job spec + # (e.g., requirements, volumes) instead. + return (run.id, offer.region) + + +def call_api_method( + method: Callable[P, T], + expected: Union[int, tuple[int, ...], list[int]], + *args: P.args, + **kwargs: P.kwargs, +) -> Optional[T]: """ - Returns public IPs of all cluster nodes. + Returns the result of the API method call ignoring specified HTTP status codes. + + If you don't expect any error status code, just call the method directly. + + Args: + method: the `CoreV1Api` bound method. + expected: Expected error statuses, e.g., 404. + args: positional arguments of the method. + kwargs: keyword arguments of the method. + Returns: + The return value or `None` in case of the expected error. 
""" - public_ips = [] - for node in api_client.list_node().items: - addresses = node.status.addresses + if isinstance(expected, int): + expected = (expected,) + try: + return method(*args, **kwargs) + except ApiException as e: + if e.status not in expected: + raise + return None + + +class NamespacedNameMethod(Protocol): + def __call__(self, name: str, namespace: str) -> Any: ... + + +def try_delete_object_if_exists( + method: NamespacedNameMethod, + *, + namespace: str, + name: str, + description: str, + should_delete_manually_if_failed: bool = False, +) -> bool: + try: + call_api_method( + method, + expected=404, + namespace=namespace, + name=name, + ) + except API_CLIENT_EXCEPTIONS as e: + if should_delete_manually_if_failed: + logger.exception( + "Failed to delete %s %s in namespace %s. Please delete it manually", + description, + name, + namespace, + ) + else: + logger.warning( + "Failed to delete %s %s in namespace %s: %s: %s", + description, + name, + namespace, + e.__class__.__name__, + e, + ) + return False + return True + + +class ObjectList(Protocol[T]): + items: list[T] + + +@contextmanager +def watch_events( + method: Callable[P, ObjectList[T]], *args: P.args, **kwargs: P.kwargs +) -> Generator[Generator[tuple[str, T], None, None], None, None]: + watch = Watch() + inner_gen = cast(Generator[_EventDict[T], None, None], watch.stream(method, *args, **kwargs)) + gen = _watch_events_gen(inner_gen) + try: + yield gen + finally: + gen.close() + watch.stop() + + +class _StateEventDict(TypedDict, Generic[T]): + type: Literal["ADDED", "MODIFIED", "DELETED"] + object: T + + +class _BookmarkEventDict(TypedDict, Generic[T]): + type: Literal["BOOKMARK"] + # The object is a minimal instance of the watched resource's type -- same kind and apiVersion, + # but only metadata.resourceVersion is populated. Everything else is empty or zero-valued. 
+ object: T + + +class _ErrorEventDict(TypedDict): + type: Literal["ERROR"] + object: V1Status + + +_EventDict = Union[_StateEventDict[T], _BookmarkEventDict[T], _ErrorEventDict] - # Look for an external IP address - for address in addresses: - if address.type == "ExternalIP": - public_ips.append(address.address) - return public_ips +def _watch_events_gen( + gen: Generator[_EventDict[T], None, None], +) -> Generator[tuple[str, T], None, None]: + try: + for event in gen: + match event["type"]: + case "ADDED" | "MODIFIED" | "DELETED": + yield event["type"], event["object"] + case "BOOKMARK": + pass + case "ERROR": + status = event["object"] + logger.warning( + "Got ERROR event (status=%s reason=%s code=%s): %s", + status.status, + status.reason, + status.code, + status.message, + ) + case _: + logger.warning("Got unexpected event: %s", event) + finally: + gen.close() diff --git a/src/dstack/_internal/core/backends/lambdalabs/__init__.py b/src/dstack/_internal/core/backends/lambdalabs/__init__.py index f5b0e075e0..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/lambdalabs/__init__.py +++ b/src/dstack/_internal/core/backends/lambdalabs/__init__.py @@ -1,16 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.lambdalabs.compute import LambdaCompute -from dstack._internal.core.backends.lambdalabs.config import LambdaConfig -from dstack._internal.core.models.backends.base import BackendType - - -class LambdaBackend(Backend): - TYPE: BackendType = BackendType.LAMBDA - - def __init__(self, config: LambdaConfig): - self.config = config - self._compute = LambdaCompute(self.config) - # self._check_credentials() - - def compute(self) -> LambdaCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/lambdalabs/api_client.py b/src/dstack/_internal/core/backends/lambdalabs/api_client.py index 30ed013625..0da70ff5fc 100644 --- a/src/dstack/_internal/core/backends/lambdalabs/api_client.py +++ 
b/src/dstack/_internal/core/backends/lambdalabs/api_client.py @@ -13,7 +13,7 @@ def validate_api_key(self) -> bool: try: self.list_instance_types() except requests.HTTPError as e: - if e.response.status_code in [401, 403]: + if e.response is not None and e.response.status_code in [401, 403]: return False raise e return True @@ -47,22 +47,22 @@ def launch_instances( "name": name, } resp = self._make_request("POST", "/instance-operations/launch", data) - if resp.ok: - return resp.json()["data"]["instance_ids"] - resp.raise_for_status() + if not resp.ok: + resp.raise_for_status() + return resp.json()["data"]["instance_ids"] def terminate_instances(self, instance_ids: List[str]) -> List[str]: data = {"instance_ids": instance_ids} resp = self._make_request("POST", "/instance-operations/terminate", data) - if resp.ok: - return resp.json()["data"] - resp.raise_for_status() + if not resp.ok: + resp.raise_for_status() + return resp.json()["data"] def list_ssh_keys(self) -> List[Dict]: resp = self._make_request("GET", "/ssh-keys") - if resp.ok: - return resp.json()["data"] - resp.raise_for_status() + if not resp.ok: + resp.raise_for_status() + return resp.json()["data"] def add_ssh_key(self, name: str, public_key: str) -> List[Dict]: data = { @@ -70,17 +70,17 @@ def add_ssh_key(self, name: str, public_key: str) -> List[Dict]: "public_key": public_key, } resp = self._make_request("POST", "/ssh-keys", data) - if resp.ok: - return resp.json()["data"] - resp.raise_for_status() + if not resp.ok: + resp.raise_for_status() + return resp.json()["data"] def _make_request(self, method: str, path: str, data: Any = None): - # TODO: fix S113 by setting an adequate timeout here or in every method - return requests.request( # noqa: S113 + return requests.request( method=method, url=API_URL + path, json=data, headers={"Authorization": f"Bearer {self.api_key}"}, + timeout=120, ) def _url(self, path: str) -> str: diff --git a/src/dstack/_internal/core/backends/lambdalabs/backend.py 
b/src/dstack/_internal/core/backends/lambdalabs/backend.py new file mode 100644 index 0000000000..48d46bedb3 --- /dev/null +++ b/src/dstack/_internal/core/backends/lambdalabs/backend.py @@ -0,0 +1,17 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.lambdalabs.compute import LambdaCompute +from dstack._internal.core.backends.lambdalabs.models import LambdaConfig +from dstack._internal.core.models.backends.base import BackendType + + +class LambdaBackend(Backend): + TYPE = BackendType.LAMBDA + COMPUTE_CLASS = LambdaCompute + + def __init__(self, config: LambdaConfig): + self.config = config + self._compute = LambdaCompute(self.config) + # self._check_credentials() + + def compute(self) -> LambdaCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/lambdalabs/compute.py b/src/dstack/_internal/core/backends/lambdalabs/compute.py index 8588dcd14c..1007d33bea 100644 --- a/src/dstack/_internal/core/backends/lambdalabs/compute.py +++ b/src/dstack/_internal/core/backends/lambdalabs/compute.py @@ -1,4 +1,5 @@ import hashlib +import shlex import subprocess import tempfile from threading import Thread @@ -6,43 +7,58 @@ from dstack._internal.core.backends.base.compute import ( Compute, - get_instance_name, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, get_shim_commands, ) from dstack._internal.core.backends.base.offers import get_catalog_offers from dstack._internal.core.backends.lambdalabs.api_client import LambdaAPIClient -from dstack._internal.core.backends.lambdalabs.config import LambdaConfig +from dstack._internal.core.backends.lambdalabs.models import LambdaConfig from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, InstanceOffer, InstanceOfferWithAvailability, 
- SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData +MAX_INSTANCE_NAME_LEN = 60 -class LambdaCompute(Compute): + +class LambdaCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + Compute, +): def __init__(self, config: LambdaConfig): + super().__init__() self.config = config self.api_client = LambdaAPIClient(config.creds.api_key) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.LAMBDA, - locations=self.config.regions, - requirements=requirements, + locations=self.config.regions or None, ) offers_with_availability = self._get_offers_with_availability(offers) return offers_with_availability def create_instance( - self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) project_ssh_key = instance_config.ssh_keys[0] project_key_name = _add_project_ssh_key( api_client=self.api_client, @@ -52,7 +68,7 @@ def create_instance( region_name=instance_offer.region, instance_type_name=instance_offer.instance.name, ssh_key_names=[project_key_name], - name=instance_config.instance_name, + name=instance_name, quantity=1, file_system_names=[], ) @@ -81,9 +97,9 @@ def update_provisioning_data( instance_info = _get_instance_info(self.api_client, 
provisioning_data.instance_id) if instance_info is not None and instance_info["status"] != "booting": provisioning_data.hostname = instance_info["ip"] - commands = get_shim_commands(authorized_keys=[project_ssh_public_key]) - # shim is asssumed to be run under root - launch_command = "sudo sh -c '" + "&& ".join(commands) + "'" + commands = get_shim_commands(arch=provisioning_data.instance_type.resources.cpu_arch) + # shim is assumed to be run under root + launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands)) thread = Thread( target=_start_runner, kwargs={ @@ -95,29 +111,6 @@ def update_provisioning_data( ) thread.start() - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey( - public=project_ssh_public_key.strip(), private=project_ssh_private_key.strip() - ), - SSHKey(public=run.run_spec.ssh_key_pub.strip()), - ], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) - def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None ): @@ -134,14 +127,10 @@ def _get_offers_with_availability( } availability_offers = [] for offer in offers: - if offer.region not in self.config.regions: - continue availability = InstanceAvailability.NOT_AVAILABLE if offer.region in instance_availability.get(offer.instance.name, []): availability = InstanceAvailability.AVAILABLE - availability_offers.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) - ) + availability_offers.append(offer.with_availability(availability=availability)) return availability_offers @@ -196,13 +185,18 @@ def _setup_instance( ssh_private_key: str, ): 
setup_commands = ( - "mkdir /home/ubuntu/.dstack && " - "sudo apt-get update && " - "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit && " - "sudo nvidia-ctk runtime configure --runtime=docker && " - "sudo pkill -SIGHUP dockerd" + "mkdir /home/ubuntu/.dstack", + "sudo apt-get update", + "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit", + "sudo install -d -m 0755 /etc/docker", + # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + """echo '{"exec-opts":["native.cgroupdriver=cgroupfs"]}' | sudo tee /etc/docker/daemon.json""", + "sudo nvidia-ctk runtime configure --runtime=docker", + "sudo systemctl restart docker.service", # `systemctl reload` (`kill -HUP`) won't work + ) + _run_ssh_command( + hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands) ) - _run_ssh_command(hostname=hostname, ssh_private_key=ssh_private_key, command=setup_commands) def _launch_runner( @@ -210,10 +204,11 @@ def _launch_runner( ssh_private_key: str, launch_command: str, ): + daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown" _run_ssh_command( hostname=hostname, ssh_private_key=ssh_private_key, - command=launch_command, + command=daemonized_command, ) diff --git a/src/dstack/_internal/core/backends/lambdalabs/config.py b/src/dstack/_internal/core/backends/lambdalabs/config.py deleted file mode 100644 index 4fbea18e1f..0000000000 --- a/src/dstack/_internal/core/backends/lambdalabs/config.py +++ /dev/null @@ -1,9 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.lambdalabs import ( - AnyLambdaCreds, - LambdaStoredConfig, -) - - -class LambdaConfig(LambdaStoredConfig, BackendConfig): - creds: AnyLambdaCreds diff --git a/src/dstack/_internal/core/backends/lambdalabs/configurator.py b/src/dstack/_internal/core/backends/lambdalabs/configurator.py new file 
mode 100644 index 0000000000..7c99cb2139 --- /dev/null +++ b/src/dstack/_internal/core/backends/lambdalabs/configurator.py @@ -0,0 +1,65 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.lambdalabs import api_client +from dstack._internal.core.backends.lambdalabs.backend import LambdaBackend +from dstack._internal.core.backends.lambdalabs.models import ( + LambdaBackendConfig, + LambdaBackendConfigWithCreds, + LambdaConfig, + LambdaCreds, + LambdaStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class LambdaConfigurator( + Configurator[ + LambdaBackendConfig, + LambdaBackendConfigWithCreds, + ] +): + TYPE = BackendType.LAMBDA + BACKEND_CLASS = LambdaBackend + + def validate_config(self, config: LambdaBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_lambda_api_key(config.creds.api_key) + + def create_backend( + self, project_name: str, config: LambdaBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=LambdaStoredConfig( + **LambdaBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=LambdaCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> LambdaBackendConfigWithCreds: + config = self._get_config(record) + return LambdaBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> LambdaBackendConfig: + config = self._get_config(record) + return LambdaBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> LambdaBackend: + config = self._get_config(record) + return LambdaBackend(config=config) + + def _get_config(self, record: BackendRecord) -> LambdaConfig: + return LambdaConfig.__response__( + **json.loads(record.config), + 
creds=LambdaCreds.parse_raw(record.auth), + ) + + def _validate_lambda_api_key(self, api_key: str): + client = api_client.LambdaAPIClient(api_key=api_key) + if not client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/lambdalabs/models.py b/src/dstack/_internal/core/backends/lambdalabs/models.py new file mode 100644 index 0000000000..7fb3ca020a --- /dev/null +++ b/src/dstack/_internal/core/backends/lambdalabs/models.py @@ -0,0 +1,37 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class LambdaAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyLambdaCreds = LambdaAPIKeyCreds +LambdaCreds = AnyLambdaCreds + + +class LambdaBackendConfig(CoreModel): + type: Annotated[Literal["lambda"], Field(description="The type of backend")] = "lambda" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Lambda regions. 
Omit to use all regions"), + ] = None + + +class LambdaBackendConfigWithCreds(LambdaBackendConfig): + creds: Annotated[AnyLambdaCreds, Field(description="The credentials")] + + +AnyLambdaBackendConfig = Union[LambdaBackendConfig, LambdaBackendConfigWithCreds] + + +class LambdaStoredConfig(LambdaBackendConfig): + pass + + +class LambdaConfig(LambdaStoredConfig): + creds: AnyLambdaCreds diff --git a/src/dstack/_internal/core/backends/local/__init__.py b/src/dstack/_internal/core/backends/local/__init__.py deleted file mode 100644 index 0bd31ea8bf..0000000000 --- a/src/dstack/_internal/core/backends/local/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.local.compute import LocalCompute -from dstack._internal.core.models.backends.base import BackendType - - -class LocalBackend(Backend): - TYPE: BackendType = BackendType.LOCAL - - def __init__(self): - self._compute = LocalCompute() - - def compute(self) -> LocalCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/local/compute.py b/src/dstack/_internal/core/backends/local/compute.py deleted file mode 100644 index 2c3396e46a..0000000000 --- a/src/dstack/_internal/core/backends/local/compute.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import List, Optional - -from dstack._internal.core.backends.base.compute import Compute -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, - InstanceRuntime, - InstanceType, - Resources, -) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class LocalCompute(Compute): - def get_offers( - self, requirements: 
Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - return [ - InstanceOfferWithAvailability( - backend=BackendType.LOCAL, - instance=InstanceType( - name="local", - resources=Resources(cpus=4, memory_mib=8192, gpus=[], spot=False), - ), - region="local", - price=0.00, - availability=InstanceAvailability.AVAILABLE, - instance_runtime=InstanceRuntime.RUNNER, - ) - ] - - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ): - pass - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - ) -> JobProvisioningData: - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id="local", - hostname="127.0.0.1", - internal_ip=None, - region="", - price=instance_offer.price, - username="root", - ssh_port=10022, - ssh_proxy=None, - dockerized=True, - backend_data=None, - ) - - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id="local", - hostname="127.0.0.1", - internal_ip=None, - region="", - price=instance_offer.price, - username="root", - ssh_port=10022, - ssh_proxy=None, - dockerized=True, - backend_data=None, - ) - - def create_volume(self, volume: Volume) -> VolumeProvisioningData: - return VolumeProvisioningData( - volume_id=volume.name, - size_gb=int(volume.configuration.size), - ) - - def delete_volume(self, volume: Volume): - pass - - def attach_volume(self, volume: Volume, instance_id: str): - pass - - def detach_volume(self, volume: Volume, instance_id: str): - pass diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py new file mode 100644 index 
0000000000..c21141378e --- /dev/null +++ b/src/dstack/_internal/core/backends/models.py @@ -0,0 +1,174 @@ +from typing import Union + +from dstack._internal.core.backends.aws.models import ( + AWSBackendConfig, + AWSBackendConfigWithCreds, +) +from dstack._internal.core.backends.azure.models import ( + AzureBackendConfig, + AzureBackendConfigWithCreds, +) +from dstack._internal.core.backends.cloudrift.models import ( + CloudRiftBackendConfig, + CloudRiftBackendConfigWithCreds, +) +from dstack._internal.core.backends.crusoe.models import ( + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + CrusoeBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.cudo.models import ( + CudoBackendConfig, + CudoBackendConfigWithCreds, +) +from dstack._internal.core.backends.digitalocean_base.models import ( + BaseDigitalOceanBackendConfig, + BaseDigitalOceanBackendConfigWithCreds, +) +from dstack._internal.core.backends.dstack.models import ( + DstackBackendConfig, + DstackBaseBackendConfig, +) +from dstack._internal.core.backends.gcp.models import ( + GCPBackendConfig, + GCPBackendConfigWithCreds, + GCPBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.hotaisle.models import ( + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + HotAisleBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.jarvislabs.models import ( + JarvisLabsBackendConfig, + JarvisLabsBackendConfigWithCreds, + JarvisLabsBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.kubernetes.models import ( + KubernetesBackendConfig, + KubernetesBackendConfigWithCreds, + KubernetesBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.lambdalabs.models import ( + LambdaBackendConfig, + LambdaBackendConfigWithCreds, +) +from dstack._internal.core.backends.nebius.models import ( + NebiusBackendConfig, + NebiusBackendConfigWithCreds, + NebiusBackendFileConfigWithCreds, +) +from dstack._internal.core.backends.oci.models import ( + 
OCIBackendConfig, + OCIBackendConfigWithCreds, +) +from dstack._internal.core.backends.runpod.models import ( + RunpodBackendConfig, + RunpodBackendConfigWithCreds, +) +from dstack._internal.core.backends.tensordock.models import ( + TensorDockBackendConfig, + TensorDockBackendConfigWithCreds, +) +from dstack._internal.core.backends.vastai.models import ( + VastAIBackendConfig, + VastAIBackendConfigWithCreds, +) +from dstack._internal.core.backends.verda.models import ( + VerdaBackendConfig, + VerdaBackendConfigWithCreds, +) +from dstack._internal.core.backends.vultr.models import ( + VultrBackendConfig, + VultrBackendConfigWithCreds, +) +from dstack._internal.core.models.common import CoreModel + +# Backend config returned by the API +AnyBackendConfigWithoutCreds = Union[ + AWSBackendConfig, + AzureBackendConfig, + CloudRiftBackendConfig, + CrusoeBackendConfig, + CudoBackendConfig, + BaseDigitalOceanBackendConfig, + GCPBackendConfig, + HotAisleBackendConfig, + JarvisLabsBackendConfig, + KubernetesBackendConfig, + LambdaBackendConfig, + NebiusBackendConfig, + OCIBackendConfig, + RunpodBackendConfig, + TensorDockBackendConfig, + VastAIBackendConfig, + VerdaBackendConfig, + VultrBackendConfig, + DstackBackendConfig, + DstackBaseBackendConfig, +] + +# Same as AnyBackendConfigWithoutCreds but also includes creds. +# Used to create/update backend. +# Also returned by the API to project admins so that they can see/update backend creds. 
+AnyBackendConfigWithCreds = Union[ + AWSBackendConfigWithCreds, + AzureBackendConfigWithCreds, + CloudRiftBackendConfigWithCreds, + CrusoeBackendConfigWithCreds, + CudoBackendConfigWithCreds, + VerdaBackendConfigWithCreds, + BaseDigitalOceanBackendConfigWithCreds, + GCPBackendConfigWithCreds, + HotAisleBackendConfigWithCreds, + JarvisLabsBackendConfigWithCreds, + KubernetesBackendConfigWithCreds, + LambdaBackendConfigWithCreds, + OCIBackendConfigWithCreds, + NebiusBackendConfigWithCreds, + RunpodBackendConfigWithCreds, + TensorDockBackendConfigWithCreds, + VastAIBackendConfigWithCreds, + VultrBackendConfigWithCreds, + DstackBackendConfig, +] + +# Backend config accepted in server/config.yaml. +# This can be different from the API config. +# For example, it can make creds data optional and resolve it by filename. +AnyBackendFileConfigWithCreds = Union[ + AWSBackendConfigWithCreds, + AzureBackendConfigWithCreds, + CloudRiftBackendConfigWithCreds, + CrusoeBackendFileConfigWithCreds, + CudoBackendConfigWithCreds, + VerdaBackendConfigWithCreds, + BaseDigitalOceanBackendConfigWithCreds, + GCPBackendFileConfigWithCreds, + HotAisleBackendFileConfigWithCreds, + JarvisLabsBackendFileConfigWithCreds, + KubernetesBackendFileConfigWithCreds, + LambdaBackendConfigWithCreds, + OCIBackendConfigWithCreds, + NebiusBackendFileConfigWithCreds, + RunpodBackendConfigWithCreds, + TensorDockBackendConfigWithCreds, + VastAIBackendConfigWithCreds, + VultrBackendConfigWithCreds, +] + + +# The API can return backend config with or without creds +AnyBackendConfig = Union[AnyBackendConfigWithoutCreds, AnyBackendConfigWithCreds] + + +# In case we'll support multiple backends of the same type, +# this adds backend name to backend config. 
+class BackendInfo(CoreModel): + name: str + config: AnyBackendConfigWithoutCreds + + +class BackendInfoYAML(CoreModel): + name: str + config_yaml: str diff --git a/src/dstack/_internal/core/backends/nebius/__init__.py b/src/dstack/_internal/core/backends/nebius/__init__.py index 38c3d7efef..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/nebius/__init__.py +++ b/src/dstack/_internal/core/backends/nebius/__init__.py @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.nebius.compute import NebiusCompute -from dstack._internal.core.backends.nebius.config import NebiusConfig -from dstack._internal.core.models.backends.base import BackendType - - -class NebiusBackend(Backend): - TYPE: BackendType = BackendType.NEBIUS - - def __init__(self, config: NebiusConfig): - self.config = config - self._compute = NebiusCompute(self.config) - - def compute(self) -> NebiusCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/nebius/api_client.py b/src/dstack/_internal/core/backends/nebius/api_client.py deleted file mode 100644 index 14d949826c..0000000000 --- a/src/dstack/_internal/core/backends/nebius/api_client.py +++ /dev/null @@ -1,319 +0,0 @@ -import time -from typing import Dict, List, Optional - -import jwt -import requests - -from dstack._internal.core.backends.nebius.types import ( - ClientError, - ConflictError, - ForbiddenError, - NebiusError, - NotFoundError, - ResourcesSpec, - ServiceAccount, -) -from dstack._internal.utils.logging import get_logger - -logger = get_logger("nebius") -API_URL = "api.ai.nebius.cloud" -REQUEST_TIMEOUT = 15 - - -class NebiusAPIClient: - # Reference: https://nebius.ai/docs/api-design-guide/ - def __init__(self, service_account: ServiceAccount): - self.service_account = service_account - self.s = requests.Session() - self.expires_at = 0 - - def get_token(self): - now = int(time.time()) - if now + 60 < self.expires_at: - 
return - logger.debug("Refreshing IAM token") - expires_at = now + 3600 - payload = { - "aud": self.url("iam", "/tokens"), - "iss": self.service_account["service_account_id"], - "iat": now, - "exp": expires_at, - } - jwt_token = jwt.encode( - payload, - self.service_account["private_key"], - algorithm="PS256", - headers={"kid": self.service_account["id"]}, - ) - - resp = requests.post(payload["aud"], json={"jwt": jwt_token}, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - iam_token = resp.json()["iamToken"] - self.s.headers["Authorization"] = f"Bearer {iam_token}" - self.expires_at = expires_at - - def compute_zones_list(self) -> List[dict]: - logger.debug("Fetching compute zones") - self.get_token() - resp = self.s.get(self.url("compute", "/zones"), timeout=REQUEST_TIMEOUT) - self.raise_for_status(resp) - return resp.json()["zones"] - - def resource_manager_folders_create(self, cloud_id: str, name: str, **kwargs) -> dict: - logger.debug("Creating folder %s", name) - self.get_token() - resp = self.s.post( - self.url("resource-manager", "/folders"), - json=omit_none( - cloudId=cloud_id, - name=name, - **kwargs, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def vpc_networks_create(self, folder_id: str, name: str, **kwargs) -> dict: - logger.debug("Creating network %s in %s", name, folder_id) - self.get_token() - resp = self.s.post( - self.url("vpc", "/networks"), - json=omit_none( - folderId=folder_id, - name=name, - **kwargs, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def vpc_networks_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: - logger.debug("Fetching networks in %s", folder_id) - return self.list( - "vpc", - "networks", - params=dict( - folderId=folder_id, - filter=filter, - ), - ) - - def vpc_subnets_create( - self, - folder_id: str, - name: str, - network_id: str, - zone: str, - cird_blocks: List[str], - **kwargs, - ) -> dict: - 
logger.debug("Creating subnet %s in %s", name, network_id) - self.get_token() - resp = self.s.post( - self.url("vpc", "/subnets"), - json=omit_none( - folderId=folder_id, - name=name, - networkId=network_id, - zoneId=zone, - v4CidrBlocks=cird_blocks, - **kwargs, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def vpc_subnets_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: - logger.debug("Fetching subnets in %s", folder_id) - return self.list( - "vpc", - "subnets", - params=dict( - folderId=folder_id, - filter=filter, - ), - ) - - def vpc_security_groups_create( - self, folder_id: str, name: str, network_id: str, rule_specs: List[dict], **kwargs - ) -> dict: - logger.debug("Creating security group %s in %s", name, folder_id) - self.get_token() - resp = self.s.post( - self.url("vpc", "/securityGroups"), - json=omit_none( - folderId=folder_id, - name=name, - networkId=network_id, - ruleSpecs=rule_specs, - **kwargs, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def vpc_security_groups_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: - logger.debug("Fetching security groups in %s", folder_id) - return self.list( - "vpc", - "securityGroups", - params=dict( - folderId=folder_id, - filter=filter, - ), - ) - - def vpc_security_groups_delete(self, security_group_id: str): - logger.debug("Deleting security group %s", security_group_id) - self.get_token() - resp = self.s.delete( - self.url("vpc", f"/securityGroups/{security_group_id}"), timeout=REQUEST_TIMEOUT - ) - self.raise_for_status(resp) - - def compute_instances_create( - self, - folder_id: str, - name: str, - zone_id: str, - platform_id: str, - resources_spec: ResourcesSpec, - metadata: Optional[Dict[str, str]], - disk_size_gb: int, - image_id: str, - subnet_id: str, - security_group_ids: List[str], - **kwargs, - ) -> dict: - # Reference: 
https://nebius.ai/docs/api-design-guide/compute/v1/api-ref/Instance/create - logger.debug("Creating instance %s (%s) in %s", name, platform_id, folder_id) - self.get_token() - resp = self.s.post( - self.url("compute", "/instances"), - json=omit_none( - folderId=folder_id, - name=name, - zoneId=zone_id, - platformId=platform_id, - resourcesSpec=resources_spec, - metadata=metadata, - boot_disk_spec=dict( - autoDelete=True, - diskSpec=dict( - typeId="network-ssd", - size=disk_size_gb * 1024 * 1024 * 1024, - imageId=image_id, - ), - ), - networkInterfaceSpecs=[ - dict( - subnetId=subnet_id, - primaryV4AddressSpec=dict( - oneToOneNatSpec=dict( - ipVersion="IPV4", - ), - ), - securityGroupIds=security_group_ids, - ) - ], - **kwargs, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def compute_instances_list( - self, folder_id: str, filter: Optional[str] = None, order_by: Optional[str] = None - ) -> List[dict]: - logger.debug("Fetching instances in %s", folder_id) - return self.list( - "compute", - "instances", - params=dict( - folderId=folder_id, - filter=filter, - orderBy=order_by, - ), - ) - - def compute_instances_delete(self, instance_id: str): - logger.debug("Deleting instance %s", instance_id) - self.get_token() - resp = self.s.delete( - self.url("compute", f"/instances/{instance_id}"), timeout=REQUEST_TIMEOUT - ) - self.raise_for_status(resp) - - def compute_instances_get(self, instance_id: str, full: bool = False) -> dict: - logger.debug("Fetching instance %s", instance_id) - self.get_token() - resp = self.s.get( - self.url("compute", f"/instances/{instance_id}"), - params=dict( - view="FULL" if full else "BASIC", - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - return resp.json() - - def compute_images_list( - self, folder_id: str, filter: Optional[str] = None, order_by: Optional[str] = None - ): - logger.debug("Fetching images in %s", folder_id) - return self.list( - 
"compute", - "images", - params=dict( - folderId=folder_id, - filter=filter, - orderBy=order_by, - ), - ) - - def list(self, service: str, resource: str, params: dict, page_size: int = 1000) -> List[dict]: - page_token = None - output = [] - while True: - self.get_token() - resp = self.s.get( - self.url(service, f"/{resource}"), - params=omit_none( - pageSize=page_size, - pageToken=page_token, - **params, - ), - timeout=REQUEST_TIMEOUT, - ) - self.raise_for_status(resp) - data = resp.json() - output += data.get(resource, []) - page_token = data.get("nextPageToken") - if not page_token: - break - return output - - def url(self, service: str, path: str, version="v1") -> str: - return f"https://{service}.{API_URL.rstrip('/')}/{service}/{version}/{path.lstrip('/')}" - - def raise_for_status(self, resp: requests.Response): - if resp.status_code == 400: - raise NebiusError(resp.text) - if resp.status_code == 401: - raise ClientError(resp.text) - if resp.status_code == 403: - raise ForbiddenError(resp.text) - if resp.status_code == 404: - raise NotFoundError(resp.text) - if resp.status_code == 409: - raise ConflictError(resp.text) - resp.raise_for_status() - - -def omit_none(**kwargs) -> dict: - return {k: v for k, v in kwargs.items() if v is not None} diff --git a/src/dstack/_internal/core/backends/nebius/backend.py b/src/dstack/_internal/core/backends/nebius/backend.py new file mode 100644 index 0000000000..c79709b8b7 --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.nebius.compute import NebiusCompute +from dstack._internal.core.backends.nebius.models import NebiusConfig +from dstack._internal.core.models.backends.base import BackendType + + +class NebiusBackend(Backend): + TYPE = BackendType.NEBIUS + COMPUTE_CLASS = NebiusCompute + + def __init__(self, config: NebiusConfig): + self.config = config + self._compute = 
NebiusCompute(self.config) + + def compute(self) -> NebiusCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/nebius/compute.py b/src/dstack/_internal/core/backends/nebius/compute.py index 8ec656632f..a90b4a4c1f 100644 --- a/src/dstack/_internal/core/backends/nebius/compute.py +++ b/src/dstack/_internal/core/backends/nebius/compute.py @@ -1,216 +1,398 @@ import json -import re +import random +import shlex import time +from collections.abc import Iterable +from functools import cached_property from typing import List, Optional -import dstack.version as version -from dstack._internal import settings -from dstack._internal.core.backends.base import Compute -from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient -from dstack._internal.core.backends.nebius.config import NebiusConfig -from dstack._internal.core.backends.nebius.types import ( - ForbiddenError, - NotFoundError, - ResourcesSpec, +from nebius.aio.operation import Operation as SDKOperation +from nebius.aio.service_error import RequestError, StatusCode +from nebius.api.nebius.common.v1 import Operation +from nebius.sdk import SDK + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_user_data, + merge_tags, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.nebius import resources +from dstack._internal.core.backends.nebius.models import ( + NebiusConfig, + 
NebiusOfferBackendData, + NebiusServiceAccountCreds, +) +from dstack._internal.core.errors import ( + BackendError, + NotYetTerminated, + ProvisioningError, ) -from dstack._internal.core.errors import NoCapacityError from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, + InstanceOffer, InstanceOfferWithAvailability, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import ( + PlacementGroup, + PlacementGroupProvisioningData, + PlacementStrategy, +) +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger -MEGABYTE = 1024**2 -INSTANCE_PULL_INTERVAL = 10 +logger = get_logger(__name__) +CONFIGURABLE_DISK_SIZE = Range[Memory]( + min=Memory.parse("40GB"), # min for the ubuntu24.04-cuda13.0 image + max=Memory.parse("8192GB"), # max for the NETWORK_SSD disk type +) +WAIT_FOR_DISK_TIMEOUT = 20 +WAIT_FOR_INSTANCE_TIMEOUT = 30 +WAIT_FOR_INSTANCE_UPDATE_INTERVAL = 2.5 +DELETE_INSTANCE_TIMEOUT = 25 +DOCKER_DAEMON_CONFIG = { + "runtimes": {"nvidia": {"args": [], "path": "nvidia-container-runtime"}}, + # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + "exec-opts": ["native.cgroupdriver=cgroupfs"], +} +SETUP_COMMANDS = [ + 'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config', + "service ssh restart", + f"echo {shlex.quote(json.dumps(DOCKER_DAEMON_CONFIG))} > /etc/docker/daemon.json", + "service docker restart", +] +SUPPORTED_PLATFORMS = [ + "gpu-h100-sxm", + "gpu-h200-sxm", + "gpu-b200-sxm", + "gpu-b200-sxm-a", + "gpu-l40s-a", + 
"gpu-l40s-d", + "gpu-rtx6000", + "cpu-d3", + "cpu-e2", +] -class NebiusCompute(Compute): +class NebiusCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + Compute, +): def __init__(self, config: NebiusConfig): + super().__init__() self.config = config - self.api_client = NebiusAPIClient(json.loads(self.config.creds.data)) + self._subnet_id_cache: dict[str, str] = {} + + @cached_property + def _sdk(self) -> SDK: + assert isinstance(self.config.creds, NebiusServiceAccountCreds) + return resources.make_sdk(self.config.creds) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + @cached_property + def _region_to_project_id(self) -> dict[str, str]: + return resources.get_region_to_project_id_map( + self._sdk, + configured_regions=self.config.regions, + configured_project_ids=self.config.projects, + ) + + def _get_subnet_id(self, region: str) -> str: + if region not in self._subnet_id_cache: + self._subnet_id_cache[region] = resources.get_default_subnet( + self._sdk, self._region_to_project_id[region] + ).metadata.id + return self._subnet_id_cache[region] + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.NEBIUS, - locations=self.config.regions, - requirements=requirements, + locations=list(self._region_to_project_id), + extra_filter=_supported_instances, ) - # TODO(egor-s) quotas return [ - InstanceOfferWithAvailability( - **offer.dict(), availability=InstanceAvailability.UNKNOWN - ) - for offer in offers + offer.with_availability(availability=InstanceAvailability.UNKNOWN) for offer in offers ] + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] + def 
create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: - cuda = len(instance_offer.instance.resources.gpus) > 0 - security_group_id = self._get_security_group_id(project_name=instance_config.project_name) - subnet_id = self._get_subnet_id(zone=instance_offer.region) - image_id = self._get_image_id(cuda=cuda) + # NOTE: This method can block for a long time as it waits for the boot disk to be created + # and the instance to enter the STARTING state. This has to be done in create_instance so + # that we can handle quota and availability errors that may occur even after creating an + # instance. + instance_name = generate_unique_instance_name(instance_config) + platform, preset = instance_offer.instance.name.split() + cluster_id = None + if placement_group: + assert placement_group.provisioning_data is not None + backend_data = NebiusPlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + if backend_data.cluster is not None: + cluster_id = backend_data.cluster.id + labels = { + "owner": "dstack", + "dstack_project": instance_config.project_name.lower(), + "dstack_name": instance_config.instance_name, + "dstack_user": instance_config.user.lower(), + } + labels = merge_tags( + base_tags=labels, + backend_tags=self.config.tags, + resource_tags=instance_config.tags, + ) + labels = resources.filter_invalid_labels(labels) + create_disk_op = resources.create_disk( + sdk=self._sdk, + name=instance_name, + project_id=self._region_to_project_id[instance_offer.region], + size_mib=instance_offer.instance.resources.disk.size_mib, + image_family="ubuntu24.04-cuda13.0", + labels=labels, + ) + create_instance_op = None try: - disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - resp = self.api_client.compute_instances_create( - folder_id=self.config.folder_id, - name=instance_config.instance_name, - 
zone_id=instance_offer.region, - platform_id=instance_offer.instance.name, - resources_spec=ResourcesSpec( - memory=int(instance_offer.instance.resources.memory_mib * MEGABYTE), - cores=instance_offer.instance.resources.cpus, - coreFraction=100, - gpus=len(instance_offer.instance.resources.gpus), + logger.debug("Blocking until disk %s is created", create_disk_op.resource_id) + resources.wait_for_operation(create_disk_op, timeout=WAIT_FOR_DISK_TIMEOUT) + if not create_disk_op.successful(): + raw_op = create_disk_op.raw() + raise ProvisioningError( + f"Create disk operation failed. Message: {raw_op.status.message}." + f" Details: {raw_op.status.details}" + ) + create_instance_op = resources.create_instance( + sdk=self._sdk, + name=instance_name, + project_id=self._region_to_project_id[instance_offer.region], + user_data=get_user_data( + instance_config.get_public_keys(), + backend_specific_commands=SETUP_COMMANDS, ), - metadata={ - "user-data": get_user_data(authorized_keys=instance_config.get_public_keys()) - }, - disk_size_gb=disk_size, - image_id=image_id, - subnet_id=subnet_id, - security_group_ids=[security_group_id], - labels=self._get_labels(project=instance_config.project_name), + platform=platform, + preset=preset, + cluster_id=cluster_id, + disk_id=create_disk_op.resource_id, + subnet_id=self._get_subnet_id(instance_offer.region), + preemptible=instance_offer.instance.resources.spot, + labels=labels, ) - except ForbiddenError as e: - if instance_offer.instance.name in e.args[0]: - raise NoCapacityError(json.loads(e.args[0])["message"]) - raise - instance_id = resp["metadata"]["instanceId"] - try: - while True: - instance = self.api_client.compute_instances_get(instance_id) - if "primaryV4Address" in instance["networkInterfaces"][0]: - break - time.sleep(INSTANCE_PULL_INTERVAL) - except Exception: - self.terminate_instance(instance_id, instance_offer.region) + _wait_for_instance(self._sdk, create_instance_op) + except BaseException: + if create_instance_op 
is not None: + try: + with resources.ignore_errors([StatusCode.NOT_FOUND]): + delete_instance_op = resources.delete_instance( + self._sdk, create_instance_op.resource_id + ) + resources.wait_for_operation( + delete_instance_op, timeout=DELETE_INSTANCE_TIMEOUT + ) + except Exception as e: + logger.exception( + "Could not delete instance %s: %s", create_instance_op.resource_id, e + ) + try: + with resources.ignore_errors([StatusCode.NOT_FOUND]): + resources.delete_disk(self._sdk, create_disk_op.resource_id) + except Exception as e: + logger.exception( + "Could not delete boot disk %s: %s", create_disk_op.resource_id, e + ) raise return JobProvisioningData( backend=instance_offer.backend, instance_type=instance_offer.instance, - instance_id=instance_id, - hostname=instance["networkInterfaces"][0]["primaryV4Address"]["oneToOneNat"][ - "address" - ], - internal_ip=None, + instance_id=create_instance_op.resource_id, + hostname=None, region=instance_offer.region, price=instance_offer.price, - username="ubuntu", ssh_port=22, + username="ubuntu", dockerized=True, - ssh_proxy=None, - backend_data=None, + backend_data=NebiusInstanceBackendData(boot_disk_id=create_disk_op.resource_id).json(), ) - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) + def update_provisioning_data( + self, provisioning_data, project_ssh_public_key, project_ssh_private_key + ): + instance = resources.get_instance(self._sdk, provisioning_data.instance_id) + if not instance.status.network_interfaces: + return + interface = 
instance.status.network_interfaces[0] + provisioning_data.hostname, _ = interface.public_ip_address.address.split("/") + provisioning_data.internal_ip, _ = interface.ip_address.address.split("/") def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None ): + backend_data_parsed = NebiusInstanceBackendData.load(backend_data) try: - self.api_client.compute_instances_delete(instance_id) - except NotFoundError: - pass - - def _get_security_group_id(self, project_name: str) -> str: - name = project_name - security_groups = self.api_client.vpc_security_groups_list( - folder_id=self.config.folder_id, - filter=f'name="{name}"', + instance = resources.get_instance(self._sdk, instance_id) + except RequestError as e: + if e.status.code != StatusCode.NOT_FOUND: + raise + instance = None + if instance is not None: + if instance.status.state != instance.status.InstanceState.DELETING: + resources.delete_instance(self._sdk, instance_id) + raise NotYetTerminated( + "Requested instance deletion." + " Will wait for deletion before deleting the boot disk." + f" Instance state was: {instance.status.state.name}" + ) + else: + raise NotYetTerminated( + "Waiting for instance deletion before deleting the boot disk." 
+ f" Instance state: {instance.status.state.name}" + ) + with resources.ignore_errors([StatusCode.NOT_FOUND]): + resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id) + + def create_placement_group( + self, + placement_group: PlacementGroup, + master_instance_offer: InstanceOffer, + ) -> PlacementGroupProvisioningData: + assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER + master_instance_offer_backend_data: NebiusOfferBackendData = ( + NebiusOfferBackendData.__response__.parse_obj(master_instance_offer.backend_data) + ) + fabrics = list(master_instance_offer_backend_data.fabrics) + if self.config.fabrics is not None: + fabrics = [f for f in fabrics if f in self.config.fabrics] + placement_group_backend_data = NebiusPlacementGroupBackendData(cluster=None) + # Only create a Nebius cluster if the instance supports it. + # For other instances, return dummy PlacementGroupProvisioningData. + if fabrics: + fabric = random.choice(fabrics) + op = resources.create_cluster( + self._sdk, + name=placement_group.name, + project_id=self._region_to_project_id[placement_group.configuration.region], + fabric=fabric, + ) + placement_group_backend_data.cluster = NebiusClusterBackendData( + id=op.resource_id, + fabric=fabric, + ) + return PlacementGroupProvisioningData( + backend=BackendType.NEBIUS, + backend_data=placement_group_backend_data.json(), + ) + + def delete_placement_group(self, placement_group: PlacementGroup) -> None: + assert placement_group.provisioning_data is not None + backend_data = NebiusPlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data ) - if security_groups: - return security_groups[0]["id"] - resp = self.api_client.vpc_security_groups_create( - folder_id=self.config.folder_id, - name=name, - network_id=self.config.network_id, - rule_specs=[ - { - "description": "SSH access", - "direction": "INGRESS", - "ports": {"fromPort": 22, "toPort": 22}, - "protocolName": "ANY", - "cidrBlocks": 
{"v4CidrBlocks": ["0.0.0.0/0"]}, - }, - { - "description": "Project intranet", - "direction": "INGRESS", - "protocolName": "ANY", - "predefinedTarget": "self_security_group", - }, - { - "description": "Internet access", - "direction": "EGRESS", - "protocolName": "ANY", - "cidrBlocks": {"v4CidrBlocks": ["0.0.0.0/0"]}, - }, - ], - description="For job instance, by dstack", - labels=self._get_labels(project=project_name), + if backend_data.cluster is not None: + with resources.ignore_errors([StatusCode.NOT_FOUND]): + resources.delete_cluster(self._sdk, backend_data.cluster.id) + + def is_suitable_placement_group( + self, + placement_group: PlacementGroup, + instance_offer: InstanceOffer, + ) -> bool: + if placement_group.configuration.region != instance_offer.region: + return False + assert placement_group.provisioning_data is not None + placement_group_backend_data = NebiusPlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data ) - return resp["response"]["id"] - - def _get_subnet_id(self, zone: str, name: Optional[str] = None) -> str: - name = name or f"default-{zone}" - subnets = self.api_client.vpc_subnets_list(folder_id=self.config.folder_id) - for subnet in subnets: - if subnet["name"] == name: - return subnet["id"] - n = len(subnets) - resp = self.api_client.vpc_subnets_create( - folder_id=self.config.folder_id, - name=name, - network_id=self.config.network_id, - zone=zone, - cird_blocks=[f"10.{n}.0.0/16"], - labels=self._get_labels(), + instance_offer_backend_data: NebiusOfferBackendData = ( + NebiusOfferBackendData.__response__.parse_obj(instance_offer.backend_data) ) - return resp["response"]["id"] - - def _get_image_id(self, cuda: bool) -> str: - image_name = re.sub(r"[^a-z0-9-]", "-", f"dstack-{version.base_image}") - if cuda: - image_name += "-cuda" - images = self.api_client.compute_images_list( - folder_id="bjel82ie37qos4pc6guk", filter=f'name="{image_name}"' + return ( + placement_group_backend_data.cluster is None + or 
placement_group_backend_data.cluster.fabric in instance_offer_backend_data.fabrics ) - return images[0]["id"] - def _get_labels(self, **kwargs) -> dict: - labels = { - "owner": "dstack", - **kwargs, - } - if settings.DSTACK_VERSION is not None: - labels["dstack-version"] = settings.DSTACK_VERSION.replace(".", "-") - return labels + +class NebiusInstanceBackendData(CoreModel): + boot_disk_id: str + + @classmethod + def load(cls, raw: Optional[str]) -> "NebiusInstanceBackendData": + assert raw is not None + return cls.__response__.parse_raw(raw) + + +class NebiusClusterBackendData(CoreModel): + id: str + fabric: str + + +class NebiusPlacementGroupBackendData(CoreModel): + cluster: Optional[NebiusClusterBackendData] + + @classmethod + def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData": + assert raw is not None + return cls.__response__.parse_raw(raw) + + +def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None: + start = time.monotonic() + while True: + if op.done() and not op.successful(): + raise ProvisioningError( + f"Create instance operation failed. Message: {op.raw().status.message}." + f" Details: {op.raw().status.details}" + ) + instance = resources.get_instance(sdk, op.resource_id) + if instance.status.state in [ + instance.status.InstanceState.STARTING, + instance.status.InstanceState.RUNNING, + ]: + break + if time.monotonic() - start > WAIT_FOR_INSTANCE_TIMEOUT: + raise BackendError( + f"Instance {instance.metadata.id} did not start booting in time." + f" Status: {instance.status.state.name}" + ) + logger.debug( + "Waiting for instance %s. Status: %s. 
Operation status: %s", + instance.metadata.name, + instance.status.state.name, + op.status(), + ) + time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL) + resources.LOOP.await_( + op.update( + per_retry_timeout=resources.REQUEST_TIMEOUT, + auth_options=resources.REQUEST_AUTH_OPTIONS, + ) + ) + + +def _supported_instances(offer: InstanceOffer) -> bool: + platform, _ = offer.instance.name.split() + return platform in SUPPORTED_PLATFORMS diff --git a/src/dstack/_internal/core/backends/nebius/config.py b/src/dstack/_internal/core/backends/nebius/config.py deleted file mode 100644 index 4bface329e..0000000000 --- a/src/dstack/_internal/core/backends/nebius/config.py +++ /dev/null @@ -1,6 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.nebius import AnyNebiusCreds, NebiusStoredConfig - - -class NebiusConfig(NebiusStoredConfig, BackendConfig): - creds: AnyNebiusCreds diff --git a/src/dstack/_internal/core/backends/nebius/configurator.py b/src/dstack/_internal/core/backends/nebius/configurator.py new file mode 100644 index 0000000000..a27fa0b0aa --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/configurator.py @@ -0,0 +1,98 @@ +import json + +from nebius.aio.service_error import RequestError + +from dstack._internal.core.backends.base.configurator import ( + TAGS_MAX_NUM, + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.nebius import resources +from dstack._internal.core.backends.nebius.backend import NebiusBackend +from dstack._internal.core.backends.nebius.models import ( + NebiusBackendConfig, + NebiusBackendConfigWithCreds, + NebiusConfig, + NebiusCreds, + NebiusServiceAccountCreds, + NebiusStoredConfig, +) +from dstack._internal.core.backends.nebius.resources import get_all_infiniband_fabrics +from dstack._internal.core.errors import BackendError, ServerClientError +from dstack._internal.core.models.backends.base import BackendType 
+ + +class NebiusConfigurator( + Configurator[ + NebiusBackendConfig, + NebiusBackendConfigWithCreds, + ] +): + TYPE = BackendType.NEBIUS + BACKEND_CLASS = NebiusBackend + + def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_enabled: bool): + assert isinstance(config.creds, NebiusServiceAccountCreds) + try: + sdk = resources.make_sdk(config.creds) + # check that it's possible to build the projects map with configured settings + resources.get_region_to_project_id_map( + sdk, configured_regions=config.regions, configured_project_ids=config.projects + ) + except (ValueError, RequestError) as e: + raise_invalid_credentials_error( + fields=[["creds"]], + details=str(e), + ) + valid_fabrics = get_all_infiniband_fabrics() + if invalid_fabrics := set(config.fabrics or []) - valid_fabrics: + raise_invalid_credentials_error( + fields=[["fabrics"]], + details=( + "These InfiniBand fabrics do not exist or are not known to dstack:" + f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select" + f" some of the valid options: {sorted(valid_fabrics)}" + ), + ) + self._check_config_tags(config) + + def _check_config_tags(self, config: NebiusBackendConfigWithCreds): + if not config.tags: + return + if len(config.tags) > TAGS_MAX_NUM: + raise ServerClientError( + f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed." 
+ ) + try: + resources.validate_labels(config.tags) + except BackendError as e: + raise ServerClientError(e.args[0]) + + def create_backend( + self, project_name: str, config: NebiusBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=NebiusStoredConfig( + **NebiusBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=NebiusCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> NebiusBackendConfigWithCreds: + config = self._get_config(record) + return NebiusBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> NebiusBackendConfig: + config = self._get_config(record) + return NebiusBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> NebiusBackend: + config = self._get_config(record) + return NebiusBackend(config=config) + + def _get_config(self, record: BackendRecord) -> NebiusConfig: + return NebiusConfig.__response__( + **json.loads(record.config), + creds=NebiusCreds.parse_raw(record.auth), + ) diff --git a/src/dstack/_internal/core/backends/nebius/models.py b/src/dstack/_internal/core/backends/nebius/models.py new file mode 100644 index 0000000000..143eb55746 --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/models.py @@ -0,0 +1,185 @@ +import json +from pathlib import Path +from typing import Annotated, Dict, Literal, Optional, Union + +from pydantic import Field, root_validator + +from dstack._internal.core.backends.base.models import fill_data +from dstack._internal.core.models.common import CoreModel + +DEFAULT_PROJECT_NAME_PREFIX = "default" + + +class NebiusServiceAccountCreds(CoreModel): + type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( + "service_account" + ) + service_account_id: Annotated[str, Field(description="Service account ID")] + public_key_id: Annotated[str, 
Field(description="ID of the service account public key")] + private_key_file: Annotated[ + Optional[str], Field(description=("Path to the service account private key")) + ] = None + private_key_content: Annotated[ + str, + Field( + description=( + "Content of the service account private key. When configuring via" + " `server/config.yml`, it's automatically filled from `private_key_file`." + " When configuring via UI, it has to be specified explicitly." + ) + ), + ] + filename: Annotated[ + Optional[str], Field(description="The path to the service account credentials file") + ] = None + + +class NebiusServiceAccountFileCreds(CoreModel): + type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( + "service_account" + ) + service_account_id: Annotated[ + Optional[str], + Field( + description=( + "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly" + ) + ), + ] = None + public_key_id: Annotated[ + Optional[str], + Field( + description=( + "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly" + ) + ), + ] = None + private_key_file: Annotated[ + Optional[str], + Field( + description=( + "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly" + ) + ), + ] = None + private_key_content: Annotated[ + Optional[str], + Field( + description=( + "Content of the service account private key. When configuring via" + " `server/config.yml`, it's automatically filled from `private_key_file`." 
+ " When configuring via UI, it has to be specified explicitly" + ) + ), + ] = None + filename: Annotated[ + Optional[str], Field(description="The path to the service account credentials file") + ] = None + + @root_validator + def fill_data(cls, values): + if filename := values.get("filename"): + try: + with open(Path(filename).expanduser()) as f: + data = json.load(f) + from nebius.base.service_account.credentials_file import ( + ServiceAccountCredentials, + ) + + credentials = ServiceAccountCredentials.from_json(data) + subject = credentials.subject_credentials + values["service_account_id"] = subject.sub + values["public_key_id"] = subject.kid + values["private_key_content"] = subject.private_key + except OSError: + raise ValueError(f"No such file {filename}") + except Exception as e: + raise ValueError(f"Failed to parse credentials file {filename}: {e}") + return values + + return fill_data( + values, filename_field="private_key_file", data_field="private_key_content" + ) + + +AnyNebiusCreds = NebiusServiceAccountCreds +NebiusCreds = AnyNebiusCreds +AnyNebiusFileCreds = NebiusServiceAccountFileCreds + + +class NebiusBackendConfig(CoreModel): + """ + The backend config used in the API, server/config.yml, `NebiusConfigurator`. + It also serves as a base class for other backend config models. + Should not include creds. + """ + + type: Annotated[ + Literal["nebius"], + Field(description="The type of backend"), + ] = "nebius" + projects: Annotated[ + Optional[list[str]], + Field( + description=( + "The list of allowed Nebius project IDs." + " Omit to use the default project in each region." + " The project is considered default if it is the only project in the region" + f" or if its name starts with `{DEFAULT_PROJECT_NAME_PREFIX}`" + ) + ), + ] = None + regions: Annotated[ + Optional[list[str]], + Field(description="The list of allowed Nebius regions. 
Omit to allow all regions"), + ] = None + fabrics: Annotated[ + Optional[list[str]], + Field( + description=( + "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics" + ) + ), + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field( + description="The tags (labels) that will be assigned to resources created by `dstack`" + ), + ] = None + + +class NebiusBackendConfigWithCreds(NebiusBackendConfig): + """ + Same as `NebiusBackendConfig` but also includes creds. + """ + + creds: Annotated[AnyNebiusCreds, Field(description="The credentials")] + + +class NebiusBackendFileConfigWithCreds(NebiusBackendConfig): + creds: AnyNebiusFileCreds = Field(description="The credentials") + + +AnyNebiusBackendConfig = Union[NebiusBackendConfig, NebiusBackendConfigWithCreds] + + +class NebiusStoredConfig(NebiusBackendConfig): + """ + The backend config used for config parameters in the DB. + Can extend `NebiusBackendConfig` with additional parameters. + """ + + pass + + +class NebiusConfig(NebiusStoredConfig): + """ + The backend config used by `NebiusBackend` and `NebiusCompute`. 
+ """ + + creds: AnyNebiusCreds + + +class NebiusOfferBackendData(CoreModel): + fabrics: set[str] = set() diff --git a/src/dstack/_internal/core/backends/nebius/resources.py b/src/dstack/_internal/core/backends/nebius/resources.py new file mode 100644 index 0000000000..c9871f2da8 --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/resources.py @@ -0,0 +1,433 @@ +import logging +import re +import time +from collections import defaultdict +from collections.abc import Container as ContainerT +from collections.abc import Generator, Iterable, Sequence +from contextlib import contextmanager +from tempfile import NamedTemporaryFile +from typing import Dict, Optional + +from nebius.aio.operation import Operation as SDKOperation +from nebius.aio.service_error import RequestError, StatusCode +from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS +from nebius.api.nebius.common.v1 import Operation, ResourceMetadata +from nebius.api.nebius.compute.v1 import ( + AttachedDiskSpec, + CreateDiskRequest, + CreateGpuClusterRequest, + CreateInstanceRequest, + DeleteDiskRequest, + DeleteGpuClusterRequest, + DeleteInstanceRequest, + DiskServiceClient, + DiskSpec, + ExistingDisk, + GetInstanceRequest, + GpuClusterServiceClient, + GpuClusterSpec, + Instance, + InstanceGpuClusterSpec, + InstanceRecoveryPolicy, + InstanceServiceClient, + InstanceSpec, + IPAddress, + NetworkInterfaceSpec, + PreemptibleSpec, + PublicIPAddress, + ResourcesSpec, + SourceImageFamily, +) +from nebius.api.nebius.iam.v1 import ( + Container, + ListProjectsRequest, + ListTenantsRequest, + ProjectServiceClient, + TenantServiceClient, +) +from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient +from nebius.sdk import SDK + +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.nebius.models 
import ( + DEFAULT_PROJECT_NAME_PREFIX, + NebiusOfferBackendData, + NebiusServiceAccountCreds, +) +from dstack._internal.core.errors import BackendError, NoCapacityError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.utils.event_loop import DaemonEventLoop +from dstack._internal.utils.logging import get_logger + +# +# Guidelines on using the Nebius SDK: +# +# Do not use Request.wait() or other sync SDK methods, they suffer from deadlocks. +# Instead, use async methods and await them with LOOP.await_() +LOOP = DaemonEventLoop() +# Pass a timeout to all methods to avoid infinite waiting +REQUEST_TIMEOUT = 10 +# Pass REQUEST_AUTH_OPTIONS to all methods to avoid infinite retries in case of invalid credentials +REQUEST_AUTH_OPTIONS = { + OPTION_RENEW_SYNCHRONOUS: "true", + OPTION_RENEW_REQUEST_TIMEOUT: "5", +} + +# disables log messages about errors such as invalid creds or expired timeouts +logging.getLogger("nebius").setLevel(logging.CRITICAL) +logger = get_logger(__name__) + + +@contextmanager +def wrap_capacity_errors() -> Generator[None, None, None]: + try: + yield + except RequestError as e: + if e.status.code == StatusCode.RESOURCE_EXHAUSTED or "Quota limit exceeded" in str(e): + raise NoCapacityError(e) + raise + + +@contextmanager +def ignore_errors(status_codes: ContainerT[StatusCode]) -> Generator[None, None, None]: + try: + yield + except RequestError as e: + if e.status.code not in status_codes: + raise + + +def make_sdk(creds: NebiusServiceAccountCreds) -> SDK: + with NamedTemporaryFile("w") as f: + f.write(creds.private_key_content) + f.flush() + return SDK( + service_account_private_key_file_name=f.name, + service_account_public_key_id=creds.public_key_id, + service_account_id=creds.service_account_id, + ) + + +def wait_for_operation( + op: SDKOperation[Operation], + timeout: float, + interval: float = 1, +) -> None: + # Re-implementation of SDKOperation.wait() to avoid 
https://github.com/nebius/pysdk/issues/74 + deadline = time.monotonic() + timeout + while not op.done(): + if time.monotonic() + interval > deadline: + raise TimeoutError(f"Operation {op.id} wait timeout") + time.sleep(interval) + LOOP.await_( + op.update(per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS) + ) + + +def get_region_to_project_id_map( + sdk: SDK, configured_regions: Optional[list[str]], configured_project_ids: Optional[list[str]] +) -> dict[str, str]: + """Validate backend settings and build region->project_id map""" + + projects = list_tenant_projects(sdk) + if configured_regions: + validate_regions( + configured=set(configured_regions), available={p.status.region for p in projects} + ) + if configured_project_ids is not None: + return _get_region_to_configured_project_id_map( + projects, configured_project_ids, configured_regions + ) + else: + return _get_region_to_default_project_id_map(projects, configured_regions) + + +def validate_regions(configured: set[str], available: set[str]) -> None: + if invalid := set(configured) - available: + raise_invalid_credentials_error( + fields=[["regions"]], + details=( + f"Configured regions {invalid} do not exist in this Nebius tenancy." 
+ " Omit `regions` to use all regions or select some of the available regions:" + f" {available}" + ), + ) + + +def list_tenant_projects(sdk: SDK) -> Sequence[Container]: + tenants = LOOP.await_( + TenantServiceClient(sdk).list( + ListTenantsRequest(), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + if len(tenants.items) != 1: + raise ValueError(f"Expected to find 1 tenant, found {(len(tenants.items))}") + tenant_id = tenants.items[0].metadata.id + projects = LOOP.await_( + ProjectServiceClient(sdk).list( + ListProjectsRequest(parent_id=tenant_id, page_size=999), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + return projects.items + + +def _get_region_to_default_project_id_map( + all_tenant_projects: Iterable[Container], configured_regions: Optional[list[str]] +) -> dict[str, str]: + region_to_projects: defaultdict[str, list[Container]] = defaultdict(list) + for project in all_tenant_projects: + region_to_projects[project.status.region].append(project) + region_to_project_id = {} + for region, region_projects in region_to_projects.items(): + if configured_regions and region not in configured_regions: + continue + if len(region_projects) != 1: + region_projects = [ + p + for p in region_projects + if p.metadata.name.startswith(DEFAULT_PROJECT_NAME_PREFIX) + ] + if len(region_projects) != 1: + raise_invalid_credentials_error( + ["regions"], + ( + f"Could not find the default project in region {region}." 
+ " Consider setting the `projects` property in backend settings" + ), + ) + region_to_project_id[region] = region_projects[0].metadata.id + return region_to_project_id + + +def _get_region_to_configured_project_id_map( + all_tenant_projects: Iterable[Container], + configured_project_ids: list[str], + configured_regions: Optional[list[str]], +) -> dict[str, str]: + project_id_to_project = {p.metadata.id: p for p in all_tenant_projects} + region_to_project_id = {} + for project_id in configured_project_ids: + project = project_id_to_project.get(project_id) + if project is None: + raise_invalid_credentials_error( + ["projects"], + f"Configured project ID {project_id!r} not found in this Nebius tenancy", + ) + duplicate_project_id = region_to_project_id.get(project.status.region) + if duplicate_project_id: + raise_invalid_credentials_error( + ["projects"], + ( + f"Configured projects {project_id} and {duplicate_project_id}" + f" both belong to the same region {project.status.region}." + " Only one project per region is allowed" + ), + ) + region_to_project_id[project.status.region] = project_id + if configured_regions: + # only filter by region after validating all project IDs + return { + region: project_id + for region, project_id in region_to_project_id.items() + if region in configured_regions + } + return region_to_project_id + + +def get_default_subnet(sdk: SDK, project_id: str) -> Subnet: + subnets = LOOP.await_( + SubnetServiceClient(sdk).list( + ListSubnetsRequest(parent_id=project_id, page_size=999), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + for subnet in subnets.items: + if subnet.metadata.name.startswith("default-subnet"): + return subnet + raise BackendError(f"Could not find default subnet in project {project_id}") + + +def get_all_infiniband_fabrics() -> set[str]: + offers = get_catalog_offers(backend=BackendType.NEBIUS) + result = set() + for offer in offers: + backend_data: NebiusOfferBackendData = 
NebiusOfferBackendData.__response__.parse_obj( + offer.backend_data + ) + result |= backend_data.fabrics + return result + + +def create_disk( + sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str] +) -> SDKOperation[Operation]: + client = DiskServiceClient(sdk) + request = CreateDiskRequest( + metadata=ResourceMetadata( + name=name, + parent_id=project_id, + labels=labels, + ), + spec=DiskSpec( + size_mebibytes=size_mib, + type=DiskSpec.DiskType.NETWORK_SSD, + source_image_family=SourceImageFamily(image_family=image_family), + ), + ) + with wrap_capacity_errors(): + return LOOP.await_( + client.create( + request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS + ) + ) + + +def delete_disk(sdk: SDK, disk_id: str) -> None: + LOOP.await_( + DiskServiceClient(sdk).delete( + DeleteDiskRequest(id=disk_id), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + + +def create_instance( + sdk: SDK, + name: str, + project_id: str, + user_data: str, + platform: str, + preset: str, + cluster_id: Optional[str], + disk_id: str, + subnet_id: str, + preemptible: bool, + labels: Dict[str, str], +) -> SDKOperation[Operation]: + client = InstanceServiceClient(sdk) + request = CreateInstanceRequest( + metadata=ResourceMetadata( + name=name, + parent_id=project_id, + labels=labels, + ), + spec=InstanceSpec( + cloud_init_user_data=user_data, + resources=ResourcesSpec(platform=platform, preset=preset), + gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None, + boot_disk=AttachedDiskSpec( + attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE, + existing_disk=ExistingDisk(id=disk_id), + ), + network_interfaces=[ + NetworkInterfaceSpec( + name="dstack-default-interface", + subnet_id=subnet_id, + ip_address=IPAddress(), + public_ip_address=PublicIPAddress(static=True), + ) + ], + preemptible=PreemptibleSpec( + priority=1, 
on_preemption=PreemptibleSpec.PreemptionPolicy.STOP + ) + if preemptible + else None, + recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None, + ), + ) + with wrap_capacity_errors(): + return LOOP.await_( + client.create( + request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS + ) + ) + + +def get_instance(sdk: SDK, instance_id: str) -> Instance: + return LOOP.await_( + InstanceServiceClient(sdk).get( + GetInstanceRequest(id=instance_id), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + + +def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]: + return LOOP.await_( + InstanceServiceClient(sdk).delete( + DeleteInstanceRequest(id=instance_id), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + + +def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]: + with wrap_capacity_errors(): + return LOOP.await_( + GpuClusterServiceClient(sdk).create( + CreateGpuClusterRequest( + metadata=ResourceMetadata(name=name, parent_id=project_id), + spec=GpuClusterSpec(infiniband_fabric=fabric), + ), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + + +def delete_cluster(sdk: SDK, cluster_id: str) -> None: + LOOP.await_( + GpuClusterServiceClient(sdk).delete( + DeleteGpuClusterRequest(id=cluster_id), + per_retry_timeout=REQUEST_TIMEOUT, + auth_options=REQUEST_AUTH_OPTIONS, + ) + ) + + +def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]: + filtered_labels = {} + for k, v in labels.items(): + if not _is_valid_label(k, v): + logger.warning("Skipping invalid label '%s: %s'", k, v) + continue + filtered_labels[k] = v + return filtered_labels + + +def validate_labels(labels: Dict[str, str]): + for k, v in labels.items(): + if not _is_valid_label(k, v): + raise BackendError("Invalid resource labels") + + +def _is_valid_label(key: str, value: str) -> bool: + # TODO: [Nebius] 
current validation logic reuses GCP's approach. + # There is no public information on Nebius labels restrictions. + return is_valid_resource_name(key) and is_valid_label_value(value) + + +MAX_RESOURCE_NAME_LEN = 63 +NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$") +LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$") + + +def is_valid_resource_name(name: str) -> bool: + if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN: + return False + match = re.match(NAME_PATTERN, name) + return match is not None + + +def is_valid_label_value(value: str) -> bool: + match = re.match(LABEL_VALUE_PATTERN, value) + return match is not None diff --git a/src/dstack/_internal/core/backends/nebius/types.py b/src/dstack/_internal/core/backends/nebius/types.py deleted file mode 100644 index 177a983892..0000000000 --- a/src/dstack/_internal/core/backends/nebius/types.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import TypedDict - - -class ServiceAccount(TypedDict): - id: str - service_account_id: str - created_at: str - key_algorithm: str - public_key: str - private_key: str - - -class ResourcesSpec(TypedDict): - memory: int - cores: int - coreFraction: int - gpus: int - - -class NebiusError(Exception): - pass - - -class ClientError(NebiusError): - pass - - -class ForbiddenError(NebiusError): - pass - - -class NotFoundError(NebiusError): - pass - - -class ConflictError(NebiusError): - pass diff --git a/src/dstack/_internal/core/backends/oci/__init__.py b/src/dstack/_internal/core/backends/oci/__init__.py index 30f36e91c3..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/oci/__init__.py +++ b/src/dstack/_internal/core/backends/oci/__init__.py @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.oci.compute import OCICompute -from dstack._internal.core.backends.oci.config import OCIConfig -from dstack._internal.core.models.backends.base import BackendType - - -class OCIBackend(Backend): - TYPE: BackendType = 
BackendType.OCI - - def __init__(self, config: OCIConfig): - self.config = config - self._compute = OCICompute(self.config) - - def compute(self) -> OCICompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/oci/auth.py b/src/dstack/_internal/core/backends/oci/auth.py index 7404cc5bae..c751c10a35 100644 --- a/src/dstack/_internal/core/backends/oci/auth.py +++ b/src/dstack/_internal/core/backends/oci/auth.py @@ -2,12 +2,11 @@ from typing_extensions import Any, Mapping from dstack._internal.core.backends.oci.exceptions import any_oci_exception -from dstack._internal.core.models.backends.oci import AnyOCICreds, OCIDefaultCreds -from dstack._internal.core.models.common import is_core_model_instance +from dstack._internal.core.backends.oci.models import AnyOCICreds, OCIDefaultCreds def get_client_config(creds: AnyOCICreds) -> Mapping[str, Any]: - if is_core_model_instance(creds, OCIDefaultCreds): + if isinstance(creds, OCIDefaultCreds): return oci.config.from_file(file_location=creds.file, profile_name=creds.profile) return creds.dict(exclude={"type"}) @@ -20,7 +19,3 @@ def creds_valid(creds: AnyOCICreds) -> bool: except any_oci_exception: return False return True - - -def default_creds_available() -> bool: - return creds_valid(OCIDefaultCreds()) diff --git a/src/dstack/_internal/core/backends/oci/backend.py b/src/dstack/_internal/core/backends/oci/backend.py new file mode 100644 index 0000000000..bb9db1c93f --- /dev/null +++ b/src/dstack/_internal/core/backends/oci/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.oci.compute import OCICompute +from dstack._internal.core.backends.oci.models import OCIConfig +from dstack._internal.core.models.backends.base import BackendType + + +class OCIBackend(Backend): + TYPE = BackendType.OCI + COMPUTE_CLASS = OCICompute + + def __init__(self, config: OCIConfig): + self.config = config + self._compute = OCICompute(self.config) + 
+ def compute(self) -> OCICompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/oci/compute.py b/src/dstack/_internal/core/backends/oci/compute.py index 1b7cf342d5..f5a6c8439b 100644 --- a/src/dstack/_internal/core/backends/oci/compute.py +++ b/src/dstack/_internal/core/backends/oci/compute.py @@ -1,13 +1,27 @@ +from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from functools import cached_property from typing import List, Optional import oci -from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data -from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_user_data, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) from dstack._internal.core.backends.oci import resources -from dstack._internal.core.backends.oci.config import OCIConfig +from dstack._internal.core.backends.oci.models import OCIConfig from dstack._internal.core.backends.oci.region import make_region_clients_map from dstack._internal.core.errors import NoCapacityError from dstack._internal.core.models.backends.base import BackendType @@ -16,10 +30,10 @@ InstanceConfiguration, InstanceOffer, InstanceOfferWithAvailability, - SSHKey, ) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements SUPPORTED_SHAPE_FAMILIES = [ "VM.Standard2.", 
@@ -37,10 +51,19 @@ "VM.GPU.A10.", "BM.GPU.A10.", ] +CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.parse("32TB")) -class OCICompute(Compute): +class OCICompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + Compute, +): def __init__(self, config: OCIConfig): + super().__init__() self.config = config self.regions = make_region_clients_map(config.regions or [], config.creds) @@ -48,13 +71,10 @@ def __init__(self, config: OCIConfig): def shapes_quota(self) -> resources.ShapesQuota: return resources.ShapesQuota.load(self.regions, self.config.compartment_id) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.OCI, locations=self.config.regions, - requirements=requirements, extra_filter=_supported_instances, ) @@ -72,28 +92,18 @@ def get_offers( else: availability = InstanceAvailability.NO_QUOTA offers_with_availability.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) + offer.with_availability( + availability=availability, + availability_zones=shapes_availability[offer.region].get( + offer.instance.name, [] + ), + ) ) return offers_with_availability - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), - ssh_keys=[SSHKey(public=project_ssh_public_key.strip())], - job_docker_config=None, - user=run.user, - ) - return self.create_instance(instance_offer, instance_config) + def get_offers_modifiers(self, requirements: 
Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None @@ -105,17 +115,20 @@ def create_instance( self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: region = self.regions[instance_offer.region] - availability_domain = resources.choose_available_domain( - instance_offer.instance.name, self.shapes_quota, region, self.config.compartment_id - ) - if availability_domain is None: + if not instance_offer.availability_zones: raise NoCapacityError("Shape unavailable in all availability domains") + availability_domain = instance_offer.availability_zones[0] listing, package = resources.get_marketplace_listing_and_package( - cuda=len(instance_offer.instance.resources.gpus) > 0, + gpu_name=( + instance_offer.instance.resources.gpus[0].name + if len(instance_offer.instance.resources.gpus) > 0 + else None + ), client=region.marketplace_client, ) resources.accept_marketplace_listing_agreements( @@ -135,12 +148,12 @@ def create_instance( security_group.id, region.virtual_network_client ) - setup_commands = [ - f"sudo iptables -I INPUT -s {resources.VCN_CIDR} -j ACCEPT", - "sudo netfilter-persistent save", - ] - cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands) + cloud_init_user_data = get_user_data( + authorized_keys=instance_config.get_public_keys(), + firewall_allow_from_subnets=[resources.VCN_CIDR], + ) + display_name = generate_unique_instance_name(instance_config) try: instance = resources.launch_instance( region=region, @@ -148,7 +161,7 @@ def create_instance( compartment_id=self.config.compartment_id, subnet_id=subnet.id, security_group_id=security_group.id, - display_name=instance_config.instance_name, + display_name=display_name, 
cloud_init_user_data=cloud_init_user_data, shape=instance_offer.instance.name, is_spot=instance_offer.instance.resources.spot, @@ -156,7 +169,7 @@ def create_instance( image_id=package.image_id, ) except oci.exceptions.ServiceError as e: - if e.code in ("LimitExceeded", "QuotaExceeded"): + if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message: raise NoCapacityError(e.message) raise @@ -167,6 +180,7 @@ def create_instance( hostname=None, internal_ip=None, region=instance_offer.region, + availability_zone=availability_domain, price=instance_offer.price, username="ubuntu", ssh_port=22, diff --git a/src/dstack/_internal/core/backends/oci/config.py b/src/dstack/_internal/core/backends/oci/config.py deleted file mode 100644 index 9a272406b1..0000000000 --- a/src/dstack/_internal/core/backends/oci/config.py +++ /dev/null @@ -1,6 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.oci import AnyOCICreds, OCIStoredConfig - - -class OCIConfig(OCIStoredConfig, BackendConfig): - creds: AnyOCICreds diff --git a/src/dstack/_internal/core/backends/oci/configurator.py b/src/dstack/_internal/core/backends/oci/configurator.py new file mode 100644 index 0000000000..4558e8bf96 --- /dev/null +++ b/src/dstack/_internal/core/backends/oci/configurator.py @@ -0,0 +1,156 @@ +import json +from typing import Dict, Iterable, List, Set, Tuple + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.oci import resources +from dstack._internal.core.backends.oci.backend import OCIBackend +from dstack._internal.core.backends.oci.exceptions import any_oci_exception +from dstack._internal.core.backends.oci.models import ( + OCIBackendConfig, + OCIBackendConfigWithCreds, + OCIConfig, + OCICreds, + OCIDefaultCreds, + OCIStoredConfig, +) +from dstack._internal.core.backends.oci.region 
import ( + get_subscribed_regions, + make_region_client, + make_region_clients_map, +) +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import ( + BackendType, +) + +# where dstack images are published +SUPPORTED_REGIONS = frozenset( + [ + "eu-frankfurt-1", + "eu-milan-1", + "me-dubai-1", + "uk-london-1", + "us-ashburn-1", + "us-chicago-1", + "us-phoenix-1", + ] +) + + +class OCIConfigurator( + Configurator[ + OCIBackendConfig, + OCIBackendConfigWithCreds, + ] +): + TYPE = BackendType.OCI + BACKEND_CLASS = OCIBackend + + def validate_config(self, config: OCIBackendConfigWithCreds, default_creds_enabled: bool): + if isinstance(config.creds, OCIDefaultCreds) and not default_creds_enabled: + raise_invalid_credentials_error( + fields=[["creds"]], + details="Default credentials are forbidden by dstack settings", + ) + try: + get_subscribed_regions(config.creds).names + except any_oci_exception as e: + raise_invalid_credentials_error(fields=[["creds"]], details=e) + + def create_backend( + self, project_name: str, config: OCIBackendConfigWithCreds + ) -> BackendRecord: + try: + subscribed_regions = get_subscribed_regions(config.creds) + except any_oci_exception as e: + raise_invalid_credentials_error(fields=[["creds"]], details=e) + + if config.regions is None: + config.regions = _filter_supported_regions(subscribed_regions.names) + else: + _raise_if_regions_unavailable(config.regions, subscribed_regions.names) + + compartment_id, subnet_ids_per_region = _create_resources( + project_name, config, subscribed_regions.home_region_name + ) + config.compartment_id = compartment_id + stored_config = OCIStoredConfig.__response__( + **config.dict(), subnet_ids_per_region=subnet_ids_per_region + ) + + return BackendRecord( + config=stored_config.json(), + auth=OCICreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> OCIBackendConfigWithCreds: + config = 
self._get_config(record) + return OCIBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> OCIBackendConfig: + config = self._get_config(record) + return OCIBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> OCIBackend: + config = self._get_config(record) + return OCIBackend(config=config) + + def _get_config(self, record: BackendRecord) -> OCIConfig: + return OCIConfig.__response__( + **json.loads(record.config), + creds=OCICreds.parse_raw(record.auth).__root__, + ) + + +def _filter_supported_regions(subscribed_region_names: Set[str]) -> List[str]: + available_regions = subscribed_region_names & SUPPORTED_REGIONS + if not available_regions: + msg = ( + f"None of your subscribed regions {subscribed_region_names} are supported " + "by dstack yet. Please subscribe to a supported region in OCI Console or " + "contact dstack if you need a specific region to become supported. " + f"Currently supported regions are: {set(SUPPORTED_REGIONS)}" + ) + raise ServerClientError(msg) + return list(available_regions) + + +def _raise_if_regions_unavailable( + region_names: Iterable[str], subscribed_region_names: Set[str] +) -> None: + region_names = set(region_names) + if unsupported_regions := region_names - SUPPORTED_REGIONS: + msg = ( + f"Regions {unsupported_regions} are configured but not supported by dstack yet. " + f"Only these regions are supported: {set(SUPPORTED_REGIONS)}. " + "Please contact dstack if a region you need is missing." 
+ ) + raise ServerClientError(msg, fields=[["regions"]]) + if unsubscribed_regions := region_names - subscribed_region_names: + msg = f"Regions {unsubscribed_regions} are configured but not subscribed to in OCI" + raise ServerClientError(msg, fields=[["regions"]]) + + +def _create_resources( + project_name: str, config: OCIBackendConfigWithCreds, home_region: str +) -> Tuple[str, Dict[str, str]]: + compartment_id = config.compartment_id + if not compartment_id: + home_region_client = make_region_client(home_region, config.creds) + compartment_id = resources.get_or_create_compartment( + f"dstack-{project_name}", + home_region_client.client_config["tenancy"], + home_region_client.identity_client, + ).id + + region_clients = make_region_clients_map(config.regions, config.creds) + resources.wait_until_compartment_active(compartment_id, region_clients) + subnets_per_region = resources.set_up_network_resources( + compartment_id, project_name, region_clients + ) + + return compartment_id, subnets_per_region diff --git a/src/dstack/_internal/core/backends/oci/models.py b/src/dstack/_internal/core/backends/oci/models.py new file mode 100644 index 0000000000..12ce4e9f91 --- /dev/null +++ b/src/dstack/_internal/core/backends/oci/models.py @@ -0,0 +1,87 @@ +from typing import Annotated, Dict, List, Literal, Optional, Union + +from pydantic import Field, root_validator + +from dstack._internal.core.models.common import CoreModel + + +class OCIClientCreds(CoreModel): + type: Annotated[Literal["client"], Field(description="The type of credentials")] = "client" + user: Annotated[str, Field(description="User OCID")] + tenancy: Annotated[str, Field(description="Tenancy OCID")] + key_file: Annotated[ + Optional[str], + Field( + description="Path to the user's private PEM key. Either this or `key_content` should be set" + ), + ] + key_content: Annotated[ + Optional[str], + Field( + description="Content of the user's private PEM key. 
Either this or `key_file` should be set" + ), + ] + pass_phrase: Annotated[ + Optional[str], Field(description="Passphrase for the private PEM key if it is encrypted") + ] + fingerprint: Annotated[str, Field(description="User's public key fingerprint")] + region: Annotated[ + str, Field(description="Name or key of any region the tenancy is subscribed to") + ] + + @root_validator + def key_file_xor_key_content(cls, values): + key_file, key_content = values["key_file"], values["key_content"] + if key_file and key_content: + raise ValueError("key_file and key_content are mutually exclusive") + if not key_file and not key_content: + raise ValueError("Either key_file or key_content should be set") + return values + + +class OCIDefaultCreds(CoreModel): + type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" + file: Annotated[str, Field(description="Path to the OCI CLI-compatible config file")] = ( + "~/.oci/config" + ) + profile: Annotated[str, Field(description="Profile to load from the config file")] = "DEFAULT" + + +AnyOCICreds = Union[OCIClientCreds, OCIDefaultCreds] + + +class OCICreds(CoreModel): + __root__: AnyOCICreds = Field(..., discriminator="type") + + +class OCIBackendConfig(CoreModel): + type: Annotated[Literal["oci"], Field(description="The type of backend")] = "oci" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of OCI regions. Omit to use all regions"), + ] = None + compartment_id: Annotated[ + Optional[str], + Field( + description=( + "Compartment where `dstack` will create all resources." 
+ " Omit to instruct `dstack` to create a new compartment" + ) + ), + ] = None + + +class OCIBackendConfigWithCreds(OCIBackendConfig): + creds: Annotated[AnyOCICreds, Field(description="The credentials", discriminator="type")] + + +AnyOCIBackendConfig = Union[OCIBackendConfig, OCIBackendConfigWithCreds] + + +class OCIStoredConfig(OCIBackendConfig): + compartment_id: str + subnet_ids_per_region: Dict[str, str] + + +class OCIConfig(OCIStoredConfig): + creds: AnyOCICreds diff --git a/src/dstack/_internal/core/backends/oci/region.py b/src/dstack/_internal/core/backends/oci/region.py index 48022c0f57..806ed76fd1 100644 --- a/src/dstack/_internal/core/backends/oci/region.py +++ b/src/dstack/_internal/core/backends/oci/region.py @@ -5,7 +5,7 @@ import oci from dstack._internal.core.backends.oci.auth import get_client_config -from dstack._internal.core.models.backends.oci import AnyOCICreds +from dstack._internal.core.backends.oci.models import AnyOCICreds class OCIRegionClient: diff --git a/src/dstack/_internal/core/backends/oci/resources.py b/src/dstack/_internal/core/backends/oci/resources.py index b889d509f3..91e1d890f0 100644 --- a/src/dstack/_internal/core/backends/oci/resources.py +++ b/src/dstack/_internal/core/backends/oci/resources.py @@ -22,11 +22,13 @@ import oci from oci.object_storage.models import CreatePreauthenticatedRequestDetails -from dstack import version +from dstack._internal import settings +from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules from dstack._internal.core.backends.oci.region import OCIRegionClient +from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES from dstack._internal.core.errors import BackendError from dstack._internal.core.models.instances import InstanceOffer -from dstack._internal.utils.common import split_chunks +from dstack._internal.utils.common import batched from dstack._internal.utils.logging import get_logger logger = 
get_logger(__name__) @@ -203,34 +205,29 @@ def check_availability_in_domain( return available -def check_availability_in_region( +def check_availability_per_domain( shape_names: Iterable[str], shapes_quota: ShapesQuota, region: OCIRegionClient, compartment_id: str, -) -> Set[str]: - """ - Returns a subset of `shape_names` with only the shapes available in at least - one availability domain within `region`. - """ - +) -> Dict[str, Set[str]]: all_shapes = set(shape_names) - available_shapes = set() + available_shapes_per_domain = {} for availability_domain in region.availability_domains: shapes_to_check = { shape - for shape in all_shapes - available_shapes + for shape in all_shapes if shapes_quota.is_within_domain_quota(shape, availability_domain.name) } - available_shapes |= check_availability_in_domain( + available_shapes_per_domain[availability_domain.name] = check_availability_in_domain( shape_names=shapes_to_check, availability_domain_name=availability_domain.name, client=region.compute_client, compartment_id=compartment_id, ) - return available_shapes + return available_shapes_per_domain def get_shapes_availability( @@ -239,12 +236,11 @@ def get_shapes_availability( regions: Mapping[str, OCIRegionClient], compartment_id: str, executor: Executor, -) -> Dict[str, Set[str]]: +) -> Dict[str, Dict[str, List[str]]]: """ - Returns a mapping of region names to sets of shape names available in these - regions. Only shapes from `offers` are checked. + Returns availability domains where shapes are available as regions->shapes->availability_domains mapping. + Only shapes from `offers` are checked. 
""" - shape_names_per_region = {region: set() for region in regions} for offer in offers: if shapes_quota.is_within_region_quota(offer.instance.name, offer.region): @@ -253,7 +249,7 @@ def get_shapes_availability( future_to_region_name = {} for region_name, shape_names in shape_names_per_region.items(): future = executor.submit( - check_availability_in_region, + check_availability_per_domain, shape_names, shapes_quota, regions[region_name], @@ -263,29 +259,32 @@ def get_shapes_availability( result = {} for future in as_completed(future_to_region_name): - region_name = future_to_region_name[future] - result[region_name] = future.result() + domains_to_shape_names = future.result() + shape_names_to_domains = {} + for domain, shape_names in domains_to_shape_names.items(): + for shape_name in shape_names: + shape_names_to_domains.setdefault(shape_name, []).append(domain) + result[future_to_region_name[future]] = shape_names_to_domains return result -def choose_available_domain( +def get_available_domains( shape_name: str, shapes_quota: ShapesQuota, region: OCIRegionClient, compartment_id: str -) -> Optional[str]: +) -> List[str]: """ - Returns the name of any availability domain within `region` in which - `shape_name` is available. None if the shape is unavailable or not within - `shapes_quota` in all domains. + Returns the names of all availability domains in `region` in which + `shape_name` is available and within `shapes_quota`. 
""" - + domains = [] for domain in region.availability_domains: if shapes_quota.is_within_domain_quota( shape_name, domain.name ) and check_availability_in_domain( {shape_name}, domain.name, region.compute_client, compartment_id ): - return domain.name - return None + domains.append(domain.name) + return domains def get_instance_vnic( @@ -355,11 +354,15 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st def get_marketplace_listing_and_package( - cuda: bool, client: oci.marketplace.MarketplaceClient + gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient ) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]: - listing_name = f"dstack-{version.base_image}" - if cuda: - listing_name = f"dstack-cuda-{version.base_image}" + prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX + listing_name = f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + if gpu_name is not None: + if not requires_nvidia_proprietary_kernel_modules(gpu_name): + listing_name = f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + else: + listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" listing_summaries = list_marketplace_listings(listing_name, client) if len(listing_summaries) != 1: @@ -670,21 +673,21 @@ def add_security_group_rules( security_group_id: str, rules: Iterable[SecurityRule], client: oci.core.VirtualNetworkClient ) -> None: rules_details = map(SecurityRule.to_sdk_add_rule_details, rules) - for chunk in split_chunks(rules_details, ADD_SECURITY_RULES_MAX_CHUNK_SIZE): + for batch in batched(rules_details, ADD_SECURITY_RULES_MAX_CHUNK_SIZE): client.add_network_security_group_security_rules( security_group_id, - oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=chunk), + oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=batch), ) def remove_security_group_rules( security_group_id: str, rule_ids: Iterable[str], 
client: oci.core.VirtualNetworkClient ) -> None: - for chunk in split_chunks(rule_ids, REMOVE_SECURITY_RULES_MAX_CHUNK_SIZE): + for batch in batched(rule_ids, REMOVE_SECURITY_RULES_MAX_CHUNK_SIZE): client.remove_network_security_group_security_rules( security_group_id, oci.core.models.RemoveNetworkSecurityGroupSecurityRulesDetails( - security_rule_ids=chunk + security_rule_ids=batch ), ) @@ -726,6 +729,12 @@ def create_pre_authenticated_request( def delete_bucket( namespace: str, bucket_name: str, client: oci.object_storage.ObjectStorageClient ) -> None: + in_progress_uploads: Iterable[oci.object_storage.models.MultipartUpload] = ( + chain_paginated_responses(client.list_multipart_uploads, namespace, bucket_name) + ) + for upload in in_progress_uploads: + client.abort_multipart_upload(namespace, bucket_name, upload.object, upload.upload_id) + par_ids = { par.id for par in chain_paginated_responses( diff --git a/src/dstack/_internal/core/backends/profile_options.py b/src/dstack/_internal/core/backends/profile_options.py new file mode 100644 index 0000000000..cc2ae99562 --- /dev/null +++ b/src/dstack/_internal/core/backends/profile_options.py @@ -0,0 +1,5 @@ +from dstack._internal.core.backends.vastai.profile_options import VastAIProfileOptions + +# TODO: when adding options for the first VM-based backend, +# implement the logic to check idle instances against backend options before reusing. 
+AnyBackendProfileOptions = VastAIProfileOptions diff --git a/src/dstack/_internal/core/backends/remote/provisioning.py b/src/dstack/_internal/core/backends/remote/provisioning.py deleted file mode 100644 index f4faecc656..0000000000 --- a/src/dstack/_internal/core/backends/remote/provisioning.py +++ /dev/null @@ -1,248 +0,0 @@ -import io -import json -import time -from contextlib import contextmanager -from typing import Any, Dict, Generator, List - -import paramiko - -from dstack._internal.core.errors import ProvisioningError -from dstack._internal.core.models.instances import ( - Disk, - Gpu, - InstanceType, - Resources, -) -from dstack._internal.utils.gpu import convert_gpu_name -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -SSH_CONNECT_TIMEOUT = 10 - -DSTACK_SHIM_ENV_FILE = "dstack-shim.env" - - -def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None: - try: - sftp = client.open_sftp() - channel = sftp.get_channel() - if channel is not None: - channel.settimeout(10) - sftp.putfo(io.BytesIO(body.encode()), path) - sftp.close() - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"sft_upload failed: {e}") from e - - -def upload_envs(client: paramiko.SSHClient, working_dir: str, envs: Dict[str, str]) -> None: - envs["DSTACK_SERVICE_MODE"] = "1" # make host_info.json on start - dot_env = "\n".join(f'{key.upper()}="{value.strip()}"' for key, value in envs.items()) - tmp_file_path = f"/tmp/{DSTACK_SHIM_ENV_FILE}" - sftp_upload(client, tmp_file_path, dot_env) - try: - cmd = f"sudo mkdir -p {working_dir} && sudo mv {tmp_file_path} {working_dir}/" - _, stdout, stderr = client.exec_command(cmd, timeout=20) - out = stdout.read().strip().decode() - err = stderr.read().strip().decode() - if out or err: - raise ProvisioningError( - f"The command 'upload_envs' didn't work. 
stdout: {out}, stderr: {err}" - ) - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"upload_envs failed: {e}") from e - - -def run_pre_start_commands( - client: paramiko.SSHClient, shim_pre_start_commands: List[str], authorized_keys: List[str] -) -> None: - try: - authorized_keys_content = "\n".join(authorized_keys).strip() - _, stdout, stderr = client.exec_command( - f"echo '\n{authorized_keys_content}' >> ~/.ssh/authorized_keys", timeout=5 - ) - out = stdout.read().strip().decode() - err = stderr.read().strip().decode() - if out or err: - raise ProvisioningError( - f"The command 'authorized_keys' didn't work. stdout: {out}, stderr: {err}" - ) - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"upload authorized_keys failed: {e}") from e - - script = " && ".join(shim_pre_start_commands) - try: - _, stdout, stderr = client.exec_command(f"sudo sh -c '{script}'", timeout=120) - out = stdout.read().strip().decode() - err = stderr.read().strip().decode() - if out or err: - raise ProvisioningError( - f"The command 'run_pre_start_commands' didn't work. 
stdout: {out}, stderr: {err}" - ) - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"run_pre-start_commands failed: {e}") from e - - -def run_shim_as_systemd_service(client: paramiko.SSHClient, working_dir: str, dev: bool) -> None: - dev_flag = "--dev" if dev else "" - shim_service = f"""\ - [Unit] - Description=dstack-shim - After=network.target - - [Service] - Type=simple - User=root - Restart=always - WorkingDirectory={working_dir} - EnvironmentFile={working_dir}/{DSTACK_SHIM_ENV_FILE} - ExecStart=/usr/local/bin/dstack-shim {dev_flag} docker --keep-container - StandardOutput=append:/root/.dstack/shim.log - StandardError=append:/root/.dstack/shim.log - - [Install] - WantedBy=multi-user.target - """ - - stripped_shim_service = "\n".join(line.strip() for line in shim_service.splitlines()) - sftp_upload(client, "/tmp/dstack-shim.service", stripped_shim_service) - - try: - cmd = """\ - sudo mv /tmp/dstack-shim.service /etc/systemd/system/dstack-shim.service && \ - sudo systemctl daemon-reload && \ - sudo systemctl --quiet enable dstack-shim && \ - sudo systemctl restart dstack-shim - """ - _, stdout, stderr = client.exec_command(cmd, timeout=100) - out = stdout.read().strip().decode() - err = stderr.read().strip().decode() - if out or err: - raise ProvisioningError( - f"The command 'run_shim_as_systemd_service' didn't work. 
stdout: {out}, stderr: {err}" - ) - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"run_shim_as_systemd failed: {e}") from e - - -def check_dstack_shim_service(client: paramiko.SSHClient): - try: - _, stdout, _ = client.exec_command("sudo systemctl status dstack-shim.service", timeout=10) - status = stdout.read() - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"Checking dstack-shim.service status failed: {e}") from e - - for raw_line in status.splitlines(): - line = raw_line.decode() - if line.strip().startswith("Active: failed"): - raise ProvisioningError(f"The dstack-shim service doesn't start: {line.strip()}") - - -def get_host_info(client: paramiko.SSHClient, working_dir: str) -> Dict[str, Any]: - # wait host_info - retries = 60 - iter_delay = 3 - for _ in range(retries): - try: - _, stdout, stderr = client.exec_command( - f"sudo cat {working_dir}/host_info.json", timeout=10 - ) - err = stderr.read().decode().strip() - if err: - logger.debug("Retry after error: %s", err) - time.sleep(iter_delay) - continue - except (paramiko.SSHException, OSError) as e: - logger.debug("Cannot run `cat host_info.json` in the remote instance: %s", e) - else: - try: - host_info_json = stdout.read() - host_info = json.loads(host_info_json) - return host_info - except ValueError: # JSON parse error - check_dstack_shim_service(client) - raise ProvisioningError("Cannot parse host_info") - time.sleep(iter_delay) - else: - check_dstack_shim_service(client) - raise ProvisioningError("Cannot get host_info") - - -def get_shim_healthcheck(client: paramiko.SSHClient) -> str: - retries = 20 - iter_delay = 3 - for _ in range(retries): - try: - _, stdout, stderr = client.exec_command( - "curl -s http://localhost:10998/api/healthcheck", timeout=15 - ) - out = stdout.read().strip().decode() - err = stderr.read().strip().decode() - if err: - raise ProvisioningError( - f"The command 'get_shim_healthcheck' didn't work. 
stdout: {out}, stderr: {err}" - ) - if not out: - logger.debug("healthcheck is empty. retry") - time.sleep(iter_delay) - continue - return out - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e - - -def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType: - gpu_name = convert_gpu_name(host_info["gpu_name"]) - if host_info.get("gpu_count", 0): - gpu_memory = int(host_info["gpu_memory"].lower().replace("mib", "").strip()) - gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory)] * host_info["gpu_count"] - else: - gpus = [] - instance_type = InstanceType( - name="instance", - resources=Resources( - cpus=host_info["cpus"], - memory_mib=host_info["memory"] / 1024 / 1024, - spot=False, - gpus=gpus, - disk=Disk(size_mib=host_info["disk_size"] / 1024 / 1024), - ), - ) - return instance_type - - -@contextmanager -def get_paramiko_connection( - ssh_user: str, host: str, port: int, pkeys: List[paramiko.PKey] -) -> Generator[paramiko.SSHClient, None, None]: - with paramiko.SSHClient() as client: - client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - for pkey in pkeys: - conn_url = f"{ssh_user}@{host}:{port}" - try: - logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint) - client.connect( - username=ssh_user, - hostname=host, - port=port, - pkey=pkey, - look_for_keys=False, - allow_agent=False, - timeout=SSH_CONNECT_TIMEOUT, - ) - except paramiko.AuthenticationException: - logger.debug( - f'Authentication faild to connect to "{conn_url}" and {pkey.fingerprint}' - ) - continue # try next key - except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"Connect failed: {e}") from e - else: - yield client - return - else: - keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys) - raise ProvisioningError( - f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful" - ) diff --git a/src/dstack/_internal/core/backends/runpod/__init__.py 
b/src/dstack/_internal/core/backends/runpod/__init__.py index 3a0770ee81..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/runpod/__init__.py +++ b/src/dstack/_internal/core/backends/runpod/__init__.py @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.runpod.compute import RunpodCompute -from dstack._internal.core.backends.runpod.config import RunpodConfig -from dstack._internal.core.models.backends.base import BackendType - - -class RunpodBackend(Backend): - TYPE: BackendType = BackendType.RUNPOD - - def __init__(self, config: RunpodConfig): - self.config = config - self._compute = RunpodCompute(self.config) - - def compute(self) -> RunpodCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/runpod/api_client.py b/src/dstack/_internal/core/backends/runpod/api_client.py index 88df1bbb3e..2feae83ca7 100644 --- a/src/dstack/_internal/core/backends/runpod/api_client.py +++ b/src/dstack/_internal/core/backends/runpod/api_client.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional import requests +from gpuhunt.providers.runpod import RunpodProvider from requests import Response from dstack._internal.core.errors import BackendError, BackendInvalidCredentialsError @@ -11,9 +12,18 @@ API_URL = "https://api.runpod.io/graphql" +class RunpodApiClientError(BackendError): + errors: List[Dict] + + def __init__(self, errors: List[Dict]): + self.errors = errors + super().__init__(errors) + + class RunpodApiClient: def __init__(self, api_key: str): - self.api_key = api_key + self._session = requests.Session() + self._session.headers.update({"Authorization": f"Bearer {api_key}"}) def validate_api_key(self) -> bool: try: @@ -23,7 +33,19 @@ def validate_api_key(self) -> bool: return True def get_user_details(self) -> Dict: - resp = self._make_request({"query": user_details_query, "variable": {}}) + resp = self._make_request( + { + "query": """ + query 
myself { + myself { + id + authId + email + } + } + """ + } + ) return resp.json() def create_pod( @@ -31,7 +53,7 @@ def create_pod( name: str, image_name: str, gpu_type_id: str, - cloud_type: str = "ALL", + cloud_type: str, support_public_ip: bool = True, start_ssh: bool = True, data_center_id: Optional[str] = None, @@ -43,61 +65,98 @@ def create_pod( min_memory_in_gb: int = 1, docker_args: str = "", ports: Optional[str] = None, - volume_mount_path: str = "/runpod-volume", + volume_mount_path: Optional[str] = None, env: Optional[Dict[str, Any]] = None, template_id: Optional[str] = None, network_volume_id: Optional[str] = None, allowed_cuda_versions: Optional[List[str]] = None, bid_per_gpu: Optional[float] = None, + container_registry_auth_id: Optional[str] = None, ) -> Dict: resp = self._make_request( { - "query": generate_pod_deployment_mutation( - name, - image_name, - gpu_type_id, - cloud_type, - support_public_ip, - start_ssh, - data_center_id, - country_code, - gpu_count, - volume_in_gb, - container_disk_in_gb, - min_vcpu_count, - min_memory_in_gb, - docker_args, - ports, - volume_mount_path, - env, - template_id, - network_volume_id, - allowed_cuda_versions, - bid_per_gpu, + "query": _generate_pod_deployment_mutation( + name=name, + image_name=image_name, + gpu_type_id=gpu_type_id, + cloud_type=cloud_type, + support_public_ip=support_public_ip, + start_ssh=start_ssh, + data_center_id=data_center_id, + country_code=country_code, + gpu_count=gpu_count, + volume_in_gb=volume_in_gb, + container_disk_in_gb=container_disk_in_gb, + min_vcpu_count=min_vcpu_count, + min_memory_in_gb=min_memory_in_gb, + docker_args=docker_args, + ports=ports, + volume_mount_path=volume_mount_path, + env=env, + template_id=template_id, + network_volume_id=network_volume_id, + allowed_cuda_versions=allowed_cuda_versions, + bid_per_gpu=bid_per_gpu, + container_registry_auth_id=container_registry_auth_id, ) } ) data = resp.json()["data"] return data["podRentInterruptable"] if bid_per_gpu 
else data["podFindAndDeployOnDemand"] - def edit_pod( + def create_cpu_pod( self, - pod_id: str, + name: str, image_name: str, - container_disk_in_gb: int, + instance_id: str, + cloud_type: str, + deploy_cost: float, + start_ssh: bool = True, + data_center_id: Optional[str] = None, + container_disk_in_gb: Optional[int] = None, + docker_args: Optional[str] = None, + ports: Optional[str] = None, + volume_mount_path: Optional[str] = None, + env: Optional[Dict[str, Any]] = None, + template_id: Optional[str] = None, + network_volume_id: Optional[str] = None, + container_registry_auth_id: Optional[str] = None, + ) -> Dict: + resp = self._make_request( + { + "query": _generate_cpu_pod_deployment_mutation( + name=name, + image_name=image_name, + instance_id=instance_id, + cloud_type=cloud_type, + deploy_cost=deploy_cost, + start_ssh=start_ssh, + data_center_id=data_center_id, + container_disk_in_gb=container_disk_in_gb, + docker_args=docker_args, + ports=ports, + volume_mount_path=volume_mount_path, + env=env, + template_id=template_id, + network_volume_id=network_volume_id, + container_registry_auth_id=container_registry_auth_id, + ) + } + ) + return resp.json()["data"]["deployCpuPod"] + + def update_pod_container_registry_auth( + self, + pod_id: str, container_registry_auth_id: str, - volume_in_gb: int = 0, - ) -> int: + ) -> str: resp = self._make_request( { "query": f""" mutation {{ podEditJob(input: {{ podId: "{pod_id}" - imageName: "{image_name}" - containerDiskInGb: {container_disk_in_gb} containerRegistryAuthId: "{container_registry_auth_id}" - volumeInGb: {volume_in_gb} }}) {{ id }} @@ -108,12 +167,12 @@ def edit_pod( return resp.json()["data"]["podEditJob"]["id"] def get_pod(self, pod_id: str) -> Dict: - resp = self._make_request({"query": generate_pod_query(pod_id)}) + resp = self._make_request({"query": _generate_pod_query(pod_id)}) data = resp.json() return data["data"]["pod"] def terminate_pod(self, pod_id: str) -> Dict: - resp = self._make_request({"query": 
generate_pod_terminate_mutation(pod_id)}) + resp = self._make_request({"query": _generate_pod_terminate_mutation(pod_id)}) data = resp.json() return data["data"] @@ -134,7 +193,7 @@ def get_container_registry_auths(self) -> List[Dict]: ) return resp.json()["data"]["myself"]["containerRegistryCreds"] - def add_container_registry_auth(self, name: str, username: str, password: str) -> int: + def add_container_registry_auth(self, name: str, username: str, password: str) -> str: resp = self._make_request( { "query": f""" @@ -167,19 +226,139 @@ def delete_container_registry_auth(self, auth_id: str) -> None: } ) - def _make_request(self, data: Any = None) -> Response: + def get_network_volume(self, volume_id: str) -> Optional[Dict]: + response = self._make_request( + { + "query": """ + query getMyVolumes { + myself { + networkVolumes { + id, + name, + size, + dataCenter { + id + name + } + } + } + } + """ + } + ) + network_volumes = response.json()["data"]["myself"]["networkVolumes"] + for vol in network_volumes: + if vol["id"] == volume_id: + return vol + return None + + def create_network_volume(self, name: str, region: str, size: int) -> str: + response = self._make_request( + { + "query": f""" + mutation {{ + createNetworkVolume( + input: {{ + name: "{name}", + size: {size}, + dataCenterId: "{region}" + }} + ) {{ + id + }} + }} + """ + } + ) + return response.json()["data"]["createNetworkVolume"]["id"] + + def delete_network_volume(self, volume_id: str) -> None: + self._make_request( + { + "query": f""" + mutation {{ + deleteNetworkVolume( + input: {{ + id: "{volume_id}" + }} + ) + }} + """ + } + ) + + def create_cluster( + self, + cluster_name: str, + gpu_type_id: str, + pod_count: int, + gpu_count_per_pod: int, + image_name: str, + deploy_cost: str, + template_id: Optional[str] = None, + cluster_type: str = "TRAINING", + network_volume_id: Optional[str] = None, + volume_in_gb: Optional[int] = None, + throughput: Optional[int] = None, + allowed_cuda_versions: 
Optional[List[str]] = None, + volume_key: Optional[str] = None, + data_center_id: Optional[str] = None, + start_jupyter: bool = False, + start_ssh: bool = False, + container_disk_in_gb: Optional[int] = None, + docker_args: Optional[str] = None, + env: Optional[Dict[str, Any]] = None, + volume_mount_path: Optional[str] = None, + ports: Optional[str] = None, + ) -> Dict: + resp = self._make_request( + { + "query": _generate_create_cluster_mutation( + cluster_name=cluster_name, + gpu_type_id=gpu_type_id, + pod_count=pod_count, + gpu_count_per_pod=gpu_count_per_pod, + image_name=image_name, + cluster_type=cluster_type, + deploy_cost=deploy_cost, + template_id=template_id, + network_volume_id=network_volume_id, + volume_in_gb=volume_in_gb, + throughput=throughput, + allowed_cuda_versions=allowed_cuda_versions, + volume_key=volume_key, + data_center_id=data_center_id, + start_jupyter=start_jupyter, + start_ssh=start_ssh, + container_disk_in_gb=container_disk_in_gb, + docker_args=docker_args, + env=env, + volume_mount_path=volume_mount_path, + ports=ports, + ) + } + ) + data = resp.json()["data"] + return data["createCluster"] + + def delete_cluster(self, cluster_id: str) -> bool: + resp = self._make_request({"query": _generate_delete_cluster_mutation(cluster_id)}) + data = resp.json()["data"] + return data["deleteCluster"] + + def _make_request(self, data: Optional[Dict[str, Any]] = None) -> Response: try: - # TODO: fix S113 by setting an adequate timeout here or in every method - response = requests.request( # noqa: S113 + response = self._session.request( method="POST", - url=f"{API_URL}?api_key={self.api_key}", + url=API_URL, json=data, + timeout=120, ) response.raise_for_status() - if "errors" in response.json(): - if "podTerminate" in response.json()["errors"][0]["path"]: - raise BackendError("Instance Not Found") - raise BackendError(response.json()["errors"][0]["message"]) + response_json = response.json() + # Runpod returns 200 on client errors + if "errors" in 
response_json: + raise RunpodApiClientError(errors=response_json["errors"]) return response except requests.HTTPError as e: if e.response is not None and e.response.status_code in ( @@ -189,7 +368,7 @@ def _make_request(self, data: Any = None) -> Response: raise BackendInvalidCredentialsError(e.response.text) raise - def wait_for_instance(self, instance_id) -> Optional[Dict]: + def wait_for_instance(self, instance_id: str) -> Optional[Dict]: start = get_current_datetime() wait_for_instance_interval = 5 # To change the status to "running," the image must be pulled and then started. @@ -202,18 +381,7 @@ def wait_for_instance(self, instance_id) -> Optional[Dict]: return -user_details_query = """ -query myself { - myself { - id - authId - email - } -} -""" - - -def generate_pod_query(pod_id: str) -> str: +def _generate_pod_query(pod_id: str) -> str: """ Generate a query for a specific GPU type """ @@ -222,6 +390,7 @@ def generate_pod_query(pod_id: str) -> str: query pod {{ pod(input: {{podId: "{pod_id}"}}) {{ id + clusterIp containerDiskInGb costPerHr desiredStatus @@ -258,51 +427,46 @@ def generate_pod_query(pod_id: str) -> str: """ -def generate_pod_deployment_mutation( +def _generate_pod_deployment_mutation( name: str, image_name: str, gpu_type_id: str, - cloud_type: str = "ALL", + cloud_type: str, support_public_ip: bool = True, start_ssh: bool = True, - data_center_id=None, - country_code=None, - gpu_count=None, - volume_in_gb=None, - container_disk_in_gb=None, - min_vcpu_count=None, - min_memory_in_gb=None, - docker_args=None, - ports=None, - volume_mount_path=None, + data_center_id: Optional[str] = None, + country_code: Optional[str] = None, + gpu_count: Optional[int] = None, + volume_in_gb: Optional[int] = None, + container_disk_in_gb: Optional[int] = None, + min_vcpu_count: Optional[int] = None, + min_memory_in_gb: Optional[int] = None, + docker_args: Optional[str] = None, + ports: Optional[str] = None, + volume_mount_path: Optional[str] = None, env: 
Optional[Dict[str, Any]] = None, - template_id=None, - network_volume_id=None, + template_id: Optional[str] = None, + network_volume_id: Optional[str] = None, allowed_cuda_versions: Optional[List[str]] = None, bid_per_gpu: Optional[float] = None, + container_registry_auth_id: Optional[str] = None, ) -> str: """ Generates a mutation to deploy pod. """ input_fields = [] - - # ------------------------------ Required Fields ----------------------------- # input_fields.append(f'name: "{name}"') input_fields.append(f'imageName: "{image_name}"') input_fields.append(f'gpuTypeId: "{gpu_type_id}"') - - # ------------------------------ Default Fields ------------------------------ # input_fields.append(f"cloudType: {cloud_type}") + input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"') if start_ssh: input_fields.append("startSsh: true") - if support_public_ip: input_fields.append("supportPublicIp: true") else: input_fields.append("supportPublicIp: false") - - # ------------------------------ Optional Fields ----------------------------- # if bid_per_gpu is not None: input_fields.append(f"bidPerGpu: {bid_per_gpu}") if data_center_id is not None: @@ -333,18 +497,18 @@ def generate_pod_deployment_mutation( input_fields.append(f"env: [{env_string}]") if template_id is not None: input_fields.append(f'templateId: "{template_id}"') - if network_volume_id is not None: input_fields.append(f'networkVolumeId: "{network_volume_id}"') - if allowed_cuda_versions is not None: allowed_cuda_versions_string = ", ".join( [f'"{version}"' for version in allowed_cuda_versions] ) input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]") + if container_registry_auth_id is not None: + input_fields.append(f'containerRegistryAuthId: "{container_registry_auth_id}"') pod_deploy = "podFindAndDeployOnDemand" if bid_per_gpu is None else "podRentInterruptable" - # Format input fields + input_string = ", ".join(input_fields) return f""" mutation {{ @@ -364,7 +528,78 @@ 
def generate_pod_deployment_mutation( """ -def generate_pod_terminate_mutation(pod_id: str) -> str: +def _generate_cpu_pod_deployment_mutation( + name: str, + image_name: str, + instance_id: str, + cloud_type: str, + deploy_cost: float, + start_ssh: bool = True, + data_center_id: Optional[str] = None, + container_disk_in_gb: Optional[int] = None, + docker_args: Optional[str] = None, + ports: Optional[str] = None, + volume_mount_path: Optional[str] = None, + env: Optional[Dict[str, Any]] = None, + template_id: Optional[str] = None, + network_volume_id: Optional[str] = None, + container_registry_auth_id: Optional[str] = None, +) -> str: + """ + Generates a mutation to deploy CPU pod. + """ + input_fields = [] + input_fields.append(f'name: "{name}"') + input_fields.append(f'imageName: "{image_name}"') + input_fields.append(f'instanceId: "{instance_id}"') + input_fields.append(f"cloudType: {cloud_type}") + input_fields.append(f"deployCost: {deploy_cost}") + + if start_ssh: + input_fields.append("startSsh: true") + if data_center_id is not None: + input_fields.append(f'dataCenterId: "{data_center_id}"') + if container_disk_in_gb is not None: + input_fields.append(f"containerDiskInGb: {container_disk_in_gb}") + if docker_args is not None: + input_fields.append(f'dockerArgs: "{docker_args}"') + if ports is not None: + ports = ports.replace(" ", "") + input_fields.append(f'ports: "{ports}"') + if volume_mount_path is not None: + input_fields.append(f'volumeMountPath: "{volume_mount_path}"') + if env is not None: + env_string = ", ".join( + [f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()] + ) + input_fields.append(f"env: [{env_string}]") + if template_id is not None: + input_fields.append(f'templateId: "{template_id}"') + if network_volume_id is not None: + input_fields.append(f'networkVolumeId: "{network_volume_id}"') + if container_registry_auth_id is not None: + input_fields.append(f'containerRegistryAuthId: "{container_registry_auth_id}"') + + 
input_string = ", ".join(input_fields) + return f""" + mutation {{ + deployCpuPod( + input: {{ + {input_string} + }} + ) {{ + id + lastStatusChange + imageName + machine {{ + podHostId + }} + }} + }} + """ + + +def _generate_pod_terminate_mutation(pod_id: str) -> str: """ Generates a mutation to terminate a pod. """ @@ -373,3 +608,124 @@ def generate_pod_terminate_mutation(pod_id: str) -> str: podTerminate(input: {{ podId: "{pod_id}" }}) }} """ + + +def _generate_delete_cluster_mutation(cluster_id: str) -> str: + """ + Generates a mutation to delete a cluster. + """ + return f""" + mutation {{ + deleteCluster( + input: {{ + id: "{cluster_id}" + }} + ) + }} + """ + + +def _generate_create_cluster_mutation( + cluster_name: str, + gpu_type_id: str, + pod_count: int, + gpu_count_per_pod: int, + image_name: str, + cluster_type: str, + deploy_cost: str, + template_id: Optional[str] = None, + network_volume_id: Optional[str] = None, + volume_in_gb: Optional[int] = None, + throughput: Optional[int] = None, + allowed_cuda_versions: Optional[List[str]] = None, + volume_key: Optional[str] = None, + data_center_id: Optional[str] = None, + start_jupyter: bool = False, + start_ssh: bool = False, + container_disk_in_gb: Optional[int] = None, + docker_args: Optional[str] = None, + env: Optional[Dict[str, Any]] = None, + volume_mount_path: Optional[str] = None, + ports: Optional[str] = None, +) -> str: + """ + Generates a mutation to create a cluster. + """ + input_fields = [] + + # ------------------------------ Required Fields ----------------------------- # + input_fields.append(f'clusterName: "{cluster_name}"') + input_fields.append(f'gpuTypeId: "{gpu_type_id}"') + input_fields.append(f"podCount: {pod_count}") + input_fields.append(f'imageName: "{image_name}"') + input_fields.append(f"type: {cluster_type}") + input_fields.append(f"gpuCountPerPod: {gpu_count_per_pod}") + # If deploy_cost is not specified, Runpod returns Insufficient resources error. 
+ input_fields.append(f"deployCost: {deploy_cost}") + + # ------------------------------ Optional Fields ----------------------------- # + if template_id is not None: + input_fields.append(f'templateId: "{template_id}"') + if network_volume_id is not None: + input_fields.append(f'networkVolumeId: "{network_volume_id}"') + if volume_in_gb is not None: + input_fields.append(f"volumeInGb: {volume_in_gb}") + if throughput is not None: + input_fields.append(f"throughput: {throughput}") + if allowed_cuda_versions is not None: + allowed_cuda_versions_string = ", ".join( + [f'"{version}"' for version in allowed_cuda_versions] + ) + input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]") + if volume_key is not None: + input_fields.append(f'volumeKey: "{volume_key}"') + if data_center_id is not None: + input_fields.append(f'dataCenterId: "{data_center_id}"') + if start_jupyter: + input_fields.append("startJupyter: true") + if start_ssh: + input_fields.append("startSsh: true") + if container_disk_in_gb is not None: + input_fields.append(f"containerDiskInGb: {container_disk_in_gb}") + if docker_args is not None: + input_fields.append(f'dockerArgs: "{docker_args}"') + if env is not None: + env_string = ", ".join( + [f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()] + ) + input_fields.append(f"env: [{env_string}]") + if volume_mount_path is not None: + input_fields.append(f'volumeMountPath: "{volume_mount_path}"') + if ports is not None: + ports = ports.replace(" ", "") + input_fields.append(f'ports: "{ports}"') + + # Provisioning fails if minCudaVersion is specified for createCluster. + # See https://github.com/dstackai/dstack/issues/3910. + # TODO: Uncomment when RunPod fixes minCudaVersion for createCluster. 
+ # + # input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"') + + # Format input fields + input_string = ", ".join(input_fields) + return f""" + mutation {{ + createCluster( + input: {{ + {input_string} + }} + ) {{ + id + name + pods {{ + id + clusterIp + lastStatusChange + imageName + machine {{ + podHostId + }} + }} + }} + }} + """ diff --git a/src/dstack/_internal/core/backends/runpod/backend.py b/src/dstack/_internal/core/backends/runpod/backend.py new file mode 100644 index 0000000000..7232feff93 --- /dev/null +++ b/src/dstack/_internal/core/backends/runpod/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.runpod.compute import RunpodCompute +from dstack._internal.core.backends.runpod.models import RunpodConfig +from dstack._internal.core.models.backends.base import BackendType + + +class RunpodBackend(Backend): + TYPE = BackendType.RUNPOD + COMPUTE_CLASS = RunpodCompute + + def __init__(self, config: RunpodConfig): + self.config = config + self._compute = RunpodCompute(self.config) + + def compute(self) -> RunpodCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/runpod/compute.py b/src/dstack/_internal/core/backends/runpod/compute.py index 2f71ca6a02..7158c6950b 100644 --- a/src/dstack/_internal/core/backends/runpod/compute.py +++ b/src/dstack/_internal/core/backends/runpod/compute.py @@ -1,58 +1,126 @@ import json import uuid +from collections.abc import Iterable from datetime import timedelta -from typing import List, Optional +from typing import Callable, List, Optional -from dstack._internal.core.backends.base import Compute +from dstack._internal.core.backends.base.backend import Compute from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithGroupProvisioningSupport, + ComputeWithMultinodeSupport, + ComputeWithVolumeSupport, + generate_unique_instance_name, + 
generate_unique_volume_name, get_docker_commands, - get_instance_name, + get_job_instance_name, ) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.runpod.api_client import RunpodApiClient +from dstack._internal.core.backends.base.models import JobConfiguration +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.runpod.api_client import RunpodApiClient, RunpodApiClientError +from dstack._internal.core.backends.runpod.models import RunpodConfig +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.errors import ( - BackendError, + ComputeError, ) from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.configurations import RegistryAuth +from dstack._internal.core.models.common import CoreModel, RegistryAuth +from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, InstanceOfferWithAvailability, SSHKey, ) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import Memory, Range from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.common import get_current_datetime +from dstack._internal.core.models.volumes import ( + RunpodVolumeConfiguration, + Volume, + VolumeProvisioningData, +) +from dstack._internal.utils.common import get_current_datetime, get_or_error from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) +# Undocumented but names of len 60 work +MAX_RESOURCE_NAME_LEN = 60 + CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour +# Runpod does not 
seem to have any limits on the disk size. +CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None) + + +class RunpodOfferBackendData(CoreModel): + pod_counts: Optional[list[int]] = None + -class RunpodCompute(Compute): +class RunpodCompute( + ComputeWithAllOffersCached, + ComputeWithVolumeSupport, + ComputeWithMultinodeSupport, + ComputeWithGroupProvisioningSupport, + Compute, +): _last_cleanup_time = None - def __init__(self, config): + def __init__(self, config: RunpodConfig): + super().__init__() self.config = config self.api_client = RunpodApiClient(config.creds.api_key) - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( backend=BackendType.RUNPOD, - requirements=requirements, + locations=self.config.regions or None, + requirements=None, + extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud, ) offers = [ - InstanceOfferWithAvailability( - **offer.dict(), availability=InstanceAvailability.AVAILABLE - ) + offer.with_availability(availability=InstanceAvailability.AVAILABLE) for offer in offers ] return offers + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + gpu_disk_modifier = get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements) + + def disk_modifier( + offer: InstanceOfferWithAvailability, + ) -> Optional[InstanceOfferWithAvailability]: + if len(offer.instance.resources.gpus) > 0: + return gpu_disk_modifier(offer) + + # For Runpod CPU offers, gpuhunt disk is the per-flavor max. + # Choose requested disk within [1GB, max] or filter the offer out. 
+ cpu_max_disk_size_gb = Memory(offer.instance.resources.disk.size_mib / 1024) + cpu_configurable_disk_size = Range[Memory]( + min=Memory.parse("1GB"), + max=cpu_max_disk_size_gb, + ) + return get_offers_disk_modifier(cpu_configurable_disk_size, requirements)(offer) + + return [disk_modifier] + + def get_offers_post_filter( + self, requirements: Requirements + ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]: + def offers_post_filter(offer: InstanceOfferWithAvailability) -> bool: + pod_counts = _get_offer_pod_counts(offer) + is_cluster_offer = len(pod_counts) > 0 and any(pc != 1 for pc in pod_counts) + if requirements.multinode: + return is_cluster_offer + return not is_cluster_offer + + return offers_post_filter + def run_job( self, run: Run, @@ -61,53 +129,91 @@ def run_job( project_ssh_public_key: str, project_ssh_private_key: str, volumes: List[Volume], + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: + assert run.run_spec.ssh_key_pub is not None instance_config = InstanceConfiguration( project_name=run.project_name, - instance_name=get_instance_name(run, job), + instance_name=get_job_instance_name(run, job), ssh_keys=[ SSHKey(public=run.run_spec.ssh_key_pub.strip()), SSHKey(public=project_ssh_public_key.strip()), ], - job_docker_config=None, user=run.user, ) + pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN) authorized_keys = instance_config.get_public_keys() memory_size = round(instance_offer.instance.resources.memory_mib / 1024) disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) + + network_volume_id = None + volume_mount_path = None + if len(volumes) > 1: + raise ComputeError("Mounting more than one network volume is not supported in runpod") + if len(volumes) == 1: + network_volume_id = volumes[0].volume_id + volume_mount_path = run.run_spec.configuration.volumes[0].path + container_registry_auth_id = self._generate_container_registry_auth_id( 
job.job_spec.registry_auth ) - resp = self.api_client.create_pod( - name=instance_config.instance_name, - image_name=job.job_spec.image_name, - gpu_type_id=instance_offer.instance.name, - cloud_type="ALL", # ["ALL", "COMMUNITY", "SECURE"]: - gpu_count=len(instance_offer.instance.resources.gpus), - container_disk_in_gb=disk_size, - min_vcpu_count=instance_offer.instance.resources.cpus, - min_memory_in_gb=memory_size, - support_public_ip=True, - docker_args=get_docker_args(authorized_keys), - ports="10022/tcp", - bid_per_gpu=instance_offer.price if instance_offer.instance.resources.spot else None, - ) - - instance_id = resp["id"] - - # TODO: remove editPod once createPod supports docker's username and password - # editPod is temporary solution to set container_registry_auth_id because createPod does not - # support it currently. This will be removed once createPod supports container_registry_auth_id - # or username and password - if container_registry_auth_id is not None: - instance_id = self.api_client.edit_pod( - pod_id=instance_id, + gpu_count = len(instance_offer.instance.resources.gpus) + if gpu_count == 0: + if not _is_secure_cloud(instance_offer.region): + raise ComputeError("Runpod CPU offers are only supported in secure cloud regions") + resp = self.api_client.create_cpu_pod( + name=pod_name, image_name=job.job_spec.image_name, + container_registry_auth_id=container_registry_auth_id, + instance_id=instance_offer.instance.name, + cloud_type="SECURE", + deploy_cost=instance_offer.price, + data_center_id=instance_offer.region, container_disk_in_gb=disk_size, + start_ssh=True, + docker_args=_get_docker_args(authorized_keys), + ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp", + network_volume_id=network_volume_id, + volume_mount_path=volume_mount_path, + env={"RUNPOD_POD_USER": "0"}, + ) + else: + bid_per_gpu = None + if instance_offer.instance.resources.spot: + bid_per_gpu = instance_offer.price / gpu_count + if _is_secure_cloud(instance_offer.region): + cloud_type = 
"SECURE" + data_center_id = instance_offer.region + country_code = None + else: + cloud_type = "COMMUNITY" + data_center_id = None + country_code = instance_offer.region + + resp = self.api_client.create_pod( + name=pod_name, + image_name=job.job_spec.image_name, container_registry_auth_id=container_registry_auth_id, + gpu_type_id=instance_offer.instance.name, + cloud_type=cloud_type, + data_center_id=data_center_id, + country_code=country_code, + gpu_count=gpu_count, + container_disk_in_gb=disk_size, + min_vcpu_count=instance_offer.instance.resources.cpus, + min_memory_in_gb=memory_size, + support_public_ip=True, + docker_args=_get_docker_args(authorized_keys), + ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp", + bid_per_gpu=bid_per_gpu, + network_volume_id=network_volume_id, + volume_mount_path=volume_mount_path, + env={"RUNPOD_POD_USER": "0"}, ) + instance_id = resp["id"] + if ( self._last_cleanup_time is None or self._last_cleanup_time @@ -131,14 +237,130 @@ def run_job( backend_data=None, ) + def run_jobs( + self, + run: Run, + job_configurations: List[JobConfiguration], + instance_offer: InstanceOfferWithAvailability, + project_ssh_public_key: str, + project_ssh_private_key: str, + placement_group: Optional[PlacementGroup], + ) -> ComputeGroupProvisioningData: + master_job_configuration = job_configurations[0] + master_job = master_job_configuration.job + master_job_volumes = master_job_configuration.volumes + all_volumes_names = set(v.name for jc in job_configurations for v in jc.volumes) + instance_config = InstanceConfiguration( + project_name=run.project_name, + instance_name=get_job_instance_name(run, master_job), + ssh_keys=[ + SSHKey(public=get_or_error(run.run_spec.ssh_key_pub).strip()), + SSHKey(public=project_ssh_public_key.strip()), + ], + user=run.user, + ) + + pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN) + authorized_keys = instance_config.get_public_keys() + disk_size = 
round(instance_offer.instance.resources.disk.size_mib / 1024) + + network_volume_id = None + volume_mount_path = None + if len(master_job_volumes) > 1: + raise ComputeError("Mounting more than one network volume is not supported in runpod") + if len(all_volumes_names) > 1: + raise ComputeError( + "Mounting different volumes to different jobs is not supported in runpod" + ) + if len(master_job_volumes) == 1: + network_volume_id = master_job_volumes[0].volume_id + volume_mount_path = run.run_spec.configuration.volumes[0].path + + offer_pod_counts = _get_offer_pod_counts(instance_offer) + pod_count = len(job_configurations) + gpu_count = len(instance_offer.instance.resources.gpus) + data_center_id = instance_offer.region + + if pod_count not in offer_pod_counts: + raise ComputeError( + f"Failed to provision {pod_count} pods. Available pod counts: {offer_pod_counts}" + ) + + container_registry_auth_id = self._generate_container_registry_auth_id( + master_job.job_spec.registry_auth + ) + resp = self.api_client.create_cluster( + cluster_name=pod_name, + gpu_type_id=instance_offer.instance.name, + pod_count=pod_count, + gpu_count_per_pod=gpu_count, + deploy_cost=f"{instance_offer.price * pod_count:.2f}", + image_name=master_job.job_spec.image_name, + cluster_type="TRAINING", + data_center_id=data_center_id, + container_disk_in_gb=disk_size, + docker_args=_get_docker_args(authorized_keys), + ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp", + network_volume_id=network_volume_id, + volume_mount_path=volume_mount_path, + env={"RUNPOD_POD_USER": "0"}, + ) + + # Unlike create mutations for individual pods, createCluster mutation doesn't accept + # containerRegistryAuthId. + # The workaround is to inject containerRegistryAuthId into already created pods. + # Expect a long time (~5m) for the pods to pick up the creds. 
+ # TODO: remove once createCluster supports containerRegistryAuthId + if container_registry_auth_id is not None: + for pod in resp["pods"]: + self.api_client.update_pod_container_registry_auth( + pod_id=pod["id"], + container_registry_auth_id=container_registry_auth_id, + ) + + jpds = [ + JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=pod["id"], + hostname=None, + internal_ip=pod["clusterIp"], + region=instance_offer.region, + price=instance_offer.price, + username="root", + dockerized=False, + ) + for pod in resp["pods"] + ] + return ComputeGroupProvisioningData( + compute_group_id=resp["id"], + compute_group_name=resp["name"], + backend=BackendType.RUNPOD, + region=instance_offer.region, + job_provisioning_datas=jpds, + ) + def terminate_instance( self, instance_id: str, region: str, backend_data: Optional[str] = None - ) -> None: + ): try: self.api_client.terminate_pod(instance_id) - except BackendError as e: - if e.args[0] == "Instance Not Found": - logger.debug("The instance with name %s not found", instance_id) + except RunpodApiClientError as e: + if len(e.errors) > 0 and e.errors[0]["message"] == "pod not found to terminate": + logger.debug("The instance %s not found. Skipping deletion.", instance_id) + return + raise + + def terminate_compute_group(self, compute_group: ComputeGroup): + provisioning_data = compute_group.provisioning_data + try: + self.api_client.delete_cluster(provisioning_data.compute_group_id) + except RunpodApiClientError as e: + if len(e.errors) > 0 and e.errors[0]["extensions"]["code"] == "Cluster not found": + logger.debug( + "The cluster %s not found. 
Skipping deletion.", + provisioning_data.compute_group_id, + ) return raise @@ -156,23 +378,75 @@ def update_provisioning_data( if ports is None: return for port in pod["runtime"]["ports"]: - if port["privatePort"] == 10022: + if port["privatePort"] == DSTACK_RUNNER_SSH_PORT: provisioning_data.hostname = port["ip"] provisioning_data.ssh_port = port["publicPort"] + def register_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, RunpodVolumeConfiguration) + volume_data = self.api_client.get_network_volume( + volume_id=get_or_error(volume.configuration.volume_id) + ) + if volume_data is None: + raise ComputeError(f"Volume {volume.configuration.volume_id} not found") + size_gb = volume_data["size"] + return VolumeProvisioningData( + backend=BackendType.RUNPOD, + volume_id=volume_data["id"], + size_gb=size_gb, + price=_get_volume_price(size_gb), + attachable=False, + detachable=False, + ) + + def create_volume(self, volume: Volume) -> VolumeProvisioningData: + assert isinstance(volume.configuration, RunpodVolumeConfiguration) + volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN) + size_gb = volume.configuration.size_gb + # Runpod regions must be uppercase. + # Lowercase regions are accepted in the API but they break Runpod in several ways. 
+ region = volume.configuration.region.upper() + volume_id = self.api_client.create_network_volume( + name=volume_name, + region=region, + size=size_gb, + ) + return VolumeProvisioningData( + backend=BackendType.RUNPOD, + volume_id=volume_id, + size_gb=size_gb, + price=_get_volume_price(size_gb), + attachable=False, + detachable=False, + ) + + def delete_volume(self, volume: Volume): + if volume.volume_id is not None: + try: + self.api_client.delete_network_volume(volume_id=volume.volume_id) + except RunpodApiClientError as e: + if ( + len(e.errors) > 0 + and "Tried to delete nonexistent network volume" in e.errors[0]["message"] + ): + logger.debug( + "The volume %s not found. Skipping deletion.", + volume.volume_id, + ) + return + raise + def _generate_container_registry_auth_id( self, registry_auth: Optional[RegistryAuth] ) -> Optional[str]: if registry_auth is None: return None - return self.api_client.add_container_registry_auth( uuid.uuid4().hex, registry_auth.username, registry_auth.password ) def _clean_stale_container_registry_auths(self) -> None: container_registry_auths = self.api_client.get_container_registry_auths() - # Container_registry_auths sorted by creation time so try to delete the oldest first # when we reach container_registry_auths that is still in use, we stop for container_registry_auth in container_registry_auths: @@ -182,9 +456,31 @@ def _clean_stale_container_registry_auths(self) -> None: break -def get_docker_args(authorized_keys: List[str]) -> str: - commands = get_docker_commands(authorized_keys, False) +def _get_docker_args(authorized_keys: List[str]) -> str: + commands = get_docker_commands(authorized_keys) command = " && ".join(commands) docker_args = {"cmd": [command], "entrypoint": ["/bin/sh", "-c"]} docker_args_escaped = json.dumps(json.dumps(docker_args)).strip('"') return docker_args_escaped + + +def _get_volume_price(size: int) -> float: + if size < 1000: + return 0.07 * size + return 0.05 * size + + +def 
_is_secure_cloud(region: str) -> bool: + """ + Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc. + Community cloud regions are country codes: CA, NL, etc. + """ + return "-" in region + + +def _get_offer_pod_counts(offer: InstanceOfferWithAvailability) -> list[int]: + backend_data: RunpodOfferBackendData = RunpodOfferBackendData.__response__.parse_obj( + offer.backend_data + ) + pod_counts = backend_data.pod_counts or [] + return pod_counts diff --git a/src/dstack/_internal/core/backends/runpod/config.py b/src/dstack/_internal/core/backends/runpod/config.py deleted file mode 100644 index af5be6fcb3..0000000000 --- a/src/dstack/_internal/core/backends/runpod/config.py +++ /dev/null @@ -1,9 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.runpod import ( - AnyRunpodCreds, - RunpodStoredConfig, -) - - -class RunpodConfig(RunpodStoredConfig, BackendConfig): - creds: AnyRunpodCreds diff --git a/src/dstack/_internal/core/backends/runpod/configurator.py b/src/dstack/_internal/core/backends/runpod/configurator.py new file mode 100644 index 0000000000..df023f7179 --- /dev/null +++ b/src/dstack/_internal/core/backends/runpod/configurator.py @@ -0,0 +1,63 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.runpod import api_client +from dstack._internal.core.backends.runpod.backend import RunpodBackend +from dstack._internal.core.backends.runpod.models import ( + RunpodBackendConfig, + RunpodBackendConfigWithCreds, + RunpodConfig, + RunpodCreds, + RunpodStoredConfig, +) +from dstack._internal.core.models.backends.base import BackendType + + +class RunpodConfigurator( + Configurator[ + RunpodBackendConfig, + RunpodBackendConfigWithCreds, + ] +): + TYPE = BackendType.RUNPOD + BACKEND_CLASS = RunpodBackend + + def validate_config(self, config: 
RunpodBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_runpod_api_key(config.creds.api_key) + + def create_backend( + self, project_name: str, config: RunpodBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=RunpodStoredConfig( + **RunpodBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=RunpodCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> RunpodBackendConfigWithCreds: + config = self._get_config(record) + return RunpodBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> RunpodBackendConfig: + config = self._get_config(record) + return RunpodBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> RunpodBackend: + config = self._get_config(record) + return RunpodBackend(config=config) + + def _get_config(self, record: BackendRecord) -> RunpodConfig: + return RunpodConfig( + **json.loads(record.config), + creds=RunpodCreds.parse_raw(record.auth), + ) + + def _validate_runpod_api_key(self, api_key: str): + client = api_client.RunpodApiClient(api_key=api_key) + if not client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/runpod/models.py b/src/dstack/_internal/core/backends/runpod/models.py new file mode 100644 index 0000000000..7bc11c2818 --- /dev/null +++ b/src/dstack/_internal/core/backends/runpod/models.py @@ -0,0 +1,54 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + +RUNPOD_COMMUNITY_CLOUD_DEFAULT = False + + +class RunpodAPIKeyCreds(CoreModel): + type: Literal["api_key"] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyRunpodCreds = RunpodAPIKeyCreds +RunpodCreds = AnyRunpodCreds + + +class 
RunpodBackendConfig(CoreModel): + type: Literal["runpod"] = "runpod" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Runpod regions. Omit to use all regions"), + ] = None + community_cloud: Annotated[ + Optional[bool], + Field( + description=( + "Whether Community Cloud offers can be suggested in addition to Secure Cloud." + f" Defaults to `{str(RUNPOD_COMMUNITY_CLOUD_DEFAULT).lower()}`" + ) + ), + ] = None + + +class RunpodBackendConfigWithCreds(RunpodBackendConfig): + creds: Annotated[AnyRunpodCreds, Field(description="The credentials")] + + +AnyRunpodBackendConfig = Union[RunpodBackendConfig, RunpodBackendConfigWithCreds] + + +class RunpodStoredConfig(RunpodBackendConfig): + pass + + +class RunpodConfig(RunpodStoredConfig): + creds: AnyRunpodCreds + + @property + def allow_community_cloud(self) -> bool: + if self.community_cloud is not None: + return self.community_cloud + return RUNPOD_COMMUNITY_CLOUD_DEFAULT diff --git a/src/dstack/_internal/core/backends/template/__init__.py b/src/dstack/_internal/core/backends/template/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/template/backend.py.jinja b/src/dstack/_internal/core/backends/template/backend.py.jinja new file mode 100644 index 0000000000..d52cf73ea9 --- /dev/null +++ b/src/dstack/_internal/core/backends/template/backend.py.jinja @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.{{ backend_name|lower }}.compute import {{ backend_name }}Compute +from dstack._internal.core.backends.{{ backend_name|lower }}.models import {{ backend_name }}Config +from dstack._internal.core.models.backends.base import BackendType + + +class {{ backend_name }}Backend(Backend): + TYPE = BackendType.{{ backend_name|upper }} + COMPUTE_CLASS = {{ backend_name }}Compute + + def __init__(self, config: {{ backend_name }}Config): + self.config = config + self._compute = {{ 
backend_name }}Compute(self.config) + + def compute(self) -> {{ backend_name }}Compute: + return self._compute diff --git a/src/dstack/_internal/core/backends/template/compute.py.jinja b/src/dstack/_internal/core/backends/template/compute.py.jinja new file mode 100644 index 0000000000..cb4c4a8b09 --- /dev/null +++ b/src/dstack/_internal/core/backends/template/compute.py.jinja @@ -0,0 +1,94 @@ +from collections.abc import Iterator +from typing import List, Optional + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivateGatewaySupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithReservationSupport, + ComputeWithVolumeSupport, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.{{ backend_name|lower }}.models import {{ backend_name }}Config +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run +from dstack._internal.core.models.volumes import Volume +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class {{ backend_name }}Compute( + # TODO: Choose ComputeWith* classes to extend and implement + # ComputeWithAllOffersCached, + # ComputeWithCreateInstanceSupport, + # ComputeWithPrivilegedSupport, + # ComputeWithInstanceVolumesSupport, + # ComputeWithMultinodeSupport, + # ComputeWithReservationSupport, + # ComputeWithPlacementGroupSupport, + # ComputeWithGatewaySupport, + # 
ComputeWithPrivateGatewaySupport, + # ComputeWithVolumeSupport, + Compute, +): + def __init__(self, config: {{ backend_name }}Config): + super().__init__() + self.config = config + + def get_offers( + self, requirements: Requirements + ) -> Iterator[InstanceOfferWithAvailability]: + # If the provider is added to gpuhunt, you'd typically get offers + # using `get_catalog_offers()` and extend them with availability info. + offers = get_catalog_offers( + backend=BackendType.{{ backend_name|upper }}, + locations=self.config.regions or None, + requirements=requirements, + # configurable_disk_size=..., TODO: set in case of boot volume size limits + ) + # TODO: Add availability info to offers + return ( + offer.with_availability(availability=InstanceAvailability.UNKNOWN) + for offer in offers + ) + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + # TODO: Implement if backend supports creating instances (VM-based). + # Delete if backend can only run jobs (container-based). + raise NotImplementedError() + + def run_job( + self, + run: Run, + job: Job, + instance_offer: InstanceOfferWithAvailability, + project_ssh_public_key: str, + project_ssh_private_key: str, + volumes: List[Volume], + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + # TODO: Implement if create_instance() is not implemented. Delete otherwise. 
+ raise NotImplementedError() + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + raise NotImplementedError() diff --git a/src/dstack/_internal/core/backends/template/configurator.py.jinja b/src/dstack/_internal/core/backends/template/configurator.py.jinja new file mode 100644 index 0000000000..47ea303903 --- /dev/null +++ b/src/dstack/_internal/core/backends/template/configurator.py.jinja @@ -0,0 +1,69 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.{{ backend_name|lower }}.backend import {{ backend_name }}Backend +from dstack._internal.core.backends.{{ backend_name|lower }}.models import ( + Any{{ backend_name }}Creds, + {{ backend_name }}BackendConfig, + {{ backend_name }}BackendConfigWithCreds, + {{ backend_name }}Config, + {{ backend_name }}Creds, + {{ backend_name }}StoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class {{ backend_name }}Configurator( + Configurator[ + {{ backend_name }}BackendConfig, + {{ backend_name }}BackendConfigWithCreds, + ] +): + TYPE = BackendType.{{ backend_name|upper }} + BACKEND_CLASS = {{ backend_name }}Backend + + def validate_config( + self, config: {{ backend_name }}BackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_creds(config.creds) + # TODO: If possible, validate config.regions and any other config parameters + + def create_backend( + self, project_name: str, config: {{ backend_name }}BackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config={{ backend_name }}StoredConfig( + **{{ backend_name }}BackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth={{ backend_name }}Creds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> {{ backend_name }}BackendConfigWithCreds: + 
config = self._get_config(record) + return {{ backend_name }}BackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> {{ backend_name }}BackendConfig: + config = self._get_config(record) + return {{ backend_name }}BackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> {{ backend_name }}Backend: + config = self._get_config(record) + return {{ backend_name }}Backend(config=config) + + def _get_config(self, record: BackendRecord) -> {{ backend_name }}Config: + return {{ backend_name }}Config.__response__( + **json.loads(record.config), + creds={{ backend_name }}Creds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: Any{{ backend_name }}Creds): + # TODO: Implement API key or other creds validation + # if valid: + # return + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/template/models.py.jinja b/src/dstack/_internal/core/backends/template/models.py.jinja new file mode 100644 index 0000000000..6fab7a13fe --- /dev/null +++ b/src/dstack/_internal/core/backends/template/models.py.jinja @@ -0,0 +1,62 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +# The template uses "api_key" creds as the most popular creds type. +# TODO: Adjust it or add additional creds models if necessary. +class {{ backend_name }}APIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +Any{{ backend_name }}Creds = {{ backend_name }}APIKeyCreds +{{ backend_name }}Creds = Any{{ backend_name }}Creds + + +class {{ backend_name }}BackendConfig(CoreModel): + """ + The backend config used in the API, server/config.yml, `{{ backend_name }}Configurator`. 
+ It also serves as a base class for other backend config models. + Should not include creds. + """ + + type: Annotated[ + Literal["{{ backend_name|lower }}"], + Field(description="The type of backend"), + ] = "{{ backend_name|lower }}" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of {{ backend_name }} regions. Omit to use all regions"), + ] = None + # TODO: Add additional backend parameters if necessary + + +class {{ backend_name }}BackendConfigWithCreds({{ backend_name }}BackendConfig): + """ + Same as `{{ backend_name }}BackendConfig` but also includes creds. + """ + + creds: Annotated[Any{{ backend_name }}Creds, Field(description="The credentials")] + + +Any{{ backend_name }}BackendConfig = Union[{{ backend_name }}BackendConfig, {{ backend_name }}BackendConfigWithCreds] + + +class {{ backend_name }}StoredConfig({{ backend_name }}BackendConfig): + """ + The backend config used for config parameters in the DB. + Can extend `{{ backend_name }}BackendConfig` with additional parameters. + """ + + pass + + +class {{ backend_name }}Config({{ backend_name }}StoredConfig): + """ + The backend config used by `{{ backend_name }}Backend` and `{{ backend_name }}Compute`. 
+ """ + + creds: Any{{ backend_name }}Creds diff --git a/src/dstack/_internal/core/backends/tensordock/__init__.py b/src/dstack/_internal/core/backends/tensordock/__init__.py deleted file mode 100644 index 8f1ba5ccf4..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.tensordock.compute import TensorDockCompute -from dstack._internal.core.backends.tensordock.config import TensorDockConfig -from dstack._internal.core.models.backends.base import BackendType - - -class TensorDockBackend(Backend): - TYPE: BackendType = BackendType.TENSORDOCK - - def __init__(self, config: TensorDockConfig): - self.config = config - self._compute = TensorDockCompute(self.config) - - def compute(self) -> TensorDockCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/tensordock/api_client.py b/src/dstack/_internal/core/backends/tensordock/api_client.py deleted file mode 100644 index ed808c237e..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/api_client.py +++ /dev/null @@ -1,97 +0,0 @@ -import uuid - -import requests -import yaml - -from dstack._internal.core.errors import BackendError -from dstack._internal.core.models.instances import InstanceType -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class TensorDockAPIClient: - def __init__(self, api_key: str, api_token: str): - self.api_url = "https://marketplace.tensordock.com/api/v0".rstrip("/") - self.api_key = api_key - self.api_token = api_token - self.s = requests.Session() # TODO: set adequate timeout everywhere the session is used - - def auth_test(self) -> bool: - resp = self.s.post( - self._url("/auth/test"), data={"api_key": self.api_key, "api_token": self.api_token} - ) - resp.raise_for_status() - return resp.json()["success"] - - def get_hostnode(self, hostnode_id: str) 
-> dict: - logger.debug("Fetching hostnode %s", hostnode_id) - resp = self.s.get(self._url(f"/client/deploy/hostnodes/{hostnode_id}")) - resp.raise_for_status() - data = resp.json() - if not data["success"]: - raise requests.HTTPError(data) - return data["hostnode"] - - def deploy_single(self, instance_name: str, instance: InstanceType, cloudinit: dict) -> dict: - hostnode = self.get_hostnode(instance.name) - gpu = instance.resources.gpus[0] - for gpu_model in hostnode["specs"]["gpu"].keys(): - if gpu_model.endswith(f"-{gpu.memory_mib // 1024}gb"): - if gpu.name.lower() in gpu_model.lower(): - break - else: - raise ValueError(f"Can't find GPU on the hostnode: {gpu.name}") - form = { - "api_key": self.api_key, - "api_token": self.api_token, - "password": uuid.uuid4().hex, # we disable the password auth, but it's required - "name": instance_name, - "gpu_count": len(instance.resources.gpus), - "gpu_model": gpu_model, - "vcpus": instance.resources.cpus, - "ram": instance.resources.memory_mib // 1024, - "external_ports": "{%s}" - % max(hostnode["networking"]["ports"]), # it's safer to use a higher port - "internal_ports": "{22}", - "hostnode": instance.name, - "storage": 100, # TODO(egor-s): take from instance.resources - "operating_system": "Ubuntu 22.04 LTS", - "cloudinit_script": yaml.dump(cloudinit).replace("\n", "\\n"), - } - logger.debug( - "Deploying instance hostnode=%s, cpus=%s, memory=%s, gpu=%sx %s", - form["hostnode"], - form["vcpus"], - form["ram"], - form["gpu_count"], - form["gpu_model"], - ) - resp = self.s.post(self._url("/client/deploy/single"), data=form) - resp.raise_for_status() - data = resp.json() - if not data["success"]: - raise requests.HTTPError(data) - data["password"] = form["password"] - return data - - def delete_single(self, instance_id: str): - logger.debug("Deleting instance %s", instance_id) - resp = self.s.post( - self._url("/client/delete/single"), - data={ - "api_key": self.api_key, - "api_token": self.api_token, - "server": 
instance_id, - }, - ) - resp.raise_for_status() - try: - data = resp.json() - if not data["success"]: - raise BackendError(data) - except ValueError: # json parsing error - raise BackendError(resp.text) - - def _url(self, path): - return f"{self.api_url}/{path.lstrip('/')}" diff --git a/src/dstack/_internal/core/backends/tensordock/compute.py b/src/dstack/_internal/core/backends/tensordock/compute.py deleted file mode 100644 index ae8428fb32..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/compute.py +++ /dev/null @@ -1,141 +0,0 @@ -import json -from typing import List, Optional - -import requests - -from dstack._internal.core.backends.base import Compute -from dstack._internal.core.backends.base.compute import get_instance_name, get_shim_commands -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient -from dstack._internal.core.backends.tensordock.config import TensorDockConfig -from dstack._internal.core.errors import BackendError, NoCapacityError -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, - SSHKey, -) -from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run -from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class TensorDockCompute(Compute): - def __init__(self, config: TensorDockConfig): - self.config = config - self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token) - - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - offers = get_catalog_offers( - backend=BackendType.TENSORDOCK, - requirements=requirements, - ) - offers = [ - InstanceOfferWithAvailability( - 
**offer.dict(), availability=InstanceAvailability.AVAILABLE - ) - for offer in offers - ] - return offers - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - ) -> JobProvisioningData: - commands = get_shim_commands(authorized_keys=instance_config.get_public_keys()) - try: - resp = self.api_client.deploy_single( - instance_name=instance_config.instance_name, - instance=instance_offer.instance, - cloudinit={ - "ssh_pwauth": False, # disable password auth - "users": [ - "default", - { - "name": "user", - "ssh_authorized_keys": instance_config.get_public_keys(), - }, - ], - "runcmd": [ - ["sh", "-c", " && ".join(commands)], - ], - "write_files": [ - { - "path": "/etc/docker/daemon.json", - "content": json.dumps( - { - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [], - } - }, - "exec-opts": ["native.cgroupdriver=cgroupfs"], - } - ), - } - ], - }, - ) - except requests.HTTPError as e: - logger.warning("Got error from tensordock: %s", e) - raise NoCapacityError() - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=resp["server"], - hostname=resp["ip"], - internal_ip=None, - region=instance_offer.region, - price=instance_offer.price, - username="user", - ssh_port={v: k for k, v in resp["port_forwards"].items()}["22"], - dockerized=True, - ssh_proxy=None, - backend_data=None, - ) - - def run_job( - self, - run: Run, - job: Job, - instance_offer: InstanceOfferWithAvailability, - project_ssh_public_key: str, - project_ssh_private_key: str, - volumes: List[Volume], - ) -> JobProvisioningData: - instance_config = InstanceConfiguration( - project_name=run.project_name, - instance_name=get_instance_name(run, job), # TODO: generate name - ssh_keys=[ - SSHKey(public=run.run_spec.ssh_key_pub.strip()), - SSHKey(public=project_ssh_public_key.strip()), - ], - job_docker_config=None, - user=run.user, - ) - 
return self.create_instance(instance_offer, instance_config) - - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ): - try: - self.api_client.delete_single(instance_id) - except requests.HTTPError as e: - logger.error( - "An HTTP error occurred when trying to terminate TensorDock instance %s: %s", - instance_id, - e, - ) - except BackendError as e: - logger.error( - "TensorDock returned an error when trying to terminate instance %s: %s", - instance_id, - e, - ) diff --git a/src/dstack/_internal/core/backends/tensordock/config.py b/src/dstack/_internal/core/backends/tensordock/config.py deleted file mode 100644 index faab959bdc..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/config.py +++ /dev/null @@ -1,9 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.tensordock import ( - AnyTensorDockCreds, - TensorDockStoredConfig, -) - - -class TensorDockConfig(TensorDockStoredConfig, BackendConfig): - creds: AnyTensorDockCreds diff --git a/src/dstack/_internal/core/backends/tensordock/models.py b/src/dstack/_internal/core/backends/tensordock/models.py new file mode 100644 index 0000000000..d031b515ac --- /dev/null +++ b/src/dstack/_internal/core/backends/tensordock/models.py @@ -0,0 +1,40 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + +# TODO: TensorDock is deprecated and will be removed in the future + + +class TensorDockAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + api_token: Annotated[str, Field(description="The API token")] + + +AnyTensorDockCreds = TensorDockAPIKeyCreds +TensorDockCreds = AnyTensorDockCreds + + +class TensorDockBackendConfig(CoreModel): + type: 
Annotated[Literal["tensordock"], Field(description="The type of backend")] = "tensordock" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of TensorDock regions. Omit to use all regions"), + ] = None + + +class TensorDockBackendConfigWithCreds(TensorDockBackendConfig): + creds: Annotated[AnyTensorDockCreds, Field(description="The credentials")] + + +AnyTensorDockBackendConfig = Union[TensorDockBackendConfig, TensorDockBackendConfigWithCreds] + + +class TensorDockStoredConfig(TensorDockBackendConfig): + pass + + +class TensorDockConfig(TensorDockStoredConfig): + creds: AnyTensorDockCreds diff --git a/src/dstack/_internal/core/backends/vastai/__init__.py b/src/dstack/_internal/core/backends/vastai/__init__.py index fe8c87760f..e69de29bb2 100644 --- a/src/dstack/_internal/core/backends/vastai/__init__.py +++ b/src/dstack/_internal/core/backends/vastai/__init__.py @@ -1,15 +0,0 @@ -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.vastai.compute import VastAICompute -from dstack._internal.core.backends.vastai.config import VastAIConfig -from dstack._internal.core.models.backends.base import BackendType - - -class VastAIBackend(Backend): - TYPE: BackendType = BackendType.VASTAI - - def __init__(self, config: VastAIConfig): - self.config = config - self._compute = VastAICompute(self.config) - - def compute(self) -> VastAICompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/vastai/api_client.py b/src/dstack/_internal/core/backends/vastai/api_client.py index 758f6d3fb8..1d8e4d8f36 100644 --- a/src/dstack/_internal/core/backends/vastai/api_client.py +++ b/src/dstack/_internal/core/backends/vastai/api_client.py @@ -5,9 +5,10 @@ import requests from requests.adapters import HTTPAdapter, Retry -import dstack._internal.server.services.docker as docker +import dstack._internal.utils.docker as docker +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from 
dstack._internal.core.errors import NoCapacityError -from dstack._internal.core.models.configurations import RegistryAuth +from dstack._internal.core.models.common import RegistryAuth class VastAIAPIClient: @@ -65,8 +66,9 @@ def create_instance( "disk": disk_size, "label": instance_name, "env": { - "-p 10022:10022": "1", + f"-p {DSTACK_RUNNER_SSH_PORT}:{DSTACK_RUNNER_SSH_PORT}": "1", }, + "user": "root", "onstart": "/bin/sh", "args": ["-c", onstart], "runtype": "args", diff --git a/src/dstack/_internal/core/backends/vastai/backend.py b/src/dstack/_internal/core/backends/vastai/backend.py new file mode 100644 index 0000000000..6600c1a3e5 --- /dev/null +++ b/src/dstack/_internal/core/backends/vastai/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.vastai.compute import VastAICompute +from dstack._internal.core.backends.vastai.models import VastAIConfig +from dstack._internal.core.models.backends.base import BackendType + + +class VastAIBackend(Backend): + TYPE = BackendType.VASTAI + COMPUTE_CLASS = VastAICompute + + def __init__(self, config: VastAIConfig): + self.config = config + self._compute = VastAICompute(self.config) + + def compute(self) -> VastAICompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/vastai/compute.py b/src/dstack/_internal/core/backends/vastai/compute.py index ad3f0a5d68..915d98f105 100644 --- a/src/dstack/_internal/core/backends/vastai/compute.py +++ b/src/dstack/_internal/core/backends/vastai/compute.py @@ -2,12 +2,25 @@ import gpuhunt from gpuhunt.providers.vastai import VastAIProvider +from typing_extensions import assert_never -from dstack._internal.core.backends.base import Compute -from dstack._internal.core.backends.base.compute import get_docker_commands, get_instance_name +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithFilteredOffersCached, 
+ generate_unique_instance_name_for_job, + get_docker_commands, +) from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.base.profile_options import get_backend_profile_options from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient -from dstack._internal.core.backends.vastai.config import VastAIConfig +from dstack._internal.core.backends.vastai.models import VastAIConfig +from dstack._internal.core.backends.vastai.profile_options import ( + VASTAI_DEFAULT_MIN_RELIABILITY, + VASTAI_DEFAULT_OFFER_ORDER, + VastAIOfferOrder, + VastAIProfileOptions, +) +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.errors import ProvisioningError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.instances import ( @@ -15,6 +28,7 @@ InstanceOfferWithAvailability, InstanceRuntime, ) +from dstack._internal.core.models.placement import PlacementGroup from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run from dstack._internal.core.models.volumes import Volume from dstack._internal.utils.logging import get_logger @@ -22,41 +36,77 @@ logger = get_logger(__name__) -class VastAICompute(Compute): +# Undocumented but names of len 60 work +MAX_INSTANCE_NAME_LEN = 60 + + +class VastAICompute( + ComputeWithFilteredOffersCached, + Compute, +): def __init__(self, config: VastAIConfig): + super().__init__() self.config = config self.api_client = VastAIAPIClient(config.creds.api_key) - self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) - self.catalog.add_provider( + + def _make_catalog(self, options: VastAIProfileOptions) -> gpuhunt.Catalog: + filters = { + "direct_port_count": {"gte": 1}, + "reliability2": { + "gte": options.min_reliability + if options.min_reliability is not None + else VASTAI_DEFAULT_MIN_RELIABILITY + }, + "inet_down": {"gt": 128}, + "verified": {"eq": 
True}, + "cuda_max_good": {"gte": 12.8}, + "compute_cap": {"gte": 600}, + } + if options.min_score is not None: + filters["score"] = {"gte": options.min_score} + match options.offer_order or VASTAI_DEFAULT_OFFER_ORDER: + case VastAIOfferOrder.SCORE: + order = [("score", "desc")] + case VastAIOfferOrder.PRICE: + # NOTE: dph_base is only one of the price components, + # so we also sort by InstanceOffer.price later for accurate results. + order = [("dph_base", "asc")] + case other: + assert_never(other) + catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + catalog.add_provider( VastAIProvider( - extra_filters={ - "direct_port_count": {"gte": 1}, - "reliability2": {"gte": 0.9}, - "inet_down": {"gt": 128}, - "verified": {"eq": True}, - "cuda_max_good": {"gte": 11.8}, - } + community_cloud=self.config.allow_community_cloud, + extra_filters=filters, + order=order, ) ) + return catalog - def get_offers( - self, requirements: Optional[Requirements] = None + def get_offers_by_requirements( + self, requirements: Requirements ) -> List[InstanceOfferWithAvailability]: + vastai_options = ( + get_backend_profile_options(requirements.backend_options, VastAIProfileOptions) + or VastAIProfileOptions() + ) offers = get_catalog_offers( backend=BackendType.VASTAI, + locations=self.config.regions or None, requirements=requirements, # TODO(egor-s): spots currently not supported extra_filter=lambda offer: not offer.instance.resources.spot, - catalog=self.catalog, + catalog=self._make_catalog(vastai_options), ) offers = [ - InstanceOfferWithAvailability( - **offer.dict(), + offer.with_availability( availability=InstanceAvailability.AVAILABLE, instance_runtime=InstanceRuntime.RUNNER, ) for offer in offers ] + if (vastai_options.offer_order or VASTAI_DEFAULT_OFFER_ORDER) == VastAIOfferOrder.PRICE: + offers = sorted(offers, key=lambda o: o.price) return offers def run_job( @@ -67,12 +117,17 @@ def run_job( project_ssh_public_key: str, project_ssh_private_key: str, volumes: 
List[Volume], + placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: + instance_name = generate_unique_instance_name_for_job( + run, job, max_length=MAX_INSTANCE_NAME_LEN + ) + assert run.run_spec.ssh_key_pub is not None commands = get_docker_commands( [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()] ) resp = self.api_client.create_instance( - instance_name=get_instance_name(run, job), + instance_name=instance_name, bundle_id=instance_offer.instance.name, image_name=job.job_spec.image_name, onstart=" && ".join(commands), @@ -110,7 +165,9 @@ def update_provisioning_data( if resp is not None: if resp["actual_status"] == "running": provisioning_data.hostname = resp["public_ipaddr"].strip() - provisioning_data.ssh_port = int(resp["ports"]["10022/tcp"][0]["HostPort"]) + provisioning_data.ssh_port = int( + resp["ports"][f"{DSTACK_RUNNER_SSH_PORT}/tcp"][0]["HostPort"] + ) if ( resp["actual_status"] == "created" and ": OCI runtime create failed:" in resp["status_msg"] diff --git a/src/dstack/_internal/core/backends/vastai/config.py b/src/dstack/_internal/core/backends/vastai/config.py deleted file mode 100644 index ee62275b40..0000000000 --- a/src/dstack/_internal/core/backends/vastai/config.py +++ /dev/null @@ -1,6 +0,0 @@ -from dstack._internal.core.backends.base.config import BackendConfig -from dstack._internal.core.models.backends.vastai import AnyVastAICreds, VastAIStoredConfig - - -class VastAIConfig(VastAIStoredConfig, BackendConfig): - creds: AnyVastAICreds diff --git a/src/dstack/_internal/core/backends/vastai/configurator.py b/src/dstack/_internal/core/backends/vastai/configurator.py new file mode 100644 index 0000000000..e854a7ea0f --- /dev/null +++ b/src/dstack/_internal/core/backends/vastai/configurator.py @@ -0,0 +1,69 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.vastai import 
api_client +from dstack._internal.core.backends.vastai.backend import VastAIBackend +from dstack._internal.core.backends.vastai.models import ( + VastAIBackendConfig, + VastAIBackendConfigWithCreds, + VastAIConfig, + VastAICreds, + VastAIStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + +REGIONS = [] + + +class VastAIConfigurator( + Configurator[ + VastAIBackendConfig, + VastAIBackendConfigWithCreds, + ] +): + TYPE = BackendType.VASTAI + BACKEND_CLASS = VastAIBackend + + def validate_config(self, config: VastAIBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_vastai_creds(config.creds.api_key) + + def create_backend( + self, project_name: str, config: VastAIBackendConfigWithCreds + ) -> BackendRecord: + if config.regions is None: + config.regions = REGIONS + return BackendRecord( + config=VastAIStoredConfig( + **VastAIBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=VastAICreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> VastAIBackendConfigWithCreds: + config = self._get_config(record) + return VastAIBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> VastAIBackendConfig: + config = self._get_config(record) + return VastAIBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> VastAIBackend: + config = self._get_config(record) + return VastAIBackend(config=config) + + def _get_config(self, record: BackendRecord) -> VastAIConfig: + return VastAIConfig.__response__( + **json.loads(record.config), + creds=VastAICreds.parse_raw(record.auth), + ) + + def _validate_vastai_creds(self, api_key: str): + client = api_client.VastAIAPIClient(api_key=api_key) + if not client.auth_test(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/vastai/models.py 
b/src/dstack/_internal/core/backends/vastai/models.py new file mode 100644 index 0000000000..c712bce51b --- /dev/null +++ b/src/dstack/_internal/core/backends/vastai/models.py @@ -0,0 +1,56 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + +# TODO: Re-evaluate this default once Vast Server Cloud inventory improves for +# CUDA-sensitive GPU families (e.g. H100 with strict cuda_max_good filtering). +VASTAI_COMMUNITY_CLOUD_DEFAULT = True + + +class VastAIAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyVastAICreds = VastAIAPIKeyCreds +VastAICreds = AnyVastAICreds + + +class VastAIBackendConfig(CoreModel): + type: Annotated[Literal["vastai"], Field(description="The type of backend")] = "vastai" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of VastAI regions. Omit to use all regions"), + ] = None + community_cloud: Annotated[ + Optional[bool], + Field( + description=( + "Whether Community Cloud offers can be suggested in addition to Server Cloud." 
+ f" Defaults to `{str(VASTAI_COMMUNITY_CLOUD_DEFAULT).lower()}`" + ) + ), + ] = None + + +class VastAIBackendConfigWithCreds(VastAIBackendConfig): + creds: Annotated[AnyVastAICreds, Field(description="The credentials")] + + +AnyVastAIBackendConfig = Union[VastAIBackendConfig, VastAIBackendConfigWithCreds] + + +class VastAIStoredConfig(VastAIBackendConfig): + pass + + +class VastAIConfig(VastAIStoredConfig): + creds: AnyVastAICreds + + @property + def allow_community_cloud(self) -> bool: + if self.community_cloud is not None: + return self.community_cloud + return VASTAI_COMMUNITY_CLOUD_DEFAULT diff --git a/src/dstack/_internal/core/backends/vastai/profile_options.py b/src/dstack/_internal/core/backends/vastai/profile_options.py new file mode 100644 index 0000000000..4574d05b30 --- /dev/null +++ b/src/dstack/_internal/core/backends/vastai/profile_options.py @@ -0,0 +1,63 @@ +from enum import Enum +from typing import Annotated, Literal, Optional + +from pydantic import Field + +from dstack._internal.core.backends.base.profile_options import BackendProfileOptions +from dstack._internal.utils.combine import get_max_optional, get_single_value_optional + + +class VastAIOfferOrder(str, Enum): + SCORE = "score" + PRICE = "price" + + +VASTAI_DEFAULT_OFFER_ORDER = VastAIOfferOrder.SCORE +VASTAI_DEFAULT_MIN_RELIABILITY = 0.9 + + +class VastAIProfileOptions(BackendProfileOptions["VastAIProfileOptions"]): + type: Literal["vastai"] = "vastai" + offer_order: Annotated[ + Optional[VastAIOfferOrder], + Field( + description=( + "Controls the order in which offers are considered for provisioning." + " Use `score` to prioritize the highest overall score first" + " (the default order in the Vast.ai console)," + " or `price` to prioritize the lowest-cost offers first." + " Lower-cost offers are often less reliable," + " so consider applying stricter filters when using `price`." 
+ f" Defaults to `{VASTAI_DEFAULT_OFFER_ORDER.value}`" + ) + ), + ] = None + min_reliability: Annotated[ + Optional[float], + Field( + description=( + "The minimum reliability threshold for offers, on a scale from `0` to `1`." + f" Defaults to `{VASTAI_DEFAULT_MIN_RELIABILITY}`" + ), + ge=0, + le=1, + ), + ] = None + min_score: Annotated[ + Optional[int], + Field( + description=( + "The minimum overall score required for offers to be considered." + " The scoring scale varies and may require experimentation." + " Starting with a value in the low hundreds is generally recommended" + ), + ge=0, + ), + ] = None + + def combine(self, other: "VastAIProfileOptions") -> "VastAIProfileOptions": + return VastAIProfileOptions( + offer_order=get_single_value_optional(self.offer_order, other.offer_order), + min_reliability=get_max_optional(self.min_reliability, other.min_reliability), + min_score=get_max_optional(self.min_score, other.min_score), + ) diff --git a/src/dstack/_internal/core/backends/verda/__init__.py b/src/dstack/_internal/core/backends/verda/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/core/backends/verda/backend.py b/src/dstack/_internal/core/backends/verda/backend.py new file mode 100644 index 0000000000..eed2beb34d --- /dev/null +++ b/src/dstack/_internal/core/backends/verda/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.verda.compute import VerdaCompute +from dstack._internal.core.backends.verda.models import VerdaConfig +from dstack._internal.core.models.backends.base import BackendType + + +class VerdaBackend(Backend): + TYPE = BackendType.VERDA + COMPUTE_CLASS = VerdaCompute + + def __init__(self, config: VerdaConfig): + self.config = config + self._compute = VerdaCompute(self.config, self.TYPE) + + def compute(self) -> VerdaCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/verda/compute.py 
class VerdaCompute(
    ComputeWithAllOffersCached,
    ComputeWithCreateInstanceSupport,
    ComputeWithPrivilegedSupport,
    ComputeWithInstanceVolumesSupport,
    Compute,
):
    """
    Compute implementation that provisions instances through `VerdaClient`.

    Each instance gets its own SSH keys and startup script. Their IDs are stored
    in the instance backend data so they can be deleted on termination.
    """

    def __init__(self, config: VerdaConfig, backend_type: BackendType):
        super().__init__()
        self.config = config
        self.client = VerdaClient(
            client_id=self.config.creds.client_id,
            client_secret=self.config.creds.client_secret,
        )
        self.backend_type = backend_type

    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers for the configured regions, annotated with availability."""
        offers = get_catalog_offers(
            backend=self.backend_type,
            locations=self.config.regions,
        )
        offers_with_availability = self._get_offers_with_availability(offers)
        return offers_with_availability

    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
        """Return the disk-size modifier built from `CONFIGURABLE_DISK_SIZE`."""
        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

    def _get_offers_with_availability(
        self, offers: List[InstanceOffer]
    ) -> List[InstanceOfferWithAvailability]:
        """
        Mark each offer as AVAILABLE or NOT_AVAILABLE.

        Availabilities are queried once for on-demand and once for spot. An offer is
        available when its (instance name, region, spot) triple was reported by the API.
        """
        region_availabilities = {}
        for is_spot in (False, True):
            raw_availabilities: List[Dict] = self.client.instances.get_availabilities(
                is_spot=is_spot
            )
            for location in raw_availabilities:
                location_code = location["location_code"]
                availabilities = location["availabilities"]
                for name in availabilities:
                    key = (name, location_code, is_spot)
                    region_availabilities[key] = InstanceAvailability.AVAILABLE

        availability_offers = []
        for offer in offers:
            key = (offer.instance.name, offer.region, offer.instance.resources.spot)
            availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE)
            availability_offers.append(offer.with_availability(availability=availability))

        return availability_offers

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """
        Create per-instance SSH keys and a startup script, then deploy the instance.

        If any step fails, the SSH keys and startup script created so far are deleted
        (best effort) and the original exception is re-raised. `placement_group` is
        not used by this backend.
        """
        instance_name = generate_unique_instance_name(
            instance_config, max_length=MAX_INSTANCE_NAME_LEN
        )
        public_keys = instance_config.get_public_keys()
        ssh_ids: List[str] = []
        startup_script_id: Optional[str] = None
        try:
            for idx, ssh_public_key in enumerate(public_keys):
                ssh_ids.append(
                    _create_ssh_key(
                        client=self.client,
                        name=f"{instance_name}-{idx}.key",
                        public_key=ssh_public_key,
                    )
                )

            commands = get_shim_commands()
            # Run the shim commands as a single `&&`-chained command line.
            startup_script = " && ".join(commands)
            script_name = f"{instance_name}.sh"
            startup_script_id = _create_startup_script(
                client=self.client,
                name=script_name,
                script=startup_script,
            )

            # The offer stores the disk size in MiB; the volume size passed below is
            # that value divided by 1024 and rounded.
            disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
            image_id = _get_vm_image_id(instance_offer)

            # The `%s` placeholder is required: a dict passed as the only logging
            # argument without any placeholder is never rendered into the message.
            logger.debug(
                "Deploying Verda instance: %s",
                {
                    "instance_type": instance_offer.instance.name,
                    "ssh_key_ids": ssh_ids,
                    "startup_script_id": startup_script_id,
                    "hostname": instance_name,
                    "description": instance_name,
                    "image": image_id,
                    "disk_size": disk_size,
                    "location": instance_offer.region,
                },
            )
            instance = _deploy_instance(
                client=self.client,
                instance_type=instance_offer.instance.name,
                ssh_key_ids=ssh_ids,
                startup_script_id=startup_script_id,
                hostname=instance_name,
                description=instance_name,
                image=image_id,
                disk_size=disk_size,
                is_spot=instance_offer.instance.resources.spot,
                location=instance_offer.region,
            )
        except Exception:
            # startup_script_id and ssh_key_ids are per-instance. Ensure no leaks on failures.
            try:
                _delete_startup_script(self.client, startup_script_id)
            except Exception:
                logger.exception(
                    "Failed to cleanup startup script %s after provisioning failure.",
                    startup_script_id,
                )
            try:
                _delete_ssh_keys(self.client, ssh_ids)
            except Exception:
                logger.exception(
                    "Failed to cleanup ssh keys %s after provisioning failure.",
                    ssh_ids,
                )
            raise
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=instance.id,
            # The hostname is filled in later by `update_provisioning_data`.
            hostname=None,
            internal_ip=None,
            region=instance.location,
            price=instance_offer.price,
            username="root",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=VerdaInstanceBackendData(
                startup_script_id=startup_script_id,
                ssh_key_ids=ssh_ids,
            ).json(),
        )

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """
        Permanently delete the instance, then its startup script and SSH keys.

        Raises:
            NotYetTerminated: if the API refuses because the instance is still provisioning.
            APIException: for any API error message not handled below.
        """
        backend_data_parsed = VerdaInstanceBackendData.load(backend_data)
        try:
            self.client.instances.action(
                id_list=[instance_id],
                action="delete",
                delete_permanently=True,
            )
        except APIException as e:
            if e.message in [
                "Invalid instance id",
                "Can't discontinue a discontinued instance",
            ]:
                logger.debug("Skipping instance %s termination. Instance not found.", instance_id)
            elif e.message == "Can't discontinue a provisioning instance":
                raise NotYetTerminated(
                    "Waiting for Verda instance to leave provisioning state."
                    " Verda forbids terminating provisioning instances"
                ) from e
            else:
                raise
        _delete_startup_script(self.client, backend_data_parsed.startup_script_id)
        _delete_ssh_keys(self.client, backend_data_parsed.ssh_key_ids)

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """
        Set `provisioning_data.hostname` once the instance status is "running".

        Raises:
            ProvisioningError: if the instance is not found or has a status other
                than "ordered", "provisioning", or "running".
        """
        instance = _get_instance_by_id(self.client, provisioning_data.instance_id)
        if instance is None:
            raise ProvisioningError("Verda instance not found")
        if instance.status not in ("ordered", "provisioning", "running"):
            raise ProvisioningError(f"Unexpected Verda instance status: {instance.status!r}")
        if instance.status == "running":
            provisioning_data.hostname = instance.ip
def _deploy_instance(
    client: VerdaClient,
    instance_type: str,
    image: str,
    ssh_key_ids: List[str],
    hostname: str,
    description: str,
    startup_script_id: str,
    disk_size: int,
    is_spot: bool,
    location: str,
) -> Instance:
    """
    Submit an instance creation request and return the created instance.

    The request uses fixed-price pricing and does not wait for any status.
    Any `APIException` from the client is re-raised as `NoCapacityError`.
    """
    create_kwargs = dict(
        instance_type=instance_type,
        image=image,
        ssh_key_ids=ssh_key_ids,
        hostname=hostname,
        description=description,
        startup_script_id=startup_script_id,
        pricing="FIXED_PRICE",
        is_spot=is_spot,
        location=location,
        os_volume={"name": "OS volume", "size": disk_size},
        wait_for_status=None,  # return asap
    )
    try:
        return client.instances.create(**create_kwargs)
    except APIException as e:
        # FIXME: Catch only no capacity errors
        raise NoCapacityError(f"Verda API error: {e.message}")
class VerdaConfigurator(
    Configurator[
        VerdaBackendConfig,
        VerdaBackendConfigWithCreds,
    ]
):
    """Validates Verda backend configs and converts them to and from `BackendRecord`s."""

    TYPE = BackendType.VERDA
    BACKEND_CLASS = VerdaBackend

    def validate_config(self, config: VerdaBackendConfigWithCreds, default_creds_enabled: bool):
        """Validate the credentials in `config`; `default_creds_enabled` is not used."""
        self._validate_creds(config.creds)

    def create_backend(
        self, project_name: str, config: VerdaBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize `config` into a record: non-credential fields in `config`, creds in `auth`."""
        return BackendRecord(
            config=VerdaStoredConfig(
                **VerdaBackendConfig.__response__.parse_obj(config).dict()
            ).json(),
            auth=VerdaCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config_with_creds(self, record: BackendRecord) -> VerdaBackendConfigWithCreds:
        config = self._get_config(record)
        return VerdaBackendConfigWithCreds.__response__.parse_obj(config)

    def get_backend_config_without_creds(self, record: BackendRecord) -> VerdaBackendConfig:
        config = self._get_config(record)
        return VerdaBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> VerdaBackend:
        config = self._get_config(record)
        return VerdaBackend(config=config)

    def _get_config(self, record: BackendRecord) -> VerdaConfig:
        """Rebuild the full config from the stored JSON config and the stored credentials."""
        return VerdaConfig.__response__(
            **json.loads(record.config),
            creds=VerdaCreds.parse_raw(record.auth),
        )

    def _validate_creds(self, creds: VerdaCreds):
        """
        Check the credentials by constructing a `VerdaClient` with them.

        NOTE(review): this relies on the `VerdaClient` constructor raising
        `APIException` for rejected credentials -- confirm against the verda SDK.

        Raises an invalid-credentials error for an "unauthorized_request" API error;
        any other `APIException` is re-raised unchanged.
        """
        try:
            VerdaClient(
                client_id=creds.client_id,
                client_secret=creds.client_secret,
            )
        except APIException as e:
            if e.code == "unauthorized_request":
                # Verda credentials consist of `client_id` and `client_secret`;
                # there is no `api_key` field to point the error at.
                raise_invalid_credentials_error(
                    fields=[["creds", "client_id"], ["creds", "client_secret"]]
                )
            raise
class VultrApiClient:
    """Client for the Vultr HTTP API at `API_URL`, authenticated with a bearer API key."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    def validate_api_key(self) -> bool:
        """Return False if listing SSH keys fails with an invalid-credentials error."""
        try:
            self._make_request("GET", "/ssh-keys")
            return True
        except BackendInvalidCredentialsError:
            return False

    def get_instance(self, instance_id: str, plan_type: str) -> dict:
        """Fetch a bare-metal server or a regular instance, depending on `plan_type`."""
        is_bare_metal = plan_type == "bare-metal"
        if is_bare_metal:
            path, payload_key = f"/bare-metals/{instance_id}", "bare_metal"
        else:
            path, payload_key = f"/instances/{instance_id}", "instance"
        return self._make_request("GET", path).json()[payload_key]

    def get_vpc_for_region(self, region: str) -> Optional[dict]:
        """Return the VPC described as `dstack-vpc-<region>`, or None if there is none."""
        listing = self._make_request("GET", "/vpcs?per_page=500").json()
        wanted_description = f"dstack-vpc-{region}"
        return next(
            (
                candidate
                for candidate in (listing.get("vpcs", []) or [])
                if candidate["description"] == wanted_description
            ),
            None,
        )

    def create_vpc(self, region: str) -> dict:
        """Create a VPC described as `dstack-vpc-<region>` and return it."""
        body = {"region": region, "description": f"dstack-vpc-{region}"}
        return self._make_request("POST", "/vpcs", data=body).json()["vpc"]

    def launch_instance(self, region: str, plan: str, label: str, user_data: str, vpc_id: str):
        """
        Launch a bare-metal server ("vbm" plans) or a VM (all other plans).

        Returns the ID of the created server or instance.
        """
        is_bare_metal = "vbm" in plan
        if not is_bare_metal and "vcg" in plan:
            # Ubuntu 22.04 will be installed. For gpu VMs, docker is preinstalled.
            image_spec = {"os_id": 1743}
        else:
            # "Docker on Ubuntu 22.04" is required for bare-metals; the same image
            # is used for the remaining VM plans.
            image_spec = {"image_id": "docker"}

        request_body = {
            "region": region,
            "plan": plan,
            "label": label,
            **image_spec,
            "user_data": base64.b64encode(user_data.encode()).decode(),
            "attach_vpc": [vpc_id],
        }
        if is_bare_metal:
            endpoint, payload_key = "/bare-metals", "bare_metal"
        else:
            endpoint, payload_key = "/instances", "instance"
        created = self._make_request("POST", endpoint, request_body)
        return created.json()[payload_key]["id"]

    def terminate_instance(self, instance_id: str, plan_type: str):
        """Delete a bare-metal server or a virtual machine, depending on `plan_type`."""
        collection = "/bare-metals" if plan_type == "bare-metal" else "/instances"
        self._make_request("DELETE", f"{collection}/{instance_id}")

    def _make_request(self, method: str, path: str, data: Any = None) -> Response:
        """
        Send a JSON request and return the response if its status is not an error.

        Raises:
            BackendInvalidCredentialsError: on 401 or 403 responses.
            BackendError: on 400, 404, or 500 responses.
            requests.HTTPError: on any other error status.
        """
        try:
            reply = requests.request(
                method=method,
                url=API_URL + path,
                json=data,
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=30,
            )
            reply.raise_for_status()
            return reply
        except requests.HTTPError as e:
            failed = e.response
            if failed is None:
                raise
            status = failed.status_code
            if status in (requests.codes.forbidden, requests.codes.unauthorized):
                raise BackendInvalidCredentialsError(failed.text)
            if status in (
                requests.codes.bad_request,
                requests.codes.internal_server_error,
                requests.codes.not_found,
            ):
                raise BackendError(failed.text)
            raise
class VultrCompute(
    ComputeWithAllOffersCached,
    ComputeWithCreateInstanceSupport,
    ComputeWithPrivilegedSupport,
    ComputeWithInstanceVolumesSupport,
    ComputeWithMultinodeSupport,
    Compute,
):
    """
    Compute implementation that provisions instances through `VultrApiClient`.

    The plan type ("bare-metal" or "vm_instance") is stored in the instance backend
    data at creation time and is needed later to pick the matching API endpoint.
    """

    def __init__(self, config: VultrConfig):
        super().__init__()
        self.config = config
        self.api_client = VultrApiClient(config.creds.api_key)

    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
        """Return supported catalog offers for the configured regions, all marked AVAILABLE."""
        offers = get_catalog_offers(
            backend=BackendType.VULTR,
            requirements=None,
            # An empty regions list is passed as None rather than as an empty filter.
            locations=self.config.regions or None,
            extra_filter=_supported_instances,
        )
        offers = [
            offer.with_availability(availability=InstanceAvailability.AVAILABLE)
            for offer in offers
        ]
        return offers

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """
        Launch an instance attached to the region's dstack VPC, creating the VPC if needed.

        `placement_group` is not used by this backend.
        """
        instance_name = generate_unique_instance_name(
            instance_config, max_length=MAX_INSTANCE_NAME_LEN
        )
        # create vpc
        vpc = self.api_client.get_vpc_for_region(instance_offer.region)
        if not vpc:
            vpc = self.api_client.create_vpc(instance_offer.region)

        subnet = vpc["v4_subnet"]
        subnet_mask = vpc["v4_subnet_mask"]

        instance_id = self.api_client.launch_instance(
            region=instance_offer.region,
            label=instance_name,
            plan=instance_offer.instance.name,
            user_data=get_user_data(
                authorized_keys=instance_config.get_public_keys(),
                firewall_allow_from_subnets=[f"{subnet}/{subnet_mask}"],
            ),
            vpc_id=vpc["id"],
        )

        launched_instance = JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=instance_id,
            # Hostname and internal IP are filled in later by `update_provisioning_data`.
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            ssh_port=22,
            username="root",
            ssh_proxy=None,
            dockerized=True,
            backend_data=json.dumps(
                {
                    "plan_type": "bare-metal"
                    if "vbm" in instance_offer.instance.name
                    else "vm_instance"
                }
            ),
        )
        return launched_instance

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ) -> None:
        """
        Delete the instance using the endpoint that matches its stored plan type.

        Raises:
            BackendError: if `backend_data` is missing, or if the API request fails.
        """
        plan_type = self._get_plan_type(backend_data)
        try:
            self.api_client.terminate_instance(instance_id=instance_id, plan_type=plan_type)
        except requests.HTTPError as e:
            raise BackendError(e.response.text if e.response is not None else str(e))

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """
        Set hostname and internal IP once the instance status is "active".

        Raises:
            BackendError: if the provisioning data carries no backend data.
            ProvisioningError: if the instance status is "failed".
        """
        plan_type = self._get_plan_type(provisioning_data.backend_data)
        instance_data = self.api_client.get_instance(provisioning_data.instance_id, plan_type)
        # Access specific fields
        instance_status = instance_data["status"]
        instance_main_ip = instance_data["main_ip"]
        instance_internal_ip = instance_data["internal_ip"]
        if instance_status == "active":
            provisioning_data.hostname = instance_main_ip
            provisioning_data.internal_ip = instance_internal_ip
        if instance_status == "failed":
            raise ProvisioningError("VM entered FAILED state")

    @staticmethod
    def _get_plan_type(backend_data: Optional[str]) -> str:
        """
        Extract the plan type from the JSON backend data written by `create_instance`.

        Missing backend data is reported as a `BackendError` instead of the opaque
        `TypeError` that `json.loads(None)` would raise. No endpoint is guessed,
        because a wrong guess would address the wrong kind of resource.
        """
        if backend_data is None:
            raise BackendError(
                "Vultr instance backend data is missing: cannot determine the plan type"
            )
        return json.loads(backend_data)["plan_type"]
class VultrConfigurator(
    Configurator[
        VultrBackendConfig,
        VultrBackendConfigWithCreds,
    ]
):
    """Validates Vultr backend configs and converts them to and from `BackendRecord`s."""

    TYPE = BackendType.VULTR
    BACKEND_CLASS = VultrBackend

    def validate_config(self, config: VultrBackendConfigWithCreds, default_creds_enabled: bool):
        """Validate the API key in `config`; `default_creds_enabled` is not used."""
        api_key = config.creds.api_key
        self._validate_vultr_api_key(api_key)

    def create_backend(
        self, project_name: str, config: VultrBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize `config` into a record, substituting `REGIONS` when no regions are set."""
        if config.regions is None:
            config.regions = REGIONS
        without_creds = VultrBackendConfig.__response__.parse_obj(config)
        stored_json = VultrStoredConfig(**without_creds.dict()).json()
        auth_json = VultrCreds.parse_obj(config.creds).json()
        return BackendRecord(config=stored_json, auth=auth_json)

    def get_backend_config_with_creds(self, record: BackendRecord) -> VultrBackendConfigWithCreds:
        full_config = self._get_config(record)
        return VultrBackendConfigWithCreds.__response__.parse_obj(full_config)

    def get_backend_config_without_creds(self, record: BackendRecord) -> VultrBackendConfig:
        full_config = self._get_config(record)
        return VultrBackendConfig.__response__.parse_obj(full_config)

    def get_backend(self, record: BackendRecord) -> VultrBackend:
        return VultrBackend(config=self._get_config(record))

    def _get_config(self, record: BackendRecord) -> VultrConfig:
        """Rebuild the full config from the stored JSON config and the stored credentials."""
        stored_fields = json.loads(record.config)
        creds = VultrCreds.parse_raw(record.auth)
        return VultrConfig.__response__(**stored_fields, creds=creds)

    def _validate_vultr_api_key(self, api_key: str):
        """Raise an invalid-credentials error if the client rejects the API key."""
        if api_client.VultrApiClient(api_key=api_key).validate_api_key():
            return
        raise_invalid_credentials_error(fields=[["creds", "api_key"]])
def get_profile_excludes(profile: Optional[ProfileParams]) -> IncludeExcludeSetType:
    """
    Return the names of profile fields to leave out of a request.

    `backend_options` and `instances` are excluded when they are `None`.
    A missing profile yields an empty set.
    """
    if profile is None:
        return set()
    return {
        field_name
        for field_name in ("backend_options", "instances")
        if getattr(profile, field_name) is None
    }
def get_list_events_excludes(request: ListEventsRequest) -> IncludeExcludeDictType:
    """
    Return the exclude mapping for a list-events request.

    Each of `target_volumes`, `target_gateways`, and `target_secrets` is excluded
    when it is `None`.
    """
    excludes: IncludeExcludeDictType = {}
    for field_name in ("target_volumes", "target_gateways", "target_secrets"):
        if getattr(request, field_name) is None:
            excludes[field_name] = True
    return excludes
def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> IncludeExcludeDictType:
    """
    Return the exclude mapping for an apply-plan request, nested under the "plan" key.

    Spec excludes are computed for the plan's spec and, when a current resource is
    present, for that resource's spec as well.
    """

    def _spec_entry(spec: FleetSpec) -> IncludeExcludeDictType:
        # Wrap non-empty spec excludes under the "spec" key; otherwise contribute nothing.
        found = get_fleet_spec_excludes(spec)
        return {"spec": found} if found else {}

    plan_excludes: IncludeExcludeDictType = _spec_entry(plan_input.spec)
    existing = plan_input.current_resource
    if existing is not None:
        # The "current_resource" entry is added even when it ends up empty.
        plan_excludes["current_resource"] = _spec_entry(existing.spec)
    return {"plan": plan_excludes}
create_fleet_excludes + + +def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDictType]: + """ + Returns `fleet_spec` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + spec_excludes: IncludeExcludeDictType = {} + configuration_excludes: IncludeExcludeDictType = {} + profile_excludes = get_profile_excludes(fleet_spec.profile) + + spec_excludes["autocreated"] = True + if fleet_spec.configuration.backend_options is None: + configuration_excludes["backend_options"] = True + + if configuration_excludes: + spec_excludes["configuration"] = configuration_excludes + if profile_excludes: + spec_excludes["profile"] = profile_excludes + if spec_excludes: + return spec_excludes + return None + + +def patch_fleet_spec(spec: FleetSpec) -> None: + patch_profile_params(spec.profile) diff --git a/src/dstack/_internal/core/compatibility/gateways.py b/src/dstack/_internal/core/compatibility/gateways.py new file mode 100644 index 0000000000..0a89e86113 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/gateways.py @@ -0,0 +1,47 @@ +from dstack._internal.core.models.common import IncludeExcludeDictType +from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec +from dstack._internal.server.schemas.gateways import SetDefaultGatewayRequest + + +def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> IncludeExcludeDictType: + """ + Returns `gateway_spec` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. 
+ """ + spec_excludes: IncludeExcludeDictType = {} + spec_excludes["configuration"] = _get_gateway_configuration_excludes( + gateway_spec.configuration + ) + return spec_excludes + + +def get_create_gateway_excludes(configuration: GatewayConfiguration) -> IncludeExcludeDictType: + """ + Returns an exclude mapping to exclude certain fields from the create gateway request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + create_gateway_excludes: IncludeExcludeDictType = {} + create_gateway_excludes["configuration"] = _get_gateway_configuration_excludes(configuration) + return create_gateway_excludes + + +def get_set_default_gateway_excludes(request: SetDefaultGatewayRequest) -> IncludeExcludeDictType: + excludes: IncludeExcludeDictType = {} + if request.gateway_project is None: + excludes["gateway_project"] = True + return excludes + + +def _get_gateway_configuration_excludes( + configuration: GatewayConfiguration, +) -> IncludeExcludeDictType: + configuration_excludes: IncludeExcludeDictType = {} + + if configuration.router is None: + configuration_excludes["router"] = True + if configuration.replicas is None: + configuration_excludes["replicas"] = True + + return configuration_excludes diff --git a/src/dstack/_internal/core/compatibility/gpus.py b/src/dstack/_internal/core/compatibility/gpus.py new file mode 100644 index 0000000000..3885001273 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/gpus.py @@ -0,0 +1,13 @@ +from typing import Optional + +from dstack._internal.core.compatibility.runs import get_run_spec_excludes +from dstack._internal.core.models.common import IncludeExcludeDictType +from dstack._internal.server.schemas.gpus import ListGpusRequest + + +def get_list_gpus_excludes(request: ListGpusRequest) -> Optional[IncludeExcludeDictType]: + list_gpus_excludes: IncludeExcludeDictType = {} + run_spec_excludes = get_run_spec_excludes(request.run_spec) + if 
run_spec_excludes is not None: + list_gpus_excludes["run_spec"] = run_spec_excludes + return list_gpus_excludes diff --git a/src/dstack/_internal/core/compatibility/logs.py b/src/dstack/_internal/core/compatibility/logs.py new file mode 100644 index 0000000000..078ce62218 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/logs.py @@ -0,0 +1,14 @@ +from typing import Optional + +from dstack._internal.core.models.common import IncludeExcludeDictType +from dstack._internal.server.schemas.logs import PollLogsRequest + + +def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[IncludeExcludeDictType]: + """ + Returns exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + excludes: IncludeExcludeDictType = {} + return excludes if excludes else None diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py new file mode 100644 index 0000000000..4b57db1d47 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -0,0 +1,196 @@ +from typing import Optional + +from dstack._internal.core.compatibility.common import get_profile_excludes, patch_profile_params +from dstack._internal.core.models.common import ( + EntityReference, + IncludeExcludeDictType, + IncludeExcludeSetType, +) +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.routers import SGLangServiceRouterConfig +from dstack._internal.core.models.runs import ( + DEFAULT_PROBE_UNTIL_READY, + DEFAULT_REPLICA_GROUP_NAME, + ApplyRunPlanInput, + JobSpec, + JobSubmission, + RunSpec, +) +from dstack._internal.server.schemas.runs import GetRunPlanRequest, ListRunsRequest + + +def get_list_runs_excludes(list_runs_request: ListRunsRequest) -> IncludeExcludeSetType: + excludes: IncludeExcludeSetType = set() + return excludes + + +def 
get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]: + """ + Returns `plan` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + apply_plan_excludes: IncludeExcludeDictType = {} + run_spec_excludes = get_run_spec_excludes(plan.run_spec) + if run_spec_excludes is not None: + apply_plan_excludes["run_spec"] = run_spec_excludes + current_resource = plan.current_resource + if current_resource is not None: + current_resource_excludes: IncludeExcludeDictType = {} + apply_plan_excludes["current_resource"] = current_resource_excludes + current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec) + current_resource_excludes["jobs"] = { + "__all__": { + "job_spec": get_job_spec_excludes([job.job_spec for job in current_resource.jobs]), + "job_submissions": { + "__all__": get_job_submission_excludes( + [ + submission + for job in current_resource.jobs + for submission in job.job_submissions + ] + ), + }, + # Contains only informational computed fields, safe to exclude unconditionally + "job_connection_info": True, + } + } + if current_resource.latest_job_submission is not None: + current_resource_excludes["latest_job_submission"] = get_job_submission_excludes( + [current_resource.latest_job_submission] + ) + return {"plan": apply_plan_excludes} + + +def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[IncludeExcludeDictType]: + """ + Excludes new fields when they are not set to keep + clients backward-compatibility with older servers. 
+ """ + get_plan_excludes: IncludeExcludeDictType = {} + run_spec_excludes = get_run_spec_excludes(request.run_spec) + if run_spec_excludes is not None: + get_plan_excludes["run_spec"] = run_spec_excludes + return get_plan_excludes + + +def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType: + """ + Returns `run_spec` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + spec_excludes: IncludeExcludeDictType = {} + configuration_excludes: IncludeExcludeDictType = {} + profile_excludes = get_profile_excludes(run_spec.profile) + for field in get_profile_excludes(run_spec.configuration): + configuration_excludes[field] = True + + if run_spec.configuration.backend_options is None: + configuration_excludes["backend_options"] = True + + if isinstance(run_spec.configuration, ServiceConfiguration): + if run_spec.configuration.probes: + probe_excludes: IncludeExcludeDictType = {} + configuration_excludes["probes"] = {"__all__": probe_excludes} + if all(p.until_ready is None for p in run_spec.configuration.probes): + probe_excludes["until_ready"] = True + elif run_spec.configuration.probes is None: + # Servers prior to 0.20.8 do not support probes=None + configuration_excludes["probes"] = True + + router = run_spec.configuration.router + if router is None: + configuration_excludes["router"] = True + elif isinstance(router, SGLangServiceRouterConfig) and router.pd_disaggregation is False: + configuration_excludes["router"] = {"pd_disaggregation": True} + if run_spec.configuration.https is None: + configuration_excludes["https"] = True + + replicas = run_spec.configuration.replicas + if isinstance(replicas, list): + replica_group_excludes: IncludeExcludeDictType = {} + if all(g.router is None for g in replicas): + replica_group_excludes["router"] = True + if all(g.scaling is None or g.scaling.window is None for g in replicas): 
+ replica_group_excludes["scaling"] = {"window": True} + if all(g.image is None for g in replicas): + replica_group_excludes["image"] = True + if all(g.docker is None for g in replicas): + replica_group_excludes["docker"] = True + if all(g.python is None for g in replicas): + replica_group_excludes["python"] = True + if all(g.nvcc is None for g in replicas): + replica_group_excludes["nvcc"] = True + if all(g.privileged is None for g in replicas): + replica_group_excludes["privileged"] = True + if all(g.spot_policy is None for g in replicas): + replica_group_excludes["spot_policy"] = True + if all(g.reservation is None for g in replicas): + replica_group_excludes["reservation"] = True + if replica_group_excludes: + configuration_excludes["replicas"] = {"__all__": replica_group_excludes} + + scaling = run_spec.configuration.scaling + if scaling is not None and scaling.window is None: + configuration_excludes["scaling"] = {"window": True} + + if configuration_excludes: + spec_excludes["configuration"] = configuration_excludes + if profile_excludes: + spec_excludes["profile"] = profile_excludes + return spec_excludes + + +def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType: + """ + Returns `job_spec` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. 
+ """ + spec_excludes: IncludeExcludeDictType = {} + if all(s.replica_group == DEFAULT_REPLICA_GROUP_NAME for s in job_specs): + spec_excludes["replica_group"] = True + + probe_excludes: IncludeExcludeDictType = {} + spec_excludes["probes"] = {"__all__": probe_excludes} + if all(all(p.until_ready == DEFAULT_PROBE_UNTIL_READY for p in s.probes) for s in job_specs): + probe_excludes["until_ready"] = True + + if all(s.requirements.backend_options is None for s in job_specs): + spec_excludes["requirements"] = {"backend_options": True} + + return spec_excludes + + +def get_job_submission_excludes(job_submissions: list[JobSubmission]) -> IncludeExcludeDictType: + submission_excludes: IncludeExcludeDictType = {} + + if any(s.job_runtime_data is not None for s in job_submissions): + jrd_excludes = {} + if all( + s.job_runtime_data is None or s.job_runtime_data.username is None + for s in job_submissions + ): + jrd_excludes["username"] = True + if all( + s.job_runtime_data is None or s.job_runtime_data.working_dir is None + for s in job_submissions + ): + jrd_excludes["working_dir"] = True + submission_excludes["job_runtime_data"] = jrd_excludes + + if all(s.image_pull_progress is None for s in job_submissions): + submission_excludes["image_pull_progress"] = True + + return submission_excludes + + +def patch_run_spec(run_spec: RunSpec) -> None: + patch_profile_params(run_spec.configuration) + if run_spec.profile is not None: + patch_profile_params(run_spec.profile) + if isinstance(run_spec.configuration, ServiceConfiguration): + if isinstance(run_spec.configuration.gateway, EntityReference): + # Pre-0.20.20 servers do not support `EntityReference` in `gateway` + run_spec.configuration.gateway = run_spec.configuration.gateway.format() diff --git a/src/dstack/_internal/core/compatibility/volumes.py b/src/dstack/_internal/core/compatibility/volumes.py new file mode 100644 index 0000000000..a0afabf1c6 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/volumes.py @@ 
-0,0 +1,42 @@ +from dstack._internal.core.models.common import IncludeExcludeDictType +from dstack._internal.core.models.volumes import ( + AnyVolumeConfiguration, + KubernetesVolumeConfiguration, + VolumeSpec, +) + + +def get_volume_spec_excludes(volume_spec: VolumeSpec) -> IncludeExcludeDictType: + """ + Returns `volume_spec` exclude mapping to exclude certain fields from the request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. + """ + spec_excludes: IncludeExcludeDictType = {} + spec_excludes["configuration"] = _get_volume_configuration_excludes(volume_spec.configuration) + return spec_excludes + + +def get_create_volume_excludes(configuration: AnyVolumeConfiguration) -> IncludeExcludeDictType: + """ + Returns an exclude mapping to exclude certain fields from the create volume request. + Use this method to exclude new fields when they are not set to keep + clients backward-compatibility with older servers. 
+ """ + create_volume_excludes: IncludeExcludeDictType = {} + create_volume_excludes["configuration"] = _get_volume_configuration_excludes(configuration) + return create_volume_excludes + + +def _get_volume_configuration_excludes( + configuration: AnyVolumeConfiguration, +) -> IncludeExcludeDictType: + configuration_excludes: IncludeExcludeDictType = {} + + if isinstance(configuration, KubernetesVolumeConfiguration): + if not configuration.read_only: + configuration_excludes["read_only"] = True + if configuration.region == "": + configuration_excludes["region"] = True + + return configuration_excludes diff --git a/src/dstack/_internal/core/consts.py b/src/dstack/_internal/core/consts.py new file mode 100644 index 0000000000..cb68f9be82 --- /dev/null +++ b/src/dstack/_internal/core/consts.py @@ -0,0 +1,8 @@ +# shim (runs on the host) HTTP API port +DSTACK_SHIM_HTTP_PORT = 10998 +# runner (runs inside a container) HTTP API port +DSTACK_RUNNER_HTTP_PORT = 10999 +# ssh server (runs alongside the runner inside a container) listen port +DSTACK_RUNNER_SSH_PORT = 10022 +# legacy AWS, Azure, GCP, and OCI image for older GPUs +DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10" diff --git a/src/dstack/_internal/core/deprecated.py b/src/dstack/_internal/core/deprecated.py new file mode 100644 index 0000000000..148d39d038 --- /dev/null +++ b/src/dstack/_internal/core/deprecated.py @@ -0,0 +1,5 @@ +import enum + + +class Deprecated(enum.Enum): + PLACEHOLDER = "DEPRECATED" diff --git a/src/dstack/_internal/core/errors.py b/src/dstack/_internal/core/errors.py index 7ae82f5447..e2685a64e4 100644 --- a/src/dstack/_internal/core/errors.py +++ b/src/dstack/_internal/core/errors.py @@ -18,6 +18,14 @@ class ClientError(DstackError): pass +class URLNotFoundError(ClientError): + pass + + +class MethodNotAllowedError(ClientError): + pass + + class ServerClientErrorCode(str, enum.Enum): UNSPECIFIED_ERROR = "error" RESOURCE_EXISTS = "resource_exists" @@ -98,6 +106,41 @@ class 
ComputeResourceNotFoundError(ComputeError): pass +class PlacementGroupInUseError(ComputeError): + pass + + +class PlacementGroupNotSupportedError(ComputeError): + pass + + +class NotYetTerminated(ComputeError): + """ + Used by Compute.terminate_instance to signal that instance termination is not complete + and the method should be called again after some time to continue termination. + """ + + def __init__(self, details: str) -> None: + """ + Args: + details: some details about the termination status + """ + return super().__init__(details) + + +class SkipOffer(ComputeError): + """ + Used by Compute.run_job and Compute.create_instance to signal that the offer should be skipped. + """ + + def __init__(self, details: str) -> None: + """ + Args: + details: details about why the offer should be skipped + """ + return super().__init__(details) + + class CLIError(DstackError): pass @@ -106,6 +149,10 @@ class ConfigurationError(DstackError): pass +class SSHProvisioningError(DstackError): + pass + + class SSHError(DstackError): pass @@ -128,3 +175,40 @@ class SSHPortInUseError(SSHError): class DockerRegistryError(DstackError): pass + + +class RepoError(DstackError): + pass + + +class RepoDetachedHeadError(RepoError): + pass + + +class RepoInvalidCredentialsError(RepoError): + pass + + +class RepoGitError(RepoError): + """ + A wrapper for `git.exc.GitError` and its subclasses. + + Should be raised with `from e` clause to indicate the underlying exception. + To build a message from the underlying exception, raise this exception without arguments. + + try: + ... + except git.GitError as e: + raise RepoGitError() from e + """ + + def __str__(self) -> str: + if self.args or self.__cause__ is None: + return super().__str__() + return f"{self.__cause__.__class__.__name__}: {self.__cause__}" + + +class RepoInvalidGitRepositoryError(RepoGitError): + """ + `DstackError` counterpart for `git.exc.InvalidGitRepositoryError`. 
+ """ diff --git a/src/dstack/_internal/core/models/auth.py b/src/dstack/_internal/core/models/auth.py new file mode 100644 index 0000000000..f6d09fbc73 --- /dev/null +++ b/src/dstack/_internal/core/models/auth.py @@ -0,0 +1,28 @@ +from typing import Annotated, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class OAuthProviderInfo(CoreModel): + name: Annotated[str, Field(description="The OAuth2 provider name.")] + enabled: Annotated[ + bool, Field(description="Whether the provider is configured on the server.") + ] + + +class OAuthState(CoreModel): + """ + A struct that the server puts in the OAuth2 state parameter. + """ + + value: Annotated[str, Field(description="A random string to protect against CSRF.")] + local_port: Annotated[ + Optional[int], + Field( + description="If specified, the user is redirected to localhost:local_port after the redirect from the provider.", + ge=1, + le=65535, + ), + ] = None diff --git a/src/dstack/_internal/core/models/backends/__init__.py b/src/dstack/_internal/core/models/backends/__init__.py index cfdc3f2c37..e69de29bb2 100644 --- a/src/dstack/_internal/core/models/backends/__init__.py +++ b/src/dstack/_internal/core/models/backends/__init__.py @@ -1,174 +0,0 @@ -from typing import Union - -from dstack._internal.core.models.backends.aws import ( - AWSConfigInfo, - AWSConfigInfoWithCreds, - AWSConfigInfoWithCredsPartial, - AWSConfigValues, -) -from dstack._internal.core.models.backends.azure import ( - AzureConfigInfo, - AzureConfigInfoWithCreds, - AzureConfigInfoWithCredsPartial, - AzureConfigValues, -) -from dstack._internal.core.models.backends.cudo import ( - CudoConfigInfo, - CudoConfigInfoWithCreds, - CudoConfigInfoWithCredsPartial, - CudoConfigValues, -) -from dstack._internal.core.models.backends.datacrunch import ( - DataCrunchConfigInfo, - DataCrunchConfigInfoWithCreds, - DataCrunchConfigInfoWithCredsPartial, - DataCrunchConfigValues, -) -from 
dstack._internal.core.models.backends.dstack import ( - DstackBaseBackendConfigInfo, - DstackConfigInfo, - DstackConfigValues, -) -from dstack._internal.core.models.backends.gcp import ( - GCPConfigInfo, - GCPConfigInfoWithCreds, - GCPConfigInfoWithCredsPartial, - GCPConfigValues, -) -from dstack._internal.core.models.backends.kubernetes import ( - KubernetesConfigInfo, - KubernetesConfigInfoWithCreds, - KubernetesConfigInfoWithCredsPartial, - KubernetesConfigValues, -) -from dstack._internal.core.models.backends.lambdalabs import ( - LambdaConfigInfo, - LambdaConfigInfoWithCreds, - LambdaConfigInfoWithCredsPartial, - LambdaConfigValues, -) -from dstack._internal.core.models.backends.nebius import ( - NebiusConfigInfo, - NebiusConfigInfoWithCreds, - NebiusConfigInfoWithCredsPartial, - NebiusConfigValues, -) -from dstack._internal.core.models.backends.oci import ( - OCIConfigInfo, - OCIConfigInfoWithCreds, - OCIConfigInfoWithCredsPartial, - OCIConfigValues, -) -from dstack._internal.core.models.backends.runpod import ( - RunpodConfigInfo, - RunpodConfigInfoWithCreds, - RunpodConfigInfoWithCredsPartial, - RunpodConfigValues, -) -from dstack._internal.core.models.backends.tensordock import ( - TensorDockConfigInfo, - TensorDockConfigInfoWithCreds, - TensorDockConfigInfoWithCredsPartial, - TensorDockConfigValues, -) -from dstack._internal.core.models.backends.vastai import ( - VastAIConfigInfo, - VastAIConfigInfoWithCreds, - VastAIConfigInfoWithCredsPartial, - VastAIConfigValues, -) -from dstack._internal.core.models.common import CoreModel - -# The following models are the basis of the JSON-based backend API. -# They are also the models used by the Configurator interface. -# The JSON-based backend API is replaced by the YAML-based backend API and not used. -# It's likely to be deprecated and removed. -# Some of the models below like those needed for interactive backend setup could be removed then. 
-# Still, others are going to stay as Configurator models to keep YAML-based configs and internal configs separated. - -# Backend config returned by the API -AnyConfigInfoWithoutCreds = Union[ - AWSConfigInfo, - AzureConfigInfo, - CudoConfigInfo, - DataCrunchConfigInfo, - GCPConfigInfo, - KubernetesConfigInfo, - LambdaConfigInfo, - NebiusConfigInfo, - OCIConfigInfo, - RunpodConfigInfo, - TensorDockConfigInfo, - VastAIConfigInfo, - DstackConfigInfo, - DstackBaseBackendConfigInfo, -] - -# Same as AnyConfigInfoWithoutCreds but also includes creds. -# Used to create/update backend. -# Also returned by the API to project admins so that they can see/update backend creds. -AnyConfigInfoWithCreds = Union[ - AWSConfigInfoWithCreds, - AzureConfigInfoWithCreds, - CudoConfigInfoWithCreds, - DataCrunchConfigInfoWithCreds, - GCPConfigInfoWithCreds, - KubernetesConfigInfoWithCreds, - LambdaConfigInfoWithCreds, - NebiusConfigInfoWithCreds, - OCIConfigInfoWithCreds, - RunpodConfigInfoWithCreds, - TensorDockConfigInfoWithCreds, - VastAIConfigInfoWithCreds, - DstackConfigInfo, -] - -AnyConfigInfo = Union[AnyConfigInfoWithoutCreds, AnyConfigInfoWithCreds] - -# Same as AnyConfigInfoWithCreds but some fields may be optional. -# Used for interactive setup with validation and suggestions (e.g. via UI). -# If the backend does not need interactive setup, it's the same as AnyConfigInfoWithCreds. -AnyConfigInfoWithCredsPartial = Union[ - AWSConfigInfoWithCredsPartial, - AzureConfigInfoWithCredsPartial, - CudoConfigInfoWithCredsPartial, - DataCrunchConfigInfoWithCredsPartial, - GCPConfigInfoWithCredsPartial, - KubernetesConfigInfoWithCredsPartial, - LambdaConfigInfoWithCredsPartial, - NebiusConfigInfoWithCredsPartial, - OCIConfigInfoWithCredsPartial, - RunpodConfigInfoWithCredsPartial, - TensorDockConfigInfoWithCredsPartial, - VastAIConfigInfoWithCredsPartial, - DstackConfigInfo, -] - -# Suggestions for unfilled fields used in interactive setup. 
-AnyConfigValues = Union[ - AWSConfigValues, - AzureConfigValues, - CudoConfigValues, - DataCrunchConfigValues, - GCPConfigValues, - KubernetesConfigValues, - LambdaConfigValues, - NebiusConfigValues, - OCIConfigValues, - RunpodConfigValues, - TensorDockConfigValues, - VastAIConfigValues, - DstackConfigValues, -] - - -# In case we'll support multiple backends of the same type, -# this adds backend name to backend config. -class BackendInfo(CoreModel): - name: str - config: AnyConfigInfoWithoutCreds - - -class BackendInfoYAML(CoreModel): - name: str - config_yaml: str diff --git a/src/dstack/_internal/core/models/backends/aws.py b/src/dstack/_internal/core/models/backends/aws.py deleted file mode 100644 index f685e383b5..0000000000 --- a/src/dstack/_internal/core/models/backends/aws.py +++ /dev/null @@ -1,62 +0,0 @@ -from typing import Dict - -from pydantic import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class AWSConfigInfo(CoreModel): - type: Literal["aws"] = "aws" - regions: Optional[List[str]] = None - vpc_name: Optional[str] = None - vpc_ids: Optional[Dict[str, str]] = None - default_vpcs: Optional[bool] = None - public_ips: Optional[bool] = None - - -class AWSAccessKeyCreds(CoreModel): - type: Annotated[Literal["access_key"], Field(description="The type of credentials")] = ( - "access_key" - ) - access_key: Annotated[str, Field(description="The access key")] - secret_key: Annotated[str, Field(description="The secret key")] - - -class AWSDefaultCreds(CoreModel): - type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" - - -AnyAWSCreds = Union[AWSAccessKeyCreds, AWSDefaultCreds] - - -class AWSCreds(CoreModel): - __root__: AnyAWSCreds = Field(..., discriminator="type") - - -class AWSConfigInfoWithCreds(AWSConfigInfo): - creds: AnyAWSCreds - - 
-AnyAWSConfigInfo = Union[AWSConfigInfo, AWSConfigInfoWithCreds] - - -class AWSConfigInfoWithCredsPartial(CoreModel): - type: Literal["aws"] = "aws" - creds: Optional[AnyAWSCreds] - regions: Optional[List[str]] - vpc_name: Optional[str] - vpc_ids: Optional[Dict[str, str]] - default_vpcs: Optional[bool] - public_ips: Optional[bool] - - -class AWSConfigValues(CoreModel): - type: Literal["aws"] = "aws" - default_creds: bool = False - regions: Optional[ConfigMultiElement] - - -class AWSStoredConfig(AWSConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/azure.py b/src/dstack/_internal/core/models/backends/azure.py deleted file mode 100644 index 94a37412a8..0000000000 --- a/src/dstack/_internal/core/models/backends/azure.py +++ /dev/null @@ -1,58 +0,0 @@ -from pydantic import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigElement, ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class AzureConfigInfo(CoreModel): - type: Literal["azure"] = "azure" - tenant_id: str - subscription_id: str - locations: Optional[List[str]] = None - - -class AzureClientCreds(CoreModel): - type: Annotated[Literal["client"], Field(description="The type of credentials")] = "client" - client_id: Annotated[str, Field(description="The client ID")] - client_secret: Annotated[str, Field(description="The client secret")] - # if tenant_id is missing, it will be populated from config info - tenant_id: Optional[str] - - -class AzureDefaultCreds(CoreModel): - type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" - - -AnyAzureCreds = Union[AzureClientCreds, AzureDefaultCreds] - - -class AzureCreds(CoreModel): - __root__: AnyAzureCreds = Field(..., discriminator="type") - - -class AzureConfigInfoWithCreds(AzureConfigInfo): - creds: AnyAzureCreds - - -AnyAzureConfigInfo = Union[AzureConfigInfo, AzureConfigInfoWithCreds] 
- - -class AzureConfigInfoWithCredsPartial(CoreModel): - type: Literal["azure"] = "azure" - creds: Optional[AnyAzureCreds] - tenant_id: Optional[str] - subscription_id: Optional[str] - locations: Optional[List[str]] - - -class AzureConfigValues(CoreModel): - type: Literal["azure"] = "azure" - default_creds: bool = False - tenant_id: Optional[ConfigElement] - subscription_id: Optional[ConfigElement] - locations: Optional[ConfigMultiElement] - - -class AzureStoredConfig(AzureConfigInfo): - resource_group: str diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index e9cc4ccaba..80e241b315 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -1,52 +1,52 @@ import enum -from typing import List, Optional - -from dstack._internal.core.models.common import CoreModel class BackendType(str, enum.Enum): """ Attributes: + AMDDEVCLOUD (BackendType): AMD Developer Cloud AWS (BackendType): Amazon Web Services AZURE (BackendType): Microsoft Azure + CLOUDRIFT (BackendType): CloudRift + CRUSOE (BackendType): Crusoe CUDO (BackendType): Cudo + DATACRUNCH (BackendType): DataCrunch (for backward compatibility) + DIGITALOCEAN (BackendType): DigitalOcean DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform - DATACRUNCH (BackendType): DataCrunch + HOTAISLE (BackendType): Hot Aisle + JARVISLABS (BackendType): JarvisLabs KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud + NEBIUS (BackendType): Nebius AI Cloud + OCI (BackendType): Oracle Cloud Infrastructure RUNPOD (BackendType): Runpod Cloud TENSORDOCK (BackendType): TensorDock Marketplace VASTAI (BackendType): Vast.ai Marketplace + VERDA (BackendType): Verda Cloud + VULTR (BackendType): Vultr """ + AMDDEVCLOUD = "amddevcloud" AWS = "aws" AZURE = "azure" + CLOUDRIFT = "cloudrift" + CRUSOE = "crusoe" CUDO = "cudo" DATACRUNCH = "datacrunch" + """`DATACRUNCH` is kept as a 
`BackendType` for backward compatibility.""" + DIGITALOCEAN = "digitalocean" DSTACK = "dstack" GCP = "gcp" + HOTAISLE = "hotaisle" + JARVISLABS = "jarvislabs" KUBERNETES = "kubernetes" LAMBDA = "lambda" - LOCAL = "local" - REMOTE = "remote" # TODO: replace for LOCAL + REMOTE = "remote" NEBIUS = "nebius" OCI = "oci" RUNPOD = "runpod" TENSORDOCK = "tensordock" VASTAI = "vastai" - - -class ConfigElementValue(CoreModel): - value: str - label: str - - -class ConfigElement(CoreModel): - selected: Optional[str] = None - values: List[ConfigElementValue] = [] - - -class ConfigMultiElement(CoreModel): - selected: List[str] = [] - values: List[ConfigElementValue] = [] + VERDA = "verda" + VULTR = "vultr" diff --git a/src/dstack/_internal/core/models/backends/cudo.py b/src/dstack/_internal/core/models/backends/cudo.py deleted file mode 100644 index e7f75dbe65..0000000000 --- a/src/dstack/_internal/core/models/backends/cudo.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import List, Optional - -from pydantic.fields import Field -from typing_extensions import Annotated, Literal - -from dstack._internal.core.models.backends.base import ConfigElement, ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class CudoConfigInfo(CoreModel): - type: Literal["cudo"] = "cudo" - project_id: str - regions: Optional[List[str]] = None - - -class CudoStoredConfig(CudoConfigInfo): - pass - - -class CudoAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The API key")] - - -AnyCudoCreds = CudoAPIKeyCreds -CudoCreds = AnyCudoCreds - - -class CudoConfigInfoWithCreds(CudoConfigInfo): - creds: AnyCudoCreds - - -class CudoConfigInfoWithCredsPartial(CoreModel): - type: Literal["cudo"] = "cudo" - creds: Optional[AnyCudoCreds] - project_id: Optional[str] - regions: Optional[List[str]] - - -class CudoConfigValues(CoreModel): - type: Literal["cudo"] = "cudo" - 
regions: Optional[ConfigMultiElement] - project_id: Optional[ConfigElement] diff --git a/src/dstack/_internal/core/models/backends/datacrunch.py b/src/dstack/_internal/core/models/backends/datacrunch.py deleted file mode 100644 index 5218ddccc6..0000000000 --- a/src/dstack/_internal/core/models/backends/datacrunch.py +++ /dev/null @@ -1,44 +0,0 @@ -from pydantic.fields import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class DataCrunchConfigInfo(CoreModel): - type: Literal["datacrunch"] = "datacrunch" - regions: Optional[List[str]] = None - - -class DataCrunchAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - client_id: Annotated[str, Field(description="The client ID")] - client_secret: Annotated[str, Field(description="The client secret")] - - -AnyDataCrunchCreds = DataCrunchAPIKeyCreds - - -DataCrunchCreds = AnyDataCrunchCreds - - -class DataCrunchConfigInfoWithCreds(DataCrunchConfigInfo): - creds: AnyDataCrunchCreds - - -AnyDataCrunchConfigInfo = Union[DataCrunchConfigInfo, DataCrunchConfigInfoWithCreds] - - -class DataCrunchConfigInfoWithCredsPartial(CoreModel): - type: Literal["datacrunch"] = "datacrunch" - creds: Optional[AnyDataCrunchCreds] - regions: Optional[List[str]] - - -class DataCrunchConfigValues(CoreModel): - type: Literal["datacrunch"] = "datacrunch" - regions: Optional[ConfigMultiElement] - - -class DataCrunchStoredConfig(DataCrunchConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/dstack.py b/src/dstack/_internal/core/models/backends/dstack.py deleted file mode 100644 index f6ff6c78d7..0000000000 --- a/src/dstack/_internal/core/models/backends/dstack.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List - -from typing_extensions import Literal - -from 
dstack._internal.core.models.common import CoreModel - -# The OSS is currently aware of some of the DstackBackend internals (DstackConfigInfo) to be able to -# show DstackBackend base backends as regular backends. -# Consider designing an API that would allow DstackBackend to do the same without exposing its internals. - - -class DstackConfigInfo(CoreModel): - """ - This is a config model of DstackBackend stored in BackendModel.config and used by DstackConfigurator. - """ - - type: Literal["dstack"] = "dstack" - base_backends: List[str] - - -class DstackBaseBackendConfigInfo(CoreModel): - type: str - - -class DstackConfigValues(CoreModel): - type: Literal["dstack"] = "dstack" diff --git a/src/dstack/_internal/core/models/backends/gcp.py b/src/dstack/_internal/core/models/backends/gcp.py deleted file mode 100644 index 8d84292e35..0000000000 --- a/src/dstack/_internal/core/models/backends/gcp.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import List, Optional, Union - -from pydantic import Field -from typing_extensions import Literal - -from dstack._internal.core.models.backends.base import ConfigElement, ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class GCPConfigInfo(CoreModel): - type: Literal["gcp"] = "gcp" - project_id: str - regions: Optional[List[str]] = None - vpc_name: Optional[str] = None - vpc_project_id: Optional[str] = None - public_ips: Optional[bool] = None - - -class GCPServiceAccountCreds(CoreModel): - type: Literal["service_account"] = "service_account" - filename: str - data: str - - -class GCPDefaultCreds(CoreModel): - type: Literal["default"] = "default" - - -AnyGCPCreds = Union[GCPServiceAccountCreds, GCPDefaultCreds] - - -class GCPCreds(CoreModel): - __root__: AnyGCPCreds = Field(..., discriminator="type") - - -class GCPConfigInfoWithCreds(GCPConfigInfo): - creds: AnyGCPCreds - - -AnyGCPConfigInfo = Union[GCPConfigInfo, GCPConfigInfoWithCreds] - - -class GCPConfigInfoWithCredsPartial(CoreModel): - type: 
Literal["gcp"] = "gcp" - creds: Optional[AnyGCPCreds] - project_id: Optional[str] - regions: Optional[List[str]] - vpc_name: Optional[str] = None - vpc_project_id: Optional[str] = None - public_ips: Optional[bool] - - -class GCPConfigValues(CoreModel): - type: Literal["gcp"] = "gcp" - default_creds: bool = False - project_id: Optional[ConfigElement] - regions: Optional[ConfigMultiElement] - - -class GCPStoredConfig(GCPConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/kubernetes.py b/src/dstack/_internal/core/models/backends/kubernetes.py deleted file mode 100644 index 026d9d6f58..0000000000 --- a/src/dstack/_internal/core/models/backends/kubernetes.py +++ /dev/null @@ -1,40 +0,0 @@ -from pydantic.fields import Field -from typing_extensions import Annotated, Literal, Optional, Union - -from dstack._internal.core.models.common import CoreModel - - -class KubernetesNetworkingConfig(CoreModel): - ssh_host: Annotated[Optional[str], Field(description="The external IP address of any node")] - ssh_port: Annotated[ - Optional[str], Field(description="Any port accessible outside of the cluster") - ] - - -class KubernetesConfigInfo(CoreModel): - type: Literal["kubernetes"] = "kubernetes" - networking: KubernetesNetworkingConfig - - -class KubeconfigConfig(CoreModel): - filename: str - data: str - - -class KubernetesConfigInfoWithCreds(KubernetesConfigInfo): - kubeconfig: KubeconfigConfig - - -AnyKubernetesConfigInfo = Union[KubernetesConfigInfo, KubernetesConfigInfoWithCreds] - - -class KubernetesConfigInfoWithCredsPartial(KubernetesConfigInfoWithCreds): - pass - - -class KubernetesConfigValues(CoreModel): - type: Literal["kubernetes"] = "kubernetes" - - -class KubernetesStoredConfig(KubernetesConfigInfoWithCreds): - pass diff --git a/src/dstack/_internal/core/models/backends/lambdalabs.py b/src/dstack/_internal/core/models/backends/lambdalabs.py deleted file mode 100644 index 65d30c8085..0000000000 --- 
a/src/dstack/_internal/core/models/backends/lambdalabs.py +++ /dev/null @@ -1,43 +0,0 @@ -from pydantic.fields import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class LambdaConfigInfo(CoreModel): - type: Literal["lambda"] = "lambda" - regions: Optional[List[str]] = None - - -class LambdaAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The API key")] - - -AnyLambdaCreds = LambdaAPIKeyCreds - - -LambdaCreds = AnyLambdaCreds - - -class LambdaConfigInfoWithCreds(LambdaConfigInfo): - creds: AnyLambdaCreds - - -AnyLambdaConfigInfo = Union[LambdaConfigInfo, LambdaConfigInfoWithCreds] - - -class LambdaConfigInfoWithCredsPartial(CoreModel): - type: Literal["lambda"] = "lambda" - creds: Optional[AnyLambdaCreds] - regions: Optional[List[str]] - - -class LambdaConfigValues(CoreModel): - type: Literal["lambda"] = "lambda" - regions: Optional[ConfigMultiElement] - - -class LambdaStoredConfig(LambdaConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/nebius.py b/src/dstack/_internal/core/models/backends/nebius.py deleted file mode 100644 index e7118b005b..0000000000 --- a/src/dstack/_internal/core/models/backends/nebius.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import List, Optional, Union - -from typing_extensions import Literal - -from dstack._internal.core.models.backends.base import ConfigElement, ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class NebiusConfigInfo(CoreModel): - type: Literal["nebius"] = "nebius" - cloud_id: str - folder_id: str - network_id: str - regions: Optional[List[str]] = None - - -class NebiusServiceAccountCreds(CoreModel): - type: Literal["service_account"] = "service_account" - filename: str - data: 
str - - -AnyNebiusCreds = NebiusServiceAccountCreds - - -NebiusCreds = AnyNebiusCreds - - -class NebiusConfigInfoWithCreds(NebiusConfigInfo): - creds: AnyNebiusCreds - - -AnyNebiusConfigInfo = Union[NebiusConfigInfo, NebiusConfigInfoWithCreds] - - -class NebiusConfigInfoWithCredsPartial(CoreModel): - type: Literal["nebius"] = "nebius" - creds: Optional[AnyNebiusCreds] - cloud_id: Optional[str] - folder_id: Optional[str] - network_id: Optional[str] - regions: Optional[List[str]] - - -class NebiusConfigValues(CoreModel): - type: Literal["nebius"] = "nebius" - cloud_id: Optional[ConfigElement] - folder_id: Optional[ConfigElement] - network_id: Optional[ConfigElement] - regions: Optional[ConfigMultiElement] - - -class NebiusStoredConfig(NebiusConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/oci.py b/src/dstack/_internal/core/models/backends/oci.py deleted file mode 100644 index 3c13f81ccd..0000000000 --- a/src/dstack/_internal/core/models/backends/oci.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Dict - -from pydantic import Field, root_validator -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class OCIConfigInfo(CoreModel): - type: Literal["oci"] = "oci" - regions: Optional[List[str]] = None - compartment_id: Optional[str] = None - - -class OCIClientCreds(CoreModel): - type: Annotated[Literal["client"], Field(description="The type of credentials")] = "client" - user: Annotated[str, Field(description="User OCID")] - tenancy: Annotated[str, Field(description="Tenancy OCID")] - key_file: Annotated[ - Optional[str], - Field( - description="Path to the user's private PEM key. Either this or `key_content` should be set" - ), - ] - key_content: Annotated[ - Optional[str], - Field( - description="Content of the user's private PEM key. 
Either this or `key_file` should be set" - ), - ] - pass_phrase: Annotated[ - Optional[str], Field(description="Passphrase for the private PEM key if it is encrypted") - ] - fingerprint: Annotated[str, Field(description="User's public key fingerprint")] - region: Annotated[ - str, Field(description="Name or key of any region the tenancy is subscribed to") - ] - - @root_validator - def key_file_xor_key_content(cls, values): - key_file, key_content = values["key_file"], values["key_content"] - if key_file and key_content: - raise ValueError("key_file and key_content are mutually exclusive") - if not key_file and not key_content: - raise ValueError("Either key_file or key_content should be set") - return values - - -class OCIDefaultCreds(CoreModel): - type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" - file: Annotated[str, Field(description="Path to the OCI CLI-compatible config file")] = ( - "~/.oci/config" - ) - profile: Annotated[str, Field(description="Profile to load from the config file")] = "DEFAULT" - - -AnyOCICreds = Union[OCIClientCreds, OCIDefaultCreds] - - -class OCICreds(CoreModel): - __root__: AnyOCICreds = Field(..., discriminator="type") - - -class OCIConfigInfoWithCreds(OCIConfigInfo): - creds: AnyOCICreds - - -AnyOCIConfigInfo = Union[OCIConfigInfo, OCIConfigInfoWithCreds] - - -class OCIConfigInfoWithCredsPartial(CoreModel): - type: Literal["oci"] = "oci" - creds: Optional[AnyOCICreds] - regions: Optional[List[str]] - compartment_id: Optional[str] - - -class OCIConfigValues(CoreModel): - type: Literal["oci"] = "oci" - default_creds: bool = False - regions: Optional[ConfigMultiElement] - compartment_id: Optional[str] = None - - -class OCIStoredConfig(OCIConfigInfo): - compartment_id: str - subnet_ids_per_region: Dict[str, str] diff --git a/src/dstack/_internal/core/models/backends/runpod.py b/src/dstack/_internal/core/models/backends/runpod.py deleted file mode 100644 index bcf38385e7..0000000000 --- 
a/src/dstack/_internal/core/models/backends/runpod.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import List, Optional - -from typing_extensions import Literal - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class RunpodConfigInfo(CoreModel): - type: Literal["runpod"] = "runpod" - regions: Optional[List[str]] = None - - -class RunpodStoredConfig(RunpodConfigInfo): - pass - - -class RunpodAPIKeyCreds(CoreModel): - type: Literal["api_key"] = "api_key" - api_key: str - - -AnyRunpodCreds = RunpodAPIKeyCreds -RunpodCreds = AnyRunpodCreds - - -class RunpodConfigInfoWithCreds(RunpodConfigInfo): - creds: AnyRunpodCreds - - -class RunpodConfigInfoWithCredsPartial(CoreModel): - type: Literal["runpod"] = "runpod" - creds: Optional[AnyRunpodCreds] - regions: Optional[List[str]] - - -class RunpodConfigValues(CoreModel): - type: Literal["runpod"] = "runpod" - regions: Optional[ConfigMultiElement] diff --git a/src/dstack/_internal/core/models/backends/tensordock.py b/src/dstack/_internal/core/models/backends/tensordock.py deleted file mode 100644 index 64fff0ca84..0000000000 --- a/src/dstack/_internal/core/models/backends/tensordock.py +++ /dev/null @@ -1,44 +0,0 @@ -from pydantic.fields import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class TensorDockConfigInfo(CoreModel): - type: Literal["tensordock"] = "tensordock" - regions: Optional[List[str]] = None - - -class TensorDockAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The API key")] - api_token: Annotated[str, Field(description="The API token")] - - -AnyTensorDockCreds = TensorDockAPIKeyCreds - - -TensorDockCreds = AnyTensorDockCreds - - 
-class TensorDockConfigInfoWithCreds(TensorDockConfigInfo): - creds: AnyTensorDockCreds - - -AnyTensorDockConfigInfo = Union[TensorDockConfigInfo, TensorDockConfigInfoWithCreds] - - -class TensorDockConfigInfoWithCredsPartial(CoreModel): - type: Literal["tensordock"] = "tensordock" - creds: Optional[AnyTensorDockCreds] - regions: Optional[List[str]] - - -class TensorDockConfigValues(CoreModel): - type: Literal["tensordock"] = "tensordock" - regions: Optional[ConfigMultiElement] - - -class TensorDockStoredConfig(TensorDockConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/backends/vastai.py b/src/dstack/_internal/core/models/backends/vastai.py deleted file mode 100644 index ab54462e2f..0000000000 --- a/src/dstack/_internal/core/models/backends/vastai.py +++ /dev/null @@ -1,43 +0,0 @@ -from pydantic.fields import Field -from typing_extensions import Annotated, List, Literal, Optional, Union - -from dstack._internal.core.models.backends.base import ConfigMultiElement -from dstack._internal.core.models.common import CoreModel - - -class VastAIConfigInfo(CoreModel): - type: Literal["vastai"] = "vastai" - regions: Optional[List[str]] = None - - -class VastAIAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The API key")] - - -AnyVastAICreds = VastAIAPIKeyCreds - - -VastAICreds = AnyVastAICreds - - -class VastAIConfigInfoWithCreds(VastAIConfigInfo): - creds: AnyVastAICreds - - -AnyVastAIConfigInfo = Union[VastAIConfigInfo, VastAIConfigInfoWithCreds] - - -class VastAIConfigInfoWithCredsPartial(CoreModel): - type: Literal["vastai"] = "vastai" - creds: Optional[AnyVastAICreds] - regions: Optional[List[str]] - - -class VastAIConfigValues(CoreModel): - type: Literal["vastai"] = "vastai" - regions: Optional[ConfigMultiElement] - - -class VastAIStoredConfig(VastAIConfigInfo): - pass diff --git a/src/dstack/_internal/core/models/common.py 
b/src/dstack/_internal/core/models/common.py index fbd9ddbf74..70578e272a 100644 --- a/src/dstack/_internal/core/models/common.py +++ b/src/dstack/_internal/core/models/common.py @@ -1,16 +1,89 @@ import re -from typing import Any, Type, Union +from enum import Enum +from typing import TYPE_CHECKING, Any, Callable, Mapping, Optional, Union -from pydantic_duality import DualBaseModel +import orjson +from pydantic import Field +from pydantic_duality import generate_dual_base_model +from typing_extensions import Annotated +from dstack._internal.utils.json_utils import pydantic_orjson_dumps +IncludeExcludeFieldType = Union[int, str] +IncludeExcludeSetType = set[IncludeExcludeFieldType] +IncludeExcludeDictType = dict[ + IncludeExcludeFieldType, Union[bool, IncludeExcludeSetType, "IncludeExcludeDictType"] +] +IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType] + + +class CoreConfig: + json_loads = orjson.loads + json_dumps = pydantic_orjson_dumps + + +# All dstack models inherit from pydantic-duality's DualBaseModel. # DualBaseModel creates two classes for the model: # one with extra = "forbid" (CoreModel/CoreModel.__request__), # and another with extra = "ignore" (CoreModel.__response__). -# This allows to use the same model both for a strict parsing of the user input and -# for a permissive parsing of the server responses. -class CoreModel(DualBaseModel): - pass +# This allows to use the same model both for strict parsing of the user input and +# for permissive parsing of the server responses. +# +# We define a func to generate CoreModel dynamically that can be used +# to define custom Config for both __request__ and __response__ models. +# Note: Defining config in the model class directly overrides +# pydantic-duality's base config, breaking __response__. 
+def generate_dual_core_model( + custom_config: Union[type, Mapping], +) -> "type[CoreModel]": + class CoreModel(generate_dual_base_model(custom_config)): + def json( + self, + *, + include: Optional[IncludeExcludeType] = None, + exclude: Optional[IncludeExcludeType] = None, + by_alias: bool = False, + skip_defaults: Optional[bool] = None, # ignore as it's deprecated + exclude_unset: bool = False, + exclude_defaults: bool = False, + exclude_none: bool = False, + encoder: Optional[Callable[[Any], Any]] = None, + models_as_dict: bool = True, # does not seems to be needed by dstack or dependencies + **dumps_kwargs: Any, + ) -> str: + """ + Override `json()` method so that it calls `dict()`. + Allows changing how models are serialized by overriding `dict()` only. + By default, `json()` won't call `dict()`, so changes applied in `dict()` won't take place. + """ + data = self.dict( + by_alias=by_alias, + include=include, + exclude=exclude, + exclude_unset=exclude_unset, + exclude_defaults=exclude_defaults, + exclude_none=exclude_none, + ) + if self.__custom_root_type__: + data = data["__root__"] + return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs) + + return CoreModel + + +if TYPE_CHECKING: + + class CoreModel(generate_dual_base_model(CoreConfig)): + pass +else: + CoreModel = generate_dual_core_model(CoreConfig) + + +class FrozenConfig(CoreConfig): + frozen = True + + +FrozenCoreModel = generate_dual_core_model(FrozenConfig) class Duration(int): @@ -47,10 +120,59 @@ def parse(cls, v: Union[int, str]) -> "Duration": raise ValueError(f"Cannot parse the duration {v}") -def is_core_model_instance(instance: Any, class_: Type[CoreModel]) -> bool: +class RegistryAuth(FrozenCoreModel): + """ + Credentials for pulling a private Docker image. 
+ + Attributes: + username (str): The username + password (str): The password or access token """ - Implements isinstance check for CoreModel such that - models parsed with MyModel.__response__ pass the check against MyModel. - See https://github.com/dstackai/dstack/issues/1124 + + username: Annotated[str, Field(description="The username")] + password: Annotated[str, Field(description="The password or access token")] + + +class ApplyAction(str, Enum): + CREATE = "create" + """`CREATE` means the resource is to be created or overridden.""" + UPDATE = "update" + """`UPDATE` means the resource is to be updated in-place.""" + + +class NetworkMode(str, Enum): + HOST = "host" + BRIDGE = "bridge" + + +class EntityReference(CoreModel): + """ + Cross-project entity reference. """ - return isinstance(instance, class_) or isinstance(instance, class_.__response__) + + project: Annotated[ + Optional[str], + Field(description="The project name. If unspecified, refers to the current project"), + ] + name: Annotated[str, Field(description="The entity name")] + + @classmethod + def parse(cls, v: Union[str, "EntityReference"]) -> "EntityReference": + if isinstance(v, EntityReference): + return v + invalid_ref_error = ValueError( + "Invalid entity reference. 
Only `` or `/` formats are allowed" + ) + parts = v.split("/") + if any(len(part) == 0 for part in parts): + raise invalid_ref_error + if len(parts) == 1: + return cls(project=None, name=parts[0]) + if len(parts) == 2: + return cls(project=parts[0], name=parts[1]) + raise invalid_ref_error + + def format(self) -> str: + if self.project is None: + return self.name + return f"{self.project}/{self.name}" diff --git a/src/dstack/_internal/core/models/compute_groups.py b/src/dstack/_internal/core/models/compute_groups.py new file mode 100644 index 0000000000..55dc0d2385 --- /dev/null +++ b/src/dstack/_internal/core/models/compute_groups.py @@ -0,0 +1,48 @@ +import enum +import uuid +from datetime import datetime +from typing import List, Optional + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.runs import JobProvisioningData + + +class ComputeGroupStatus(str, enum.Enum): + RUNNING = "running" + TERMINATED = "terminated" + + @classmethod + def finished_statuses(cls) -> List["ComputeGroupStatus"]: + return [cls.TERMINATED] + + def is_finished(self): + return self in self.finished_statuses() + + +class ComputeGroupProvisioningData(CoreModel): + compute_group_id: str + compute_group_name: str + backend: BackendType + base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and needs + to record that backend as `base_backend`. + """ + region: str + job_provisioning_datas: List[JobProvisioningData] + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" + + +class ComputeGroup(CoreModel): + """ + Compute group is a group of instances managed as a single unit via the provider API, + i.e. instances are not created/deleted one-by-one but all at once. 
+ """ + + id: uuid.UUID + name: str + project_name: str + created_at: datetime + status: ComputeGroupStatus + provisioning_data: ComputeGroupProvisioningData diff --git a/src/dstack/_internal/core/models/config.py b/src/dstack/_internal/core/models/config.py index bb17b40b44..a0497401d9 100644 --- a/src/dstack/_internal/core/models/config.py +++ b/src/dstack/_internal/core/models/config.py @@ -12,13 +12,18 @@ class ProjectConfig(CoreModel): default: Optional[bool] +# Not used since 0.20.0. Can be removed when most users update their `config.yml` (it's updated +# each time a project is added) class RepoConfig(CoreModel): path: str repo_id: str repo_type: RepoType - ssh_key_path: str + ssh_key_path: Annotated[Optional[str], Field(exclude=True)] = None class GlobalConfig(CoreModel): projects: Annotated[List[ProjectConfig], Field(description="The list of projects")] = [] - repos: List[RepoConfig] = [] + repos: Annotated[list[RepoConfig], Field(exclude=True)] = [] + """`repos` is not used since 0.20.0. It can be removed when most users update their `config.yml` + because it is updated each time a project is added. 
+ """ diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 4208c00dc8..900dca8ce7 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -1,21 +1,77 @@ import re +import string +from collections import Counter from enum import Enum -from typing import Any, Dict, List, Mapping, Optional, Union +from pathlib import PurePosixPath +from typing import Annotated, Any, Dict, List, Literal, Optional, Union +import orjson from pydantic import Field, ValidationError, conint, constr, root_validator, validator -from typing_extensions import Annotated, Literal +from typing_extensions import Self from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.models.common import CoreModel, Duration -from dstack._internal.core.models.gateways import AnyModel, GatewayConfiguration -from dstack._internal.core.models.profiles import ProfileParams -from dstack._internal.core.models.repos.base import Repo -from dstack._internal.core.models.repos.virtual import VirtualRepo +from dstack._internal.core.models.common import ( + CoreConfig, + CoreModel, + Duration, + EntityReference, + RegistryAuth, + generate_dual_core_model, +) +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.files import FilePathMapping +from dstack._internal.core.models.fleets import FleetConfiguration +from dstack._internal.core.models.gateways import GatewayConfiguration +from dstack._internal.core.models.profiles import ( + ProfileParams, + ProfileParamsConfig, + SpotPolicy, + parse_duration, + parse_off_duration, +) from dstack._internal.core.models.resources import Range, ResourcesSpec -from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeMountPoint +from dstack._internal.core.models.routers import AnyServiceRouterConfig, ReplicaGroupRouterConfig +from dstack._internal.core.models.services 
import AnyModel, OpenAIChatModel +from dstack._internal.core.models.unix import UnixUser +from dstack._internal.core.models.volumes import ( + AnyVolumeConfiguration, + BaseVolumeConfiguration, + MountPoint, + VolumeConfiguration, + parse_mount_point, + parse_volume_configuration, +) +from dstack._internal.core.services import is_valid_replica_group_name +from dstack._internal.proxy.gateway.const import SERVICE_SCALING_WINDOWS +from dstack._internal.utils.common import has_duplicates, list_enum_values_for_annotation +from dstack._internal.utils.json_schema import add_extra_schema_types +from dstack._internal.utils.json_utils import ( + pydantic_orjson_dumps_with_indent, +) CommandsList = List[str] ValidPort = conint(gt=0, le=65536) +MAX_INT64 = 2**63 - 1 +SERVICE_HTTPS_DEFAULT = True +STRIP_PREFIX_DEFAULT = True +RUN_PRIOTIRY_MIN = 0 +RUN_PRIOTIRY_MAX = 100 +RUN_PRIORITY_DEFAULT = 0 +LEGACY_REPO_DIR = "/workflow" +MIN_PROBE_TIMEOUT = 1 +MIN_PROBE_INTERVAL = 1 +DEFAULT_PROBE_URL = "/" +DEFAULT_PROBE_TIMEOUT = 10 +DEFAULT_PROBE_INTERVAL = 15 +DEFAULT_PROBE_READY_AFTER = 1 +DEFAULT_PROBE_METHOD = "get" +DEFAULT_PROBE_UNTIL_READY = False +MAX_PROBE_URL_LEN = 2048 +DEFAULT_REPLICA_GROUP_NAME = "0" +OPENAI_MODEL_PROBE_TIMEOUT = 30 +ALLOWED_SCALING_WINDOWS_DESCRIPTION = ", ".join(f"`{w}s`" for w in SERVICE_SCALING_WINDOWS) +DEFAULT_SCALING_WINDOW = 60 +assert DEFAULT_SCALING_WINDOW in SERVICE_SCALING_WINDOWS class RunConfigurationType(str, Enum): @@ -25,27 +81,11 @@ class RunConfigurationType(str, Enum): class PythonVersion(str, Enum): - PY38 = "3.8" PY39 = "3.9" PY310 = "3.10" PY311 = "3.11" PY312 = "3.12" - - -class RegistryAuth(CoreModel): - """ - Credentials for pulling a private Docker image. 
- - Attributes: - username (str): The username - password (str): The password or access token - """ - - class Config: - frozen = True - - username: Annotated[str, Field(description="The username")] - password: Annotated[str, Field(description="The password or access token")] + PY313 = "3.13" class PortMapping(CoreModel): @@ -73,16 +113,101 @@ def parse(cls, v: str) -> "PortMapping": return PortMapping(local_port=local_port, container_port=int(container_port)) -class Artifact(CoreModel): +class RepoExistsAction(str, Enum): + ERROR = "error" + """`ERROR` means do not try to check out and terminate the run with an error. This is the default action since 0.20.0.""" + SKIP = "skip" + """`SKIP` means do not try to check out and skip the repo. This is the logic hardcoded in the pre-0.20.0 runner.""" + + +class RepoSpec(CoreModel): + local_path: Annotated[ + Optional[str], + Field( + description=( + "The path to the Git repo on the user's machine. Relative paths are resolved" + " relative to the parent directory of the the configuration file." + " Mutually exclusive with `url`" + ) + ), + ] = None + url: Annotated[ + Optional[str], + Field(description="The Git repo URL. Mutually exclusive with `local_path`"), + ] = None + branch: Annotated[ + Optional[str], + Field( + description=( + "The repo branch. Defaults to the active branch for local paths" + " and the default branch for URLs" + ) + ), + ] = None + hash: Annotated[ + Optional[str], + Field(description="The commit hash"), + ] = None path: Annotated[ - str, Field(description="The path to the folder that must be stored as an output artifact") - ] - mount: Annotated[ - bool, + str, Field( - description="Must be set to `true` if the artifact files must be saved in real-time" + description=( + "The repo path inside the run container. Relative paths are resolved" + " relative to the working directory" + ) ), - ] = False + ] = "." 
+ if_exists: Annotated[ + RepoExistsAction, + Field( + description=( + "The action to be taken if `path` exists and is not empty." + f" One of: {list_enum_values_for_annotation(RepoExistsAction)}" + ), + ), + ] = RepoExistsAction.ERROR + + @classmethod + def parse(cls, v: str) -> Self: + is_url = False + parts = v.split(":") + if len(parts) > 1: + # Git repo, git@github.com:dstackai/dstack.git or https://github.com/dstackai/dstack + if "@" in parts[0] or parts[1].startswith("//"): + parts = [f"{parts[0]}:{parts[1]}", *parts[2:]] + is_url = True + # Windows path, e.g., `C:\path\to`, 'c:/path/to' + elif ( + len(parts[0]) == 1 + and parts[0] in string.ascii_letters + and parts[1][:1] in ["\\", "/"] + ): + parts = [f"{parts[0]}:{parts[1]}", *parts[2:]] + if len(parts) == 1: + if is_url: + return cls(url=parts[0]) + return cls(local_path=parts[0]) + if len(parts) == 2: + if is_url: + return cls(url=parts[0], path=parts[1]) + return cls(local_path=parts[0], path=parts[1]) + raise ValueError(f"Invalid repo: {v}") + + @root_validator + def validate_local_path_or_url(cls, values): + if values["local_path"] and values["url"]: + raise ValueError("`local_path` and `url` are mutually exclusive") + if not values["local_path"] and not values["url"]: + raise ValueError("Either `local_path` or `url` must be specified") + return values + + @validator("path") + def validate_path(cls, v: Optional[str]) -> Optional[str]: + if v is None: + return v + if v.startswith("~") and PurePosixPath(v).parts[0] != "~": + raise ValueError("`~username` syntax is not supported") + return v class ScalingSpec(CoreModel): @@ -98,60 +223,374 @@ class ScalingSpec(CoreModel): Field( description="The target value of the metric. 
" "The number of replicas is calculated based on this number and automatically adjusts " - "(scales up or down) as this metric changes" + "(scales up or down) as this metric changes", + gt=0, ), ] + window: Annotated[ + Optional[Duration], + Field( + description=( + "The time window used to calculate requests per second." + f" Allowed values: {ALLOWED_SCALING_WINDOWS_DESCRIPTION}." + f" Defaults to `{DEFAULT_SCALING_WINDOW}s`" + ), + ), + ] = None scale_up_delay: Annotated[ - Duration, Field(description="The delay in seconds before scaling up") + Duration, + Field( + description=( + "The minimum time, in seconds, between a scaling event and the next scale-up decision." + " Used to prevent overly frequent scaling" + ) + ), ] = Duration.parse("5m") scale_down_delay: Annotated[ - Duration, Field(description="The delay in seconds before scaling down") + Duration, + Field( + description=( + "The minimum time, in seconds, between a scaling event and the next scale-down decision." + " Used to prevent overly frequent scaling" + ) + ), ] = Duration.parse("10m") + @validator("window") + def validate_window(cls, v: Optional[Duration]) -> Optional[Duration]: + if v is not None and v not in SERVICE_SCALING_WINDOWS: + raise ValueError(f"Window must be one of: {ALLOWED_SCALING_WINDOWS_DESCRIPTION}") + return v + + +class IPAddressPartitioningKey(CoreModel): + type: Annotated[Literal["ip_address"], Field(description="Partitioning type")] = "ip_address" + + +class HeaderPartitioningKey(CoreModel): + type: Annotated[Literal["header"], Field(description="Partitioning type")] = "header" + header: Annotated[ + str, + Field( + description="Name of the header to use for partitioning", + regex=r"^[a-zA-Z0-9-_]+$", # prevent Nginx config injection + max_length=500, # chosen randomly, Nginx limit is higher + ), + ] + + +class RateLimit(CoreModel): + prefix: Annotated[ + str, + Field( + description=( + "URL path prefix to which this limit is applied." 
+ " If an incoming request matches several prefixes, the longest prefix is applied" + ), + max_length=4094, # Nginx limit + regex=r"^/[^\s\\{}]*$", # prevent Nginx config injection + ), + ] = "/" + key: Annotated[ + Union[IPAddressPartitioningKey, HeaderPartitioningKey], + Field( + discriminator="type", + description=( + "The partitioning key. Each incoming request belongs to a partition" + " and rate limits are applied per partition." + " Defaults to partitioning by client IP address" + ), + ), + ] = IPAddressPartitioningKey() + rps: Annotated[ + float, + Field( + description=( + "Max allowed number of requests per second." + " Requests are tracked at millisecond granularity." + " For example, `rps: 10` means at most 1 request per 100ms" + ), + # should fit into Nginx limits after being converted to requests per minute + ge=1 / 60, + le=MAX_INT64 // 60, + ), + ] + burst: Annotated[ + int, + Field( + ge=0, + le=MAX_INT64, # Nginx limit + description=( + "Max number of requests that can be passed to the service ahead of the rate limit" + ), + ), + ] = 0 + + +HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"] + + +class HTTPHeaderSpec(CoreModel): + name: Annotated[ + str, + Field( + description="The name of the HTTP header", + min_length=1, + max_length=256, + ), + ] + value: Annotated[ + str, + Field( + description="The value of the HTTP header", + min_length=1, + max_length=2048, + ), + ] + + +class ProbeConfigConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["timeout"], + extra_types=[{"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["interval"], + extra_types=[{"type": "string"}], + ) + + +class ProbeConfig(generate_dual_core_model(ProbeConfigConfig)): + type: Annotated[ + Literal["http"], + Field(description="The probe type. 
Must be `http`"), + ] # expect other probe types in the future, namely `exec` + url: Annotated[ + Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`") + ] = None + method: Annotated[ + Optional[HTTPMethod], + Field( + description=( + "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)." + f" Defaults to `{DEFAULT_PROBE_METHOD}`" + ) + ), + ] = None + headers: Annotated[ + list[HTTPHeaderSpec], + Field(description="A list of HTTP headers to include in the request", max_items=16), + ] = [] + body: Annotated[ + Optional[str], + Field( + description="The HTTP request body to send with the probe", + min_length=1, + max_length=2048, + ), + ] = None + timeout: Annotated[ + Optional[int], + Field( + description=( + f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`" + ) + ), + ] = None + interval: Annotated[ + Optional[int], + Field( + description=( + "Minimum amount of time between the end of one probe execution" + f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`" + ) + ), + ] = None + ready_after: Annotated[ + Optional[int], + Field( + ge=1, + description=( + "The number of consecutive successful probe executions required for the replica" + " to be considered ready. Used during rolling deployments." + f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`" + ), + ), + ] = None + until_ready: Annotated[ + Optional[bool], + Field( + description=( + "If `true`, the probe will stop being executed as soon as it reaches the" + " `ready_after` threshold of successful executions." 
+ f" Defaults to `{str(DEFAULT_PROBE_UNTIL_READY).lower()}`" + ), + ), + ] = None + + @validator("timeout", pre=True) + def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]: + if v is None: + return v + parsed = parse_duration(v) + if parsed < MIN_PROBE_TIMEOUT: + raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s") + return parsed + + @validator("interval", pre=True) + def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]: + if v is None: + return v + parsed = parse_duration(v) + if parsed < MIN_PROBE_INTERVAL: + raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s") + return parsed + + @validator("url") + def validate_url(cls, v: Optional[str]) -> Optional[str]: + if v is None: + return v + if not v.startswith("/"): + raise ValueError("Must start with `/`") + if len(v) > MAX_PROBE_URL_LEN: + raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters") + if not v.isprintable(): + raise ValueError("Cannot contain non-printable characters") + return v -class EnvSentinel(CoreModel): - key: str + @root_validator + def validate_body_matches_method(cls, values): + method: HTTPMethod = values["method"] + if values["body"] is not None and method in ["get", "head"]: + raise ValueError(f"Cannot set request body for the `{method}` method") + return values - def from_env(self, env: Mapping[str, str]) -> str: - if self.key in env: - return env[self.key] - raise ValueError(f"Environment variable {self.key} is not set") - def __str__(self): - return f"EnvSentinel({self.key})" +class BaseRunConfigurationConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["volumes"]["items"], + extra_types=[{"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["files"]["items"], + extra_types=[{"type": "string"}], + ) -class BaseConfiguration(CoreModel): +class BaseRunConfiguration(CoreModel): type: 
Literal["none"] - image: Annotated[Optional[str], Field(description="The name of the Docker image to run")] - entrypoint: Annotated[Optional[str], Field(description="The Docker entrypoint")] - home_dir: Annotated[ - str, Field(description="The absolute path to the home directory inside the container") - ] = "/root" + name: Annotated[ + Optional[str], + Field(description="The run name. If not specified, a random name is generated"), + ] = None + image: Annotated[Optional[str], Field(description="The name of the Docker image to run")] = ( + None + ) + user: Annotated[ + Optional[str], + Field( + description=( + "The user inside the container, `user_name_or_id[:group_name_or_id]`" + " (e.g., `ubuntu`, `1000:1000`). Defaults to the default user from the `image`" + ) + ), + ] = None + privileged: Annotated[bool, Field(description="Run the container in privileged mode")] = False + entrypoint: Annotated[Optional[str], Field(description="The Docker entrypoint")] = None + working_dir: Annotated[ + Optional[str], + Field( + description=( + "The absolute path to the working directory inside the container." + " Defaults to the `image`'s default working directory" + ), + ), + ] = None + home_dir: str = "/root" + """`home_dir` is deprecated since 0.18.31 and has no effect.""" registry_auth: Annotated[ Optional[RegistryAuth], Field(description="Credentials for pulling a private Docker image") - ] + ] = None python: Annotated[ Optional[PythonVersion], - Field(description="The major version of Python. Mutually exclusive with `image`"), - ] + Field( + description="The major version of Python. Mutually exclusive with `image` and `docker`" + ), + ] = None + nvcc: Annotated[ + Optional[bool], + Field( + description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image` and `docker`" + ), + ] = None + single_branch: Annotated[ + Optional[bool], + Field( + description=( + "Whether to clone and track only the current branch or all remote branches." 
+ " Relevant only when using remote Git repos." + " Defaults to `false` for dev environments and to `true` for tasks and services" + ) + ), + ] = None env: Annotated[ - Union[ - List[constr(regex=r"^[a-zA-Z_][a-zA-Z0-9_]*(=.*$|$)")], - Dict[str, Union[str, EnvSentinel]], - ], + Env, Field(description="The mapping or the list of environment variables"), - ] = {} - setup: Annotated[CommandsList, Field(description="The bash commands to run on the boot")] = [] + ] = Env() + shell: Annotated[ + Optional[str], + Field( + description=( + "The shell used to run commands." + " Allowed values are `sh`, `bash`, or an absolute path, e.g., `/usr/bin/zsh`." + " Defaults to `/bin/sh` if the `image` is specified, `/bin/bash` otherwise" + ) + ), + ] = None resources: Annotated[ ResourcesSpec, Field(description="The resources requirements to run the configuration") ] = ResourcesSpec() - volumes: Annotated[List[VolumeMountPoint], Field(description="The volumes mount points")] = [] + priority: Annotated[ + Optional[int], + Field( + ge=RUN_PRIOTIRY_MIN, + le=RUN_PRIOTIRY_MAX, + description=( + f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`." + " `dstack` tries to provision runs with higher priority first." + f" Defaults to `{RUN_PRIORITY_DEFAULT}`" + ), + ), + ] = None + volumes: Annotated[List[MountPoint], Field(description="The volumes mount points")] = [] + docker: Annotated[ + Optional[bool], + Field( + description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. Overrides `privileged`" + ), + ] = None + repos: Annotated[ + list[RepoSpec], + Field(description="The list of Git repos"), + ] = [] + files: Annotated[ + list[FilePathMapping], + Field(description="The local to container file path mappings"), + ] = [] + setup: CommandsList = [] + """ + setup: Deprecated since 0.18.31. It has no effect for tasks and services; for + dev environments it runs right before `init`. 
+ """ @validator("python", pre=True, always=True) def convert_python(cls, v, values) -> Optional[PythonVersion]: if v is not None and values.get("image"): - raise KeyError("`image` and `python` are mutually exclusive fields") + raise ValueError("`image` and `python` are mutually exclusive fields") if isinstance(v, float): v = str(v) if v == "3.1": @@ -160,30 +599,62 @@ def convert_python(cls, v, values) -> Optional[PythonVersion]: return PythonVersion(v) return v - @validator("env") - def convert_env(cls, v) -> Dict[str, str]: - if isinstance(v, list): - d = {} - for var in v: - if "=" not in var: - if var not in d: - d[var] = EnvSentinel(key=var) - else: - raise ValueError(f"Duplicate environment variable: {var}") - else: - k, val = var.split("=", maxsplit=1) - if k not in d: - d[k] = val - else: - raise ValueError(f"Duplicate environment variable: {var}") - return d + @validator("docker", pre=True, always=True) + def _docker(cls, v, values) -> Optional[bool]: + if v is True and values.get("image"): + raise ValueError("`image` and `docker` are mutually exclusive fields") + if v is True and values.get("python"): + raise ValueError("`python` and `docker` are mutually exclusive fields") + if v is True and values.get("nvcc"): + raise ValueError("`nvcc` and `docker` are mutually exclusive fields") + # Ideally, we'd like to also prohibit privileged=False when docker=True, + # but it's not possible to do so without breaking backwards compatibility. 
+ return v + + @validator("volumes", each_item=True, pre=True) + def convert_volumes(cls, v: Union[MountPoint, str]) -> MountPoint: + if isinstance(v, str): + return parse_mount_point(v) + return v + + @validator("files", each_item=True, pre=True) + def convert_files(cls, v: Union[FilePathMapping, str]) -> FilePathMapping: + if isinstance(v, str): + return FilePathMapping.parse(v) + return v + + @validator("repos", pre=True, each_item=True) + def convert_repos(cls, v: Union[RepoSpec, str]) -> RepoSpec: + if isinstance(v, str): + return RepoSpec.parse(v) + return v + + @validator("repos") + def validate_repos(cls, v) -> RepoSpec: + if len(v) > 1: + raise ValueError("A maximum of one repo is currently supported") + return v + + @validator("user") + def validate_user(cls, v) -> Optional[str]: + if v is None: + return None + UnixUser.parse(v) return v - def get_repo(self) -> Repo: - return VirtualRepo(repo_id="none") + @validator("shell") + def validate_shell(cls, v) -> Optional[str]: + if v is None: + return None + if v in ["sh", "bash"]: + return v + path = PurePosixPath(v) + if path.is_absolute(): + return v + raise ValueError("The value must be `sh`, `bash`, or an absolute path") -class BaseConfigurationWithPorts(BaseConfiguration): +class ConfigurationWithPortsParams(CoreModel): ports: Annotated[ List[Union[ValidPort, constr(regex=r"^(?:[0-9]+|\*):[0-9]+$"), PortMapping]], Field(description="Port numbers/mapping to expose"), @@ -198,63 +669,383 @@ def convert_ports(cls, v) -> PortMapping: return v -class BaseConfigurationWithCommands(BaseConfiguration): - commands: Annotated[CommandsList, Field(description="The bash commands to run")] = [] +class ConfigurationWithCommandsParams(CoreModel): + commands: Annotated[CommandsList, Field(description="The shell commands to run")] = [] @root_validator def check_image_or_commands_present(cls, values): + # If replicas is list, skip validation - commands come from replica groups + replicas = values.get("replicas") + if 
isinstance(replicas, list): + return values + if not values.get("commands") and not values.get("image"): raise ValueError("Either `commands` or `image` must be set") return values class DevEnvironmentConfigurationParams(CoreModel): - ide: Annotated[Literal["vscode"], Field(description="The IDE to run")] - version: Annotated[Optional[str], Field(description="The version of the IDE")] - init: Annotated[CommandsList, Field(description="The bash commands to run")] = [] + ide: Annotated[ + Optional[Union[Literal["vscode"], Literal["cursor"], Literal["windsurf"], Literal["zed"]]], + Field( + description="The IDE to pre-install. Supported values include `vscode`, `cursor`, `windsurf`, and `zed`. Defaults to no IDE (SSH only)" + ), + ] = None + version: Annotated[ + Optional[str], + Field( + description="The version of the IDE. For `windsurf`, the version is in the format `version@commit`" + ), + ] = None + init: Annotated[CommandsList, Field(description="The shell commands to run on startup")] = [] + inactivity_duration: Annotated[ + Optional[Union[Literal["off"], int, bool, str]], + Field( + description=( + "The maximum amount of time the dev environment can be inactive" + " (e.g., `2h`, `1d`, etc)." + " After it elapses, the dev environment is automatically stopped." + " Inactivity is defined as the absence of SSH connections to the" + " dev environment, including VS Code connections, `ssh `" + " shells, and attached `dstack apply` or `dstack attach` commands." + " Use `off` for unlimited duration. Can be updated in-place." 
+ " Defaults to `off`" + ) + ), + ] = None + + @validator("inactivity_duration", pre=True, allow_reuse=True) + def parse_inactivity_duration( + cls, v: Optional[Union[Literal["off"], int, bool, str]] + ) -> Optional[int]: + v = parse_off_duration(v) + if isinstance(v, int): + return v + return None + + @root_validator + def validate_ide_and_version(cls, values): + ide = values.get("ide") + version = values.get("version") + if version and ide is None: + raise ValueError("`version` requires `ide` to be set") + if ide == "windsurf" and version: + # Validate format: version@commit + if not re.match(r"^.+@[a-f0-9]+$", version): + raise ValueError( + f"Invalid Windsurf version format: `{version}`. " + "Expected format: `version@commit` (e.g., `1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d`)" + ) + return values + + +class DevEnvironmentConfigurationConfig( + ProfileParamsConfig, + BaseRunConfigurationConfig, +): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + ProfileParamsConfig.schema_extra(schema) + BaseRunConfigurationConfig.schema_extra(schema) class DevEnvironmentConfiguration( - ProfileParams, BaseConfigurationWithPorts, DevEnvironmentConfigurationParams + ProfileParams, + BaseRunConfiguration, + ConfigurationWithPortsParams, + DevEnvironmentConfigurationParams, + generate_dual_core_model(DevEnvironmentConfigurationConfig), ): type: Literal["dev-environment"] = "dev-environment" + @validator("entrypoint") + def validate_entrypoint(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + raise ValueError("entrypoint is not supported for dev-environment") + return v + class TaskConfigurationParams(CoreModel): nodes: Annotated[int, Field(description="Number of nodes", ge=1)] = 1 +class TaskConfigurationConfig( + ProfileParamsConfig, + BaseRunConfigurationConfig, +): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + ProfileParamsConfig.schema_extra(schema) + BaseRunConfigurationConfig.schema_extra(schema) + + class TaskConfiguration( 
ProfileParams, - BaseConfigurationWithCommands, - BaseConfigurationWithPorts, + BaseRunConfiguration, + ConfigurationWithCommandsParams, + ConfigurationWithPortsParams, TaskConfigurationParams, + generate_dual_core_model(TaskConfigurationConfig), ): type: Literal["task"] = "task" +class ServiceConfigurationParamsConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["replicas"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["model"], + extra_types=[{"type": "string"}], + ) + + +def _validate_replica_range(v: Range[int]) -> Range[int]: + """Validate a Range[int] used for replica counts.""" + if v.max is None: + raise ValueError("The maximum number of replicas is required") + if v.min is None: + v.min = 0 + if v.min < 0: + raise ValueError("The minimum number of replicas must be greater than or equal to 0") + return v + + +class ReplicaGroup(CoreModel): + name: Annotated[ + Optional[str], + Field( + description="The name of the replica group. If not provided, defaults to '0', '1', etc. based on position." + ), + ] + count: Annotated[ + Range[int], + Field( + description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). " + "If it's a range, the `scaling` property is required" + ), + ] + scaling: Annotated[ + Optional[ScalingSpec], + Field(description="The auto-scaling rules. 
Required if `count` is set to a range"), + ] = None + + resources: Annotated[ + ResourcesSpec, + Field(description="The resources requirements for replicas in this group"), + ] = ResourcesSpec() + spot_policy: Annotated[ + Optional[SpotPolicy], + Field( + description=( + "The policy for provisioning spot or on-demand instances for replicas in this group:" + f" {list_enum_values_for_annotation(SpotPolicy)}" + ) + ), + ] = None + reservation: Annotated[ + Optional[str], + Field( + description=( + "The existing reservation to use for replicas in this group." + " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations" + ) + ), + ] = None + + commands: Annotated[ + CommandsList, + Field(description="The shell commands to run for replicas in this group"), + ] = [] + image: Annotated[ + Optional[str], + Field( + description="The name of the Docker image to run for replicas in this group. " + "Mutually exclusive with group-level `docker` and `python`." + ), + ] = None + python: Annotated[ + Optional[PythonVersion], + Field( + description="The major version of Python for replicas in this group. " + "Mutually exclusive with group-level `image` and `docker`." + ), + ] = None + nvcc: Annotated[ + Optional[bool], + Field( + description="Use the image with NVIDIA CUDA Compiler (NVCC) included for replicas in this group. " + "Mutually exclusive with group-level `docker`." + ), + ] = None + docker: Annotated[ + Optional[bool], + Field( + description="Use the docker-in-docker image for this group " + "(injects `start-dockerd` and runs privileged). Mutually " + "exclusive with group-level `image`, `python`, and `nvcc`." + ), + ] = None + privileged: Annotated[ + Optional[bool], + Field(description="Run replicas in this group in privileged mode."), + ] = None + router: Annotated[ + Optional[ReplicaGroupRouterConfig], + Field( + description="When set, replicas in this group run the in-service HTTP router (e.g. 
SGLang).", + ), + ] = None + + @validator("name") + def validate_name(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + if not is_valid_replica_group_name(v): + raise ValueError("Resource name should match regex '^[a-z0-9][a-z0-9-]{0,39}$'") + return v + + @validator("count") + def convert_count(cls, v: Range[int]) -> Range[int]: + return _validate_replica_range(v) + + @validator("python", pre=True, always=True) + def convert_python(cls, v, values) -> Optional[PythonVersion]: + if v is not None and values.get("image"): + raise ValueError("`image` and `python` are mutually exclusive within a replica group") + if isinstance(v, float): + v = str(v) + if v == "3.1": + v = "3.10" + if isinstance(v, str): + return PythonVersion(v) + return v + + @validator("docker", pre=True, always=True) + def _docker(cls, v, values) -> Optional[bool]: + if v is True and values.get("image"): + raise ValueError("`image` and `docker` are mutually exclusive within a replica group") + if v is True and values.get("python"): + raise ValueError("`python` and `docker` are mutually exclusive within a replica group") + if v is True and values.get("nvcc"): + raise ValueError("`nvcc` and `docker` are mutually exclusive within a replica group") + return v + + @validator("privileged", pre=True, always=True) + def _privileged(cls, v, values) -> Optional[bool]: + # Docker-in-docker requires privileged mode. The service level + # cannot enforce this rule because its `privileged` field defaults + # to `False` (existing backwards-compatibility constraint), so it + # cannot distinguish "unset" from explicit `False`. At the group + # level we keep `privileged` as `Optional[bool] = None`, so we can. 
+ if v is False and values.get("docker") is True: + raise ValueError( + "`privileged: false` is incompatible with `docker: true` within " + "a replica group (docker-in-docker requires privileged mode)" + ) + return v + + @root_validator() + def validate_scaling(cls, values): + scaling = values.get("scaling") + count = values.get("count") + if count and count.min != count.max and not scaling: + raise ValueError("When you set `count` to a range, ensure to specify `scaling`.") + if count and count.min == count.max and scaling: + raise ValueError("To use `scaling`, `count` must be set to a range.") + return values + + class ServiceConfigurationParams(CoreModel): port: Annotated[ + # NOTE: it's a PortMapping for historical reasons. Only `port.container_port` is used. Union[ValidPort, constr(regex=r"^[0-9]+:[0-9]+$"), PortMapping], - Field(description="The port, that application listens on or the mapping"), + Field(description="The port the application listens on"), ] + gateway: Annotated[ + Optional[ + Union[ + bool, + EntityReference, + str, # For server response compatibility with pre-0.20.20 clients + ] + ], + Field( + description=( + "The name of the gateway. Specify boolean `false` to run without a gateway." + " Specify boolean `true` to run with the default gateway." + " Omit to run with the default gateway if there is one, or without a gateway otherwise" + ), + ), + ] = None + strip_prefix: Annotated[ + bool, + Field( + description=( + "Strip the `/proxy/services///` path prefix" + " when forwarding requests to the service. Only takes effect" + " when running the service without a gateway" + ) + ), + ] = STRIP_PREFIX_DEFAULT model: Annotated[ Optional[AnyModel], - Field(description="Mapping of the model for the OpenAI-compatible endpoint"), + Field( + description=( + "Mapping of the model for the OpenAI-compatible endpoint provided by `dstack`." + " Can be a full model format definition or just a model name." 
+ " If it's a name, the service is expected to expose an OpenAI-compatible" + " API at the `/v1` path" + ) + ), ] = None - https: Annotated[bool, Field(description="Enable HTTPS")] = True - auth: Annotated[bool, Field(description="Enable the authorization")] = True - replicas: Annotated[ - Union[conint(ge=1), constr(regex=r"^[0-9]+..[1-9][0-9]*$"), Range[int]], + https: Annotated[ + Optional[Union[bool, Literal["auto"]]], Field( - description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). " - "If it's a range, the `scaling` property is required" + description="Enable HTTPS if running with a gateway." + " Set to `auto` to determine automatically based on gateway configuration." + f" Defaults to `{str(SERVICE_HTTPS_DEFAULT).lower()}`" ), - ] = Range[int](min=1, max=1) + ] = None + auth: Annotated[bool, Field(description="Enable the authorization")] = True + scaling: Annotated[ Optional[ScalingSpec], Field(description="The auto-scaling rules. Required if `replicas` is set to a range"), ] = None + rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = [] + probes: Annotated[ + Optional[list[ProbeConfig]], + Field( + description="The list of probes to determine service health. " + "If `model` is set, defaults to a `/v1/chat/completions` probe. " + "Set explicitly to override" + ), + ] = None # None = omitted (may get default when model is set); [] = explicit empty + + replicas: Annotated[ + Optional[Union[List[ReplicaGroup], Range[int]]], + Field( + description=( + "The number of replicas or a list of replica groups. " + "Can be an integer (e.g., `2`), a range (e.g., `0..4`), or a list of replica groups. " + "Each replica group defines replicas with shared configuration " + "(commands, resources, scaling). " + "When `replicas` is a list of replica groups, top-level `scaling`, `commands`, " + "and `resources` are not allowed and must be specified in each replica group instead. 
" + ) + ), + ] = None + router: Annotated[ + Optional[AnyServiceRouterConfig], + Field( + description=( + "Router configuration for the service. Requires a gateway with matching router enabled. " + ), + ), + ] = None @validator("port") def convert_port(cls, v) -> PortMapping: @@ -264,39 +1055,313 @@ def convert_port(cls, v) -> PortMapping: return PortMapping.parse(v) return v - @validator("replicas") - def convert_replicas(cls, v: Any) -> Range[int]: - if isinstance(v, str) and ".." in v: - min, max = v.replace(" ", "").split("..") - v = Range(min=min or 0, max=max or None) - elif isinstance(v, (int, float)): - v = Range(min=v, max=v) - if v.max is None: - raise ValueError("The maximum number of replicas is required") - if v.min < 0: - raise ValueError("The minimum number of replicas must be greater than or equal to 0") - if v.max < v.min: + @validator("model", pre=True) + def convert_model(cls, v: Optional[Union[AnyModel, str]]) -> Optional[AnyModel]: + if isinstance(v, str): + return OpenAIChatModel(type="chat", name=v, format="openai") + return v + + @validator("rate_limits") + def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]: + counts = Counter(limit.prefix for limit in v) + duplicates = [prefix for prefix, count in counts.items() if count > 1] + if duplicates: raise ValueError( - "The maximum number of replicas must be greater than or equal to the minium number of replicas" + f"Prefixes {duplicates} are used more than once." 
+ " Each rate limit should have a unique path prefix" ) return v + @validator("probes") + def validate_probes(cls, v: Optional[list[ProbeConfig]]) -> Optional[list[ProbeConfig]]: + if v is None: + return v + if has_duplicates(v): + # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug: + # https://github.com/pydantic/pydantic/issues/3765 + # Because of the bug, our gen_schema_reference.py fails to determine the type of + # ServiceConfiguration.probes and insert the correct hyperlink. + raise ValueError("Probes must be unique") + return v + + @validator("gateway") + def validate_gateway( + cls, v: Optional[Union[bool, EntityReference, str]] + ) -> Optional[Union[bool, EntityReference]]: + if isinstance(v, str): + return EntityReference.parse(v) + return v + + @validator("replicas") + def validate_replicas( + cls, v: Optional[Union[Range[int], List[ReplicaGroup]]] + ) -> Optional[Union[Range[int], List[ReplicaGroup]]]: + if v is None: + return v + if isinstance(v, Range): + return _validate_replica_range(v) + + if isinstance(v, list): + if not v: + raise ValueError("`replicas` cannot be an empty list") + + # Assign default names to groups without names + for index, group in enumerate(v): + if group.name is None: + group.name = str(index) + + # Check for duplicate names + names = [group.name for group in v] + if len(names) != len(set(names)): + duplicates = [name for name in set(names) if names.count(name) > 1] + raise ValueError( + f"Duplicate replica group names found: {duplicates}. " + "Each replica group must have a unique name." 
+ ) + return v + @root_validator() def validate_scaling(cls, values): scaling = values.get("scaling") replicas = values.get("replicas") - if replicas.min != replicas.max and not scaling: - raise ValueError("When you set `replicas` to a range, ensure to specify `scaling`.") - if replicas.min == replicas.max and scaling: - raise ValueError("To use `scaling`, `replicas` must be set to a range.") + + if isinstance(replicas, Range): + if replicas and replicas.min != replicas.max and not scaling: + raise ValueError( + "When you set `replicas` to a range, ensure to specify `scaling`." + ) + if replicas and replicas.min == replicas.max and scaling: + raise ValueError("To use `scaling`, `replicas` must be set to a range.") + return values + + @root_validator() + def validate_top_level_properties_with_replica_groups(cls, values): + """ + When replicas is a list of ReplicaGroup, forbid top-level scaling and commands. + """ + replicas = values.get("replicas") + + if not isinstance(replicas, list): + return values + + scaling = values.get("scaling") + if scaling is not None: + raise ValueError( + "Top-level `scaling` is not allowed when `replicas` is a list. " + "Specify `scaling` in each replica group instead." + ) + + commands = values.get("commands", []) + if commands: + raise ValueError( + "Top-level `commands` is not allowed when `replicas` is a list. " + "Specify `commands` in each replica group instead." + ) + + return values + + @root_validator() + def validate_no_mixed_service_and_group_container_fields(cls, values): + """ + When replicas is a list, certain fields may be set + at the service level OR in replica groups, never both. Mixing is + rejected — including partial mixing, where only some groups set a + field the service also sets — because it leaves precedence ambiguous. 
+ """ + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + + checks = [ + ( + "image", + values.get("image") is not None, + lambda g: g.image is not None, + ), + ( + "docker", + values.get("docker") is True, + lambda g: g.docker is not None, + ), + ( + "privileged", + values.get("privileged") is True, + lambda g: g.privileged is not None, + ), + ( + "python", + values.get("python") is not None, + lambda g: g.python is not None, + ), + ( + "nvcc", + values.get("nvcc") is True, + lambda g: g.nvcc is not None, + ), + ( + "spot_policy", + values.get("spot_policy") is not None, + lambda g: g.spot_policy is not None, + ), + ( + "reservation", + values.get("reservation") is not None, + lambda g: g.reservation is not None, + ), + ] + + for field, service_set, group_set in checks: + if service_set: + conflicting = [g.name for g in replicas if group_set(g)] + if conflicting: + raise ValueError( + f"`{field}` is set at both the service level and in " + f"replica group(s) {conflicting}. Set `{field}` in one " + f"place only — either at the service level (all groups " + f"inherit) or per group, but not both." + ) + return values + + @root_validator() + def validate_no_conflicting_image_sources_across_levels(cls, values): + """ + Image-source fields (`image`, `docker`, `python`, `nvcc`) cannot + be mixed across service and group levels in conflicting ways. 
+ """ + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + + forbidden = [ + ("image", values.get("image") is not None, "docker", lambda g: g.docker is not None), + ("image", values.get("image") is not None, "python", lambda g: g.python is not None), + ("image", values.get("image") is not None, "nvcc", lambda g: g.nvcc is not None), + ("docker", values.get("docker") is True, "image", lambda g: g.image is not None), + ("docker", values.get("docker") is True, "python", lambda g: g.python is not None), + ("docker", values.get("docker") is True, "nvcc", lambda g: g.nvcc is not None), + ("python", values.get("python") is not None, "image", lambda g: g.image is not None), + ("python", values.get("python") is not None, "docker", lambda g: g.docker is not None), + ("nvcc", values.get("nvcc") is True, "image", lambda g: g.image is not None), + ("nvcc", values.get("nvcc") is True, "docker", lambda g: g.docker is not None), + ] + + for s_field, s_set, g_field, g_pred in forbidden: + if s_set: + conflicting = [g.name for g in replicas if g_pred(g)] + if conflicting: + raise ValueError( + f"Service-level `{s_field}` conflicts with group-level " + f"`{g_field}` in replica group(s) {conflicting}. " + f"These image-source fields are mutually exclusive." + ) + return values + + @root_validator() + def validate_replica_groups_have_commands_or_image(cls, values): + """ + When replicas is a list, ensure each ReplicaGroup has something + to run. Mirrors the service-level rule: either explicit + `commands` or an `image` (group-level or service-level) is + required. 
+ """ + replicas = values.get("replicas") + + if not isinstance(replicas, list): + return values + + service_has_image = values.get("image") is not None + + for group in replicas: + if not group.commands and group.image is None and not service_has_image: + raise ValueError( + f"Replica group '{group.name}': either `commands` or " + "`image` must be set in the group, or `image` at the " + "service level." + ) + + return values + + @root_validator() + def validate_at_most_one_router_replica_group(cls, values): + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + router_groups = [g for g in replicas if g.router is not None] + if len(router_groups) > 1: + raise ValueError("At most one replica group may specify `router`.") + if router_groups: + router_group = router_groups[0] + if router_group.count.min != 1 or router_group.count.max != 1: + raise ValueError("For now replica group with `router` must have `count: 1`.") + return values + + @root_validator() + def validate_replica_group_router_mutex(cls, values): + """ + When a replica group sets `router:`, service-level `router` must be omitted. + (Gateway-level SGLang is rejected at service registration when a gateway is selected.) + """ + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + if not any(g.router is not None for g in replicas): + return values + if values.get("router") is not None: + raise ValueError( + "Service-Level router configuration is not allowed together with replica-group `router`." 
+ ) return values +class ServiceConfigurationConfig( + ProfileParamsConfig, + BaseRunConfigurationConfig, + ServiceConfigurationParamsConfig, +): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + ProfileParamsConfig.schema_extra(schema) + BaseRunConfigurationConfig.schema_extra(schema) + ServiceConfigurationParamsConfig.schema_extra(schema) + + class ServiceConfiguration( - ProfileParams, BaseConfigurationWithCommands, ServiceConfigurationParams + ProfileParams, + BaseRunConfiguration, + ConfigurationWithCommandsParams, + ServiceConfigurationParams, + generate_dual_core_model(ServiceConfigurationConfig), ): type: Literal["service"] = "service" + @property + def replica_groups(self) -> List[ReplicaGroup]: + if self.replicas is None: + return [ + ReplicaGroup( + name=DEFAULT_REPLICA_GROUP_NAME, + count=Range[int](min=1, max=1), + commands=self.commands, + resources=self.resources, + scaling=self.scaling, + ) + ] + if isinstance(self.replicas, list): + return self.replicas + if isinstance(self.replicas, Range): + return [ + ReplicaGroup( + name=DEFAULT_REPLICA_GROUP_NAME, + count=self.replicas, + commands=self.commands, + resources=self.resources, + scaling=self.scaling, + ) + ] + raise ValueError( + f"Invalid replicas type: {type(self.replicas)}. 
Expected None, Range[int], or List[ReplicaGroup]" + ) + AnyRunConfiguration = Union[DevEnvironmentConfiguration, TaskConfiguration, ServiceConfiguration] @@ -317,29 +1382,67 @@ def parse_run_configuration(data: dict) -> AnyRunConfiguration: class ApplyConfigurationType(str, Enum): + DEV_ENVIRONMENT = "dev-environment" + TASK = "task" + SERVICE = "service" + FLEET = "fleet" GATEWAY = "gateway" VOLUME = "volume" -AnyApplyConfiguration = Union[GatewayConfiguration, VolumeConfiguration] +AnyApplyConfiguration = Union[ + AnyRunConfiguration, + FleetConfiguration, + GatewayConfiguration, + AnyVolumeConfiguration, +] -class ApplyConfiguration(CoreModel): +class BaseApplyConfiguration(CoreModel): + """ + `BaseApplyConfiguration` parses the configuration based on the `type` discriminator field, + but further dispatching (reparsing) may be required if there is another discriminator field, + e.g., `BaseVolumeConfiguration` should be parsed again to get a backend-specific configuration + based on the `backend` discriminator field. + + Don't use this model directly, use `parse_apply_configuration()` instead. 
+ """ + __root__: Annotated[ - AnyApplyConfiguration, + Union[ + # Final configurations + AnyRunConfiguration, + FleetConfiguration, + GatewayConfiguration, + # Base configurations (further parsing required to get a concrete AnyApplyConfiguration) + BaseVolumeConfiguration, + ], Field(discriminator="type"), ] def parse_apply_configuration(data: dict) -> AnyApplyConfiguration: try: - conf = ApplyConfiguration.parse_obj(data).__root__ + # First-pass parsing ignoring extra fields, to get the base (or final) configuration + conf = BaseApplyConfiguration.__response__.parse_obj(data).__root__ + if not isinstance(conf, BaseVolumeConfiguration): + # If it's a final configuration (currently, any configuration other than + # BaseVolumeConfiguration), parse again rejecting extra fields + # for validation purposes only and return the final configuration + _ = BaseApplyConfiguration.parse_obj(data).__root__ + return conf except ValidationError as e: raise ConfigurationError(e) - return conf + # Otherwise, delegate further parsing to more specific parser + return parse_volume_configuration(data) -AnyDstackConfiguration = Union[AnyRunConfiguration, GatewayConfiguration] +AnyDstackConfiguration = Union[ + AnyRunConfiguration, + FleetConfiguration, + GatewayConfiguration, + VolumeConfiguration, +] class DstackConfiguration(CoreModel): @@ -348,5 +1451,13 @@ class DstackConfiguration(CoreModel): Field(discriminator="type"), ] - class Config: - schema_extra = {"$schema": "http://json-schema.org/draft-07/schema#"} + class Config(CoreConfig): + json_loads = orjson.loads + json_dumps = pydantic_orjson_dumps_with_indent + + @staticmethod + def schema_extra(schema: Dict[str, Any]): + schema["$schema"] = "http://json-schema.org/draft-07/schema#" + # Allow additionalProperties so that vscode and others not supporting + # top-level oneOf do not warn about properties being invalid. 
+ schema["additionalProperties"] = True diff --git a/src/dstack/_internal/core/models/envs.py b/src/dstack/_internal/core/models/envs.py new file mode 100644 index 0000000000..5109c01d5d --- /dev/null +++ b/src/dstack/_internal/core/models/envs.py @@ -0,0 +1,149 @@ +import re +from typing import Dict, Iterable, Iterator, List, Mapping, NamedTuple, Tuple, Union, cast + +from pydantic import BaseModel, Field, validator +from typing_extensions import Annotated, Self + +from dstack._internal.core.models.common import CoreModel + +# VAR_NAME=VALUE, VAR_NAME=, or VAR_NAME +_ENV_STRING_REGEX = r"^([a-zA-Z_][a-zA-Z0-9_]*)(=.*$|$)" + + +class EnvSentinel(CoreModel): + key: str + + def from_env(self, env: Mapping[str, str]) -> str: + if self.key in env: + return env[self.key] + raise ValueError(f"Environment variable {self.key} is not set") + + def __str__(self): + return f"EnvSentinel({self.key})" + + +class EnvVarTuple(NamedTuple): + key: str + value: Union[str, EnvSentinel] + + @classmethod + def parse(cls, v: str) -> Self: + r = re.match(_ENV_STRING_REGEX, v) + if r is None: + raise ValueError(v) + if "=" in v: + key, value = v.split("=", 1) + else: + key = r.group(1) + value = EnvSentinel(key=key) + return cls(key, value) + + +class Env(BaseModel): + """ + Env represents a mapping of process environment variables, as in environ(7). + Environment values may be omitted, in that case the :class:`EnvSentinel` + object is used as a placeholder. + + To create an instance from a `dict[str, str]` or a `list[str]` use pydantic's + :meth:`BaseModel.parse_obj(dict | list)` method. + + NB: this is *NOT* a CoreModel, pydantic-duality, which is used as a base + for the CoreModel, doesn't play well with custom root models. 
+ """ + + __root__: Union[ + List[Annotated[str, Field(regex=_ENV_STRING_REGEX)]], + Dict[str, Union[str, EnvSentinel]], + ] = {} + + @validator("__root__") + def validate_root(cls, v: Union[List[str], Dict[str, str]]) -> Dict[str, str]: + if isinstance(v, list): + d = {} + for var in v: + if "=" not in var: + if var not in d: + d[var] = EnvSentinel(key=var) + else: + raise ValueError(f"Duplicate environment variable: {var}") + else: + k, val = var.split("=", maxsplit=1) + if k not in d: + d[k] = val + else: + raise ValueError(f"Duplicate environment variable: {var}") + return d + # TODO: apply the same validation rules to dict keys as for keys in the list form; + # validate values (must be strings). + return v + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self._dict})" + + def __str__(self) -> str: + return str(self._dict) + + def __iter__(self) -> Iterator[str]: + return iter(self._dict) + + def __contains__(self, item: str) -> bool: + return item in self._dict + + def __len__(self) -> int: + return len(self._dict) + + def __getitem__(self, item): + return self._dict[item] + + def __setitem__(self, item, value): + self._dict[item] = value + + def copy(self, **kwargs) -> Self: + # Env.copy() is tricky because it copies only the hidden top-level {"__root__": {...}} + # structure, not the actual nested dict representing the env itself. + # So we copy __root__ explicitly in case of a shallow copy. + new_copy = super().copy(**kwargs) + if not kwargs.get("deep", False): + new_copy.__root__ = new_copy.__root__.copy() + return new_copy + + def as_dict(self) -> Dict[str, str]: + """ + Returns env variables as a new dict asserting that all values + are resolved. + + :raises ValueError: Not all variables are resolved. 
+ """ + unresolved: List[str] = [] + dct: Dict[str, str] = {} + for k, v in self.items(): + if isinstance(v, EnvSentinel): + unresolved.append(k) + else: + # cast is required since TypeGuard is for positive cases only + dct[k] = cast(str, v) + if unresolved: + unresolved_repr = ", ".join(sorted(unresolved)) + raise ValueError(f"not all variables are resolved: {unresolved_repr}") + return dct + + def update(self, env_or_map: Union[Self, Mapping[str, Union[str, EnvSentinel]]]) -> None: + if isinstance(env_or_map, type(self)): + self._dict.update(env_or_map._dict) + else: + self._dict.update(env_or_map) + + def keys(self) -> Iterable[str]: + return self._dict.keys() + + def values(self) -> Iterable[Union[str, EnvSentinel]]: + return self._dict.values() + + def items(self) -> Iterable[Tuple[str, Union[str, EnvSentinel]]]: + return self._dict.items() + + @property + def _dict(self) -> Dict[str, Union[str, EnvSentinel]]: + # this property is redundant for runtime and used for _proper_ type signature only + return cast(Dict, self.__root__) diff --git a/src/dstack/_internal/core/models/events.py b/src/dstack/_internal/core/models/events.py new file mode 100644 index 0000000000..f2efb80d0e --- /dev/null +++ b/src/dstack/_internal/core/models/events.py @@ -0,0 +1,98 @@ +import uuid +from datetime import datetime +from enum import Enum +from typing import Annotated, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.utils.common import list_enum_values_for_annotation + + +class EventTargetType(str, Enum): + PROJECT = "project" + USER = "user" + FLEET = "fleet" + INSTANCE = "instance" + RUN = "run" + JOB = "job" + VOLUME = "volume" + GATEWAY = "gateway" + SECRET = "secret" + + +class EventTarget(CoreModel): + type: Annotated[ + str, # not using EventTargetType to allow adding new types without breaking compatibility + Field( + description=( + f"Type of the target entity." 
+ f" One of: {list_enum_values_for_annotation(EventTargetType)}" + ) + ), + ] + project_id: Annotated[ + Optional[uuid.UUID], + Field( + description=( + "ID of the project the target entity belongs to," + " or `null` for target types not bound to a project (e.g., users)" + ) + ), + ] + project_name: Annotated[ + Optional[str], + Field( + description=( + "Name of the project the target entity belongs to," + " or `null` for target types not bound to a project (e.g., users)" + ) + ), + ] + is_project_deleted: Annotated[ + Optional[bool], + Field( + description=( + "Whether the project the target entity belongs to is deleted," + " or `null` for target types not bound to a project (e.g., users)" + ) + ), + ] = None # default for client compatibility with pre-0.20.1 servers + id: Annotated[uuid.UUID, Field(description="ID of the target entity")] + name: Annotated[str, Field(description="Name of the target entity")] + + +class Event(CoreModel): + id: uuid.UUID + message: str + recorded_at: datetime + actor_user_id: Annotated[ + Optional[uuid.UUID], + Field( + description=( + "ID of the user who performed the action that triggered the event," + " or `null` if the action was performed by the system" + ) + ), + ] + actor_user: Annotated[ + Optional[str], + Field( + description=( + "Name of the user who performed the action that triggered the event," + " or `null` if the action was performed by the system" + ) + ), + ] + is_actor_user_deleted: Annotated[ + Optional[bool], + Field( + description=( + "Whether the user who performed the action that triggered the event is deleted," + " or `null` if the action was performed by the system" + ) + ), + ] = None # default for client compatibility with pre-0.20.1 servers + targets: Annotated[ + list[EventTarget], Field(description="List of entities affected by the event") + ] diff --git a/src/dstack/_internal/core/models/exports.py b/src/dstack/_internal/core/models/exports.py new file mode 100644 index 0000000000..eafe38b478 --- 
/dev/null +++ b/src/dstack/_internal/core/models/exports.py @@ -0,0 +1,26 @@ +import uuid + +from dstack._internal.core.models.common import CoreModel + + +class ExportImport(CoreModel): + project_name: str + + +class ExportedFleet(CoreModel): + id: uuid.UUID + name: str + + +class ExportedGateway(CoreModel): + id: uuid.UUID + name: str + + +class Export(CoreModel): + id: uuid.UUID + name: str + is_global: bool = False + imports: list[ExportImport] + exported_fleets: list[ExportedFleet] + exported_gateways: list[ExportedGateway] = [] diff --git a/src/dstack/_internal/core/models/files.py b/src/dstack/_internal/core/models/files.py new file mode 100644 index 0000000000..2c82fd53b5 --- /dev/null +++ b/src/dstack/_internal/core/models/files.py @@ -0,0 +1,67 @@ +import pathlib +import string +from uuid import UUID + +from pydantic import Field, validator +from typing_extensions import Annotated, Self + +from dstack._internal.core.models.common import CoreModel + + +class FileArchive(CoreModel): + id: UUID + hash: str + + +class FilePathMapping(CoreModel): + local_path: Annotated[ + str, + Field( + description=( + "The path on the user's machine. Relative paths are resolved relative to" + " the parent directory of the the configuration file" + ) + ), + ] + path: Annotated[ + str, + Field( + description=( + "The path in the container. 
Relative paths are resolved relative to" + " the working directory" + ) + ), + ] + + @classmethod + def parse(cls, v: str) -> Self: + local_path: str + path: str + parts = v.split(":") + # A special case for Windows paths, e.g., `C:\path\to`, 'c:/path/to' + if ( + len(parts) > 1 + and len(parts[0]) == 1 + and parts[0] in string.ascii_letters + and parts[1][:1] in ["\\", "/"] + ): + parts = [f"{parts[0]}:{parts[1]}", *parts[2:]] + if len(parts) == 1: + local_path = path = parts[0] + elif len(parts) == 2: + local_path, path = parts + else: + raise ValueError(f"invalid file path mapping: {v}") + return cls(local_path=local_path, path=path) + + @validator("path") + def validate_path(cls, v) -> str: + # True for `C:/.*`, False otherwise, including `/abs/unix/path`, `rel\windows\path`, etc. + if pathlib.PureWindowsPath(v).is_absolute(): + raise ValueError(f"path must be a Unix file path: {v}") + return v + + +class FileArchiveMapping(CoreModel): + id: Annotated[UUID, Field(description="The File archive ID")] + path: Annotated[str, Field(description="The path in the container")] diff --git a/src/dstack/_internal/core/models/fleets.py b/src/dstack/_internal/core/models/fleets.py new file mode 100644 index 0000000000..ce636ba8de --- /dev/null +++ b/src/dstack/_internal/core/models/fleets.py @@ -0,0 +1,466 @@ +import ipaddress +import uuid +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +from pydantic import Field, root_validator, validator +from typing_extensions import Annotated, Literal + +from dstack._internal.core.backends.profile_options import AnyBackendProfileOptions +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import ( + ApplyAction, + CoreConfig, + CoreModel, + generate_dual_core_model, +) +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.instances import Instance, InstanceOfferWithAvailability, SSHKey 
+from dstack._internal.core.models.profiles import ( + Profile, + ProfileParams, + ProfileRetry, + SpotPolicy, + parse_idle_duration, + validate_backend_options, +) +from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.utils.common import list_enum_values_for_annotation +from dstack._internal.utils.json_schema import add_extra_schema_types +from dstack._internal.utils.tags import tags_validator + + +class FleetStatus(str, Enum): + # Currently all fleets are ACTIVE, TERMINATING, or TERMINATED. + # SUBMITTED and FAILED may be used if fleets require async processing. + SUBMITTED = "submitted" + ACTIVE = "active" + TERMINATING = "terminating" + TERMINATED = "terminated" + FAILED = "failed" + + +class InstanceGroupPlacement(str, Enum): + ANY = "any" + CLUSTER = "cluster" + + +class SSHProxyParams(CoreModel): + hostname: Annotated[str, Field(description="The IP address or domain of proxy host")] + port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None + user: Annotated[str, Field(description="The user to log in with for proxy host")] + identity_file: Annotated[str, Field(description="The private key to use for proxy host")] + ssh_key: Optional[SSHKey] = None + + +class SSHHostParams(CoreModel): + hostname: Annotated[str, Field(description="The IP address or domain to connect to")] + port: Annotated[ + Optional[int], Field(description="The SSH port to connect to for this host") + ] = None + user: Annotated[Optional[str], Field(description="The user to log in with for this host")] = ( + None + ) + identity_file: Annotated[ + Optional[str], Field(description="The private key to use for this host") + ] = None + proxy_jump: Annotated[ + Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host") + ] = None + internal_ip: Annotated[ + Optional[str], + Field( + description=( + "The internal IP of the host used for communication inside the cluster." 
+ " If not specified, `dstack` will use the IP address from `network` or from the first found internal network." + ) + ), + ] = None + ssh_key: Optional[SSHKey] = None + + blocks: Annotated[ + Optional[Union[Literal["auto"], int]], + Field( + description=( + "The amount of blocks to split the instance into, a number or `auto`." + " `auto` means as many as possible." + " The number of GPUs and CPUs must be divisible by the number of blocks." + " Defaults to the top-level `blocks` value" + ), + ge=1, + ), + ] = None + + @validator("internal_ip") + def validate_internal_ip(cls, value): + if value is None: + return value + try: + internal_ip = ipaddress.ip_address(value) + except ValueError as e: + raise ValueError("Invalid IP address") from e + if not internal_ip.is_private: + raise ValueError("IP address is not private") + return value + + +class SSHParams(CoreModel): + user: Annotated[Optional[str], Field(description="The user to log in with on all hosts")] = ( + None + ) + port: Annotated[Optional[int], Field(description="The SSH port to connect to")] = None + identity_file: Annotated[ + Optional[str], Field(description="The private key to use for all hosts") + ] = None + ssh_key: Optional[SSHKey] = None + proxy_jump: Annotated[ + Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts") + ] = None + hosts: Annotated[ + List[Union[SSHHostParams, str]], + Field( + description="The per host connection parameters: a hostname or an object that overrides default ssh parameters" + ), + ] + network: Annotated[ + Optional[str], + Field( + description=( + "The network address for cluster setup in the format `/`." + " `dstack` will use IP addresses from this network for communication between hosts." + " If not specified, `dstack` will use IPs from the first found internal network." 
+ ) + ), + ] = None + + @validator("network") + def validate_network(cls, value): + if value is None: + return value + try: + network = ipaddress.ip_network(value, strict=False) + except ValueError as e: + raise ValueError(f"Failed to parse network: {value}") from e + if not network.is_private: + raise ValueError("Public network is specified when private network is required") + return value + + +class FleetNodesSpec(CoreModel): + min: Annotated[ + int, Field(description=("The minimum number of instances to maintain in the fleet")) + ] + target: Annotated[ + int, + Field( + description=( + "The number of instances to provision on fleet apply. `min` <= `target` <= `max`" + " Defaults to `min`" + ) + ), + ] + max: Annotated[ + Optional[int], + Field( + description=( + "The maximum number of instances allowed in the fleet. Unlimited if not specified" + ) + ), + ] = None + + def dict(self, *args, **kwargs) -> Dict: + # super() does not work with pydantic-duality + res = CoreModel.dict(self, *args, **kwargs) + # For backward compatibility with old clients + # that do not ignore extra fields due to https://github.com/dstackai/dstack/issues/3066 + if "target" in res and res["target"] == res["min"]: + del res["target"] + return res + + @root_validator(pre=True) + def set_min_and_target_defaults(cls, values): + min_ = values.get("min") + target = values.get("target") + if min_ is None: + values["min"] = 0 + if target is None: + values["target"] = values["min"] + return values + + @validator("min") + def validate_min(cls, v: int) -> int: + if v < 0: + raise ValueError("min cannot be negative") + return v + + @root_validator(skip_on_failure=True) + def _post_validate_ranges(cls, values): + min_ = values["min"] + target = values["target"] + max_ = values.get("max") + if target < min_: + raise ValueError("target must not be be less than min") + if max_ is not None and max_ < min_: + raise ValueError("max must not be less than min") + if max_ is not 
None and max_ < target: + raise ValueError("max must not be less than target") + return values + + +class CommonFleetConfigurationProps(CoreModel): + type: Literal["fleet"] = "fleet" + name: Annotated[Optional[str], Field(description="The fleet name")] = None + placement: Annotated[ + Optional[InstanceGroupPlacement], + Field(description="The placement of instances: `any` or `cluster`"), + ] = None + blocks: Annotated[ + Union[Literal["auto"], int], + Field( + description=( + "The amount of blocks to split the instance into, a number or `auto`." + " `auto` means as many as possible." + " The number of GPUs and CPUs must be divisible by the number of blocks." + " Defaults to `1`, i.e. do not split" + ), + ge=1, + ), + ] = 1 + + +class BackendFleetConfiguraionProps(CoreModel): + nodes: Annotated[Optional[FleetNodesSpec], Field(description="The number of instances")] = None + reservation: Annotated[ + Optional[str], + Field( + description=( + "The existing reservation to use for instance provisioning." 
+ " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations" + ) + ), + ] = None + resources: Annotated[ + Optional[ResourcesSpec], + Field(description="The resources requirements"), + ] = None + backends: Annotated[ + Optional[List[BackendType]], + Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"), + ] = None + regions: Annotated[ + Optional[List[str]], + Field( + description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)" + ), + ] = None + availability_zones: Annotated[ + Optional[List[str]], + Field( + description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)" + ), + ] = None + instance_types: Annotated[ + Optional[List[str]], + Field( + description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)" + ), + ] = None + spot_policy: Annotated[ + Optional[SpotPolicy], + Field( + description=( + "The policy for provisioning spot or on-demand instances:" + f" {list_enum_values_for_annotation(SpotPolicy)}." + f" Defaults to `{SpotPolicy.ONDEMAND.value}`" + ) + ), + ] = None + retry: Annotated[ + Optional[Union[ProfileRetry, bool]], + Field(description="The policy for provisioning retry. Defaults to `false`"), + ] = None + max_price: Annotated[ + Optional[float], + Field(description="The maximum instance price per hour, in dollars", gt=0.0), + ] = None + idle_duration: Annotated[ + Optional[int], + Field( + description=( + "Time to wait before terminating idle instances." + " Instances are not terminated if the fleet is already at `nodes.min`." + " Defaults to `5m` for runs and `3d` for fleets." + " Use `off` for unlimited duration" + ) + ), + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field( + description=( + "The custom tags to associate with the resource." + " The tags are also propagated to the underlying backend resources." 
+ " If there is a conflict with backend-level tags, does not override them" + ) + ), + ] = None + backend_options: Annotated[ + Optional[List[AnyBackendProfileOptions]], + Field(description="Backend-specific options, applied only to offers from that backend"), + ] = None + + @validator("nodes", pre=True) + def parse_nodes(cls, v: Optional[Union[dict, str]]) -> Optional[dict]: + if isinstance(v, str) and ".." in v: + v = v.replace(" ", "") + min, max = v.split("..") + return dict(min=min or None, max=max or None) + elif isinstance(v, str) or isinstance(v, int): + return dict(min=v, max=v) + return v + + _validate_idle_duration = validator("idle_duration", pre=True, allow_reuse=True)( + parse_idle_duration + ) + _validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator) + _validate_backend_options = validator("backend_options", allow_reuse=True)( + validate_backend_options + ) + + +class BackendFleetConfigurationPropsConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["nodes"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["idle_duration"], + extra_types=[{"type": "string"}], + ) + + +class SSHFleetConfigurationProps(CoreModel): + ssh_config: Annotated[ + Optional[SSHParams], + Field(description="The parameters for adding instances via SSH"), + ] = None + env: Annotated[ + Env, + Field(description="The mapping or the list of environment variables"), + ] = Env() + + +class FleetConfigurationConfig(BackendFleetConfigurationPropsConfig): + @staticmethod + def schema_extra(schema: dict[str, Any]): + BackendFleetConfigurationPropsConfig.schema_extra(schema) + + +class FleetConfiguration( + SSHFleetConfigurationProps, + BackendFleetConfiguraionProps, + CommonFleetConfigurationProps, + generate_dual_core_model(FleetConfigurationConfig), +): + pass + + +class BackendFleetConfiguration( + 
BackendFleetConfiguraionProps, + CommonFleetConfigurationProps, + generate_dual_core_model(BackendFleetConfigurationPropsConfig), +): + """For the documentation only""" + + +class SSHFleetConfiguration( + SSHFleetConfigurationProps, + CommonFleetConfigurationProps, +): + """For the documentation only""" + + +class FleetSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + prop = schema.get("properties", {}) + prop.pop("merged_profile", None) + + +class FleetSpec(generate_dual_core_model(FleetSpecConfig)): + configuration: FleetConfiguration + configuration_path: Optional[str] = None + profile: Profile + # TODO: Drop `autocreated` once last client sending it (0.20.16) + # and existing autocreated fleets no longer supported. + autocreated: bool = False + """Deprecated. Kept for deserialization of old client requests and existing DB records. + """ + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. + merged_profile: Annotated[Profile, Field(exclude=True)] = None + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. 
+ """ + + @root_validator + def _merged_profile(cls, values) -> Dict: + try: + merged_profile = Profile.parse_obj(values["profile"]) + conf = FleetConfiguration.parse_obj(values["configuration"]) + except KeyError: + raise ValueError("Missing profile or configuration") + for key in ProfileParams.__fields__: + conf_val = getattr(conf, key, None) + if conf_val is not None: + setattr(merged_profile, key, conf_val) + if merged_profile.spot_policy is None: + merged_profile.spot_policy = SpotPolicy.ONDEMAND + if merged_profile.retry is None: + merged_profile.retry = False + values["merged_profile"] = merged_profile + return values + + +class Fleet(CoreModel): + id: uuid.UUID + name: str + project_name: str + spec: FleetSpec + created_at: datetime + status: FleetStatus + status_message: Optional[str] = None + instances: List[Instance] + + +class FleetPlan(CoreModel): + project_name: str + user: str + spec: FleetSpec + effective_spec: Optional[FleetSpec] = None + current_resource: Optional[Fleet] = None + offers: List[InstanceOfferWithAvailability] + total_offers: int + max_offer_price: Optional[float] = None + action: Optional[ApplyAction] = None + """`action` uses a default value for backward compatibility.""" + + def get_effective_spec(self) -> FleetSpec: + if self.effective_spec is not None: + return self.effective_spec + return self.spec + + +class ApplyFleetPlanInput(CoreModel): + spec: FleetSpec + current_resource: Annotated[ + Optional[Fleet], + Field( + description=( + "The expected current resource." + " If the resource has changed, the apply fails unless `force: true`." 
+ ) + ), + ] = None diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index 68ac1bbe7b..74b3f4e835 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -1,12 +1,17 @@ import datetime +import uuid from enum import Enum -from typing import Optional, Union +from typing import Dict, Optional, Union -from pydantic import Field +from pydantic import Field, validator from typing_extensions import Annotated, Literal from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.routers import AnyGatewayRouterConfig +from dstack._internal.utils.tags import tags_validator + +GATEWAY_REPLICAS_DEFAULT = 1 class GatewayStatus(str, Enum): @@ -49,34 +54,118 @@ class GatewayConfiguration(CoreModel): default: Annotated[bool, Field(description="Make the gateway default")] = False backend: Annotated[BackendType, Field(description="The gateway backend")] region: Annotated[str, Field(description="The gateway region")] + instance_type: Annotated[ + Optional[str], + Field( + description=( + "Backend-specific instance type to use for the gateway instance." + " Omit to use the backend's default, which is typically a small non-GPU instance" + ), + min_length=1, + ), + ] = None + router: Annotated[ + Optional[AnyGatewayRouterConfig], + Field( + description=( + "The router configuration for this gateway. " + "E.g. `{ type: sglang, policy: round_robin }`." + ), + ), + ] = None domain: Annotated[ - Optional[str], Field(description="The gateway domain, e.g. `example.com`") + Optional[str], + Field( + description=( + "The gateway wildcard domain name, e.g. `example.com`." + " Service domain names are constructed as `. 
bool: + return self == self.HEALTHY + + def is_failure(self) -> bool: + return self == self.FAILURE + + +class HealthEvent(CoreModel): + timestamp: datetime + status: HealthStatus + message: str + + +class HealthCheck(CoreModel): + collected_at: datetime + status: HealthStatus + events: list[HealthEvent] diff --git a/src/dstack/_internal/core/models/imports.py b/src/dstack/_internal/core/models/imports.py new file mode 100644 index 0000000000..3329b3a753 --- /dev/null +++ b/src/dstack/_internal/core/models/imports.py @@ -0,0 +1,26 @@ +import uuid + +from dstack._internal.core.models.common import CoreModel + + +class ImportExportedFleet(CoreModel): + id: uuid.UUID + name: str + + +class ImportExportedGateway(CoreModel): + id: uuid.UUID + name: str + + +class ImportExport(CoreModel): + id: uuid.UUID + name: str + project_name: str + exported_fleets: list[ImportExportedFleet] + exported_gateways: list[ImportExportedGateway] = [] + + +class Import(CoreModel): + id: uuid.UUID + export: ImportExport diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 9f65ef630a..dfce209c32 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -1,16 +1,49 @@ +import datetime from enum import Enum -from typing import List, Optional +from typing import Annotated, Any, Dict, List, Optional +from uuid import UUID + +import gpuhunt +from pydantic import Field, root_validator from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.configurations import RegistryAuth -from dstack._internal.server.services.docker import DockerImage -from dstack._internal.utils.common import pretty_resources +from dstack._internal.core.models.common import ( + CoreModel, + FrozenCoreModel, +) +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.health import 
HealthStatus +from dstack._internal.core.models.volumes import Volume +from dstack._internal.utils.common import format_mib_as_gb, pretty_resources +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) class Gpu(CoreModel): name: str memory_mib: int + vendor: Optional[gpuhunt.AcceleratorVendor] = None + """`vendor` is declared as optional, but the root validator always sets a value. + `assert gpu.vendor is not None` should be a safe type narrowing. + """ + + @root_validator(pre=True) + def validate_name_and_vendor(cls, values): + is_tpu = False + name = values.get("name") + if name and name.startswith("tpu-"): + is_tpu = True + values["name"] = name[4:] + vendor = values.get("vendor") + if vendor is None: + if is_tpu: + values["vendor"] = gpuhunt.AcceleratorVendor.GOOGLE + else: + values["vendor"] = gpuhunt.AcceleratorVendor.NVIDIA + else: + values["vendor"] = gpuhunt.AcceleratorVendor.cast(vendor) + return values class Disk(CoreModel): @@ -22,24 +55,71 @@ class Resources(CoreModel): memory_mib: int gpus: List[Gpu] spot: bool - disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility - description: str = "" + disk: Disk = Disk(size_mib=102400) + """`disk` defaults to 100GB for backward compatibility.""" + cpu_arch: Optional[gpuhunt.CPUArchitecture] = None + # TODO: remove `description` in 0.21. + description: Annotated[ + str, + Field(description="Deprecated: generated client-side. 
Will be removed in 0.21."), + ] = "" + + def pretty_format(self, include_spot: bool = False, gpu_only: bool = False) -> str: + return Resources._pretty_format( + self.cpus, + self.cpu_arch, + self.memory_mib, + self.disk.size_mib, + self.gpus, + self.spot, + include_spot, + gpu_only, + ) + + @staticmethod + def _pretty_format( + cpus: int, + cpu_arch: Optional[gpuhunt.CPUArchitecture], + memory_mib: int, + disk_size_mib: int, + gpus: List[Gpu], + spot: bool, + include_spot: bool = False, + gpu_only: bool = False, + ) -> str: + if gpu_only: + if not gpus: + return "-" + gpu = gpus[0] + gpu_resources = { + "gpu_name": gpu.name, + "gpu_count": len(gpus), + } + if gpu.memory_mib > 0: + gpu_resources["gpu_memory"] = format_mib_as_gb(gpu.memory_mib) + output = pretty_resources(**gpu_resources) + if include_spot and spot: + output += " (spot)" + return output - def pretty_format(self) -> str: resources = {} - if self.cpus > 0: - resources["cpus"] = self.cpus - if self.memory_mib > 0: - resources["memory"] = f"{self.memory_mib / 1024:.0f}GB" - if self.disk.size_mib > 0: - resources["disk_size"] = f"{self.disk.size_mib / 1024:.1f}GB" - if self.gpus: - gpu = self.gpus[0] + if cpus > 0: + resources["cpus"] = cpus + resources["cpu_arch"] = cpu_arch + if memory_mib > 0: + resources["memory"] = format_mib_as_gb(memory_mib) + if disk_size_mib > 0: + resources["disk_size"] = format_mib_as_gb(disk_size_mib) + if gpus: + gpu = gpus[0] resources["gpu_name"] = gpu.name - resources["gpu_count"] = len(self.gpus) + resources["gpu_count"] = len(gpus) if gpu.memory_mib > 0: - resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB" - return pretty_resources(**resources) + resources["gpu_memory"] = format_mib_as_gb(gpu.memory_mib) + output = pretty_resources(**resources) + if include_spot and spot: + output += " (spot)" + return output class InstanceType(CoreModel): @@ -47,7 +127,7 @@ class InstanceType(CoreModel): resources: Resources -class SSHConnectionParams(CoreModel): +class 
SSHConnectionParams(FrozenCoreModel): hostname: str username: str port: int @@ -63,21 +143,21 @@ class RemoteConnectionInfo(CoreModel): port: int ssh_user: str ssh_keys: List[SSHKey] - - -class DockerConfig(CoreModel): - registry_auth: Optional[RegistryAuth] - image: Optional[DockerImage] + ssh_proxy: Optional[SSHConnectionParams] = None + ssh_proxy_keys: Optional[list[SSHKey]] = None + env: Env = Env() class InstanceConfiguration(CoreModel): project_name: str - instance_name: str # unique in pool - instance_id: Optional[str] = None + instance_name: str + user: str + """`user` stores the dstack user name.""" ssh_keys: List[SSHKey] - job_docker_config: Optional[DockerConfig] - user: str # dstack user name - availability_zone: Optional[str] = None + instance_id: Optional[str] = None + reservation: Optional[str] = None + volumes: Optional[List[Volume]] = None + tags: Optional[Dict[str, str]] = None def get_public_keys(self) -> List[str]: return [ssh_key.public.strip() for ssh_key in self.ssh_keys] @@ -93,6 +173,8 @@ class InstanceAvailability(Enum): AVAILABLE = "available" NOT_AVAILABLE = "not_available" NO_QUOTA = "no_quota" + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used for dstack Sky.""" IDLE = "idle" BUSY = "busy" @@ -109,8 +191,136 @@ class InstanceOffer(CoreModel): instance: InstanceType region: str price: float + backend_data: dict[str, Any] = {} + + def with_availability(self, **kwargs) -> "InstanceOfferWithAvailability": + """Convert to InstanceOfferWithAvailability without re-serializing/re-validating fields. + The result shares nested objects with self. 
This is generally safe because callers + discard the original InstanceOffer after conversion.""" + return InstanceOfferWithAvailability.construct(**self.__dict__, **kwargs) class InstanceOfferWithAvailability(InstanceOffer): availability: InstanceAvailability + availability_zones: Optional[List[str]] = None instance_runtime: InstanceRuntime = InstanceRuntime.SHIM + blocks: int = 1 + total_blocks: int = 1 + + +class InstanceStatus(str, Enum): + PENDING = "pending" + PROVISIONING = "provisioning" + IDLE = "idle" + BUSY = "busy" + TERMINATING = "terminating" + TERMINATED = "terminated" + + def is_available(self) -> bool: + return self in ( + self.IDLE, + self.BUSY, + ) + + def is_active(self) -> bool: + return self not in self.finished_statuses() + + @classmethod + def finished_statuses(cls) -> List["InstanceStatus"]: + return [cls.TERMINATING, cls.TERMINATED] + + +class InstanceTerminationReason(str, Enum): + TERMINATED_BY_USER = "terminated_by_user" + IDLE_TIMEOUT = "idle_timeout" + PROVISIONING_TIMEOUT = "provisioning_timeout" + ERROR = "error" + JOB_FINISHED = "job_finished" + UNREACHABLE = "unreachable" + NO_OFFERS = "no_offers" + MASTER_FAILED = "master_failed" + MAX_INSTANCES_LIMIT = "max_instances_limit" + FLEET_SPEC_MISMATCH = "fleet_spec_mismatch" + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used in dstack Sky.""" + + @classmethod + def from_legacy_str(cls, v: str) -> "InstanceTerminationReason": + """ + Convert legacy termination reason string to relevant termination reason enum. + + dstack versions prior to 0.20.1 represented instance termination reasons as raw + strings. Such strings may still be stored in the database. 
+ """ + + if v == "Idle timeout": + return cls.IDLE_TIMEOUT + if v in ( + "Instance has not become running in time", + "Provisioning timeout expired", + "Proivisioning timeout expired", # typo is intentional + "The proivisioning timeout expired", # typo is intentional + ): + return cls.PROVISIONING_TIMEOUT + if v in ( + "Unsupported private SSH key type", + "Failed to locate internal IP address on the given network", + "Specified internal IP not found among instance interfaces", + "Cannot split into blocks", + "Backend not available", + "Error while waiting for instance to become running", + "Empty profile, requirements or instance_configuration", + "Unable to locate the internal ip-address for the given network", + "Private SSH key is encrypted, password required", + "Cannot parse private key, key type is not supported", + ) or v.startswith("Error to parse profile, requirements or instance_configuration:"): + return cls.ERROR + if v in ( + "All offers failed", + "No offers found", + "There were no offers found", + "Retry duration expired", + "The retry's duration expired", + ): + return cls.NO_OFFERS + if v == "Master instance failed to start": + return cls.MASTER_FAILED + if v == "Instance job finished": + return cls.JOB_FINISHED + if v == "Termination deadline": + return cls.UNREACHABLE + if v == "Fleet has too many instances": + return cls.MAX_INSTANCES_LIMIT + if v == "Low account balance": + return cls.NO_BALANCE + logger.warning("Unexpected instance termination reason string: %r", v) + return cls.ERROR + + +class Instance(CoreModel): + id: UUID + project_name: str + backend: Optional[BackendType] = None + instance_type: Optional[InstanceType] = None + name: str + fleet_id: Optional[UUID] = None + fleet_name: Optional[str] = None + instance_num: int + job_name: Optional[str] = None + """`job_name` is deprecated and always `None` because an instance can have more than one job.""" + hostname: Optional[str] = None + status: InstanceStatus + unreachable: bool = 
False + health_status: HealthStatus = HealthStatus.HEALTHY + termination_reason: Optional[str] = None + """`termination_reason` stores `InstanceTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. + """ + termination_reason_message: Optional[str] = None + created: datetime.datetime + finished_at: Optional[datetime.datetime] = None + region: Optional[str] = None + availability_zone: Optional[str] = None + price: Optional[float] = None + total_blocks: Optional[int] = None + busy_blocks: int = 0 diff --git a/src/dstack/_internal/core/models/keys.py b/src/dstack/_internal/core/models/keys.py new file mode 100644 index 0000000000..1a78f19111 --- /dev/null +++ b/src/dstack/_internal/core/models/keys.py @@ -0,0 +1,12 @@ +import datetime +import uuid + +from dstack._internal.core.models.common import CoreModel + + +class PublicKeyInfo(CoreModel): + id: uuid.UUID + added_at: datetime.datetime + name: str + type: str + fingerprint: str diff --git a/src/dstack/_internal/core/models/logs.py b/src/dstack/_internal/core/models/logs.py index c822feb2cd..887176f247 100644 --- a/src/dstack/_internal/core/models/logs.py +++ b/src/dstack/_internal/core/models/logs.py @@ -1,10 +1,15 @@ from datetime import datetime from enum import Enum -from typing import List +from typing import List, Optional from dstack._internal.core.models.common import CoreModel +class LogProducer(Enum): + RUNNER = "runner" + JOB = "job" + + class LogEventSource(str, Enum): STDOUT = "stdout" STDERR = "stderr" @@ -18,3 +23,5 @@ class LogEvent(CoreModel): class JobSubmissionLogs(CoreModel): logs: List[LogEvent] + external_url: Optional[str] = None + next_token: Optional[str] = None diff --git a/src/dstack/_internal/core/models/metrics.py b/src/dstack/_internal/core/models/metrics.py new file mode 100644 index 0000000000..b0b220ec30 --- /dev/null +++ b/src/dstack/_internal/core/models/metrics.py @@ -0,0 +1,14 @@ +from datetime import datetime +from typing 
import Any, List + +from dstack._internal.core.models.common import CoreModel + + +class Metric(CoreModel): + name: str + timestamps: List[datetime] + values: List[Any] + + +class JobMetrics(CoreModel): + metrics: List[Metric] diff --git a/src/dstack/_internal/core/models/placement.py b/src/dstack/_internal/core/models/placement.py new file mode 100644 index 0000000000..a0ce418bca --- /dev/null +++ b/src/dstack/_internal/core/models/placement.py @@ -0,0 +1,28 @@ +from enum import Enum +from typing import Optional + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel + + +class PlacementStrategy(str, Enum): + CLUSTER = "cluster" + + +class PlacementGroupConfiguration(CoreModel): + backend: BackendType + region: str + placement_strategy: PlacementStrategy + + +class PlacementGroupProvisioningData(CoreModel): + backend: BackendType + """`backend` can be different from the backend in `configuration`.""" + backend_data: Optional[str] = None + + +class PlacementGroup(CoreModel): + name: str + project_name: str + configuration: PlacementGroupConfiguration + provisioning_data: Optional[PlacementGroupProvisioningData] = None diff --git a/src/dstack/_internal/core/models/pools.py b/src/dstack/_internal/core/models/pools.py deleted file mode 100644 index b867371e80..0000000000 --- a/src/dstack/_internal/core/models/pools.py +++ /dev/null @@ -1,38 +0,0 @@ -import datetime -from typing import List, Optional -from uuid import UUID - -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.instances import InstanceType -from dstack._internal.core.models.runs import InstanceStatus, JobStatus - - -class Pool(CoreModel): - name: str - default: bool - created_at: datetime.datetime - total_instances: int - available_instances: int - - -class Instance(CoreModel): - id: UUID - project_name: str - backend: 
Optional[BackendType] = None - instance_type: Optional[InstanceType] = None - name: str - pool_name: Optional[str] = None - job_name: Optional[str] = None - job_status: Optional[JobStatus] = None - hostname: Optional[str] = None - status: InstanceStatus - unreachable: bool = False - created: datetime.datetime - region: Optional[str] = None - price: Optional[float] = None - - -class PoolInstances(CoreModel): - name: str - instances: List[Instance] diff --git a/src/dstack/_internal/core/models/profiles.py b/src/dstack/_internal/core/models/profiles.py index 035bf45369..7a448486df 100644 --- a/src/dstack/_internal/core/models/profiles.py +++ b/src/dstack/_internal/core/models/profiles.py @@ -1,19 +1,31 @@ from enum import Enum -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union, overload +import orjson from pydantic import Field, root_validator, validator from typing_extensions import Annotated, Literal +from dstack._internal.core.backends.profile_options import AnyBackendProfileOptions from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel, Duration +from dstack._internal.core.models.common import ( + CoreConfig, + CoreModel, + Duration, + EntityReference, + generate_dual_core_model, +) +from dstack._internal.utils.common import list_enum_values_for_annotation +from dstack._internal.utils.cron import validate_cron +from dstack._internal.utils.json_schema import add_extra_schema_types +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent +from dstack._internal.utils.tags import tags_validator DEFAULT_RETRY_DURATION = 3600 -DEFAULT_POOL_NAME = "default-pool" DEFAULT_RUN_TERMINATION_IDLE_TIME = 5 * 60 # 5 minutes -DEFAULT_POOL_TERMINATION_IDLE_TIME = 72 * 60 * 60 # 3 days +DEFAULT_FLEET_TERMINATION_IDLE_TIME = 72 * 60 * 60 # 3 days -DEFAULT_INSTANCE_RETRY_DURATION = 60 * 60 * 24 # 24h +DEFAULT_STOP_DURATION = 300 class 
SpotPolicy(str, Enum): @@ -32,34 +44,73 @@ class TerminationPolicy(str, Enum): DESTROY_AFTER_IDLE = "destroy-after-idle" +class StartupOrder(str, Enum): + ANY = "any" + MASTER_FIRST = "master-first" + WORKERS_FIRST = "workers-first" + + +class StopCriteria(str, Enum): + ALL_DONE = "all-done" + MASTER_DONE = "master-done" + + +@overload +def parse_duration(v: None) -> None: ... + + +@overload +def parse_duration(v: Union[int, str]) -> int: ... + + def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]: if v is None: return None return Duration.parse(v) -def parse_max_duration(v: Optional[Union[int, str]]) -> Optional[Union[str, int]]: - if v == "off": - return v - return parse_duration(v) +def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[Literal["off"], int]]: + return parse_off_duration(v) -class ProfileRetryPolicy(CoreModel): - retry: Annotated[bool, Field(description="Whether to retry the run on failure or not")] = False - duration: Annotated[ - Optional[Union[int, str]], - Field(description="The maximum period of retrying the run, e.g., `4h` or `1d`"), - ] = None +def parse_stop_duration( + v: Optional[Union[int, str, bool]], +) -> Optional[Union[Literal["off"], int]]: + return parse_off_duration(v) - _validate_duration = validator("duration", pre=True, allow_reuse=True)(parse_duration) - @root_validator - def _validate_fields(cls, values): - if values["retry"] and "duration" not in values: - values["duration"] = DEFAULT_RETRY_DURATION - if values.get("duration") is not None: - values["retry"] = True - return values +def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[Literal["off"], int]]: + if v == "off" or v is False: + return "off" + if v is True or v is None: + return None + duration = parse_duration(v) + if duration < 0: + raise ValueError("Duration cannot be negative") + return duration + + +def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[int]: + # Differs from 
`parse_off_duration` to accept negative durations as `off` + # for backward compatibility. + if v == "off" or v is False or v == -1: + return -1 + if v is True: + return None + return parse_duration(v) + + +def validate_backend_options( + v: Optional[List["AnyBackendProfileOptions"]], +) -> Optional[List["AnyBackendProfileOptions"]]: + if v is None: + return v + seen = set() + for opt in v: + if opt.type in seen: + raise ValueError(f"backend_options contains duplicate entry for backend '{opt.type}'") + seen.add(opt.type) + return v class RetryEvent(str, Enum): @@ -68,123 +119,414 @@ class RetryEvent(str, Enum): ERROR = "error" -class ProfileRetry(CoreModel): +class ProfileRetryConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["duration"], + extra_types=[{"type": "string"}], + ) + + +class ProfileRetry(generate_dual_core_model(ProfileRetryConfig)): on_events: Annotated[ - List[RetryEvent], + Optional[List[RetryEvent]], Field( description=( "The list of events that should be handled with retry." - " Supported events are `no-capacity`, `interruption`, and `error`" + f" Supported events are {list_enum_values_for_annotation(RetryEvent)}." + " Omit to retry on all events" ) ), - ] + ] = None duration: Annotated[ - Optional[Union[int, str]], - Field(description="The maximum period of retrying the run, e.g., `4h` or `1d`"), + Optional[int], + Field( + description=( + "The maximum period of retrying the run, e.g., `4h` or `1d`." + " The period is calculated as a run age for `no-capacity` event" + " and as a time passed since the last `interruption` and `error` for `interruption` and `error` events." 
+ ) + ), ] = None _validate_duration = validator("duration", pre=True, allow_reuse=True)(parse_duration) @root_validator def _validate_fields(cls, values): - if "on_events" in values and len(values["on_events"]) == 0: + on_events = values.get("on_events", None) + if on_events is not None and len(values["on_events"]) == 0: raise ValueError("`on_events` cannot be empty") return values +class UtilizationPolicyConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["time_window"], + extra_types=[{"type": "string"}], + ) + + +class UtilizationPolicy(generate_dual_core_model(UtilizationPolicyConfig)): + _min_time_window = "5m" + + min_gpu_utilization: Annotated[ + int, + Field( + description=( + "Minimum required GPU utilization, percent." + " If any GPU has utilization below specified value during the whole time window," + " the run is terminated" + ), + ge=0, + le=100, + ), + ] + time_window: Annotated[ + int, + Field( + description=( + "The time window of metric samples taking into account to measure utilization" + f" (e.g., `30m`, `1h`). 
Minimum is `{_min_time_window}`" + ) + ), + ] + + @validator("time_window", pre=True) + def validate_time_window(cls, v: Union[int, str]) -> int: + v = parse_duration(v) + if v < parse_duration(cls._min_time_window): + raise ValueError(f"Minimum time_window is {cls._min_time_window}") + return v + + +class Schedule(CoreModel): + cron: Annotated[ + Union[List[str], str], + Field( + description=( + "A cron expression or a list of cron expressions specifying the UTC time when the run needs to be started" + ) + ), + ] + + @validator("cron") + def _validate_cron(cls, v: Union[List[str], str]) -> List[str]: + if isinstance(v, str): + values = [v] + else: + values = v + if len(values) == 0: + raise ValueError("At least one cron expression must be specified") + for value in values: + validate_cron(value) + return values + + @property + def crons(self) -> List[str]: + """ + Access `cron` attribute as a list. + """ + if isinstance(self.cron, str): + return [self.cron] + return self.cron + + +class InstanceNameSelector(CoreModel): + name: Annotated[str, Field(description="The fleet instance name", min_length=1)] + + +class InstanceHostnameSelector(CoreModel): + hostname: Annotated[ + str, Field(description="The fleet instance hostname or IP address", min_length=1) + ] + + +def _parse_fleet_instance_selector_fleet(v: Any) -> Any: + if isinstance(v, str): + return EntityReference.parse(v) + return v + + +class FleetInstanceSelectorConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["fleet"], + extra_types=[{"type": "string", "minLength": 1}], + ) + + +class FleetInstanceSelector(generate_dual_core_model(FleetInstanceSelectorConfig)): + fleet: Annotated[ + EntityReference, + Field( + description=( + "The fleet reference. For fleets owned by the current project, specify" + " the fleet name. For a fleet from another project, specify" + " `/` or an object with `project` and `name`." 
+ ), + ), + ] + instance: Annotated[int, Field(description="The fleet instance number", ge=0)] + + _validate_fleet = validator("fleet", pre=True, allow_reuse=True)( + _parse_fleet_instance_selector_fleet + ) + + +InstanceSelector = Union[InstanceNameSelector, InstanceHostnameSelector, FleetInstanceSelector] + + +def parse_instance_selector(v: Union[InstanceSelector, str]) -> InstanceSelector: + if isinstance(v, str): + return InstanceNameSelector(name=v) + return v + + +class ProfileParamsConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["max_duration"], + extra_types=[{"type": "boolean"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["stop_duration"], + extra_types=[{"type": "boolean"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["idle_duration"], + extra_types=[{"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["instances"]["items"], + extra_types=[{"type": "string", "minLength": 1}], + ) + + class ProfileParams(CoreModel): backends: Annotated[ Optional[List[BackendType]], Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"), - ] + ] = None regions: Annotated[ Optional[List[str]], Field( description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)" ), - ] + ] = None + availability_zones: Annotated[ + Optional[List[str]], + Field( + description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)" + ), + ] = None instance_types: Annotated[ Optional[List[str]], Field( - description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)" + description="The cloud-specific instance types to consider for provisioning (e.g., `[g6e.24xlarge, n1-standard-4]`)" ), - ] + ] = None + reservation: Annotated[ + Optional[str], + Field( + description=( + "The existing 
reservation to use for instance provisioning." + " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations" + ) + ), + ] = None spot_policy: Annotated[ Optional[SpotPolicy], Field( - description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`" + description=( + "The policy for provisioning spot or on-demand instances:" + f" {list_enum_values_for_annotation(SpotPolicy)}." + f" Defaults to `{SpotPolicy.ONDEMAND.value}`" + ) ), - ] + ] = None retry: Annotated[ Optional[Union[ProfileRetry, bool]], Field(description="The policy for resubmitting the run. Defaults to `false`"), - ] - retry_policy: Annotated[ - Optional[ProfileRetryPolicy], - Field(description="The policy for resubmitting the run. Deprecated in favor of `retry`"), - ] + ] = None max_duration: Annotated[ - Optional[Union[Literal["off"], str, int]], + Optional[Union[Literal["off"], int]], Field( - description="The maximum duration of a run (e.g., `2h`, `1d`, etc). After it elapses, the run is forced to stop. Defaults to `off`" + description=( + "The maximum duration of a run (e.g., `2h`, `1d`, etc)" + " in a running state, excluding provisioning and pulling." + " After it elapses, the run is automatically stopped." + " Use `off` for unlimited duration. Defaults to `off`" + ) ), - ] + ] = None + stop_duration: Annotated[ + Optional[Union[Literal["off"], int]], + Field( + description=( + "The maximum duration of a run graceful stopping." + " After it elapses, the run is automatically forced stopped." + " This includes force detaching volumes used by the run." + " Use `off` for unlimited duration. Defaults to `5m`" + ) + ), + ] = None max_price: Annotated[ - Optional[float], Field(description="The maximum price per hour, in dollars", gt=0.0) - ] - pool_name: Annotated[ - Optional[str], - Field(description="The name of the pool. 
If not set, dstack will use the default name"), - ] - instance_name: Annotated[Optional[str], Field(description="The name of the instance")] + Optional[float], + Field(description="The maximum instance price per hour, in dollars", gt=0.0), + ] = None creation_policy: Annotated[ Optional[CreationPolicy], Field( - description="The policy for using instances from the pool. Defaults to `reuse-or-create`" + description=( + "The policy for using instances from fleets:" + f" {list_enum_values_for_annotation(CreationPolicy)}." + f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`" + ) ), - ] - termination_policy: Annotated[ - Optional[TerminationPolicy], + ] = None + idle_duration: Annotated[ + Optional[int], Field( - description="The policy for termination instances. Defaults to `destroy-after-idle`" + description=( + "Time to wait before terminating idle instances." + " When the run reuses an existing fleet instance, the fleet's `idle_duration` applies." + " When the run provisions a new instance, the shorter of the fleet's and run's values is used." + " Defaults to `5m` for runs and `3d` for fleets." + " Use `off` for unlimited duration." + " Only applied for VM-based backends" + ) ), - ] - termination_idle_time: Annotated[ - Optional[Union[str, int]], + ] = None + utilization_policy: Annotated[ + Optional[UtilizationPolicy], + Field(description="Run termination policy based on utilization"), + ] = None + startup_order: Annotated[ + Optional[StartupOrder], Field( - description="Time to wait before destroying the idle instance. Defaults to `5m` for `dstack run` and to `3d` for `dstack pool add`" + description=( + f"The order in which master and workers jobs are started:" + f" {list_enum_values_for_annotation(StartupOrder)}." 
+ f" Defaults to `{StartupOrder.ANY.value}`" + ) ), - ] + ] = None + stop_criteria: Annotated[ + Optional[StopCriteria], + Field( + description=( + "The criteria determining when a multi-node run should be considered finished:" + f" {list_enum_values_for_annotation(StopCriteria)}." + f" Defaults to `{StopCriteria.ALL_DONE.value}`" + ) + ), + ] = None + schedule: Annotated[ + Optional[Schedule], + Field(description=("The schedule for starting the run at specified time")), + ] = None + fleets: Annotated[ + Optional[ + list[ + Union[ + EntityReference, + str, # For server response compatibility with pre-0.20.14 clients + ] + ] + ], + Field( + description=( + "The fleets considered for reuse." + " For fleets owned by the current project, specify fleet names." + " For imported fleets, specify `/`" + ), + ), + ] = None + instances: Annotated[ + Optional[List[InstanceSelector]], + Field( + description=( + "The specific fleet instances to consider for reuse." + " Each value can be an instance name string, or an object with" + " `name`, `hostname`, or `fleet` and `instance`." + " When set, the run is only placed on matching existing instances." + ), + min_items=1, + ), + ] = None + tags: Annotated[ + Optional[Dict[str, str]], + Field( + description=( + "The custom tags to associate with the resource." + " The tags are also propagated to the underlying backend resources." 
+ " If there is a conflict with backend-level tags, does not override them" + ) + ), + ] = None + backend_options: Annotated[ + Optional[List[AnyBackendProfileOptions]], + Field(description="Backend-specific options, applied only to offers from that backend"), + ] = None _validate_max_duration = validator("max_duration", pre=True, allow_reuse=True)( parse_max_duration ) - _validate_termination_idle_time = validator( - "termination_idle_time", pre=True, allow_reuse=True - )(parse_duration) + _validate_stop_duration = validator("stop_duration", pre=True, allow_reuse=True)( + parse_stop_duration + ) + _validate_idle_duration = validator("idle_duration", pre=True, allow_reuse=True)( + parse_idle_duration + ) + _validate_fleets = validator("fleets", allow_reuse=True, each_item=True)(EntityReference.parse) + _validate_instances = validator("instances", pre=True, allow_reuse=True, each_item=True)( + parse_instance_selector + ) + _validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator) + _validate_backend_options = validator("backend_options", allow_reuse=True)( + validate_backend_options + ) class ProfileProps(CoreModel): name: Annotated[ str, Field( - description="The name of the profile that can be passed as `--profile` to `dstack run`" + description="The name of the profile that can be passed as `--profile` to `dstack apply`" ), - ] + ] = "" default: Annotated[ - bool, Field(description="If set to true, `dstack run` will use this profile by default.") + bool, Field(description="If set to true, `dstack apply` will use this profile by default.") ] = False -class Profile(ProfileProps, ProfileParams): +class ProfileConfig(ProfileParamsConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + ProfileParamsConfig.schema_extra(schema) + + +class Profile( + ProfileProps, + ProfileParams, + generate_dual_core_model(ProfileConfig), +): pass -class ProfilesConfig(CoreModel): - profiles: List[Profile] +class ProfilesConfigConfig(CoreConfig): + 
json_loads = orjson.loads + json_dumps = pydantic_orjson_dumps_with_indent + schema_extra = {"$schema": "http://json-schema.org/draft-07/schema#"} - class Config: - schema_extra = {"$schema": "http://json-schema.org/draft-07/schema#"} + +class ProfilesConfig(generate_dual_core_model(ProfilesConfigConfig)): + profiles: List[Profile] def default(self) -> Optional[Profile]: for p in self.profiles: diff --git a/src/dstack/_internal/core/models/projects.py b/src/dstack/_internal/core/models/projects.py index ee98c00625..9cb765c683 100644 --- a/src/dstack/_internal/core/models/projects.py +++ b/src/dstack/_internal/core/models/projects.py @@ -1,20 +1,50 @@ -from typing import List +from datetime import datetime +from typing import List, Optional, Union from pydantic import UUID4 -from dstack._internal.core.models.backends import BackendInfo +from dstack._internal.core.backends.models import BackendInfo from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.users import ProjectRole, User +class MemberPermissions(CoreModel): + can_manage_ssh_fleets: bool + can_manage_secrets: bool = False + """Default is for client-side compatibility with older servers. + Always explicitly set on the server.""" + + class Member(CoreModel): user: User project_role: ProjectRole + permissions: MemberPermissions class Project(CoreModel): project_id: UUID4 project_name: str owner: User + created_at: Optional[datetime] = None backends: List[BackendInfo] members: List[Member] + is_public: bool = False + templates_repo: Optional[str] = None + + +class ProjectsInfoList(CoreModel): + total_count: Optional[int] = None + projects: List[Project] + + +# For backward compatibility with 0.20 clients, endpoints return `List[Project]` if `total_count` is None. +# TODO: Replace with ProjectsInfoList in 0.21. 
+ProjectsInfoListOrProjectsList = Union[List[Project], ProjectsInfoList] + + +class ProjectHookConfig(CoreModel): + """ + This class can be inherited to extend the project creation configuration passed to the hooks. + """ + + pass diff --git a/src/dstack/_internal/core/models/repos/__init__.py b/src/dstack/_internal/core/models/repos/__init__.py index 2748ac41c4..25a3d06aca 100644 --- a/src/dstack/_internal/core/models/repos/__init__.py +++ b/src/dstack/_internal/core/models/repos/__init__.py @@ -3,6 +3,7 @@ from pydantic import Field from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.repos.base import Repo as Repo from dstack._internal.core.models.repos.local import ( # noqa: F401 LocalRepo, LocalRepoInfo, diff --git a/src/dstack/_internal/core/models/repos/base.py b/src/dstack/_internal/core/models/repos/base.py index c32469253d..e9d8643055 100644 --- a/src/dstack/_internal/core/models/repos/base.py +++ b/src/dstack/_internal/core/models/repos/base.py @@ -3,6 +3,7 @@ from typing import BinaryIO, Optional import dstack._internal.core.models.repos as repos +from dstack._internal.core.errors import DstackError from dstack._internal.core.models.common import CoreModel @@ -12,11 +13,6 @@ class RepoType(str, Enum): VIRTUAL = "virtual" -class RepoProtocol(str, Enum): - SSH = "ssh" - HTTPS = "https" - - class BaseRepoInfo(CoreModel): repo_type: str @@ -26,6 +22,10 @@ class Repo(ABC): repo_dir: Optional[str] run_repo_data: "repos.AnyRunRepoData" + @abstractmethod + def has_code_to_write(self) -> bool: + pass + @abstractmethod def write_code_file(self, fp: BinaryIO) -> str: pass @@ -33,3 +33,8 @@ def write_code_file(self, fp: BinaryIO) -> str: @abstractmethod def get_repo_info(self) -> "repos.AnyRepoInfo": pass + + def get_repo_dir_or_error(self) -> str: + if self.repo_dir is not None: + return self.repo_dir + raise DstackError("repo_dir is None") diff --git a/src/dstack/_internal/core/models/repos/local.py 
b/src/dstack/_internal/core/models/repos/local.py index 07773f2f95..a7e162e7c7 100644 --- a/src/dstack/_internal/core/models/repos/local.py +++ b/src/dstack/_internal/core/models/repos/local.py @@ -2,13 +2,18 @@ from pathlib import Path from typing import BinaryIO, Optional +import ignore +import ignore.overrides from typing_extensions import Literal from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo +from dstack._internal.utils.common import sizeof_fmt from dstack._internal.utils.hash import get_sha256, slugify -from dstack._internal.utils.ignore import GitIgnore +from dstack._internal.utils.logging import get_logger from dstack._internal.utils.path import PathLike +logger = get_logger(__name__) + class LocalRepoInfo(BaseRepoInfo): repo_type: Literal["local"] = "local" @@ -26,7 +31,7 @@ class LocalRepo(Repo): Example: ```python - run = client.runs.submit( + run = client.runs.apply_configuration( configuration=..., repo=LocalRepo.from_dir("."), # Mount the current folder to the run ) @@ -41,10 +46,10 @@ def from_dir(repo_dir: PathLike) -> "LocalRepo": Creates an instance of a local repo from a local path. Args: - repo_dir: The path to a local folder + repo_dir: The path to a local folder. Returns: - A local repo instance + A local repo instance. 
""" return LocalRepo(repo_dir=repo_dir) @@ -68,23 +73,28 @@ def __init__( self.repo_id = repo_id self.run_repo_data = repo_data + def has_code_to_write(self) -> bool: + # LocalRepo is deprecated, no need for real implementation + return True + def write_code_file(self, fp: BinaryIO) -> str: + repo_path = Path(self.run_repo_data.repo_dir) with tarfile.TarFile(mode="w", fileobj=fp) as t: - t.add( - self.run_repo_data.repo_dir, - arcname="", - filter=TarIgnore(self.run_repo_data.repo_dir, globs=[".git"]), - ) + for entry in ( + ignore.WalkBuilder(repo_path) + .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build()) + .hidden(False) # do not ignore files that start with a dot + .require_git(False) # respect git ignore rules even if not a git repo + .add_custom_ignore_filename(".dstackignore") + .build() + ): + entry_path_within_repo = entry.path().relative_to(repo_path) + if entry_path_within_repo != Path("."): + t.add(entry.path(), arcname=entry_path_within_repo, recursive=False) + logger.debug("Code file size: %s", sizeof_fmt(fp.tell())) return get_sha256(fp) def get_repo_info(self) -> LocalRepoInfo: return LocalRepoInfo( repo_dir=self.run_repo_data.repo_dir, ) - - -class TarIgnore(GitIgnore): - def __call__(self, tarinfo: tarfile.TarInfo) -> Optional[tarfile.TarInfo]: - if self.ignore(tarinfo.path): - return None - return tarinfo diff --git a/src/dstack/_internal/core/models/repos/remote.py b/src/dstack/_internal/core/models/repos/remote.py index 1682807c68..f613e18221 100644 --- a/src/dstack/_internal/core/models/repos/remote.py +++ b/src/dstack/_internal/core/models/repos/remote.py @@ -1,80 +1,70 @@ import io +import re import subprocess import time -from typing import BinaryIO, Optional +from dataclasses import dataclass +from typing import Annotated, Any, BinaryIO, Callable, Dict, Optional, Union, cast import git -import giturlparse +import pydantic from pydantic import Field from typing_extensions import Literal -from 
dstack._internal.core.errors import DstackError -from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo, RepoProtocol +from dstack._internal.core.deprecated import Deprecated +from dstack._internal.core.errors import ( + RepoDetachedHeadError, + RepoError, + RepoGitError, + RepoInvalidGitRepositoryError, +) +from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model +from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo from dstack._internal.utils.hash import get_sha256, slugify +from dstack._internal.utils.logging import get_logger from dstack._internal.utils.path import PathLike from dstack._internal.utils.ssh import get_host_config +logger = get_logger(__name__) -class RepoError(DstackError): - pass +SCP_LOCATION_REGEX = re.compile(r"(?P[^/]+)@(?P[^/]+?):(?P.+)", re.IGNORECASE) -class RemoteRepoCreds(CoreModel): - protocol: RepoProtocol - private_key: Optional[str] - oauth_token: Optional[str] +class RemoteRepoCredsConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + pass + + +class RemoteRepoCreds(generate_dual_core_model(RemoteRepoCredsConfig)): + clone_url: str + private_key: Optional[str] = None + oauth_token: Optional[str] = None + + +class RemoteRepoInfoConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + pass -class RemoteRepoInfo(BaseRepoInfo): +class RemoteRepoInfo( + BaseRepoInfo, + generate_dual_core_model(RemoteRepoInfoConfig), +): repo_type: Literal["remote"] = "remote" - repo_host_name: str - repo_port: Optional[int] - repo_user_name: str repo_name: str class RemoteRunRepoData(RemoteRepoInfo): repo_branch: Optional[str] = None repo_hash: Optional[str] = None - repo_diff: Optional[str] = Field(None, exclude=True) + repo_diff: Annotated[Optional[bytes], Field(exclude=True)] = None repo_config_name: Optional[str] = None repo_config_email: Optional[str] = None @staticmethod - def 
from_url(url: str, parse_ssh_config: bool = True): - url = giturlparse.parse(url) - data = RemoteRunRepoData( - repo_host_name=url.resource, - repo_port=url.port, - repo_user_name=url.owner, - repo_name=url.name, - ) - if parse_ssh_config and url.protocol == "ssh": - host_config = get_host_config(data.repo_host_name) - data.repo_host_name = host_config.get("hostname", data.repo_host_name) - data.repo_port = host_config.get("port", data.repo_port) - return data - - def path(self, sep: str = ".") -> str: - return sep.join( - [ - self.repo_host_name - if self.repo_port is None - else f"{self.repo_host_name}:{self.repo_port}", - self.repo_user_name, - self.repo_name, - ] - ) - - def make_url(self, protocol: RepoProtocol, oauth_token: Optional[str] = None) -> str: - if protocol == RepoProtocol.HTTPS: - return f"https://{(oauth_token + '@') if oauth_token else ''}{self.path(sep='/')}.git" - elif protocol == RepoProtocol.SSH: - if self.repo_port: - return f"ssh@{self.path(sep='/')}.git" - else: - return f"git@{self.repo_host_name}:{self.repo_user_name}/{self.repo_name}.git" + def from_url(url: str): + return RemoteRunRepoData(repo_name=GitRepoURL.parse(url).get_repo_name()) class RemoteRepo(Repo): @@ -90,7 +80,7 @@ class RemoteRepo(Repo): Using a remote Git repo by a URL: ```python - repo=RemoteRepo.from_url( + repo = RemoteRepo.from_url( repo_url="https://github.com/dstackai/dstack-examples", repo_branch="main" ) @@ -108,7 +98,7 @@ class RemoteRepo(Repo): Finally, you can pass the repo object to the run: ```python - run = client.runs.submit( + run = client.runs.apply_configuration( configuration=..., repo=repo, ) @@ -117,6 +107,7 @@ class RemoteRepo(Repo): """ run_repo_data: RemoteRunRepoData + repo_url: str @staticmethod def from_dir(repo_dir: PathLike) -> "RemoteRepo": @@ -124,10 +115,10 @@ def from_dir(repo_dir: PathLike) -> "RemoteRepo": Creates an instance of a remote repo from a local path. 
Args: - repo_dir: The path to a local folder + repo_dir: The path to a local folder. Returns: - A remote repo instance + A remote repo instance. """ return RemoteRepo(local_repo_dir=repo_dir) @@ -139,15 +130,13 @@ def from_url( Creates an instance of a remote repo from a URL. Args: - repo_url: The URL of a remote Git repo - repo_branch: The name of the remote branch. Must be specified if `hash` is not specified. - repo_hash: The hash of the revision. Must be specified if `branch` is not specified. + repo_url: The URL of a remote Git repo. + repo_branch: The name of the remote branch. + repo_hash: The hash of the revision. Returns: - A remote repo instance + A remote repo instance. """ - if repo_branch is None and repo_hash is None: - raise ValueError("Either `repo_branch` or `repo_hash` must be specified.") return RemoteRepo( repo_url=repo_url, repo_branch=repo_branch, @@ -160,53 +149,89 @@ def __init__( repo_id: Optional[str] = None, local_repo_dir: Optional[PathLike] = None, repo_url: Optional[str] = None, - repo_data: Optional[RemoteRunRepoData] = None, repo_branch: Optional[str] = None, repo_hash: Optional[str] = None, + repo_data: Union[Deprecated, RemoteRunRepoData, None] = Deprecated.PLACEHOLDER, ): - self.repo_dir = local_repo_dir - self.repo_url = repo_url - - if self.repo_dir is not None: - repo = git.Repo(self.repo_dir) - tracking_branch = repo.active_branch.tracking_branch() - if tracking_branch is None: - raise RepoError("No remote branch is configured") - self.repo_url = repo.remote(tracking_branch.remote_name).url - repo_data = RemoteRunRepoData.from_url(self.repo_url, parse_ssh_config=True) - repo_data.repo_branch = tracking_branch.remote_head - repo_data.repo_hash = tracking_branch.commit.hexsha - repo_data.repo_config_name = repo.config_reader().get_value("user", "name", "") or None - repo_data.repo_config_email = ( - repo.config_reader().get_value("user", "email", "") or None + if repo_data is not Deprecated.PLACEHOLDER: + logger.warning( + "The 
repo_data argument is deprecated, ignored, and will be removed soon." + " As it was always ignored, it's safe to remove it." ) - repo_data.repo_diff = _repo_diff_verbose(repo, repo_data.repo_hash) - elif self.repo_url is not None: - repo_data = RemoteRunRepoData.from_url(self.repo_url, parse_ssh_config=True) - if repo_branch is not None: - repo_data.repo_branch = repo_branch - if repo_hash is not None: - repo_data.repo_hash = repo_hash - elif repo_data is None: - raise RepoError("No remote repo data provided") + # _init_from_* methods must set repo_dir, repo_url, and run_repo_data + if local_repo_dir is not None: + try: + self._init_from_repo_dir(local_repo_dir) + except git.InvalidGitRepositoryError as e: + raise RepoInvalidGitRepositoryError() from e + except git.GitError as e: + raise RepoGitError() from e + elif repo_url is not None: + self._init_from_repo_url(repo_url, repo_branch, repo_hash) + else: + raise RepoError("Neither local repo dir nor repo URL provided") if repo_id is None: - repo_id = slugify(repo_data.repo_name, repo_data.path("/")) + repo_id = slugify( + self.run_repo_data.repo_name, + GitRepoURL.parse( + self.repo_url, get_ssh_config=get_host_config + ).get_unique_location(), + ) self.repo_id = repo_id - self.run_repo_data = repo_data + + def has_code_to_write(self) -> bool: + # repo_diff is: + # * None for RemoteRepo.from_url() + # * empty bytes for RemoteRepo.from_dir() if there are no changes ("clean" state) + # and untracked files + # * non-empty bytes for RemoteRepo.from_dir() if there are changes ("dirty" state) + # and/or untracked files + return bool(self.run_repo_data.repo_diff) def write_code_file(self, fp: BinaryIO) -> str: if self.run_repo_data.repo_diff is not None: - fp.write(self.run_repo_data.repo_diff.encode()) + fp.write(self.run_repo_data.repo_diff) return get_sha256(fp) def get_repo_info(self) -> RemoteRepoInfo: - return RemoteRepoInfo( - repo_host_name=self.run_repo_data.repo_host_name, - 
repo_port=self.run_repo_data.repo_port, - repo_user_name=self.run_repo_data.repo_user_name, - repo_name=self.run_repo_data.repo_name, - ) + return RemoteRepoInfo(repo_name=self.run_repo_data.repo_name) + + def _init_from_repo_dir(self, repo_dir: PathLike): + git_repo = git.Repo(repo_dir) + if git_repo.head.is_detached: + raise RepoDetachedHeadError() + tracking_branch = git_repo.active_branch.tracking_branch() + if tracking_branch is None: + raise RepoError("No remote branch is configured") + + repo_url = git_repo.remote(tracking_branch.remote_name).url + repo_data = RemoteRunRepoData.from_url(repo_url) + repo_data.repo_branch = tracking_branch.remote_head + repo_data.repo_hash = tracking_branch.commit.hexsha + git_config = git_repo.config_reader() + if user_name := cast(str, git_config.get_value("user", "name", "")): + repo_data.repo_config_name = user_name + if user_email := cast(str, git_config.get_value("user", "email", "")): + repo_data.repo_config_email = user_email + repo_data.repo_diff = _repo_diff_verbose(git_repo, repo_data.repo_hash) + + self.repo_dir = str(repo_dir) + self.repo_url = repo_url + self.run_repo_data = repo_data + + def _init_from_repo_url( + self, repo_url: str, repo_branch: Optional[str], repo_hash: Optional[str] + ): + repo_data = RemoteRunRepoData.from_url(repo_url) + if repo_branch is not None: + repo_data.repo_branch = repo_branch + if repo_hash is not None: + repo_data.repo_hash = repo_hash + + self.repo_dir = None + self.repo_url = repo_url + self.run_repo_data = repo_data class _DiffCollector: @@ -215,7 +240,7 @@ def __init__(self, warning_time: float, delay: float = 5): self.delay = delay self.warned = False self.start_time = time.monotonic() - self.buffer = io.StringIO() + self.buffer = io.BytesIO() def timeout(self): now = time.monotonic() @@ -233,14 +258,102 @@ def timeout(self): ) def write(self, v: bytes): - self.buffer.write(v.decode()) + self.buffer.write(v) - def get(self) -> str: + def get(self) -> bytes: if self.warned: 
print() return self.buffer.getvalue() +@dataclass +class GitRepoURL: + """ + Class for best-effort repo URLs parsing and conversion to https:// or ssh:// form. + """ + + ssh_user: Optional[str] + host: str + https_port: Optional[str] + ssh_port: Optional[str] + path: str + + original_host: str + """`original_host` stores the host value before SSH config lookup.""" + + @staticmethod + def parse( + value: str, + get_ssh_config: Callable[[str], Dict[str, str]] = lambda host: {}, + ) -> "GitRepoURL": + try: + url = pydantic.parse_obj_as(pydantic.AnyUrl, value) + except pydantic.ValidationError: + url = scp_location_to_ssh_url(value) + + if url is None: + raise RepoError(f"Could not parse git URL {value}") + + ssh_config = get_ssh_config(url.host) + + if url.scheme.lower() == "https": + return GitRepoURL( + ssh_user=ssh_config.get("user"), + host=url.host.lower(), + https_port=url.port, + ssh_port=ssh_config.get("port"), + path=url.path or "/", + original_host=url.host.lower(), + ) + + if url.scheme.lower() == "ssh": + return GitRepoURL( + ssh_user=url.user or ssh_config.get("user"), + host=ssh_config.get("hostname", "").lower() or url.host.lower(), + https_port=None, + ssh_port=url.port or ssh_config.get("port"), + path=url.path or "/", + original_host=url.host.lower(), + ) + + raise RepoError(f"Unsupported URL scheme {url.scheme}") + + def as_https(self, oauth_token: Optional[str] = None) -> str: + optional_creds = f"anything:{oauth_token}@" if oauth_token else "" + optional_port = f":{self.https_port}" if self.https_port else "" + return f"https://{optional_creds}{self.host}{optional_port}{self.path}" + + def as_ssh(self) -> str: + user = self.ssh_user or "git" + optional_port = f":{self.ssh_port}" if self.ssh_port else "" + return f"ssh://{user}@{self.host}{optional_port}{self.path}" + + def get_clean_path(self) -> str: + return self.path.rstrip("/").removesuffix(".git") + + def get_repo_name(self) -> str: + return self.get_clean_path().rsplit("/")[-1] or "unknown" 
+ + def get_unique_location(self) -> str: + return self.host + self.get_clean_path() + + +def scp_location_to_ssh_url(scp_location: str) -> Optional[pydantic.AnyHttpUrl]: + """ + Converts scp-format location to SSH URL. + E.g. git@github.com:dstackai/dstack.git" -> ssh://git@github.com/dstackai/dstack.git + """ + + match = re.match(SCP_LOCATION_REGEX, scp_location) + if match is None: + return None + user, host, path = match.group("user"), match.group("host"), match.group("path") + try: + return pydantic.parse_obj_as(pydantic.AnyUrl, f"ssh://{user}@{host}/{path}") + except pydantic.ValidationError: + return None + + def _interactive_git_proc( proc: git.Git.AutoInterrupt, collector: _DiffCollector, ignore_status: bool = False ): @@ -255,10 +368,10 @@ def _interactive_git_proc( continue -def _repo_diff_verbose(repo: git.Repo, repo_hash: str, warning_time: float = 5) -> str: +def _repo_diff_verbose(repo: git.Repo, repo_hash: str, warning_time: float = 5) -> bytes: collector = _DiffCollector(warning_time) try: - _interactive_git_proc(repo.git.diff(repo_hash, as_process=True), collector) + _interactive_git_proc(repo.git.diff(repo_hash, binary=True, as_process=True), collector) for filename in repo.untracked_files: _interactive_git_proc( repo.git.diff("/dev/null", filename, no_index=True, binary=True, as_process=True), diff --git a/src/dstack/_internal/core/models/repos/virtual.py b/src/dstack/_internal/core/models/repos/virtual.py index b42cda8d82..24ffeac615 100644 --- a/src/dstack/_internal/core/models/repos/virtual.py +++ b/src/dstack/_internal/core/models/repos/virtual.py @@ -8,6 +8,8 @@ from dstack._internal.utils.hash import get_sha256 from dstack._internal.utils.path import resolve_relative_path +DEFAULT_VIRTUAL_REPO_ID = "none" + class VirtualRepoInfo(BaseRepoInfo): repo_type: Literal["virtual"] = "virtual" @@ -28,7 +30,7 @@ class VirtualRepo(Repo): virtual_repo.add_file_from_package(package=some_package, path="requirements.txt") 
virtual_repo.add_file_from_package(package=some_package, path="train.py") - run = client.runs.submit( + run = client.runs.apply_configuration( configuration=..., repo=virtual_repo, ) @@ -41,8 +43,7 @@ class VirtualRepo(Repo): run_repo_data: VirtualRunRepoData - # TODO: Make repo_id optional - def __init__(self, repo_id: str): + def __init__(self, repo_id: str = DEFAULT_VIRTUAL_REPO_ID): self.repo_id = repo_id self.repo_dir = None self.files: Dict[str, bytes] = {} @@ -72,6 +73,9 @@ def add_file(self, path: str, content: bytes): self.files[resolve_relative_path(path).as_posix()] = content + def has_code_to_write(self) -> bool: + return len(self.files) > 0 + def write_code_file(self, fp: BinaryIO) -> str: with tarfile.TarFile(mode="w", fileobj=fp) as t: for path, content in sorted(self.files.items()): diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py index 70074ec0a6..ff2a173a5d 100644 --- a/src/dstack/_internal/core/models/resources.py +++ b/src/dstack/_internal/core/models/resources.py @@ -1,10 +1,19 @@ +import math +from collections.abc import Mapping from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union -from pydantic import Field, root_validator, validator +import gpuhunt +from pydantic import Field, parse_obj_as, root_validator, validator from pydantic.generics import GenericModel from typing_extensions import Annotated -from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.common import CoreConfig, CoreModel, generate_dual_core_model +from dstack._internal.utils.common import pretty_resources +from dstack._internal.utils.json_schema import add_extra_schema_types +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + T = TypeVar("T", bound=Union[int, float]) @@ -49,6 +58,22 @@ def __str__(self) -> str: return str(min) return f"{min}..{max}" + def intersect(self, other: "Range") -> Optional["Range"]: + start = 
max( + self.min if self.min is not None else -math.inf, + other.min if other.min is not None else -math.inf, + ) + end = min( + self.max if self.max is not None else math.inf, + other.max if other.max is not None else math.inf, + ) + if start > end: + return None + return Range( + min=start if abs(start) != math.inf else None, + max=end if abs(end) != math.inf else None, + ) + class Memory(float): """ @@ -101,10 +126,99 @@ def __str__(self): DEFAULT_CPU_COUNT = Range[int](min=2) DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB")) -DEFAULT_GPU_COUNT = Range[int](min=1, max=1) +DEFAULT_GPU_COUNT = Range[int](min=1) + + +class CPUSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["count"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + + +class CPUSpec(generate_dual_core_model(CPUSpecConfig)): + arch: Annotated[ + Optional[gpuhunt.CPUArchitecture], + Field(description="The CPU architecture, one of: `x86`, `arm`"), + ] = None + count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT + + @classmethod + def __get_validators__(cls): + yield cls.parse + yield cls.validate + + @classmethod + def parse(cls, v: Any) -> Any: + if isinstance(v, int): + v = str(v) + if isinstance(v, str): + tokens = v.replace(" ", "").split(":") + spec = {} + for token in tokens: + if not token: + raise ValueError(f"CPU spec contains empty token: {v}") + if ".." 
in token or token.isdigit(): + if "count" in spec: + raise ValueError(f"CPU spec count conflict: {v}") + spec["count"] = token + else: + try: + arch = gpuhunt.CPUArchitecture.cast(token) + except ValueError: + raise ValueError(f"Invalid CPU architecture: {v}") + if "arch" in spec: + raise ValueError(f"CPU spec arch conflict: {v}") + spec["arch"] = arch + return spec + # Range and min/max dict - for backward compatibility + if isinstance(v, Range): + return {"arch": None, "count": v} + if isinstance(v, Mapping) and v.keys() == {"min", "max"}: + return {"arch": None, "count": v} + return v + + @validator("arch", pre=True) + def _validate_arch(cls, v: Any) -> Any: + if v is None: + return None + if isinstance(v, gpuhunt.CPUArchitecture): + return v + if isinstance(v, str): + return gpuhunt.CPUArchitecture.cast(v) + return v -class GPUSpec(CoreModel): +class GPUSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["count"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["name"], + extra_types=[{"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["memory"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["total_memory"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + + +class GPUSpec(generate_dual_core_model(GPUSpecConfig)): + vendor: Annotated[ + Optional[gpuhunt.AcceleratorVendor], + Field( + description="The vendor of the GPU/accelerator, one of: `nvidia`, `amd`, `google` (alias: `tpu`), `intel`" + ), + ] = None name: Annotated[ Optional[List[str]], Field(description="The name of the GPU (e.g., `A100` or `H100`)") ] = None @@ -141,6 +255,14 @@ def parse(cls, v: Any) -> Any: for token in tokens: if not token: raise ValueError(f"GPU spec contains empty token: {v}") + try: + vendor = cls._vendor_from_string(token) + except 
ValueError: + vendor = None + if vendor: + if "vendor" in spec: + raise ValueError(f"GPU spec vendor conflict: {v}") + spec["vendor"] = vendor elif token[0].isalpha(): # GPU name is always starts with a letter if "name" in spec: raise ValueError(f"GPU spec name conflict: {v}") @@ -160,15 +282,56 @@ def parse(cls, v: Any) -> Any: @validator("name", pre=True) def _validate_name(cls, v: Any) -> Any: - if v is not None and not isinstance(v, list): - return [v] - return v + if v is None: + return None + if not isinstance(v, list): + v = [v] + validated: List[Any] = [] + has_tpu_prefix = False + for name in v: + if isinstance(name, str) and name.startswith("tpu-"): + name = name[4:] + has_tpu_prefix = True + validated.append(name) + if has_tpu_prefix: + logger.warning("`tpu-` prefix is deprecated, specify gpu_vendor instead") + return validated + + @validator("vendor", pre=True) + def _validate_vendor( + cls, v: Union[str, gpuhunt.AcceleratorVendor, None] + ) -> Optional[gpuhunt.AcceleratorVendor]: + if v is None: + return None + if isinstance(v, gpuhunt.AcceleratorVendor): + return v + if isinstance(v, str): + return cls._vendor_from_string(v) + raise TypeError(f"Unsupported type: {v!r}") + @classmethod + def _vendor_from_string(cls, v: str) -> gpuhunt.AcceleratorVendor: + v = v.lower() + if v == "tpu": + return gpuhunt.AcceleratorVendor.GOOGLE + if v == "tt": + return gpuhunt.AcceleratorVendor.TENSTORRENT + return gpuhunt.AcceleratorVendor.cast(v) + + +DEFAULT_GPU_SPEC = GPUSpec(count=Range[int](min=0, max=None)) -MIN_DISK_SIZE = 50 +class DiskSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["size"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) -class DiskSpec(CoreModel): + +class DiskSpec(generate_dual_core_model(DiskSpecConfig)): size: Annotated[Range[Memory], Field(description="Disk size")] @classmethod @@ -182,29 +345,40 @@ def _parse(cls, v: Any) -> Any: return 
{"size": v} return v - @validator("size") - def validate_size(cls, size): - if size.min is not None and size.min < MIN_DISK_SIZE: - raise ValueError(f"Min disk size should be >= {MIN_DISK_SIZE}GB") - if size.max is not None and size.max < MIN_DISK_SIZE: - raise ValueError(f"Max disk size should be >= {MIN_DISK_SIZE}GB") - return size - DEFAULT_DISK = DiskSpec(size=Range[Memory](min=Memory.parse("100GB"), max=None)) -class ResourcesSpec(CoreModel): - class Config: - @staticmethod - def schema_extra(schema: Dict[str, Any]): - schema.clear() - # replace strict schema with a more permissive one - ref_template = "#/definitions/ResourcesSpecRequest/definitions/{model}" - for field, value in ResourcesSpecSchema.schema(ref_template=ref_template).items(): - schema[field] = value - - cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT +class ResourcesSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + add_extra_schema_types( + schema["properties"]["cpu"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["memory"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["shm_size"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["gpu"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + add_extra_schema_types( + schema["properties"]["disk"], + extra_types=[{"type": "integer"}, {"type": "string"}], + ) + + +class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): + # TODO: remove `Range[int]` in 0.20. It is kept only for backward compatibility. 
+ cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = ( + CPUSpec() + ) memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = ( DEFAULT_MEMORY_SIZE ) @@ -216,71 +390,50 @@ def schema_extra(schema: Dict[str, Any]): "you may need to configure this" ), ] = None - gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None + gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC + """`gpu` is optional for backward compatibility.""" disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK - -IntRangeLike = Union[Range[Union[int, str]], int, str] -MemoryRangeLike = Union[Range[Union[Memory, float, int, str]], float, int, str] -MemoryLike = Union[Memory, float, int, str] -GPULike = Union[GPUSpec, "GPUSpecSchema", int, str] -DiskLike = Union[DiskSpec, "DiskSpecSchema", float, int, str] -ComputeCapabilityLike = Union[ComputeCapability, float, str] - - -class GPUSpecSchema(CoreModel): - name: Annotated[ - Optional[Union[List[str], str]], Field(description="The GPU name or list of names") - ] = None - count: Annotated[IntRangeLike, Field(description="The number of GPUs")] = DEFAULT_GPU_COUNT - memory: Annotated[ - Optional[MemoryRangeLike], - Field( - description="The RAM size (e.g., `16GB`). Can be set to a range (e.g. `16GB..`, or `16GB..80GB`)" - ), - ] = None - total_memory: Annotated[ - Optional[MemoryRangeLike], - Field( - description="The total RAM size (e.g., `32GB`). Can be set to a range (e.g. `16GB..`, or `16GB..80GB`)" - ), - ] = None - compute_capability: Annotated[ - Optional[ComputeCapabilityLike], - Field(description="The minimum compute capability of the GPU (e.g., `7.5`)"), - ] = None - - -class DiskSpecSchema(CoreModel): - size: Annotated[ - MemoryRangeLike, - Field( - description="The disk size. 
Can be a string (e.g., `100GB` or `100GB..`) or an object" - "; see [examples](#examples)" - ), - ] - - -class ResourcesSpecSchema(CoreModel): - cpu: Annotated[Optional[IntRangeLike], Field(description="The number of CPU cores")] = ( - DEFAULT_CPU_COUNT - ) - memory: Annotated[ - Optional[MemoryRangeLike], - Field(description="The RAM size (e.g., `8GB`)"), - ] = DEFAULT_MEMORY_SIZE - shm_size: Annotated[ - Optional[MemoryLike], - Field( - description="The size of shared memory (e.g., `8GB`). " - "If you are using parallel communicating processes (e.g., dataloaders in PyTorch), " - "you may need to configure this" - ), - ] = None - gpu: Annotated[ - Optional[GPULike], - Field( - description="The GPU requirements. Can be set to a number, a string (e.g. `A100`, `80GB:2`, etc.), or an object; see [examples](#examples)" - ), - ] = None - disk: Annotated[Optional[DiskLike], Field(description="The disk resources")] = DEFAULT_DISK + @classmethod + def unconstrained(cls) -> "ResourcesSpec": + """ResourcesSpec with no meaningful minimum constraints.""" + return cls( + cpu=CPUSpec(count=Range[int](min=1, max=None)), + memory=Range[Memory](min=Memory.parse("0"), max=None), + gpu=DEFAULT_GPU_SPEC, + disk=None, + ) + + def pretty_format(self) -> str: + # TODO: Remove in 0.20. 
Use self.cpu directly + cpu = parse_obj_as(CPUSpec, self.cpu) + resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory) + if self.gpu: + gpu = self.gpu + resources.update( + gpu_vendor=gpu.vendor, + gpu_name=",".join(gpu.name) if gpu.name else None, + gpu_count=gpu.count, + gpu_memory=gpu.memory, + total_gpu_memory=gpu.total_memory, + compute_capability=gpu.compute_capability, + ) + if self.disk: + resources.update(disk_size=self.disk.size) + res = pretty_resources(**resources) + return res + + def dict(self, *args, **kwargs) -> Dict: + # super() does not work with pydantic-duality + res = CoreModel.dict(self, *args, **kwargs) + self._update_serialized_cpu(res) + return res + + # TODO: Remove in 0.20. Added for backward compatibility. + def _update_serialized_cpu(self, values: Dict): + cpu = values["cpu"] + if cpu: + arch = cpu.get("arch") + count = cpu.get("count") + if count and arch in [None, gpuhunt.CPUArchitecture.X86.value]: + values["cpu"] = count diff --git a/src/dstack/_internal/core/models/routers.py b/src/dstack/_internal/core/models/routers.py new file mode 100644 index 0000000000..b1f189c522 --- /dev/null +++ b/src/dstack/_internal/core/models/routers.py @@ -0,0 +1,62 @@ +from enum import Enum +from typing import Literal + +from pydantic import Field +from typing_extensions import Annotated + +from dstack._internal.core.models.common import CoreModel + + +class RouterType(str, Enum): + SGLANG = "sglang" + DYNAMO = "dynamo" + + +class SGLangGatewayRouterConfig(CoreModel): + """Gateway-level router configuration. type and policy only. pd_disaggregation is service-level.""" + + type: Annotated[ + Literal["sglang"], + Field(description="The router type enabled on this gateway."), + ] = "sglang" + policy: Annotated[ + Literal["random", "round_robin", "cache_aware", "power_of_two"], + Field( + description=( + "The routing policy. Deprecated: prefer setting policy in the service's router config. 
" + "Options: `random`, `round_robin`, `cache_aware`, `power_of_two`" + ), + ), + ] = "cache_aware" + + +class SGLangServiceRouterConfig(CoreModel): + type: Annotated[Literal["sglang"], Field(description="The router type")] = "sglang" + policy: Annotated[ + Literal["random", "round_robin", "cache_aware", "power_of_two"], + Field( + description="The routing policy. Options: `random`, `round_robin`, `cache_aware`, `power_of_two`" + ), + ] = "cache_aware" + pd_disaggregation: Annotated[ + bool, + Field(description="Enable PD disaggregation mode for the SGLang router"), + ] = False + + +class ReplicaGroupRouterConfig(CoreModel): + type: Annotated[ + Literal["sglang", "dynamo"], + Field( + description=( + "The router implementation for this replica group. " + "`sglang` runs the SGLang router and dstack syncs worker URLs to it. " + "`dynamo` runs the NVIDIA Dynamo frontend, which discovers workers " + "itself via etcd/NATS." + ), + ), + ] = "sglang" + + +AnyServiceRouterConfig = SGLangServiceRouterConfig +AnyGatewayRouterConfig = SGLangGatewayRouterConfig diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 755033cea0..04f4c326d8 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -1,36 +1,54 @@ from datetime import datetime, timedelta from enum import Enum -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Literal, Optional +from urllib.parse import urlparse from pydantic import UUID4, Field, root_validator from typing_extensions import Annotated +from dstack._internal.core.backends.profile_options import AnyBackendProfileOptions from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.common import ( + ApplyAction, + CoreConfig, + CoreModel, + NetworkMode, + RegistryAuth, + generate_dual_core_model, +) from 
dstack._internal.core.models.configurations import ( + DEFAULT_PROBE_METHOD, + DEFAULT_PROBE_UNTIL_READY, + DEFAULT_REPLICA_GROUP_NAME, + LEGACY_REPO_DIR, AnyRunConfiguration, - RegistryAuth, + HTTPHeaderSpec, + HTTPMethod, + RepoExistsAction, RunConfiguration, + ServiceConfiguration, ) +from dstack._internal.core.models.files import FileArchiveMapping from dstack._internal.core.models.instances import ( InstanceOfferWithAvailability, InstanceType, SSHConnectionParams, ) from dstack._internal.core.models.profiles import ( - DEFAULT_RUN_TERMINATION_IDLE_TIME, CreationPolicy, Profile, ProfileParams, - ProfileRetryPolicy, RetryEvent, SpotPolicy, - TerminationPolicy, + UtilizationPolicy, ) from dstack._internal.core.models.repos import AnyRunRepoData -from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.resources import Memory, ResourcesSpec +from dstack._internal.core.models.routers import RouterType +from dstack._internal.core.models.unix import UnixUser +from dstack._internal.core.models.volumes import MountPoint from dstack._internal.utils import common as common_utils -from dstack._internal.utils.common import format_pretty_duration, pretty_resources +from dstack._internal.utils.common import format_pretty_duration class AppSpec(CoreModel): @@ -79,6 +97,10 @@ class RunTerminationReason(str, Enum): SERVER_ERROR = "server_error" def to_job_termination_reason(self) -> "JobTerminationReason": + """ + Converts run termination reason to job termination reason. + Used to set job termination reason for non-terminated jobs on run termination. 
+ """ mapping = { self.ALL_JOBS_DONE: JobTerminationReason.DONE_BY_RUNNER, self.JOB_FAILED: JobTerminationReason.TERMINATED_BY_SERVER, @@ -100,11 +122,21 @@ def to_status(self) -> "RunStatus": } return mapping[self] + def to_error(self) -> Optional[str]: + if self == RunTerminationReason.RETRY_LIMIT_EXCEEDED: + return "retry limit exceeded" + elif self == RunTerminationReason.SERVER_ERROR: + return "server error" + else: + return None + class JobTerminationReason(str, Enum): # Set by the server FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity" INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity" + INSTANCE_UNREACHABLE = "instance_unreachable" + INSTANCE_ACCESS_REVOKED = "instance_access_revoked" WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded" WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded" TERMINATED_BY_USER = "terminated_by_user" @@ -114,16 +146,22 @@ class JobTerminationReason(str, Enum): DONE_BY_RUNNER = "done_by_runner" ABORTED_BY_USER = "aborted_by_user" TERMINATED_BY_SERVER = "terminated_by_server" + INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded" + TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy" # Set by the runner CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error" PORTS_BINDING_FAILED = "ports_binding_failed" CREATING_CONTAINER_ERROR = "creating_container_error" EXECUTOR_ERROR = "executor_error" + MAX_DURATION_EXCEEDED = "max_duration_exceeded" + LOG_QUOTA_EXCEEDED = "log_quota_exceeded" def to_status(self) -> JobStatus: mapping = { self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED, self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED, + self.INSTANCE_UNREACHABLE: JobStatus.FAILED, + self.INSTANCE_ACCESS_REVOKED: JobStatus.FAILED, self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED, self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED, self.TERMINATED_BY_USER: JobStatus.TERMINATED, @@ -133,117 +171,262 @@ def to_status(self) -> 
JobStatus: self.DONE_BY_RUNNER: JobStatus.DONE, self.ABORTED_BY_USER: JobStatus.ABORTED, self.TERMINATED_BY_SERVER: JobStatus.TERMINATED, + self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED, + self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED, self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED, self.PORTS_BINDING_FAILED: JobStatus.FAILED, self.CREATING_CONTAINER_ERROR: JobStatus.FAILED, self.EXECUTOR_ERROR: JobStatus.FAILED, + self.MAX_DURATION_EXCEEDED: JobStatus.TERMINATED, + self.LOG_QUOTA_EXCEEDED: JobStatus.FAILED, } return mapping[self] - def pretty_repr(self) -> str: - return " ".join(self.value.split("_")).capitalize() + def to_retry_event(self) -> Optional[RetryEvent]: + """ + Returns: + the retry event this termination reason triggers + or None if this termination reason should not be retried + """ + mapping = { + self.FAILED_TO_START_DUE_TO_NO_CAPACITY: RetryEvent.NO_CAPACITY, + self.INTERRUPTED_BY_NO_CAPACITY: RetryEvent.INTERRUPTION, + } + default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None + return mapping.get(self, default) + + def to_error(self) -> Optional[str]: + # Should return None for values that are already + # handled and shown in status_message. 
+ error_mapping = { + JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable", + JobTerminationReason.INSTANCE_ACCESS_REVOKED: "instance access revoked", + JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded", + JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED: "waiting runner limit exceeded", + JobTerminationReason.VOLUME_ERROR: "volume error", + JobTerminationReason.GATEWAY_ERROR: "gateway error", + JobTerminationReason.SCALED_DOWN: "scaled down", + JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded", + JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy", + JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed", + JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error", + JobTerminationReason.EXECUTOR_ERROR: "executor error", + JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded", + JobTerminationReason.LOG_QUOTA_EXCEEDED: "log quota exceeded", + } + return error_mapping.get(self) class Requirements(CoreModel): - # TODO: Make requirements' fields required resources: ResourcesSpec - max_price: Optional[float] - spot: Optional[bool] + max_price: Optional[float] = None + spot: Optional[bool] = None + reservation: Optional[str] = None + multinode: Optional[bool] = None + """Backends can use `multinode` to filter out offers when some offers support multinode and some do not. 
+ """ + backend_options: Optional[List[AnyBackendProfileOptions]] = None def pretty_format(self, resources_only: bool = False): - resources = dict(cpus=self.resources.cpu, memory=self.resources.memory) - if self.resources.gpu: - gpu = self.resources.gpu - resources.update( - gpu_name=",".join(gpu.name) if gpu.name else None, - gpu_count=gpu.count, - gpu_memory=gpu.memory, - total_gpu_memory=gpu.total_memory, - compute_capability=gpu.compute_capability, - ) - if self.resources.disk: - resources.update(disk_size=self.resources.disk.size) - res = pretty_resources(**resources) + res = self.resources.pretty_format() if not resources_only: if self.spot is not None: res += f", {'spot' if self.spot else 'on-demand'}" if self.max_price is not None: - res += f" under ${self.max_price:g} per hour" + res += f" under ${self.max_price:3f}".rstrip("0").rstrip(".") + " per hour" return res -class Gateway(CoreModel): - gateway_name: Optional[str] - service_port: int - hostname: Optional[str] - public_port: int = 80 - secure: bool = False +class JobSSHKey(CoreModel): + private: str + public: str - auth: bool = True - options: dict = {} + +class ProbeSpec(CoreModel): + type: Literal["http"] + """`type` currently expects `http`, but other probe types such as `exec` may be added later.""" + url: str + method: HTTPMethod = DEFAULT_PROBE_METHOD + headers: list[HTTPHeaderSpec] = [] + body: Optional[str] = None + timeout: int + interval: int + ready_after: int + until_ready: bool = DEFAULT_PROBE_UNTIL_READY class JobSpec(CoreModel): - replica_num: int = 0 # default value for backward compatibility + replica_num: int = 0 + """`replica_num` uses a default value for backward compatibility.""" job_num: int job_name: str - jobs_per_replica: int = 1 # default value for backward compatibility + jobs_per_replica: int = 1 + """`jobs_per_replica` uses a default value for backward compatibility.""" + replica_group: str = DEFAULT_REPLICA_GROUP_NAME app_specs: Optional[List[AppSpec]] + user: 
Optional[UnixUser] = None + """`user` uses a default value for backward compatibility.""" commands: List[str] env: Dict[str, str] home_dir: Optional[str] image_name: str + privileged: bool = False + single_branch: Optional[bool] = None max_duration: Optional[int] + stop_duration: Optional[int] = None + utilization_policy: Optional[UtilizationPolicy] = None registry_auth: Optional[RegistryAuth] requirements: Requirements retry: Optional[Retry] - # For backward compatibility with 0.18.x when retry_policy was required. - # TODO: remove in 0.19 - retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False) + volumes: Optional[List[MountPoint]] = None + ssh_key: Optional[JobSSHKey] = None working_dir: Optional[str] + repo_data: Annotated[Optional[AnyRunRepoData], Field(discriminator="repo_type")] = None + """`repo_data` is optional for client compatibility with pre-0.19.17 servers and for jobs + submitted before 0.19.17. All new jobs are expected to have non-`None` `repo_data`. + For `--no-repo` runs, `repo_data` is `VirtualRunRepoData()`. + """ + # TODO: drop this compatibility note when support for jobs submitted before 0.19.17 is no longer relevant. + repo_code_hash: Optional[str] = None + """`repo_code_hash` can be `None` because it is not used for the repo or because the job was + submitted before 0.19.17. See `_get_repo_code_hash` for how to get the correct value. + """ + repo_dir: str = LEGACY_REPO_DIR + """`repo_dir` was added in 0.19.27 and uses a default value for backward compatibility.""" + repo_exists_action: Optional[RepoExistsAction] = None + """`repo_exists_action` is `None` for jobs without a repo and for jobs submitted by pre-0.20.0 clients.""" + file_archives: list[FileArchiveMapping] = [] + service_port: Optional[int] = None + """`service_port` is `None` for non-services and pre-0.19.19 services. 
See `get_service_port`.""" + probes: list[ProbeSpec] = [] class JobProvisioningData(CoreModel): backend: BackendType + base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and wants + to record that backend as `base_backend`. + """ instance_type: InstanceType instance_id: str - # hostname may not be set immediately after instance provisioning. - # It is set to a public IP or, if public IPs are disabled, to a private IP. hostname: Optional[str] = None + """`hostname` may not be set immediately after instance provisioning. + It is set to a public IP or, if public IPs are disabled, to a private IP. + """ internal_ip: Optional[str] = None - # public_ip_enabled can used to distinguished instances with and without public IPs. - # hostname being None is not enough since it can be filled after provisioning. public_ip_enabled: bool = True - # instance_network a network address for multimode installation. Specified as `<ip address>/<netmask>` - # internal_ip will be selected from the specified network + """`public_ip_enabled` is used to distinguish instances with and without public IPs. + `hostname` being `None` is not enough because it can be filled after provisioning. + """ instance_network: Optional[str] = None + """`instance_network` stores the multimode installation network, specified as + `<ip address>/<netmask>`. `internal_ip` will be selected from the specified network. + """ region: str availability_zone: Optional[str] = None + reservation: Optional[str] = None price: float username: str - # ssh_port be different from 22 for some backends. - # ssh_port may not be set immediately after instance provisioning ssh_port: Optional[int] = None - dockerized: bool # True if backend starts shim + """`ssh_port` may be different from 22 for some backends and may not be set immediately after + instance provisioning. 
+ """ + dockerized: bool + """`dockerized` is `True` when the backend starts the shim.""" ssh_proxy: Optional[SSHConnectionParams] = None - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" + + def get_base_backend(self) -> BackendType: + if self.base_backend is not None: + return self.base_backend + return self.backend + + +class JobRuntimeData(CoreModel): + """ + Holds various information only available after the job is submitted, such as: + * offer (depends on the instance) + * volumes used by the job + * resource constraints for container (depend on the instance) + * port mapping (reported by the shim only after the container is started) + + Some fields are mutable, for example, `ports` only available when the shim starts + the container. + """ + + network_mode: NetworkMode + gpu: Optional[int] = None + """`gpu` stores the GPU resource share. `None` means all available with no limit.""" + cpu: Optional[float] = None + """`cpu` stores the CPU resource share. `None` means all available with no limit.""" + memory: Optional[Memory] = None + """`memory` stores the memory resource share. `None` means all available with no limit.""" + ports: Optional[dict[int, int]] = None + """`ports` stores the container-to-host port mapping reported by shim. It is an empty dict if + `network_mode == NetworkMode.HOST`. `None` if data is not yet available + on VM-based backends and SSH instances, or not applicable on container-based backends. + """ + volume_names: Optional[list[str]] = None + """`volume_names` stores the list of volumes used by the job. It is `None` for backward compatibility.""" + offer: Optional[InstanceOfferWithAvailability] = None + """`offer` stores the virtual shared offer. It is `None` for backward compatibility.""" + working_dir: Optional[str] = None + """`working_dir` stores the resolved working directory reported by the runner. 
+ `None` if the runner has not reported it yet or if it is an old runner. + """ + username: Optional[str] = None + """`username` stores the resolved OS username reported by the runner. + `None` if the runner has not reported it yet or if it is an old runner. + """ class ClusterInfo(CoreModel): + job_ips: List[str] master_job_ip: str gpus_per_job: int +class Probe(CoreModel): + success_streak: int + + +class ImagePullProgress(CoreModel): + downloaded_bytes: int + extracted_bytes: int + total_bytes: int + """An estimate of the number of bytes to be downloaded and extracted during this pull. + Does not include cached layers that existed on the instance before the pull. + """ + is_total_bytes_final: bool + """Whether `total_bytes` is believed to be the correct final value. + If `False`, then `total_bytes` is a lower estimate. + """ + + class JobSubmission(CoreModel): id: UUID4 submission_num: int + deployment_num: int = 0 + """`deployment_num` uses a default value for compatibility with pre-0.19.14 servers.""" submitted_at: datetime last_processed_at: datetime - finished_at: Optional[datetime] + finished_at: Optional[datetime] = None + inactivity_secs: Optional[int] = None status: JobStatus - termination_reason: Optional[JobTerminationReason] - termination_reason_message: Optional[str] - job_provisioning_data: Optional[JobProvisioningData] + status_message: str = "" + """`status_message` uses a default value for backward compatibility.""" + termination_reason: Optional[str] = None + """`termination_reason` stores `JobTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. 
+ """ + termination_reason_message: Optional[str] = None + exit_status: Optional[int] = None + job_provisioning_data: Optional[JobProvisioningData] = None + job_runtime_data: Optional[JobRuntimeData] = None + error: Optional[str] = None + probes: list[Probe] = [] + image_pull_progress: Optional[ImagePullProgress] = None @property def age(self) -> timedelta: @@ -257,62 +440,214 @@ def duration(self) -> timedelta: return end_time - self.submitted_at +class JobConnectionInfo(CoreModel): + ide_name: Annotated[ + Optional[str], Field(description="Dev environment IDE name for UI, human-readable.") + ] + attached_ide_url: Annotated[ + Optional[str], + Field( + description=( + "Dev environment IDE URL." + " Not set if the job has not started yet." + " Only works if the user is attached to the run via CLI or Python API." + ) + ), + ] + proxied_ide_url: Annotated[ + Optional[str], + Field( + description=( + "Dev environment IDE URL." + " Not set if the job has not started yet or sshproxy is not configured." + ) + ), + ] + attached_ssh_command: Annotated[ + Optional[list[str]], + Field( + description=( + "SSH command to connect to the job, list of command line arguments." + " Only works if the user is attached to the run via CLI or Python API." + ) + ), + ] + proxied_ssh_command: Annotated[ + Optional[list[str]], + Field( + description=( + "SSH command to connect to the job, list of command line arguments." + " Not set if sshproxy is not configured." + ) + ), + ] + sshproxy_hostname: Annotated[ + Optional[str], + Field(description="sshproxy hostname. Not set if sshproxy is not configured."), + ] = None + sshproxy_port: Annotated[ + Optional[int], + Field( + description=( + "sshproxy port. Not set if sshproxy is not configured." + " May be not set if it is equal to the default SSH port 22." + ) + ), + ] = None + sshproxy_upstream_id: Annotated[ + Optional[str], + Field( + description=( + "sshproxy identifier for this job. 
SSH clients send this identifier as a username" + " to indicate which job they wish to connect." + " Not set if sshproxy is not configured." + ) + ), + ] = None + + class Job(CoreModel): job_spec: JobSpec job_submissions: List[JobSubmission] - - -class RunSpec(CoreModel): - run_name: Optional[str] - repo_id: str - repo_data: Annotated[AnyRunRepoData, Field(discriminator="repo_type")] - repo_code_hash: Optional[str] - working_dir: str - configuration_path: str + job_connection_info: Optional[JobConnectionInfo] = None + + +class RunSpecConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + prop = schema.get("properties", {}) + prop.pop("merged_profile", None) + + +class RunSpec(generate_dual_core_model(RunSpecConfig)): + # TODO: consider removing `run_name` here because it is already passed in `configuration`. + run_name: Annotated[ + Optional[str], + Field(description="The run name. If not set, the run name is generated automatically."), + ] = None + repo_id: Annotated[ + Optional[str], + Field( + description=( + "Same `repo_id` that is specified when initializing the repo" + " by calling the `/api/project/{project_name}/repos/init` endpoint." + " If not specified, a default virtual repo is used." + ) + ), + ] = None + repo_data: Annotated[ + Optional[AnyRunRepoData], + Field( + discriminator="repo_type", + description="The repo data such as the current branch and commit.", + ), + ] = None + repo_code_hash: Annotated[ + Optional[str], + Field(description="The hash of the repo diff. Can be omitted if there is no repo diff."), + ] = None + repo_dir: Annotated[ + Optional[str], + Field( + description=( + "The repo path inside the container. Relative paths are resolved" + " relative to the working directory." 
+ ) + ), + ] = None + file_archives: Annotated[ + list[FileArchiveMapping], + Field(description="The list of file archive ID to container path mappings."), + ] = [] + working_dir: Optional[str] = None + """`working_dir` is kept for compatibility with old clients that still send it, even though the + server uses `configuration.working_dir` since 0.19.27 and ignores this field. + """ + configuration_path: Annotated[ + Optional[str], + Field( + description=( + "The path to the run configuration YAML file." + " It can be omitted when using the programmatic API." + ) + ), + ] = None configuration: Annotated[AnyRunConfiguration, Field(discriminator="type")] - profile: Profile - ssh_key_pub: str - # TODO: make merged_profile a computed field after migrating to pydanticV2 + profile: Annotated[Optional[Profile], Field(description="The profile parameters")] = None + ssh_key_pub: Annotated[ + Optional[str], + Field( + description="The contents of the SSH public key that will be used to connect to the run." + " Can be empty only before the run is submitted." + ), + ] = None + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. merged_profile: Annotated[Profile, Field(exclude=True)] = None - - class Config: - @staticmethod - def schema_extra(schema: Dict[str, Any], model: Type) -> None: - prop = schema.get("properties", {}) - prop.pop("merged_profile", None) + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. 
@root_validator
def _validate_dynamo_no_retry(cls, values) -> Dict:
    """Forbid `retry` on services that contain a Dynamo router replica group.

    Dynamo workers cache the router's internal IP at provisioning time. A retry
    would produce a new router, likely with a new internal IP, and the workers
    would stay bound to a router that no longer exists.

    Reads `merged_profile` and `configuration` from `values`. The check is
    skipped when there is no merged profile, when the merged profile has no
    `retry`, or when the configuration is not a `ServiceConfiguration`.

    Returns:
        `values`, unchanged.

    Raises:
        ValueError: if `retry` is set and any replica group has a router of
            type `RouterType.DYNAMO`.
    """
    profile = values.get("merged_profile")
    configuration = values.get("configuration")
    retry_requested = profile is not None and profile.retry is not None
    if retry_requested and isinstance(configuration, ServiceConfiguration):
        has_dynamo_router = any(
            group.router is not None and group.router.type == RouterType.DYNAMO
            for group in configuration.replica_groups
        )
        if has_dynamo_router:
            raise ValueError(
                "Retry cannot be configured for services with a Dynamo "
                "router replica group. The router's address must remain "
                "stable for the life of the run; allowing retry would "
                "leave workers bound to a router that no longer exists. "
                "Remove `retry` from the profile/configuration and "
                "re-apply."
            )
    return values
def is_deployment_in_progress(self) -> bool:
    """Tell whether any job is still running a different deployment.

    Only the last submission of each job in `self.jobs` is inspected.

    Returns:
        `True` if at least one job's last submission is not finished and its
        `deployment_num` differs from `self.deployment_num`; `False` otherwise,
        including when there are no jobs.
    """
    for job in self.jobs:
        latest_submission = job.job_submissions[-1]
        if latest_submission.status.is_finished():
            # Finished submissions never count, whatever their deployment number.
            continue
        if latest_submission.deployment_num != self.deployment_num:
            return True
    return False
def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
    """Return the service port for a job.

    Args:
        job_spec: The job spec; its `service_port` is used when it is set.
        configuration: The service configuration; `port.container_port` is the
            fallback when `job_spec.service_port` is `None`.

    Returns:
        `job_spec.service_port`, or `configuration.port.container_port` if the
        former is `None`.
    """
    port = job_spec.service_port
    if port is not None:
        return port
    # Compatibility with pre-0.19.19 job specs that do not have the `service_port` property.
    # TODO: drop when pre-0.19.19 jobs are no longer relevant.
    return configuration.port.container_port
ServerInfo(CoreModel): + server_version: Optional[str] diff --git a/src/dstack/_internal/core/models/services.py b/src/dstack/_internal/core/models/services.py new file mode 100644 index 0000000000..8aa6395776 --- /dev/null +++ b/src/dstack/_internal/core/models/services.py @@ -0,0 +1,76 @@ +""" +Data structures related to `type: service` runs. +""" + +from typing import Optional, Union + +from pydantic import Field +from typing_extensions import Annotated, Literal + +from dstack._internal.core.models.common import CoreModel + + +class BaseChatModel(CoreModel): + type: Annotated[Literal["chat"], Field(description="The type of the model")] = "chat" + name: Annotated[str, Field(description="The name of the model")] + format: Annotated[ + str, Field(description="The serving format. Supported values include `openai` and `tgi`") + ] + + +class TGIChatModel(BaseChatModel): + """ + Mapping of the model for the OpenAI-compatible endpoint. + + Attributes: + type (str): The type of the model, e.g. "chat" + name (str): The name of the model. This name will be used both to load model configuration from the HuggingFace Hub and in the OpenAI-compatible endpoint. + format (str): The format of the model, e.g. "tgi" if the model is served with HuggingFace's Text Generation Inference. + chat_template (Optional[str]): The custom prompt template for the model. If not specified, the default prompt template from the HuggingFace Hub configuration will be used. + eos_token (Optional[str]): The custom end of sentence token. If not specified, the default end of sentence token from the HuggingFace Hub configuration will be used. + """ + + format: Annotated[ + Literal["tgi"], Field(description="The serving format. Must be set to `tgi`") + ] + chat_template: Annotated[ + Optional[str], + Field( + description=( + "The custom prompt template for the model." 
+ " If not specified, the default prompt template" + " from the HuggingFace Hub configuration will be used" + ) + ), + ] = None # will be set before registering the service + eos_token: Annotated[ + Optional[str], + Field( + description=( + "The custom end of sentence token." + " If not specified, the default end of sentence token" + " from the HuggingFace Hub configuration will be used" + ) + ), + ] = None + + +class OpenAIChatModel(BaseChatModel): + """ + Mapping of the model for the OpenAI-compatible endpoint. + + Attributes: + type (str): The type of the model, e.g. "chat" + name (str): The name of the model. This name will be used both to load model configuration from the HuggingFace Hub and in the OpenAI-compatible endpoint. + format (str): The format of the model, i.e. "openai". + prefix (str): The `base_url` prefix: `http://hostname/{prefix}/chat/completions`. Defaults to `/v1`. + """ + + format: Annotated[ + Literal["openai"], Field(description="The serving format. Must be set to `openai`") + ] + prefix: Annotated[str, Field(description="The `base_url` prefix (after hostname)")] = "/v1" + + +ChatModel = Annotated[Union[TGIChatModel, OpenAIChatModel], Field(discriminator="format")] +AnyModel = Union[ChatModel] # embeddings and etc. 
diff --git a/src/dstack/_internal/core/models/templates.py b/src/dstack/_internal/core/models/templates.py new file mode 100644 index 0000000000..8ca52eb785 --- /dev/null +++ b/src/dstack/_internal/core/models/templates.py @@ -0,0 +1,71 @@ +from typing import Annotated, Any, Dict, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class BaseUITemplateParameter(CoreModel): + """Base for all UI template parameters.""" + + pass + + +class NameUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["name"], Field(description="The parameter type")] + + +class IDEUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["ide"], Field(description="The parameter type")] + + +class ResourcesUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["resources"], Field(description="The parameter type")] + + +class PythonOrDockerUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["python_or_docker"], Field(description="The parameter type")] + + +class RepoUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["repo"], Field(description="The parameter type")] + + +class WorkingDirUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["working_dir"], Field(description="The parameter type")] + + +class EnvUITemplateParameter(BaseUITemplateParameter): + type: Annotated[Literal["env"], Field(description="The parameter type")] + title: Annotated[Optional[str], Field(description="The display title")] = None + name: Annotated[Optional[str], Field(description="The environment variable name")] = None + value: Annotated[Optional[str], Field(description="The default value")] = None + + +AnyUITemplateParameter = Annotated[ + Union[ + NameUITemplateParameter, + IDEUITemplateParameter, + ResourcesUITemplateParameter, + PythonOrDockerUITemplateParameter, + RepoUITemplateParameter, + WorkingDirUITemplateParameter, + 
class UnixUser(CoreModel):
    """A Unix user/group reference, each given by numeric ID or by name."""

    uid: Annotated[Optional[int], Field(description="User ID", ge=0)] = None
    gid: Annotated[Optional[int], Field(description="Group ID", ge=0)] = None
    username: Annotated[Optional[str], Field(description="User name", min_length=1)] = None
    groupname: Annotated[Optional[str], Field(description="Group name", min_length=1)] = None

    @classmethod
    def parse(cls, v: str) -> Self:
        """
        Parse the user format used by Docker: a user name or numeric ID,
        optionally followed by `:` and a group name or numeric ID.

        Raises:
            ValueError: with an `invalid user format: ` prefix, if the string
                cannot be parsed.
        """
        try:
            parsed = cls._parse(v)
        except ValueError as e:
            raise ValueError(f"invalid user format: {e}")
        return parsed

    @classmethod
    def _parse(cls, v: str) -> Self:
        """
        Split `v` on `:` and classify each part as a numeric ID or a name.

        A part that `int()` accepts is treated as an ID, anything else as a name.

        Raises:
            ValueError: if there are more than two parts, a part is empty,
                or a numeric ID is negative.
        """
        user_part, *group_parts = v.split(":")
        if len(group_parts) > 1:
            raise ValueError("too many parts")

        uid: Optional[int] = None
        gid: Optional[int] = None
        username: Optional[str] = None
        groupname: Optional[str] = None

        if not user_part:
            raise ValueError("empty user name or id")
        try:
            uid = int(user_part)
        except ValueError:
            username = user_part
        else:
            # Raised outside the `try`, so it is not mistaken for a non-numeric name.
            if uid < 0:
                raise ValueError(f"negative uid {uid}")

        if group_parts:
            group_part = group_parts[0]
            if not group_part:
                raise ValueError("empty group name or id")
            try:
                gid = int(group_part)
            except ValueError:
                groupname = group_part
            else:
                if gid < 0:
                    raise ValueError(f"negative gid {gid}")

        return cls(uid=uid, gid=gid, username=username, groupname=groupname)
class VolumeStatus(str, Enum):
    """Volume status values; `FAILED` is the only finished status."""

    SUBMITTED = "submitted"
    PROVISIONING = "provisioning"
    """`PROVISIONING` is currently not used because on all backends supporting volumes,
    volumes become `ACTIVE` almost immediately after provisioning.
    """
    ACTIVE = "active"
    FAILED = "failed"

    @classmethod
    def finished_statuses(cls) -> List["VolumeStatus"]:
        """Return the statuses considered finished."""
        finished = [cls.FAILED]
        return finished

    def is_active(self) -> bool:
        """Return `True` unless this status is one of `finished_statuses()`."""
        return all(self != status for status in self.finished_statuses())
+ """ + return None + + @property + def is_external(self) -> bool: + return self.external_volume_id is not None + + @property + def size_gb(self) -> int: + return int(get_or_error(self.size)) + + +class VolumeConfigurationWithRegion(BaseVolumeConfiguration): + region: Annotated[str, Field(description="The volume region")] + + +class VolumeConfigurationWithAvailibilityZone(VolumeConfigurationWithRegion): + availability_zone: Annotated[ + Optional[str], Field(description="The volume availability zone") + ] = None + + +class VolumeConfigurationWithVolumeID(BaseVolumeConfiguration): volume_id: Annotated[ Optional[str], Field(description="The volume ID. Must be specified when registering external volumes"), ] = None + @property + def external_volume_id(self) -> Optional[str]: + return self.volume_id + + +class AWSVolumeConfiguration( + VolumeConfigurationWithAvailibilityZone, VolumeConfigurationWithVolumeID +): + backend: Annotated[Literal[BackendType.AWS], Field(description="The volume backend")] = ( + BackendType.AWS + ) + + +class GCPVolumeConfiguration( + VolumeConfigurationWithAvailibilityZone, VolumeConfigurationWithVolumeID +): + backend: Annotated[Literal[BackendType.GCP], Field(description="The volume backend")] = ( + BackendType.GCP + ) + + +class RunpodVolumeConfiguration(VolumeConfigurationWithRegion, VolumeConfigurationWithVolumeID): + backend: Annotated[Literal[BackendType.RUNPOD], Field(description="The volume backend")] = ( + BackendType.RUNPOD + ) + availability_zone: Annotated[Optional[str], Field(exclude=True)] = None + """Runpod doesn't have AZs but we accept this field for compatibility with older clients.""" + + +class KubernetesVolumeConfiguration(VolumeConfigurationWithRegion): + backend: Annotated[ + Literal[BackendType.KUBERNETES], Field(description="The volume backend") + ] = BackendType.KUBERNETES + region: Annotated[str, Field(description="The volume region (cluster)")] = "" + """`region` uses a default value for backward compatibility.""" + 
def parse_volume_configuration(data: dict) -> AnyVolumeConfiguration:
    """Parse a raw dict into a volume configuration.

    The dict is validated through the `VolumeConfiguration` wrapper model and
    the wrapped `__root__` value is returned.

    Raises:
        ConfigurationError: wrapping the pydantic `ValidationError` if `data`
            does not validate.
    """
    try:
        wrapper = VolumeConfiguration.parse_obj(data)
    except ValidationError as e:
        raise ConfigurationError(e)
    return wrapper.__root__
"""`attachable` shows whether the volume should be attached and detached manually.""" + detachable: bool = True + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" class VolumeAttachmentData(CoreModel): device_name: Optional[str] = None +class VolumeInstance(CoreModel): + name: str + fleet_name: Optional[str] = None + instance_num: int + instance_id: Optional[str] = None + + +class VolumeAttachment(CoreModel): + instance: VolumeInstance + attachment_data: Optional[VolumeAttachmentData] = None + + class Volume(CoreModel): + id: uuid.UUID name: str + user: str project_name: str - configuration: VolumeConfiguration + configuration: Annotated[AnyVolumeConfiguration, Field(discriminator="backend")] external: bool created_at: datetime + last_processed_at: datetime status: VolumeStatus status_message: Optional[str] = None - volume_id: Optional[str] = None # id of the volume in the cloud + deleted: bool + deleted_at: Optional[datetime] = None + volume_id: Optional[str] = None + """`volume_id` is the volume identifier in the cloud provider.""" provisioning_data: Optional[VolumeProvisioningData] = None + cost: float = 0 + attachments: Optional[List[VolumeAttachment]] = None attachment_data: Optional[VolumeAttachmentData] = None - volume_model_id: uuid.UUID # uuid of VolumeModel + """`attachment_data` is deprecated in favor of `attachments`. + It is only set for volumes that were attached before attachments were introduced. 
+ """ + + def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]: + if self.attachments is not None: + for attachment in self.attachments: + if attachment.instance.instance_id == instance_id: + return attachment.attachment_data + # volume was attached before attachments were introduced + return self.attachment_data + + def get_backend(self) -> BackendType: + return self.configuration.backend + + def get_region(self) -> str: + """ + Returns the volume region or an empty string if the volume (that is, its backend) + has no such thing as a "region". + """ + if isinstance(self.configuration, VolumeConfigurationWithRegion): + return self.configuration.region + return "" + + def get_availability_zone(self) -> Optional[str]: + """ + Returns the volume availability zone or `None` if: + * the volume (that is, its backend) has no such thing as an "availability zone" + * `VolumeProvisioningData` is not set for some reason + """ + if self.provisioning_data is None: + return None + return self.provisioning_data.availability_zone + + +class VolumePlan(CoreModel): + project_name: str + user: str + spec: VolumeSpec + current_resource: Optional[Volume] + + +def _split_mount_point(mount_point: str) -> Tuple[str, str]: + parts = mount_point.split(":") + if len(parts) != 2: + raise ValueError(f"invalid mount point format: {mount_point}") + src, dest = parts + return src, dest + + +def _validate_mount_point_path(path: str) -> str: + if not path: + raise ValueError("empty path") + _path = PurePosixPath(path) + if not _path.is_absolute(): + raise ValueError(f"path must be absolute: {path}") + if ".." in _path.parts: + raise ValueError(f".. 
are not allowed: {path}") + return str(_path) class VolumeMountPoint(CoreModel): - name: Annotated[str, Field(description="The name of the volume to mount")] - path: Annotated[str, Field(description="The container path to mount the volume at")] + name: Annotated[ + Union[str, List[str]], + Field( + description=( + "The network volume name or the list of network volume names to mount." + " If a list is specified, one of the volumes in the list will be mounted." + " Specify volumes from different backends/regions to increase availability" + ) + ), + ] + path: Annotated[str, Field(description="The absolute container path to mount the volume at")] + + _validate_path = validator("path", allow_reuse=True)(_validate_mount_point_path) + + @classmethod + def parse(cls, v: str) -> Self: + name, path = _split_mount_point(v) + return cls(name=name, path=path) + + +class InstanceMountPoint(CoreModel): + instance_path: Annotated[str, Field(description="The absolute path on the instance (host)")] + path: Annotated[str, Field(description="The absolute path in the container")] + optional: Annotated[ + bool, + Field( + description=( + "Allow running without this volume" + " in backends that do not support instance volumes" + ), + ), + ] = False + + _validate_instance_path = validator("instance_path", allow_reuse=True)( + _validate_mount_point_path + ) + _validate_path = validator("path", allow_reuse=True)(_validate_mount_point_path) + + @classmethod + def parse(cls, v: str) -> Self: + instance_path, path = _split_mount_point(v) + return cls(instance_path=instance_path, path=path) + + +MountPoint = Union[VolumeMountPoint, InstanceMountPoint] + + +def parse_mount_point(v: str) -> MountPoint: + src, dest = _split_mount_point(v) + if "/" in src: + return InstanceMountPoint(instance_path=src, path=dest) + return VolumeMountPoint(name=src, path=dest) diff --git a/src/dstack/_internal/core/services/__init__.py b/src/dstack/_internal/core/services/__init__.py index f41d7f384a..6d698e1228 
def is_valid_dstack_resource_name(resource_name: str) -> bool:
    """Check whether `resource_name` is a valid dstack resource name.

    A valid name is a lowercase letter followed by 1 to 40 characters, each a
    lowercase letter, a digit, or `-`. This is the rule quoted in the error
    message of `validate_dstack_resource_name`.

    Args:
        resource_name: The candidate name.

    Returns:
        `True` if the whole string satisfies the rule, `False` otherwise.
    """
    # `re.fullmatch` rather than `re.match` with `^...$`: `$` also matches just
    # before a trailing newline, so a name such as "my-run\n" used to be accepted.
    return re.fullmatch("[a-z][a-z0-9-]{1,40}", resource_name) is not None


def is_valid_replica_group_name(name: str) -> bool:
    """Check whether `name` is a valid replica group name.

    A valid name is a lowercase letter or digit followed by up to 39 characters,
    each a lowercase letter, a digit, or `-`.

    Args:
        name: The candidate name.

    Returns:
        `True` if the whole string satisfies the rule, `False` otherwise.
    """
    # `re.fullmatch` so that a trailing newline is rejected (see above).
    return re.fullmatch("[a-z0-9][a-z0-9-]{0,39}", name) is not None
configure_project(self, name: str, url: str, token: str, default: bool): if len(self.config.projects) == 1: self.config.projects[0].default = True + def list_project_configs(self) -> list[ProjectConfig]: + return self.config.projects + def delete_project(self, name: str): self.config.projects = [p for p in self.config.projects if p.name != name] - def save_repo_config( - self, repo_path: PathLike, repo_id: str, repo_type: RepoType, ssh_key_path: PathLike - ): - self.config_filepath.parent.mkdir(parents=True, exist_ok=True) - with filelock.FileLock(str(self.config_filepath) + ".lock"): - self.load() - repo_path = os.path.abspath(repo_path) - ssh_key_path = os.path.abspath(ssh_key_path) - for repo in self.config.repos: - if repo.path == repo_path: - repo.repo_id = repo_id - repo.repo_type = repo_type - repo.ssh_key_path = ssh_key_path - break - else: - self.config.repos.append( - RepoConfig( - path=repo_path, - repo_id=repo_id, - repo_type=repo_type, - ssh_key_path=ssh_key_path, - ) - ) - self.save() - - def get_repo_config(self, repo_path: PathLike) -> Optional[RepoConfig]: - repo_path = os.path.abspath(repo_path) - # TODO look at parent directories - for repo in self.config.repos: - if repo.path == repo_path: - return repo - return None - @property def dstack_ssh_dir(self) -> Path: return self.dstack_dir / "ssh" @property def dstack_key_path(self) -> Path: + # TODO: Remove since 0.19.40 return self.dstack_ssh_dir / "id_rsa" @property @@ -113,9 +85,7 @@ def dstack_ssh_config_path(self) -> Path: return self.dstack_ssh_dir / "config" -def update_default_project( - project_name: str, url: str, token: str, default: bool, no_default: bool -): +def update_default_project(project_name: str, url: str, token: str, yes: bool, no: bool): config_manager = ConfigManager() default_project = config_manager.get_project_config() config_dir = str(config_manager.config_filepath).replace(os.path.expanduser("~"), "~", 1) @@ -128,12 +98,13 @@ def update_default_project( set_it_as_default 
= ( ( default_project is None - or default + or yes or confirm_ask( - f"Update the [code]{project_name}[/] project in [code]{config_dir}[/]?" + f"Update the [code]{project_name}[/] project in [code]{config_dir}[/]?", + default=False, ) ) - if not no_default + if not no else False ) if set_it_as_default: diff --git a/src/dstack/_internal/core/services/diff.py b/src/dstack/_internal/core/services/diff.py new file mode 100644 index 0000000000..321d97f5d4 --- /dev/null +++ b/src/dstack/_internal/core/services/diff.py @@ -0,0 +1,94 @@ +from typing import Any, Optional, TypeVar, Union + +from pydantic import BaseModel + +from dstack._internal.core.models.common import CoreModel, IncludeExcludeType + + +class ModelFieldDiff(CoreModel): + old: Any + new: Any + + +ModelDiff = dict[str, Union[ModelFieldDiff, "ModelDiff"]] + + +# TODO: calculate nested diffs +def diff_models( + old: BaseModel, new: BaseModel, reset: Optional[IncludeExcludeType] = None +) -> ModelDiff: + """ + Returns a diff of model instances fields. + + The fields specified in the `reset` option are reset to their default values, effectively + excluding them from comparison (assuming that the default value is equal to itself, e.g, + `None == None`, `"task" == "task"`, but `math.nan != math.nan`). + + Args: + old: The "old" model instance. + new: The "new" model instance. + reset: Fields to reset to their default values before comparison. 
def flatten_diff_fields(diff: ModelDiff, prefix: str = "") -> list[str]:
    """
    Recursively collects all field paths from a diff.

    A value that is a `ModelFieldDiff` contributes its own path; any other value
    is treated as a nested diff and traversed. Paths appear in the iteration
    order of the dicts, depth first.

    Args:
        diff: The diff to traverse.
        prefix: The path prepended (with a dot) to every collected path.
            Nothing is prepended when it is empty.

    Returns:
        A list of field paths, each path with dot-separated parts.
    """

    def _walk(node, parent_path):
        for name, value in node.items():
            path = name if not parent_path else f"{parent_path}.{name}"
            if isinstance(value, ModelFieldDiff):
                yield path
            else:
                yield from _walk(value, path)

    return list(_walk(diff, prefix))
904ae7d1c0..71ed2e520e 100644 --- a/src/dstack/_internal/core/services/profiles.py +++ b/src/dstack/_internal/core/services/profiles.py @@ -1,24 +1,18 @@ -from typing import Optional +from typing import Optional, Tuple -from dstack._internal.core.models.profiles import DEFAULT_RETRY_DURATION, Profile, RetryEvent +from dstack._internal.core.models.profiles import ( + DEFAULT_RETRY_DURATION, + Profile, + RetryEvent, + TerminationPolicy, +) from dstack._internal.core.models.runs import Retry def get_retry(profile: Profile) -> Optional[Retry]: profile_retry = profile.retry if profile_retry is None: - # Handle retry_policy before retry was introduced - # TODO: Remove once retry_policy no longer supported - profile_retry_policy = profile.retry_policy - if profile_retry_policy is None: - return None - if not profile_retry_policy.retry: - return None - duration = profile_retry_policy.duration or DEFAULT_RETRY_DURATION - return Retry( - on_events=[RetryEvent.NO_CAPACITY, RetryEvent.INTERRUPTION, RetryEvent.ERROR], - duration=duration, - ) + return None if isinstance(profile_retry, bool): if profile_retry: return Retry( @@ -27,6 +21,26 @@ def get_retry(profile: Profile) -> Optional[Retry]: ) return None profile_retry = profile_retry.copy() + if profile_retry.on_events is None: + profile_retry.on_events = [ + RetryEvent.NO_CAPACITY, + RetryEvent.INTERRUPTION, + RetryEvent.ERROR, + ] if profile_retry.duration is None: profile_retry.duration = DEFAULT_RETRY_DURATION return Retry.parse_obj(profile_retry) + + +def get_termination( + profile: Profile, default_termination_idle_time: int +) -> Tuple[TerminationPolicy, int]: + termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + termination_idle_time = default_termination_idle_time + if profile.idle_duration is not None and profile.idle_duration < 0: + termination_policy = TerminationPolicy.DONT_DESTROY + elif profile.idle_duration is not None: + termination_idle_time = profile.idle_duration + if termination_policy == 
TerminationPolicy.DONT_DESTROY: + termination_idle_time = -1 + return termination_policy, termination_idle_time diff --git a/src/dstack/_internal/core/services/repos.py b/src/dstack/_internal/core/services/repos.py index 51fb57ed7c..dd02d93de9 100644 --- a/src/dstack/_internal/core/services/repos.py +++ b/src/dstack/_internal/core/services/repos.py @@ -1,137 +1,230 @@ import os +from contextlib import suppress from pathlib import Path -from typing import Optional, Union +from tempfile import NamedTemporaryFile +from typing import Optional import git -import requests +import git.cmd import yaml -from git.exc import GitCommandError - -from dstack._internal.core.errors import DstackError -from dstack._internal.core.models.config import RepoConfig -from dstack._internal.core.models.repos import ( - LocalRepo, - RemoteRepo, - RemoteRepoCreds, - RemoteRunRepoData, -) -from dstack._internal.core.models.repos.base import RepoProtocol + +from dstack._internal.core.errors import RepoInvalidCredentialsError +from dstack._internal.core.models.repos import RemoteRepoCreds +from dstack._internal.core.models.repos.remote import GitRepoURL +from dstack._internal.utils.logging import get_logger from dstack._internal.utils.path import PathLike -from dstack._internal.utils.ssh import ( - get_host_config, - make_ssh_command_for_git, - try_ssh_key_passphrase, -) +from dstack._internal.utils.ssh import get_host_config, make_git_env, try_ssh_key_passphrase + +logger = get_logger(__name__) gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml") default_ssh_key = os.path.expanduser("~/.ssh/id_rsa") -class InvalidRepoCredentialsError(DstackError): - pass - - -def get_local_repo_credentials( - repo_data: RemoteRunRepoData, +def get_repo_creds_and_default_branch( + repo_url: str, identity_file: Optional[PathLike] = None, + private_key: Optional[str] = None, oauth_token: Optional[str] = None, - original_hostname: Optional[str] = None, -) -> RemoteRepoCreds: - url = 
repo_data.make_url(RepoProtocol.HTTPS) # no auth - r = requests.get(f"{url}/info/refs?service=git-upload-pack", timeout=10) - if r.status_code == 200: - return RemoteRepoCreds(protocol=RepoProtocol.HTTPS, private_key=None, oauth_token=None) - - if identity_file is not None: # must fail if key is invalid - identity_file = os.path.expanduser(identity_file) - try: # user provided ssh key - return check_remote_repo_credentials( - repo_data, RepoProtocol.SSH, identity_file=identity_file +) -> tuple[RemoteRepoCreds, Optional[str]]: + url = GitRepoURL.parse(repo_url, get_ssh_config=get_host_config) + + # no auth + with suppress(RepoInvalidCredentialsError): + creds, default_branch = _get_repo_creds_and_default_branch_https(url) + logger.debug( + "Git repo %s is public. Using no auth. Default branch: %s", repo_url, default_branch + ) + return creds, default_branch + + # ssh key provided by the user or pulled from the server + if identity_file is not None or private_key is not None: + if identity_file is not None: + private_key = _read_private_key(identity_file) + creds, default_branch = _get_repo_creds_and_default_branch_ssh( + url, identity_file, private_key ) - except GitCommandError: - url = repo_data.make_url(RepoProtocol.SSH) - raise InvalidRepoCredentialsError( - f"Can't access `{url}` using the `{identity_file}` private SSH key" + logger.debug( + "Git repo %s is private. Using identity file: %s. Default branch: %s", + repo_url, + identity_file, + default_branch, ) + return creds, default_branch + elif private_key is not None: + with NamedTemporaryFile("w+", 0o600) as f: + f.write(private_key) + f.flush() + creds, default_branch = _get_repo_creds_and_default_branch_ssh( + url, f.name, private_key + ) + masked_key = "***" + private_key[-10:] if len(private_key) > 10 else "***MASKED***" + logger.debug( + "Git repo %s is private. Using private key: %s. 
Default branch: %s", + repo_url, + masked_key, + default_branch, + ) + return creds, default_branch + else: + assert False, "should not reach here" + # oauth token provided by the user or pulled from the server if oauth_token is not None: - try: # user provided oauth token - return check_remote_repo_credentials( - repo_data, RepoProtocol.HTTPS, oauth_token=oauth_token - ) - except GitCommandError: - url = repo_data.make_url(RepoProtocol.SSH, oauth_token) - masked = len(oauth_token[:-4]) * "*" + oauth_token[-4:] - raise InvalidRepoCredentialsError(f"Can't access `{url}` using the `{masked}` token") - - identities = get_host_config(original_hostname or repo_data.repo_host_name).get("identityfile") - if identities: # must fail if key is invalid - try: # key from ssh config - return check_remote_repo_credentials( - repo_data, RepoProtocol.SSH, identity_file=identities[0] + creds, default_branch = _get_repo_creds_and_default_branch_https(url, oauth_token) + masked_token = ( + len(oauth_token[:-4]) * "*" + oauth_token[-4:] + if len(oauth_token) > 4 + else "***MASKED***" + ) + logger.debug( + "Git repo %s is private. Using provided OAuth token: %s. Default branch: %s", + repo_url, + masked_token, + default_branch, + ) + return creds, default_branch + + # key from ssh config + identities = get_host_config(url.original_host).get("identityfile") + if identities: + _identity_file = identities[0] + with suppress(RepoInvalidCredentialsError): + _private_key = _read_private_key(_identity_file) + creds, default_branch = _get_repo_creds_and_default_branch_ssh( + url, _identity_file, _private_key ) - except GitCommandError: - url = repo_data.make_url(RepoProtocol.SSH, oauth_token) - raise InvalidRepoCredentialsError( - f"Can't access `{url}` using the `{identities[0]}` SSH private key" + logger.debug( + "Git repo %s is private. Using SSH config identity file: %s. 
Default branch: %s", + repo_url, + _identity_file, + default_branch, ) + return creds, default_branch + # token from gh config if os.path.exists(gh_config_path): with open(gh_config_path, "r") as f: gh_hosts = yaml.load(f, Loader=yaml.FullLoader) - oauth_token = gh_hosts.get(repo_data.repo_host_name, {}).get("oauth_token") - if oauth_token is not None: - try: # token from gh config - return check_remote_repo_credentials( - repo_data, RepoProtocol.HTTPS, oauth_token=oauth_token + _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token") + if _oauth_token is not None: + with suppress(RepoInvalidCredentialsError): + creds, default_branch = _get_repo_creds_and_default_branch_https(url, _oauth_token) + masked_token = ( + len(_oauth_token[:-4]) * "*" + _oauth_token[-4:] + if len(_oauth_token) > 4 + else "***MASKED***" + ) + logger.debug( + "Git repo %s is private. Using GitHub config token: %s from %s. Default branch: %s", + repo_url, + masked_token, + gh_config_path, + default_branch, ) - except GitCommandError: - pass + return creds, default_branch + # default user key if os.path.exists(default_ssh_key): - try: # default user key - return check_remote_repo_credentials( - repo_data, RepoProtocol.SSH, identity_file=default_ssh_key + with suppress(RepoInvalidCredentialsError): + _private_key = _read_private_key(default_ssh_key) + creds, default_branch = _get_repo_creds_and_default_branch_ssh( + url, default_ssh_key, _private_key ) - except GitCommandError: - pass - - raise InvalidRepoCredentialsError( - "No valid default Git credentials found: ensure passing a valid `--token` or `--git-identity` to `dstack init`." 
- ) - - -def check_remote_repo_credentials( - repo_data: RemoteRunRepoData, - protocol: RepoProtocol, - *, - identity_file: Optional[PathLike] = None, - oauth_token: Optional[str] = None, -) -> RemoteRepoCreds: - url = repo_data.make_url(protocol, oauth_token) - if protocol == RepoProtocol.HTTPS: - git.cmd.Git().ls_remote(url, env=dict(GIT_TERMINAL_PROMPT="0")) - return RemoteRepoCreds(protocol=protocol, oauth_token=oauth_token, private_key=None) - elif protocol == RepoProtocol.SSH: - if not Path(identity_file).exists(): - raise InvalidRepoCredentialsError(f"The {identity_file} private SSH key doesn't exist") - if not os.access(identity_file, os.R_OK): - raise InvalidRepoCredentialsError(f"Can't access the {identity_file} private SSH key") - if not try_ssh_key_passphrase(identity_file): - raise InvalidRepoCredentialsError( - f"Can't access `{url}`: ensure the `{identity_file}` private SSH key is valid and passphrase-free" + logger.debug( + "Git repo %s is private. Using default identity file: %s. 
Default branch: %s", + repo_url, + default_ssh_key, + default_branch, ) - with open(identity_file, "r") as f: - private_key = f.read() - git.cmd.Git().ls_remote( - url, env=dict(GIT_SSH_COMMAND=make_ssh_command_for_git(identity_file)) + return creds, default_branch + + raise RepoInvalidCredentialsError() + + +def _get_repo_creds_and_default_branch_ssh( + url: GitRepoURL, identity_file: PathLike, private_key: str +) -> tuple[RemoteRepoCreds, Optional[str]]: + _url = url.as_ssh() + env = _make_git_env_for_creds_check(identity_file=identity_file) + try: + default_branch = _get_repo_default_branch(_url, env) + except git.GitCommandError as e: + message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key" + raise RepoInvalidCredentialsError(message) from e + creds = RemoteRepoCreds( + clone_url=_url, + private_key=private_key, + oauth_token=None, + ) + return creds, default_branch + + +def _get_repo_creds_and_default_branch_https( + url: GitRepoURL, oauth_token: Optional[str] = None +) -> tuple[RemoteRepoCreds, Optional[str]]: + _url = url.as_https() + env = _make_git_env_for_creds_check() + try: + default_branch = _get_repo_default_branch(url.as_https(oauth_token), env) + except git.GitCommandError as e: + message = f"Cannot access `{_url}`" + if oauth_token is not None: + masked_token = len(oauth_token[:-4]) * "*" + oauth_token[-4:] + message = f"{message} using the `{masked_token}` token" + raise RepoInvalidCredentialsError(message) from e + creds = RemoteRepoCreds( + clone_url=_url, + private_key=None, + oauth_token=oauth_token, + ) + return creds, default_branch + + +def _make_git_env_for_creds_check(identity_file: Optional[PathLike] = None) -> dict[str, str]: + # Our goal is to check if _provided_ creds (if any) are correct, so we need to be sure that + # only the provided creds are used, without falling back to any additional mechanisms. + # To do this, we: + # 1. Disable all configs to ignore any stored creds + # 2. 
Disable askpass to avoid asking for creds interactively or fetching stored creds from + # a non-interactive askpass helper (for example, VS Code sets GIT_ASKPASS to its own helper, + # which silently provides creds to Git). + return make_git_env(disable_config=True, disable_askpass=True, identity_file=identity_file) + + +def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]: + # Git shipped by Apple with XCode is patched to support an additional config scope + # above "system" called "xcode". There is no option in `git config list` to show this config, + # but you can list the merged config (`git config list` without options) and then exclude + # all settings listed in `git config list --{system,global,local,worktree}`. + # As of time of writing, there are only two settings in the "xcode" config, one of which breaks + # our "is repo public?" check, namely "credential.helper=osxkeychain". + # As there is no way to disable "xcode" config (no env variable, no CLI option, etc.), + # the only way to disable credential helper is to override this specific setting with an empty + # string via command line argument: `git -c credential.helper= COMMAND [ARGS ...]`. 
+ # See: https://github.com/git/git/commit/3d4355712b9fe77a96ad4ad877d92dc7ff6e0874 + # See: https://gist.github.com/ChrisTollefson/ab9c0a5d1dd4dd615217345c6936a307 + _git = git.cmd.Git()(c="credential.helper=") + # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD" + output: str = _git.ls_remote("--symref", url, "HEAD", env=env) + for line in output.splitlines(): + # line format: ` TAB LF` + oid, _, ref = line.partition("\t") + if oid.startswith("ref:") and ref == "HEAD": + return oid.rsplit("/", maxsplit=1)[-1] + return None + + +def _read_private_key(identity_file: PathLike) -> str: + identity_file = Path(identity_file).expanduser().resolve() + if not Path(identity_file).exists(): + raise RepoInvalidCredentialsError(f"The `{identity_file}` private SSH key doesn't exist") + if not os.access(identity_file, os.R_OK): + raise RepoInvalidCredentialsError(f"Cannot access the `{identity_file}` private SSH key") + if not try_ssh_key_passphrase(identity_file): + raise RepoInvalidCredentialsError( + f"Cannot use the `{identity_file}` private SSH key. 
" + "Ensure that it is valid and passphrase-free" ) - return RemoteRepoCreds(protocol=protocol, private_key=private_key, oauth_token=None) - - -def load_repo(config: RepoConfig) -> Union[RemoteRepo, LocalRepo]: - if config.repo_type == "remote": - return RemoteRepo(repo_id=config.repo_id, local_repo_dir=config.path) - elif config.repo_type == "local": - return LocalRepo(repo_id=config.repo_id, repo_dir=config.path) - raise TypeError(f"Unknown repo_type: {config.repo_type}") + with open(identity_file, "r") as file: + return file.read() diff --git a/src/dstack/_internal/core/services/ssh/attach.py b/src/dstack/_internal/core/services/ssh/attach.py index bc60d31f19..7727aef5df 100644 --- a/src/dstack/_internal/core/services/ssh/attach.py +++ b/src/dstack/_internal/core/services/ssh/attach.py @@ -1,149 +1,304 @@ import atexit import re -import subprocess import time -from typing import Optional, Tuple +from pathlib import Path +from typing import Optional, Union + +import psutil from dstack._internal.core.errors import SSHError from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.services.configs import ConfigManager +from dstack._internal.core.services.ssh.client import get_ssh_client_info from dstack._internal.core.services.ssh.ports import PortsLock -from dstack._internal.core.services.ssh.tunnel import ClientTunnel -from dstack._internal.utils.path import PathLike -from dstack._internal.utils.ssh import get_ssh_config, include_ssh_config, update_ssh_config - - -class SSHAttach: - @staticmethod - def reuse_control_sock_path_and_port_locks(run_name: str) -> Optional[Tuple[str, PortsLock]]: - ssh_config_path = str(ConfigManager().dstack_ssh_config_path) - host_config = get_ssh_config(ssh_config_path, run_name) - if host_config and host_config.get("ControlPath"): - ps = subprocess.Popen(("ps", "-A", "-o", "command"), stdout=subprocess.PIPE) - control_sock_path = host_config.get("ControlPath") - output = 
subprocess.check_output(("grep", control_sock_path), stdin=ps.stdout) - ps.wait() - commands = list( - filter(lambda s: not s.startswith("grep"), output.decode().strip().split("\n")) - ) - if commands: - port_pattern = r"-L (\d+):localhost:(\d+)" - matches = re.findall(port_pattern, commands[0]) - return control_sock_path, PortsLock( - {int(local_port): int(target_port) for local_port, target_port in matches} - ) +from dstack._internal.core.services.ssh.tunnel import SSHTunnel, ports_to_forwarded_sockets +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import FilePath, PathLike +from dstack._internal.utils.ssh import ( + default_ssh_config_path, + get_host_config, + include_ssh_config, + normalize_path, + update_ssh_config, +) + +logger = get_logger(__name__) + +# ssh -L option format: [bind_address:]port:host:hostport +_SSH_TUNNEL_REGEX = re.compile(r"(?:[\w.-]+:)?(?P\d+):localhost:(?P\d+)") + + +class BaseSSHAttach: + """ + A base class for SSH attach implementations. + + Child classes must populate `self.hosts` inside overridden `__init__()` with at least one host + named as a `run_name` argument value. 
+ """ + + @classmethod + def get_control_sock_path(cls, run_name: str) -> Path: + return ConfigManager().dstack_ssh_dir / f"{run_name}.control.sock" + + @classmethod + def reuse_ports_lock(cls, run_name: str) -> Optional[PortsLock]: + ssh_client_info = get_ssh_client_info() + if not ssh_client_info.supports_control_socket: + raise SSHError("Unsupported SSH client") + ssh_exe = str(ssh_client_info.path) + control_sock_path = normalize_path(cls.get_control_sock_path(run_name)) + for process in psutil.process_iter(["cmdline"]): + cmdline = process.info["cmdline"] + if not (cmdline and cmdline[0] == ssh_exe and control_sock_path in cmdline): + continue + port_mapping: dict[int, int] = {} + cmdline_iter = iter(cmdline) + for arg in cmdline_iter: + if arg != "-L" or not (next_arg := next(cmdline_iter, None)): + continue + if match := _SSH_TUNNEL_REGEX.fullmatch(next_arg): + local_port, remote_port = match.group("local_port", "remote_port") + port_mapping[int(remote_port)] = int(local_port) + return PortsLock(port_mapping) return None def __init__( self, - hostname: str, - ssh_port: int, - user: str, - id_rsa_path: PathLike, - ports_lock: PortsLock, + *, run_name: str, - dockerized: bool, - ssh_proxy: Optional[SSHConnectionParams] = None, - control_sock_path: Optional[str] = None, - local_backend: bool = False, + identity_path: PathLike, + ports_lock: PortsLock, + destination: str, + service_port: Optional[int] = None, + bind_address: Optional[str] = None, ): + self._attached = False + self._hosts_added_to_ssh_config = False self._ports_lock = ports_lock self.ports = ports_lock.dict() self.run_name = run_name - self.ssh_config_path = str(ConfigManager().dstack_ssh_config_path) - self.tunnel = ClientTunnel( - run_name, - self.ports, - id_rsa_path=id_rsa_path, + self.ssh_config_path = ConfigManager().dstack_ssh_config_path + control_sock_path = self.get_control_sock_path(run_name) + # Cast all path-like values used in configs to FilePath instances for automatic + # path 
normalization in :func:`update_ssh_config`. + self.control_sock_path = FilePath(control_sock_path) + self.identity_file = FilePath(identity_path) + self.tunnel = SSHTunnel( + destination=destination, + identity=self.identity_file, + forwarded_sockets=ports_to_forwarded_sockets( + ports=self.ports, + bind_local=bind_address or "localhost", + ), control_sock_path=control_sock_path, ssh_config_path=self.ssh_config_path, + options={ + "ExitOnForwardFailure": "yes", + }, ) - self.ssh_proxy = ssh_proxy - if ssh_proxy is None: - self.host_config = { - "HostName": hostname, - "Port": ssh_port, - "User": user, - "IdentityFile": id_rsa_path, - "IdentitiesOnly": "yes", - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - } - else: - self.host_config = { - "HostName": ssh_proxy.hostname, - "Port": ssh_proxy.port, - "User": ssh_proxy.username, - "IdentityFile": id_rsa_path, - "IdentitiesOnly": "yes", - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - } - if dockerized and not local_backend: - self.container_config = { - "HostName": "localhost", - "Port": 10022, - "User": "root", - "IdentityFile": id_rsa_path, - "IdentitiesOnly": "yes", - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - "ControlPath": self.tunnel.control_sock_path, - "ControlMaster": "auto", - "ControlPersist": "yes", - "ProxyJump": f"{run_name}-host", - } - elif ssh_proxy is not None: - self.container_config = { - "HostName": hostname, - "Port": ssh_port, - "User": user, - "IdentityFile": id_rsa_path, - "IdentitiesOnly": "yes", - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - "ControlPath": self.tunnel.control_sock_path, - "ControlMaster": "auto", - "ControlPersist": "yes", - "ProxyJump": f"{run_name}-jump-host", - } - else: - self.container_config = None + self.service_port = service_port + self.hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {} + + def __enter__(self): + self.attach() + return self + + def 
__exit__(self, exc_type, exc_val, exc_tb): + self.detach() def attach(self): include_ssh_config(self.ssh_config_path) - if self.container_config is None: - update_ssh_config(self.ssh_config_path, self.run_name, self.host_config) - elif self.ssh_proxy is not None: - update_ssh_config(self.ssh_config_path, f"{self.run_name}-jump-host", self.host_config) - update_ssh_config(self.ssh_config_path, self.run_name, self.container_config) - else: - update_ssh_config(self.ssh_config_path, f"{self.run_name}-host", self.host_config) - update_ssh_config(self.ssh_config_path, self.run_name, self.container_config) + self._add_hosts_to_ssh_config() - max_retries = 10 self._ports_lock.release() + + max_retries = 10 for i in range(max_retries): try: self.tunnel.open() + self._attached = True atexit.register(self.detach) - break + return except SSHError: if i < max_retries - 1: time.sleep(1) - else: - self.detach() - raise SSHError("Can't connect to the remote host") + self._remove_hosts_from_ssh_config() + raise SSHError("Can't connect to the remote host") def detach(self): + self._remove_hosts_from_ssh_config() + if not self._attached: + logger.debug("Not attached") + return self.tunnel.close() - update_ssh_config(self.ssh_config_path, f"{self.run_name}-jump-host", {}) - update_ssh_config(self.ssh_config_path, f"{self.run_name}-host", {}) - update_ssh_config(self.ssh_config_path, self.run_name, {}) + self._attached = False + logger.debug("Detached") - def __enter__(self): - self.attach() - return self + def _add_hosts_to_ssh_config(self): + if self._hosts_added_to_ssh_config: + return + for host, options in self.hosts.items(): + update_ssh_config(self.ssh_config_path, host, options) + self._hosts_added_to_ssh_config = True - def __exit__(self, exc_type, exc_val, exc_tb): - self.detach() + def _remove_hosts_from_ssh_config(self): + if not self._hosts_added_to_ssh_config: + return + for host in self.hosts: + update_ssh_config(self.ssh_config_path, host, {}) + 
self._hosts_added_to_ssh_config = False + + +class SSHAttach(BaseSSHAttach): + """ + `SSHAttach` attaches to a job directly, via a backend-specific chain of hosts. + + Used when `dstack-sshproxy` is not configured on the server. + """ + + def __init__( + self, + *, + run_name: str, + identity_path: PathLike, + ports_lock: PortsLock, + hostname: str, + ssh_port: int, + container_ssh_port: int, + user: str, + container_user: str, + dockerized: bool, + ssh_proxy: Optional[SSHConnectionParams] = None, + service_port: Optional[int] = None, + bind_address: Optional[str] = None, + ): + super().__init__( + run_name=run_name, + identity_path=identity_path, + ports_lock=ports_lock, + destination=f"root@{run_name}", + service_port=service_port, + bind_address=bind_address, + ) + hosts = self.hosts + if dockerized: + if ssh_proxy is not None: + # SSH instance with jump host + # dstack has no IdentityFile for jump host, it must be either preconfigured + # in the ~/.ssh/config or loaded into ssh-agent + hosts[f"{run_name}-jump-host"] = { + "HostName": ssh_proxy.hostname, + "Port": ssh_proxy.port, + "User": ssh_proxy.username, + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + } + jump_host_config = get_host_config(ssh_proxy.hostname, default_ssh_config_path) + jump_host_identity_files = jump_host_config.get("identityfile") + if jump_host_identity_files: + hosts[f"{run_name}-jump-host"].update( + { + "IdentityFile": jump_host_identity_files[0], + "IdentitiesOnly": "yes", + } + ) + hosts[f"{run_name}-host"] = { + "HostName": hostname, + "Port": ssh_port, + "User": user, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + "ProxyJump": f"{run_name}-jump-host", + } + else: + # Regular SSH instance or VM-based cloud instance + hosts[f"{run_name}-host"] = { + "HostName": hostname, + "Port": ssh_port, + "User": user, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + 
"StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + } + hosts[run_name] = { + "HostName": "localhost", + "Port": container_ssh_port, + "User": container_user, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + "ProxyJump": f"{run_name}-host", + } + else: + if ssh_proxy is not None: + # Kubernetes + hosts[f"{run_name}-jump-host"] = { + "HostName": ssh_proxy.hostname, + "Port": ssh_proxy.port, + "User": ssh_proxy.username, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + } + hosts[run_name] = { + "HostName": hostname, + "Port": ssh_port, + "User": container_user, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + "ProxyJump": f"{run_name}-jump-host", + } + else: + # Container-based backends + hosts[run_name] = { + "HostName": hostname, + "Port": ssh_port, + "User": container_user, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + } + + +class SSHProxyAttach(BaseSSHAttach): + """ + `SSHProxyAttach` attaches to a job via `dstack-sshproxy`. + + Used when `dstack-sshproxy` is configured on the server. 
+ """ + + def __init__( + self, + *, + run_name: str, + identity_path: PathLike, + ports_lock: PortsLock, + hostname: str, + upstream_id: str, + port: Optional[int] = None, + service_port: Optional[int] = None, + bind_address: Optional[str] = None, + ): + super().__init__( + run_name=run_name, + identity_path=identity_path, + ports_lock=ports_lock, + destination=f"{upstream_id}_root@{run_name}", + service_port=service_port, + bind_address=bind_address, + ) + self.hosts[run_name] = { + "HostName": hostname, + "Port": port or 22, + "User": upstream_id, + "IdentityFile": self.identity_file, + "IdentitiesOnly": "yes", + } diff --git a/src/dstack/_internal/core/services/ssh/client.py b/src/dstack/_internal/core/services/ssh/client.py new file mode 100644 index 0000000000..2126ffcb8a --- /dev/null +++ b/src/dstack/_internal/core/services/ssh/client.py @@ -0,0 +1,113 @@ +import re +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple + +from dstack._internal.compat import IS_WINDOWS +from dstack._internal.core.errors import SSHError +from dstack._internal.utils.path import PathLike +from dstack._internal.utils.ssh import find_ssh_client + + +@dataclass +class SSHClientInfo: + # Path to `ssh` executable + path: Path + # Full version including portable suffix, e.g., "9.6p1" + version: str + # Base version not including portable suffix, e.g., (9, 6) + version_tuple: Tuple[int, ...] 
+ # True if OpenSSH_for_Windows (Microsoft's OpenSSH Portable fork) + for_windows: bool + # Supports Control{Master,Path,Persist} directives, but only for control purposes + # (e.g., `ssh -O exit`), cannot be used for connection multiplexing + supports_control_socket: bool + # Supports Control{Master,Path,Persist} for connection multiplexing + supports_multiplexing: bool + # Supports ForkAfterAuthentication (`ssh -f`) + supports_background_mode: bool + + RAW_VERSION_REGEX = re.compile( + r"OpenSSH_(?Pfor_Windows_)?(?P[\d.]+)(?Pp\d+)?", + flags=re.I, + ) + + @classmethod + def from_raw_version(cls, raw_version: str, path: Path) -> "SSHClientInfo": + match = cls.RAW_VERSION_REGEX.match(raw_version) + if not match: + raise ValueError("no match") + for_windows, base_version, portable_version = match.group( + "for_windows", "base_version", "portable_version" + ) + if portable_version: + version = f"{base_version}{portable_version}" + else: + version = base_version + return cls( + path=path, + version=version, + version_tuple=tuple(map(int, base_version.split("."))), + for_windows=bool(for_windows), + supports_control_socket=(not for_windows), + supports_multiplexing=(not IS_WINDOWS), + supports_background_mode=(not for_windows), + ) + + +def inspect_ssh_client(path: PathLike) -> SSHClientInfo: + """ + Inspects various aspects of a given SSH client — version, "flavor", features — by executing + and parsing `ssh -V`. + + :param path: a path of the ssh executable. + :return: :class:`SSHClientInfo` named tuple. + :raise dstack._internal.core.errors.SSHError: if path does not exist, `ssh -V` returns + non-zero exit status, or `ssh -V` output does not match the pattern. 
+ """ + path = Path(path).resolve() + try: + cp = subprocess.run( + [path, "-V"], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + timeout=5, + ) + except (OSError, subprocess.SubprocessError) as e: + raise SSHError(f"failed to execute `{path} -V`: {e}") from e + output = cp.stderr + if cp.returncode != 0: + raise SSHError(f"`{path} -V` returned non-zero exit status {cp.returncode}: {output}") + try: + return SSHClientInfo.from_raw_version(output, path) + except ValueError: + raise SSHError(f"failed to parse `{path} -V` output: {output}") + + +_ssh_client_info: Optional[SSHClientInfo] = None + + +def get_ssh_client_info() -> SSHClientInfo: + """ + Returns :class:`SSHClientInfo` for the default SSH client. The result is cached. + + :return: :class:`SSHClientInfo` named tuple. + :raise dstack._internal.core.errors.SSHError: if no ssh client found or the underlying + :func:`inspect_ssh_client` raises an error. + """ + global _ssh_client_info + if _ssh_client_info is not None: + return _ssh_client_info + path = find_ssh_client() + if path is None: + if IS_WINDOWS: + msg = "SSH client not found, install Git for Windows." + else: + msg = "SSH client not found." 
+ raise SSHError(msg) + _ssh_client_info = inspect_ssh_client(path) + if _ssh_client_info.for_windows: + raise SSHError("OpenSSH for Windows is not supported, install Git for Windows.") + return _ssh_client_info diff --git a/src/dstack/_internal/core/services/ssh/key_manager.py b/src/dstack/_internal/core/services/ssh/key_manager.py new file mode 100644 index 0000000000..322ad323cc --- /dev/null +++ b/src/dstack/_internal/core/services/ssh/key_manager.py @@ -0,0 +1,53 @@ +import os +from dataclasses import dataclass +from datetime import datetime, timedelta +from pathlib import Path +from typing import TYPE_CHECKING + +from dstack._internal.core.errors import ClientError + +if TYPE_CHECKING: + from dstack.api.server import APIClient + +KEY_REFRESH_RATE = timedelta(minutes=10) # redownload the key periodically in case it was rotated + + +@dataclass +class UserSSHKey: + public_key: str + private_key_path: Path + + +class UserSSHKeyManager: + def __init__(self, api_client: "APIClient", ssh_keys_dir: Path) -> None: + self._api_client = api_client + self._key_path = ssh_keys_dir / api_client.get_token_hash() + self._pub_key_path = self._key_path.with_suffix(".pub") + + def get_user_key(self) -> UserSSHKey: + """ + Return the up-to-date user key + """ + if ( + not self._key_path.exists() + or not self._pub_key_path.exists() + or datetime.now() - datetime.fromtimestamp(self._key_path.stat().st_mtime) + > KEY_REFRESH_RATE + ): + self._download_user_key() + return UserSSHKey( + public_key=self._pub_key_path.read_text(), private_key_path=self._key_path + ) + + def _download_user_key(self) -> None: + user = self._api_client.users.get_my_user() + if user.ssh_private_key is None or user.ssh_public_key is None: + raise ClientError("Server response does not contain user SSH key") + + def key_opener(path, flags): + return os.open(path, flags, 0o600) + + with open(self._key_path, "w", opener=key_opener) as f: + f.write(user.ssh_private_key) + with open(self._pub_key_path, "w") as 
f: + f.write(user.ssh_public_key) diff --git a/src/dstack/_internal/core/services/ssh/ports.py b/src/dstack/_internal/core/services/ssh/ports.py index 3d81d0f11d..1d41bcd2c6 100644 --- a/src/dstack/_internal/core/services/ssh/ports.py +++ b/src/dstack/_internal/core/services/ssh/ports.py @@ -2,6 +2,7 @@ import socket from typing import Dict, List, Optional +from dstack._internal.compat import IS_WINDOWS from dstack._internal.core.errors import DstackError from dstack._internal.core.models.configurations import PortMapping @@ -10,7 +11,9 @@ class PortUsedError(DstackError): - pass + def __init__(self, port: int): + self.port = port + super().__init__(f"Port {port} is already in use") class PortsLock: @@ -27,10 +30,10 @@ def acquire(self) -> "PortsLock": if not local_port: # None or 0 continue if local_port in assigned_ports: - raise PortUsedError(f"Port {local_port} is already in use") + raise PortUsedError(local_port) sock = self._listen(local_port) if sock is None: - raise PortUsedError(f"Port {local_port} is already in use") + raise PortUsedError(local_port) self.sockets[remote_port] = sock assigned_ports.add(local_port) @@ -52,7 +55,6 @@ def acquire(self) -> "PortsLock": def release(self) -> Dict[int, int]: mapping = self.dict() for sock in self.sockets.values(): - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) sock.close() self.sockets = {} return mapping @@ -73,6 +75,8 @@ def __str__(self) -> str: def _listen(port: int) -> Optional[socket.socket]: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + if IS_WINDOWS: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1) # type: ignore[attr-defined] sock.bind(("", port)) return sock except socket.error as e: diff --git a/src/dstack/_internal/core/services/ssh/tunnel.py b/src/dstack/_internal/core/services/ssh/tunnel.py index 35b2dc8f5f..f4d6a17f70 100644 --- a/src/dstack/_internal/core/services/ssh/tunnel.py +++ b/src/dstack/_internal/core/services/ssh/tunnel.py @@ -1,71 +1,279 
@@ +import abc +import asyncio import os import shlex import subprocess import tempfile -from typing import Dict, Optional +from dataclasses import dataclass +from typing import Dict, Iterable, List, Literal, NoReturn, Optional, Union +from dstack._internal.core.errors import SSHError from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.services.ssh import get_ssh_error +from dstack._internal.core.services.ssh.client import get_ssh_client_info +from dstack._internal.utils.common import run_async from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.path import PathLike +from dstack._internal.utils.path import FilePath, FilePathOrContent, PathLike +from dstack._internal.utils.ssh import normalize_path logger = get_logger(__name__) +SSH_TIMEOUT = 15 +SSH_DEFAULT_OPTIONS = { + "StrictHostKeyChecking": "no", + "UserKnownHostsFile": "/dev/null", + "ExitOnForwardFailure": "yes", + "StreamLocalBindUnlink": "yes", + "ConnectTimeout": "3", +} + + +class Socket(abc.ABC): + @abc.abstractmethod + def render(self) -> str: + pass + + +@dataclass +class UnixSocket(Socket): + path: PathLike + + def render(self) -> str: + return str(self.path) + + +@dataclass +class IPSocket(Socket): + host: str + port: int + + def render(self) -> str: + if ":" in self.host: # assuming IPv6 + return f"[{self.host}]:{self.port}" + return f"{self.host}:{self.port}" + + +@dataclass +class SocketPair: + local: Socket + remote: Socket class SSHTunnel: def __init__( self, - host: str, - id_rsa_path: PathLike, - ports: Dict[int, int], - control_sock_path: PathLike, - options: Dict[str, str], - ssh_config_path: str = "none", + destination: str, + identity: FilePathOrContent, + forwarded_sockets: Iterable[SocketPair] = (), + reverse_forwarded_sockets: Iterable[SocketPair] = (), + control_sock_path: Optional[PathLike] = None, + options: Dict[str, str] = SSH_DEFAULT_OPTIONS, + ssh_config_path: Union[PathLike, Literal["none"]] = "none", 
+ port: Optional[int] = None, + ssh_proxies: Iterable[tuple[SSHConnectionParams, Optional[FilePathOrContent]]] = (), + batch_mode: bool = False, ): """ - :param ports: Mapping { remote port -> local port } + :param forwarded_sockets: Connections to the specified local sockets will be + forwarded to their corresponding remote sockets + :param reverse_forwarded_sockets: Connections to the specified remote sockets + will be forwarded to their corresponding local sockets + :param ssh_proxies: pairs of SSH connections params and optional identities, + in order from outer to inner. If an identity is `None`, the `identity` param + is used instead. + :param batch_mode: If enabled, "user interaction such as password prompts and host key + confirmation requests will be disabled", see `ssh_config(5)`, `BatchMode`. + Although this is probably the desired behavior in all use cases, the default value + is `False` for gradual adoption. + Note, this option is only applied to the `destination` and `ssh_proxies`. If you + configured `destination` with `ProxyJump` in the `ssh_config_path` config, the proxy + jump connection will ignore this option -- in that case, you should replace `ProxyJump` + with explicit `ProxyCommand=ssh [...] -o BatchMode=yes` in your config. 
""" - self.host = host - self.id_rsa_path = id_rsa_path - self.ports = ports - self.control_sock_path = control_sock_path + self.destination = destination + self.forwarded_sockets = list(forwarded_sockets) + self.reverse_forwarded_sockets = list(reverse_forwarded_sockets) self.options = options - self.ssh_config_path = ssh_config_path + self.port = port + self.ssh_config_path = normalize_path(ssh_config_path) + temp_dir = tempfile.TemporaryDirectory() + self.temp_dir = temp_dir + if control_sock_path is None: + control_sock_path = os.path.join(temp_dir.name, "control.sock") + self.control_sock_path = normalize_path(control_sock_path) + self.identity_path = normalize_path(self._get_identity_path(identity, "identity")) + self.ssh_proxies: list[tuple[SSHConnectionParams, PathLike]] = [] + for proxy_index, (proxy_params, proxy_identity) in enumerate(ssh_proxies): + if proxy_identity is None: + proxy_identity_path = self.identity_path + else: + proxy_identity_path = self._get_identity_path( + proxy_identity, f"proxy_identity_{proxy_index}" + ) + self.ssh_proxies.append((proxy_params, proxy_identity_path)) + self.batch_mode = batch_mode + self.log_path = normalize_path(os.path.join(temp_dir.name, "tunnel.log")) + self.ssh_client_info = get_ssh_client_info() + self.ssh_exec_path = str(self.ssh_client_info.path) - def open(self): - # ControlMaster and ControlPath are always set + def open_command(self) -> List[str]: + # Some information about how `ssh(1)` handles options: + # 1. Command-line options override config options regardless of the order of the arguments: + # `ssh -S sock2 -F config` with `ControlPath sock1` in the config -> the control socket + # path is `sock2`. + # 2. First argument wins: + # `ssh -S sock2 -S sock1` -> the control socket path is `sock2`. + # 3. `~` is not expanded in the arguments, but expanded in the config file. 
command = [ - "ssh", + self.ssh_exec_path, "-F", self.ssh_config_path, - "-f", - "-N", - "-M", - "-S", - self.control_sock_path, "-i", - self.id_rsa_path, + self.identity_path, + "-E", + self.log_path, + "-N", # do not run commands on remote ] + if self.ssh_client_info.supports_background_mode: + command += ["-f"] # go to background after successful authentication + else: + raise SSHError("Unsupported SSH client") + if self.ssh_client_info.supports_control_socket: + # It's safe to use ControlMaster even if the ssh client does not support multiplexing + # as long as we don't allow more than one tunnel to the specific host to be running. + # We use this feature for control only (see :meth:`close_command`). + command += [ + # Not `-M`, which means `ControlMaster=yes`, to avoid spawning uncontrollable + # ssh instances if more than one tunnel is started (precaution). + "-o", + "ControlMaster=auto", + "-S", + self.control_sock_path, + ] + else: + raise SSHError("Unsupported SSH client") + if self.port is not None: + command += ["-p", str(self.port)] for k, v in self.options.items(): command += ["-o", f"{k}={v}"] - for port_remote, port_local in self.ports.items(): - command += ["-L", f"{port_local}:localhost:{port_remote}"] - command += [self.host] - # Using stderr=subprocess.PIPE may block subprocess.run. - # Redirect stderr to file to get ssh error message - with tempfile.NamedTemporaryFile(delete=False) as f: - r = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=f) - with open(f.name, "r+b") as f: - error = f.read() - os.remove(f.name) + if self.batch_mode: + command += ["-o", "BatchMode=yes"] + if "serveraliveinterval" not in map(str.lower, self.options): + # Revert Debian-specific patch effect: + # > The default is 0, indicating that these messages will not be sent + # > to the server, or 300 if the BatchMode option is set (Debian-specific). 
+ # https://salsa.debian.org/ssh-team/openssh/-/blob/d87b69641b533b892b87e2eea02dbee796682d64/debian/patches/keepalive-extensions.patch#L69-77 + command += ["-o", "ServerAliveInterval=0"] + if proxy_command := self._get_proxy_command(): + command += ["-o", proxy_command] + for socket_pair in self.forwarded_sockets: + command += ["-L", f"{socket_pair.local.render()}:{socket_pair.remote.render()}"] + for socket_pair in self.reverse_forwarded_sockets: + command += ["-R", f"{socket_pair.remote.render()}:{socket_pair.local.render()}"] + command += [self.destination] + return command + + def close_command(self) -> List[str]: + return [self.ssh_exec_path, "-S", self.control_sock_path, "-O", "exit", self.destination] + + def check_command(self) -> List[str]: + return [self.ssh_exec_path, "-S", self.control_sock_path, "-O", "check", self.destination] + + def exec_command(self) -> List[str]: + return [self.ssh_exec_path, "-S", self.control_sock_path, self.destination] + + def open(self) -> None: + # We cannot use `stderr=subprocess.PIPE` here since the forked process (daemon) does not + # close standard streams if ProxyJump is used, therefore we will wait EOF from the pipe + # as long as the daemon exists. 
+ self._remove_log_file() + try: + r = subprocess.run( + self.open_command(), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=SSH_TIMEOUT, + ) + except subprocess.TimeoutExpired as e: + msg = f"SSH tunnel to {self.destination} did not open in {SSH_TIMEOUT} seconds" + logger.debug(msg) + raise SSHError(msg) from e if r.returncode == 0: return - logger.debug("SSH tunnel failed: %s", error) - raise get_ssh_error(error) + log_output = self._read_log_file() + self._raise_ssh_error_from_log_output(log_output) + + async def aopen(self) -> None: + await run_async(self._remove_log_file) + proc = await asyncio.create_subprocess_exec( + *self.open_command(), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + try: + await asyncio.wait_for(proc.communicate(), SSH_TIMEOUT) + except asyncio.TimeoutError as e: + proc.kill() + msg = f"SSH tunnel to {self.destination} did not open in {SSH_TIMEOUT} seconds" + logger.debug(msg) + raise SSHError(msg) from e + if proc.returncode == 0: + return + log_output = await run_async(self._read_log_file) + self._raise_ssh_error_from_log_output(log_output) - def close(self): - command = ["ssh", "-S", self.control_sock_path, "-O", "exit", self.host] - subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def close(self) -> None: + if not os.path.exists(self.control_sock_path): + logger.debug( + "Control socket does not exist, it seems that ssh process has already exited" + ) + return + proc = subprocess.run( + self.close_command(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + if proc.returncode: + logger.error( + "Failed to close SSH tunnel, exit status: %d, output: %s", + proc.returncode, + proc.stdout, + ) + + async def aclose(self) -> None: + if not os.path.exists(self.control_sock_path): + logger.debug( + "Control socket does not exist, it seems that ssh process has already exited" + ) + return + proc = await asyncio.create_subprocess_exec( + *self.close_command(), 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + await proc.wait() + if proc.returncode: + logger.error( + "Failed to close SSH tunnel, exit status: %d, output: %s", + proc.returncode, + proc.stdout, + ) + + def check(self) -> bool: + proc = subprocess.run( + self.check_command(), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + return proc.returncode == 0 + + async def acheck(self) -> bool: + proc = await asyncio.create_subprocess_exec( + *self.check_command(), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + await proc.wait() + ok = proc.returncode == 0 + return ok + + async def aexec(self, command: str) -> str: + proc = await asyncio.create_subprocess_exec( + *self.exec_command(), command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + raise SSHError(stderr.decode()) + return stdout.decode() def __enter__(self): self.open() @@ -74,95 +282,100 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): self.close() + async def __aenter__(self): + await self.aopen() + return self -class RunnerTunnel(SSHTunnel): - """ - RunnerTunnel cancel forwarding without closing the connection on close() - """ + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.aclose() - def __init__( + def _get_proxy_command(self) -> Optional[str]: + proxy_command: Optional[str] = None + for params, identity_path in self.ssh_proxies: + proxy_command = self._build_proxy_command(params, identity_path, proxy_command) + return proxy_command + + def _build_proxy_command( self, - hostname: str, - ssh_port: int, - user: str, - ports: Dict[int, int], - id_rsa: str, - *, - control_sock_path: Optional[PathLike] = None, - ssh_proxy: Optional[SSHConnectionParams] = None, - disconnect_delay: int = 5, - ): - self.temp_dir = tempfile.TemporaryDirectory() - id_rsa_path = os.path.join(self.temp_dir.name, "id_rsa") - with open( - id_rsa_path, opener=lambda path, flags: 
os.open(path, flags, 0o600), mode="w" - ) as f: - f.write(id_rsa) - if control_sock_path is None: - control_sock_path = os.path.join(self.temp_dir.name, "control.sock") - options = {} - if ssh_proxy is not None: - proxy_command = ["ssh", "-i", id_rsa_path, "-W", "%h:%p"] - proxy_command += [ + params: SSHConnectionParams, + identity_path: PathLike, + prev_proxy_command: Optional[str], + ) -> Optional[str]: + command = [ + self.ssh_exec_path, + "-i", + identity_path, + "-W", + "%h:%p", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + ] + if self.batch_mode: + # ServerAliveInterval is explained in the open_command() comment + command += [ "-o", - "StrictHostKeyChecking=no", + "BatchMode=yes", "-o", - "UserKnownHostsFile=/dev/null", + "ServerAliveInterval=0", ] - proxy_command += [ - "-p", - str(ssh_proxy.port), - f"{ssh_proxy.username}@{ssh_proxy.hostname}", - ] - options["ProxyCommand"] = shlex.join(proxy_command) - options.update( - { - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - "ExitOnForwardFailure": "yes", - "ConnectTimeout": "3", - # "ControlPersist": f"{disconnect_delay}s", - "Port": str(ssh_port), - } - ) - super().__init__( - host=f"{user}@{hostname}", - id_rsa_path=id_rsa_path, - ports=ports, - control_sock_path=control_sock_path, - options=options, - ) + if prev_proxy_command is not None: + command += ["-o", prev_proxy_command.replace("%", "%%")] + command += [ + "-p", + str(params.port), + f"{params.username}@{params.hostname}", + ] + return "ProxyCommand=" + shlex.join(command) + + def _read_log_file(self) -> Optional[bytes]: + try: + with open(self.log_path, "rb") as f: + return f.read() + except OSError as e: + logger.debug("Failed to read SSH tunnel log file %s: %s", self.log_path, e) + return None + + def _remove_log_file(self) -> None: + try: + os.remove(self.log_path) + except FileNotFoundError: + pass + except OSError as e: + logger.debug("Failed to remove SSH tunnel log file %s: %s", 
self.log_path, e) + + def _raise_ssh_error_from_log_output(self, output: Optional[bytes]) -> NoReturn: + if output is None: + msg = "(no log file)" + ssh_error = SSHError() + else: + msg = output + ssh_error = get_ssh_error(output) + logger.debug("SSH tunnel failed: %s", msg) + raise ssh_error - # def close(self): - # # cancel forwarding without closing the connection - # command = ["ssh", "-S", self.control_sock_path, "-O", "cancel"] - # for port_remote, port_local in self.ports.items(): - # command += ["-L", f"{port_local}:localhost:{port_remote}"] - # command += [self.host] - # subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def _get_identity_path(self, identity: FilePathOrContent, tmp_filename: str) -> PathLike: + if isinstance(identity, FilePath): + return identity.path + identity_path = os.path.join(self.temp_dir.name, tmp_filename) + with open( + identity_path, opener=lambda path, flags: os.open(path, flags, 0o600), mode="w" + ) as f: + f.write(identity.content) + return identity_path -class ClientTunnel(SSHTunnel): +def ports_to_forwarded_sockets( + ports: Dict[int, int], bind_local: str = "localhost" +) -> List[SocketPair]: """ - CLITunnel connects to the host from ssh config + Converts remote->local ports mapping to List[SocketPair] suitable for SSHTunnel """ - - def __init__( - self, - host: str, - ports: Dict[int, int], - id_rsa_path: PathLike, - ssh_config_path: str, - control_sock_path: Optional[str] = None, - ): - if control_sock_path is None: - self.temp_dir = tempfile.TemporaryDirectory() - control_sock_path = os.path.join(self.temp_dir.name, "control.sock") - super().__init__( - host=host, - id_rsa_path=id_rsa_path, - ports=ports, - control_sock_path=control_sock_path, - options={}, - ssh_config_path=ssh_config_path, + return [ + SocketPair( + local=IPSocket(host=bind_local, port=port_local), + remote=IPSocket(host="localhost", port=port_remote), ) + for port_remote, port_local in ports.items() + ] diff --git 
a/src/dstack/_internal/proxy/__init__.py b/src/dstack/_internal/proxy/__init__.py new file mode 100644 index 0000000000..a3e31886b4 --- /dev/null +++ b/src/dstack/_internal/proxy/__init__.py @@ -0,0 +1,8 @@ +""" +dstack-proxy is a component responsible for proxying ingress HTTP traffic to +services and models hosted by dstack. It can also perform load balancing, +collect service usage stats, obtain SSL certificates, etc. + +This component can run as a standalone web application on a gateway instance or +as part of the dstack-server web application. +""" diff --git a/src/dstack/_internal/proxy/gateway/__init__.py b/src/dstack/_internal/proxy/gateway/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/app.py b/src/dstack/_internal/proxy/gateway/app.py new file mode 100644 index 0000000000..f43ebada0a --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/app.py @@ -0,0 +1,89 @@ +"""FastAPI app running on a gateway.""" + +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Optional + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from dstack._internal.proxy.gateway.auth import GatewayProxyAuthProvider +from dstack._internal.proxy.gateway.const import ( + DSTACK_DIR_ON_GATEWAY, + SERVER_CONNECTIONS_DIR_ON_GATEWAY, +) +from dstack._internal.proxy.gateway.deps import ( + GatewayDependencyInjector, + get_gateway_injector_from_app, + get_gateway_proxy_repo, +) +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.repo.state_v1 import migrate_from_state_v1 +from dstack._internal.proxy.gateway.routers.auth import router as auth_router +from dstack._internal.proxy.gateway.routers.config import router as config_router +from dstack._internal.proxy.gateway.routers.registry import router as registry_router +from dstack._internal.proxy.gateway.routers.stats import router as stats_router +from 
dstack._internal.proxy.gateway.services.nginx import Nginx +from dstack._internal.proxy.gateway.services.registry import ACCESS_LOG_PATH, apply_all +from dstack._internal.proxy.gateway.services.server_client import HTTPMultiClient +from dstack._internal.proxy.gateway.services.stats import StatsCollector +from dstack._internal.proxy.lib.routers.model_proxy import router as model_proxy_router +from dstack._internal.utils.common import run_async +from dstack.version import __version__ + +STATE_FILE = DSTACK_DIR_ON_GATEWAY / "state-v2.json" +LEGACY_STATE_FILE = DSTACK_DIR_ON_GATEWAY / "state.json" +LEGACY_KEYS_DIR = Path("~/.ssh/projects").expanduser().resolve() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + injector = get_gateway_injector_from_app(app) + repo = await get_gateway_proxy_repo(await injector.get_repo().__anext__()) + nginx = injector.get_nginx() + service_conn_pool = await injector.get_service_connection_pool() + await run_async(nginx.write_global_conf) + await apply_all(repo, nginx, service_conn_pool) + + yield + + await service_conn_pool.remove_all() + + +def make_app(repo: Optional[GatewayProxyRepo] = None, nginx: Optional[Nginx] = None) -> FastAPI: + if repo is None: + migrate_from_state_v1( + v1_file=LEGACY_STATE_FILE, v2_file=STATE_FILE, keys_dir=LEGACY_KEYS_DIR + ) + repo = GatewayProxyRepo.load(STATE_FILE) + + app = FastAPI(lifespan=lifespan) + app.state.proxy_dependency_injector = GatewayDependencyInjector( + repo=repo, + auth=GatewayProxyAuthProvider( + server_client=HTTPMultiClient(SERVER_CONNECTIONS_DIR_ON_GATEWAY) + ), + nginx=nginx or Nginx(), + stats_collector=StatsCollector(ACCESS_LOG_PATH), + ) + + # TODO: add CORS only to openai routers once fastapi supports it. 
+ # See https://github.com/tiangolo/fastapi/pull/11010 + app.add_middleware( + CORSMiddleware, + allow_origin_regex=".*", + allow_methods=["*"], + allow_headers=["*"], + ) + + app.include_router(auth_router, prefix="/api/auth") + app.include_router(config_router, prefix="/api/config") + app.include_router(model_proxy_router, prefix="/api/models") + app.include_router(registry_router, prefix="/api/registry") + app.include_router(stats_router, prefix="/api/stats") + + @app.get("/") + def get_info(): + return {"version": __version__} + + return app diff --git a/src/dstack/_internal/proxy/gateway/auth.py b/src/dstack/_internal/proxy/gateway/auth.py new file mode 100644 index 0000000000..2523e9ff59 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/auth.py @@ -0,0 +1,26 @@ +import httpx +from aiocache import cached + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.errors import UnexpectedProxyError + + +class GatewayProxyAuthProvider(BaseProxyAuthProvider): + def __init__(self, server_client: httpx.AsyncClient) -> None: + self._server_client = server_client + + @cached(ttl=60, noself=True, skip_cache_func=lambda r: r is None) + async def is_project_member(self, project_name: str, token: str) -> bool: + try: + resp = await self._server_client.post( + f"/api/projects/{project_name}/get", + headers={"Authorization": f"Bearer {token}"}, + ) + if resp.status_code == httpx.codes.FORBIDDEN: + return False + resp.raise_for_status() + except httpx.HTTPError as e: + raise UnexpectedProxyError( + f"Failed requesting dstack-server to check access: {e!r}" + ) from e + return True diff --git a/src/dstack/_internal/proxy/gateway/const.py b/src/dstack/_internal/proxy/gateway/const.py new file mode 100644 index 0000000000..d9a294d2b6 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/const.py @@ -0,0 +1,9 @@ +"""Gateway-related constants useful in various dstack modules.""" + +from pathlib import Path 
+ +DSTACK_DIR_ON_GATEWAY = Path("/home/ubuntu/dstack") +SERVER_CONNECTIONS_DIR_ON_GATEWAY = DSTACK_DIR_ON_GATEWAY / "server-connections" +PROXY_PORT_ON_GATEWAY = 8000 +SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE = "Service {ref} is already registered" +SERVICE_SCALING_WINDOWS = (30, 60, 300) diff --git a/src/dstack/_internal/proxy/gateway/deps.py b/src/dstack/_internal/proxy/gateway/deps.py new file mode 100644 index 0000000000..009ce8da15 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/deps.py @@ -0,0 +1,73 @@ +from typing import Annotated, AsyncGenerator + +from fastapi import Depends, FastAPI, Request + +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.services.nginx import Nginx +from dstack._internal.proxy.gateway.services.stats import StatsCollector +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.deps import ( + ProxyDependencyInjector, + get_injector_from_app, + get_proxy_repo, +) +from dstack._internal.proxy.lib.errors import UnexpectedProxyError +from dstack._internal.proxy.lib.repo import BaseProxyRepo + + +class GatewayDependencyInjector(ProxyDependencyInjector): + def __init__( + self, + repo: GatewayProxyRepo, + auth: BaseProxyAuthProvider, + nginx: Nginx, + stats_collector: StatsCollector, + ) -> None: + super().__init__() + self._repo = repo + self._auth = auth + self._nginx = nginx + self._stats_collector = stats_collector + + async def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]: + yield self._repo + + async def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]: + yield self._auth + + def get_nginx(self) -> Nginx: + return self._nginx + + def get_stats_collector(self) -> StatsCollector: + return self._stats_collector + + +def get_gateway_injector_from_app(app: FastAPI) -> GatewayDependencyInjector: + injector = get_injector_from_app(app) + if not isinstance(injector, GatewayDependencyInjector): + raise 
UnexpectedProxyError(f"Unexpected gateway injector type: {type(injector)}") + return injector + + +async def get_gateway_injector(request: Request) -> GatewayDependencyInjector: + return get_gateway_injector_from_app(request.app) + + +async def get_gateway_proxy_repo( + repo: Annotated[BaseProxyRepo, Depends(get_proxy_repo)], +) -> GatewayProxyRepo: + if not isinstance(repo, GatewayProxyRepo): + raise UnexpectedProxyError(f"Unexpected gateway repo type: {type(repo)}") + return repo + + +async def get_nginx( + injector: Annotated[GatewayDependencyInjector, Depends(get_gateway_injector)], +) -> Nginx: + return injector.get_nginx() + + +async def get_stats_collector( + injector: Annotated[GatewayDependencyInjector, Depends(get_gateway_injector)], +) -> StatsCollector: + return injector.get_stats_collector() diff --git a/src/dstack/_internal/proxy/gateway/main.py b/src/dstack/_internal/proxy/gateway/main.py new file mode 100644 index 0000000000..134f34f538 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/main.py @@ -0,0 +1,17 @@ +import logging + +from dstack._internal.proxy.gateway.app import make_app + + +def configure_logging(level: int = logging.INFO): + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger("dstack") + logger.setLevel(level) + logger.addHandler(handler) + + +configure_logging(logging.DEBUG) +app = make_app() diff --git a/src/dstack/_internal/proxy/gateway/models.py b/src/dstack/_internal/proxy/gateway/models.py new file mode 100644 index 0000000000..3e0a76c032 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/models.py @@ -0,0 +1,23 @@ +"""Things stored in GatewayProxyRepo in addition to those from BaseProxyRepo.""" + +from typing import Optional + +from pydantic import AnyHttpUrl + +from dstack._internal.proxy.lib.models import ImmutableModel + + +class ModelEntrypoint(ImmutableModel): + project_name: str 
+ domain: str + https: bool + + +class ACMESettings(ImmutableModel): + server: Optional[AnyHttpUrl] = None + eab_kid: Optional[str] = None + eab_hmac_key: Optional[str] = None + + +class GlobalProxyConfig(ImmutableModel): + acme_settings: ACMESettings = ACMESettings() diff --git a/src/dstack/_internal/proxy/gateway/repo/__init__.py b/src/dstack/_internal/proxy/gateway/repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/repo/repo.py b/src/dstack/_internal/proxy/gateway/repo/repo.py new file mode 100644 index 0000000000..0956faf04c --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/repo/repo.py @@ -0,0 +1,129 @@ +from contextlib import asynccontextmanager +from itertools import chain +from pathlib import Path +from typing import Optional + +from aiorwlock import RWLock +from pydantic import BaseModel + +from dstack._internal.proxy.gateway.models import GlobalProxyConfig, ModelEntrypoint +from dstack._internal.proxy.lib.models import ChatModel, Project, Service +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.utils.common import run_async + + +class State(BaseModel): + services: dict[str, dict[str, Service]] = {} + models: dict[str, dict[str, ChatModel]] = {} + entrypoints: dict[str, ModelEntrypoint] = {} + projects: dict[str, Project] = {} + config: GlobalProxyConfig = GlobalProxyConfig() + + +class GatewayProxyRepo(BaseProxyRepo): + """ + Repo implementation used on gateways. Stores state in memory and maintains a copy on disk. 
+ """ + + def __init__(self, state: Optional[State] = None, file: Optional[Path] = None) -> None: + self._state = state or State() + self._file = file + self._lock = RWLock() + + async def list_services(self) -> list[Service]: + async with self.reader(): + services_by_project = ( + project_services.values() for project_services in self._state.services.values() + ) + return list(chain(*services_by_project)) + + async def get_service(self, project_name: str, run_name: str) -> Optional[Service]: + async with self.reader(): + return self._state.services.get(project_name, {}).get(run_name) + + async def get_service_by_domain(self, domain: str) -> Optional[Service]: + async with self.reader(): + for project_services in self._state.services.values(): + for service in project_services.values(): + if service.domain == domain: + return service + return None + + async def set_service(self, service: Service) -> None: + async with self.writer(): + self._state.services.setdefault(service.project_name, {})[service.run_name] = service + + async def delete_service(self, project_name: str, run_name: str) -> None: + async with self.writer(): + project_services = self._state.services.get(project_name, {}) + project_services.pop(run_name, None) + if not project_services: + self._state.services.pop(project_name, None) + + async def list_models(self, project_name: str) -> list[ChatModel]: + async with self.reader(): + return list(self._state.models.get(project_name, {}).values()) + + async def get_model(self, project_name: str, name: str) -> Optional[ChatModel]: + async with self.reader(): + return self._state.models.get(project_name, {}).get(name) + + async def set_model(self, model: ChatModel) -> None: + async with self.writer(): + self._state.models.setdefault(model.project_name, {})[model.name] = model + + async def delete_models_by_run(self, project_name: str, run_name: str) -> None: + async with self.writer(): + project_models = self._state.models.get(project_name, {}) + 
models_to_delete = [m for m in project_models.values() if m.run_name == run_name] + for model in models_to_delete: + project_models.pop(model.name, None) + if not project_models: + self._state.models.pop(project_name, None) + + async def list_entrypoints(self) -> list[ModelEntrypoint]: + async with self.reader(): + return list(self._state.entrypoints.values()) + + async def set_entrypoint(self, entrypoint: ModelEntrypoint) -> None: + async with self.writer(): + self._state.entrypoints[entrypoint.project_name] = entrypoint + + async def get_project(self, name: str) -> Optional[Project]: + async with self.reader(): + return self._state.projects.get(name) + + async def set_project(self, project: Project) -> None: + async with self.writer(): + self._state.projects[project.name] = project + + async def get_config(self) -> GlobalProxyConfig: + async with self.reader(): + return self._state.config + + async def set_config(self, config: GlobalProxyConfig) -> None: + async with self.writer(): + self._state.config = config + + @asynccontextmanager + async def reader(self): + async with self._lock.reader: + yield + + @asynccontextmanager + async def writer(self): + async with self._lock.writer: + yield + await run_async(self.save) + + @staticmethod + def load(state_file: Path) -> "GatewayProxyRepo": + if state_file.exists(): + state = State.parse_file(state_file) + else: + state = None + return GatewayProxyRepo(state=state, file=state_file) + + def save(self) -> None: + if self._file is not None: + self._file.write_text(self._state.json()) diff --git a/src/dstack/_internal/proxy/gateway/repo/state_v1.py b/src/dstack/_internal/proxy/gateway/repo/state_v1.py new file mode 100644 index 0000000000..b49550ef5f --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/repo/state_v1.py @@ -0,0 +1,164 @@ +""" +Migration from the legacy state.json file of dstack-gateway to the new +state-v2.json file of dstack-proxy. 
+""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Iterable, Optional + +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.proxy.gateway.models import ACMESettings, GlobalProxyConfig, ModelEntrypoint +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo, State +from dstack._internal.proxy.lib.models import ( + AnyModelFormat, + ChatModel, + OpenAIChatModelFormat, + Project, + Replica, + Service, + TGIChatModelFormat, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +def migrate_from_state_v1(v1_file: Path, v2_file: Path, keys_dir: Path) -> None: + if v2_file.exists() or not v1_file.exists(): + return + state_v1 = json.loads(v1_file.read_text()) + state = parse_state_v1(state_v1, keys_dir) + repo = GatewayProxyRepo(state=state, file=v2_file) + repo.save() + logger.info("Successfully migrated state from %s to %s", v1_file, v2_file) + + +def parse_state_v1(state_v1: dict, keys_dir: Path) -> State: + services, models = get_services_models(state_v1) + return State( + services=services, + models=models, + entrypoints=get_entrypoints(state_v1.get("store", {})), + projects=get_projects(state_v1.get("store", {}).get("projects", {}), keys_dir), + config=get_config(state_v1.get("store", {}).get("nginx", {}).get("acme_settings", {})), + ) + + +def get_services_models( + state_v1: dict, +) -> tuple[dict[str, dict[str, Service]], dict[str, dict[str, ChatModel]]]: + service_id_to_project_name = {} + for project_name, project_services in state_v1.get("store", {}).get("projects", {}).items(): + for service_id in project_services: + service_id_to_project_name[service_id] = project_name + + services: dict[str, dict[str, Service]] = {} + models: dict[str, dict[str, ChatModel]] = {} + for service in state_v1.get("store", {}).get("services", {}).values(): + project_name = service_id_to_project_name[service["id"]] + replicas = [] + for 
replica in service.get("replicas", []): + replicas.append(parse_replica(replica)) + parsed_service = Service( + project_name=project_name, + run_name=service["domain"].split(".")[0], + domain=service["domain"], + https=service.get("https", True), + auth=service["auth"], + client_max_body_size=service.get("client_max_body_size", 1024 * 1024), + replicas=tuple(replicas), + ) + services.setdefault(project_name, {})[parsed_service.run_name] = parsed_service + if model := service.get("options", {}).get("openai", {}).get("model", {}): + parsed_model = parse_model( + project_name, parsed_service.run_name, model, state_v1["openai"]["index"] + ) + if parsed_model is not None: + models.setdefault(project_name, {})[parsed_model.name] = parsed_model + + return services, models + + +def parse_replica(replica: dict) -> Replica: + ssh_proxy = None + if (ssh_proxy_destination := replica.get("ssh_jump_host")) and ( + ssh_proxy_port := replica.get("ssh_jump_port") + ): + proxy_user, proxy_host = ssh_proxy_destination.split("@") + ssh_proxy = SSHConnectionParams( + hostname=proxy_host, + username=proxy_user, + port=ssh_proxy_port, + ) + return Replica( + id=replica["id"], + app_port=replica["app_port"], + ssh_destination=replica["ssh_host"], + ssh_port=replica["ssh_port"], + ssh_proxy=ssh_proxy, + ) + + +def parse_model( + project_name: str, run_name: str, model: dict, openai_index: dict +) -> Optional[ChatModel]: + created_ts = ( + openai_index.get(project_name, {}).get("chat", {}).get(model["name"], {}).get("created") + ) + if created_ts is None: + # some models can be missing in the index, most likely due to a bug + return None + format_spec: AnyModelFormat + if model["format"] == "tgi": + format_spec = TGIChatModelFormat( + chat_template=model["chat_template"], eos_token=model["eos_token"] + ) + else: + format_spec = OpenAIChatModelFormat(prefix=model["prefix"]) + return ChatModel( + project_name=project_name, + name=model["name"], + created_at=datetime.fromtimestamp(created_ts), 
+ run_name=run_name, + format_spec=format_spec, + ) + + +def get_entrypoints(store: dict) -> dict[str, ModelEntrypoint]: + entrypoint_domain_to_project_name = {} + for entrypoint_domain, (project_name, _) in store.get("entrypoints", {}).items(): + entrypoint_domain_to_project_name[entrypoint_domain] = project_name + + entrypoints = {} + for site_config in store.get("nginx", {}).get("configs", {}).values(): + if site_config["type"] == "entrypoint": + entrypoint = ModelEntrypoint( + project_name=entrypoint_domain_to_project_name[site_config["domain"]], + domain=site_config["domain"], + https=site_config["https"], + ) + entrypoints[entrypoint.project_name] = entrypoint + + return entrypoints + + +def get_projects(project_names: Iterable[str], keys_dir: Path) -> dict[str, Project]: + projects = {} + for project_name in project_names: + projects[project_name] = Project( + name=project_name, + ssh_private_key=(keys_dir / project_name).read_text(), + ) + return projects + + +def get_config(acme_settings: dict) -> GlobalProxyConfig: + return GlobalProxyConfig( + acme_settings=ACMESettings( + server=acme_settings.get("server"), + eab_kid=acme_settings.get("eab_kid"), + eab_hmac_key=acme_settings.get("eab_hmac_key"), + ) + ) diff --git a/src/dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf b/src/dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf new file mode 100644 index 0000000000..ea099708cf --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf @@ -0,0 +1,11 @@ +log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit'; + + +# A hack to avoid this Nginx reload error when no services are registered: +# nginx: [emerg] unknown "dstack_replica_hit" variable +server { + listen unix:/tmp/dstack-dummy-nginx.sock; + server_name placeholder.local; + deny all; + set $dstack_replica_hit 0; +} diff --git a/gateway/src/dstack/gateway/resources/nginx/entrypoint.jinja2 
b/src/dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 similarity index 89% rename from gateway/src/dstack/gateway/resources/nginx/entrypoint.jinja2 rename to src/dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 index 47a32501f6..bcb66e1dd4 100644 --- a/gateway/src/dstack/gateway/resources/nginx/entrypoint.jinja2 +++ b/src/dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 @@ -1,7 +1,7 @@ server { server_name {{ domain }}; location / { - proxy_pass http://localhost:{{ gateway_port }}/{{ proxy_path.strip('/') }}/; + proxy_pass http://localhost:{{ proxy_port }}/api/models/{{ project_name }}/; proxy_set_header X-Real-IP $remote_addr; proxy_set_header Host $host; proxy_read_timeout 300s; diff --git a/src/dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 b/src/dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 new file mode 100644 index 0000000000..3af7ea612d --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 @@ -0,0 +1,23 @@ +{% for replica in replicas %} +# Worker {{ loop.index }} +upstream router_worker_{{ domain|replace('.', '_') }}_{{ ports[loop.index0] }}_upstream { + server unix:{{ replica.socket }}; +} + +server { + listen 127.0.0.1:{{ ports[loop.index0] }}; + access_log off; # disable access logs for this internal endpoint + + proxy_read_timeout 300s; + proxy_send_timeout 300s; + + location / { + proxy_pass http://router_worker_{{ domain|replace('.', '_') }}_{{ ports[loop.index0] }}_upstream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Connection ""; + proxy_set_header Upgrade $http_upgrade; + } +} +{% endfor %} diff --git a/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 b/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 new file mode 100644 index 
0000000000..5523dea849 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 @@ -0,0 +1,134 @@ +{% for zone in limit_req_zones %} +limit_req_zone {{ zone.key }} zone={{ zone.name }}:10m rate={{ zone.rpm }}r/m; +{% endfor %} + +{% if replicas %} +upstream {{ domain }}.upstream { + {% if router_port is not none %} + server 127.0.0.1:{{ router_port }}; # SGLang router on the gateway + {% else %} + {% for replica in replicas %} + server unix:{{ replica.socket }}; # replica {{ replica.id }} + {% endfor %} + {% endif %} +} +{% else %} + +{% endif %} +server { + server_name {{ domain }}; + limit_req_status 429; + set $dstack_replica_hit 0; + access_log {{ access_log_path }} dstack_stat; + client_max_body_size {{ client_max_body_size }}; + + {% for location in locations %} + location {{ location.prefix }} { + {% if cors_enabled %} + # Handle CORS preflight before auth (rewrite phase runs before access phase) + if ($request_method = 'OPTIONS') { + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + add_header 'Access-Control-Max-Age' '600' always; + return 204; + } + {% endif %} + + {% if auth %} + auth_request /_dstack_auth; + {% endif %} + + try_files /nonexistent @$http_upgrade; + + {% if location.limit_req %} + limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %}; + {% endif %} + } + {% endfor %} + + {# For router services: block all requests except whitelisted locations added dynamically above #} + {% if has_router_replica or (router is not none and router.type == "sglang") %} + location / { + return 403; + } + {% endif %} + + location @websocket { + set $dstack_replica_hit 1; + {% if replicas %} + {% if cors_enabled %} + proxy_hide_header 'Access-Control-Allow-Origin'; + proxy_hide_header 
'Access-Control-Allow-Methods'; + proxy_hide_header 'Access-Control-Allow-Headers'; + proxy_hide_header 'Access-Control-Allow-Credentials'; + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + {% endif %} + proxy_pass http://{{ domain }}.upstream; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_read_timeout 300s; + {% else %} + return 503; + {% endif %} + } + location @ { + set $dstack_replica_hit 1; + {% if replicas %} + {% if cors_enabled %} + proxy_hide_header 'Access-Control-Allow-Origin'; + proxy_hide_header 'Access-Control-Allow-Methods'; + proxy_hide_header 'Access-Control-Allow-Headers'; + proxy_hide_header 'Access-Control-Allow-Credentials'; + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + {% endif %} + proxy_pass http://{{ domain }}.upstream; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + proxy_read_timeout 300s; + {% else %} + return 503; + {% endif %} + } + + {% if auth %} + location = /_dstack_auth { + internal; + if ($remote_addr = 127.0.0.1) { + # for requests from the gateway app, e.g. 
from the OpenAI-compatible API + return 200; + } + proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }}; + proxy_pass_request_body off; + proxy_set_header Content-Length ""; + proxy_set_header X-Original-URI $request_uri; + proxy_set_header Authorization $http_authorization; + } + {% endif %} + + listen 80; + {% if https %} + listen 443 ssl; + ssl_certificate /etc/letsencrypt/live/{{ domain }}/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/{{ domain }}/privkey.pem; + include /etc/letsencrypt/options-ssl-nginx.conf; + ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; + set $force_https 1; + if ($scheme = "https") { + set $force_https 0; + } + if ($remote_addr = 127.0.0.1) { + set $force_https 0; + } + if ($force_https) { + return 301 https://$host$request_uri; + } + {% endif %} +} diff --git a/src/dstack/_internal/proxy/gateway/routers/__init__.py b/src/dstack/_internal/proxy/gateway/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/routers/auth.py b/src/dstack/_internal/proxy/gateway/routers/auth.py new file mode 100644 index 0000000000..0572159688 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/routers/auth.py @@ -0,0 +1,10 @@ +from fastapi import APIRouter, Depends + +from dstack._internal.proxy.lib.deps import ProxyAuth + +router = APIRouter() + + +@router.get("/{project_name}", dependencies=[Depends(ProxyAuth(auto_enforce=True))]) +async def get_auth(): + return {"status": "ok"} diff --git a/src/dstack/_internal/proxy/gateway/routers/config.py b/src/dstack/_internal/proxy/gateway/routers/config.py new file mode 100644 index 0000000000..24dac6a52f --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/routers/config.py @@ -0,0 +1,28 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends + +from dstack._internal.proxy.gateway.deps import get_gateway_proxy_repo +from dstack._internal.proxy.gateway.models import 
ACMESettings, GlobalProxyConfig +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.schemas.common import OkResponse +from dstack._internal.proxy.gateway.schemas.config import ConfigRequest + +router = APIRouter() + + +@router.post("") +async def update_global_config( + body: ConfigRequest, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], +) -> OkResponse: + await repo.set_config( + GlobalProxyConfig( + acme_settings=ACMESettings( + server=body.acme_server, + eab_kid=body.acme_eab_kid, + eab_hmac_key=body.acme_eab_hmac_key, + ), + ), + ) + return OkResponse() diff --git a/src/dstack/_internal/proxy/gateway/routers/registry.py b/src/dstack/_internal/proxy/gateway/routers/registry.py new file mode 100644 index 0000000000..2a5f23f7a0 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/routers/registry.py @@ -0,0 +1,127 @@ +from fastapi import APIRouter, Depends +from typing_extensions import Annotated + +import dstack._internal.proxy.gateway.services.registry as registry_services +from dstack._internal.proxy.gateway.deps import get_gateway_proxy_repo, get_nginx +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.schemas.common import OkResponse +from dstack._internal.proxy.gateway.schemas.registry import ( + RegisterEntrypointRequest, + RegisterReplicaRequest, + RegisterServiceRequest, +) +from dstack._internal.proxy.gateway.services.nginx import Nginx +from dstack._internal.proxy.lib.deps import get_service_connection_pool +from dstack._internal.proxy.lib.services.service_connection import ServiceConnectionPool + +router = APIRouter(prefix="/{project_name}") + + +@router.post("/services/register") +async def register_service( + project_name: str, + body: RegisterServiceRequest, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], + nginx: Annotated[Nginx, Depends(get_nginx)], + service_conn_pool: 
Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +) -> OkResponse: + await registry_services.register_service( + project_name=project_name.lower(), + run_name=body.run_name.lower(), + domain=body.domain.lower(), + https=body.https, + rate_limits=body.rate_limits, + auth=body.auth, + client_max_body_size=body.client_max_body_size, + model=body.options.openai.model if body.options.openai is not None else None, + ssh_private_key=body.ssh_private_key, + repo=repo, + has_router_replica=body.has_router_replica, + router=body.router, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + return OkResponse() + + +@router.post("/services/{run_name}/unregister") +async def unregister_service( + project_name: str, + run_name: str, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], + nginx: Annotated[Nginx, Depends(get_nginx)], + service_conn_pool: Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +) -> OkResponse: + await registry_services.unregister_service( + project_name=project_name.lower(), + run_name=run_name.lower(), + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + return OkResponse() + + +@router.post("/services/{run_name}/replicas/register") +async def register_replica( + project_name: str, + run_name: str, + body: RegisterReplicaRequest, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], + nginx: Annotated[Nginx, Depends(get_nginx)], + service_conn_pool: Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +) -> OkResponse: + await registry_services.register_replica( + project_name=project_name.lower(), + run_name=run_name.lower(), + replica_id=body.job_id, + app_port=body.app_port, + ssh_destination=body.ssh_host, + ssh_port=body.ssh_port, + ssh_proxy=body.ssh_proxy, + ssh_proxy_private_key=body.ssh_proxy_private_key, + ssh_head_proxy=body.ssh_head_proxy, + ssh_head_proxy_private_key=body.ssh_head_proxy_private_key, + 
internal_ip=body.internal_ip, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + return OkResponse() + + +@router.post("/services/{run_name}/replicas/{job_id}/unregister") +async def unregister_replica( + project_name: str, + run_name: str, + job_id: str, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], + nginx: Annotated[Nginx, Depends(get_nginx)], + service_conn_pool: Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +) -> OkResponse: + await registry_services.unregister_replica( + project_name=project_name.lower(), + run_name=run_name.lower(), + replica_id=job_id, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + return OkResponse() + + +@router.post("/entrypoints/register") +async def register_entrypoint( + project_name: str, + body: RegisterEntrypointRequest, + repo: Annotated[GatewayProxyRepo, Depends(get_gateway_proxy_repo)], + nginx: Annotated[Nginx, Depends(get_nginx)], +) -> OkResponse: + await registry_services.register_model_entrypoint( + project_name=project_name.lower(), + domain=body.domain.lower(), + https=body.https, + repo=repo, + nginx=nginx, + ) + return OkResponse() diff --git a/src/dstack/_internal/proxy/gateway/routers/stats.py b/src/dstack/_internal/proxy/gateway/routers/stats.py new file mode 100644 index 0000000000..66a2897e04 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/routers/stats.py @@ -0,0 +1,18 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends + +from dstack._internal.proxy.gateway.deps import get_gateway_proxy_repo, get_stats_collector +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.schemas.stats import ServiceStats +from dstack._internal.proxy.gateway.services.stats import StatsCollector, get_service_stats + +router = APIRouter() + + +@router.get("/collect") +async def collect_stats( + repo: Annotated[GatewayProxyRepo, 
Depends(get_gateway_proxy_repo)], + collector: Annotated[StatsCollector, Depends(get_stats_collector)], +) -> list[ServiceStats]: + return await get_service_stats(repo, collector) diff --git a/src/dstack/_internal/proxy/gateway/schemas/__init__.py b/src/dstack/_internal/proxy/gateway/schemas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/schemas/common.py b/src/dstack/_internal/proxy/gateway/schemas/common.py new file mode 100644 index 0000000000..28ce252c0a --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/schemas/common.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class OkResponse(BaseModel): + status: str = "ok" diff --git a/gateway/src/dstack/gateway/config/schemas.py b/src/dstack/_internal/proxy/gateway/schemas/config.py similarity index 100% rename from gateway/src/dstack/gateway/config/schemas.py rename to src/dstack/_internal/proxy/gateway/schemas/config.py diff --git a/src/dstack/_internal/proxy/gateway/schemas/registry.py b/src/dstack/_internal/proxy/gateway/schemas/registry.py new file mode 100644 index 0000000000..1dc354d1fa --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/schemas/registry.py @@ -0,0 +1,66 @@ +from typing import Annotated, Literal, Optional, Union + +from pydantic import BaseModel, Field + +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.routers import AnyServiceRouterConfig +from dstack._internal.proxy.lib.models import RateLimit + + +class BaseChatModel(BaseModel): + type: Literal["chat"] + name: str + format: str + + +class TGIChatModel(BaseChatModel): + format: Literal["tgi"] + chat_template: str + eos_token: str + + +class OpenAIChatModel(BaseChatModel): + format: Literal["openai"] + prefix: str + + +ChatModel = Annotated[Union[TGIChatModel, OpenAIChatModel], Field(discriminator="format")] +AnyModel = Union[ChatModel] # embeddings and etc. 
+ + +class OpenAIOptions(BaseModel): + model: AnyModel + + +class Options(BaseModel): + openai: Optional[OpenAIOptions] = None + + +class RegisterServiceRequest(BaseModel): + run_name: str + domain: str + https: bool + auth: bool + client_max_body_size: int + options: Options + ssh_private_key: str + rate_limits: tuple[RateLimit, ...] = () + has_router_replica: bool = False + router: Optional[AnyServiceRouterConfig] = None + + +class RegisterReplicaRequest(BaseModel): + job_id: str + app_port: int + ssh_host: str + ssh_port: int + ssh_proxy: Optional[SSHConnectionParams] + ssh_proxy_private_key: Optional[str] + ssh_head_proxy: Optional[SSHConnectionParams] + ssh_head_proxy_private_key: Optional[str] + internal_ip: Optional[str] = None + + +class RegisterEntrypointRequest(BaseModel): + domain: str + https: bool diff --git a/src/dstack/_internal/proxy/gateway/schemas/stats.py b/src/dstack/_internal/proxy/gateway/schemas/stats.py new file mode 100644 index 0000000000..a9dce623b1 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/schemas/stats.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel + + +class Stat(BaseModel): + requests: int + request_time: float + + +PerWindowStats = dict[int, Stat] # keys - length of time window in seconds + + +class ServiceStats(BaseModel): + project_name: str + run_name: str + stats: PerWindowStats diff --git a/src/dstack/_internal/proxy/gateway/services/__init__.py b/src/dstack/_internal/proxy/gateway/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py b/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py new file mode 100644 index 0000000000..43477d2d3f --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py @@ -0,0 +1,18 @@ +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType +from dstack._internal.proxy.gateway.services.model_routers.sglang 
import SglangRouter +from dstack._internal.proxy.lib.errors import ProxyError + +from .base import Router, RouterContext + + +def get_router(router: AnyServiceRouterConfig, context: RouterContext) -> Router: + if router.type == RouterType.SGLANG: + return SglangRouter(config=router, context=context) + raise ProxyError(f"Router type '{router.type}' is not available") + + +__all__ = [ + "Router", + "RouterContext", + "get_router", +] diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/base.py b/src/dstack/_internal/proxy/gateway/services/model_routers/base.py new file mode 100644 index 0000000000..83ec14cb4d --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/base.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List, Literal, Optional + +from pydantic import BaseModel + +from dstack._internal.core.models.routers import AnyServiceRouterConfig + + +class RouterContext(BaseModel): + """Context for router initialization and configuration.""" + + class Config: + frozen = True + + host: str = "127.0.0.1" + port: int + log_dir: Path + log_level: Literal["debug", "info", "warning", "error"] = "info" + + +class Router(ABC): + """Abstract base class for router implementations. + A router manages the lifecycle of worker replicas and handles request routing. + Different router implementations may have different mechanisms for managing + replicas. + """ + + def __init__( + self, + context: RouterContext, + config: Optional[AnyServiceRouterConfig] = None, + ): + """Initialize router with context. + + Args: + context: Runtime context for the router (host, port, logging, etc.) + config: Optional router configuration (implementation-specific) + """ + self.context = context + + @abstractmethod + def start(self) -> None: + """Start the router process. + + Raises: + Exception: If the router fails to start. + """ + ... 
+ + @abstractmethod + def stop(self) -> None: + """Stop the router process. + + Raises: + Exception: If the router fails to stop. + """ + ... + + @abstractmethod + def is_running(self) -> bool: + """Check if the router is currently running and responding. + + Returns: + True if the router is running and healthy, False otherwise. + """ + ... + + @abstractmethod + def remove_replicas(self, replica_urls: List[str]) -> None: + """Unregister replicas from the router (actual API calls to remove workers). + + Args: + replica_urls: The list of replica URLs to remove from router. + + Raises: + Exception: If removing replicas fails. + """ + ... + + @abstractmethod + def update_replicas(self, replica_urls: List[str]) -> None: + """Update replicas for service, replacing the current set. + + Args: + replica_urls: The new list of replica URLs for this service. + + Raises: + Exception: If updating replicas fails. + """ + ... diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py b/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py new file mode 100644 index 0000000000..c1c03c5a11 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py @@ -0,0 +1,325 @@ +import shutil +import subprocess +import sys +import time +from typing import List, Optional + +import httpx +import psutil + +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType +from dstack._internal.proxy.lib.errors import UnexpectedProxyError +from dstack._internal.utils.logging import get_logger + +from .base import Router, RouterContext + +logger = get_logger(__name__) + + +class SglangRouter(Router): + """SGLang router implementation with 1:1 service-to-router.""" + + TYPE = RouterType.SGLANG + + def __init__(self, config: AnyServiceRouterConfig, context: RouterContext): + """Initialize SGLang router. + + Args: + config: SGLang router configuration (policy, cache_threshold, etc.) 
+ context: Runtime context for the router (host, port, logging, etc.) + """ + super().__init__(context=context, config=config) + self.config = config + + def pid_from_tcp_ipv4_port(self, port: int) -> Optional[int]: + """ + Return PID of the process listening on the given TCP IPv4 port. + If no process is found, return None. + """ + for conn in psutil.net_connections(kind="tcp4"): + if conn.laddr and conn.laddr.port == port and conn.status == psutil.CONN_LISTEN: + return conn.pid + return None + + def start(self) -> None: + try: + logger.info("Starting sglang-router-new on port %s...", self.context.port) + + # Prometheus port is offset by 10000 from router port to keep it in a separate range + prometheus_port = self.context.port + 10000 + + cmd = [ + sys.executable, + "-m", + "sglang_router.launch_router", + "--host", + self.context.host, + "--port", + str(self.context.port), + "--prometheus-port", + str(prometheus_port), + "--prometheus-host", + self.context.host, + "--log-level", + self.context.log_level, + "--log-dir", + str(self.context.log_dir), + "--policy", + self.config.policy, + ] + if self.config.pd_disaggregation: + cmd.append("--pd-disaggregation") + + subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + time.sleep(2) + + if not self.is_running(): + raise UnexpectedProxyError( + f"Failed to start sglang router on port {self.context.port}" + ) + + logger.info( + "Sglang router started successfully on port %s (prometheus on %s)", + self.context.port, + prometheus_port, + ) + + except Exception: + logger.exception("Failed to start sglang-router") + raise + + def stop(self) -> None: + try: + pid = self.pid_from_tcp_ipv4_port(self.context.port) + + if pid: + logger.debug( + "Stopping sglang-router process (PID: %s) on port %s", + pid, + self.context.port, + ) + try: + proc = psutil.Process(pid) + proc.terminate() + try: + proc.wait(timeout=5) + except psutil.TimeoutExpired: + logger.warning( + "Process %s did not terminate 
gracefully, forcing kill", pid + ) + proc.kill() + except psutil.NoSuchProcess: + logger.debug("sglang-router process %s already exited before stop()", pid) + else: + logger.debug("No sglang-router process found on port %s", self.context.port) + + # Clean up router logs + if self.context.log_dir.exists(): + logger.debug("Cleaning up router logs for port %s...", self.context.port) + shutil.rmtree(self.context.log_dir, ignore_errors=True) + + except Exception: + logger.exception("Failed to stop sglang-router") + raise + + def is_running(self) -> bool: + """Check if the SGLang router is running and responding to HTTP requests on the assigned port.""" + try: + with httpx.Client(timeout=5.0) as client: + response = client.get(f"http://{self.context.host}:{self.context.port}/workers") + return response.status_code == 200 + except httpx.RequestError as e: + logger.debug( + "Sglang router not responding on port %s: %s", + self.context.port, + e, + ) + return False + + def remove_replicas(self, replica_urls: List[str]) -> None: + for replica_url in replica_urls: + self._remove_worker_from_router(replica_url) + + def update_replicas(self, replica_urls: List[str]) -> None: + """Update replicas for service, replacing the current set.""" + # Query router to get current worker URLs + current_workers = self._get_router_workers() + current_worker_urls: set[str] = set() + for worker in current_workers: + url = worker.get("url") + if url and isinstance(url, str): + # Normalize URL by removing trailing slashes to avoid path artifacts + normalized_url = url.rstrip("/") + current_worker_urls.add(normalized_url) + # Normalize target URLs to ensure consistent comparison + target_worker_urls = {url.rstrip("/") for url in replica_urls} + + # Workers to add + workers_to_add = target_worker_urls - current_worker_urls + # Workers to remove + workers_to_remove = current_worker_urls - target_worker_urls + + if workers_to_add: + logger.info( + "Sglang router update: adding %d workers for router 
on port %s", + len(workers_to_add), + self.context.port, + ) + if workers_to_remove: + logger.info( + "Sglang router update: removing %d workers for router on port %s", + len(workers_to_remove), + self.context.port, + ) + + # Add workers + for worker_url in sorted(workers_to_add): + success = self._register_worker(worker_url) + if not success: + logger.warning("Failed to add worker %s, continuing with others", worker_url) + + # Remove workers + for worker_url in sorted(workers_to_remove): + success = self._remove_worker_from_router(worker_url) + if not success: + logger.warning("Failed to remove worker %s, continuing with others", worker_url) + + def _get_router_workers(self) -> List[dict]: + try: + with httpx.Client(timeout=5.0) as client: + response = client.get(f"http://{self.context.host}:{self.context.port}/workers") + if response.status_code == 200: + response_data = response.json() + workers = response_data.get("workers", []) + return workers + return [] + except Exception: + logger.exception("Error getting sglang router workers") + return [] + + def _add_worker_to_router( + self, + url: str, + worker_type: str = "regular", + bootstrap_port: Optional[int] = None, + ) -> bool: + try: + payload: dict = {"url": url, "worker_type": worker_type} + if bootstrap_port is not None: + payload["bootstrap_port"] = bootstrap_port + with httpx.Client(timeout=5.0) as client: + response = client.post( + f"http://{self.context.host}:{self.context.port}/workers", + json=payload, + ) + if response.status_code == 202: + response_data = response.json() + if response_data.get("status") == "accepted": + logger.info( + "Worker %s (type=%s) accepted by sglang router on port %s", + url, + worker_type, + self.context.port, + ) + return True + else: + logger.error( + "Sglang router on port %s failed to accept worker: %s", + self.context.port, + response_data, + ) + return False + else: + logger.error( + "Failed to add worker %s: status %d, %s", + url, + response.status_code, + 
response.text, + ) + return False + except Exception: + logger.exception("Error adding worker %s", url) + return False + + def _register_worker(self, url: str) -> bool: + if not self.config.pd_disaggregation: + return self._add_worker_to_router(url, "regular", None) + + server_info_url = f"{url}/server_info" + try: + with httpx.Client(timeout=10) as client: + resp = client.get(server_info_url) + if resp.status_code != 200: + return False + data = resp.json() + if data.get("status") != "ready": + return False + disaggregation_mode = data.get("disaggregation_mode", "") + if disaggregation_mode == "prefill": + worker_type = "prefill" + bootstrap_port = data.get("disaggregation_bootstrap_port") + elif disaggregation_mode == "decode": + worker_type = "decode" + bootstrap_port = None + else: + worker_type = "regular" + bootstrap_port = None + logger.info( + "Registering worker %s (type=%s)", + url, + worker_type, + ) + return self._add_worker_to_router( + url, + worker_type, + bootstrap_port, + ) + except Exception: + logger.exception("Error registering worker %s", url) + return False + + def _remove_worker_from_router(self, worker_url: str) -> bool: + try: + current_workers = self._get_router_workers() + worker_id = None + for worker in current_workers: + url = worker.get("url") + if url and isinstance(url, str) and url == worker_url: + worker_id = worker.get("id") + if worker_id and isinstance(worker_id, str): + break + if not worker_id: + logger.error("No worker id found for url %s", worker_url) + return False + with httpx.Client(timeout=5.0) as client: + response = client.delete( + f"http://{self.context.host}:{self.context.port}/workers/{worker_id}" + ) + if response.status_code == 202: + response_data = response.json() + if response_data.get("status") == "accepted": + logger.info( + "Removed worker %s from sglang router on port %s", + worker_url, + self.context.port, + ) + return True + else: + logger.error( + "Sglang router on port %s failed to remove worker: %s", 
+ self.context.port, + response_data, + ) + return False + else: + logger.error( + "Failed to remove worker %s: status %d, %s", + worker_url, + response.status_code, + response.text, + ) + return False + except Exception: + logger.exception("Error removing worker %s", worker_url) + return False diff --git a/src/dstack/_internal/proxy/gateway/services/nginx.py b/src/dstack/_internal/proxy/gateway/services/nginx.py new file mode 100644 index 0000000000..55b3d23675 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/nginx.py @@ -0,0 +1,471 @@ +import importlib.resources +import socket +import subprocess +import tempfile +from asyncio import Lock +from pathlib import Path +from typing import Dict, Optional +from urllib.parse import urlparse + +import jinja2 +from pydantic import BaseModel +from typing_extensions import Literal + +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType +from dstack._internal.proxy.gateway.const import PROXY_PORT_ON_GATEWAY +from dstack._internal.proxy.gateway.models import ACMESettings +from dstack._internal.proxy.gateway.services.model_routers import ( + Router, + RouterContext, + get_router, +) +from dstack._internal.proxy.lib import models +from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError +from dstack._internal.utils.common import run_async +from dstack._internal.utils.logging import get_logger + +CERTBOT_TIMEOUT = 40 +CERTBOT_2ND_TIMEOUT = 5 +CONFIGS_DIR = Path("/etc/nginx/sites-enabled") +logger = get_logger(__name__) + + +class SiteConfig(BaseModel): + type: str + domain: str + https: bool = True + + def render(self) -> str: + template = read_package_resource(f"{self.type}.jinja2") + render_dict = self.dict() + render_dict["proxy_port"] = PROXY_PORT_ON_GATEWAY + return jinja2.Template(template).render(**render_dict) + + +class ReplicaConfig(BaseModel): + id: str + socket: Path + port: int + internal_ip: Optional[str] = None + + +class 
LimitReqZoneConfig(BaseModel): + name: str + key: str + rpm: int + + +class LimitReqConfig(BaseModel): + zone: str + burst: int + + +class LocationConfig(BaseModel): + prefix: str + limit_req: Optional[LimitReqConfig] + + +class ServiceConfig(SiteConfig): + type: Literal["service"] = "service" + project_name: str + auth: bool + client_max_body_size: int + access_log_path: Path + limit_req_zones: list[LimitReqZoneConfig] + locations: list[LocationConfig] + replicas: list[ReplicaConfig] + has_router_replica: bool = False + router: Optional[AnyServiceRouterConfig] = None + router_port: Optional[int] = None + cors_enabled: bool = False + + +class ModelEntrypointConfig(SiteConfig): + type: Literal["entrypoint"] = "entrypoint" + project_name: str + + +class Nginx: + """Updates nginx config and issues SSL certificates.""" + + def __init__(self, conf_dir: Path = Path("/etc/nginx/sites-enabled")) -> None: + self._conf_dir = conf_dir + self._lock: Lock = Lock() + # 1:1 service-to-router mapping + self._router_port_to_domain: Dict[int, str] = {} + self._domain_to_router: Dict[str, Router] = {} + self._ROUTER_PORT_MIN: int = 20000 + self._ROUTER_PORT_MAX: int = 24999 + self._WORKER_PORT_MIN: int = 10001 + self._WORKER_PORT_MAX: int = 11999 + self._next_router_port: int = self._ROUTER_PORT_MIN + # Tracking of worker ports to avoid conflicts across router instances + self._allocated_worker_ports: set[int] = set() + self._domain_to_worker_urls: Dict[str, list[str]] = {} + self._next_worker_port: int = self._WORKER_PORT_MIN + + async def register(self, conf: SiteConfig, acme: ACMESettings) -> None: + logger.debug("Registering %s domain %s", conf.type, conf.domain) + conf_name = self.get_config_name(conf.domain) + async with self._lock: + if conf.https: + await run_async(self.run_certbot, conf.domain, acme) + + if isinstance(conf, ServiceConfig) and conf.router and not conf.has_router_replica: + if conf.router.type == RouterType.SGLANG: + # Check if router already exists for this 
domain + if conf.domain in self._domain_to_router: + # Router already exists, reuse it + router = self._domain_to_router[conf.domain] + router_port = router.context.port + conf.router_port = router_port + else: + # Allocate router port for new router + router_port = self._allocate_router_port() + conf.router_port = router_port + + # Create per-service log directory + log_dir = Path(f"./router_logs/{conf.domain}") + + # Create router context with allocated port + ctx = RouterContext( + port=router_port, + log_dir=log_dir, + ) + + # Create new router instance for this service + router = get_router(conf.router, context=ctx) + + # Store mappings + self._router_port_to_domain[router_port] = conf.domain + self._domain_to_router[conf.domain] = router + + # Start router if not running + try: + if not await run_async(router.is_running): + await run_async(router.start) + except Exception: + # Clean up on failure + del self._router_port_to_domain[router_port] + del self._domain_to_router[conf.domain] + raise + + if conf.router.pd_disaggregation: + # PD path: replica_urls from internal_ip (router talks directly to workers) + if any(not r.internal_ip for r in conf.replicas): + raise ProxyError( + "PD disaggregation requires internal IP for all replicas." 
+ ) + replica_urls = [ + f"http://{replica.internal_ip}:{replica.port}" + for replica in conf.replicas + ] + self._domain_to_worker_urls[conf.domain] = replica_urls + else: + # Non-PD path: allocate gateway-local ports, nginx proxies to replica sockets + allocated_ports = self._allocate_worker_ports(len(conf.replicas)) + replica_urls = [ + f"http://{router.context.host}:{port}" for port in allocated_ports + ] + if conf.replicas: + await run_async( + self.write_router_workers_conf, + conf, + allocated_ports, + ) + if conf.domain in self._domain_to_worker_urls: + self._discard_ports(self._domain_to_worker_urls[conf.domain]) + self._domain_to_worker_urls[conf.domain] = replica_urls + + try: + await run_async(router.update_replicas, replica_urls) + except Exception as e: + logger.exception( + "Failed to add replicas to router for domain=%s: %s", + conf.domain, + e, + ) + raise + + await run_async(self.write_conf, conf.render(), conf_name) + + logger.info("Registered %s domain %s", conf.type, conf.domain) + + async def unregister(self, service: models.Service) -> None: + domain = service.domain_safe + logger.debug("Unregistering domain %s", domain) + conf_path = self._conf_dir / self.get_config_name(domain) + if not conf_path.exists(): + return + async with self._lock: + await run_async(sudo_rm, conf_path) + + if domain in self._domain_to_router: + router = self._domain_to_router[domain] + # Remove all workers for this domain + if domain in self._domain_to_worker_urls: + worker_urls = self._domain_to_worker_urls[domain] + await run_async(router.remove_replicas, worker_urls) + pd_disaggregation = ( + service.router.pd_disaggregation if service.router else False + ) + if not pd_disaggregation: + self._discard_ports(worker_urls) + del self._domain_to_worker_urls[domain] + logger.debug("Removed worker URLs for domain %s", domain) + # Stop and kill the router + await run_async(router.stop) + # Remove from mappings + router_port = router.context.port + if router_port in 
self._router_port_to_domain: + del self._router_port_to_domain[router_port] + del self._domain_to_router[domain] + + # Remove workers config file + workers_conf_path = self._conf_dir / f"router-workers.{domain}.conf" + if workers_conf_path.exists(): + await run_async(sudo_rm, workers_conf_path) + + await run_async(self.reload) + logger.info("Unregistered domain %s", domain) + + @staticmethod + def reload() -> None: + cmd = ["sudo", "systemctl", "reload", "nginx.service"] + r = subprocess.run(cmd, timeout=10) + if r.returncode != 0: + raise UnexpectedProxyError("Failed to reload nginx") + + def write_conf(self, conf: str, conf_name: str) -> None: + """Update config and reload nginx. Rollback changes on error.""" + conf_path = self._conf_dir / conf_name + old_conf = conf_path.read_text() if conf_path.exists() else None + if conf == old_conf: + return + sudo_write(conf_path, conf) + try: + self.reload() + except UnexpectedProxyError: + # rollback changes + if old_conf is not None: + sudo_write(conf_path, old_conf) + else: + sudo_rm(conf_path) + raise + + @classmethod + def run_certbot(cls, domain: str, acme: ACMESettings) -> None: + if cls.certificate_exists(domain): + return + + logger.info("Running certbot for %s", domain) + + cmd = ["sudo", "timeout", "--kill-after", str(CERTBOT_2ND_TIMEOUT), str(CERTBOT_TIMEOUT)] + cmd += ["certbot", "certonly"] + cmd += ["--non-interactive", "--agree-tos", "--register-unsafely-without-email"] + cmd += ["--keep", "--nginx", "--domain", domain] + + if acme.server: + cmd += ["--server", str(acme.server)] + + if acme.eab_kid and acme.eab_hmac_key: + cmd += ["--eab-kid", acme.eab_kid] + cmd += ["--eab-hmac-key", acme.eab_hmac_key] + + r = subprocess.run( + cmd, + capture_output=True, + timeout=CERTBOT_TIMEOUT + CERTBOT_2ND_TIMEOUT + 1, # shouldn't happen + ) + if r.returncode == 124: + raise ProxyError( + f"Could not obtain {domain} TLS certificate in {CERTBOT_TIMEOUT}s." + " Make sure DNS records are configured for this gateway." 
+ ) + if r.returncode != 0: + raise ProxyError(f"Error obtaining {domain} TLS certificate:\n{r.stderr.decode()}") + + @staticmethod + def certificate_exists(domain: str) -> bool: + cmd = ["sudo", "test", "-e", f"/etc/letsencrypt/live/{domain}/fullchain.pem"] + return subprocess.run(cmd, timeout=2).returncode == 0 + + @staticmethod + def get_config_name(domain: str) -> str: + return f"443-{domain}.conf" + + @staticmethod + def _is_port_available(port: int) -> bool: + """Check if a port is actually available (not in use by any process). + + Tries to bind to the port to see if it's available. + """ + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + sock.bind(("127.0.0.1", port)) + # If bind succeeds, port is available + return True + except OSError: + # If bind fails (e.g., Address already in use), port is not available + return False + except Exception: + logger.warning("Error checking port %s availability", port) + return False + + def _allocate_router_port(self) -> int: + """Allocate next available router port in fixed range. + + Checks both our internal allocation map and actual port availability + to avoid conflicts with other services. Range chosen to avoid ephemeral ports. 
+ """ + port = self._next_router_port + max_attempts = self._ROUTER_PORT_MAX - self._ROUTER_PORT_MIN + 1 + attempts = 0 + + while attempts < max_attempts: + # Check if port is already allocated by us + if port in self._router_port_to_domain: + port += 1 + if port > self._ROUTER_PORT_MAX: + port = self._ROUTER_PORT_MIN # Wrap around + attempts += 1 + continue + + # Check if port is actually available on the system + if self._is_port_available(port): + # Port is available, allocate it + self._next_router_port = port + 1 + if self._next_router_port > self._ROUTER_PORT_MAX: + self._next_router_port = self._ROUTER_PORT_MIN # Wrap around + logger.debug("Allocated router port %s", port) + return port + + # Port is in use, try next one + logger.debug("Port %s is in use, trying next port", port) + port += 1 + if port > self._ROUTER_PORT_MAX: + port = self._ROUTER_PORT_MIN # Wrap around + attempts += 1 + + raise UnexpectedProxyError( + f"Router port range exhausted ({self._ROUTER_PORT_MIN}-{self._ROUTER_PORT_MAX}). " + "All ports in range appear to be in use." + ) + + def _allocate_worker_ports(self, num_ports: int) -> list[int]: + """Allocate worker ports globally in fixed range. + + Worker ports are used by nginx to listen and proxy to worker sockets. + They must be unique across all router instances. Range chosen to avoid ephemeral ports. 
+ + Args: + num_ports: Number of worker ports to allocate + + Returns: + List of allocated worker port numbers + """ + allocated = [] + port = self._next_worker_port + max_attempts = (self._WORKER_PORT_MAX - self._WORKER_PORT_MIN + 1) * 2 # Allow wrap-around + attempts = 0 + + while len(allocated) < num_ports and attempts < max_attempts: + # Check if port is already allocated globally + if port in self._allocated_worker_ports: + port += 1 + if port > self._WORKER_PORT_MAX: + port = self._WORKER_PORT_MIN # Wrap around + attempts += 1 + continue + + # Check if port is actually available on the system + if self._is_port_available(port): + allocated.append(port) + self._allocated_worker_ports.add(port) + logger.debug("Allocated worker port %s", port) + port += 1 + if port > self._WORKER_PORT_MAX: + port = self._WORKER_PORT_MIN # Wrap around + else: + logger.debug("Worker port %s is in use, trying next port", port) + port += 1 + if port > self._WORKER_PORT_MAX: + port = self._WORKER_PORT_MIN # Wrap around + + attempts += 1 + + if len(allocated) < num_ports: + # Free up the ports we did allocate + for p in allocated: + self._allocated_worker_ports.discard(p) + raise UnexpectedProxyError( + f"Failed to allocate {num_ports} worker ports in range " + f"({self._WORKER_PORT_MIN}-{self._WORKER_PORT_MAX}). " + f"Only allocated {len(allocated)} ports after {attempts} attempts." 
+ ) + + # Update next worker port for next allocation + self._next_worker_port = port + if self._next_worker_port > self._WORKER_PORT_MAX: + self._next_worker_port = self._WORKER_PORT_MIN # Wrap around + + return allocated + + def _discard_ports(self, urls: list[str]) -> None: + for u in urls: + parsed = urlparse(u) + if parsed.port is not None and parsed.port in self._allocated_worker_ports: + self._allocated_worker_ports.discard(parsed.port) + + def write_global_conf(self) -> None: + conf = read_package_resource("00-log-format.conf") + self.write_conf(conf, "00-log-format.conf") + + def write_router_workers_conf(self, conf: ServiceConfig, allocated_ports: list[int]) -> None: + """Write router workers configuration file (generic).""" + # Pass ports to template + workers_config = generate_router_workers_config(conf, allocated_ports) + workers_conf_name = f"router-workers.{conf.domain}.conf" + self.write_conf(workers_config, workers_conf_name) + + +def generate_router_workers_config(conf: ServiceConfig, allocated_ports: list[int]) -> str: + """Generate router workers configuration (generic, uses router_workers.jinja2 template).""" + template = read_package_resource("router_workers.jinja2") + return jinja2.Template(template).render( + domain=conf.domain, + replicas=conf.replicas, + ports=allocated_ports, + proxy_port=PROXY_PORT_ON_GATEWAY, + ) + + +def read_package_resource(file: str) -> str: + return ( + importlib.resources.files("dstack._internal.proxy.gateway") + .joinpath(f"resources/nginx/{file}") + .read_text() + ) + + +def sudo_write(path: Path, content: str) -> None: + with tempfile.NamedTemporaryFile("w") as temp: + temp.write(content) + temp.flush() + temp.seek(0) + r = subprocess.run(sudo() + ["cp", "-p", temp.name, path], timeout=3) + if r.returncode != 0: + raise UnexpectedProxyError("Failed to copy file as sudo") + + +def sudo_rm(path: Path) -> None: + r = subprocess.run(sudo() + ["rm", path], timeout=3) + if r.returncode != 0: + raise 
UnexpectedProxyError("Failed to remove file as sudo") + + +def sudo() -> list[str]: + """Mocked in tests""" + return ["sudo"] diff --git a/src/dstack/_internal/proxy/gateway/services/registry.py b/src/dstack/_internal/proxy/gateway/services/registry.py new file mode 100644 index 0000000000..f190523a39 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/registry.py @@ -0,0 +1,485 @@ +import asyncio +from asyncio import Lock +from datetime import datetime +from pathlib import Path +from typing import Iterable, Optional + +import dstack._internal.proxy.gateway.schemas.registry as schemas +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType +from dstack._internal.proxy.gateway import models as gateway_models +from dstack._internal.proxy.gateway.const import SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.services.nginx import ( + LimitReqConfig, + LimitReqZoneConfig, + LocationConfig, + ModelEntrypointConfig, + Nginx, + ReplicaConfig, + ServiceConfig, +) +from dstack._internal.proxy.lib import models +from dstack._internal.proxy.lib.const import ROUTER_WHITELISTED_PATHS +from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.services.service_connection import ( + ServiceConnection, + ServiceConnectionPool, +) +from dstack._internal.utils.logging import get_logger + +ACCESS_LOG_PATH = Path("/var/log/nginx/dstack.access.log") +logger = get_logger(__name__) +lock = Lock() + + +async def register_service( + project_name: str, + run_name: str, + domain: str, + https: bool, + rate_limits: tuple[models.RateLimit, ...], + auth: bool, + client_max_body_size: int, + model: Optional[schemas.AnyModel], + ssh_private_key: str, + repo: GatewayProxyRepo, + 
nginx: Nginx, + service_conn_pool: ServiceConnectionPool, + has_router_replica: bool = False, + router: Optional[AnyServiceRouterConfig] = None, +) -> None: + cors_enabled = model is not None and model.type == "chat" and model.format == "openai" + service = models.Service( + project_name=project_name, + run_name=run_name, + domain=domain, + https=https, + rate_limits=rate_limits, + auth=auth, + client_max_body_size=client_max_body_size, + replicas=(), + has_router_replica=has_router_replica, + router=router, + cors_enabled=cors_enabled, + ) + + async with lock: + if await repo.get_service(project_name, run_name) is not None: + raise ProxyError(SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE.format(ref=service.fmt())) + if await repo.get_service_by_domain(domain) is not None: + raise ProxyError(f"Domain name {domain!r} is already taken by another service") + + old_project = await repo.get_project(project_name) + new_project = models.Project(name=project_name, ssh_private_key=ssh_private_key) + if old_project is not None and old_project.ssh_private_key != new_project.ssh_private_key: + logger.warning( + "SSH key for service %s is different from the previous one", service.fmt() + ) + await repo.set_project(new_project) + + logger.debug("Registering service %s", service.fmt()) + + await apply_service( + service=service, + old_service=None, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + await repo.set_service(service) + + if model is not None: + await repo.set_model( + models.ChatModel( + project_name=project_name, + name=model.name, + created_at=datetime.now(), + run_name=run_name, + format_spec=model_schema_to_format_spec(model), + ), + ) + + logger.info("Service %s is registered now", service.fmt()) + + +async def unregister_service( + project_name: str, + run_name: str, + repo: GatewayProxyRepo, + nginx: Nginx, + service_conn_pool: ServiceConnectionPool, +) -> None: + async with lock: + service = await repo.get_service(project_name, run_name) + 
if service is None: + raise ProxyError( + f"Service {project_name}/{run_name} is not registered, cannot unregister" + ) + + logger.debug("Unregistering service %s", service.fmt()) + + await stop_replica_connections( + ids=(r.id for r in service.replicas), + service_conn_pool=service_conn_pool, + ) + await nginx.unregister(service) + await repo.delete_models_by_run(project_name, run_name) + await repo.delete_service(project_name, run_name) + + logger.info("Service %s is unregistered now", service.fmt()) + + +async def register_replica( + project_name: str, + run_name: str, + replica_id: str, + app_port: int, + ssh_destination: str, + ssh_port: int, + ssh_proxy: Optional[SSHConnectionParams], + ssh_proxy_private_key: Optional[str], + ssh_head_proxy: Optional[SSHConnectionParams], + ssh_head_proxy_private_key: Optional[str], + repo: GatewayProxyRepo, + nginx: Nginx, + service_conn_pool: ServiceConnectionPool, + internal_ip: Optional[str] = None, +) -> None: + replica = models.Replica( + id=replica_id, + app_port=app_port, + ssh_destination=ssh_destination, + ssh_port=ssh_port, + ssh_proxy=ssh_proxy, + ssh_proxy_private_key=ssh_proxy_private_key, + ssh_head_proxy=ssh_head_proxy, + ssh_head_proxy_private_key=ssh_head_proxy_private_key, + internal_ip=internal_ip, + ) + + async with lock: + old_service = await repo.get_service(project_name, run_name) + if old_service is None: + raise ProxyError( + f"Service {project_name}/{run_name} does not exist, cannot register replica" + ) + + if old_service.find_replica(replica_id) is not None: + # NOTE: as of 0.19.25, the dstack server relies on the exact text of this error. 
+ # See dstack._internal.server.services.services.register_replica + raise ProxyError(f"Replica {replica_id} already exists in service {old_service.fmt()}") + + service = old_service.with_replicas(old_service.replicas + (replica,)) + + logger.debug("Registering replica %s in service %s", replica.id, service.fmt()) + failures = await apply_service( + service=service, + old_service=old_service, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + if replica in failures: + raise ProxyError( + f"Cannot register replica {replica.id}" + f" in service {service.fmt()}: {failures[replica]}" + ) + await repo.set_service(service) + + logger.info("Replica %s in service %s is registered now", replica.id, service.fmt()) + + +async def unregister_replica( + project_name: str, + run_name: str, + replica_id: str, + repo: GatewayProxyRepo, + nginx: Nginx, + service_conn_pool: ServiceConnectionPool, +) -> None: + async with lock: + old_service = await repo.get_service(project_name, run_name) + if old_service is None: + raise ProxyError( + f"Service {project_name}/{run_name} does not exist, cannot unregister replica" + ) + + replica = old_service.find_replica(replica_id) + if replica is None: + raise ProxyError( + f"Replica {replica_id} does not exist in service {old_service.fmt()}," + " cannot unregister" + ) + + service = old_service.with_replicas(tuple(r for r in old_service.replicas if r != replica)) + + logger.debug("Unregistering replica %s in service %s", replica.id, service.fmt()) + + await apply_service( + service=service, + old_service=old_service, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + await repo.set_service(service) + + logger.info("Replica %s in service %s is unregistered now", replica_id, service.fmt()) + + +async def register_model_entrypoint( + project_name: str, + domain: str, + https: bool, + repo: GatewayProxyRepo, + nginx: Nginx, +) -> None: + entrypoint = gateway_models.ModelEntrypoint( + 
project_name=project_name, + domain=domain, + https=https, + ) + logger.debug("Registering entrypoint %s in project %s", domain, project_name) + await apply_entrypoint(entrypoint, repo, nginx) + await repo.set_entrypoint(entrypoint) + logger.info("Entrypoint %s is now registered in project %s", domain, project_name) + + +def _uses_pd_disaggregation(service: models.Service) -> bool: + """PD disaggregation: router talks to replicas via internal_ip, no SSH tunnels needed.""" + return service.router is not None and service.router.pd_disaggregation + + +async def apply_service( + service: models.Service, + old_service: Optional[models.Service], + repo: GatewayProxyRepo, + nginx: Nginx, + service_conn_pool: ServiceConnectionPool, +) -> dict[models.Replica, BaseException]: + if old_service is not None: + if service.domain != old_service.domain: + raise UnexpectedProxyError( + f"Did not expect service {service.fmt()}" + f" domain name to change ({old_service.domain} -> {service.domain})" + ) + await stop_replica_connections( + ids=( + replica.id for replica in old_service.replicas if replica not in service.replicas + ), + service_conn_pool=service_conn_pool, + ) + if _uses_pd_disaggregation(service): + replica_conns = {} + replica_failures = {} + replica_configs = [ + ReplicaConfig( + id=replica.id, + socket=Path("/dev/null"), + port=replica.app_port, + internal_ip=replica.internal_ip, + ) + for replica in service.replicas + ] + else: + replica_conns, replica_failures = await get_or_add_replica_connections( + service, repo, service_conn_pool + ) + replica_configs = [ + ReplicaConfig( + id=replica.id, + socket=conn.app_socket_path, + port=replica.app_port, + internal_ip=replica.internal_ip, + ) + for replica, conn in replica_conns.items() + ] + service_config = await get_nginx_service_config(service, replica_configs) + await nginx.register(service_config, (await repo.get_config()).acme_settings) + return replica_failures + + +async def get_or_add_replica_connections( + 
service: models.Service, repo: BaseProxyRepo, service_conn_pool: ServiceConnectionPool +) -> tuple[dict[models.Replica, ServiceConnection], dict[models.Replica, BaseException]]: + project = await repo.get_project(service.project_name) + if project is None: + raise UnexpectedProxyError( + f"Project {service.project_name} unexpectedly missing, even though service" + f" {service.fmt()} exists." + ) + replica_conns, replica_failures = {}, {} + tasks = [ + service_conn_pool.get_or_add(project, service, replica) for replica in service.replicas + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + for replica, conn_or_err in zip(service.replicas, results): + if isinstance(conn_or_err, BaseException): + replica_failures[replica] = conn_or_err + logger.warning( + "Failed starting connection to replica %s in service %s: %s", + replica.id, + service.fmt(), + conn_or_err, + ) + else: + replica_conns[replica] = conn_or_err + return replica_conns, replica_failures + + +async def stop_replica_connections( + ids: Iterable[str], service_conn_pool: ServiceConnectionPool +) -> None: + tasks = map(service_conn_pool.remove, ids) + results = await asyncio.gather(*tasks, return_exceptions=True) + for replica_id, exc in zip(ids, results): + if isinstance(exc, Exception): + logger.error("Error stopping connection to replica %s: %s", replica_id, exc) + + +async def get_nginx_service_config( + service: models.Service, replicas: Iterable[ReplicaConfig] +) -> ServiceConfig: + limit_req_zones: list[LimitReqZoneConfig] = [] + locations: list[LocationConfig] = [] + is_router = ( + service.router is not None and service.router.type == RouterType.SGLANG + ) or service.has_router_replica + sglang_limits: dict[str, LimitReqConfig] = {} + sglang_prefix_lengths: dict[str, int] = {} # Track prefix lengths for most-specific selection + + for i, rate_limit in enumerate(service.rate_limits): + zone_name = f"{i}.{service.domain_safe}" + if isinstance(rate_limit.key, 
models.IPAddressPartitioningKey): + key = "$binary_remote_addr" + elif isinstance(rate_limit.key, models.HeaderPartitioningKey): + key = f"$http_{rate_limit.key.header.lower().replace('-', '_')}" + else: + raise TypeError(f"Unexpected key type {type(rate_limit.key)}") + limit_req_zones.append( + LimitReqZoneConfig(name=zone_name, key=key, rpm=round(rate_limit.rps * 60)) + ) + if is_router: + for path in ROUTER_WHITELISTED_PATHS: + if rate_limit.prefix == path or path.startswith(rate_limit.prefix): + # Use the longest prefix if multiple prefixes match the same path + current_prefix_len = len(rate_limit.prefix) + if path not in sglang_limits or current_prefix_len > sglang_prefix_lengths.get( + path, 0 + ): + sglang_limits[path] = LimitReqConfig( + zone=zone_name, burst=rate_limit.burst + ) + sglang_prefix_lengths[path] = current_prefix_len + else: + locations.append( + LocationConfig( + prefix=rate_limit.prefix, + limit_req=LimitReqConfig(zone=zone_name, burst=rate_limit.burst), + ) + ) + + # Add router whitelisted paths as locations + if is_router: + for path in ROUTER_WHITELISTED_PATHS: + # Use prefix match for paths that end with a slash and exact match for paths that don't + if path.endswith("/"): + locations.append(LocationConfig(prefix=path, limit_req=sglang_limits.get(path))) + else: + locations.append( + LocationConfig(prefix=f"= {path}", limit_req=sglang_limits.get(path)) + ) + + # Don't auto-add / location for router-based services (catch-all 403 handles it) + if not any(location.prefix == "/" for location in locations) and not is_router: + locations.append(LocationConfig(prefix="/", limit_req=None)) + return ServiceConfig( + domain=service.domain_safe, + https=service.https_safe, + project_name=service.project_name, + auth=service.auth, + client_max_body_size=service.client_max_body_size, + access_log_path=ACCESS_LOG_PATH, + limit_req_zones=limit_req_zones, + locations=locations, + replicas=sorted(replicas, key=lambda r: r.id), # sort for reproducible 
configs + has_router_replica=service.has_router_replica, + router=service.router, + cors_enabled=service.cors_enabled, + ) + + +async def apply_entrypoint( + entrypoint: gateway_models.ModelEntrypoint, repo: GatewayProxyRepo, nginx: Nginx +) -> None: + config = ModelEntrypointConfig( + domain=entrypoint.domain, + https=entrypoint.https, + project_name=entrypoint.project_name, + ) + acme = (await repo.get_config()).acme_settings + await nginx.register(config, acme) + + +async def _migrate_cors_enabled(repo: GatewayProxyRepo) -> None: + """Migrate services registered before the cors_enabled field was added. + + Old gateway versions didn't persist cors_enabled on services. This derives it + from the associated model's format so that CORS is enabled for openai-format + models on gateway restart without requiring service re-registration. + """ + services = await repo.list_services() + openai_run_names: set[tuple[str, str]] = set() + for service in services: + for model in await repo.list_models(service.project_name): + if model.run_name == service.run_name and isinstance( + model.format_spec, models.OpenAIChatModelFormat + ): + openai_run_names.add((service.project_name, service.run_name)) + for service in services: + if ( + not service.cors_enabled + and (service.project_name, service.run_name) in openai_run_names + ): + updated = models.Service(**{**service.dict(), "cors_enabled": True}) + await repo.set_service(updated) + + +async def apply_all( + repo: GatewayProxyRepo, nginx: Nginx, service_conn_pool: ServiceConnectionPool +) -> None: + await _migrate_cors_enabled(repo) + service_tasks = [ + apply_service( + service=service, + old_service=None, + repo=repo, + nginx=nginx, + service_conn_pool=service_conn_pool, + ) + for service in await repo.list_services() + ] + entrypoint_tasks = [ + apply_entrypoint(entrypoint, repo, nginx) for entrypoint in await repo.list_entrypoints() + ] + results = await asyncio.gather(*service_tasks, *entrypoint_tasks, 
return_exceptions=True) + for exc in results: + if isinstance(exc, Exception): + logger.error("Exception restoring gateway: %s", exc) + + +def model_schema_to_format_spec(model: schemas.AnyModel) -> models.AnyModelFormat: + if model.type == "chat": + if model.format == "openai": + return models.OpenAIChatModelFormat(prefix=model.prefix) + elif model.format == "tgi": + return models.TGIChatModelFormat( + chat_template=model.chat_template, + eos_token=model.eos_token, + ) + else: + raise UnexpectedProxyError(f"Unexpected model format {model.format}") + else: + raise UnexpectedProxyError(f"Unexpected model type {model.type}") diff --git a/src/dstack/_internal/proxy/gateway/services/server_client.py b/src/dstack/_internal/proxy/gateway/services/server_client.py new file mode 100644 index 0000000000..cdcf3aa046 --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/server_client.py @@ -0,0 +1,95 @@ +import datetime +import logging +import random +from dataclasses import dataclass, field +from pathlib import Path +from typing import Container, Dict, Generator, List + +import httpx + +logger = logging.getLogger(__name__) +BASE_URL = "http://dstack/" # any hostname will work + + +@dataclass +class CachedClientInfo: + client: httpx.AsyncClient + socket: Path + connect_errors: List[datetime.datetime] = field(default_factory=lambda: []) + + def seems_disconnected(self) -> bool: + if len(self.connect_errors) < 2: + return False + return self.connect_errors[-1] - self.connect_errors[0] >= datetime.timedelta(minutes=2) + + +class HTTPMultiClient(httpx.AsyncClient): + """ + An HTTP client that sends requests to randomly chosen Unix sockets from a specified + directory. This allows to balance the load between multiple HTTP server replicas. + Automatically deletes sockets that stop responding. + Used for requesting random dstack-server replicas from the gateway. 
+ """ + + def __init__(self, sockets_dir: Path): + super().__init__(base_url=BASE_URL) + self._sockets_dir = sockets_dir.expanduser() + self._clients_cache: Dict[str, CachedClientInfo] = {} + + async def send(self, request: httpx.Request, *args, **kwargs) -> httpx.Response: + errors: List[httpx.RequestError] = [] + clients_count = 0 + + for clients_count, client in enumerate(self._iter_clients_rand(), start=1): + try: + resp = await client.client.send(request, *args, **kwargs) + client.connect_errors = [] + return resp + except httpx.ConnectError: + client.connect_errors.append(datetime.datetime.now()) + if client.seems_disconnected(): + logging.debug( + "Removing socket %s after several failed connection attempts", + client.socket, + ) + client.socket.unlink() + except httpx.RequestError as e: + errors.append(e) + logger.warning("Request failed with socket %s: %r", client.socket, e) + + msg = f"Cannot request {request.url.path}: " + if not clients_count: + msg += f"no sockets found in {self._sockets_dir}" + elif not errors: + msg += f"all {clients_count} socket(s) in {self._sockets_dir} are disconnected" + else: + msg += f"{len(errors)} socket(s) failed. 
Last error: {errors[-1]!r}" + raise httpx.RequestError(msg, request=request) + + def _iter_clients_rand(self) -> Generator[CachedClientInfo, None, None]: + sockets = list(self._sockets_dir.glob("*.sock")) + self._evict_clients(stems_to_keep={s.stem for s in sockets}) + random.shuffle(sockets) + + for socket in sockets: + if socket.stem in self._clients_cache: + cached_client = self._clients_cache[socket.stem] + else: + cached_client = self._clients_cache[socket.stem] = self._make_client(socket) + yield cached_client + + @staticmethod + def _make_client(socket: Path) -> CachedClientInfo: + client = httpx.AsyncClient( + transport=httpx.AsyncHTTPTransport(uds=str(socket.absolute())), + base_url=BASE_URL, + ) + return CachedClientInfo( + client=client, + socket=socket, + ) + + def _evict_clients(self, stems_to_keep: Container[str]) -> None: + self._clients_cache = { + stem: client for stem, client in self._clients_cache.items() if stem in stems_to_keep + } diff --git a/src/dstack/_internal/proxy/gateway/services/stats.py b/src/dstack/_internal/proxy/gateway/services/stats.py new file mode 100644 index 0000000000..358eff451a --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/services/stats.py @@ -0,0 +1,170 @@ +import asyncio +import datetime +import logging +import os +from collections import deque +from collections.abc import Reversible +from pathlib import Path +from typing import Iterable, Optional, TextIO + +from pydantic import BaseModel + +from dstack._internal.proxy.gateway.const import SERVICE_SCALING_WINDOWS +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat +from dstack._internal.proxy.lib.errors import UnexpectedProxyError +from dstack._internal.utils.common import run_async + +logger = logging.getLogger(__name__) +TTL = max(SERVICE_SCALING_WINDOWS) +EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in 
SERVICE_SCALING_WINDOWS} + + +class StatFrame(BaseModel): + """Service metrics aggregated over a 1s frame""" + + timestamp: int + requests: int + requests_time_total: float + + +class LogEntry(BaseModel): + """A line from the access log""" + + timestamp: datetime.datetime + host: str + status: int + request_time: float + is_replica_hit: bool + + +class StatsCollector: + """ + StatCollector parses nginx access log and calculates average request time and requests count. + """ + + def __init__(self, access_log: Path) -> None: + self._path = access_log + self._file: Optional[TextIO] = None + self._stats: dict[str, deque[StatFrame]] = {} + self._lock = asyncio.Lock() + + async def collect(self) -> dict[str, PerWindowStats]: + """ + :return: stats per host aggregated by 30s, 1m, 5m + """ + result = {} + async with self._lock: + await run_async(self._collect) + now = datetime.datetime.now(tz=datetime.timezone.utc) + for host, frames in self._stats.items(): + result[host] = self._aggregate(frames, now) + return result + + @staticmethod + def _aggregate(frames: Reversible[StatFrame], now: datetime.datetime) -> PerWindowStats: + """ + Aggregate 1s `frames` into windows 30s, 1m, 5m before `now` + """ + result = {} + for window in SERVICE_SCALING_WINDOWS: + req_count = 0 + req_time_total = 0.0 + for frame in reversed(frames): + if now.timestamp() - frame.timestamp > window: + break + req_time_total += frame.requests_time_total + req_count += frame.requests + if req_count > 0: + result[window] = Stat( + requests=req_count, + request_time=round(req_time_total / req_count, 3), + ) + else: + result[window] = Stat(requests=0, request_time=0.0) + return result + + def _collect(self) -> None: + now = datetime.datetime.now(tz=datetime.timezone.utc) + + for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)): + # only include requests that hit or should hit a service replica + if not entry.is_replica_hit: + continue + + frame_timestamp = 
int(entry.timestamp.timestamp()) + frames = self._stats.setdefault(entry.host, deque(maxlen=TTL)) + + # presume that log entries are sorted by timestamp + if not frames or frames[-1].timestamp != frame_timestamp: + latest_frame = StatFrame( + timestamp=frame_timestamp, requests=1, requests_time_total=entry.request_time + ) + frames.append(latest_frame) + else: + latest_frame = frames[-1] + latest_frame.requests += 1 + latest_frame.requests_time_total += entry.request_time + + for host in list(self._stats.keys()): + if self._stats[host][-1].timestamp < now.timestamp() - TTL: + del self._stats[host] + + def _read_access_log(self, after: datetime.datetime) -> Iterable[LogEntry]: + try: + st_ino = os.stat(self._path).st_ino + except FileNotFoundError: + st_ino = None + + if self._file is not None: + while True: + line = self._file.readline() + if not line: + break + cells = line.split() + if len(cells) == 4: # compatibility with pre-0.19.11 logs + cells.append("0" if cells[2] in ["403", "404"] else "1") + timestamp_str, host, status, request_time, dstack_replica_hit = cells + timestamp = datetime.datetime.fromisoformat(timestamp_str) + if timestamp < after: + continue + yield LogEntry( + timestamp=timestamp, + host=host, + status=int(status), + request_time=float(request_time), + is_replica_hit=_parse_nginx_bool(dstack_replica_hit), + ) + if os.fstat(self._file.fileno()).st_ino != st_ino: + # file was rotated + self._file.close() + self._file = None + + if self._file is None and st_ino is not None: + logger.info("Opening access log file: %s", self._path) + self._file = open(self._path, "r") + # normally, recursion will not exceed depth of 2 + yield from self._read_access_log(after) + + +async def get_service_stats( + repo: GatewayProxyRepo, collector: StatsCollector +) -> list[ServiceStats]: + stats_per_host = await collector.collect() + services = await repo.list_services() + return [ + ServiceStats( + project_name=service.project_name, + run_name=service.run_name, + 
stats=stats_per_host.get(service.domain_safe, EMPTY_STATS), + ) + for service in services + ] + + +def _parse_nginx_bool(v: str) -> bool: + if v == "0": + return False + if v == "1": + return True + raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}") diff --git a/src/dstack/_internal/proxy/gateway/testing/__init__.py b/src/dstack/_internal/proxy/gateway/testing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/gateway/testing/common.py b/src/dstack/_internal/proxy/gateway/testing/common.py new file mode 100644 index 0000000000..2718ec6bae --- /dev/null +++ b/src/dstack/_internal/proxy/gateway/testing/common.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass +from typing import Union +from unittest.mock import AsyncMock, MagicMock + +AnyMock = Union[MagicMock, AsyncMock] + + +@dataclass +class Mocks: + reload_nginx: AnyMock + run_certbot: AnyMock + open_conn: AnyMock + close_conn: AnyMock diff --git a/src/dstack/_internal/proxy/lib/__init__.py b/src/dstack/_internal/proxy/lib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/auth.py b/src/dstack/_internal/proxy/lib/auth.py new file mode 100644 index 0000000000..382df2c468 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/auth.py @@ -0,0 +1,7 @@ +from abc import ABC, abstractmethod + + +class BaseProxyAuthProvider(ABC): + @abstractmethod + async def is_project_member(self, project_name: str, token: str) -> bool: + pass diff --git a/src/dstack/_internal/proxy/lib/const.py b/src/dstack/_internal/proxy/lib/const.py new file mode 100644 index 0000000000..43ede03ac8 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/const.py @@ -0,0 +1,12 @@ +""" +Shared constants for proxy components (gateway + in-server proxy). +""" + +# Inference endpoints exposed by the in-replica HTTP router. 
Applies to both +# SGLang's router and Dynamo's `dynamo.frontend` — they share the +# OpenAI-compatible endpoint surface. +ROUTER_WHITELISTED_PATHS: tuple[str, ...] = ( + "/generate", + "/v1/", + "/chat/completions", +) diff --git a/src/dstack/_internal/proxy/lib/deps.py b/src/dstack/_internal/proxy/lib/deps.py new file mode 100644 index 0000000000..21528899ce --- /dev/null +++ b/src/dstack/_internal/proxy/lib/deps.py @@ -0,0 +1,106 @@ +from abc import ABC, abstractmethod +from typing import AsyncGenerator, Optional + +from fastapi import Depends, FastAPI, Request, Security, status +from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from typing_extensions import Annotated + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.services.service_connection import ServiceConnectionPool + + +class ProxyDependencyInjector(ABC): + """ + An injector instance stored in FastAPI's app.state.proxy_dependency_injector + configures dstack-proxy to use a specific set of dependencies, e.g. + a specific repo implementation. + """ + + def __init__(self) -> None: + self._service_conn_pool = ServiceConnectionPool() + + # Abstract AsyncGenerator does not need async def since + # type checkers infer a different type without yield in body. 
+ # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators + + @abstractmethod + def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]: + pass + + @abstractmethod + def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]: + pass + + async def get_service_connection_pool(self) -> ServiceConnectionPool: + return self._service_conn_pool + + +def get_injector_from_app(app: FastAPI) -> ProxyDependencyInjector: + injector = app.state.proxy_dependency_injector + if not isinstance(injector, ProxyDependencyInjector): + raise UnexpectedProxyError(f"Unexpected proxy_dependency_injector type {type(injector)}") + return injector + + +async def get_injector(request: Request) -> ProxyDependencyInjector: + return get_injector_from_app(request.app) + + +async def get_proxy_repo( + injector: Annotated[ProxyDependencyInjector, Depends(get_injector)], +) -> AsyncGenerator[BaseProxyRepo, None]: + async for repo in injector.get_repo(): + yield repo + + +async def get_proxy_auth_provider( + injector: Annotated[ProxyDependencyInjector, Depends(get_injector)], +) -> AsyncGenerator[BaseProxyAuthProvider, None]: + async for provider in injector.get_auth_provider(): + yield provider + + +async def get_service_connection_pool( + injector: Annotated[ProxyDependencyInjector, Depends(get_injector)], +) -> ServiceConnectionPool: + return await injector.get_service_connection_pool() + + +class ProxyAuthContext: + def __init__(self, project_name: str, token: Optional[str], provider: BaseProxyAuthProvider): + self._project_name = project_name + self._token = token + self._provider = provider + + async def enforce(self) -> None: + if self._token is None or not await self._provider.is_project_member( + self._project_name, self._token + ): + raise ProxyError( + f"Unauthenticated or unauthorized to access project {self._project_name}", + status.HTTP_403_FORBIDDEN, + ) + + +class ProxyAuth: + def __init__(self, auto_enforce: bool): + 
self._auto_enforce = auto_enforce + + async def __call__( + self, + project_name: str, + token: Annotated[ + Optional[HTTPAuthorizationCredentials], Security(HTTPBearer(auto_error=False)) + ], + provider: Annotated[BaseProxyAuthProvider, Depends(get_proxy_auth_provider)], + ) -> ProxyAuthContext: + context = ProxyAuthContext( + project_name=project_name, + token=token.credentials if token is not None else None, + provider=provider, + ) + if self._auto_enforce: + await context.enforce() + return context diff --git a/src/dstack/_internal/proxy/lib/errors.py b/src/dstack/_internal/proxy/lib/errors.py new file mode 100644 index 0000000000..891099bcd6 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/errors.py @@ -0,0 +1,14 @@ +from fastapi import HTTPException, status + + +class ProxyError(HTTPException): + """Errors in dstack-proxy that are caused by and should be reported to the user""" + + def __init__(self, detail: str, code: int = status.HTTP_400_BAD_REQUEST) -> None: + super().__init__(detail=detail, status_code=code) + + +class UnexpectedProxyError(RuntimeError): + """Internal errors in dstack-proxy that should have never happened""" + + pass diff --git a/src/dstack/_internal/proxy/lib/models.py b/src/dstack/_internal/proxy/lib/models.py new file mode 100644 index 0000000000..f41e87a426 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/models.py @@ -0,0 +1,117 @@ +"""Things stored in BaseProxyRepo implementations.""" + +from datetime import datetime +from typing import Iterable, Literal, Optional, Union + +from pydantic import BaseModel, Field +from typing_extensions import Annotated + +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.routers import AnyServiceRouterConfig +from dstack._internal.proxy.lib.errors import UnexpectedProxyError + + +# Models should be immutable so that they can be stored in memory and safely shared by +# coroutines without copying on every read operation. 
+class ImmutableModel(BaseModel): + class Config: + frozen = True + + +class Replica(ImmutableModel): + id: str + app_port: int + ssh_destination: str + ssh_port: int + ssh_proxy: Optional[SSHConnectionParams] + ssh_proxy_private_key: Optional[str] = None + "`None` means same as service project's key" + # Optional outer proxy, a head node/bastion + ssh_head_proxy: Optional[SSHConnectionParams] = None + ssh_head_proxy_private_key: Optional[str] = None + internal_ip: Optional[str] = None + + +class IPAddressPartitioningKey(ImmutableModel): + type: Literal["ip_address"] = "ip_address" + + +class HeaderPartitioningKey(ImmutableModel): + type: Literal["header"] = "header" + header: Annotated[str, Field(regex=r"^[a-zA-Z0-9-_]+$")] # prevent Nginx config injection + + +class RateLimit(ImmutableModel): + prefix: Annotated[str, Field(regex=r"^/[^\s\\{}]*$")] # prevent Nginx config injection + key: Annotated[ + Union[IPAddressPartitioningKey, HeaderPartitioningKey], + Field(discriminator="type"), + ] + rps: float + burst: int + + +class Service(ImmutableModel): + project_name: str + run_name: str + domain: Optional[str] # only used on gateways + https: Optional[bool] # only used on gateways + rate_limits: tuple[RateLimit, ...] = () # only used on gateways + auth: bool + client_max_body_size: int # only enforced on gateways + strip_prefix: bool = True # only used in-server + replicas: tuple[Replica, ...] 
+ has_router_replica: bool = False + router: Optional[AnyServiceRouterConfig] = None + cors_enabled: bool = False # only used on gateways; enabled for openai-format models + + @property + def domain_safe(self) -> str: + if self.domain is None: + raise UnexpectedProxyError(f"domain unexpectedly missing for service {self.fmt()}") + return self.domain + + @property + def https_safe(self) -> bool: + if self.https is None: + raise UnexpectedProxyError(f"https unexpectedly missing for service {self.fmt()}") + return self.https + + def with_replicas(self, new_replicas: Iterable[Replica]) -> "Service": + return Service(**{**self.dict(), "replicas": tuple(new_replicas)}) + + def find_replica(self, replica_id: str) -> Optional[Replica]: + for replica in self.replicas: + if replica.id == replica_id: + return replica + return None + + def fmt(self) -> str: + return f"{self.project_name}/{self.run_name}" + + +class Project(ImmutableModel): + name: str + ssh_private_key: str + + +class TGIChatModelFormat(ImmutableModel): + format: Literal["tgi"] = "tgi" + chat_template: str + eos_token: str + + +class OpenAIChatModelFormat(ImmutableModel): + format: Literal["openai"] = "openai" + prefix: str + + +AnyModelFormat = Union[TGIChatModelFormat, OpenAIChatModelFormat] + + +class ChatModel(ImmutableModel): + project_name: str + name: str + created_at: datetime + run_name: str + format_spec: Annotated[AnyModelFormat, Field(discriminator="format")] diff --git a/src/dstack/_internal/proxy/lib/repo.py b/src/dstack/_internal/proxy/lib/repo.py new file mode 100644 index 0000000000..24a1c4638c --- /dev/null +++ b/src/dstack/_internal/proxy/lib/repo.py @@ -0,0 +1,27 @@ +from abc import ABC, abstractmethod +from typing import List, Optional + +from dstack._internal.proxy.lib.models import ChatModel, Project, Service + + +class BaseProxyRepo(ABC): + """ + Data access methods relevant for both in-server and gateway environments. + Implementations can have additional environment-specific methods. 
+ """ + + @abstractmethod + async def get_service(self, project_name: str, run_name: str) -> Optional[Service]: + pass + + @abstractmethod + async def list_models(self, project_name: str) -> List[ChatModel]: + pass + + @abstractmethod + async def get_model(self, project_name: str, name: str) -> Optional[ChatModel]: + pass + + @abstractmethod + async def get_project(self, name: str) -> Optional[Project]: + pass diff --git a/src/dstack/_internal/proxy/lib/routers/__init__.py b/src/dstack/_internal/proxy/lib/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/routers/model_proxy.py b/src/dstack/_internal/proxy/lib/routers/model_proxy.py new file mode 100644 index 0000000000..e5a5c4cee3 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/routers/model_proxy.py @@ -0,0 +1,102 @@ +from typing import AsyncIterator, Optional + +from fastapi import APIRouter, Depends, status +from fastapi.responses import StreamingResponse +from typing_extensions import Annotated + +from dstack._internal.proxy.lib.deps import ProxyAuth, get_proxy_repo, get_service_connection_pool +from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.schemas.model_proxy import ( + ChatCompletionsChunk, + ChatCompletionsRequest, + ChatCompletionsResponse, + Model, + ModelsResponse, +) +from dstack._internal.proxy.lib.services.model_proxy.model_proxy import get_chat_client +from dstack._internal.proxy.lib.services.service_connection import ( + ServiceConnectionPool, + get_service_replica_client, +) + +router = APIRouter(dependencies=[Depends(ProxyAuth(auto_enforce=True))]) + + +@router.get("/{project_name}/models") +async def get_models( + project_name: str, repo: Annotated[BaseProxyRepo, Depends(get_proxy_repo)] +) -> ModelsResponse: + models = await repo.list_models(project_name) + data = [ + Model(id=m.name, 
created=int(m.created_at.timestamp()), owned_by=project_name) + for m in models + ] + return ModelsResponse(data=data) + + +@router.post("/{project_name}/chat/completions", response_model=ChatCompletionsResponse) +async def post_chat_completions( + project_name: str, + body: ChatCompletionsRequest, + repo: Annotated[BaseProxyRepo, Depends(get_proxy_repo)], + service_conn_pool: Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +): + model = await repo.get_model(project_name, body.model) + if model is None: + raise ProxyError( + f"Model {body.model} not found in project {project_name}", status.HTTP_404_NOT_FOUND + ) + service = await repo.get_service(project_name, model.run_name) + if service is None or not service.replicas: + raise UnexpectedProxyError( + f"Model {model.name} in project {project_name} references run {model.run_name}" + " that does not exist or has no replicas" + ) + http_client = await get_service_replica_client(service, repo, service_conn_pool) + client = get_chat_client(model, http_client) + if not body.stream: + return await client.generate(body) + else: + return StreamingResponse( + await StreamingAdaptor(client.stream(body)).get_stream(), + media_type="text/event-stream", + headers={"X-Accel-Buffering": "no"}, + ) + + +class StreamingAdaptor: + """ + Converts a stream of ChatCompletionsChunk to an SSE stream. + Also pre-fetches the first chunk **before** starting streaming to downstream, + so that upstream request errors can propagate to the downstream client. 
+ """ + + def __init__(self, stream: AsyncIterator[ChatCompletionsChunk]) -> None: + self._stream = stream + + async def get_stream(self) -> AsyncIterator[bytes]: + try: + first_chunk = await self._stream.__anext__() + except StopAsyncIteration: + first_chunk = None + return self._adaptor(first_chunk) + + async def _adaptor(self, first_chunk: Optional[ChatCompletionsChunk]) -> AsyncIterator[bytes]: + if first_chunk is not None: + yield self._encode_chunk(first_chunk) + + try: + async for chunk in self._stream: + yield self._encode_chunk(chunk) + except ProxyError as e: + # No standard way to report errors while streaming, + # but we'll at least send them as comments + yield f": {e.detail!r}\n\n".encode() # !r to avoid line breaks + return + + yield "data: [DONE]\n\n".encode() + + @staticmethod + def _encode_chunk(chunk: ChatCompletionsChunk) -> bytes: + return f"data:{chunk.json()}\n\n".encode() diff --git a/src/dstack/_internal/proxy/lib/schemas/__init__.py b/src/dstack/_internal/proxy/lib/schemas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/schemas/model_proxy.py b/src/dstack/_internal/proxy/lib/schemas/model_proxy.py new file mode 100644 index 0000000000..e8853a375e --- /dev/null +++ b/src/dstack/_internal/proxy/lib/schemas/model_proxy.py @@ -0,0 +1,77 @@ +from typing import Any, Dict, List, Literal, Optional, Union + +from dstack._internal.core.models.common import CoreModel + + +class ChatMessage(CoreModel): + role: str # TODO(egor-s) types + content: str + + +class ChatCompletionsRequest(CoreModel): + messages: List[ChatMessage] + model: str + frequency_penalty: Optional[float] = 0.0 + logit_bias: Dict[str, float] = {} + max_tokens: Optional[int] = None + n: int = 1 + presence_penalty: float = 0.0 + response_format: Optional[Dict] = None + seed: Optional[int] = None + stop: Optional[Union[str, List[str]]] = None + stream: bool = False + temperature: Optional[float] = 1.0 + top_p: Optional[float] = 
1.0 + tools: List[Any] = [] + tool_choice: Union[Literal["none", "auto"], Dict] = {} + user: Optional[str] = None + + +class ChatCompletionsChoice(CoreModel): + finish_reason: str + index: int + message: ChatMessage + + +class ChatCompletionsChunkChoice(CoreModel): + delta: object + logprobs: object = {} + finish_reason: Optional[str] + index: int + + +class ChatCompletionsUsage(CoreModel): + completion_tokens: int + prompt_tokens: int + total_tokens: int + + +class ChatCompletionsResponse(CoreModel): + id: str + choices: List[ChatCompletionsChoice] + created: int + model: str + system_fingerprint: str = "" + object: Literal["chat.completion"] = "chat.completion" + usage: ChatCompletionsUsage + + +class ChatCompletionsChunk(CoreModel): + id: Optional[str] = None + choices: List[ChatCompletionsChunkChoice] + created: Optional[int] = None + model: str + system_fingerprint: Optional[str] = "" + object: Literal["chat.completion.chunk"] = "chat.completion.chunk" + + +class Model(CoreModel): + object: Literal["model"] = "model" + id: str + created: int + owned_by: str + + +class ModelsResponse(CoreModel): + object: Literal["list"] = "list" + data: List[Model] diff --git a/src/dstack/_internal/proxy/lib/services/__init__.py b/src/dstack/_internal/proxy/lib/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/__init__.py b/src/dstack/_internal/proxy/lib/services/model_proxy/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/clients/base.py b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/base.py new file mode 100644 index 0000000000..cf31cb462f --- /dev/null +++ 
b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/base.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from typing import AsyncIterator + +from dstack._internal.proxy.lib.schemas.model_proxy import ( + ChatCompletionsChunk, + ChatCompletionsRequest, + ChatCompletionsResponse, +) + + +class ChatCompletionsClient(ABC): + @abstractmethod + async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse: + pass + + @abstractmethod + async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]: + yield diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py new file mode 100644 index 0000000000..ecd49823fd --- /dev/null +++ b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py @@ -0,0 +1,67 @@ +from typing import AsyncIterator + +import httpx +from fastapi import status +from pydantic import ValidationError + +from dstack._internal.proxy.lib.errors import ProxyError +from dstack._internal.proxy.lib.schemas.model_proxy import ( + ChatCompletionsChunk, + ChatCompletionsRequest, + ChatCompletionsResponse, +) +from dstack._internal.proxy.lib.services.model_proxy.clients.base import ChatCompletionsClient + + +class OpenAIChatCompletions(ChatCompletionsClient): + def __init__(self, http_client: httpx.AsyncClient, prefix: str): + self._http = http_client + self._prefix = prefix + + async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse: + try: + resp = await self._http.post( + f"{self._prefix}/chat/completions", json=request.dict(exclude_unset=True) + ) + await self._propagate_error(resp) + except httpx.RequestError as e: + raise ProxyError(f"Error requesting model: {e!r}", status.HTTP_502_BAD_GATEWAY) + + try: + return ChatCompletionsResponse.__response__.parse_raw(resp.content) + except ValidationError as e: + raise ProxyError(f"Invalid response from 
model: {e}", status.HTTP_502_BAD_GATEWAY) + + async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]: + try: + async with self._http.stream( + "POST", f"{self._prefix}/chat/completions", json=request.dict(exclude_unset=True) + ) as resp: + await self._propagate_error(resp) + + async for line in resp.aiter_lines(): + if not line.startswith("data:"): + continue + data = line[len("data:") :].strip() + if data == "[DONE]": + break + yield self._parse_chunk_data(data) + except httpx.RequestError as e: + raise ProxyError(f"Error requesting model: {e!r}", status.HTTP_502_BAD_GATEWAY) + + @staticmethod + def _parse_chunk_data(data: str) -> ChatCompletionsChunk: + try: + return ChatCompletionsChunk.__response__.parse_raw(data) + except ValidationError as e: + raise ProxyError(f"Invalid chunk in model stream: {e}", status.HTTP_502_BAD_GATEWAY) + + @staticmethod + async def _propagate_error(resp: httpx.Response) -> None: + """ + Propagates HTTP error by raising ProxyError if status is not 200. + May also raise httpx.RequestError if there are issues reading the response. 
+ """ + if resp.status_code != 200: + resp_body = await resp.aread() + raise ProxyError(resp_body.decode(errors="replace"), code=resp.status_code) diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py new file mode 100644 index 0000000000..70c8683a6c --- /dev/null +++ b/src/dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py @@ -0,0 +1,208 @@ +import datetime +import json +import uuid +from typing import AsyncIterator, Dict, List + +import httpx +import jinja2 +import jinja2.sandbox +from fastapi import status + +from dstack._internal.proxy.lib.errors import ProxyError +from dstack._internal.proxy.lib.schemas.model_proxy import ( + ChatCompletionsChoice, + ChatCompletionsChunk, + ChatCompletionsChunkChoice, + ChatCompletionsRequest, + ChatCompletionsResponse, + ChatCompletionsUsage, + ChatMessage, +) +from dstack._internal.proxy.lib.services.model_proxy.clients.base import ChatCompletionsClient + + +class TGIChatCompletions(ChatCompletionsClient): + # https://huggingface.github.io/text-generation-inference/ + def __init__(self, http_client: httpx.AsyncClient, chat_template: str, eos_token: str): + self.client = http_client + self.eos_token = eos_token + + try: + jinja_env = jinja2.sandbox.ImmutableSandboxedEnvironment( + trim_blocks=True, lstrip_blocks=True + ) + jinja_env.globals["raise_exception"] = raise_exception + self.chat_template = jinja_env.from_string(chat_template) + except jinja2.TemplateError as e: + raise ProxyError(f"Failed to compile chat template: {e}") + + async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse: + payload = self.get_payload(request) + try: + resp = await self.client.post("/generate", json=payload) + await self.propagate_error(resp) + except httpx.RequestError as e: + raise ProxyError(f"Error requesting model: {e!r}", status.HTTP_502_BAD_GATEWAY) + + data = resp.json() + + 
choices = [ + ChatCompletionsChoice( + finish_reason=self.finish_reason(data["details"]["finish_reason"]), + index=0, + message=ChatMessage( + role="assistant", + content=self.trim_stop_tokens( + data["generated_text"], payload["parameters"]["stop"] + ), + ), + ) + ] + completion_tokens = data["details"]["generated_tokens"] + prompt_tokens = len(data["details"]["prefill"]) + + for i, sequence in enumerate(data["details"].get("best_of_sequences", []), start=1): + choices.append( + ChatCompletionsChoice( + finish_reason=self.finish_reason(sequence["finish_reason"]), + index=i, + message=ChatMessage( + role="assistant", + content=self.trim_stop_tokens( + sequence["generated_text"], payload["parameters"]["stop"] + ), + ), + ) + ) + completion_tokens += sequence["generated_tokens"] + + return ChatCompletionsResponse( + id=uuid.uuid4().hex, + choices=choices, + created=int(datetime.datetime.utcnow().timestamp()), + model=request.model, + system_fingerprint=f"fp_{data['details']['seed']}", + usage=ChatCompletionsUsage( + completion_tokens=completion_tokens, + prompt_tokens=prompt_tokens, # TODO(egor-s): do we need to multiply by number of sequences? 
+ total_tokens=completion_tokens + prompt_tokens, + ), + ) + + async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]: + completion_id = uuid.uuid4().hex + created = int(datetime.datetime.utcnow().timestamp()) + + payload = self.get_payload(request) + try: + async with self.client.stream("POST", "/generate_stream", json=payload) as resp: + await self.propagate_error(resp) + async for line in resp.aiter_lines(): + if line.startswith("data:"): + yield self.parse_chunk( + data=json.loads(line[len("data:") :].strip("\n")), + model=request.model, + completion_id=completion_id, + created=created, + ) + except httpx.RequestError as e: + raise ProxyError(f"Error requesting model: {e!r}", status.HTTP_502_BAD_GATEWAY) + + def parse_chunk( + self, data: dict, model: str, completion_id: str, created: int + ) -> ChatCompletionsChunk: + if "error" in data: + raise ProxyError(data["error"]) + chunk = ChatCompletionsChunk( + id=completion_id, + choices=[], + created=created, + model=model, + system_fingerprint="", + ) + if data["details"] is not None: + chunk.choices = [ + ChatCompletionsChunkChoice( + delta={}, + logprobs=None, + finish_reason=self.finish_reason(data["details"]["finish_reason"]), + index=0, + ) + ] + else: + chunk.choices = [ + ChatCompletionsChunkChoice( + delta={"content": data["token"]["text"], "role": "assistant"}, + logprobs=None, + finish_reason=None, + index=0, + ) + ] + return chunk + + def get_payload(self, request: ChatCompletionsRequest) -> Dict: + try: + inputs = self.chat_template.render( + messages=request.messages, + add_generation_prompt=True, + ) + except jinja2.TemplateError as e: + raise ProxyError(f"Failed to render chat template: {e}") + + stop = ([request.stop] if isinstance(request.stop, str) else request.stop) or [] + if self.eos_token not in stop: + stop.append(self.eos_token) + + parameters = { + "do_sample": True, # activate logits sampling + "max_new_tokens": request.max_tokens, + # TODO(egor-s): 
OpenAI parameters do not convert to `repetition_penalty` + # "repetition_penalty": None, + # "return_full_text": False, + "stop": stop, + "seed": request.seed, + "temperature": request.temperature, + # OpenAI doesn't specify `top_k` parameter + # "top_k": None, + # "truncate": None, + # "typical_p": None, + "best_of": request.n, + # "watermark": False, + "details": True, # to get best_of_sequences + "decoder_input_details": not request.stream, + } + if request.top_p < 1.0: + parameters["top_p"] = request.top_p + return { + "inputs": inputs, + "parameters": parameters, + } + + @staticmethod + def finish_reason(reason: str) -> str: + if reason == "stop_sequence" or reason == "eos_token": + return "stop" + if reason == "length": + return "length" + raise ProxyError(f"Unknown finish reason: {reason}") + + @staticmethod + def trim_stop_tokens(text: str, stop_tokens: List[str]) -> str: + for stop_token in stop_tokens: + if text.endswith(stop_token): + return text[: -len(stop_token)] + return text + + @staticmethod + async def propagate_error(resp: httpx.Response) -> None: + """ + Propagates HTTP error by raising ProxyError if status is not 200. + May also raise httpx.RequestError if there are issues reading the response. 
+ """ + if resp.status_code != 200: + resp_body = await resp.aread() + raise ProxyError(resp_body.decode(errors="replace"), code=resp.status_code) + + +def raise_exception(message: str): + raise jinja2.TemplateError(message) diff --git a/src/dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py b/src/dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py new file mode 100644 index 0000000000..0661f79699 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py @@ -0,0 +1,23 @@ +import httpx + +from dstack._internal.proxy.lib.errors import UnexpectedProxyError +from dstack._internal.proxy.lib.models import ChatModel +from dstack._internal.proxy.lib.services.model_proxy.clients.base import ChatCompletionsClient +from dstack._internal.proxy.lib.services.model_proxy.clients.openai import OpenAIChatCompletions +from dstack._internal.proxy.lib.services.model_proxy.clients.tgi import TGIChatCompletions + + +def get_chat_client(model: ChatModel, http_client: httpx.AsyncClient) -> ChatCompletionsClient: + if model.format_spec.format == "tgi": + return TGIChatCompletions( + http_client=http_client, + chat_template=model.format_spec.chat_template, + eos_token=model.format_spec.eos_token, + ) + elif model.format_spec.format == "openai": + return OpenAIChatCompletions( + http_client=http_client, + prefix=model.format_spec.prefix, + ) + else: + raise UnexpectedProxyError(f"Unsupported model format {model.format_spec.format}") diff --git a/src/dstack/_internal/proxy/lib/services/service_connection.py b/src/dstack/_internal/proxy/lib/services/service_connection.py new file mode 100644 index 0000000000..37bdc5083a --- /dev/null +++ b/src/dstack/_internal/proxy/lib/services/service_connection.py @@ -0,0 +1,163 @@ +import asyncio +import os +import random +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Dict, Optional + +import httpx +from httpx import AsyncHTTPTransport + +from 
dstack._internal.core.services.ssh.tunnel import ( + SSH_DEFAULT_OPTIONS, + IPSocket, + SocketPair, + SSHTunnel, + UnixSocket, +) +from dstack._internal.proxy.lib.errors import UnexpectedProxyError +from dstack._internal.proxy.lib.models import Project, Replica, Service +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.env import environ +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import FileContent + +logger = get_logger(__name__) +OPEN_TUNNEL_TIMEOUT = 10 +HTTP_TIMEOUT = environ.get_int("DSTACK_SERVICE_CLIENT_TIMEOUT", default=60) +# Same as default Nginx proxy timeout; override via DSTACK_SERVICE_CLIENT_TIMEOUT + + +class ServiceClient(httpx.AsyncClient): + def build_request(self, *args, **kwargs) -> httpx.Request: + self.cookies.clear() # the client is shared by all users, don't leak cookies + return super().build_request(*args, **kwargs) + + +class ServiceConnection: + def __init__(self, project: Project, service: Service, replica: Replica) -> None: + self._temp_dir = TemporaryDirectory() + options = { + **SSH_DEFAULT_OPTIONS, + "ConnectTimeout": str(OPEN_TUNNEL_TIMEOUT), + "ServerAliveInterval": "60", + } + if service.domain is not None: + # expose socket for Nginx + os.chmod(self._temp_dir.name, 0o755) + options["StreamLocalBindMask"] = "0111" + self._app_socket_path = (Path(self._temp_dir.name) / "replica.sock").absolute() + ssh_proxies = [] + if replica.ssh_head_proxy is not None: + ssh_head_proxy_private_key = get_or_error(replica.ssh_head_proxy_private_key) + ssh_proxies.append((replica.ssh_head_proxy, FileContent(ssh_head_proxy_private_key))) + if replica.ssh_proxy is not None: + if replica.ssh_proxy_private_key is not None: + ssh_proxies.append((replica.ssh_proxy, FileContent(replica.ssh_proxy_private_key))) + else: + ssh_proxies.append((replica.ssh_proxy, None)) + self._tunnel = SSHTunnel( + 
destination=replica.ssh_destination, + port=replica.ssh_port, + ssh_proxies=ssh_proxies, + identity=FileContent(project.ssh_private_key), + forwarded_sockets=[ + SocketPair( + remote=IPSocket("localhost", replica.app_port), + local=UnixSocket(self._app_socket_path), + ), + ], + options=options, + ) + self._client = ServiceClient( + transport=AsyncHTTPTransport(uds=str(self._app_socket_path)), + # The hostname in base_url is there for troubleshooting, as it may appear in + # logs and in the Host header. The actual destination is the Unix socket. + base_url=f"http://{replica.id}-{service.run_name}/", + timeout=HTTP_TIMEOUT, + ) + self._is_open = asyncio.locks.Event() + + @property + def app_socket_path(self) -> Path: + return self._app_socket_path + + async def open(self) -> None: + await self._tunnel.aopen() + self._is_open.set() + + async def close(self) -> None: + self._is_open.clear() + await self._client.aclose() + await self._tunnel.aclose() + + async def client(self) -> ServiceClient: + await asyncio.wait_for(self._is_open.wait(), timeout=OPEN_TUNNEL_TIMEOUT) + return self._client + + +class ServiceConnectionPool: + def __init__(self) -> None: + # TODO(#2238): remove connections to stopped replicas in-server + self.connections: Dict[str, ServiceConnection] = {} + + async def get(self, replica_id: str) -> Optional[ServiceConnection]: + return self.connections.get(replica_id) + + async def get_or_add( + self, project: Project, service: Service, replica: Replica + ) -> ServiceConnection: + connection = self.connections.get(replica.id) + if connection is not None: + return connection + connection = ServiceConnection(project, service, replica) + self.connections[replica.id] = connection + try: + await connection.open() + except BaseException: + self.connections.pop(replica.id, None) + raise + return connection + + async def remove(self, replica_id: str) -> None: + connection = self.connections.pop(replica_id, None) + if connection is not None: + await 
connection.close() + + async def remove_all(self) -> None: + replica_ids = list(self.connections) + results = await asyncio.gather( + *(self.remove(replica_id) for replica_id in replica_ids), return_exceptions=True + ) + for i, exc in enumerate(results): + if isinstance(exc, Exception): + logger.error( + "Error removing connection to service replica %s: %s", replica_ids[i], exc + ) + + +async def get_service_replica_client( + service: Service, repo: BaseProxyRepo, service_conn_pool: ServiceConnectionPool +) -> httpx.AsyncClient: + """ + `service` must have at least one replica + """ + if service.domain is not None: + # Forward to Nginx so that requests are visible to StatsCollector in the access log + return httpx.AsyncClient( + base_url="http://127.0.0.1", + headers={"Host": service.domain}, + timeout=HTTP_TIMEOUT, + ) + # Nginx not available, forward directly to the tunnel + replica = random.choice(service.replicas) + connection = await service_conn_pool.get(replica.id) + if connection is None: + project = await repo.get_project(service.project_name) + if project is None: + raise UnexpectedProxyError( + f"Expected to find project {service.project_name} but could not" + ) + connection = await service_conn_pool.get_or_add(project, service, replica) + return await connection.client() diff --git a/src/dstack/_internal/proxy/lib/testing/__init__.py b/src/dstack/_internal/proxy/lib/testing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/proxy/lib/testing/auth.py b/src/dstack/_internal/proxy/lib/testing/auth.py new file mode 100644 index 0000000000..0a5153ddd7 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/testing/auth.py @@ -0,0 +1,11 @@ +from typing import Container, Optional + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider + + +class ProxyTestAuthProvider(BaseProxyAuthProvider): + def __init__(self, project_to_tokens: Optional[dict[str, Container[str]]] = None) -> None: 
+ self._project_to_tokens = project_to_tokens or {} + + async def is_project_member(self, project_name: str, token: str) -> bool: + return token in self._project_to_tokens.get(project_name, set()) diff --git a/src/dstack/_internal/proxy/lib/testing/common.py b/src/dstack/_internal/proxy/lib/testing/common.py new file mode 100644 index 0000000000..7cd5722e31 --- /dev/null +++ b/src/dstack/_internal/proxy/lib/testing/common.py @@ -0,0 +1,51 @@ +from typing import AsyncGenerator, Optional + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.deps import ProxyDependencyInjector +from dstack._internal.proxy.lib.models import Project, Replica, Service +from dstack._internal.proxy.lib.repo import BaseProxyRepo + + +class ProxyTestDependencyInjector(ProxyDependencyInjector): + def __init__(self, repo: BaseProxyRepo, auth: BaseProxyAuthProvider) -> None: + super().__init__() + self._repo = repo + self._auth = auth + + async def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]: + yield self._repo + + async def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]: + yield self._auth + + +def make_project(name: str) -> Project: + return Project(name=name, ssh_private_key="secret") + + +def make_service( + project_name: str, + run_name: str, + domain: Optional[str] = None, + https: Optional[bool] = None, + auth: bool = False, + strip_prefix: bool = True, +) -> Service: + return Service( + project_name=project_name, + run_name=run_name, + domain=domain, + https=https, + auth=auth, + client_max_body_size=2**20, + strip_prefix=strip_prefix, + replicas=( + Replica( + id="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + app_port=80, + ssh_destination="ubuntu@server", + ssh_port=22, + ssh_proxy=None, + ), + ), + ) diff --git a/src/dstack/_internal/server/alembic.ini b/src/dstack/_internal/server/alembic.ini index 6ff3ef1bca..c4c6840f01 100644 --- a/src/dstack/_internal/server/alembic.ini +++ 
b/src/dstack/_internal/server/alembic.ini @@ -8,7 +8,7 @@ script_location = migrations # Uncomment the line below if you want the files to be prepended with date and time # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file # for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +file_template = %%(year)d/%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d_%%(rev)s_%%(slug)s # sys.path path, will be prepended to sys.path if present. # defaults to the current working directory. @@ -20,7 +20,7 @@ prepend_sys_path = . # installed by adding `alembic[tz]` to the pip requirements # string value is passed to dateutil.tz.gettz() # leave blank for localtime -# timezone = +timezone = utc # max length of characters to apply to the # "slug" field @@ -38,23 +38,19 @@ prepend_sys_path = . # version location specification; This defaults # to alembic/versions. When using multiple version # directories, initial revisions must be specified with --version-path. -# The path separator used here should be the separator specified by "version_path_separator" below. +# The path separator used here should be the separator specified by "path_separator" below. # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions -# version path separator; As mentioned above, this is the character used to split -# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. -# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. -# Valid values for version_path_separator are: +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path. # -# version_path_separator = : -# version_path_separator = ; -# version_path_separator = space -version_path_separator = os # Use os.pathsep. 
Default configuration used for new projects. +# Use os.pathsep. Default configuration used for new projects. +path_separator = os # set to 'true' to search source files recursively # in each "version_locations" directory # new in Alembic version 1.10 -# recursive_version_locations = false +recursive_version_locations = true # the output encoding used when revision files # are written from script.py.mako @@ -68,12 +64,6 @@ version_path_separator = os # Use os.pathsep. Default configuration used for ne # on newly generated revision scripts. See the documentation for further # detail and examples -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - # Logging configuration [loggers] keys = root,sqlalchemy,alembic diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 201482f8df..0f02806aa4 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -1,33 +1,63 @@ +import asyncio +import importlib.resources import os import time +from concurrent.futures import ThreadPoolExecutor from contextlib import asynccontextmanager -from typing import Awaitable, Callable, List +from pathlib import Path +from typing import Annotated, Awaitable, Callable, List, Optional import sentry_sdk -from fastapi import FastAPI, Request, status -from fastapi.responses import JSONResponse, RedirectResponse +from fastapi import Depends, FastAPI, Request, Response, status +from fastapi.datastructures import URL +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.staticfiles import StaticFiles +from packaging.version import Version +from prometheus_client import Counter, Histogram +from dstack._internal import settings as core_settings from dstack._internal.cli.utils.common import console from dstack._internal.core.errors import ForbiddenError, 
ServerClientError from dstack._internal.core.services.configs import update_default_project +from dstack._internal.proxy.lib.deps import get_injector_from_app +from dstack._internal.proxy.lib.routers import model_proxy from dstack._internal.server import settings -from dstack._internal.server.background import start_background_tasks -from dstack._internal.server.db import get_session_ctx, migrate +from dstack._internal.server.background.pipeline_tasks import start_pipeline_tasks +from dstack._internal.server.background.scheduled_tasks import start_scheduled_tasks +from dstack._internal.server.background.scheduled_tasks.probes import PROBES_SCHEDULER +from dstack._internal.server.db import get_db, get_session_ctx, migrate from dstack._internal.server.routers import ( + auth, backends, + events, + exports, + files, + fleets, gateways, + gpus, + imports, + instances, logs, - pools, + metrics, projects, + prometheus, + public_keys, repos, runs, secrets, + server, + sshproxy, + templates, users, volumes, ) from dstack._internal.server.services.config import ServerConfigManager -from dstack._internal.server.services.gateways import gateway_connections_pool, init_gateways +from dstack._internal.server.services.gateways import gateway_connections_pool +from dstack._internal.server.services.locking import advisory_lock_ctx from dstack._internal.server.services.projects import get_or_create_default_project +from dstack._internal.server.services.proxy.deps import ServerProxyDependencyInjector +from dstack._internal.server.services.proxy.routers import service_proxy +from dstack._internal.server.services.runner.pool import instance_connection_pool from dstack._internal.server.services.storage import init_default_storage from dstack._internal.server.services.users import get_or_create_admin_user from dstack._internal.server.settings import ( @@ -37,90 +67,135 @@ SERVER_URL, UPDATE_DEFAULT_PROJECT, ) +from dstack._internal.server.utils import sentry_utils from 
dstack._internal.server.utils.logging import configure_logging from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, check_client_server_compatibility, error_detail, + get_client_version, get_server_client_error_details, ) -from dstack._internal.settings import DSTACK_VERSION +from dstack._internal.utils.common import run_async from dstack._internal.utils.logging import get_logger from dstack._internal.utils.ssh import check_required_ssh_version logger = get_logger(__name__) +# Server HTTP metrics +REQUESTS_TOTAL = Counter( + "dstack_server_requests_total", + "Total number of HTTP requests", + ["method", "endpoint", "http_status", "project_name"], +) +REQUEST_DURATION = Histogram( + "dstack_server_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint", "http_status", "project_name"], +) -def create_app() -> FastAPI: - if settings.SENTRY_DSN is not None: - sentry_sdk.init( - dsn=settings.SENTRY_DSN, - release=DSTACK_VERSION, - environment=settings.SERVER_ENVIRONMENT, - enable_tracing=True, - traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE, - ) - app = FastAPI(docs_url="/api/docs", lifespan=lifespan) +def create_app() -> FastAPI: + app = FastAPI( + docs_url="/api/docs", + lifespan=lifespan, + dependencies=[ + Depends(_check_client_version), + ], + ) + app.state.proxy_dependency_injector = ServerProxyDependencyInjector() return app @asynccontextmanager async def lifespan(app: FastAPI): configure_logging() + if settings.SENTRY_DSN is not None: + sentry_sdk.init( + dsn=settings.SENTRY_DSN, + release=core_settings.DSTACK_VERSION, + environment=settings.SERVER_ENVIRONMENT, + enable_tracing=True, + traces_sampler=sentry_utils.sentry_traces_sampler, + profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE, + before_send=sentry_utils.AsyncioCancelledErrorFilterEventProcessor(), + ) + server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS) + 
asyncio.get_running_loop().set_default_executor(server_executor) await migrate() + _print_dstack_logo() + if not check_required_ssh_version(): + logger.warning("OpenSSH 8.4+ is required. The dstack server may not work properly") + server_config_manager = None + server_config_loaded = False + if settings.SERVER_CONFIG_ENABLED: + server_config_manager = ServerConfigManager() + server_config_loaded = server_config_manager.load_config() + # Encryption has to be configured before working with users and projects + await server_config_manager.apply_encryption() async with get_session_ctx() as session: - console.print( - """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮ -╱╱┃┃╱╭╯╰╮╱╱╱╱╱┃┃ -╭━╯┣━┻╮╭╋━━┳━━┫┃╭╮ -┃╭╮┃━━┫┃┃╭╮┃╭━┫╰╯╯ -┃╰╯┣━━┃╰┫╭╮┃╰━┫╭╮╮ -╰━━┻━━┻━┻╯╰┻━━┻╯╰╯ -╭━━┳━━┳━┳╮╭┳━━┳━╮ -┃━━┫┃━┫╭┫╰╯┃┃━┫╭╯ -┣━━┃┃━┫┃╰╮╭┫┃━┫┃ -╰━━┻━━┻╯╱╰╯╰━━┻╯ -[/]""" - ) - admin, _ = await get_or_create_admin_user(session=session) - default_project, project_created = await get_or_create_default_project( - session=session, user=admin - ) - if not check_required_ssh_version(): - logger.warning("OpenSSH 8.4+ is required. 
The dstack server may not work properly") - if settings.SERVER_CONFIG_ENABLED: - server_config_manager = ServerConfigManager() - config_loaded = server_config_manager.load_config() - server_config_dir = str(SERVER_CONFIG_FILE_PATH).replace( - os.path.expanduser("~"), "~", 1 + async with advisory_lock_ctx( + bind=session, + dialect_name=get_db().dialect_name, + resource="server_init", + ): + admin, _ = await get_or_create_admin_user(session=session) + await get_or_create_default_project( + session=session, + user=admin, ) - if not config_loaded: - logger.info("Initializing the default configuration...", {"show_path": False}) - await server_config_manager.init_config(session=session) - logger.info( - f"Initialized the default configuration at [link=file://{SERVER_CONFIG_FILE_PATH}]{server_config_dir}[/link]", - {"show_path": False}, - ) - else: - logger.info( - f"Applying [link=file://{SERVER_CONFIG_FILE_PATH}]{server_config_dir}[/link]...", - {"show_path": False}, - ) + if server_config_manager is not None: + server_config_dir = _get_server_config_dir() + if not server_config_loaded: + logger.info("Initializing the default configuration...", {"show_path": False}) + await server_config_manager.init_config(session=session) + logger.info( + f"Initialized the default configuration at [link=file://{SERVER_CONFIG_FILE_PATH}]{server_config_dir}[/link]", + {"show_path": False}, + ) + else: + logger.info( + f"Applying [link=file://{SERVER_CONFIG_FILE_PATH}]{server_config_dir}[/link]...", + {"show_path": False}, + ) + await server_config_manager.apply_config(session=session, owner=admin) - await server_config_manager.apply_config(session=session, owner=admin) - await init_gateways(session=session) update_default_project( project_name=DEFAULT_PROJECT_NAME, url=SERVER_URL, - token=admin.token, - default=UPDATE_DEFAULT_PROJECT, - no_default=DO_NOT_UPDATE_DEFAULT_PROJECT, + token=admin.token.get_plaintext_or_error(), + yes=UPDATE_DEFAULT_PROJECT, + 
no=DO_NOT_UPDATE_DEFAULT_PROJECT, ) - if settings.SERVER_BUCKET is not None: + if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None: init_default_storage() - scheduler = start_background_tasks() - dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)" - logger.info(f"The admin token is {admin.token}", {"show_path": False}) + if settings.SERVER_SSH_POOL_ENABLED: + await run_async(instance_connection_pool.startup_cleanup) + else: + logger.info("Server SSH pool is disabled") + scheduler = None + pipeline_manager = None + if settings.SERVER_BACKGROUND_PROCESSING_ENABLED: + scheduler = start_scheduled_tasks() + pipeline_manager = start_pipeline_tasks() + app.state.pipeline_manager = pipeline_manager + else: + logger.info("Background processing is disabled") + PROBES_SCHEDULER.start() + dstack_version = ( + core_settings.DSTACK_VERSION if core_settings.DSTACK_VERSION else "(no version)" + ) + job_network_mode_log = ( + logger.info + if settings.JOB_NETWORK_MODE != settings.DEFAULT_JOB_NETWORK_MODE + else logger.debug + ) + job_network_mode_log( + "Job network mode: %s (%d)", + settings.JOB_NETWORK_MODE.name, + settings.JOB_NETWORK_MODE.value, + ) + logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False}) logger.info( f"The dstack server {dstack_version} is running at {SERVER_URL}", {"show_path": False}, @@ -128,8 +203,23 @@ async def lifespan(app: FastAPI): for func in _ON_STARTUP_HOOKS: await func(app) yield - scheduler.shutdown() + PROBES_SCHEDULER.shutdown(wait=False) + if pipeline_manager is not None: + pipeline_manager.shutdown() + if scheduler is not None: + # Note: Scheduler does not cancel currently running jobs, so scheduled tasks cannot do cleanup. + # TODO: Track and cancel scheduled tasks. 
+ scheduler.shutdown() + if pipeline_manager is not None: + await pipeline_manager.drain() await gateway_connections_pool.remove_all() + service_conn_pool = await get_injector_from_app(app).get_service_connection_pool() + await service_conn_pool.remove_all() + if settings.SERVER_SSH_POOL_ENABLED: + await run_async(instance_connection_pool.close_all) + await get_db().engine.dispose() + # Let checked-out DB connections close as dispose() only closes checked-in connections + await asyncio.sleep(3) _ON_STARTUP_HOOKS = [] @@ -146,64 +236,212 @@ def add_no_api_version_check_routes(paths: List[str]): _NO_API_VERSION_CHECK_ROUTES.extend(paths) -def register_routes(app: FastAPI): +def register_routes(app: FastAPI, ui: bool = True): + app.include_router(server.router) app.include_router(users.router) + app.include_router(auth.router) app.include_router(projects.router) - app.include_router(pools.root_router) - app.include_router(pools.router) app.include_router(backends.root_router) app.include_router(backends.project_router) + app.include_router(fleets.root_router) + app.include_router(fleets.project_router) + app.include_router(instances.root_router) + app.include_router(instances.project_router) app.include_router(repos.router) app.include_router(runs.root_router) app.include_router(runs.project_router) + app.include_router(gpus.project_router) + app.include_router(metrics.router) app.include_router(logs.router) app.include_router(secrets.router) app.include_router(gateways.router) - app.include_router(volumes.router) + app.include_router(volumes.root_router) + app.include_router(volumes.project_router) + app.include_router(service_proxy.router, prefix="/proxy/services", tags=["proxy"]) + app.include_router(model_proxy.router, prefix="/proxy/models", tags=["proxy"], deprecated=True) + app.include_router(prometheus.router) + app.include_router(files.router) + app.include_router(events.root_router) + app.include_router(templates.router) + 
app.include_router(exports.project_router) + app.include_router(imports.project_router) + app.include_router(sshproxy.router) + app.include_router(public_keys.router) @app.exception_handler(ForbiddenError) async def forbidden_error_handler(request: Request, exc: ForbiddenError): - return JSONResponse( + msg = "Access denied" + if len(exc.args) > 0: + msg = exc.args[0] + return CustomORJSONResponse( status_code=status.HTTP_403_FORBIDDEN, - content=error_detail("Access denied"), + content=error_detail(msg), ) @app.exception_handler(ServerClientError) async def server_client_error_handler(request: Request, exc: ServerClientError): - return JSONResponse( + return CustomORJSONResponse( status_code=status.HTTP_400_BAD_REQUEST, content={"detail": get_server_client_error_details(exc)}, ) + @app.exception_handler(OSError) + async def os_error_handler(request, exc: OSError): + if exc.errno in [36, 63]: + return CustomORJSONResponse( + {"detail": "Filename too long"}, + status_code=status.HTTP_400_BAD_REQUEST, + ) + raise exc + @app.middleware("http") async def log_request(request: Request, call_next): start_time = time.time() - response = await call_next(request) + response: Response = await call_next(request) process_time = time.time() - start_time + # log process_time to be used in the log_http_metrics middleware + request.state.process_time = process_time logger.debug( - "Processed request %s %s in %s", request.method, request.url, f"{process_time:0.6f}s" + "Processed request %s %s in %s. 
Status: %s", + request.method, + request.url, + f"{process_time:0.6f}s", + response.status_code, ) return response + if settings.SERVER_PROFILING_ENABLED: + from pyinstrument import Profiler + + @app.middleware("http") + async def profile_request(request: Request, call_next): + profiling = request.query_params.get("profile", False) + if profiling: + profiler = Profiler() + profiler.start() + respone = await call_next(request) + profiler.stop() + with open("profiling_results.html", "w+") as f: + f.write(profiler.output_html()) + return respone + else: + return await call_next(request) + + # this middleware must be defined after the log_request middleware @app.middleware("http") - async def check_client_version(request: Request, call_next): - if ( - not request.url.path.startswith("/api/") - or request.url.path in _NO_API_VERSION_CHECK_ROUTES - ): - return await call_next(request) - response = check_client_server_compatibility( - client_version=request.headers.get("x-api-version"), - server_version=DSTACK_VERSION, - ) - if response is not None: - return response - return await call_next(request) + async def log_http_metrics(request: Request, call_next): + def _extract_project_name(request: Request): + project_name = None + prefix = "/api/project/" + if request.url.path.startswith(prefix): + rest = request.url.path[len(prefix) :] + project_name = rest.split("/", 1)[0] if rest else None + + return project_name + + def _extract_endpoint_label(request: Request, response: Response) -> str: + route = request.scope.get("route") + route_path = getattr(route, "path", None) + if route_path: + return route_path + if not request.url.path.startswith("/api/"): + return "__non_api__" + if response.status_code == status.HTTP_404_NOT_FOUND: + return "__not_found__" + return "__unmatched__" + + project_name = _extract_project_name(request) + response: Response = await call_next(request) + endpoint_label = _extract_endpoint_label(request, response) + + REQUEST_DURATION.labels( + 
method=request.method, + endpoint=endpoint_label, + http_status=response.status_code, + project_name=project_name, + ).observe(request.state.process_time) + + REQUESTS_TOTAL.labels( + method=request.method, + endpoint=endpoint_label, + http_status=response.status_code, + project_name=project_name, + ).inc() + return response @app.get("/healthcheck") async def healthcheck(): - return JSONResponse(content={"status": "running"}) + return CustomORJSONResponse(content={"status": "running"}) + + if ui and Path(__file__).parent.joinpath("statics").exists(): + app.mount( + "/", StaticFiles(packages=["dstack._internal.server"], html=True), name="statics" + ) + + @app.exception_handler(404) + async def custom_http_exception_handler(request, exc): + if ( + request.url.path.startswith("/api") + or _is_proxy_request(request) + or _is_prometheus_request(request) + ): + return CustomORJSONResponse( + {"detail": exc.detail}, + status_code=status.HTTP_404_NOT_FOUND, + ) + else: + return HTMLResponse( + importlib.resources.files("dstack._internal.server") + .joinpath("statics/index.html") + .read_text() + ) + + else: + + @app.get("/") + async def index(): + return RedirectResponse("/api/docs") + + +def _check_client_version( + request: Request, client_version: Annotated[Optional[Version], Depends(get_client_version)] +) -> None: + if ( + request.url.path.startswith("/api/") + and request.url.path not in _NO_API_VERSION_CHECK_ROUTES + ): + check_client_server_compatibility( + client_version=client_version, + server_version=core_settings.DSTACK_VERSION, + ) + + +def _is_proxy_request(request: Request) -> bool: + if request.url.path.startswith("/proxy"): + return True + # Attempt detecting requests originating from services proxied by dstack-proxy. + # Such requests can "leak" to dstack server paths if the service does not support + # running under a path prefix properly. 
+ referrer = URL(request.headers.get("Referer", "")) + return ( + referrer.netloc == "" or referrer.netloc == request.url.netloc + ) and referrer.path.startswith("/proxy") + + +def _is_prometheus_request(request: Request) -> bool: + return request.url.path.startswith("/metrics") + + +def _print_dstack_logo(): + console.print( + r"""[purple] _ _ _ + __| |___| |_ __ _ ___| | __ ___ ___ _ ____ _____ _ __ + / _` / __| __/ _` |/ __| |/ / / __|/ _ \ '__\ \ / / _ \ '__| +| (_| \__ \ || (_| | (__| < \__ \ __/ | \ V / __/ | + \__,_|___/\__\__,_|\___|_|\_\ |___/\___|_| \_/ \___|_| +[/]""" + ) + - @app.get("/") - async def index(): - return RedirectResponse("/api/docs") +def _get_server_config_dir() -> str: + return str(SERVER_CONFIG_FILE_PATH).replace(os.path.expanduser("~"), "~", 1) diff --git a/src/dstack/_internal/server/background/__init__.py b/src/dstack/_internal/server/background/__init__.py index 11c693ffa5..e69de29bb2 100644 --- a/src/dstack/_internal/server/background/__init__.py +++ b/src/dstack/_internal/server/background/__init__.py @@ -1,36 +0,0 @@ -from apscheduler.schedulers.asyncio import AsyncIOScheduler -from apscheduler.triggers.interval import IntervalTrigger - -from dstack._internal.server.background.tasks.process_gateways import ( - process_gateways_connections, - process_submitted_gateways, -) -from dstack._internal.server.background.tasks.process_instances import ( - process_instances, -) -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs -from dstack._internal.server.background.tasks.process_runs import process_runs -from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs -from dstack._internal.server.background.tasks.process_terminating_jobs import ( - process_terminating_jobs, -) -from dstack._internal.server.background.tasks.process_volumes import process_submitted_volumes - -_scheduler = AsyncIOScheduler() - - -def get_scheduler() -> AsyncIOScheduler: - 
return _scheduler - - -def start_background_tasks() -> AsyncIOScheduler: - _scheduler.add_job(process_submitted_jobs, IntervalTrigger(seconds=2)) - _scheduler.add_job(process_running_jobs, IntervalTrigger(seconds=2)) - _scheduler.add_job(process_terminating_jobs, IntervalTrigger(seconds=2)) - _scheduler.add_job(process_instances, IntervalTrigger(seconds=10)) - _scheduler.add_job(process_runs, IntervalTrigger(seconds=1)) - _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) - _scheduler.add_job(process_submitted_gateways, IntervalTrigger(seconds=10), max_instances=5) - _scheduler.add_job(process_submitted_volumes, IntervalTrigger(seconds=5)) - _scheduler.start() - return _scheduler diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py new file mode 100644 index 0000000000..a5e5164792 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -0,0 +1,109 @@ +import asyncio + +from dstack._internal.server.background.pipeline_tasks.base import Pipeline +from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupPipeline +from dstack._internal.server.background.pipeline_tasks.fleets import FleetPipeline +from dstack._internal.server.background.pipeline_tasks.gateways import GatewayPipeline +from dstack._internal.server.background.pipeline_tasks.instances import InstancePipeline +from dstack._internal.server.background.pipeline_tasks.jobs_running import JobRunningPipeline +from dstack._internal.server.background.pipeline_tasks.jobs_submitted import ( + JobSubmittedPipeline, +) +from dstack._internal.server.background.pipeline_tasks.jobs_terminating import ( + JobTerminatingPipeline, +) +from dstack._internal.server.background.pipeline_tasks.placement_groups import ( + PlacementGroupPipeline, +) +from dstack._internal.server.background.pipeline_tasks.runs import RunPipeline +from 
dstack._internal.server.background.pipeline_tasks.service_router_worker_sync import ( + ServiceRouterWorkerSyncPipeline, +) +from dstack._internal.server.background.pipeline_tasks.volumes import VolumePipeline +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class PipelineManager: + def __init__(self) -> None: + self._pipelines: list[Pipeline] = [] + self._hinter = PipelineHinter() + for builtin_pipeline in [ + ComputeGroupPipeline(pipeline_hinter=self._hinter), + FleetPipeline(pipeline_hinter=self._hinter), + GatewayPipeline(pipeline_hinter=self._hinter), + JobSubmittedPipeline(pipeline_hinter=self._hinter), + JobRunningPipeline(pipeline_hinter=self._hinter), + JobTerminatingPipeline(pipeline_hinter=self._hinter), + InstancePipeline(pipeline_hinter=self._hinter), + PlacementGroupPipeline(pipeline_hinter=self._hinter), + RunPipeline(pipeline_hinter=self._hinter), + ServiceRouterWorkerSyncPipeline(pipeline_hinter=self._hinter), + VolumePipeline(pipeline_hinter=self._hinter), + ]: + self.register_pipeline(builtin_pipeline) + + def register_pipeline(self, pipeline: Pipeline): + self._pipelines.append(pipeline) + self._hinter.register_pipeline(pipeline) + + def start(self): + for pipeline in self._pipelines: + pipeline.start() + + def shutdown(self): + for pipeline in self._pipelines: + pipeline.shutdown() + + async def drain(self): + results = await asyncio.gather( + *[p.drain() for p in self._pipelines], return_exceptions=True + ) + for pipeline, result in zip(self._pipelines, results): + if isinstance(result, BaseException): + logger.error( + "Unexpected exception when draining pipeline %r", + pipeline, + exc_info=(type(result), result, result.__traceback__), + ) + + @property + def hinter(self): + return self._hinter + + +class PipelineHinter: + def __init__(self) -> None: + self._hint_fetch_map: dict[str, list[Pipeline]] = {} + + def register_pipeline(self, pipeline: Pipeline): + 
self._hint_fetch_map.setdefault(pipeline.hint_fetch_model_name, []).append(pipeline) + + def hint_fetch(self, model_name: str): + pipelines = self._hint_fetch_map.get(model_name) + if pipelines is None: + logger.warning("Model %s not registered for fetch hints", model_name) + return + for pipeline in pipelines: + pipeline.hint_fetch() + + +_pipeline_manager = None + + +def get_pipeline_manager() -> PipelineManager: + global _pipeline_manager + if _pipeline_manager is None: + _pipeline_manager = PipelineManager() + return _pipeline_manager + + +def start_pipeline_tasks() -> PipelineManager: + """ + Start tasks processed by fetch-workers pipelines based on db + in-memory queues. + Suitable for tasks that run frequently and need to lock rows for a long time. + """ + pipeline_manager = get_pipeline_manager() + pipeline_manager.start() + return pipeline_manager diff --git a/src/dstack/_internal/server/background/pipeline_tasks/base.py b/src/dstack/_internal/server/background/pipeline_tasks/base.py new file mode 100644 index 0000000000..fa4f997850 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/base.py @@ -0,0 +1,483 @@ +import asyncio +import logging +import math +import random +import time +import uuid +from abc import ABC, abstractmethod +from collections.abc import Iterable, Sequence +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import ( + Any, + ClassVar, + Final, + Generic, + Optional, + Protocol, + TypedDict, + TypeVar, + Union, +) + +from sqlalchemy import and_, or_, update +from sqlalchemy.orm import Mapped + +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class PipelineItem: + """ + Pipelines can work with this class or its subclass if the 
worker needs to access extra attributes. + """ + + __tablename__: str + id: uuid.UUID + lock_expires_at: datetime + lock_token: uuid.UUID + prev_lock_expired: bool + + +ItemT = TypeVar("ItemT", bound=PipelineItem) + + +class PipelineModel(Protocol): + """ + Heartbeater can work with any DB model implementing this protocol. + """ + + __tablename__: str + __mapper__: ClassVar[Any] + __table__: ClassVar[Any] + id: Mapped[uuid.UUID] + lock_expires_at: Mapped[Optional[datetime]] + lock_token: Mapped[Optional[uuid.UUID]] + + +class PipelineError(Exception): + pass + + +class Pipeline(Generic[ItemT], ABC): + def __init__( + self, + workers_num: int, + queue_lower_limit_factor: float, + queue_upper_limit_factor: float, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeat_trigger: timedelta, + ) -> None: + self._workers_num = workers_num + self._queue_lower_limit_factor = queue_lower_limit_factor + self._queue_upper_limit_factor = queue_upper_limit_factor + self._queue_desired_minsize = math.ceil(workers_num * queue_lower_limit_factor) + self._queue_maxsize = math.ceil(workers_num * queue_upper_limit_factor) + self._min_processing_interval = min_processing_interval + self._lock_timeout = lock_timeout + self._heartbeat_trigger = heartbeat_trigger + self._queue = asyncio.Queue[ItemT](maxsize=self._queue_maxsize) + self._tasks: list[asyncio.Task] = [] + self._running = False + self._shutdown = False + + def start(self): + """ + Starts all pipeline tasks. + """ + if self._running: + return + if self._shutdown: + raise PipelineError("Cannot start pipeline after shutdown.") + self._running = True + self._tasks.append(asyncio.create_task(self._heartbeater.start())) + for worker in self._workers: + self._tasks.append(asyncio.create_task(worker.start())) + self._tasks.append(asyncio.create_task(self._fetcher.start())) + + def shutdown(self): + """ + Stops the pipeline from processing new items and signals running tasks to cancel. 
+ """ + if self._shutdown: + return + self._shutdown = True + self._running = False + self._fetcher.stop() + for worker in self._workers: + worker.stop() + self._heartbeater.stop() + for task in self._tasks: + if not task.done(): + task.cancel() + + async def drain(self): + """ + Waits for all pipeline tasks to finish cleanup after shutdown. + """ + if not self._shutdown: + raise PipelineError("Cannot drain running pipeline. Call `shutdown()` first.") + results = await asyncio.gather(*self._tasks, return_exceptions=True) + for task, result in zip(self._tasks, results): + if ( + isinstance(result, BaseException) + and not isinstance(result, asyncio.CancelledError) + and not isinstance( + result, + asyncio.TimeoutError, # At least on Python 3.9 a task may raise TimeoutError from CancelledError. + ) + ): + logger.error( + "Unexpected exception when draining pipeline task %r", + task, + exc_info=(type(result), result, result.__traceback__), + ) + + def hint_fetch(self): + self._fetcher.hint() + + @property + @abstractmethod + def hint_fetch_model_name(self) -> str: + pass + + @property + @abstractmethod + def _heartbeater(self) -> "Heartbeater[ItemT]": + pass + + @property + @abstractmethod + def _fetcher(self) -> "Fetcher[ItemT]": + pass + + @property + @abstractmethod + def _workers(self) -> Sequence["Worker[ItemT]"]: + pass + + +class Heartbeater(Generic[ItemT]): + def __init__( + self, + model_type: type[PipelineModel], + lock_timeout: timedelta, + heartbeat_trigger: timedelta, + heartbeat_delay: float = 1.0, + ) -> None: + self._model_type = model_type + self._lock_timeout = lock_timeout + self._hearbeat_margin = heartbeat_trigger + self._items: dict[uuid.UUID, ItemT] = {} + self._untrack_lock = asyncio.Lock() + self._heartbeat_delay = heartbeat_delay + self._running = False + + async def start(self): + self._running = True + while self._running: + try: + await self.heartbeat() + except Exception: + logger.exception("Unexpected exception when running heartbeat") + 
await asyncio.sleep(self._heartbeat_delay) + + def stop(self): + self._running = False + + async def track(self, item: ItemT): + self._items[item.id] = item + + async def untrack(self, item: ItemT): + async with self._untrack_lock: + tracked = self._items.get(item.id) + # Prevent expired fetch iteration to unlock item processed by new iteration. + if tracked is not None and tracked.lock_token == item.lock_token: + del self._items[item.id] + + async def heartbeat(self): + items_to_update: list[ItemT] = [] + now = get_current_datetime() + items = list(self._items.values()) + failed_to_heartbeat_count = 0 + for item in items: + if item.lock_expires_at < now: + failed_to_heartbeat_count += 1 + await self.untrack(item) + elif item.lock_expires_at < now + self._hearbeat_margin: + items_to_update.append(item) + if failed_to_heartbeat_count > 0: + logger.warning( + "Failed to heartbeat %d %s items in time." + " The items are expected to be processed on another fetch iteration.", + failed_to_heartbeat_count, + self._model_type.__tablename__, + ) + if len(items_to_update) == 0: + return + logger.debug( + "Updating lock_expires_at for items: %s", [str(r.id) for r in items_to_update] + ) + async with get_session_ctx() as session: + per_item_filters = [ + and_( + self._model_type.id == item.id, self._model_type.lock_token == item.lock_token + ) + for item in items_to_update + ] + res = await session.execute( + update(self._model_type) + .where(or_(*per_item_filters)) + .values(lock_expires_at=now + self._lock_timeout) + .returning(self._model_type.id) + ) + updated_ids = set(res.scalars().all()) + failed_to_update_count = 0 + for item in items_to_update: + if item.id in updated_ids: + item.lock_expires_at = now + self._lock_timeout + else: + failed_to_update_count += 1 + await self.untrack(item) + if failed_to_update_count > 0: + logger.warning( + "Failed to update %s lock_expires_at of %d items: lock_token changed." 
+ " The items are expected to be processed and updated on another fetch iteration.", + self._model_type.__tablename__, + failed_to_update_count, + ) + + +class Fetcher(Generic[ItemT], ABC): + _DEFAULT_FETCH_DELAYS = [0.5, 1, 2, 5] + """Increasing fetch delays on empty fetches to avoid frequent selects on low-activity/low-resource servers.""" + + def __init__( + self, + queue: asyncio.Queue[ItemT], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[ItemT], + queue_check_delay: float = 1.0, + fetch_delays: Optional[list[float]] = None, + ) -> None: + self._queue = queue + self._queue_desired_minsize = queue_desired_minsize + self._min_processing_interval = min_processing_interval + self._lock_timeout = lock_timeout + self._heartbeater = heartbeater + self._queue_check_delay = queue_check_delay + if fetch_delays is None: + fetch_delays = self._DEFAULT_FETCH_DELAYS + self._fetch_delays = fetch_delays + self._running = False + self._fetch_event = asyncio.Event() + + async def start(self): + self._running = True + empty_fetch_count = 0 + while self._running: + if self._queue.qsize() >= self._queue_desired_minsize: + await asyncio.sleep(self._queue_check_delay) + continue + fetch_limit = self._queue.maxsize - self._queue.qsize() + try: + items = await self.fetch(limit=fetch_limit) + except Exception: + logger.exception("Unexpected exception when fetching new items") + items = [] + if len(items) == 0: + try: + await asyncio.wait_for( + self._fetch_event.wait(), + timeout=self._next_fetch_delay(empty_fetch_count), + ) + except ( + asyncio.TimeoutError, # < Python 3.11 + TimeoutError, # >= Python 3.11 + ): + pass + empty_fetch_count += 1 + self._fetch_event.clear() + continue + else: + empty_fetch_count = 0 + for item in items: + self._queue.put_nowait(item) # should never raise + await self._heartbeater.track(item) + + def stop(self): + self._running = False + + def hint(self): + self._fetch_event.set() 
+ + @abstractmethod + async def fetch(self, limit: int) -> list[ItemT]: + pass + + def _next_fetch_delay(self, empty_fetch_count: int) -> float: + effective_empty_fetch_count = empty_fetch_count + if random.random() < 0.1: + # Empty fetch count can be 0 not because there are no items in the DB, + # but for other reasons such as waiting parent resource processing. + # From time to time, force minimal next delay to avoid empty results due to rare fetches. + effective_empty_fetch_count = 0 + next_delay = self._fetch_delays[ + min(effective_empty_fetch_count, len(self._fetch_delays) - 1) + ] + jitter = random.random() * 0.4 - 0.2 + return next_delay * (1 + jitter) + + +class Worker(Generic[ItemT], ABC): + def __init__( + self, + queue: asyncio.Queue[ItemT], + heartbeater: Heartbeater[ItemT], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + self._queue = queue + self._heartbeater = heartbeater + self._pipeline_hinter = pipeline_hinter + self._running = False + + async def start(self): + self._running = True + while self._running: + item = await self._queue.get() + start_time = time.time() + logger.debug("Processing %s item %s", item.__tablename__, item.id) + try: + await self.process(item) + except Exception: + logger.exception("Unexpected exception when processing item") + finally: + await self._heartbeater.untrack(item) + logger.debug( + "Processed %s item %s in %.3f", + item.__tablename__, + item.id, + time.time() - start_time, + ) + + def stop(self): + self._running = False + + @abstractmethod + async def process(self, item: ItemT): + pass + + +class _NowPlaceholder: + pass + + +NOW_PLACEHOLDER: Final = _NowPlaceholder() +""" +Use `NOW_PLACEHOLDER` together with `resolve_now_placeholders()` in pipeline update maps +instead of `get_current_time()` to have the same current time for all updates in the transaction. 
+""" + + +UpdateMapDateTime = Union[datetime, _NowPlaceholder] + + +class _UnlockUpdateMap(TypedDict, total=False): + lock_expires_at: Optional[datetime] + lock_token: Optional[uuid.UUID] + lock_owner: Optional[str] + + +class _ProcessedUpdateMap(TypedDict, total=False): + last_processed_at: UpdateMapDateTime + + +class ItemUpdateMap(_UnlockUpdateMap, _ProcessedUpdateMap, total=False): + lock_expires_at: Optional[datetime] + lock_token: Optional[uuid.UUID] + lock_owner: Optional[str] + last_processed_at: UpdateMapDateTime + + +def set_unlock_update_map_fields(update_map: _UnlockUpdateMap): + update_map["lock_expires_at"] = None + update_map["lock_token"] = None + update_map["lock_owner"] = None + + +def set_processed_update_map_fields( + update_map: _ProcessedUpdateMap, + now: UpdateMapDateTime = NOW_PLACEHOLDER, +): + update_map["last_processed_at"] = now + + +class _ResolveNowUpdateMap(Protocol): + def items(self) -> Iterable[tuple[str, object]]: ... + + +_ResolveNowInput = Union[_ResolveNowUpdateMap, Sequence[_ResolveNowUpdateMap]] + + +def resolve_now_placeholders(update_values: _ResolveNowInput, now: datetime): + """ + Replaces `NOW_PLACEHOLDER` with `now` in an update map or a sequence of update rows. + """ + if isinstance(update_values, Sequence): + for update_row in update_values: + resolve_now_placeholders(update_row, now) + return + # Runtime dict narrowing is required here: pyright doesn't model TypedDicts as + # supporting generic dynamic-key mutation via protocol methods. + if not isinstance(update_values, dict): + raise TypeError( + "resolve_now_placeholders() expects update maps or sequences of update maps" + ) + for key, value in update_values.items(): + if value is NOW_PLACEHOLDER: + update_values[key] = now + + +def log_lock_token_mismatch( + logger: logging.Logger, + item: PipelineItem, + action: str = "process", +) -> None: + logger.warning( + "Failed to %s %s item %s: lock_token mismatch." 
def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
    """
    Returns the maximum time allowed between requesting instance creation and
    the instance becoming ready to accept jobs.

    For container-based backends the window also covers image pulling.
    The default is 10 minutes; some backends, and some backend/instance-type
    combinations, get a longer window.
    """
    if backend_type == BackendType.LAMBDA:
        minutes = 30
    elif backend_type == BackendType.RUNPOD or backend_type == BackendType.KUBERNETES:
        minutes = 20
    elif backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
        minutes = 20
    elif backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
        minutes = 55
    elif backend_type == BackendType.GCP and instance_type_name == "a4-highgpu-8g":
        minutes = 16
    else:
        minutes = 10
    return timedelta(minutes=minutes)
class ComputeGroupPipeline(Pipeline[PipelineItem]):
    """
    Pipeline wiring for `ComputeGroupModel` rows.

    Builds one `Heartbeater`, one `ComputeGroupFetcher` and `workers_num`
    `ComputeGroupWorker` instances that all share the pipeline queue, and exposes
    them through the `_heartbeater`, `_fetcher` and `_workers` properties.
    """

    def __init__(
        self,
        workers_num: int = 10,
        queue_lower_limit_factor: float = 0.5,
        queue_upper_limit_factor: float = 2.0,
        min_processing_interval: timedelta = timedelta(seconds=15),
        lock_timeout: timedelta = timedelta(seconds=30),
        heartbeat_trigger: timedelta = timedelta(seconds=15),
        *,
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        """
        Forwards the tuning parameters to `Pipeline.__init__` and then creates
        the pipeline components.

        `pipeline_hinter` is keyword-only and is handed to every worker.
        """
        super().__init__(
            workers_num=workers_num,
            queue_lower_limit_factor=queue_lower_limit_factor,
            queue_upper_limit_factor=queue_upper_limit_factor,
            min_processing_interval=min_processing_interval,
            lock_timeout=lock_timeout,
            heartbeat_trigger=heartbeat_trigger,
        )
        # NOTE(review): `self._lock_timeout`, `self._queue`, etc. are presumably set by
        # `Pipeline.__init__` above; the base class is not visible here.
        self.__heartbeater = Heartbeater[PipelineItem](
            model_type=ComputeGroupModel,
            lock_timeout=self._lock_timeout,
            heartbeat_trigger=self._heartbeat_trigger,
        )
        # The heartbeater must exist before the fetcher and workers are built:
        # both receive `self._heartbeater`, the property backed by `self.__heartbeater`.
        self.__fetcher = ComputeGroupFetcher(
            queue=self._queue,
            queue_desired_minsize=self._queue_desired_minsize,
            min_processing_interval=self._min_processing_interval,
            lock_timeout=self._lock_timeout,
            heartbeater=self._heartbeater,
        )
        self.__workers = [
            ComputeGroupWorker(
                queue=self._queue,
                heartbeater=self._heartbeater,
                pipeline_hinter=pipeline_hinter,
            )
            for _ in range(self._workers_num)
        ]

    @property
    def hint_fetch_model_name(self) -> str:
        """Returns the class name of the model this pipeline fetches (`ComputeGroupModel`)."""
        return ComputeGroupModel.__name__

    @property
    def _heartbeater(self) -> Heartbeater[PipelineItem]:
        """Returns the heartbeater created in `__init__`."""
        return self.__heartbeater

    @property
    def _fetcher(self) -> Fetcher[PipelineItem]:
        """Returns the fetcher created in `__init__`."""
        return self.__fetcher

    @property
    def _workers(self) -> Sequence["ComputeGroupWorker"]:
        """Returns the workers created in `__init__`."""
        return self.__workers
class ComputeGroupWorker(Worker[PipelineItem]):
    """
    Worker that processes locked compute groups.

    A compute group is terminated once all of its instances are in the
    `TERMINATING` status; otherwise it is only marked as processed and unlocked.
    """

    def __init__(
        self,
        queue: asyncio.Queue[PipelineItem],
        heartbeater: Heartbeater[PipelineItem],
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        """Passes all arguments through to `Worker.__init__` unchanged."""
        super().__init__(
            queue=queue,
            heartbeater=heartbeater,
            pipeline_hinter=pipeline_hinter,
        )

    @sentry_utils.instrument_pipeline_task("ComputeGroupWorker.process")
    async def process(self, item: PipelineItem):
        """
        Processes a single compute group item in three phases:

        1. Load the compute group (with instances, project and project backends)
           only if the row still carries `item.lock_token`.
        2. Outside of any DB session, terminate the group if all its instances
           are terminating, and build the update maps.
        3. In a new session, apply the compute group update guarded by the same
           lock token, then bulk-update the instances and emit status change events.

        Returns early (with a warning log) whenever the lock token no longer matches.
        """
        async with get_session_ctx() as session:
            res = await session.execute(
                select(ComputeGroupModel)
                .where(
                    ComputeGroupModel.id == item.id,
                    ComputeGroupModel.lock_token == item.lock_token,
                )
                # Terminating instances belonging to a compute group are locked implicitly by locking the compute group.
                .options(
                    joinedload(ComputeGroupModel.instances),
                    joinedload(ComputeGroupModel.project).joinedload(ProjectModel.backends),
                )
            )
            compute_group_model = res.unique().scalar_one_or_none()
            if compute_group_model is None:
                # No row with this id and lock token, so there is nothing to process.
                log_lock_token_mismatch(logger, item)
                return

        # An empty result still gets the "processed" and "unlock" fields below,
        # so a group that is not ready for termination is simply released.
        result = _TerminateResult()
        # TODO: Fetch only compute groups with all instances terminating.
        if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances):
            result = await _terminate_compute_group(compute_group_model)
        set_processed_update_map_fields(result.compute_group_update_map)
        # Instances are only touched when termination produced instance updates.
        if result.instances_update_map:
            set_processed_update_map_fields(result.instances_update_map)
        set_unlock_update_map_fields(result.compute_group_update_map)
        if result.compute_group_update_map.get("deleted", False):
            logger.info("Terminated compute group %s", compute_group_model.id)

        async with get_session_ctx() as session:
            # Resolve placeholders with one timestamp so all rows get the same time.
            now = get_current_datetime()
            resolve_now_placeholders(result.compute_group_update_map, now=now)
            resolve_now_placeholders(result.instances_update_map, now=now)
            res = await session.execute(
                update(ComputeGroupModel)
                .where(
                    ComputeGroupModel.id == compute_group_model.id,
                    ComputeGroupModel.lock_token == compute_group_model.lock_token,
                )
                .values(**result.compute_group_update_map)
                .returning(ComputeGroupModel.id)
            )
            updated_ids = list(res.scalars().all())
            if len(updated_ids) == 0:
                # The lock token changed while processing; skip the instance updates too.
                log_lock_token_changed_after_processing(logger, item)
                return
            if not result.instances_update_map:
                return
            instances_ids = [i.id for i in compute_group_model.instances]
            res = await session.execute(
                update(InstanceModel)
                .where(InstanceModel.id.in_(instances_ids))
                .values(**result.instances_update_map)
            )
            # `instance_model.status` is the value loaded in the first session,
            # so it serves as the old status for the emitted event.
            for instance_model in compute_group_model.instances:
                emit_instance_status_change_event(
                    session=session,
                    instance_model=instance_model,
                    old_status=instance_model.status,
                    new_status=InstanceStatus.TERMINATED,
                    termination_reason=instance_model.termination_reason,
                    termination_reason_message=instance_model.termination_reason_message,
                )
async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> _TerminateResult:
    """
    Tries to terminate the compute group in its backend and returns the update
    maps to apply to the compute group and its instances.

    Outcomes:
    - The next retry is not due yet: an empty result (no changes).
    - The backend is not available: a "terminated" result, since nothing more can be done.
    - Termination succeeds: a "terminated" result.
    - Termination fails and a further retry fits before the deadline: a result
      with only the retry timestamps set.
    - Termination fails and no retry fits: a "terminated" result that also
      carries the retry timestamps.
    """
    result = _TerminateResult()
    # Throttle retries: skip until the time computed from the last attempt is reached.
    if (
        compute_group_model.last_termination_retry_at is not None
        and _next_termination_retry_at(compute_group_model.last_termination_retry_at)
        > get_current_datetime()
    ):
        return result
    compute_group = compute_group_model_to_compute_group(compute_group_model)
    cgpd = compute_group.provisioning_data
    backend = await backends_services.get_project_backend_by_type(
        project=compute_group_model.project,
        backend_type=cgpd.backend,
    )
    if backend is None:
        logger.error(
            "Failed to terminate compute group %s. Backend %s not available."
            " Please terminate it manually to avoid unexpected charges.",
            compute_group.name,
            cgpd.backend,
        )
        return _get_terminated_result()
    logger.debug("Terminating compute group %s", compute_group.name)
    compute = backend.compute()
    assert isinstance(compute, ComputeWithGroupProvisioningSupport)
    try:
        # `terminate_compute_group` is invoked through `run_async`.
        # NOTE(review): presumably because it is a blocking call — confirm.
        await run_async(
            compute.terminate_compute_group,
            compute_group,
        )
    except Exception as e:
        # `retry_at` is used for the local deadline check; the DB columns get
        # NOW_PLACEHOLDER, which the caller resolves when applying the update.
        retry_at = get_current_datetime()
        first_termination_retry_at = compute_group_model.first_termination_retry_at
        if compute_group_model.first_termination_retry_at is None:
            result.compute_group_update_map["first_termination_retry_at"] = NOW_PLACEHOLDER
            first_termination_retry_at = retry_at
        assert first_termination_retry_at is not None
        result.compute_group_update_map["last_termination_retry_at"] = NOW_PLACEHOLDER
        # Keep retrying only while the next attempt would start before the deadline.
        if _next_termination_retry_at(retry_at) < _get_termination_deadline(
            first_termination_retry_at
        ):
            logger.warning(
                "Failed to terminate compute group %s. Will retry. Error: %r",
                compute_group.name,
                e,
                exc_info=not isinstance(e, BackendError),
            )
            return result
        logger.error(
            "Failed all attempts to terminate compute group %s."
            " Please terminate it manually to avoid unexpected charges."
            " Error: %r",
            compute_group.name,
            e,
            exc_info=not isinstance(e, BackendError),
        )
    # Reached on success and after the final failed attempt. Merging `result`
    # keeps any retry timestamps recorded above.
    terminated_result = _get_terminated_result()
    terminated_result.compute_group_update_map.update(result.compute_group_update_map)
    terminated_result.instances_update_map.update(result.instances_update_map)
    return terminated_result
+ ItemUpdateMap, + Pipeline, + PipelineItem, + UpdateMapDateTime, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_changed_on_reset, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + InstanceModel, + JobModel, + PlacementGroupModel, + RunModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services.fleets import ( + create_fleet_instance_model, + emit_fleet_status_change_event, + get_fleet_requirements, + get_fleet_spec, + get_next_instance_num, + is_fleet_empty, + is_fleet_in_use, +) +from dstack._internal.server.services.instances import ( + instance_matches_constraints, + is_placeholder_instance, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class FleetPipeline(Pipeline[PipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=20), + heartbeat_trigger: timedelta = timedelta(seconds=10), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[PipelineItem]( + 
class FleetFetcher(Fetcher[PipelineItem]):
    """Fetcher that selects fleets due for processing and locks them for `FleetPipeline`."""

    def __init__(
        self,
        queue: asyncio.Queue[PipelineItem],
        queue_desired_minsize: int,
        min_processing_interval: timedelta,
        lock_timeout: timedelta,
        heartbeater: Heartbeater[PipelineItem],
        queue_check_delay: float = 1.0,
    ) -> None:
        """Passes all arguments through to `Fetcher.__init__` unchanged."""
        super().__init__(
            queue=queue,
            queue_desired_minsize=queue_desired_minsize,
            min_processing_interval=min_processing_interval,
            lock_timeout=lock_timeout,
            heartbeater=heartbeater,
            queue_check_delay=queue_check_delay,
        )

    @sentry_utils.instrument_pipeline_task("FleetFetcher.fetch")
    async def fetch(self, limit: int) -> list[PipelineItem]:
        """
        Selects up to `limit` fleets, stamps them with a fresh lock and returns
        one `PipelineItem` per locked fleet.

        A fleet is eligible when it is not deleted, was last processed at least
        `min_processing_interval` ago (or has `last_processed_at == created_at`),
        has no unexpired lock, and is owned by nobody or by `FleetPipeline`.
        The least recently processed fleets come first.
        """
        fleet_lock, _ = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
        async with fleet_lock:
            async with get_session_ctx() as session:
                now = get_current_datetime()
                res = await session.execute(
                    select(FleetModel)
                    .where(
                        FleetModel.deleted == False,
                        or_(
                            FleetModel.last_processed_at <= now - self._min_processing_interval,
                            FleetModel.last_processed_at == FleetModel.created_at,
                        ),
                        or_(
                            FleetModel.lock_expires_at.is_(None),
                            FleetModel.lock_expires_at < now,
                        ),
                        or_(
                            FleetModel.lock_owner.is_(None),
                            FleetModel.lock_owner == FleetPipeline.__name__,
                        ),
                    )
                    .order_by(FleetModel.last_processed_at.asc())
                    .limit(limit)
                    .with_for_update(skip_locked=True, key_share=True, of=FleetModel)
                    .options(
                        load_only(
                            FleetModel.id,
                            FleetModel.lock_token,
                            FleetModel.lock_expires_at,
                        )
                    )
                )
                fleet_models = list(res.scalars().all())
                # All fleets fetched in one call share the same token and expiry.
                lock_expires_at = get_current_datetime() + self._lock_timeout
                lock_token = uuid.uuid4()
                items = []
                for fleet_model in fleet_models:
                    # The query only returns rows whose lock is unset or expired, so a
                    # non-null `lock_expires_at` here means a previous lock expired.
                    prev_lock_expired = fleet_model.lock_expires_at is not None
                    fleet_model.lock_expires_at = lock_expires_at
                    fleet_model.lock_token = lock_token
                    fleet_model.lock_owner = FleetPipeline.__name__
                    items.append(
                        PipelineItem(
                            __tablename__=FleetModel.__tablename__,
                            id=fleet_model.id,
                            lock_expires_at=lock_expires_at,
                            lock_token=lock_token,
                            prev_lock_expired=prev_lock_expired,
                        )
                    )
                await session.commit()
        return items
@dataclass
class _MaintainNodesResult:
    """Outcome of reconciling the fleet's instance count with its `nodes` range."""

    # Per-instance updates to apply, keyed by instance id.
    instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap] = field(default_factory=dict)
    # Instances that have to be created.
    new_instance_creates: list[_NewInstanceCreate] = field(default_factory=list)
    # Set by the producer of this result; tracked separately from `has_changes`.
    changes_required: bool = False

    @property
    def has_changes(self) -> bool:
        """True when there is at least one instance update or one instance to create."""
        return bool(self.instance_id_to_update_map) or bool(self.new_instance_creates)
def _get_fleet_spec_if_ready_for_consolidation(fleet_model: FleetModel) -> Optional[FleetSpec]:
    """
    Returns the fleet spec when the fleet should be consolidated now, else `None`.

    Consolidation is skipped for terminating fleets, for specs without `nodes`
    or marked `autocreated`, once the attempt limit is reached, and while the
    retry delay has not elapsed yet.
    """
    if fleet_model.status == FleetStatus.TERMINATING:
        return None
    spec = get_fleet_spec(fleet_model)
    # TODO: Drop fleet_spec.autocreated check after existing autocreated fleets no longer supported
    spec_not_consolidated = spec.configuration.nodes is None or spec.autocreated
    if spec_not_consolidated:
        return None
    attempts_left = fleet_model.consolidation_attempt < _MAX_CONSOLIDATION_ATTEMPTS
    # The readiness check runs only while attempts remain.
    if attempts_left and _is_fleet_ready_for_consolidation(fleet_model):
        return spec
    return None
async def _lock_fleet_instances_for_processing(
    session: AsyncSession,
    item: PipelineItem,
    fleet_model: FleetModel,
) -> Optional[set[uuid.UUID]]:
    """
    Locks all non-deleted instances of the fleet with the fleet's own lock token.

    Returns:
        - an empty set when instance locking is not needed for this iteration;
        - the set of locked instance ids when every non-deleted instance was locked;
        - `None` when some instance could not be locked. In that case the fleet
          lock is reset so that the fleet is picked up again later.
    """
    # Instances need locking only if the fleet is going to be consolidated, or if it
    # has a current master instance and `_is_cloud_cluster_fleet_spec` is true for its spec.
    if _get_fleet_spec_if_ready_for_consolidation(fleet_model) is None:
        if fleet_model.current_master_instance_id is None:
            return set()
        if not _is_cloud_cluster_fleet_spec(get_fleet_spec(fleet_model)):
            return set()

    instance_lock, _ = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
    async with instance_lock:
        # Select the fleet's instances that are lockable right now: the lock is unset
        # or expired, and the owner is unset or is this pipeline.
        res = await session.execute(
            select(InstanceModel)
            .where(
                InstanceModel.fleet_id == item.id,
                InstanceModel.deleted == False,
                or_(
                    InstanceModel.lock_expires_at.is_(None),
                    InstanceModel.lock_expires_at < get_current_datetime(),
                ),
                or_(
                    InstanceModel.lock_owner.is_(None),
                    InstanceModel.lock_owner == FleetPipeline.__name__,
                ),
            )
            .with_for_update(skip_locked=True, key_share=True, of=InstanceModel)
            .options(load_only(InstanceModel.id))
        )
        locked_instance_models = list(res.scalars().all())
        locked_instance_ids = {instance_model.id for instance_model in locked_instance_models}

        # Compare against all non-deleted instances of the fleet to detect any
        # instance that the query above could not lock.
        res = await session.execute(
            select(InstanceModel.id).where(
                InstanceModel.fleet_id == item.id,
                InstanceModel.deleted == False,
            )
        )
        current_instance_ids = set(res.scalars().all())
        if current_instance_ids != locked_instance_ids:
            logger.debug(
                "Failed to lock fleet %s instances. The fleet will be processed later.",
                item.id,
            )
            # Keep `lock_owner` so that `InstancePipeline` can check that the fleet is being locked
            # but unset `lock_expires_at` to process the item again ASAP (after `min_processing_interval`).
            # Unset `lock_token` so that heartbeater can no longer update the item.
            res = await session.execute(
                update(FleetModel)
                .where(
                    FleetModel.id == item.id,
                    FleetModel.lock_token == item.lock_token,
                )
                .values(
                    lock_expires_at=None,
                    lock_token=None,
                    last_processed_at=get_current_datetime(),
                )
                .returning(FleetModel.id)
            )
            updated_ids = list(res.scalars().all())
            if len(updated_ids) == 0:
                log_lock_token_changed_on_reset(logger)
            return None

        # Stamp the instances with the fleet item's token and expiry.
        for instance_model in locked_instance_models:
            instance_model.lock_expires_at = item.lock_expires_at
            instance_model.lock_token = item.lock_token
            instance_model.lock_owner = FleetPipeline.__name__
        await session.commit()
    return locked_instance_ids
+ return + + if fleet_update_map.get("deleted"): + await session.execute( + update(PlacementGroupModel) + .where(PlacementGroupModel.fleet_id == context.fleet_model.id) + .values(fleet_deleted=True) + ) + await session.execute( + delete(ExportedFleetModel).where( + ExportedFleetModel.fleet_id == context.fleet_model.id + ) + ) + if instance_update_rows: + await session.execute( + update(InstanceModel), + instance_update_rows, + ) + if len(result.new_instance_creates) > 0: + await _create_missing_fleet_instances( + session=session, + fleet_model=context.fleet_model, + new_instance_creates=result.new_instance_creates, + ) + emit_fleet_status_change_event( + session=session, + fleet_model=context.fleet_model, + old_status=context.fleet_model.status, + new_status=fleet_update_map.get("status", context.fleet_model.status), + status_message=fleet_update_map.get( + "status_message", context.fleet_model.status_message + ), + ) + if result.consolidation_limit_reached: + events.emit( + session=session, + message=( + f"Fleet consolidation stopped after {_MAX_CONSOLIDATION_ATTEMPTS} attempts." 
async def _process_fleet(
    fleet_model: FleetModel,
) -> _ProcessResult:
    """
    Computes the updates for a single fleet processing iteration.

    Consolidates the fleet with its spec when it is ready for that, then either
    marks the fleet as terminated and deleted (when nothing has to be created
    and the fleet should be deleted) or updates master-instance related state.
    """
    spec = _get_fleet_spec_if_ready_for_consolidation(fleet_model)
    if spec is None:
        outcome = _ProcessResult()
    else:
        outcome = _consolidate_fleet_state_with_spec(
            fleet_model,
            consolidation_fleet_spec=spec,
            consolidation_instances=fleet_model.instances,
        )

    # A fleet that still needs new instances is never deleted.
    if not outcome.new_instance_creates and _should_delete_fleet(fleet_model):
        outcome.fleet_update_map.update(
            {
                "status": FleetStatus.TERMINATED,
                "deleted": True,
                "deleted_at": NOW_PLACEHOLDER,
            }
        )
        return outcome

    instances = fleet_model.instances
    _set_fail_instances_on_master_bootstrap_failure(
        fleet_model=fleet_model,
        instance_models=instances,
        instance_id_to_update_map=outcome.instance_id_to_update_map,
    )
    _set_current_master_instance_id(
        fleet_model=fleet_model,
        fleet_update_map=outcome.fleet_update_map,
        instance_models=instances,
        instance_id_to_update_map=outcome.instance_id_to_update_map,
        new_instance_creates=outcome.new_instance_creates,
    )
    return outcome
def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
    """
    Tells whether the retry delay for the fleet's current consolidation attempt
    has elapsed since the last consolidation (or since fleet creation when the
    fleet has no `last_consolidated_at` yet).
    """
    required_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
    baseline = fleet_model.last_consolidated_at
    if not baseline:
        baseline = fleet_model.created_at
    return get_current_datetime() - baseline >= required_delay
+_CONSOLIDATION_RETRY_DELAYS = [ + timedelta(minutes=1), + timedelta(minutes=2), + timedelta(minutes=5), + timedelta(minutes=10), + timedelta(minutes=30), +] + + +def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta: + if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS): + return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt] + return _CONSOLIDATION_RETRY_DELAYS[-1] + + +def _terminate_instances_not_matching_fleet_spec( + instances: Sequence[InstanceModel], + fleet_spec: FleetSpec, +) -> dict[uuid.UUID, _InstanceUpdateMap]: + updates: dict[uuid.UUID, _InstanceUpdateMap] = {} + for instance in instances: + if not _can_terminate_spec_mismatched_instance(instance): + continue + if not _instance_matches_fleet_spec(instance, fleet_spec): + updates[instance.id] = { + "status": InstanceStatus.TERMINATING, + "termination_reason": InstanceTerminationReason.FLEET_SPEC_MISMATCH, + "termination_reason_message": "Instance does not match fleet spec", + } + return updates + + +def _can_terminate_spec_mismatched_instance(instance: InstanceModel) -> bool: + if instance.deleted: + return False + # Pending instances have not selected an offer yet, so InstancePipeline will provision them + # using the current fleet spec. Recycle only instances already tied to the old spec. + return instance.status in (InstanceStatus.IDLE, InstanceStatus.PROVISIONING) + + +def _instance_matches_fleet_spec(instance: InstanceModel, fleet_spec: FleetSpec) -> bool: + if instance.offer is None: + # Not yet provisioned — will be provisioned using the current (updated) spec. 
+ return True + profile = fleet_spec.merged_profile + requirements = get_fleet_requirements(fleet_spec) + return instance_matches_constraints( + instance, + backend_types=profile.backends, + regions=profile.regions, + instance_types=profile.instance_types, + zones=profile.availability_zones, + requirements=requirements, + ) + + +def _maintain_fleet_nodes_in_min_max_range( + instances: Sequence[InstanceModel], + fleet_spec: FleetSpec, +) -> _MaintainNodesResult: + """ + Ensures the fleet has at least `nodes.min` and at most `nodes.max` instances. + """ + assert fleet_spec.configuration.nodes is not None + result = _MaintainNodesResult() + for instance in instances: + # Delete terminated but not deleted instances since + # they are going to be replaced with new pending instances. + if instance.status == InstanceStatus.TERMINATED and not instance.deleted: + result.changes_required = True + result.instance_id_to_update_map[instance.id] = { + "deleted": True, + "deleted_at": NOW_PLACEHOLDER, + } + active_instances = [ + i for i in instances if i.status != InstanceStatus.TERMINATED and not i.deleted + ] + active_instances_num = len(active_instances) + if active_instances_num < fleet_spec.configuration.nodes.min: + result.changes_required = True + nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num + taken_instance_nums = {instance.instance_num for instance in active_instances} + for _ in range(nodes_missing): + instance_num = get_next_instance_num(taken_instance_nums) + taken_instance_nums.add(instance_num) + result.new_instance_creates.append( + _NewInstanceCreate(id=uuid.uuid4(), instance_num=instance_num) + ) + return result + if ( + fleet_spec.configuration.nodes.max is None + or active_instances_num <= fleet_spec.configuration.nodes.max + ): + return result + # Fleet has more instances than allowed by nodes.max. + # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently) + # or if nodes.max is updated. 
+ result.changes_required = True + nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max + for instance in instances: + if nodes_redundant == 0: + break + if instance.status == InstanceStatus.IDLE: + result.instance_id_to_update_map[instance.id] = { + "termination_reason": InstanceTerminationReason.MAX_INSTANCES_LIMIT, + "termination_reason_message": "Fleet has too many instances", + "status": InstanceStatus.TERMINATING, + } + nodes_redundant -= 1 + return result + + +def _should_delete_fleet(fleet_model: FleetModel) -> bool: + if fleet_model.project.deleted: + # It used to be possible to delete project with active resources: + # https://github.com/dstackai/dstack/issues/3077 + logger.info("Fleet %s deleted due to deleted project", fleet_model.name) + return True + + if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model): + return False + + fleet_spec = get_fleet_spec(fleet_model) + if fleet_model.status == FleetStatus.TERMINATING: + logger.info("Automatic cleanup of terminating empty fleet %s", fleet_model.name) + return True + + # TODO: Drop autocreated fleet auto-deletion after existing autocreated fleets no longer supported. 
+ if fleet_spec.autocreated: + logger.info("Automatic cleanup of empty autocreated fleet %s", fleet_model.name) + return True + + return False + + +def _build_instance_update_rows( + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + unlock_instance_ids: set[uuid.UUID], +) -> list[_InstanceUpdateMap]: + instance_update_rows = [] + for instance_id in sorted(instance_id_to_update_map.keys() | unlock_instance_ids): + instance_update_map = instance_id_to_update_map.get(instance_id) + update_row = _InstanceUpdateMap() + if instance_update_map is not None: + update_row.update(instance_update_map) + if instance_id in unlock_instance_ids: + set_unlock_update_map_fields(update_row) + update_row["id"] = instance_id + set_processed_update_map_fields(update_row) + instance_update_rows.append(update_row) + return instance_update_rows + + +async def _unlock_fleet_locked_instances( + session: AsyncSession, + item: PipelineItem, + locked_instance_ids: set[uuid.UUID], +) -> None: + await session.execute( + update(InstanceModel) + .where( + InstanceModel.id.in_(locked_instance_ids), + InstanceModel.lock_token == item.lock_token, + InstanceModel.lock_owner == FleetPipeline.__name__, + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + ) + ) + + +async def _create_missing_fleet_instances( + session: AsyncSession, + fleet_model: FleetModel, + new_instance_creates: Sequence[_NewInstanceCreate], +): + fleet_spec = get_fleet_spec(fleet_model) + for new_instance_create in new_instance_creates: + instance_model = create_fleet_instance_model( + session=session, + project=fleet_model.project, + # TODO: Store fleet.user and pass it instead of the project owner. 
+ username=fleet_model.project.owner.name, + spec=fleet_spec, + instance_num=new_instance_create["instance_num"], + instance_id=new_instance_create["id"], + ) + instance_model.fleet_id = fleet_model.id + events.emit( + session=session, + message=( + "Instance created to meet target fleet node count." + f" Status: {instance_model.status.upper()}" + ), + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) + logger.info( + "Added %d instances to fleet %s", + len(new_instance_creates), + fleet_model.name, + ) + + +def _set_fail_instances_on_master_bootstrap_failure( + fleet_model: FleetModel, + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> None: + """ + Terminates instances with MASTER_FAILED if the master dies with NO_OFFERS in a cluster with node.min == 0. + This is needed to avoid master re-election loop and fail fast. + """ + fleet_spec = get_fleet_spec(fleet_model) + if ( + not _is_cloud_cluster_fleet_spec(fleet_spec) + or fleet_spec.configuration.nodes is None + or fleet_spec.configuration.nodes.min != 0 + or fleet_model.current_master_instance_id is None + ): + return + + current_master_instance_model = None + for instance_model in instance_models: + if instance_model.id == fleet_model.current_master_instance_id: + current_master_instance_model = instance_model + break + if current_master_instance_model is None: + return + + if ( + current_master_instance_model.status != InstanceStatus.TERMINATED + or current_master_instance_model.termination_reason != InstanceTerminationReason.NO_OFFERS + ): + return + + surviving_instance_models = _get_surviving_instance_models_after_updates( + instance_models=instance_models, + instance_id_to_update_map=instance_id_to_update_map, + ) + if any( + instance_model.status not in InstanceStatus.finished_statuses() + and instance_model.job_provisioning_data is not None + for instance_model in surviving_instance_models + ): + # It 
should not be possible to provision non-master instances ahead of master + # but we still safe-guard against the case when there can be other instances provisioned. + return + + for instance_model in surviving_instance_models: + if ( + instance_model.id == current_master_instance_model.id + or instance_model.status in InstanceStatus.finished_statuses() + ): + continue + update_map = instance_id_to_update_map.setdefault(instance_model.id, _InstanceUpdateMap()) + update_map["status"] = InstanceStatus.TERMINATED + update_map["termination_reason"] = InstanceTerminationReason.MASTER_FAILED + + +def _set_current_master_instance_id( + fleet_model: FleetModel, + fleet_update_map: _FleetUpdateMap, + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + new_instance_creates: Sequence[_NewInstanceCreate], +) -> None: + """ + Sets `current_master_instance_id` for `fleet_model`. + Master instance can be changed if the previous master is gone. + If there are no active instances, newly selected master may change backend/region/az/placement. 
+ """ + fleet_spec = get_fleet_spec(fleet_model) + if not _is_cloud_cluster_fleet_spec(fleet_spec): + fleet_update_map["current_master_instance_id"] = None + return + surviving_instance_models = _get_surviving_instance_models_after_updates( + instance_models=instance_models, + instance_id_to_update_map=instance_id_to_update_map, + ) + current_master_instance_id = _select_current_master_instance_id( + current_master_instance_id=fleet_model.current_master_instance_id, + surviving_instance_models=surviving_instance_models, + instance_id_to_update_map=instance_id_to_update_map, + new_instance_creates=new_instance_creates, + ) + fleet_update_map["current_master_instance_id"] = current_master_instance_id + + +def _get_surviving_instance_models_after_updates( + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> list[InstanceModel]: + surviving_instance_models = [] + for instance_model in sorted(instance_models, key=lambda i: (i.instance_num, i.created_at)): + instance_update_map = instance_id_to_update_map.get(instance_model.id) + if instance_update_map is not None and instance_update_map.get("deleted"): + continue + surviving_instance_models.append(instance_model) + return surviving_instance_models + + +def _select_current_master_instance_id( + current_master_instance_id: Optional[uuid.UUID], + surviving_instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + new_instance_creates: Sequence[_NewInstanceCreate], +) -> Optional[uuid.UUID]: + # Keep the current master stable while it is still alive so InstancePipeline + # does not see fleet-wide election churn between provisioning attempts. 
+ if current_master_instance_id is not None: + for instance_model in surviving_instance_models: + if ( + instance_model.id == current_master_instance_id + and _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + ): + return instance_model.id + + # If the old master is gone, prefer a surviving provisioned instance so we + # keep following an already-established cluster placement decision. + for instance_model in surviving_instance_models: + if ( + _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + and instance_model.job_provisioning_data is not None + ): + return instance_model.id + + # Prefer existing surviving instances over freshly planned replacements to + # avoid election churn during min-nodes backfill. Skip placeholders — + # they have no JPD and cannot anchor cluster placement, so electing one + # just defers the real master decision. 
+ for instance_model in surviving_instance_models: + if is_placeholder_instance(instance_model): + continue + if ( + _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + ): + return instance_model.id + + for new_instance_create in sorted(new_instance_creates, key=lambda i: i["instance_num"]): + return new_instance_create["id"] + + return None + + +def _get_effective_instance_status( + instance_model: InstanceModel, + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> InstanceStatus: + update_map = instance_id_to_update_map.get(instance_model.id) + if update_map is None: + return instance_model.status + return update_map.get("status", instance_model.status) + + +def _is_cloud_cluster_fleet_spec(fleet_spec: FleetSpec) -> bool: + configuration = fleet_spec.configuration + return ( + configuration.placement == InstanceGroupPlacement.CLUSTER + and configuration.ssh_config is None + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py new file mode 100644 index 0000000000..5c834c852a --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py @@ -0,0 +1,620 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Optional, Sequence, TypedDict + +from sqlalchemy import delete, or_, select, update +from sqlalchemy.orm import joinedload, load_only, selectinload + +from dstack._internal.core.backends.base.compute import ComputeWithGatewaySupport +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.gateways import GATEWAY_REPLICAS_DEFAULT, GatewayStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + Worker, + 
log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + BackendModel, + GatewayComputeModel, + GatewayModel, + ProjectModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events +from dstack._internal.server.services import gateways as gateways_services +from dstack._internal.server.services.gateways import ( + emit_gateway_status_change_event, + get_gateway_compute_models, +) +from dstack._internal.server.services.gateways.pool import gateway_connections_pool +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class GatewayPipelineItem(PipelineItem): + status: GatewayStatus + to_be_deleted: bool + + +class GatewayPipeline(Pipeline[GatewayPipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + 
self.__heartbeater = Heartbeater[GatewayPipelineItem]( + model_type=GatewayModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = GatewayFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + GatewayWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return GatewayModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[GatewayPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[GatewayPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["GatewayWorker"]: + return self.__workers + + +class GatewayFetcher(Fetcher[GatewayPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[GatewayPipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[GatewayPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("GatewayFetcher.fetch") + async def fetch(self, limit: int) -> list[GatewayPipelineItem]: + gateway_lock, _ = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__) + async with gateway_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(GatewayModel) + .where( + or_( + GatewayModel.status.in_( + [GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING] + ), + 
GatewayModel.to_be_deleted == True, + ), + or_( + GatewayModel.last_processed_at <= now - self._min_processing_interval, + GatewayModel.last_processed_at == GatewayModel.created_at, + ), + or_( + GatewayModel.lock_expires_at.is_(None), + GatewayModel.lock_expires_at < now, + ), + or_( + GatewayModel.lock_owner.is_(None), + GatewayModel.lock_owner == GatewayPipeline.__name__, + ), + ) + .order_by(GatewayModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=GatewayModel) + .options( + load_only( + GatewayModel.id, + GatewayModel.lock_token, + GatewayModel.lock_expires_at, + GatewayModel.status, + GatewayModel.to_be_deleted, + ) + ) + ) + gateway_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for gateway_model in gateway_models: + prev_lock_expired = gateway_model.lock_expires_at is not None + gateway_model.lock_expires_at = lock_expires_at + gateway_model.lock_token = lock_token + gateway_model.lock_owner = GatewayPipeline.__name__ + items.append( + GatewayPipelineItem( + __tablename__=GatewayModel.__tablename__, + id=gateway_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=gateway_model.status, + to_be_deleted=gateway_model.to_be_deleted, + ) + ) + await session.commit() + return items + + +class GatewayWorker(Worker[GatewayPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[GatewayPipelineItem], + heartbeater: Heartbeater[GatewayPipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("GatewayWorker.process") + async def process(self, item: GatewayPipelineItem): + if item.to_be_deleted: + await _process_to_be_deleted_item(item) + elif item.status == GatewayStatus.SUBMITTED: + await 
_process_submitted_item(item) + elif item.status == GatewayStatus.PROVISIONING: + await _process_provisioning_item(item) + + +async def _process_submitted_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + log_lock_token_mismatch(logger, item) + return + + result = await _process_submitted_gateway(gateway_model) + update_map = _GatewayUpdateMap() + update_map.update(result.update_map) + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + async with get_session_ctx() as session: + for gateway_compute_model in result.gateway_compute_models: + session.add(gateway_compute_model) + now = get_current_datetime() + resolve_now_placeholders(update_map, now=now) + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**update_map) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + # TODO: Clean up gateway_compute_models. 
+ return + emit_gateway_status_change_event( + session=session, + gateway_model=gateway_model, + old_status=gateway_model.status, + new_status=update_map.get("status", gateway_model.status), + status_message=update_map.get("status_message", gateway_model.status_message), + ) + + +class _GatewayUpdateMap(ItemUpdateMap, total=False): + status: GatewayStatus + status_message: str + + +class _GatewayComputeUpdateMap(TypedDict, total=False): + active: bool + deleted: bool + + +@dataclass +class _SubmittedResult: + update_map: _GatewayUpdateMap = field(default_factory=_GatewayUpdateMap) + gateway_compute_models: list[GatewayComputeModel] = field(default_factory=list) + + +async def _process_submitted_gateway(gateway_model: GatewayModel) -> _SubmittedResult: + logger.info("%s: started gateway provisioning", fmt(gateway_model)) + configuration = gateways_services.get_gateway_configuration(gateway_model) + try: + ( + backend_model, + backend, + ) = await backends_services.get_project_backend_with_model_by_type_or_error( + project=gateway_model.project, backend_type=configuration.backend + ) + except BackendNotAvailable: + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": "Backend not available", + } + ) + replicas = ( + configuration.replicas if configuration.replicas is not None else GATEWAY_REPLICAS_DEFAULT + ) + gateway_compute_models = [] + try: + for replica_num in range(replicas): + logger.debug( + "%s replica %d: creating gateway compute", fmt(gateway_model), replica_num + ) + gateway_compute_model = await gateways_services.create_gateway_compute( + backend_compute=backend.compute(), + project_name=gateway_model.project.name, + configuration=configuration, + replica_num=replica_num, + gateway_id=gateway_model.id, + backend_id=backend_model.id, + ) + logger.info("%s replica %d: gateway compute created", fmt(gateway_model), replica_num) + gateway_compute_models.append(gateway_compute_model) + return _SubmittedResult( + 
update_map={"status": GatewayStatus.PROVISIONING}, + gateway_compute_models=gateway_compute_models, + ) + except BackendError as e: + status_message = f"Backend error: {repr(e)}" + if len(e.args) > 0: + status_message = str(e.args[0]) + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": status_message, + }, + gateway_compute_models=gateway_compute_models, + ) + except Exception as e: + logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model)) + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": f"Unexpected error: {repr(e)}", + }, + gateway_compute_models=gateway_compute_models, + ) + + +async def _process_provisioning_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.gateway_compute)) + .options(selectinload(GatewayModel.gateway_computes)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + log_lock_token_mismatch(logger, item) + return + + result = await _process_provisioning_gateway(gateway_model) + gateway_update_map = result.gateway_update_map + set_processed_update_map_fields(gateway_update_map) + set_unlock_update_map_fields(gateway_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(gateway_update_map, now=now) + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**gateway_update_map) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + return + emit_gateway_status_change_event( + session=session, + 
gateway_model=gateway_model, + old_status=gateway_model.status, + new_status=gateway_update_map.get("status", gateway_model.status), + status_message=gateway_update_map.get("status_message", gateway_model.status_message), + ) + if result.all_computes_update_map: + res = await session.execute( + update(GatewayComputeModel) + .where( + or_( + GatewayComputeModel.gateway_id == gateway_model.id, + GatewayComputeModel.id == gateway_model.gateway_compute_id, + ) + ) + .values(**result.all_computes_update_map) + .returning(GatewayComputeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) < len(get_gateway_compute_models(gateway_model)): + logger.error( + "Failed to update compute models for gateway %s." + " This is unexpected and may happen only if the compute model was manually deleted.", + gateway_model.id, + ) + + +@dataclass +class _ProvisioningResult: + gateway_update_map: _GatewayUpdateMap = field(default_factory=_GatewayUpdateMap) + all_computes_update_map: _GatewayComputeUpdateMap = field( + default_factory=_GatewayComputeUpdateMap + ) + + +async def _process_provisioning_gateway(gateway_model: GatewayModel) -> _ProvisioningResult: + gateway_computes = get_gateway_compute_models(gateway_model) + # Provisioning gateways must have compute. + assert len(gateway_computes) > 0 + + # TODO: do only one connection/configuration attempt per pipeline tick. 
+ # Blocking on connect_to_gateway_with_retry and configure_gateway now has these cons: + # - cannot delete the gateway before it is provisioned because the DB model is locked + # - connection retry counter is reset on server restart + # - only one server replica is processing the gateway + + errors = await asyncio.gather( + *(_connect_and_configure_gateway_replica(gateway_model, gc) for gc in gateway_computes) + ) + if any(errors): + return _ProvisioningResult( + gateway_update_map={ + "status": GatewayStatus.FAILED, + "status_message": next(e for e in errors if e), + }, + all_computes_update_map={"active": False}, + ) + + return _ProvisioningResult( + gateway_update_map={"status": GatewayStatus.RUNNING}, + ) + + +async def _connect_and_configure_gateway_replica( + gateway_model: GatewayModel, + gateway_compute: GatewayComputeModel, +) -> Optional[str]: + """Returns an error message on failure, None on success.""" + logger.debug( + "%s replica %d: connecting to gateway compute", + fmt(gateway_model), + gateway_compute.replica_num, + ) + connection = await gateways_services.connect_to_gateway_with_retry(gateway_compute) + if connection is None: + logger.warning( + "%s replica %d: failed to connect to gateway compute", + fmt(gateway_model), + gateway_compute.replica_num, + ) + return "Failed to connect to gateway" + try: + await gateways_services.configure_gateway(connection) + except Exception: + logger.exception( + "%s replica %d: failed to configure gateway", + fmt(gateway_model), + gateway_compute.replica_num, + ) + await gateway_connections_pool.remove(gateway_compute.ip_address) + return "Failed to configure gateway" + logger.info( + "%s replica %d: gateway compute connected and configured", + fmt(gateway_model), + gateway_compute.replica_num, + ) + return None + + +async def _process_to_be_deleted_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == 
item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(GatewayModel.gateway_compute)) + .options(selectinload(GatewayModel.gateway_computes)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + log_lock_token_mismatch(logger, item) + return + + result = await _process_to_be_deleted_gateway(gateway_model) + async with get_session_ctx() as session: + if result.all_computes_update_map: + res = await session.execute( + update(GatewayComputeModel) + .where( + or_( + GatewayComputeModel.gateway_id == gateway_model.id, + GatewayComputeModel.id == gateway_model.gateway_compute_id, + ) + ) + .values(**result.all_computes_update_map) + .returning(GatewayComputeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) < len(get_gateway_compute_models(gateway_model)): + logger.error( + "Failed to update compute models for gateway %s." 
+ " This is unexpected and may happen only if the compute model was manually deleted.", + gateway_model.id, + ) + return + + if result.delete_gateway: + res = await session.execute( + delete(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .returning(GatewayModel.id) + ) + deleted_ids = list(res.scalars().all()) + if len(deleted_ids) == 0: + log_lock_token_changed_after_processing( + logger, + item, + action="delete", + expected_outcome="deleted", + ) + return + events.emit( + session, + "Gateway deleted", + actor=events.SystemActor(), + targets=[events.Target.from_model(gateway_model)], + ) + else: + update_map = _GatewayUpdateMap() + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + resolve_now_placeholders(update_map, now=get_current_datetime()) + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**update_map) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + return + + +@dataclass +class _ProcessToBeDeletedResult: + delete_gateway: bool + all_computes_update_map: _GatewayComputeUpdateMap = field( + default_factory=_GatewayComputeUpdateMap + ) + + +async def _process_to_be_deleted_gateway(gateway_model: GatewayModel) -> _ProcessToBeDeletedResult: + backend = await backends_services.get_project_backend_by_type_or_error( + project=gateway_model.project, backend_type=gateway_model.backend.type + ) + compute = backend.compute() + assert isinstance(compute, ComputeWithGatewaySupport) + + for gateway_compute in get_gateway_compute_models(gateway_model): + gateway_compute_configuration = gateways_services.get_gateway_compute_configuration( + gateway_compute=gateway_compute, + gateway_model=gateway_model, + ) + logger.debug( + "%s 
replica %d: terminating gateway compute", + fmt(gateway_model), + gateway_compute.replica_num, + ) + try: + await run_async( + compute.terminate_gateway, + gateway_compute.instance_id, + gateway_compute_configuration, + gateway_compute.backend_data, + ) + except Exception: + logger.exception( + "%s replica %d: error when terminating gateway compute", + fmt(gateway_model), + gateway_compute.replica_num, + ) + return _ProcessToBeDeletedResult(delete_gateway=False) + logger.info( + "%s replica %d: gateway compute terminated", + fmt(gateway_model), + gateway_compute.replica_num, + ) + await gateway_connections_pool.remove(gateway_compute.ip_address) + + return _ProcessToBeDeletedResult( + delete_gateway=True, + all_computes_update_map={"active": False, "deleted": True}, + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py new file mode 100644 index 0000000000..343b05f813 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py @@ -0,0 +1,535 @@ +import asyncio +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Optional, Sequence + +from sqlalchemy import and_, not_, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.background.pipeline_tasks.instances.check import ( + check_instance, + process_idle_timeout, +) +from 
dstack._internal.server.background.pipeline_tasks.instances.cloud_provisioning import ( + create_cloud_instance, +) +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, +) +from dstack._internal.server.background.pipeline_tasks.instances.ssh_deploy import ( + add_ssh_instance, +) +from dstack._internal.server.background.pipeline_tasks.instances.termination import ( + terminate_instance, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + FleetModel, + InstanceHealthCheckModel, + InstanceModel, + JobModel, + ProjectModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services.instances import ( + emit_instance_status_change_event, + is_ssh_instance, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.placement import ( + schedule_fleet_placement_groups_deletion, +) +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +INSTANCE_STATUSES_WITH_MIN_PROCESSING_INTERVAL = [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + InstanceStatus.TERMINATING, +] + + +@dataclass +class InstancePipelineItem(PipelineItem): + status: InstanceStatus + + +class InstancePipeline(Pipeline[InstancePipelineItem]): + def __init__( + self, + workers_num: int = 20, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=7), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + 
queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[InstancePipelineItem]( + model_type=InstanceModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = InstanceFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + InstanceWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return InstanceModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[InstancePipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[InstancePipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["InstanceWorker"]: + return self.__workers + + +class InstanceFetcher(Fetcher[InstancePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[InstancePipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[InstancePipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("InstanceFetcher.fetch") + async def fetch(self, limit: int) -> list[InstancePipelineItem]: + instance_lock, _ = get_locker(get_db().dialect_name).get_lockset( + InstanceModel.__tablename__ + ) + async with 
instance_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(InstanceModel) + .join(InstanceModel.fleet, isouter=True) + .where( + InstanceModel.status.in_( + [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + InstanceStatus.BUSY, + InstanceStatus.IDLE, + InstanceStatus.TERMINATING, + ] + ), + not_( + and_( + InstanceModel.status == InstanceStatus.TERMINATING, + InstanceModel.compute_group_id.is_not(None), + ) + ), + # Skip placeholder instances managed by JobSubmittedPipeline. + not_( + and_( + InstanceModel.status == InstanceStatus.PENDING, + InstanceModel.provisioning_job_id.is_not(None), + ) + ), + InstanceModel.deleted == False, + or_( + # Process fast-moving instances (pending, provisioning, terminating) + # at base interval for low-latency state transitions. + # Steady-state instances (idle, busy) use a longer interval + # since they only need periodic health checks. + and_( + InstanceModel.status.in_( + INSTANCE_STATUSES_WITH_MIN_PROCESSING_INTERVAL + ), + InstanceModel.last_processed_at + <= now - self._min_processing_interval, + ), + and_( + InstanceModel.status.not_in( + INSTANCE_STATUSES_WITH_MIN_PROCESSING_INTERVAL + ), + InstanceModel.last_processed_at + <= now - self._min_processing_interval * 2, + ), + InstanceModel.last_processed_at == InstanceModel.created_at, + InstanceModel.skip_min_processing_interval == True, + ), + or_( + and_( + # Do not try to lock instances if the fleet is waiting for the + # lock, but allow retrying instances whose own lock is stale + # because the fleet pipeline cannot reclaim stale instance locks. 
+ or_( + InstanceModel.fleet_id.is_(None), + FleetModel.lock_owner.is_(None), + ), + InstanceModel.lock_expires_at.is_(None), + ), + InstanceModel.lock_expires_at < now, + ), + or_( + InstanceModel.lock_owner.is_(None), + InstanceModel.lock_owner == InstancePipeline.__name__, + ), + ) + .order_by(InstanceModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=InstanceModel) + .options( + load_only( + InstanceModel.id, + InstanceModel.lock_token, + InstanceModel.lock_expires_at, + InstanceModel.status, + InstanceModel.skip_min_processing_interval, + ) + ) + ) + instance_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for instance_model in instance_models: + prev_lock_expired = instance_model.lock_expires_at is not None + instance_model.lock_expires_at = lock_expires_at + instance_model.lock_token = lock_token + instance_model.lock_owner = InstancePipeline.__name__ + instance_model.skip_min_processing_interval = False + items.append( + InstancePipelineItem( + __tablename__=InstanceModel.__tablename__, + id=instance_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=instance_model.status, + ) + ) + await session.commit() + return items + + +class InstanceWorker(Worker[InstancePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[InstancePipelineItem], + heartbeater: Heartbeater[InstancePipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("InstanceWorker.process") + async def process(self, item: InstancePipelineItem): + process_context: Optional[_ProcessContext] = None + if item.status == InstanceStatus.PENDING: + process_context = await _process_pending_item(item) + elif item.status == 
InstanceStatus.PROVISIONING: + process_context = await _process_provisioning_item(item) + elif item.status == InstanceStatus.IDLE: + process_context = await _process_idle_item(item) + elif item.status == InstanceStatus.BUSY: + process_context = await _process_busy_item(item) + elif item.status == InstanceStatus.TERMINATING: + process_context = await _process_terminating_item(item) + if process_context is None: + return + + # Keep apply centralized here because every instance path returns the same + # `ProcessResult` shape for one primary model, with only a small set of + # optional side effects such as health checks or placement-group scheduling. + await _apply_process_result( + item=item, + instance_model=process_context.instance_model, + result=process_context.result, + ) + + +@dataclass +class _ProcessContext: + instance_model: InstanceModel + result: ProcessResult + + +async def _process_pending_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_pending_or_terminating( + session=session, + item=item, + ) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + if is_ssh_instance(instance_model): + result = await add_ssh_instance(instance_model) + else: + result = await create_cloud_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_provisioning_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_check(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_idle_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as 
session: + instance_model = await _refetch_locked_instance_for_idle(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + idle_result = await process_idle_timeout( + session=session, + instance_model=instance_model, + ) + if idle_result is not None: + return _ProcessContext(instance_model=instance_model, result=idle_result) + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_busy_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_check(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_terminating_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_pending_or_terminating( + session=session, + item=item, + ) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await terminate_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _refetch_locked_instance_for_pending_or_terminating( + session: AsyncSession, item: InstancePipelineItem +) -> Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options(joinedload(InstanceModel.fleet)) + ) + return res.unique().scalar_one_or_none() + + +async def _refetch_locked_instance_for_idle( + 
session: AsyncSession, item: InstancePipelineItem +) -> Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options(joinedload(InstanceModel.project)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options(joinedload(InstanceModel.fleet)) + ) + return res.unique().scalar_one_or_none() + + +async def _refetch_locked_instance_for_check( + session: AsyncSession, item: InstancePipelineItem +) -> Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options( + joinedload(InstanceModel.project).load_only( + ProjectModel.id, + ProjectModel.ssh_public_key, + ProjectModel.ssh_private_key, + ) + ) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + ) + return res.unique().scalar_one_or_none() + + +async def _apply_process_result( + item: InstancePipelineItem, + instance_model: InstanceModel, + result: ProcessResult, +) -> None: + set_processed_update_map_fields(result.instance_update_map) + set_unlock_update_map_fields(result.instance_update_map) + + async with get_session_ctx() as session: + if result.health_check_create is not None: + session.add(InstanceHealthCheckModel(**result.health_check_create)) + if result.new_placement_group_models: + session.add_all(result.new_placement_group_models) + if result.health_check_create is not None or result.new_placement_group_models: + await session.flush() + + now = get_current_datetime() + resolve_now_placeholders(result.instance_update_map, now=now) + + res = await session.execute( + update(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .values(**result.instance_update_map) + .returning(InstanceModel.id) + ) + updated_ids = list(res.scalars().all()) + if 
len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + await session.rollback() + return + + if result.schedule_pg_deletion_fleet_id is not None: + await schedule_fleet_placement_groups_deletion( + session=session, + fleet_id=result.schedule_pg_deletion_fleet_id, + except_placement_group_ids=( + () + if result.schedule_pg_deletion_except_id is None + else (result.schedule_pg_deletion_except_id,) + ), + ) + + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + new_status=result.instance_update_map.get("status", instance_model.status), + termination_reason=result.instance_update_map.get( + "termination_reason", instance_model.termination_reason + ), + termination_reason_message=result.instance_update_map.get( + "termination_reason_message", + instance_model.termination_reason_message, + ), + ) + _emit_instance_health_change_event( + session=session, + instance_model=instance_model, + old_health=instance_model.health, + new_health=result.instance_update_map.get("health", instance_model.health), + ) + _emit_instance_reachability_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + old_unreachable=instance_model.unreachable, + new_unreachable=result.instance_update_map.get( + "unreachable", instance_model.unreachable + ), + ) + + +def _emit_instance_health_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_health: HealthStatus, + new_health: HealthStatus, +) -> None: + if old_health == new_health: + return + events.emit( + session, + f"Instance health changed {old_health.upper()} -> {new_health.upper()}", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) + + +def _emit_instance_reachability_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_status: InstanceStatus, + old_unreachable: bool, + new_unreachable: bool, +) -> None: + if 
not old_status.is_available() or old_unreachable == new_unreachable: + return + events.emit( + session, + "Instance became unreachable" if new_unreachable else "Instance became reachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py new file mode 100644 index 0000000000..486c83dbf6 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py @@ -0,0 +1,569 @@ +import logging +import uuid +from collections.abc import Mapping +from datetime import timedelta +from typing import Optional + +import gpuhunt +import requests +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.base.compute import ( + get_dstack_runner_download_url, + get_dstack_runner_version, + get_dstack_shim_download_url, + get_dstack_shim_version, +) +from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import ProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.profiles import TerminationPolicy +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + TERMINATION_DEADLINE_OFFSET, + HealthCheckCreate, + ProcessResult, + can_terminate_fleet_instances_on_idle_duration, + get_instance_idle_duration, + get_provisioning_deadline, + set_health_update, + set_status_update, + 
set_unreachable_update, +) +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceModel, ProjectModel +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import ( + ComponentInfo, + ComponentStatus, + InstanceHealthResponse, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.instances import ( + get_instance_provisioning_data, + get_instance_ssh_private_keys, + is_ssh_instance, + remove_dangling_tasks_from_instance, +) +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.runner import client as runner_client +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from dstack._internal.utils.common import get_current_datetime, get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def process_idle_timeout( + session: AsyncSession, + instance_model: InstanceModel, +) -> Optional[ProcessResult]: + if not ( + instance_model.status == InstanceStatus.IDLE + and instance_model.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE + and not instance_model.jobs + ): + return None + # Do not terminate instances on idle duration if fleet is already at `nodes.min`. + # This is an optimization to avoid terminate-create loop. + # There may be race conditions since we don't take the fleet lock. + # That's ok: in the worst case we go below `nodes.min`, but + # the fleet consolidation logic will provision new nodes. 
+ if ( + instance_model.fleet is not None + and not await can_terminate_fleet_instances_on_idle_duration( + session=session, + fleet_model=instance_model.fleet, + ) + ): + return None + + idle_duration = get_instance_idle_duration(instance_model) + if idle_duration <= timedelta(seconds=instance_model.termination_idle_time): + return None + + result = ProcessResult() + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.IDLE_TIMEOUT, + termination_reason_message=f"Instance idle for {int(idle_duration.total_seconds())}s", # total_seconds(): timedelta.seconds drops whole days + ) + return result + + +async def check_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + if ( + instance_model.status == InstanceStatus.BUSY + and instance_model.jobs + and all(job.status.is_finished() for job in instance_model.jobs) + ): + # A busy instance could have no active jobs due to this bug: + # https://github.com/dstackai/dstack/issues/2068 + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.JOB_FINISHED, + ) + logger.warning( + "Detected busy instance %s with finished job. 
Marked as TERMINATING", + instance_model.name, + extra={ + "instance_name": instance_model.name, + "instance_status": instance_model.status.value, + }, + ) + return result + + job_provisioning_data = get_or_error(get_instance_provisioning_data(instance_model)) + if job_provisioning_data.hostname is None: + return await _process_wait_for_instance_provisioning_data( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + + if not job_provisioning_data.dockerized: + if instance_model.status == InstanceStatus.PROVISIONING: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.BUSY, + ) + return result + + check_instance_health = await _should_check_instance_health(instance_model.id) + instance_check = await _run_instance_check( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + check_instance_health=check_instance_health, + ) + health_status = _get_health_status_for_instance_check( + instance_model=instance_model, + instance_check=instance_check, + check_instance_health=check_instance_health, + ) + _log_instance_check_result( + instance_model=instance_model, + instance_check=instance_check, + health_status=health_status, + check_instance_health=check_instance_health, + ) + + if instance_check.has_health_checks(): + # ensured by has_health_checks() + assert instance_check.health_response is not None + result.health_check_create = HealthCheckCreate( + instance_id=instance_model.id, + collected_at=get_current_datetime(), + status=health_status, + response=instance_check.health_response.json(), + ) + + set_health_update( + update_map=result.instance_update_map, + instance_model=instance_model, + health=health_status, + ) + set_unreachable_update( + update_map=result.instance_update_map, + instance_model=instance_model, + unreachable=not instance_check.reachable, + ) + + if instance_check.reachable: + 
result.instance_update_map["termination_deadline"] = None + if instance_model.status == InstanceStatus.PROVISIONING: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.IDLE if not instance_model.jobs else InstanceStatus.BUSY, + ) + return result + + now = get_current_datetime() + if not is_ssh_instance(instance_model) and instance_model.termination_deadline is None: + result.instance_update_map["termination_deadline"] = now + TERMINATION_DEADLINE_OFFSET + + if ( + instance_model.status == InstanceStatus.PROVISIONING + and instance_model.started_at is not None + ): + provisioning_deadline = get_provisioning_deadline( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + if now > provisioning_deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT, + termination_reason_message="Instance did not become reachable in time", + ) + elif instance_model.status.is_available(): + deadline = instance_model.termination_deadline + if deadline is not None and now > deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.UNREACHABLE, + ) + return result + + +async def _should_check_instance_health(instance_id) -> bool: + health_check_cutoff = get_current_datetime() - timedelta( + seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS + ) + async with get_session_ctx() as session: + res = await session.execute( + select(func.count(1)).where( + InstanceHealthCheckModel.instance_id == instance_id, + InstanceHealthCheckModel.collected_at > health_check_cutoff, + ) + ) + return res.scalar_one() == 0 + + +async def _run_instance_check( + instance_model: InstanceModel, + 
job_provisioning_data: JobProvisioningData, + check_instance_health: bool, +) -> InstanceCheck: + ssh_private_keys = get_instance_ssh_private_keys(instance_model) + instance_check = await run_async( + _check_instance_inner, + ssh_private_keys, + job_provisioning_data, + None, + instance=instance_model, + check_instance_health=check_instance_health, + ) + # May return False if fails to establish ssh connection. + if instance_check is False: + return InstanceCheck(reachable=False, message="SSH or tunnel error") + return instance_check + + +def _get_health_status_for_instance_check( + instance_model: InstanceModel, + instance_check: InstanceCheck, + check_instance_health: bool, +) -> HealthStatus: + if instance_check.reachable and check_instance_health: + return instance_check.get_health_status() + # Keep previous health status. + return instance_model.health + + +def _log_instance_check_result( + instance_model: InstanceModel, + instance_check: InstanceCheck, + health_status: HealthStatus, + check_instance_health: bool, +) -> None: + loglevel = logging.DEBUG + if not instance_check.reachable and instance_model.status.is_available(): + loglevel = logging.WARNING + elif check_instance_health and not health_status.is_healthy(): + loglevel = logging.WARNING + logger.log( + loglevel, + "Instance %s check: reachable=%s health_status=%s message=%r", + instance_model.name, + instance_check.reachable, + health_status.name, + instance_check.message, + extra={"instance_name": instance_model.name, "health_status": health_status}, + ) + + +async def _process_wait_for_instance_provisioning_data( + instance_model: InstanceModel, + job_provisioning_data: JobProvisioningData, +) -> ProcessResult: + result = ProcessResult() + logger.debug("Waiting for instance %s to become running", instance_model.name) + provisioning_deadline = get_provisioning_deadline( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + if get_current_datetime() > 
provisioning_deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT, + termination_reason_message="Backend did not complete provisioning in time", + ) + return result + + backend = await _get_backend_for_provisioning_wait( + project_id=instance_model.project_id, + backend_type=job_provisioning_data.backend, + ) + if backend is None: + logger.warning( + "Instance %s failed because instance's backend is not available", + instance_model.name, + ) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Backend not available", + ) + return result + + try: + await run_async( + backend.compute().update_provisioning_data, + job_provisioning_data, + instance_model.project.ssh_public_key, + instance_model.project.ssh_private_key, + ) + result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json() + except ProvisioningError as exc: + logger.warning( + "Error while waiting for instance %s to become running: %s", + instance_model.name, + repr(exc), + ) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Error while waiting for instance to become running", + ) + except Exception: + logger.exception( + "Got exception when updating instance %s provisioning data", + instance_model.name, + ) + return result + + +async def _get_backend_for_provisioning_wait( + project_id: uuid.UUID, + backend_type: BackendType, +) -> Optional[Backend]: + async with get_session_ctx() as session: + res = await session.execute( + select(ProjectModel) + .where(ProjectModel.id == project_id) + 
.options(joinedload(ProjectModel.backends)) + ) + project_model = res.unique().scalar_one_or_none() + if project_model is None: + return None + return await backends_services.get_project_backend_by_type( + project=project_model, + backend_type=backend_type, + ) + + +@runner_ssh_tunnel +def _check_instance_inner( + addresses: Mapping[int, runner_client.LocalAddress], + *, + instance: InstanceModel, + check_instance_health: bool = False, +) -> InstanceCheck: + instance_health_response: Optional[InstanceHealthResponse] = None + shim_client = runner_client.ShimClient.from_address(addresses[DSTACK_SHIM_HTTP_PORT]) + method = shim_client.healthcheck + try: + healthcheck_response = method(unmask_exceptions=True) + if check_instance_health: + method = shim_client.get_instance_health + instance_health_response = method() + except requests.RequestException as exc: + template = "shim.%s(): request error: %s" + args = (method.__func__.__name__, exc) + logger.debug(template, *args) + return InstanceCheck(reachable=False, message=template % args) + except Exception as exc: + template = "shim.%s(): unexpected exception %s: %s" + args = (method.__func__.__name__, exc.__class__.__name__, exc) + logger.exception(template, *args) + return InstanceCheck(reachable=False, message=template % args) + + try: + remove_dangling_tasks_from_instance(shim_client, instance) + except Exception as exc: + logger.exception("%s: error removing dangling tasks: %s", fmt(instance), exc) + + # There should be no shim API calls after this function call since it can request shim restart. 
def _maybe_install_components(
    instance_model: InstanceModel,
    shim_client: runner_client.ShimClient,
) -> None:
    """Request runner/shim installation if needed, then restart the shim when appropriate.

    The components list is fetched from the shim; a request error or missing
    components info ends the function early. Runner and shim installation are
    delegated to `_maybe_install_runner()` / `_maybe_install_shim()`. A shim
    shutdown is requested only when a shim version is installed that differs
    from the running one and none of the guard conditions below apply.
    """
    try:
        components = shim_client.get_components()
    except requests.RequestException as exc:
        logger.warning(
            "Instance %s: shim.get_components(): request error: %s", instance_model.name, exc
        )
        return
    if components is None:
        logger.debug("Instance %s: no components info", instance_model.name)
        return

    # Stays None unless the shim component reports the INSTALLED status.
    installed_shim_version: Optional[str] = None
    # Becomes True as soon as any install request has been sent in this call.
    installation_requested = False

    if (runner_info := components.runner) is not None:
        installation_requested |= _maybe_install_runner(instance_model, shim_client, runner_info)
    else:
        logger.debug("Instance %s: no runner info", instance_model.name)

    if (shim_info := components.shim) is not None:
        if shim_info.status == ComponentStatus.INSTALLED:
            installed_shim_version = shim_info.version
        installation_requested |= _maybe_install_shim(instance_model, shim_client, shim_info)
    else:
        logger.debug("Instance %s: no shim info", instance_model.name)

    # Do not restart the shim in any of these cases:
    # old shim without `dstack-shim` component and `/api/shutdown` support
    # or the same version is already running
    # or we just requested installation of at least one component
    # or at least one component is already being installed
    # or at least one shim task won't survive restart
    running_shim_version = shim_client.get_version_string()
    if (
        installed_shim_version is None
        or installed_shim_version == running_shim_version
        or installation_requested
        or any(component.status == ComponentStatus.INSTALLING for component in components)
        or not shim_client.is_safe_to_restart()
    ):
        return

    # Non-forced shutdown; the boolean result decides which message is logged.
    if shim_client.shutdown(force=False):
        logger.debug(
            "Instance %s: restarting shim %s -> %s",
            instance_model.name,
            running_shim_version,
            installed_shim_version,
        )
    else:
        logger.debug("Instance %s: cannot restart shim", instance_model.name)
def _maybe_install_runner(
    instance_model: InstanceModel,
    shim_client: runner_client.ShimClient,
    runner_info: ComponentInfo,
) -> bool:
    """Ask the shim to install the expected runner build unless it is already in place.

    Nothing is requested when no expected version is configured, when the runner
    is currently being installed, or when the installed version already equals
    the expected one.

    Returns:
        True only if an install request was sent and did not raise
        `requests.RequestException`; False in every other case.
    """
    # For developers:
    # * To install the latest dev build for the current branch from the CI,
    #   set DSTACK_USE_LATEST_FROM_BRANCH=1.
    # * To provide your own build, set DSTACK_RUNNER_VERSION_URL and DSTACK_RUNNER_DOWNLOAD_URL.
    target_version = get_dstack_runner_version()
    if target_version is None:
        return False

    current_version = runner_info.version
    # Placeholder shown in logs when the shim reports no runner version.
    shown_version = current_version or "(no version)"
    logger.debug(
        "Instance %s: runner status=%s installed_version=%s",
        instance_model.name,
        runner_info.status.value,
        shown_version,
    )

    if runner_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: runner is already being installed", instance_model.name)
        return False
    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected runner version already installed", instance_model.name)
        return False

    download_url = get_dstack_runner_download_url(
        arch=_get_instance_cpu_arch(instance_model),
        version=target_version,
    )
    logger.debug(
        "Instance %s: installing runner %s -> %s from %s",
        instance_model.name,
        shown_version,
        target_version,
        download_url,
    )
    try:
        shim_client.install_runner(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_runner(): %s", instance_model.name, exc)
        return False
    return True
def _maybe_install_shim(
    instance_model: InstanceModel,
    shim_client: runner_client.ShimClient,
    shim_info: ComponentInfo,
) -> bool:
    """Ask the shim to install the expected shim build unless it is already in place.

    Nothing is requested when no expected version is configured, when the shim
    component is currently being installed, or when the installed version
    already equals the expected one.

    Returns:
        True only if an install request was sent and did not raise
        `requests.RequestException`; False in every other case.
    """
    # For developers:
    # * To install the latest dev build for the current branch from the CI,
    #   set DSTACK_USE_LATEST_FROM_BRANCH=1.
    # * To provide your own build, set DSTACK_SHIM_VERSION_URL and DSTACK_SHIM_DOWNLOAD_URL.
    target_version = get_dstack_shim_version()
    if target_version is None:
        return False

    current_version = shim_info.version
    # Placeholder shown in logs when the shim reports no installed version.
    shown_version = current_version or "(no version)"
    logger.debug(
        "Instance %s: shim status=%s installed_version=%s running_version=%s",
        instance_model.name,
        shim_info.status.value,
        shown_version,
        shim_client.get_version_string(),
    )

    if shim_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: shim is already being installed", instance_model.name)
        return False
    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected shim version already installed", instance_model.name)
        return False

    download_url = get_dstack_shim_download_url(
        arch=_get_instance_cpu_arch(instance_model),
        version=target_version,
    )
    logger.debug(
        "Instance %s: installing shim %s -> %s from %s",
        instance_model.name,
        shown_version,
        target_version,
        download_url,
    )
    try:
        shim_client.install_shim(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_shim(): %s", instance_model.name, exc)
        return False
    return True


def _get_instance_cpu_arch(instance_model: InstanceModel) -> Optional[gpuhunt.CPUArchitecture]:
    """Return the CPU architecture from the instance's provisioning data, or None if there is none."""
    provisioning_data = get_instance_provisioning_data(instance_model)
    return (
        None
        if provisioning_data is None
        else provisioning_data.instance_type.resources.cpu_arch
    )
+from sqlalchemy.orm import load_only +from sqlalchemy.orm.attributes import set_committed_value + +from dstack._internal.core.backends.base.compute import ( + ComputeWithCreateInstanceSupport, + ComputeWithPlacementGroupSupport, + generate_unique_placement_group_name, +) +from dstack._internal.core.backends.features import ( + BACKENDS_WITH_CREATE_INSTANCE_SUPPORT, + BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT, +) +from dstack._internal.core.errors import ( + BackendError, + PlacementGroupNotSupportedError, + SkipOffer, +) +from dstack._internal.core.models.instances import ( + InstanceOfferWithAvailability, + InstanceStatus, + InstanceTerminationReason, +) +from dstack._internal.core.models.placement import PlacementGroupConfiguration, PlacementStrategy +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, + set_status_update, +) +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import FleetModel, InstanceModel, PlacementGroupModel +from dstack._internal.server.services.fleets import get_fleet_offers, is_cloud_cluster +from dstack._internal.server.services.instances import ( + get_instance_configuration, + get_instance_profile, + get_instance_provisioning_data, + get_instance_requirements, +) +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import get_instance_offer_with_restricted_az +from dstack._internal.server.services.placement import ( + get_fleet_placement_group_models, + placement_group_model_to_placement_group, + placement_group_model_to_placement_group_optional, +) +from dstack._internal.utils.common import get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = 
@dataclass
class _ClusterMasterContext:
    """What a cluster fleet instance knows about the fleet's current master."""

    # The instance currently recorded as the fleet's master.
    current_master_instance_model: InstanceModel
    # True when the instance being processed is that master.
    is_current_instance_master: bool
    # The master's provisioning data; left as None when the instance being
    # processed is the master itself.
    master_job_provisioning_data: Optional[JobProvisioningData]


async def create_cloud_instance(instance_model: InstanceModel) -> ProcessResult:
    """Try to provision a cloud instance from the available fleet offers.

    Nothing is written to the database here: the outcome is recorded in the
    returned `ProcessResult` (instance field updates, newly created placement
    group models, and a request to schedule deletion of unused placement groups).

    Outcomes visible in the result:
      * configuration parsing fails -> status TERMINATED with reason ERROR;
      * cluster fleet whose master context is not available yet -> empty result;
      * an offer succeeds -> status PROVISIONING plus backend/offer/provisioning data;
      * no offer succeeds -> status TERMINATED with reason NO_OFFERS.
    """
    result = ProcessResult()

    try:
        instance_configuration = get_instance_configuration(instance_model)
        profile = get_instance_profile(instance_model)
        requirements = get_instance_requirements(instance_model)
    except ValidationError as exc:
        logger.exception(
            "%s: error parsing profile, requirements or instance configuration",
            fmt(instance_model),
        )
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.ERROR,
            termination_reason_message=(
                f"Error to parse profile, requirements or instance_configuration: {exc}"
            ),
        )
        return result

    # Cluster-only state; all of it stays empty for non-cluster fleets.
    cluster_context = None
    placement_group_models: list[PlacementGroupModel] = []
    placement_group_model = None
    master_job_provisioning_data = None
    if instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet):
        cluster_context = await _get_cluster_master_context(instance_model)
        if cluster_context is None:
            # Waiting for the master
            return result
        placement_group_models, placement_group_model = await _get_cluster_placement_context(
            instance_model=instance_model,
            cluster_context=cluster_context,
        )
        master_job_provisioning_data = cluster_context.master_job_provisioning_data

    offers = await get_fleet_offers(
        project=instance_model.project,
        profile=profile,
        requirements=requirements,
        fleet_model=instance_model.fleet,
        placement_group=placement_group_model_to_placement_group_optional(placement_group_model),
        blocks="auto" if instance_model.total_blocks is None else instance_model.total_blocks,
        exclude_not_available=True,
        master_job_provisioning_data=master_job_provisioning_data,
        infer_master_job_provisioning_data_from_fleet_instances=False,
        include_only_create_instance_supported_backends=True,
    )

    offers_iter = iter(offers)
    offers_tried = 0
    # Limit number of offers tried to prevent long-running processing in case all offers fail.
    while offers_tried < server_settings.MAX_OFFERS_TRIED:
        backend_with_instance_offer = next(offers_iter, None)
        if backend_with_instance_offer is None:
            break
        backend, instance_offer = backend_with_instance_offer

        # Offers skipped here do not count towards `offers_tried`.
        if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
            continue
        compute = backend.compute()
        assert isinstance(compute, ComputeWithCreateInstanceSupport)
        if master_job_provisioning_data is not None:
            # `get_fleet_offers()` already restricts backend and region from the master.
            # Availability zone still has to be narrowed per offer.
            instance_offer = get_instance_offer_with_restricted_az(
                instance_offer=instance_offer,
                master_job_provisioning_data=master_job_provisioning_data,
            )
        # Only the current cluster master looks for (or creates) a placement group.
        if (
            cluster_context is not None
            and cluster_context.is_current_instance_master
            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
            and isinstance(compute, ComputeWithPlacementGroupSupport)
            and (
                compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
                or instance_configuration.reservation is None
            )
        ):
            (
                placement_group_model,
                created_placement_group_model,
            ) = await _find_or_create_suitable_placement_group_model(
                instance_model=instance_model,
                placement_group_models=placement_group_models,
                instance_offer=instance_offer,
                compute=compute,
            )
            if placement_group_model is None:
                continue
            if created_placement_group_model:
                # Remember the new group for later offers and report it to the caller.
                placement_group_models.append(placement_group_model)
                result.new_placement_group_models.append(placement_group_model)

        logger.debug(
            "Trying %s in %s/%s for $%0.4f per hour",
            instance_offer.instance.name,
            instance_offer.backend.value,
            instance_offer.region,
            instance_offer.price,
        )
        offers_tried += 1
        try:
            job_provisioning_data = await run_async(
                compute.create_instance,
                instance_offer,
                instance_configuration,
                placement_group_model_to_placement_group_optional(placement_group_model),
            )
        except SkipOffer as exc:
            # A skipped offer is not counted as an attempt.
            offers_tried -= 1
            logger.info(
                "%s launch in %s/%s skipped: %s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
                exc,
                extra={"instance_name": instance_model.name},
            )
            continue
        except BackendError as exc:
            logger.warning(
                "%s launch in %s/%s failed: %s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
                repr(exc),
                extra={"instance_name": instance_model.name},
            )
            continue
        except Exception:
            logger.exception(
                "Got exception when launching %s in %s/%s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
            )
            continue

        # The offer was accepted by the backend: record everything and stop.
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.PROVISIONING,
        )
        result.instance_update_map["backend"] = backend.TYPE
        result.instance_update_map["region"] = instance_offer.region
        result.instance_update_map["price"] = instance_offer.price
        result.instance_update_map["instance_configuration"] = instance_configuration.json()
        result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json()
        result.instance_update_map["offer"] = instance_offer.json()
        result.instance_update_map["total_blocks"] = instance_offer.total_blocks
        result.instance_update_map["started_at"] = NOW_PLACEHOLDER

        if (
            instance_model.fleet_id is not None
            and cluster_context is not None
            and cluster_context.is_current_instance_master
        ):
            # Clean up placement groups that did not end up being used.
            result.schedule_pg_deletion_fleet_id = instance_model.fleet_id
            if placement_group_model is not None:
                result.schedule_pg_deletion_except_id = placement_group_model.id
        return result

    # Reached when the offers ran out or the attempt limit was hit without success.
    set_status_update(
        update_map=result.instance_update_map,
        instance_model=instance_model,
        new_status=InstanceStatus.TERMINATED,
        termination_reason=InstanceTerminationReason.NO_OFFERS,
        termination_reason_message="All offers failed" if offers else "No offers found",
    )
    return result
async def _get_cluster_master_context(
    instance_model: InstanceModel,
) -> Optional[_ClusterMasterContext]:
    """Work out how the given cluster instance relates to the fleet's current master.

    Returns:
        `None` when provisioning has to wait: no current master is recorded for
        the fleet, or (for a non-master instance) the master is deleted or
        terminated, or the master has no provisioning data yet. Otherwise a
        `_ClusterMasterContext`; its `master_job_provisioning_data` is only
        filled in for non-master instances.
    """
    assert instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet)
    assert instance_model.fleet_id is not None
    # NOTE(review): recovered from whitespace-flattened text; the extent of the
    # session scope below is an assumption -- confirm against the original file.
    async with get_session_ctx() as session:
        current_master_instance_model = await _load_current_master_instance(
            session=session,
            fleet_id=instance_model.fleet_id,
        )
        if current_master_instance_model is None:
            logger.debug(
                "%s: waiting for fleet pipeline to elect current cluster master",
                fmt(instance_model),
            )
            return None

        is_current_instance_master = current_master_instance_model.id == instance_model.id
        master_job_provisioning_data = None
        if not is_current_instance_master:
            # A non-master instance can only proceed once a live master has
            # provisioning data to align with.
            if (
                current_master_instance_model.deleted
                or current_master_instance_model.status == InstanceStatus.TERMINATED
            ):
                logger.debug(
                    "%s: waiting for fleet pipeline to replace current master %s",
                    fmt(instance_model),
                    current_master_instance_model.id,
                )
                return None
            master_job_provisioning_data = get_instance_provisioning_data(
                current_master_instance_model
            )
            if master_job_provisioning_data is None:
                logger.debug(
                    "%s: waiting for current master %s to determine cluster placement",
                    fmt(instance_model),
                    current_master_instance_model.id,
                )
                return None

        return _ClusterMasterContext(
            current_master_instance_model=current_master_instance_model,
            is_current_instance_master=is_current_instance_master,
            master_job_provisioning_data=master_job_provisioning_data,
        )
async def _get_cluster_placement_context(
    instance_model: InstanceModel,
    cluster_context: _ClusterMasterContext,
) -> tuple[list[PlacementGroupModel], Optional[PlacementGroupModel]]:
    """Load the fleet's placement groups and pick the one a non-master instance must reuse.

    Returns:
        A pair `(all fleet placement group models, selected model)`. The selected
        model is always `None` for the current master.
    """
    assert instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet)
    assert instance_model.fleet_id is not None
    fleet_id = instance_model.fleet_id

    async with get_session_ctx() as session:
        fleet_placement_groups = await get_fleet_placement_group_models(
            session=session,
            fleet_id=fleet_id,
        )
        for loaded_group in fleet_placement_groups:
            _populate_placement_group_relations(
                placement_group_model=loaded_group,
                instance_model=instance_model,
            )

    if cluster_context.is_current_instance_master:
        return fleet_placement_groups, None

    # Non-master instances only reuse the placement group chosen by the
    # current master. They never create a new placement group themselves.
    master_placement_group = _get_current_master_placement_group_model(
        placement_group_models=fleet_placement_groups,
        fleet_id=fleet_id,
    )
    return fleet_placement_groups, master_placement_group


async def _load_current_master_instance(
    session: AsyncSession,
    fleet_id: uuid.UUID,
) -> Optional[InstanceModel]:
    """Fetch the fleet's current master instance with only the columns needed here.

    Returns:
        `None` if the fleet has no current master id (or the fleet row is not
        found), or if no instance with that id exists.
    """
    master_id_query = select(FleetModel.current_master_instance_id).where(
        FleetModel.id == fleet_id
    )
    master_id = (await session.execute(master_id_query)).scalar_one_or_none()
    if master_id is None:
        return None

    # Restrict the load to id, deleted, status and job_provisioning_data.
    needed_columns = load_only(
        InstanceModel.id,
        InstanceModel.deleted,
        InstanceModel.status,
        InstanceModel.job_provisioning_data,
    )
    master_query = (
        select(InstanceModel)
        .where(
            InstanceModel.id == master_id,
        )
        .options(needed_columns)
    )
    return (await session.execute(master_query)).scalar_one_or_none()
def _get_current_master_placement_group_model(
    placement_group_models: list[PlacementGroupModel],
    fleet_id: uuid.UUID,
) -> Optional[PlacementGroupModel]:
    """Return the first placement group of the fleet, or None if there are none.

    More than one placement group is unexpected; it is reported with an error
    log entry and the first one is still returned.
    """
    if placement_group_models:
        found = len(placement_group_models)
        if found > 1:
            logger.error(
                "Expected 0 or 1 placement groups associated with fleet master %s, found %s."
                " Using the first placement group for this provisioning attempt.",
                fleet_id,
                found,
            )
        return placement_group_models[0]
    return None


def _populate_placement_group_relations(
    placement_group_model: PlacementGroupModel,
    instance_model: InstanceModel,
) -> None:
    """Attach the instance's project (and fleet, if set) to a placement group model."""
    # Placement groups are loaded in a separate session from the instance worker.
    # Reattach the already-known project/fleet objects so later detached access
    # can build a PlacementGroup value object without lazy loading.
    relations = [("project", instance_model.project)]
    if instance_model.fleet is not None:
        relations.append(("fleet", instance_model.fleet))
    for attribute_name, related_object in relations:
        set_committed_value(placement_group_model, attribute_name, related_object)
async def _find_or_create_suitable_placement_group_model(
    instance_model: InstanceModel,
    placement_group_models: list[PlacementGroupModel],
    instance_offer: InstanceOfferWithAvailability,
    compute: ComputeWithPlacementGroupSupport,
) -> tuple[Optional[PlacementGroupModel], bool]:
    """Reuse a placement group that suits the offer, or create a new one in the backend.

    Returns:
        `(model, created)`:
          * `(existing model, False)` when one of `placement_group_models` is
            accepted by `compute.is_suitable_placement_group()`;
          * `(new model, True)` when a new placement group was created;
          * `(None, False)` when creation failed for any reason (the failure is
            logged, not raised).
    """
    for placement_group_model in placement_group_models:
        if compute.is_suitable_placement_group(
            placement_group_model_to_placement_group(placement_group_model),
            instance_offer,
        ):
            return placement_group_model, False

    # No suitable group exists: build a new model for the offer's backend and region.
    assert instance_model.fleet is not None
    placement_group_id = uuid.uuid4()
    placement_group_name = generate_unique_placement_group_name(
        project_name=instance_model.project.name,
        fleet_name=instance_model.fleet.name,
    )
    placement_group_model = PlacementGroupModel(
        id=placement_group_id,
        name=placement_group_name,
        project=instance_model.project,
        fleet=get_or_error(instance_model.fleet),
        configuration=PlacementGroupConfiguration(
            backend=instance_offer.backend,
            region=instance_offer.region,
            placement_strategy=PlacementStrategy.CLUSTER,
        ).json(),
    )
    placement_group = placement_group_model_to_placement_group(placement_group_model)
    logger.debug(
        "Creating placement group %s in %s/%s",
        placement_group.name,
        placement_group.configuration.backend.value,
        placement_group.configuration.region,
    )
    try:
        provisioning_data = await run_async(
            compute.create_placement_group,
            placement_group,
            instance_offer,
        )
    except PlacementGroupNotSupportedError:
        # Logged at debug level only, unlike the other failure branches.
        logger.debug(
            "Skipping offer %s because placement group not supported",
            instance_offer.instance.name,
        )
        return None, False
    except BackendError as exc:
        logger.warning(
            "Failed to create placement group %s in %s/%s: %r",
            placement_group.name,
            placement_group.configuration.backend.value,
            placement_group.configuration.region,
            exc,
        )
        return None, False
    except Exception:
        logger.exception(
            "Got exception when creating placement group %s in %s/%s",
            placement_group.name,
            placement_group.configuration.backend.value,
            placement_group.configuration.region,
        )
        return None, False

    # Store the backend's provisioning data on both the value object and the model.
    placement_group.provisioning_data = provisioning_data
    placement_group_model.provisioning_data = provisioning_data.json()
    return placement_group_model, True
class InstanceUpdateMap(ItemUpdateMap, total=False):
    """Partial set of instance fields to update; every key is optional (`total=False`)."""

    status: InstanceStatus
    unreachable: bool
    started_at: UpdateMapDateTime
    finished_at: UpdateMapDateTime
    # Serialized (string) form of the instance configuration.
    instance_configuration: str
    termination_deadline: Optional[datetime.datetime]
    termination_reason: Optional[InstanceTerminationReason]
    termination_reason_message: Optional[str]
    health: HealthStatus
    first_termination_retry_at: UpdateMapDateTime
    last_termination_retry_at: UpdateMapDateTime
    backend: BackendType
    backend_data: Optional[str]
    # Serialized (string) form of the selected offer.
    offer: str
    region: str
    price: float
    # Serialized (string) form of the job provisioning data.
    job_provisioning_data: str
    total_blocks: int
    busy_blocks: int
    deleted: bool
    deleted_at: UpdateMapDateTime


class HealthCheckCreate(TypedDict):
    """Fields of a health check record to be created for an instance."""

    instance_id: uuid.UUID
    collected_at: datetime.datetime
    status: HealthStatus
    response: str


@dataclass
class ProcessResult:
    """Outcome of processing one instance, returned to the caller instead of being applied here."""

    # Instance fields to update.
    instance_update_map: InstanceUpdateMap = field(default_factory=InstanceUpdateMap)
    # Optional health check record to create.
    health_check_create: Optional[HealthCheckCreate] = None
    # Placement group models created while processing.
    new_placement_group_models: list[PlacementGroupModel] = field(default_factory=list)
    # Fleet whose placement groups should be scheduled for deletion, if any.
    schedule_pg_deletion_fleet_id: Optional[uuid.UUID] = None
    # Placement group id recorded alongside the fleet id above; presumably the
    # group to keep when the others are deleted -- confirm against the consumer.
    schedule_pg_deletion_except_id: Optional[uuid.UUID] = None
async def can_terminate_fleet_instances_on_idle_duration(
    session: AsyncSession,
    fleet_model: FleetModel,
) -> bool:
    """Tell whether an idle instance of the fleet may be terminated.

    Always True when the fleet spec has no `nodes` configuration or the fleet is
    autocreated. Otherwise True only while the number of non-deleted instances
    in a non-finished status exceeds `nodes.min`.
    """
    fleet_spec = get_fleet_spec(fleet_model)
    nodes = fleet_spec.configuration.nodes
    # TODO: Drop the fleet_spec.autocreated check once existing autocreated fleets
    # are no longer supported.
    if nodes is None or fleet_spec.autocreated:
        return True

    active_instances_query = select(func.count(1)).where(
        InstanceModel.fleet_id == fleet_model.id,
        InstanceModel.deleted == False,
        InstanceModel.status.not_in(InstanceStatus.finished_statuses()),
    )
    active_instances = (await session.execute(active_instances_query)).scalar_one()
    return active_instances > fleet_spec.configuration.nodes.min


def get_instance_idle_duration(instance_model: InstanceModel) -> datetime.timedelta:
    """Return the time elapsed since the last processed job, or since creation if there was none."""
    idle_since = (
        instance_model.last_job_processed_at
        if instance_model.last_job_processed_at is not None
        else instance_model.created_at
    )
    return get_current_datetime() - idle_since


def get_provisioning_deadline(
    instance_model: InstanceModel,
    job_provisioning_data: JobProvisioningData,
) -> datetime.datetime:
    """Return `started_at` plus the provisioning timeout for the instance's backend and type.

    `instance_model.started_at` must be set (asserted).
    """
    started_at = instance_model.started_at
    assert started_at is not None
    return started_at + get_provisioning_timeout(
        backend_type=job_provisioning_data.get_base_backend(),
        instance_type_name=job_provisioning_data.instance_type.name,
    )


def next_termination_retry_at(last_termination_retry_at: datetime.datetime) -> datetime.datetime:
    """Return the earliest time of the next termination retry after the last one."""
    retry_at = last_termination_retry_at + TERMINATION_RETRY_TIMEOUT
    return retry_at


def get_termination_deadline(first_termination_retry_at: datetime.datetime) -> datetime.datetime:
    """Return the time after which termination retries exceed their maximum duration."""
    deadline = first_termination_retry_at + TERMINATION_RETRY_MAX_DURATION
    return deadline


def ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
    """Convert SSH keys to `PKey` objects, skipping keys without a private part."""
    pkeys: list[PKey] = []
    for ssh_key in ssh_keys:
        if ssh_key.private is None:
            continue
        pkeys.append(pkey_from_str(ssh_key.private))
    return pkeys
def set_status_update(
    update_map: InstanceUpdateMap,
    instance_model: InstanceModel,
    new_status: InstanceStatus,
    termination_reason: Union[Optional[InstanceTerminationReason], Unset] = UNSET,
    termination_reason_message: Union[Optional[str], Unset] = UNSET,
) -> bool:
    """Record a status transition (and optional termination details) in `update_map`.

    `termination_reason` / `termination_reason_message` are written whenever they
    are passed (i.e. are not `UNSET`), even if the status itself does not change.
    `status` is written only when `new_status` differs from the model's status.

    Returns:
        True if anything was written to `update_map`, otherwise False.
    """
    status_changes = not (instance_model.status == new_status)
    reason_given = not isinstance(termination_reason, Unset)
    message_given = not isinstance(termination_reason_message, Unset)

    if reason_given:
        update_map["termination_reason"] = termination_reason
    if message_given:
        update_map["termination_reason_message"] = termination_reason_message

    if not status_changes:
        return reason_given or message_given
    update_map["status"] = new_status
    return True


def set_health_update(
    update_map: InstanceUpdateMap,
    instance_model: InstanceModel,
    health: HealthStatus,
) -> bool:
    """Write `health` to `update_map` if it differs from the model's value; return whether it did."""
    if not (instance_model.health == health):
        update_map["health"] = health
        return True
    return False


def set_unreachable_update(
    update_map: InstanceUpdateMap,
    instance_model: InstanceModel,
    unreachable: bool,
) -> bool:
    """Write `unreachable` to `update_map` for an available instance whose flag differs.

    Returns:
        True if the flag was written; False when the instance status is not
        available or the flag already has the requested value.
    """
    if instance_model.status.is_available() and not (instance_model.unreachable == unreachable):
        update_map["unreachable"] = unreachable
        return True
    return False
b/src/dstack/_internal/server/background/pipeline_tasks/instances/ssh_deploy.py @@ -0,0 +1,302 @@ +import asyncio +from datetime import timedelta +from typing import Any, Optional + +from paramiko.pkey import PKey +from paramiko.ssh_exception import PasswordRequiredException +from pydantic import ValidationError + +from dstack._internal import settings +from dstack._internal.core.backends.base.compute import ( + GoArchType, + get_dstack_runner_binary_path, + get_dstack_shim_binary_path, + get_dstack_working_dir, + get_shim_env, + get_shim_pre_start_commands, +) +from dstack._internal.core.errors import SSHProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceRuntime, + InstanceStatus, + InstanceTerminationReason, + RemoteConnectionInfo, +) +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + PROVISIONING_TIMEOUT_SECONDS, + ProcessResult, + set_status_update, + ssh_keys_to_pkeys, +) +from dstack._internal.server.models import InstanceModel +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import HealthcheckResponse +from dstack._internal.server.services.instances import get_instance_remote_connection_info +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import is_divisible_into_blocks +from dstack._internal.server.services.runner import client as runner_client +from dstack._internal.server.services.ssh_fleets.provisioning import ( + detect_cpu_arch, + get_host_info, + get_paramiko_connection, + get_shim_healthcheck, + host_info_to_instance_type, + remove_dstack_runner_if_exists, + remove_host_info_if_exists, + 
async def add_ssh_instance(instance_model: InstanceModel) -> ProcessResult:
    """Deploy dstack onto a user-provided SSH host and describe the resulting instance.

    Nothing is written to the database here: the outcome is recorded in the
    returned `ProcessResult`.

    Outcomes visible in the result:
      * more than `PROVISIONING_TIMEOUT_SECONDS` passed since the instance was
        created -> TERMINATED with reason PROVISIONING_TIMEOUT;
      * SSH keys cannot be loaded, an unexpected deploy error occurs, the network
        cannot be resolved, or the host cannot be split into blocks -> TERMINATED
        with reason ERROR;
      * deploy timeout or `SSHProvisioningError` -> empty result (no status change);
      * success -> IDLE if the shim healthcheck passed, otherwise PROVISIONING,
        plus offer, provisioning data and block count.
    """
    result = ProcessResult()
    logger.info("Adding ssh instance %s...", instance_model.name)

    # Give up once the retry window, counted from instance creation, has elapsed.
    retry_duration_deadline = instance_model.created_at + timedelta(
        seconds=PROVISIONING_TIMEOUT_SECONDS
    )
    if retry_duration_deadline < get_current_datetime():
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT,
            termination_reason_message=(
                f"Failed to add SSH instance in {PROVISIONING_TIMEOUT_SECONDS}s"
            ),
        )
        return result

    remote_details = get_instance_remote_connection_info(instance_model)
    assert remote_details is not None

    try:
        pkeys = ssh_keys_to_pkeys(remote_details.ssh_keys)
        ssh_proxy_pkeys = None
        if remote_details.ssh_proxy_keys is not None:
            ssh_proxy_pkeys = ssh_keys_to_pkeys(remote_details.ssh_proxy_keys)
    except (ValueError, PasswordRequiredException):
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.ERROR,
            termination_reason_message="Unsupported private SSH key type",
        )
        return result

    # Public parts of the user's SSH keys plus the project's public key.
    # (The loop variable is named `pkey` but iterates over `remote_details.ssh_keys`.)
    authorized_keys = [pkey.public.strip() for pkey in remote_details.ssh_keys]
    authorized_keys.append(instance_model.project.ssh_public_key.strip())

    try:
        future = run_async(
            _deploy_instance,
            remote_details,
            pkeys,
            ssh_proxy_pkeys,
            authorized_keys,
        )
        # A single deploy attempt is capped at 20 minutes.
        health, host_info, arch = await asyncio.wait_for(future, timeout=20 * 60)
    except (asyncio.TimeoutError, TimeoutError) as exc:
        # Returning the empty result leaves the instance status unchanged.
        logger.warning(
            "%s: deploy timeout when adding SSH instance: %s",
            fmt(instance_model),
            repr(exc),
        )
        return result
    except SSHProvisioningError as exc:
        # Same as above: logged, no status change.
        logger.warning(
            "%s: provisioning error when adding SSH instance: %s",
            fmt(instance_model),
            repr(exc),
        )
        return result
    except Exception:
        logger.exception("%s: unexpected error when adding SSH instance", fmt(instance_model))
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.ERROR,
            termination_reason_message="Unexpected error when adding SSH instance",
        )
        return result

    instance_type = host_info_to_instance_type(host_info, arch)
    try:
        instance_network, internal_ip = _resolve_ssh_instance_network(instance_model, host_info)
    except _SSHInstanceNetworkResolutionError as exc:
        # The exception text becomes the termination message.
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.ERROR,
            termination_reason_message=str(exc),
        )
        return result

    divisible, blocks = is_divisible_into_blocks(
        cpu_count=instance_type.resources.cpus,
        gpu_count=len(instance_type.resources.gpus),
        blocks="auto" if instance_model.total_blocks is None else instance_model.total_blocks,
    )
    if not divisible:
        set_status_update(
            update_map=result.instance_update_map,
            instance_model=instance_model,
            new_status=InstanceStatus.TERMINATED,
            termination_reason=InstanceTerminationReason.ERROR,
            termination_reason_message="Cannot split into blocks",
        )
        return result

    region = instance_model.region
    assert region is not None
    job_provisioning_data = JobProvisioningData(
        backend=BackendType.REMOTE,
        instance_type=instance_type,
        # A fixed literal is used as the instance id for SSH instances.
        instance_id="instance_id",
        hostname=remote_details.host,
        region=region,
        price=0,
        internal_ip=internal_ip,
        instance_network=instance_network,
        username=remote_details.ssh_user,
        ssh_port=remote_details.port,
        dockerized=True,
        backend_data=None,
        ssh_proxy=remote_details.ssh_proxy,
    )
    instance_offer = InstanceOfferWithAvailability(
        backend=BackendType.REMOTE,
        instance=instance_type,
        region=region,
        price=0,
        availability=InstanceAvailability.AVAILABLE,
        instance_runtime=InstanceRuntime.SHIM,
    )

    set_status_update(
        update_map=result.instance_update_map,
        instance_model=instance_model,
        new_status=InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING,
    )
    result.instance_update_map["backend"] = BackendType.REMOTE
    result.instance_update_map["price"] = 0
    result.instance_update_map["offer"] = instance_offer.json()
    result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json()
    result.instance_update_map["started_at"] = NOW_PLACEHOLDER
    result.instance_update_map["total_blocks"] = blocks
    return result


class _SSHInstanceNetworkResolutionError(Exception):
    """Raised by `_resolve_ssh_instance_network()`; `add_ssh_instance()` turns it into an ERROR termination."""

    pass
): + raise _SSHInstanceNetworkResolutionError( + "Specified internal IP not found among instance interfaces" + ) + return instance_network, internal_ip + + +def _deploy_instance( + remote_details: RemoteConnectionInfo, + pkeys: list[PKey], + ssh_proxy_pkeys: Optional[list[PKey]], + authorized_keys: list[str], +) -> tuple[InstanceCheck, dict[str, Any], GoArchType]: + with get_paramiko_connection( + remote_details.ssh_user, + remote_details.host, + remote_details.port, + pkeys, + remote_details.ssh_proxy, + ssh_proxy_pkeys, + ) as client: + logger.debug("Connected to %s %s", remote_details.ssh_user, remote_details.host) + + arch = detect_cpu_arch(client) + logger.debug("%s: CPU arch is %s", remote_details.host, arch) + + # Execute pre start commands + shim_pre_start_commands = get_shim_pre_start_commands(arch=arch) + run_pre_start_commands(client, shim_pre_start_commands, authorized_keys) + logger.debug("The script for installing dstack has been executed") + + # Upload envs + shim_envs = get_shim_env(arch=arch) + try: + fleet_configuration_envs = remote_details.env.as_dict() + except ValueError as exc: + raise SSHProvisioningError(f"Invalid Env: {exc}") from exc + shim_envs.update(fleet_configuration_envs) + dstack_working_dir = get_dstack_working_dir() + dstack_shim_binary_path = get_dstack_shim_binary_path() + dstack_runner_binary_path = get_dstack_runner_binary_path() + upload_envs(client, dstack_working_dir, shim_envs) + logger.debug("The dstack-shim environment variables have been installed") + + # Ensure we have fresh versions of host info.json and dstack-runner + remove_host_info_if_exists(client, dstack_working_dir) + remove_dstack_runner_if_exists(client, dstack_runner_binary_path) + + # Run dstack-shim as a systemd service + run_shim_as_systemd_service( + client=client, + binary_path=dstack_shim_binary_path, + working_dir=dstack_working_dir, + dev=settings.DSTACK_VERSION is None, + ) + + # Get host info + host_info = get_host_info(client, 
dstack_working_dir) + logger.debug("Received a host_info %s", host_info) + + healthcheck_out = get_shim_healthcheck(client) + try: + healthcheck = HealthcheckResponse.__response__.parse_raw(healthcheck_out) + except ValueError as exc: + raise SSHProvisioningError(f"Cannot parse HealthcheckResponse: {exc}") from exc + instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck) + return instance_check, host_info, arch diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py new file mode 100644 index 0000000000..c4cce13b27 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py @@ -0,0 +1,94 @@ +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, + get_termination_deadline, + next_termination_retry_at, + set_status_update, +) +from dstack._internal.server.models import InstanceModel +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.instances import get_instance_provisioning_data +from dstack._internal.server.services.runner.pool import ( + instance_connection_pool, +) +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def terminate_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + now = get_current_datetime() + if ( + instance_model.last_termination_retry_at is not None + and 
next_termination_retry_at(instance_model.last_termination_retry_at) > now + ): + return result + + job_provisioning_data = get_instance_provisioning_data(instance_model) + if job_provisioning_data is not None and job_provisioning_data.backend != BackendType.REMOTE: + backend = await backends_services.get_project_backend_by_type( + project=instance_model.project, + backend_type=job_provisioning_data.backend, + ) + if backend is None: + logger.error( + "Failed to terminate instance %s. Backend %s not available.", + instance_model.name, + job_provisioning_data.backend, + ) + else: + logger.debug("Terminating runner instance %s", job_provisioning_data.hostname) + try: + await run_async( + backend.compute().terminate_instance, + job_provisioning_data.instance_id, + job_provisioning_data.region, + job_provisioning_data.backend_data, + ) + except Exception as exc: + first_retry_at = instance_model.first_termination_retry_at + if first_retry_at is None: + first_retry_at = now + result.instance_update_map["first_termination_retry_at"] = NOW_PLACEHOLDER + result.instance_update_map["last_termination_retry_at"] = NOW_PLACEHOLDER + if next_termination_retry_at(now) < get_termination_deadline(first_retry_at): + if isinstance(exc, NotYetTerminated): + logger.debug( + "Instance %s termination in progress: %s", + instance_model.name, + exc, + ) + else: + logger.warning( + "Failed to terminate instance %s. Will retry. Error: %r", + instance_model.name, + exc, + exc_info=not isinstance(exc, BackendError), + ) + return result + logger.error( + "Failed all attempts to terminate instance %s." + " Please terminate the instance manually to avoid unexpected charges." 
+ " Error: %r", + instance_model.name, + exc, + exc_info=not isinstance(exc, BackendError), + ) + + if job_provisioning_data is not None: + instance_connection_pool.drop_by_jpd(job_provisioning_data) + + result.instance_update_map["deleted"] = True + result.instance_update_map["deleted_at"] = NOW_PLACEHOLDER + result.instance_update_map["finished_at"] = NOW_PLACEHOLDER + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + ) + return result diff --git a/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py b/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py new file mode 100644 index 0000000000..61599172b5 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/jobs_running.py @@ -0,0 +1,1889 @@ +import asyncio +import enum +import uuid +from collections.abc import Mapping +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Dict, Iterable, Literal, Optional, Sequence, Union + +import httpx +from sqlalchemy import and_, exists, false, func, or_, select, true, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only + +from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import GatewayError, SSHError +from dstack._internal.core.models.common import NetworkMode, RegistryAuth +from dstack._internal.core.models.configurations import ( + DevEnvironmentConfiguration, + ServiceConfiguration, +) +from dstack._internal.core.models.files import FileArchiveMapping +from dstack._internal.core.models.instances import InstanceStatus, SSHConnectionParams +from dstack._internal.core.models.metrics import Metric +from dstack._internal.core.models.profiles import StartupOrder +from dstack._internal.core.models.repos import RemoteRepoCreds +from 
dstack._internal.core.models.routers import RouterType +from dstack._internal.core.models.runs import ( + ClusterInfo, + ImagePullProgress, + Job, + JobProvisioningData, + JobRuntimeData, + JobSpec, + JobStatus, + JobSubmission, + JobTerminationReason, + Run, + RunSpec, + RunStatus, +) +from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.background.pipeline_tasks.common import get_provisioning_timeout +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceModel, + JobModel, + ProbeModel, + ProjectModel, + RepoModel, + RunModel, + UserModel, +) +from dstack._internal.server.schemas.runner import TaskStatus +from dstack._internal.server.services import events +from dstack._internal.server.services import files as files_services +from dstack._internal.server.services import logs as logs_services +from dstack._internal.server.services.backends.provisioning import ( + get_instance_specific_gpu_devices, + get_instance_specific_mounts, + resolve_provisioning_image, +) +from dstack._internal.server.services.gateways import get_or_add_gateway_connections +from dstack._internal.server.services.instances import ( + get_instance_remote_connection_info, + get_instance_ssh_private_keys, +) +from dstack._internal.server.services.jobs import ( + emit_job_status_change_event, + find_job, + get_job_attached_volumes, + get_job_runtime_data, + get_job_spec, + interpolate_job_spec_secrets, + is_master_job, + 
job_model_to_job_submission, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.metrics import get_job_metrics +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.repos import ( + get_code_model, + get_repo_creds, + repo_model_to_repo_head_with_creds, +) +from dstack._internal.server.services.runner import client +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from dstack._internal.server.services.runs import is_job_ready, run_model_to_run +from dstack._internal.server.services.runs.replicas import ( + RouterEnvStatus, + get_router_env_for_job, + get_router_replica_group, +) +from dstack._internal.server.services.secrets import get_project_secrets_mapping +from dstack._internal.server.services.storage import get_default_storage +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime, get_or_error, run_async +from dstack._internal.utils.interpolator import InterpolatorError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +JOB_STATUSES_WITH_MIN_PROCESSING_INTERVAL = [JobStatus.PROVISIONING, JobStatus.PULLING] + +ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS = 30 * 60 + +JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2) +"""The minimum time before terminating an active job in case of connectivity issues.""" + + +@dataclass +class JobRunningPipelineItem(PipelineItem): + status: JobStatus + replica_num: int + + +class JobRunningPipeline(Pipeline[JobRunningPipelineItem]): + def __init__( + self, + workers_num: int = 20, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=5), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = 
timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[JobRunningPipelineItem]( + model_type=JobModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = JobRunningFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + JobRunningWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return JobModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[JobRunningPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[JobRunningPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["JobRunningWorker"]: + return self.__workers + + +class JobRunningFetcher(Fetcher[JobRunningPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobRunningPipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[JobRunningPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("JobRunningFetcher.fetch") + async def fetch(self, limit: int) -> 
list[JobRunningPipelineItem]: + job_lock, _ = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) + async with job_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(JobModel) + .join(JobModel.run) + .where( + JobModel.status.in_( + [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING] + ), + or_( + # Process provisioning and pulling jobs quicker for low-latency provisioning. + # Active jobs processing can be less frequent to minimize contention with `RunPipeline`. + and_( + JobModel.status.in_(JOB_STATUSES_WITH_MIN_PROCESSING_INTERVAL), + JobModel.last_processed_at <= now - self._min_processing_interval, + ), + and_( + JobModel.status.not_in(JOB_STATUSES_WITH_MIN_PROCESSING_INTERVAL), + JobModel.last_processed_at + <= now - self._min_processing_interval * 2, + ), + JobModel.skip_min_processing_interval == True, + ), + or_( + and_( + # Do not try to lock jobs if the run is waiting for the lock or terminating, + # but allow retrying jobs whose own lock is stale because + # the run pipeline cannot reclaim stale job locks, and allow jobs with + # skip_min_processing_interval set to speed up provisioning. 
+ or_( + RunModel.lock_owner.is_(None), + JobModel.skip_min_processing_interval == True, + ), + RunModel.status.not_in([RunStatus.TERMINATING]), + JobModel.lock_expires_at.is_(None), + ), + JobModel.lock_expires_at < now, + ), + or_( + JobModel.lock_owner.is_(None), + JobModel.lock_owner == JobRunningPipeline.__name__, + ), + ) + .order_by(JobModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=JobModel) + .options( + load_only( + JobModel.id, + JobModel.lock_token, + JobModel.lock_expires_at, + JobModel.status, + JobModel.replica_num, + JobModel.skip_min_processing_interval, + ) + ) + ) + job_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for job_model in job_models: + prev_lock_expired = job_model.lock_expires_at is not None + job_model.lock_expires_at = lock_expires_at + job_model.lock_token = lock_token + job_model.lock_owner = JobRunningPipeline.__name__ + job_model.skip_min_processing_interval = False + items.append( + JobRunningPipelineItem( + __tablename__=JobModel.__tablename__, + id=job_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=job_model.status, + replica_num=job_model.replica_num, + ) + ) + await session.commit() + return items + + +class JobRunningWorker(Worker[JobRunningPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobRunningPipelineItem], + heartbeater: Heartbeater[JobRunningPipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("JobRunningWorker.process") + async def process(self, item: JobRunningPipelineItem): + context = await _load_process_context(item=item) + if context is None: + log_lock_token_mismatch(logger, item) + return + + result = await 
_process_running_job(context=context) + await _apply_process_result( + item=item, + job_model=context.job_model, + result=result, + ) + new_status = result.job_update_map.get("status") + if new_status == JobStatus.PULLING: + self._pipeline_hinter.hint_fetch(JobModel.__name__) + # Hint run pipeline for fast run transition to RUNNING status. + if new_status == JobStatus.RUNNING and context.job_model.run.status != RunStatus.RUNNING: + self._pipeline_hinter.hint_fetch(RunModel.__name__) + + +@dataclass +class _ProcessContext: + job_model: JobModel + run_model: RunModel + run: Run + job: Job + job_submission: JobSubmission + job_provisioning_data: Optional[JobProvisioningData] + instance_access_revoked: bool + server_ssh_private_keys: Optional[tuple[str, Optional[str]]] = None + + @property + def repo_model(self) -> RepoModel: + return self.run_model.repo + + @property + def project(self) -> ProjectModel: + return self.run_model.project + + +class _JobUpdateMap(ItemUpdateMap, total=False): + status: JobStatus + termination_reason: Optional[JobTerminationReason] + termination_reason_message: Optional[str] + job_provisioning_data: Optional[str] + job_runtime_data: Optional[str] + runner_timestamp: Optional[int] + disconnected_at: Optional[datetime] + inactivity_secs: Optional[int] + exit_status: Optional[int] + registered: bool + image_pull_progress: Optional[str] + skip_min_processing_interval: bool + + +@dataclass +class _RegisterReplicaResult: + gateway_target: Optional[events.Target] # None = no gateway + + +@dataclass +class _ProcessResult: + job_update_map: _JobUpdateMap = field(default_factory=_JobUpdateMap) + new_probe_models: list[ProbeModel] = field(default_factory=list) + replica_registration: Optional[_RegisterReplicaResult] = None # None = not registered yet + + +@dataclass +class _StartupContext: + cluster_info: ClusterInfo + volumes: list[Volume] + secrets: dict[str, str] + repo_creds: Optional[RemoteRepoCreds] + router_env: Optional[Dict[str, str]] = None 
+ """Dynamo-specific env (e.g. DSTACK_ROUTER_INTERNAL_IP) computed from the + router replica's state. Passed through to RunnerClient.submit_job, which + merges it into a deep-copied job_spec.env so the shared job_spec is not + mutated. None for SGLang services, non-router runs, and the router + replica itself.""" + + +async def _load_process_context(item: JobRunningPipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + job_model = await _refetch_locked_job_model(session=session, item=item) + if job_model is None: + return None + if item.status == JobStatus.RUNNING: + # RUNNING jobs don't access run.jobs — skip loading sibling jobs entirely. + run_model = await _fetch_run_model(session=session, run_id=job_model.run_id) + run = run_model_to_run(run_model, include_sensitive=True, include_jobs=False) + job = Job( + job_spec=get_job_spec(job_model), + job_submissions=[job_model_to_job_submission(job_model)], + ) + else: + # PROVISIONING/PULLING jobs need same-replica siblings for cluster + # coordination, plus — when the run has a router replica group — + # the router replica's job (cross-replica) so the env-injection + # gate in _prepare_startup_context can read its status / IP. + # _fetch_run_model handles both: same-replica jobs always, plus + # all non-terminated jobs when one exists. 
+ run_spec = RunSpec.__response__.parse_raw(job_model.run.run_spec) + run_model = await _fetch_run_model( + session=session, + run_id=job_model.run_id, + replica_num=item.replica_num, + run_spec=run_spec, + ) + run = run_model_to_run(run_model, include_sensitive=True) + job = find_job(run.jobs, job_model.replica_num, job_model.job_num) + instance_access_revoked = await _is_instance_access_revoked(session, job_model) + job_submission = job_model_to_job_submission(job_model) + server_ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance)) + return _ProcessContext( + job_model=job_model, + run_model=run_model, + run=run, + job=job, + job_submission=job_submission, + job_provisioning_data=job_submission.job_provisioning_data, + instance_access_revoked=instance_access_revoked, + server_ssh_private_keys=server_ssh_private_keys, + ) + + +async def _process_running_job(context: _ProcessContext) -> _ProcessResult: + result = _ProcessResult() + if context.instance_access_revoked: + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.INSTANCE_ACCESS_REVOKED, + termination_reason_message=( + "The instance is no longer imported into the job's project" + ), + ) + return result + + if context.job_provisioning_data is None: + logger.error("%s: job_provisioning_data of an active job is None", fmt(context.job_model)) + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=( + "Unexpected server error: job_provisioning_data of an active job is None" + ), + ) + return result + + if context.job_model.status == JobStatus.PROVISIONING: + startup_context = await _prepare_startup_context(context=context, result=result) + if startup_context is None: + return result + await _process_provisioning_status( + context=context, startup_context=startup_context, 
result=result + ) + elif context.job_model.status == JobStatus.PULLING: + startup_context = await _prepare_startup_context(context=context, result=result) + if startup_context is None: + return result + await _process_pulling_status( + context=context, startup_context=startup_context, result=result + ) + elif context.job_model.status == JobStatus.RUNNING: + await _process_running_status(context=context, result=result) + + if _get_result_status(context.job_model, result) == JobStatus.RUNNING: + if context.job_model.status != JobStatus.RUNNING: + _initialize_running_job_probes( + job_model=context.job_model, + job=context.job, + result=result, + ) + await _maybe_register_replica(context=context, result=result) + await _check_gpu_utilization(context=context, result=result) + return result + + +async def _prepare_startup_context( + context: _ProcessContext, + result: _ProcessResult, +) -> Optional[_StartupContext]: + job_provisioning_data = get_or_error(context.job_provisioning_data) + + for other_job in context.run.jobs: + if ( + other_job.job_spec.replica_num == context.job.job_spec.replica_num + and other_job.job_submissions[-1].status == JobStatus.SUBMITTED + ): + logger.debug( + "%s: waiting for all jobs in the replica to be provisioned", + fmt(context.job_model), + ) + return None + + # If this run has a router replica group and this job is a worker, gate + # startup on the router replica's state. The helper returns None for the + # router itself and for runs without a router group, so this whole block + # is a no-op in those cases. + router_env_outcome = get_router_env_for_job( + run_model=context.run_model, + run_spec=context.run.run_spec, + job_model=context.job_model, + ) + if router_env_outcome is RouterEnvStatus.FAILED: + # Router has reached a terminal state — the worker cannot recover by + # waiting. Terminate it now with a clear reason instead of letting it + # idle until the run-level reconciler tears the whole run down. 
+ _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=( + "Router replica is in a terminal state; cannot provision worker " + "without a running router." + ), + ) + return None + if router_env_outcome is RouterEnvStatus.NOT_PROVISIONED: + # Router is alive but its internal_ip is not yet known. Defer this + # worker — the next pipeline tick will re-check. Bound the wait so a + # router that is genuinely stuck can't burn worker instance-hours + # forever; see ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS. + waited_seconds = (get_current_datetime() - context.job_model.submitted_at).total_seconds() + if waited_seconds > ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS: + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=( + f"Router replica did not acquire an internal IP within " + f"{ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS}s; terminating worker." + ), + ) + return None + logger.debug( + "%s: waiting for router replica to be provisioned", + fmt(context.job_model), + ) + return None + # Past the enum branches, router_env_outcome is either None or a Dict. + # We don't mutate job_spec.env here — RunnerClient.submit_job merges it + # into a deep-copied spec, mirroring how instance_env is handled. 
+ router_env: Optional[Dict[str, str]] = ( + router_env_outcome if isinstance(router_env_outcome, dict) else None + ) + + cluster_info = _get_cluster_info( + jobs=context.run.jobs, + replica_num=context.job.job_spec.replica_num, + job_provisioning_data=job_provisioning_data, + job_runtime_data=context.job_submission.job_runtime_data, + ) + + async with get_session_ctx() as session: + volumes = await get_job_attached_volumes( + session=session, + project=context.project, + run_spec=context.run.run_spec, + job_num=context.job.job_spec.job_num, + job_provisioning_data=job_provisioning_data, + ) + repo_creds_model = await get_repo_creds( + session=session, + repo=context.repo_model, + user=context.run_model.user, + ) + secrets = await get_project_secrets_mapping(session=session, project=context.project) + + repo_creds = repo_model_to_repo_head_with_creds( + context.repo_model, + repo_creds_model, + ).repo_creds + + try: + interpolate_job_spec_secrets(context.job.job_spec, secrets) + except InterpolatorError as e: + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + termination_reason_message=f"Secrets interpolation error: {e.args[0]}", + ) + return None + + return _StartupContext( + cluster_info=cluster_info, + volumes=volumes, + secrets=secrets, + repo_creds=repo_creds, + router_env=router_env, + ) + + +async def _refetch_locked_job_model( + session: AsyncSession, item: JobRunningPipelineItem +) -> Optional[JobModel]: + res = await session.execute( + select(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + .options(joinedload(JobModel.instance).joinedload(InstanceModel.project)) + .options(joinedload(JobModel.probes).load_only(ProbeModel.success_streak)) + .options( + joinedload(JobModel.run).load_only(RunModel.id, RunModel.run_spec, RunModel.status) + ) + .execution_options(populate_existing=True) + ) + return 
res.unique().scalar_one_or_none() + + +async def _fetch_run_model( + session: AsyncSession, + run_id: uuid.UUID, + replica_num: Optional[int] = None, + run_spec: Optional[RunSpec] = None, +) -> RunModel: + """Fetch run model with related project, user, repo, and fleet. + + Args: + replica_num: If None, skip loading jobs (for RUNNING jobs that don't need siblings). + If set, load only latest-submission jobs for that replica (for PROVISIONING/PULLING + jobs that need same-replica siblings for cluster coordination). When the run has + a Dynamo router replica group, all non-terminated latest-submission jobs for the + run are loaded so find_router_job can identify the router by replica-group + membership. + run_spec: Required whenever `replica_num` is set. Used only to detect + whether the run has a Dynamo router replica group. The caller is + expected to parse it once from the eager-loaded JobModel.run + (see _refetch_locked_job_model) so we don't issue a separate + query for it here. + """ + query = ( + select(RunModel) + .where(RunModel.id == run_id) + .options(joinedload(RunModel.project)) + .options(joinedload(RunModel.user)) + .options(joinedload(RunModel.repo)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + ) + if replica_num is not None: + assert run_spec is not None, "run_spec must be provided when replica_num is set" + router_group = get_router_replica_group(run_spec) + is_dynamo = ( + router_group is not None + and router_group.router is not None + and router_group.router.type == RouterType.DYNAMO + ) + + latest_submissions_sq = ( + select( + JobModel.run_id.label("run_id"), + JobModel.replica_num.label("replica_num"), + JobModel.job_num.label("job_num"), + func.max(JobModel.submission_num).label("max_submission_num"), + ) + .where( + JobModel.run_id == run_id, + # For Service with Dynamo router: load all replicas. For Non-Dynamo: only the worker's + # own replica. 
+ true() if is_dynamo else JobModel.replica_num == replica_num, + ) + .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) + .subquery() + ) + job_alias = aliased(JobModel) + query = ( + query.join(job_alias, job_alias.run_id == RunModel.id) + .join( + latest_submissions_sq, + onclause=and_( + job_alias.run_id == latest_submissions_sq.c.run_id, + job_alias.replica_num == latest_submissions_sq.c.replica_num, + job_alias.job_num == latest_submissions_sq.c.job_num, + job_alias.submission_num == latest_submissions_sq.c.max_submission_num, + # For Dynamo runs, drop terminated rows so accumulated + # scale-down history doesn't bloat the load. Non-Dynamo + # runs are already restricted to the worker's own + # replica above, so this filter is a no-op for them. + or_( + false() if is_dynamo else true(), + ~job_alias.status.in_(JobStatus.finished_statuses()) + & (job_alias.status != JobStatus.TERMINATING), + ), + ), + ) + .options(contains_eager(RunModel.jobs, alias=job_alias)) + ) + res = await session.execute(query) + return res.unique().scalar_one() + + +async def _is_instance_access_revoked(session: AsyncSession, job_model: JobModel) -> bool: + if job_model.instance is None or job_model.instance.project_id == job_model.project_id: + return False + return not ( + await session.execute( + select( + exists().where( + ImportModel.project_id == job_model.project_id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == job_model.instance.fleet_id, + ) + ) + ) + ).scalar() + + +async def _process_provisioning_status( + context: _ProcessContext, + startup_context: _StartupContext, + result: _ProcessResult, +) -> None: + job_provisioning_data = get_or_error(context.job_provisioning_data) + server_ssh_private_keys = get_or_error(context.server_ssh_private_keys) + + if job_provisioning_data.hostname is None: + _wait_for_instance_provisioning_data(context.job_model, result) + return + if _should_wait_for_other_nodes(context.run, 
context.job, context.job_model): + return + + if job_provisioning_data.dockerized: + logger.debug( + "%s: process provisioning job with shim, age=%s", + fmt(context.job_model), + context.job_submission.age, + ) + public_keys = [context.project.ssh_public_key.strip()] + ssh_user: Optional[str] = None + user_ssh_key: Optional[str] = None + if not server_settings.SSHPROXY_ENFORCED: + ssh_user = job_provisioning_data.username + assert context.run.run_spec.ssh_key_pub is not None + user_ssh_key = context.run.run_spec.ssh_key_pub.strip() + public_keys.append(user_ssh_key) + success = await run_async( + _process_provisioning_with_shim, + server_ssh_private_keys, + job_provisioning_data, + None, + run=context.run, + job_model=context.job_model, + jrd=get_job_runtime_data(context.job_model), + jpd=job_provisioning_data, + volumes=startup_context.volumes, + registry_auth=context.job.job_spec.registry_auth, + public_keys=public_keys, + ssh_user=ssh_user, + ssh_key=user_ssh_key, + ) + if success: + _set_job_status(context.job_model, result, JobStatus.PULLING) + result.job_update_map["skip_min_processing_interval"] = True + return + else: + logger.debug( + "%s: process provisioning job without shim, age=%s", + fmt(context.job_model), + context.job_submission.age, + ) + runner_availability = await run_async( + _get_runner_availability, + server_ssh_private_keys, + job_provisioning_data, + None, + ) + if runner_availability == _RunnerAvailability.AVAILABLE: + file_archives = await _get_job_file_archives( + archive_mappings=context.job.job_spec.file_archives, + user=context.run_model.user, + ) + code = await _get_job_code( + project=context.project, + repo=context.repo_model, + code_hash=_get_repo_code_hash(context.run, context.job), + ) + submit_result = await run_async( + _submit_job_to_runner, + server_ssh_private_keys, + job_provisioning_data, + None, + run=context.run, + job_model=context.job_model, + job=context.job, + jrd=get_job_runtime_data(context.job_model), + 
cluster_info=startup_context.cluster_info, + code=code, + file_archives=file_archives, + secrets=startup_context.secrets, + repo_credentials=startup_context.repo_creds, + router_env=startup_context.router_env, + success_if_not_available=False, + ) + if submit_result is not False: + _apply_submit_job_to_runner_result( + job_model=context.job_model, + result=result, + submit_result=submit_result, + ) + if submit_result is not False and submit_result.success: + return + + provisioning_timeout = get_provisioning_timeout( + backend_type=job_provisioning_data.get_base_backend(), + instance_type_name=job_provisioning_data.instance_type.name, + ) + if context.job_submission.age > provisioning_timeout: + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED, + termination_reason_message=( + f"Runner did not become available within {provisioning_timeout.total_seconds()}s." + f" Job submission age: {context.job_submission.age.total_seconds()}s)" + ), + ) + + +async def _process_pulling_status( + context: _ProcessContext, + startup_context: _StartupContext, + result: _ProcessResult, +) -> None: + job_provisioning_data = get_or_error(context.job_provisioning_data) + server_ssh_private_keys = get_or_error(context.server_ssh_private_keys) + + logger.debug( + "%s: process pulling job with shim, age=%s", + fmt(context.job_model), + context.job_submission.age, + ) + shim_state = await run_async( + _sync_shim_pulling_state, + server_ssh_private_keys, + job_provisioning_data, + None, + job_model=context.job_model, + jrd=_get_result_job_runtime_data(context.job_model, result), + ) + if shim_state is not False: + if shim_state.job_runtime_data is not None: + _set_job_runtime_data(result, shim_state.job_runtime_data) + + if shim_state.image_pull_progress is not None: + result.job_update_map["image_pull_progress"] = shim_state.image_pull_progress.json() + + if shim_state.state == 
_ShimPullingState.WAITING: + _reset_disconnected_at(context.job_model, result) + return + + if shim_state.state == _ShimPullingState.FAILED: + logger.warning( + "%s: failed due to %s, age=%s", + fmt(context.job_model), + get_or_error(shim_state.termination_reason).value, + context.job_submission.age, + ) + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=get_or_error(shim_state.termination_reason), + termination_reason_message=get_or_error(shim_state.termination_reason_message), + ) + return + + # _ShimPullingState.READY + job_runtime_data = _get_result_job_runtime_data(context.job_model, result) + runner_availability = await run_async( + _get_runner_availability, + server_ssh_private_keys, + job_provisioning_data, + job_runtime_data, + ) + if runner_availability == _RunnerAvailability.UNAVAILABLE: + _reset_disconnected_at(context.job_model, result) + return + + if runner_availability == _RunnerAvailability.AVAILABLE: + file_archives = await _get_job_file_archives( + archive_mappings=context.job.job_spec.file_archives, + user=context.run_model.user, + ) + code = await _get_job_code( + project=context.project, + repo=context.repo_model, + code_hash=_get_repo_code_hash(context.run, context.job), + ) + submit_result = await run_async( + _submit_job_to_runner, + server_ssh_private_keys, + job_provisioning_data, + job_runtime_data, + run=context.run, + job_model=context.job_model, + job=context.job, + jrd=job_runtime_data, + cluster_info=startup_context.cluster_info, + code=code, + file_archives=file_archives, + secrets=startup_context.secrets, + repo_credentials=startup_context.repo_creds, + router_env=startup_context.router_env, + success_if_not_available=True, + ) + if submit_result is not False: + _apply_submit_job_to_runner_result( + job_model=context.job_model, + result=result, + submit_result=submit_result, + ) + if submit_result is not False and submit_result.success: + 
_reset_disconnected_at(context.job_model, result) + return + + # SSH tunnel failed or READY but runner submit failed — treat as disconnect + _handle_instance_unreachable(context, result, job_provisioning_data) + + +async def _process_running_status( + context: _ProcessContext, + result: _ProcessResult, +) -> None: + job_provisioning_data = get_or_error(context.job_provisioning_data) + server_ssh_private_keys = get_or_error(context.server_ssh_private_keys) + + logger.debug( + "%s: process running job, age=%s", + fmt(context.job_model), + context.job_submission.age, + ) + process_running_result = await run_async( + _process_running, + server_ssh_private_keys, + job_provisioning_data, + context.job_submission.job_runtime_data, + run_model=context.run_model, + job_model=context.job_model, + ) + if process_running_result is not False: + result.job_update_map.update(process_running_result.job_update_map) + _reset_disconnected_at(context.job_model, result) + return + + _handle_instance_unreachable(context, result, job_provisioning_data) + + +async def _apply_process_result( + item: JobRunningPipelineItem, + job_model: JobModel, + result: _ProcessResult, +) -> None: + set_processed_update_map_fields(result.job_update_map) + set_unlock_update_map_fields(result.job_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(result.job_update_map, now=now) + res = await session.execute( + update(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + .values(**result.job_update_map) + .returning(JobModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + return + + if result.new_probe_models: + session.add_all(result.new_probe_models) + + # Set RunModel.skip_min_processing_interval for fast run transition to RUNNING status. 
+ # Cross-pipeline write is ok: worst case skip_min_processing_interval is overridden. + if ( + result.job_update_map.get("status") == JobStatus.RUNNING + and job_model.run.status != RunStatus.RUNNING + ): + await session.execute( + update(RunModel) + .where(RunModel.id == job_model.run_id) + .values(skip_min_processing_interval=True) + ) + + _emit_result_events(session=session, job_model=job_model, result=result) + + +def _emit_result_events( + session: AsyncSession, + job_model: JobModel, + result: _ProcessResult, +) -> None: + """Emit audit events for changes recorded in result..""" + emit_job_status_change_event( + session=session, + job_model=job_model, + old_status=job_model.status, + new_status=result.job_update_map.get("status", job_model.status), + termination_reason=result.job_update_map.get( + "termination_reason", job_model.termination_reason + ), + termination_reason_message=result.job_update_map.get( + "termination_reason_message", + job_model.termination_reason_message, + ), + ) + _emit_reachability_change_event( + session=session, + job_model=job_model, + old_disconnected_at=job_model.disconnected_at, + new_disconnected_at=result.job_update_map.get( + "disconnected_at", + job_model.disconnected_at, + ), + ) + if result.replica_registration is not None: + targets = [events.Target.from_model(job_model)] + if result.replica_registration.gateway_target is not None: + targets.append(result.replica_registration.gateway_target) + events.emit( + session, + "Service replica registered to receive requests", + actor=events.SystemActor(), + targets=targets, + ) + + +def _wait_for_instance_provisioning_data( + job_model: JobModel, + result: _ProcessResult, +) -> None: + if job_model.instance is None: + logger.error( + "%s: cannot update job_provisioning_data. job_model.instance is None.", + fmt(job_model), + ) + return + if job_model.instance.job_provisioning_data is None: + logger.error( + "%s: cannot update job_provisioning_data. 
job_model.job_provisioning_data is None.", + fmt(job_model), + ) + return + + if job_model.instance.status == InstanceStatus.TERMINATED: + _terminate_job( + job_model=job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED, + termination_reason_message="Instance is terminated", + ) + return + + result.job_update_map["job_provisioning_data"] = job_model.instance.job_provisioning_data + + +def _handle_instance_unreachable( + context: _ProcessContext, + result: _ProcessResult, + job_provisioning_data: JobProvisioningData, +) -> None: + _set_disconnected_at_now(context.job_model, result) + if not _should_terminate_job_due_to_disconnect( + _get_result_disconnected_at(context.job_model, result) + ): + logger.warning( + "%s: is unreachable, waiting for the instance to become reachable again, age=%s", + fmt(context.job_model), + context.job_submission.age, + ) + return + if job_provisioning_data.instance_type.resources.spot: + termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY + else: + termination_reason = JobTerminationReason.INSTANCE_UNREACHABLE + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=termination_reason, + termination_reason_message="Instance is unreachable", + ) + + +def _initialize_running_job_probes( + job_model: JobModel, + job: Job, + result: _ProcessResult, +) -> None: + for probe_num in range(len(job.job_spec.probes)): + result.new_probe_models.append( + ProbeModel( + name=f"{job_model.job_name}-{probe_num}", + job_id=job_model.id, + probe_num=probe_num, + due=get_current_datetime(), + success_streak=0, + active=True, + ) + ) + + +async def _maybe_register_replica( + context: _ProcessContext, + result: _ProcessResult, +) -> None: + if ( + context.run.run_spec.configuration.type != "service" + or _get_result_registered(context.job_model, result) + or context.job_model.job_num != 0 + or result.new_probe_models 
+ or not is_job_ready(context.job_model.probes, context.job.job_spec.probes) + ): + return + + ssh_head_proxy: Optional[SSHConnectionParams] = None + ssh_head_proxy_private_key: Optional[str] = None + instance = get_or_error(context.job_model.instance) + rci = get_instance_remote_connection_info(instance) + if rci is not None and rci.ssh_proxy is not None: + ssh_head_proxy = rci.ssh_proxy + ssh_head_proxy_keys = get_or_error(rci.ssh_proxy_keys) + ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private + + try: + gateway_target = await _register_service_replica( + context=context, + result=result, + ssh_head_proxy=ssh_head_proxy, + ssh_head_proxy_private_key=ssh_head_proxy_private_key, + ) + except GatewayError as e: + logger.warning("%s: failed to register service replica: %s", fmt(context.job_model), e) + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.GATEWAY_ERROR, + termination_reason_message="Failed to register service replica", + ) + return + + result.job_update_map["registered"] = True + result.replica_registration = _RegisterReplicaResult(gateway_target=gateway_target) + + +async def _register_service_replica( + context: _ProcessContext, + result: _ProcessResult, + ssh_head_proxy: Optional[SSHConnectionParams], + ssh_head_proxy_private_key: Optional[str], +) -> Optional[events.Target]: + if context.run_model.gateway_id is None: + return None + + job_spec = JobSpec.__response__.parse_raw(context.job_model.job_spec_data) + + # For router-based services (e.g. PD disaggregation), only router replicas should be + # registered with the gateway. Worker replicas are discovered by the router-worker + # sync pipeline and should not be routed to directly by the gateway. 
+ config = context.run.run_spec.configuration + assert isinstance(config, ServiceConfiguration) + router_group = next((g for g in config.replica_groups if g.router is not None), None) + is_router_replica = router_group is not None and job_spec.replica_group == router_group.name + if router_group is not None and not is_router_replica: + logger.debug( + "%s: skipping gateway replica registration (non-router replica)", + fmt(context.job_model), + ) + return None + + async with get_session_ctx() as session: + gateway_model, connections = await get_or_add_gateway_connections( + session, context.run_model.gateway_id + ) + gateway_target = events.Target.from_model(gateway_model) + assert context.job_model.instance is not None + instance_project_ssh_private_key = None + if context.job_model.project_id != context.job_model.instance.project_id: + instance_project_ssh_private_key = context.job_model.instance.project.ssh_private_key + # JobRuntimeData might change on PULLING -> RUNNING path + # so we must update job_submission with the result value. + job_submission = context.job_submission.copy(deep=True) + job_submission.job_runtime_data = _get_result_job_runtime_data(context.job_model, result) + for conn in connections: + try: + logger.debug( + "%s: registering replica for service %s on gateway replica %s", + fmt(context.job_model), + context.run.id.hex, + conn.ip_address, + ) + async with conn.client() as gateway_client: + await gateway_client.register_replica( + run=context.run, + job_spec=job_spec, + job_submission=job_submission, + instance_project_ssh_private_key=instance_project_ssh_private_key, + ssh_head_proxy=ssh_head_proxy, + ssh_head_proxy_private_key=ssh_head_proxy_private_key, + ) + except (httpx.RequestError, SSHError) as e: + logger.debug("Gateway request failed", exc_info=True) + raise GatewayError(repr(e)) + except GatewayError as e: + if "already exists in service" in e.msg: + logger.warning( + ( + "%s: could not register replica in gateway %s: %s." 
+ " NOTE: if you just updated dstack from pre-0.19.25 to 0.19.25+," + " expect to see this warning once for every running service replica" + ), + fmt(context.job_model), + conn.ip_address, + e.msg, + ) + else: + raise + return gateway_target + + +async def _check_gpu_utilization( + context: _ProcessContext, + result: _ProcessResult, +) -> None: + policy = context.job.job_spec.utilization_policy + if policy is None: + return + + after = get_current_datetime() - timedelta(seconds=policy.time_window) + async with get_session_ctx() as session: + job_metrics = await get_job_metrics(session, context.job_model, after=after) + gpus_util_metrics: list[Metric] = [] + for metric in job_metrics.metrics: + if metric.name.startswith("gpu_util_percent_gpu"): + gpus_util_metrics.append(metric) + if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1): + logger.debug("%s: GPU utilization check: not enough samples", fmt(context.job_model)) + return + if _should_terminate_due_to_low_gpu_util( + policy.min_gpu_utilization, [metric.values for metric in gpus_util_metrics] + ): + logger.debug("%s: GPU utilization check: terminating", fmt(context.job_model)) + _terminate_job( + job_model=context.job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY, + termination_reason_message=( + f"The job GPU utilization below {policy.min_gpu_utilization}%" + f" for {policy.time_window} seconds" + ), + ) + else: + logger.debug("%s: GPU utilization check: OK", fmt(context.job_model)) + + +def _should_terminate_due_to_low_gpu_util( + min_util: int, gpus_util: Iterable[Iterable[int]] +) -> bool: + for gpu_util in gpus_util: + if all(util < min_util for util in gpu_util): + return True + return False + + +def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> bool: + for other_job in run.jobs: + if ( + other_job.job_spec.replica_num == job.job_spec.replica_num + and 
other_job.job_submissions[-1].status == JobStatus.PROVISIONING + and other_job.job_submissions[-1].job_provisioning_data is not None + and other_job.job_submissions[-1].job_provisioning_data.hostname is None + ): + logger.debug("%s: waiting for other job to have IP assigned", fmt(job_model)) + return True + master_job = find_job(run.jobs, job.job_spec.replica_num, 0) + if ( + job.job_spec.job_num != 0 + and run.run_spec.merged_profile.startup_order == StartupOrder.MASTER_FIRST + and master_job.job_submissions[-1].status != JobStatus.RUNNING + ): + logger.debug("%s: waiting for master job to become running", fmt(job_model)) + return True + if ( + is_master_job(job) + and run.run_spec.merged_profile.startup_order == StartupOrder.WORKERS_FIRST + ): + for other_job in run.jobs: + if ( + other_job.job_spec.replica_num == job.job_spec.replica_num + and other_job.job_spec.job_num != job.job_spec.job_num + and other_job.job_submissions[-1].status != JobStatus.RUNNING + ): + logger.debug("%s: waiting for worker job to become running", fmt(job_model)) + return True + return False + + +@runner_ssh_tunnel +def _process_provisioning_with_shim( + addresses: Mapping[int, client.LocalAddress], + run: Run, + job_model: JobModel, + jrd: Optional[JobRuntimeData], + jpd: JobProvisioningData, + volumes: list[Volume], + registry_auth: Optional[RegistryAuth], + public_keys: list[str], + ssh_user: Optional[str], + ssh_key: Optional[str], +) -> bool: + job_spec = get_job_spec(job_model) + shim_client = client.ShimClient.from_address(addresses[DSTACK_SHIM_HTTP_PORT]) + + resp = shim_client.healthcheck() + if resp is None: + logger.debug("%s: shim is not available yet", fmt(job_model)) + return False + + image_name, registry_auth = resolve_provisioning_image(job_spec.image_name, registry_auth, jpd) + + registry_username = "" + registry_password = "" + if registry_auth is not None: + registry_username = registry_auth.username + registry_password = registry_auth.password + + volume_mounts: 
list[VolumeMountPoint] = [] + instance_mounts: list[InstanceMountPoint] = [] + for mount in run.run_spec.configuration.volumes: + if isinstance(mount, VolumeMountPoint): + volume_mounts.append(mount.copy()) + elif isinstance(mount, InstanceMountPoint): + instance_mounts.append(mount) + else: + assert False, f"unexpected mount point: {mount!r}" + + for volume, volume_mount in zip(volumes, volume_mounts): + volume_mount.name = volume.name + + instance_mounts += get_instance_specific_mounts(jpd.backend, jpd.instance_type.name) + gpu_devices = get_instance_specific_gpu_devices(jpd.backend, jpd.instance_type.name) + + container_user = "root" + if jrd is not None: + gpu = jrd.gpu + cpu = jrd.cpu + memory = jrd.memory + network_mode = jrd.network_mode + else: + gpu = None + cpu = None + memory = None + network_mode = NetworkMode.HOST + if shim_client.is_api_v2_supported(): + shim_client.submit_task( + task_id=job_model.id, + name=job_model.job_name, + registry_username=registry_username, + registry_password=registry_password, + image_name=image_name, + container_user=container_user, + privileged=job_spec.privileged, + gpu=gpu, + cpu=cpu, + memory=memory, + shm_size=job_spec.requirements.resources.shm_size, + network_mode=network_mode, + volumes=volumes, + volume_mounts=volume_mounts, + instance_mounts=instance_mounts, + gpu_devices=gpu_devices, + host_ssh_user=ssh_user or "", + host_ssh_keys=[ssh_key] if ssh_key else [], + container_ssh_keys=public_keys, + instance_id=jpd.instance_id, + ) + else: + submitted = shim_client.submit( + username=registry_username, + password=registry_password, + image_name=image_name, + privileged=job_spec.privileged, + container_name=job_model.job_name, + container_user=container_user, + shm_size=job_spec.requirements.resources.shm_size, + public_keys=public_keys, + ssh_user=ssh_user or "", + ssh_key=ssh_key or "", + mounts=volume_mounts, + volumes=volumes, + instance_mounts=instance_mounts, + instance_id=jpd.instance_id, + ) + if not 
submitted: + logger.warning( + "%s: failed to submit, shim is already running a job, stopping it now, retry later", + fmt(job_model), + ) + shim_client.stop(force=True) + return False + + return True + + +class _RunnerAvailability(enum.Enum): + AVAILABLE = "available" + UNAVAILABLE = "unavailable" + + +class _ShimPullingState(enum.Enum): + WAITING = "waiting" + READY = "ready" + FAILED = "failed" + + +@dataclass +class _SyncShimPullingStateResult: + state: _ShimPullingState + termination_reason: Optional[JobTerminationReason] = None + termination_reason_message: Optional[str] = None + job_runtime_data: Optional[JobRuntimeData] = None + image_pull_progress: Optional[ImagePullProgress] = None + + +@runner_ssh_tunnel +def _get_runner_availability(addresses: Mapping[int, client.LocalAddress]) -> _RunnerAvailability: + runner_client = client.RunnerClient.from_address(addresses[DSTACK_RUNNER_HTTP_PORT]) + if runner_client.healthcheck() is None: + return _RunnerAvailability.UNAVAILABLE + return _RunnerAvailability.AVAILABLE + + +@runner_ssh_tunnel +def _sync_shim_pulling_state( + addresses: Mapping[int, client.LocalAddress], + job_model: JobModel, + jrd: Optional[JobRuntimeData] = None, +) -> Union[_SyncShimPullingStateResult, Literal[False]]: + shim_client = client.ShimClient.from_address(addresses[DSTACK_SHIM_HTTP_PORT]) + image_pull_progress: Optional[ImagePullProgress] = None + if shim_client.is_api_v2_supported(): + task = shim_client.get_task(job_model.id) + if task.image_pull_progress is not None: + image_pull_progress = task.image_pull_progress + + if task.status == TaskStatus.TERMINATED: + logger.warning( + "shim failed to execute job %s: %s (%s)", + job_model.job_name, + task.termination_reason, + task.termination_message, + ) + logger.debug("task status: %s", task.dict()) + return _SyncShimPullingStateResult( + state=_ShimPullingState.FAILED, + termination_reason=JobTerminationReason(task.termination_reason.lower()), + 
termination_reason_message=task.termination_message, + image_pull_progress=image_pull_progress, + ) + + if task.status != TaskStatus.RUNNING: + return _SyncShimPullingStateResult( + state=_ShimPullingState.WAITING, + image_pull_progress=image_pull_progress, + ) + + if jrd is not None: + if task.ports is None: + return _SyncShimPullingStateResult( + state=_ShimPullingState.WAITING, + image_pull_progress=image_pull_progress, + ) + jrd = jrd.copy(update={"ports": {pm.container: pm.host for pm in task.ports}}) + else: + shim_status = shim_client.pull() + if ( + shim_status.state == "pending" + and shim_status.result is not None + and shim_status.result.reason != "" + ): + logger.warning( + "shim failed to execute job %s: %s (%s)", + job_model.job_name, + shim_status.result.reason, + shim_status.result.reason_message, + ) + logger.debug("shim status: %s", shim_status.dict()) + return _SyncShimPullingStateResult( + state=_ShimPullingState.FAILED, + termination_reason=JobTerminationReason(shim_status.result.reason.lower()), + termination_reason_message=shim_status.result.reason_message, + image_pull_progress=image_pull_progress, + ) + + if shim_status.state in ("pulling", "creating"): + return _SyncShimPullingStateResult( + state=_ShimPullingState.WAITING, + image_pull_progress=image_pull_progress, + ) + + return _SyncShimPullingStateResult( + state=_ShimPullingState.READY, + job_runtime_data=jrd, + image_pull_progress=image_pull_progress, + ) + + +@dataclass +class _SubmitJobToRunnerResult: + success: bool + set_running_status: bool = False + job_runtime_data: Optional[JobRuntimeData] = None + + +@runner_ssh_tunnel +def _submit_job_to_runner( + addresses: Mapping[int, client.LocalAddress], + run: Run, + job_model: JobModel, + job: Job, + jrd: Optional[JobRuntimeData], + cluster_info: ClusterInfo, + code: Optional[bytes], + file_archives: Iterable[tuple[uuid.UUID, bytes]], + secrets: Dict[str, str], + repo_credentials: Optional[RemoteRepoCreds], + router_env: 
Optional[Dict[str, str]], + success_if_not_available: bool, +) -> Union[_SubmitJobToRunnerResult, Literal[False]]: + logger.debug("%s: submitting job spec", fmt(job_model)) + logger.debug( + "%s: repo clone URL is %s", + fmt(job_model), + None if repo_credentials is None else repo_credentials.clone_url, + ) + instance = job_model.instance + if instance is not None and (rci := get_instance_remote_connection_info(instance)) is not None: + instance_env = rci.env + else: + instance_env = None + + runner_client = client.RunnerClient.from_address(addresses[DSTACK_RUNNER_HTTP_PORT]) + if runner_client.healthcheck() is None: + return _SubmitJobToRunnerResult(success=success_if_not_available) + + runner_client.submit_job( + run=run, + job=job, + cluster_info=cluster_info, + # Do not send all the secrets since interpolation is already done by the server. + # TODO: Passing secrets may be necessary for filtering out secret values from logs. + secrets={}, + repo_credentials=repo_credentials, + instance_env=instance_env, + router_env=router_env, + ) + for archive_id, archive in file_archives: + logger.debug("%s: uploading file archive: %s", fmt(job_model), archive_id) + runner_client.upload_archive(archive_id, archive) + if code is None and not runner_client.is_code_upload_optional(): + # Old runner, we must call `/api/upload_code` to proceed + code = b"" + if code is not None: + logger.debug("%s: uploading code", fmt(job_model)) + runner_client.upload_code(code) + logger.debug("%s: starting job", fmt(job_model)) + job_info = runner_client.run_job() + if job_info is not None: + if jrd is not None: + jrd = jrd.copy( + update={"working_dir": job_info.working_dir, "username": job_info.username} + ) + return _SubmitJobToRunnerResult( + success=True, + set_running_status=True, + job_runtime_data=jrd, + ) + + +@dataclass +class _ProcessRunningResult: + job_update_map: _JobUpdateMap = field(default_factory=_JobUpdateMap) + + +@runner_ssh_tunnel +def _process_running( + addresses: 
Mapping[int, client.LocalAddress], + run_model: RunModel, + job_model: JobModel, +) -> Union[_ProcessRunningResult, Literal[False]]: + runner_client = client.RunnerClient.from_address(addresses[DSTACK_RUNNER_HTTP_PORT]) + timestamp = job_model.runner_timestamp or 0 + resp = runner_client.pull(timestamp) + logs_services.write_logs( + project=run_model.project, + run_name=run_model.run_name, + job_submission_id=job_model.id, + runner_logs=resp.runner_logs, + job_logs=resp.job_logs, + ) + result = _ProcessRunningResult( + job_update_map=_JobUpdateMap(runner_timestamp=resp.last_updated) + ) + if len(resp.job_states) > 0: + latest_state_event = resp.job_states[-1] + latest_status = latest_state_event.state + if latest_status == JobStatus.DONE: + _terminate_job( + job_model=job_model, + job_update_map=result.job_update_map, + termination_reason=JobTerminationReason.DONE_BY_RUNNER, + termination_reason_message=None, + ) + elif latest_status in {JobStatus.FAILED, JobStatus.TERMINATED}: + termination_reason = JobTerminationReason.CONTAINER_EXITED_WITH_ERROR + if latest_state_event.termination_reason: + termination_reason = JobTerminationReason( + latest_state_event.termination_reason.lower() + ) + _terminate_job( + job_model=job_model, + job_update_map=result.job_update_map, + termination_reason=termination_reason, + termination_reason_message=latest_state_event.termination_message, + ) + if latest_state_event.exit_status is not None: + result.job_update_map["exit_status"] = latest_state_event.exit_status + if latest_state_event.exit_status != 0: + logger.info( + "%s: non-zero exit status %s", fmt(job_model), latest_state_event.exit_status + ) + else: + _terminate_if_inactivity_duration_exceeded( + run_model=run_model, + job_model=job_model, + job_update_map=result.job_update_map, + no_connections_secs=resp.no_connections_secs, + ) + return result + + +def _terminate_if_inactivity_duration_exceeded( + run_model: RunModel, + job_model: JobModel, + job_update_map: 
_JobUpdateMap, + no_connections_secs: Optional[int], +) -> None: + conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration + if not isinstance(conf, DevEnvironmentConfiguration) or not isinstance( + conf.inactivity_duration, int + ): + job_update_map["inactivity_secs"] = None + return + + logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs) + job_update_map["inactivity_secs"] = no_connections_secs + if no_connections_secs is None: + # TODO(0.19 or earlier): make no_connections_secs required + _terminate_job( + job_model=job_model, + job_update_map=job_update_map, + termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + termination_reason_message=( + "The selected instance was created before dstack 0.18.41" + " and does not support inactivity_duration" + ), + ) + elif no_connections_secs >= conf.inactivity_duration: + _terminate_job( + job_model=job_model, + job_update_map=job_update_map, + termination_reason=JobTerminationReason.INACTIVITY_DURATION_EXCEEDED, + termination_reason_message=( + f"The job was inactive for {no_connections_secs} seconds," + f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds" + ), + ) + + +def _should_terminate_job_due_to_disconnect(disconnected_at: Optional[datetime]) -> bool: + if disconnected_at is None: + return False + return get_current_datetime() > disconnected_at + JOB_DISCONNECTED_RETRY_TIMEOUT + + +def _set_disconnected_at_now(job_model: JobModel, result: _ProcessResult) -> None: + if _get_result_disconnected_at(job_model, result) is None: + result.job_update_map["disconnected_at"] = get_current_datetime() + + +def _reset_disconnected_at(job_model: JobModel, result: _ProcessResult) -> None: + if _get_result_disconnected_at(job_model, result) is not None: + result.job_update_map["disconnected_at"] = None + + +def _get_cluster_info( + jobs: list[Job], + replica_num: int, + job_provisioning_data: JobProvisioningData, + job_runtime_data: 
Optional[JobRuntimeData], +) -> ClusterInfo: + job_ips = [] + for job in jobs: + if job.job_spec.replica_num == replica_num: + job_ips.append( + get_or_error(job.job_submissions[-1].job_provisioning_data).internal_ip or "" + ) + gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus) + if job_runtime_data is not None and job_runtime_data.offer is not None: + gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus) + return ClusterInfo( + job_ips=job_ips, + master_job_ip=job_ips[0], + gpus_per_job=gpus_per_job, + ) + + +def _get_repo_code_hash(run: Run, job: Job) -> Optional[str]: + # TODO: drop this function when supporting jobs submitted before 0.19.17 is no longer relevant. + if ( + job.job_spec.repo_code_hash is None + and run.run_spec.repo_code_hash is not None + and job.job_submissions[-1].deployment_num == run.deployment_num + ): + return run.run_spec.repo_code_hash + return job.job_spec.repo_code_hash + + +async def _get_job_code( + project: ProjectModel, repo: RepoModel, code_hash: Optional[str] +) -> Optional[bytes]: + if code_hash is None: + return None + async with get_session_ctx() as session: + code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash) + if code_model is None: + return None + if code_model.blob is not None: + return code_model.blob + storage = get_default_storage() + if storage is None: + return None + blob = await run_async( + storage.get_code, + project.name, + repo.name, + code_hash, + ) + if blob is None: + logger.error( + "Failed to get repo code hash %s from storage for repo %s", code_hash, repo.name + ) + return None + return blob + + +async def _get_job_file_archives( + archive_mappings: Iterable[FileArchiveMapping], + user: UserModel, +) -> list[tuple[uuid.UUID, bytes]]: + archives: list[tuple[uuid.UUID, bytes]] = [] + for archive_mapping in archive_mappings: + archive_blob = await _get_job_file_archive(archive_id=archive_mapping.id, user=user) + 
archives.append((archive_mapping.id, archive_blob)) + return archives + + +async def _get_job_file_archive(archive_id: uuid.UUID, user: UserModel) -> bytes: + async with get_session_ctx() as session: + archive_model = await files_services.get_archive_model(session, id=archive_id, user=user) + if archive_model is None: + return b"" + if archive_model.blob is not None: + return archive_model.blob + storage = get_default_storage() + if storage is None: + return b"" + blob = await run_async( + storage.get_archive, + str(archive_model.user_id), + archive_model.blob_hash, + ) + if blob is None: + logger.error("Failed to get file archive %s from storage", archive_id) + return b"" + return blob + + +def _emit_reachability_change_event( + session: AsyncSession, + job_model: JobModel, + old_disconnected_at: Optional[datetime], + new_disconnected_at: Optional[datetime], +) -> None: + if old_disconnected_at is None and new_disconnected_at is not None: + events.emit( + session, + "Job became unreachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + elif old_disconnected_at is not None and new_disconnected_at is None: + events.emit( + session, + "Job became reachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + +def _terminate_job( + job_model: JobModel, + job_update_map: _JobUpdateMap, + termination_reason: JobTerminationReason, + termination_reason_message: Optional[str], +) -> None: + job_update_map["termination_reason"] = termination_reason + job_update_map["termination_reason_message"] = termination_reason_message + _set_job_update_status(job_model, job_update_map, JobStatus.TERMINATING) + + +def _set_job_update_status( + job_model: JobModel, + job_update_map: _JobUpdateMap, + new_status: JobStatus, +) -> None: + if job_update_map.get("status", job_model.status) != new_status: + job_update_map["status"] = new_status + + +def _set_job_status(job_model: JobModel, result: _ProcessResult, 
new_status: JobStatus) -> None: + _set_job_update_status(job_model, result.job_update_map, new_status) + + +def _set_job_runtime_data(result: _ProcessResult, jrd: Optional[JobRuntimeData]) -> None: + result.job_update_map["job_runtime_data"] = None if jrd is None else jrd.json() + + +def _apply_submit_job_to_runner_result( + job_model: JobModel, + result: _ProcessResult, + submit_result: _SubmitJobToRunnerResult, +) -> None: + if submit_result.job_runtime_data is not None: + _set_job_runtime_data(result, submit_result.job_runtime_data) + if submit_result.set_running_status: + _set_job_status(job_model, result, JobStatus.RUNNING) + + +# Convention: _get_result_* helpers merge the loaded job_model state with any pending +# updates recorded in result.job_update_map. Always use these (not job_model.attr directly) +# when the field may have been updated earlier in the same processing cycle. + + +def _get_result_status(job_model: JobModel, result: _ProcessResult) -> JobStatus: + return result.job_update_map.get("status", job_model.status) + + +def _get_result_disconnected_at(job_model: JobModel, result: _ProcessResult) -> Optional[datetime]: + return result.job_update_map.get("disconnected_at", job_model.disconnected_at) + + +def _get_result_job_runtime_data( + job_model: JobModel, result: _ProcessResult +) -> Optional[JobRuntimeData]: + jrd = result.job_update_map.get("job_runtime_data", job_model.job_runtime_data) + if jrd is None: + return None + return JobRuntimeData.__response__.parse_raw(jrd) + + +def _get_result_registered(job_model: JobModel, result: _ProcessResult) -> bool: + return result.job_update_map.get("registered", job_model.registered) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/jobs_submitted.py b/src/dstack/_internal/server/background/pipeline_tasks/jobs_submitted.py new file mode 100644 index 0000000000..35f613833c --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/jobs_submitted.py @@ -0,0 +1,2549 @@ 
+import asyncio +import copy +import uuid +from contextlib import AsyncExitStack +from dataclasses import dataclass +from datetime import timedelta +from typing import Optional, Sequence, Union + +from sqlalchemy import and_, func, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only, selectinload + +from dstack._internal.core.backends.base.compute import ( + ComputeWithGroupProvisioningSupport, + ComputeWithPlacementGroupSupport, + ComputeWithVolumeSupport, +) +from dstack._internal.core.backends.base.models import JobConfiguration +from dstack._internal.core.backends.features import ( + BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT, + BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT, +) +from dstack._internal.core.errors import BackendError, ServerClientError, SkipOffer +from dstack._internal.core.models.common import NetworkMode +from dstack._internal.core.models.compute_groups import ( + ComputeGroupProvisioningData, + ComputeGroupStatus, +) +from dstack._internal.core.models.fleets import ( + FleetSpec, + InstanceGroupPlacement, +) +from dstack._internal.core.models.instances import ( + InstanceOfferWithAvailability, + InstanceStatus, +) +from dstack._internal.core.models.profiles import ( + DEFAULT_RUN_TERMINATION_IDLE_TIME, + CreationPolicy, + Profile, + TerminationPolicy, +) +from dstack._internal.core.models.resources import Memory +from dstack._internal.core.models.runs import ( + Job, + JobProvisioningData, + JobRuntimeData, + JobStatus, + JobTerminationReason, + Requirements, + Run, +) +from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.services.profiles import get_termination +from dstack._internal.server import settings +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_changed_on_reset, + 
log_lock_token_mismatch, +) +from dstack._internal.server.db import ( + get_db, + get_session_ctx, + is_db_sqlite, + sqlite_commit, +) +from dstack._internal.server.models import ( + ComputeGroupModel, + FleetModel, + InstanceModel, + JobModel, + PlacementGroupModel, + ProjectModel, + RunModel, + UserModel, + VolumeAttachmentModel, + VolumeModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services.backends import get_project_backend_by_type_or_error +from dstack._internal.server.services.docker import apply_server_docker_defaults +from dstack._internal.server.services.fleets import ( + can_create_new_cloud_instance_in_fleet, + get_fleet_master_instance_provisioning_data, + get_fleet_spec, + get_next_instance_num, + is_cloud_cluster, +) +from dstack._internal.server.services.instances import ( + filter_non_placeholder_instances, + format_instance_blocks_for_event, + get_instance_offer, + get_instance_provisioning_data, + is_placeholder_instance, + switch_instance_status, +) +from dstack._internal.server.services.jobs import ( + check_can_attach_job_volumes, + find_job, + find_jobs, + get_job_configured_volume_models, + get_job_configured_volumes, + get_job_runtime_data, + get_job_spec, + interpolate_job_spec_secrets, + is_master_job, + is_multinode_job, + switch_job_status, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import ( + get_instance_offer_with_restricted_az, + get_offers_by_requirements, +) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.placement import ( + find_or_create_suitable_placement_group, + get_placement_group_model_for_job, + placement_group_model_to_placement_group_optional, + schedule_fleet_placement_groups_deletion, +) +from dstack._internal.server.services.runs import run_model_to_run +from 
dstack._internal.server.services.runs.plan import ( + find_optimal_fleet_with_offers, + get_instance_offers_in_fleet, + get_run_candidate_fleet_models_filters, + get_run_profile_and_requirements_in_fleet, + get_targeted_instance_offers, + select_run_candidate_fleet_models_with_filters, +) +from dstack._internal.server.services.runs.spec import ( + check_run_spec_requires_instance_mounts, +) +from dstack._internal.server.services.secrets import get_project_secrets_mapping +from dstack._internal.server.services.volumes import volume_model_to_volume +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime, get_or_error, run_async +from dstack._internal.utils.interpolator import InterpolatorError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class JobSubmittedPipelineItem(PipelineItem): + pass + + +class JobSubmittedPipeline(Pipeline[JobSubmittedPipelineItem]): + def __init__( + self, + workers_num: int = 40, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=4), + lock_timeout: timedelta = timedelta(seconds=40), + heartbeat_trigger: timedelta = timedelta(seconds=20), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[JobSubmittedPipelineItem]( + model_type=JobModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = JobSubmittedFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + 
lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + JobSubmittedWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return JobModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[JobSubmittedPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[JobSubmittedPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["JobSubmittedWorker"]: + return self.__workers + + +class JobSubmittedFetcher(Fetcher[JobSubmittedPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobSubmittedPipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[JobSubmittedPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("JobSubmittedFetcher.fetch") + async def fetch(self, limit: int) -> list[JobSubmittedPipelineItem]: + now = get_current_datetime() + if limit <= 0: + return [] + + job_lock, _ = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) + async with job_lock: + async with get_session_ctx() as session: + res = await session.execute( + select(JobModel) + .join(JobModel.run) + .where( + JobModel.status == JobStatus.SUBMITTED, + JobModel.waiting_master_job.is_not(True), + or_( + # Non-master jobs must wait for the run to have the fleet assigned. 
+ JobModel.job_num == 0, + RunModel.fleet_id.is_not(None), + ), + or_( + JobModel.skip_min_processing_interval == True, + JobModel.last_processed_at <= now - self._min_processing_interval, + JobModel.last_processed_at == JobModel.submitted_at, + ), + or_( + # This pipeline does not check RunModel.lock_owner + # because we want to provision jobs ASAP and RunPipeline can wait. + JobModel.lock_expires_at.is_(None), + JobModel.lock_expires_at < now, + ), + or_( + JobModel.lock_owner.is_(None), + JobModel.lock_owner == JobSubmittedPipeline.__name__, + ), + ) + .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=JobModel) + .options( + load_only( + JobModel.id, + JobModel.lock_token, + JobModel.lock_expires_at, + JobModel.skip_min_processing_interval, + ) + ) + ) + job_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for job_model in job_models: + prev_lock_expired = job_model.lock_expires_at is not None + job_model.lock_expires_at = lock_expires_at + job_model.lock_token = lock_token + job_model.lock_owner = JobSubmittedPipeline.__name__ + job_model.skip_min_processing_interval = False + items.append( + JobSubmittedPipelineItem( + __tablename__=JobModel.__tablename__, + id=job_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + ) + ) + await session.commit() + + return items + + +class JobSubmittedWorker(Worker[JobSubmittedPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobSubmittedPipelineItem], + heartbeater: Heartbeater[JobSubmittedPipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("JobSubmittedWorker.process") + async def process(self, item: 
JobSubmittedPipelineItem): + context = await _load_process_context(item=item) + if context is None: + log_lock_token_mismatch(logger, item) + return + + if context.job_model.instance_assigned: + logger.debug("%s: provisioning has started", fmt(context.job_model)) + provisioning = await _process_provisioning(item=item, context=context) + _hint_pipelines_fetch( + pipeline_hinter=self._pipeline_hinter, + result=provisioning, + ) + await _apply_provisioning_result( + item=item, + provisioning=provisioning, + ) + self._pipeline_hinter.hint_fetch(JobModel.__name__) + return + + logger.debug("%s: assignment has started", fmt(context.job_model)) + assignment = await _process_assignment(context=context) + _hint_pipelines_fetch( + pipeline_hinter=self._pipeline_hinter, + result=assignment, + ) + await _apply_assignment_result( + item=item, + context=context, + assignment=assignment, + ) + self._pipeline_hinter.hint_fetch(JobModel.__name__) + + +@dataclass +class _SubmittedJobContext: + job_model: JobModel + run_model: RunModel + project: ProjectModel + run: Run + job: Job + jobs_to_provision: list[Job] + replica_jobs: list[Job] + replica_job_model_ids: list[uuid.UUID] + fleet_model: Optional[FleetModel] + multinode: bool + + +@dataclass +class _PreparedJobVolumes: + volume_model_ids: list[list[uuid.UUID]] + volumes: list[list[Volume]] + + +@dataclass +class _ProcessedPreconditions: + master_job_provisioning_data: Optional[JobProvisioningData] + prepared_job_volumes: _PreparedJobVolumes + + +@dataclass +class _DeferSubmittedJobResult: + """The job is not ready yet, so apply should just mark it processed and unlock it.""" + + log_message: str + hint_fleet_pipeline: bool = False + + +@dataclass +class _RetrySubmittedJobResult: + """Transient contention outcome that resets the main job lock for a quick retry later without clearing lock_owner.""" + + pass + + +@dataclass +class _PlacementGroupCleanup: + fleet_id: uuid.UUID + selected_placement_group_id: Optional[uuid.UUID] + 
new_placement_group_models: list[PlacementGroupModel] + + +@dataclass +class _TerminateSubmittedJobResult: + reason: JobTerminationReason + message: Optional[str] = None + locked_fleet_id: Optional[uuid.UUID] = None + placement_group_cleanup: Optional[_PlacementGroupCleanup] = None + + +@dataclass +class _VolumeAttachmentPayload: + volume_id: uuid.UUID + attachment_data: str + volume_name: str + + +@dataclass +class _VolumeAttachmentResult: + attachments: list[_VolumeAttachmentPayload] + locked_volume_ids: list[uuid.UUID] + termination_message: Optional[str] = None + + +@dataclass +class _NoFleetAssignment: + pass + + +@dataclass +class _ExistingInstanceAssignment: + fleet_id: uuid.UUID + master_job_provisioning_data: Optional[JobProvisioningData] + volumes: list[list[Volume]] + + +@dataclass +class _NewCapacityAssignment: + fleet_id: uuid.UUID + + +_AssignmentResult = Union[ + _DeferSubmittedJobResult, + _TerminateSubmittedJobResult, + _NoFleetAssignment, + _NewCapacityAssignment, + _ExistingInstanceAssignment, +] + + +@dataclass +class _ExistingInstanceProvisioning: + volume_attachment_result: _VolumeAttachmentResult + + +@dataclass +class _FailedNewCapacityProvisioning: + placement_group_cleanup: Optional[_PlacementGroupCleanup] + + +@dataclass +class _ProvisionNewCapacityResult: + provisioning_data: Union[JobProvisioningData, ComputeGroupProvisioningData] + offer: InstanceOfferWithAvailability + effective_profile: Profile + placement_group_cleanup: Optional[_PlacementGroupCleanup] + + +@dataclass +class _NewCapacityProvisioning: + provisioning_data: Union[JobProvisioningData, ComputeGroupProvisioningData] + offer: InstanceOfferWithAvailability + effective_profile: Profile + placement_group_cleanup: Optional[_PlacementGroupCleanup] + volume_attachment_result: Optional[_VolumeAttachmentResult] + locked_fleet_id: Optional[uuid.UUID] + + +_ProvisioningResult = Union[ + _DeferSubmittedJobResult, + _TerminateSubmittedJobResult, + _RetrySubmittedJobResult, + 
_ExistingInstanceProvisioning, + _NewCapacityProvisioning, +] + + +async def _load_process_context(item: JobSubmittedPipelineItem) -> Optional[_SubmittedJobContext]: + async with get_session_ctx() as session: + job_model = await _refetch_locked_job(session=session, item=item) + if job_model is None: + return None + return await _load_submitted_job_context(session=session, job_model=job_model) + + +async def _process_assignment(context: _SubmittedJobContext) -> _AssignmentResult: + preconditions = await _process_preconditions(context=context) + if not isinstance(preconditions, _ProcessedPreconditions): + return preconditions + + if context.run.run_spec.merged_profile.instances is not None: + return await _select_targeted_instance_assignment( + context=context, + preconditions=preconditions, + ) + + candidate_fleet_models = await _load_assignment_candidate_fleets(context=context) + return await _select_assignment( + context=context, + preconditions=preconditions, + candidate_fleet_models=candidate_fleet_models, + ) + + +async def _select_assignment( + context: _SubmittedJobContext, + preconditions: _ProcessedPreconditions, + candidate_fleet_models: list[FleetModel], +) -> _AssignmentResult: + # Getting backend offers can be slow, so fleet selection must happen outside the DB transaction. 
+ fleet_model, fleet_instances_with_offers, _ = await find_optimal_fleet_with_offers( + project=context.project, + fleet_models=candidate_fleet_models, + run_model=context.run_model, + run_spec=context.run.run_spec, + job=context.job, + master_job_provisioning_data=preconditions.master_job_provisioning_data, + volumes=preconditions.prepared_job_volumes.volumes, + exclude_not_available=True, + skip_backend_offers_on_pool_capacity=True, + ) + + if fleet_model is None: + return _NoFleetAssignment() + + if fleet_instances_with_offers: + return _ExistingInstanceAssignment( + fleet_id=fleet_model.id, + master_job_provisioning_data=preconditions.master_job_provisioning_data, + volumes=preconditions.prepared_job_volumes.volumes, + ) + + return _NewCapacityAssignment(fleet_id=fleet_model.id) + + +async def _select_targeted_instance_assignment( + context: _SubmittedJobContext, + preconditions: _ProcessedPreconditions, +) -> _AssignmentResult: + async with get_session_ctx() as session: + instance_offers = await get_targeted_instance_offers( + session=session, + project=context.project, + run_spec=context.run.run_spec, + job=context.job, + master_job_provisioning_data=preconditions.master_job_provisioning_data, + volumes=preconditions.prepared_job_volumes.volumes, + exclude_not_available=True, + fleet_id=context.run_model.fleet_id, + ) + if len(instance_offers) < _get_required_targeted_instance_offers(context): + return _NoFleetAssignment() + return _ExistingInstanceAssignment( + fleet_id=get_or_error(instance_offers[0][0].fleet_id), + master_job_provisioning_data=preconditions.master_job_provisioning_data, + volumes=preconditions.prepared_job_volumes.volumes, + ) + + +async def _apply_assignment_result( + item: JobSubmittedPipelineItem, + context: _SubmittedJobContext, + assignment: _AssignmentResult, +) -> None: + async with get_session_ctx() as session: + job_model = await _refetch_locked_job(session=session, item=item) + if job_model is None: + 
log_lock_token_changed_after_processing(logger, item) + return + + if isinstance(assignment, _DeferSubmittedJobResult): + await _defer_submitted_job( + session=session, + job_model=job_model, + log_message=assignment.log_message, + ) + return + + if isinstance(assignment, _TerminateSubmittedJobResult): + await _terminate_submitted_job( + session=session, + job_model=job_model, + reason=assignment.reason, + message=assignment.message, + ) + return + + if isinstance(assignment, _NoFleetAssignment): + await _apply_no_fleet_selection( + session=session, + job_model=job_model, + run=context.run, + ) + return + + if isinstance(assignment, _NewCapacityAssignment): + # Always reserve one placeholder instance under fleet lock for the current + # submitted job. This keeps instance_num unique and makes nodes.max a hard + # limit for the single-instance provisioning path, including multinode + # masters that later fall back to run_job(). Compute groups still use the + # old partial path: one placeholder does not reserve the full group. 
+ async with AsyncExitStack() as exit_stack: + fleet_model = await _lock_fleet_for_placeholder( + exit_stack=exit_stack, + session=session, + fleet_id=assignment.fleet_id, + ) + if fleet_model is None: + logger.debug( + "%s: failed to lock fleet for placeholder creation", + fmt(context.job_model), + ) + await _reset_job_lock_for_retry(session=session, item=item) + return + fleet_spec = get_fleet_spec(fleet_model) + if not can_create_new_cloud_instance_in_fleet(fleet_model, fleet_spec): + await _terminate_submitted_job( + session=session, + job_model=job_model, + reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + message="Fleet is at capacity", + ) + return + instance_model = _create_placeholder_instance( + fleet_model=fleet_model, + project=context.project, + job_model=job_model, + ) + session.add(instance_model) + job_model.instance = instance_model + job_model.used_instance_id = instance_model.id + events.emit( + session, + f"Instance created for job. Instance status: {instance_model.status.upper()}", + actor=events.SystemActor(), + targets=[ + events.Target.from_model(instance_model), + events.Target.from_model(job_model), + ], + ) + job_model.fleet_id = assignment.fleet_id + job_model.instance_assigned = True + job_model.skip_min_processing_interval = True + await _mark_job_processed(session=session, job_model=job_model) + return + + async with AsyncExitStack() as exit_stack: + if context.run.run_spec.merged_profile.instances is not None: + current_instance_offers = await _lock_targeted_instance_offers_for_assignment( + exit_stack=exit_stack, + session=session, + context=context, + assignment=assignment, + ) + if len(current_instance_offers) < _get_required_targeted_instance_offers(context): + await _reset_job_lock_for_retry(session=session, item=item) + return + + instance_model, current_offer = current_instance_offers[0] + _assign_instance_to_job( + session=session, + job_model=job_model, + instance_model=instance_model, + offer=current_offer, 
+ multinode=context.multinode, + ) + await _mark_job_processed(session=session, job_model=job_model) + return + + fleet_model = await _lock_assignment_fleet_for_existing_instance_assignment( + exit_stack=exit_stack, + session=session, + context=context, + fleet_id=assignment.fleet_id, + ) + if fleet_model is None: + logger.debug( + "%s: failed to lock existing fleet instances for assignment", + fmt(context.job_model), + ) + await _reset_job_lock_for_retry(session=session, item=item) + return + + # The optimal fleet was chosen from a detached snapshot. Recompute reusable + # offers after locking the fleet instances so concurrent jobs can spread + # across the remaining free instances instead of racing on one stale choice. + current_instance_offers = _get_current_reusable_instance_offers( + context=context, + assignment=assignment, + fleet_model=fleet_model, + ) + if not current_instance_offers: + # If the reusable offers vanished under the fleet lock, retry full + # assignment later instead of forcing new-capacity provisioning in a + # fleet that may no longer be optimal. 
+ await _reset_job_lock_for_retry(session=session, item=item) + return + + instance_model, current_offer = current_instance_offers[0] + _assign_instance_to_job( + session=session, + job_model=job_model, + instance_model=instance_model, + offer=current_offer, + multinode=context.multinode, + ) + await _mark_job_processed(session=session, job_model=job_model) + + +async def _refetch_locked_job( + session: AsyncSession, + item: JobSubmittedPipelineItem, +) -> Optional[JobModel]: + res = await session.execute( + select(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one_or_none() + + +async def _load_submitted_job_context( + session: AsyncSession, job_model: JobModel +) -> _SubmittedJobContext: + run_model = await _fetch_run_model_for_submitted_job(session=session, job_model=job_model) + res = await session.execute( + select(JobModel) + .where(JobModel.id == job_model.id) + .options(joinedload(JobModel.instance)) + .options( + joinedload(JobModel.fleet).selectinload( + FleetModel.instances.and_(InstanceModel.deleted == False) + ) + ) + .execution_options(populate_existing=True) + ) + job_model = res.unique().scalar_one() + run = run_model_to_run(run_model) + job = find_job(run.jobs, job_model.replica_num, job_model.job_num) + replica_jobs = find_jobs(run.jobs, replica_num=job_model.replica_num) + return _SubmittedJobContext( + job_model=job_model, + run_model=run_model, + project=run_model.project, + run=run, + job=job, + jobs_to_provision=_select_jobs_to_provision(job, replica_jobs, job_model), + replica_jobs=replica_jobs, + replica_job_model_ids=[ + jm.id for jm in _get_job_models_for_jobs(run_model.jobs, replica_jobs) + ], + fleet_model=run_model.fleet or job_model.fleet, + multinode=job.job_spec.jobs_per_replica > 1, + ) + + +async def _fetch_run_model_for_submitted_job( + session: AsyncSession, job_model: JobModel +) -> RunModel: + """Fetch run 
model with only the relevant latest-submission jobs. + + Only a small subset is needed depending on the job type: + * Master multinode: all same-replica jobs (for cluster provisioning and releasing sibling waits). + * Non-master: master job + current job (for master provisioning data lookup). + * Master single-node: current job only (no siblings needed). + + Only the latest submission per (replica_num, job_num) is loaded since historical + submissions are never accessed in submitted job processing. + """ + is_master = job_model.job_num == 0 + is_multinode = get_job_spec(job_model).jobs_per_replica > 1 + + job_num_filters: list = [] + if is_master and not is_multinode: + # Master single-node: only current job needed. + job_num_filters.append(JobModel.job_num == 0) + elif not is_master: + # Non-master: master job (for provisioning data) + current job. + job_num_filters.append(JobModel.job_num.in_([0, job_model.job_num])) + # else: master multinode — no job_num filter, load all jobs in replica. 
+ + latest_submissions_sq = ( + select( + JobModel.run_id.label("run_id"), + JobModel.replica_num.label("replica_num"), + JobModel.job_num.label("job_num"), + func.max(JobModel.submission_num).label("max_submission_num"), + ) + .where( + JobModel.run_id == job_model.run_id, + JobModel.replica_num == job_model.replica_num, + *job_num_filters, + ) + .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) + .subquery() + ) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where(RunModel.id == job_model.run_id) + .join(job_alias, job_alias.run_id == RunModel.id) + .join( + latest_submissions_sq, + onclause=and_( + job_alias.run_id == latest_submissions_sq.c.run_id, + job_alias.replica_num == latest_submissions_sq.c.replica_num, + job_alias.job_num == latest_submissions_sq.c.job_num, + job_alias.submission_num == latest_submissions_sq.c.max_submission_num, + ), + ) + .options(joinedload(RunModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(RunModel.user).load_only(UserModel.name)) + .options( + joinedload(RunModel.fleet).selectinload( + FleetModel.instances.and_(InstanceModel.deleted == False) + ) + ) + .options(contains_eager(RunModel.jobs, alias=job_alias)) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() + + +def _get_job_models_for_jobs( + job_models: list[JobModel], + jobs: list[Job], +) -> list[JobModel]: + id_to_job_model_map = {job_model.id: job_model for job_model in job_models} + return [id_to_job_model_map[job.job_submissions[-1].id] for job in jobs] + + +def _get_job_models_by_ids( + job_models: list[JobModel], + job_model_ids: list[uuid.UUID], +) -> list[JobModel]: + id_to_job_model_map = {job_model.id: job_model for job_model in job_models} + return [id_to_job_model_map[job_model_id] for job_model_id in job_model_ids] + + +async def _process_preconditions( + context: _SubmittedJobContext, +) -> Union[ + _ProcessedPreconditions, + _DeferSubmittedJobResult, 
+ _TerminateSubmittedJobResult, +]: + master_job_provisioning_data = _get_master_job_provisioning_data(context=context) + if context.job.job_spec.job_num != 0 and master_job_provisioning_data is None: + return _DeferSubmittedJobResult(log_message="waiting for master job to be provisioned") + + if _should_wait_for_run_fleet_assignment(context=context): + return _DeferSubmittedJobResult( + log_message="waiting for the run to be assigned to the fleet" + ) + + prepared_job_volumes = await _prepare_job_volumes(context=context) + if isinstance(prepared_job_volumes, _TerminateSubmittedJobResult): + return prepared_job_volumes + + return _ProcessedPreconditions( + master_job_provisioning_data=master_job_provisioning_data, + prepared_job_volumes=prepared_job_volumes, + ) + + +def _get_master_job_provisioning_data( + context: _SubmittedJobContext, +) -> Optional[JobProvisioningData]: + if context.job.job_spec.job_num == 0: + return None + + master_job = find_job(context.run.jobs, context.job_model.replica_num, 0) + if master_job.job_submissions[-1].job_provisioning_data is None: + return None + + return JobProvisioningData.__response__.parse_obj( + master_job.job_submissions[-1].job_provisioning_data + ) + + +def _should_wait_for_run_fleet_assignment(context: _SubmittedJobContext) -> bool: + if context.job.job_spec.job_num == 0 and context.job.job_spec.replica_num == 0: + return False + return context.run_model.fleet_id is None + + +async def _prepare_job_volumes( + context: _SubmittedJobContext, +) -> Union[_PreparedJobVolumes, _TerminateSubmittedJobResult]: + async with get_session_ctx() as session: + try: + volume_models = await get_job_configured_volume_models( + session=session, + project=context.project, + run_spec=context.run.run_spec, + job_num=context.job.job_spec.job_num, + job_spec=context.job.job_spec, + ) + volumes = await get_job_configured_volumes( + session=session, + project=context.project, + run_spec=context.run.run_spec, + 
async def _apply_no_fleet_selection(
    session: AsyncSession,
    job_model: JobModel,
    run: Run,
) -> None:
    """Terminate a submitted job for which no fleet was selected.

    The termination reason is always ``FAILED_TO_START_DUE_TO_NO_CAPACITY``;
    only the debug log line and the user-facing message differ:

    * the profile names ``instances``  -> "Failed to use specified instances"
    * otherwise it names ``fleets``    -> "Failed to use specified fleets"
    * otherwise                        -> generic "No matching fleet found" hint

    Args:
        session: Session passed through to ``_terminate_submitted_job``.
        job_model: The job to terminate.
        run: The run whose merged profile decides which message is used.
    """
    profile = run.run_spec.merged_profile
    if profile.instances is not None:
        log_format = "%s: failed to use specified instances"
        message = "Failed to use specified instances"
    elif profile.fleets is not None:
        log_format = "%s: failed to use specified fleets"
        message = "Failed to use specified fleets"
    else:
        log_format = "%s: no fleet found"
        message = (
            "No matching fleet found. Possible reasons: "
            "https://dstack.ai/docs/guides/troubleshooting/#no-fleets"
        )

    logger.debug(log_format, fmt(job_model))
    await _terminate_submitted_job(
        session=session,
        job_model=job_model,
        reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
        message=message,
    )
async def _lock_fleet_for_placeholder(
    exit_stack: AsyncExitStack,
    session: AsyncSession,
    fleet_id: uuid.UUID,
) -> Optional[FleetModel]:
    """Lock one fleet and load its non-deleted instances for placeholder creation.

    The fleet row is selected ``FOR UPDATE`` with ``skip_locked``; when no row
    comes back (the fleet is gone, deleted, or already locked by another
    pipeline) the result is ``None``. On SQLite the function then calls
    ``sqlite_commit``, enters the fleet-table locker context on ``exit_stack``
    and reads the fleet a second time under that in-memory lock.

    Args:
        exit_stack: Receives the in-memory lock context (SQLite only).
        session: Session used for both reads.
        fleet_id: Id of the fleet to lock.

    Returns:
        The fleet with its non-deleted instances loaded, or ``None``.
    """
    live_fleet_query = (
        select(FleetModel)
        .where(
            FleetModel.id == fleet_id,
            FleetModel.deleted == False,
        )
        .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)))
        .execution_options(populate_existing=True)
    )

    locked_rows = await session.execute(
        live_fleet_query.with_for_update(skip_locked=True, key_share=True)
    )
    locked_fleet = locked_rows.scalars().unique().one_or_none()
    if locked_fleet is None or not is_db_sqlite():
        # Nothing could be locked, or the row lock taken above is sufficient.
        return locked_fleet

    await sqlite_commit(session)
    fleet_locker = get_locker(get_db().dialect_name)
    await exit_stack.enter_async_context(
        fleet_locker.lock_ctx(FleetModel.__tablename__, [fleet_id])
    )
    # Read again now that the in-memory lock is held, to see committed changes.
    refreshed_rows = await session.execute(live_fleet_query)
    return refreshed_rows.scalars().unique().one_or_none()
async def _process_provisioning(
    item: JobSubmittedPipelineItem,
    context: _SubmittedJobContext,
) -> _ProvisioningResult:
    """Choose and run the provisioning path for a submitted job.

    Routing, in order:

    1. If ``_process_preconditions`` yields anything other than
       ``_ProcessedPreconditions``, that result is returned unchanged.
    2. A job with a non-placeholder instance goes through
       ``_process_existing_instance_provisioning``.
    3. A job with no instance whose creation policy is ``REUSE`` is terminated
       with ``FAILED_TO_START_DUE_TO_NO_CAPACITY``.
    4. Everything else (placeholder instance, or no instance and another
       creation policy) goes through ``_process_new_capacity_provisioning``.
    """
    checked = await _process_preconditions(context=context)
    if not isinstance(checked, _ProcessedPreconditions):
        return checked

    assigned_instance = context.job_model.instance
    if assigned_instance is not None and not is_placeholder_instance(assigned_instance):
        return await _process_existing_instance_provisioning(
            item=item,
            context=context,
            prepared_job_volumes=checked.prepared_job_volumes,
        )

    if assigned_instance is None:
        creation_policy = context.run.run_spec.merged_profile.creation_policy
        if creation_policy == CreationPolicy.REUSE:
            logger.debug("%s: reuse instance failed", fmt(context.job_model))
            return _TerminateSubmittedJobResult(
                reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
                message="Could not reuse any instances for this job",
            )

    # Reached with a placeholder from assignment, or with no instance at all.
    return await _process_new_capacity_provisioning(
        item=item,
        context=context,
        preconditions=checked,
    )
+ await _unlock_related_volumes( + session=session, + item=item, + volume_ids=_get_locked_volume_ids_from_provisioning(provisioning), + ) + await _unlock_related_fleet( + session=session, + item=item, + fleet_id=_get_locked_fleet_id_from_provisioning(provisioning), + ) + log_lock_token_changed_after_processing(logger, item) + return + + if isinstance(provisioning, _DeferSubmittedJobResult): + await _defer_submitted_job( + session=session, + job_model=job_model, + log_message=provisioning.log_message, + ) + return + + if isinstance(provisioning, _TerminateSubmittedJobResult): + if provisioning.placement_group_cleanup is not None: + cleanup_fleet_model = await _load_placement_group_cleanup_fleet( + session=session, + fleet_id=provisioning.placement_group_cleanup.fleet_id, + ) + await _persist_placement_group_cleanup( + session=session, + fleet_model=cleanup_fleet_model, + project=cleanup_fleet_model.project, + placement_group_cleanup=provisioning.placement_group_cleanup, + ) + await _unlock_related_fleet( + session=session, + item=item, + fleet_id=provisioning.locked_fleet_id, + ) + # Keep the placeholder live here: JobTerminatingPipeline will unassign it + # from the job, and InstancePipeline will finish deleting it. 
async def _process_existing_instance_provisioning(
    item: JobSubmittedPipelineItem,
    context: _SubmittedJobContext,
    prepared_job_volumes: _PreparedJobVolumes,
) -> _ExistingInstanceProvisioning:
    """Run volume attachment for a job that already has an instance.

    Both the job's instance and that instance's provisioning data are passed
    through ``get_or_error`` before use.

    Returns:
        An ``_ExistingInstanceProvisioning`` wrapping the volume attachment result.
    """
    assigned_instance = get_or_error(context.job_model.instance)
    instance_provisioning_data = get_or_error(get_instance_provisioning_data(assigned_instance))
    return _ExistingInstanceProvisioning(
        volume_attachment_result=await _process_volume_attachments(
            item=item,
            project=context.project,
            job_model=context.job_model,
            prepared_job_volumes=prepared_job_volumes,
            job_provisioning_data=instance_provisioning_data,
        ),
    )
async def _process_new_capacity_provisioning(
    item: JobSubmittedPipelineItem,
    context: _SubmittedJobContext,
    preconditions: _ProcessedPreconditions,
) -> _ProvisioningResult:
    """Provision new capacity for a submitted job and attach its volumes.

    Steps, in order:

    1. Terminate if the context has no fleet.
    2. For a master job in a cluster fleet, re-resolve the fleet via
       ``_resolve_related_cluster_master_fleet`` (which may hand back a locked
       fleet id); ask for a retry if that returns ``None``.
    3. Work out the master provisioning data and defer while a cluster fleet
       that already has non-placeholder instances has none yet.
    4. Call ``_provision_new_capacity`` and translate its failure results.
    5. When a single ``JobProvisioningData`` came back, process volume attachments.

    Args:
        item: Pipeline item of the job being processed.
        context: Loaded job/run/fleet context.
        preconditions: Output of ``_process_preconditions`` (master provisioning
            data and prepared volumes).

    Returns:
        ``_NewCapacityProvisioning`` on success, otherwise a terminate, defer or
        retry result.
    """
    fleet_model = context.fleet_model
    if fleet_model is None:
        # Legacy in-flight job from autocreated fleets path (instance_assigned=True, no fleet).
        # Autocreated fleets are no longer supported; terminate the job.
        return _TerminateSubmittedJobResult(
            reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
            message=(
                "No matching fleet found. Possible reasons: "
                "https://dstack.ai/docs/guides/troubleshooting/#no-fleets"
            ),
        )
    # Stays None unless the cluster-fleet refresh below reports a fleet id it locked.
    locked_fleet_id = None
    if _should_refresh_related_cluster_master_fleet(context=context):
        related_cluster_master_fleet = await _resolve_related_cluster_master_fleet(
            item=item,
            fleet_id=fleet_model.id,
        )
        if related_cluster_master_fleet is None:
            logger.debug("%s: cluster fleet is locked for provisioning", fmt(context.job_model))
            return _RetrySubmittedJobResult()
        # From here on use the freshly resolved fleet rather than the one from the context.
        fleet_model = related_cluster_master_fleet.fleet_model
        locked_fleet_id = related_cluster_master_fleet.locked_fleet_id

    # Prefer the master job's provisioning data; fall back to the fleet's master data.
    master_provisioning_data = (
        preconditions.master_job_provisioning_data
        or _get_fleet_master_provisioning_data(
            fleet_model=fleet_model,
            job=context.job,
        )
    )
    if (
        is_master_job(context.job)
        and _get_cluster_fleet_spec(fleet_model) is not None
        # Placeholder reservations never become fleet masters.
        and filter_non_placeholder_instances(fleet_model.instances)
        and master_provisioning_data is None
    ):
        return _DeferSubmittedJobResult(
            log_message="waiting for fleet master instance election",
            hint_fleet_pipeline=True,
        )
    provision_new_capacity_result = await _provision_new_capacity(
        project=context.project,
        fleet_model=fleet_model,
        job_model=context.job_model,
        run=context.run,
        jobs=context.jobs_to_provision,
        project_ssh_public_key=context.project.ssh_public_key,
        project_ssh_private_key=context.project.ssh_private_key,
        master_job_provisioning_data=master_provisioning_data,
        volumes=preconditions.prepared_job_volumes.volumes,
    )
    if isinstance(provision_new_capacity_result, _TerminateSubmittedJobResult):
        # NOTE(review): this result is passed through as-is, and `locked_fleet_id` is not
        # given to `_provision_new_capacity`, so the fleet id locked above is presumably not
        # carried on it (unlike the failure branch below) — confirm the related fleet lock
        # is released elsewhere or is meant to simply expire.
        return provision_new_capacity_result
    if isinstance(provision_new_capacity_result, _FailedNewCapacityProvisioning):
        logger.debug("%s: provisioning failed", fmt(context.job_model))
        # Carry the locked fleet id and any placement-group cleanup on the terminate result.
        return _TerminateSubmittedJobResult(
            reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
            locked_fleet_id=locked_fleet_id,
            placement_group_cleanup=provision_new_capacity_result.placement_group_cleanup,
        )

    volume_attachment_result = None
    # TODO: Volume attachment for compute groups is not yet supported since
    # currently supported compute groups don't require explicit volume attachment.
    if isinstance(provision_new_capacity_result.provisioning_data, JobProvisioningData):
        volume_attachment_result = await _process_volume_attachments(
            item=item,
            project=context.project,
            job_model=context.job_model,
            prepared_job_volumes=preconditions.prepared_job_volumes,
            job_provisioning_data=provision_new_capacity_result.provisioning_data,
        )

    return _NewCapacityProvisioning(
        provisioning_data=provision_new_capacity_result.provisioning_data,
        offer=provision_new_capacity_result.offer,
        effective_profile=provision_new_capacity_result.effective_profile,
        placement_group_cleanup=provision_new_capacity_result.placement_group_cleanup,
        volume_attachment_result=volume_attachment_result,
        locked_fleet_id=locked_fleet_id,
    )
def _resolve_provisioned_jobs_and_data(
    context: _SubmittedJobContext,
    fleet_model: FleetModel,
    provisioning_data: Union[JobProvisioningData, ComputeGroupProvisioningData],
) -> tuple[list[Job], list[JobProvisioningData], Optional[ComputeGroupModel]]:
    """Expand a provisioning result into per-job provisioning data.

    Returns:
        A ``(jobs, job_provisioning_datas, compute_group_model)`` triple.

        * For a ``ComputeGroupProvisioningData``: ``context.jobs_to_provision``,
          the group's ``job_provisioning_datas`` and a new ``ComputeGroupModel``
          in ``RUNNING`` status. The model is only constructed here, not added
          to any session.
        * For anything else: just ``context.job`` with the given data, and ``None``.
    """
    if not isinstance(provisioning_data, ComputeGroupProvisioningData):
        return [context.job], [provisioning_data], None

    new_compute_group = ComputeGroupModel(
        id=uuid.uuid4(),
        project=context.project,
        fleet=fleet_model,
        status=ComputeGroupStatus.RUNNING,
        provisioning_data=provisioning_data.json(),
    )
    return (
        context.jobs_to_provision,
        provisioning_data.job_provisioning_datas,
        new_compute_group,
    )
+ # Safe to update the placeholder without locking: nobody else should update the placeholder. + placeholder_instance = _get_job_placeholder_instance(context, provisioned_job_model) + if placeholder_instance is not None: + instance_model = placeholder_instance + _promote_placeholder_instance( + instance_model=instance_model, + compute_group_model=compute_group_model, + job_provisioning_data=job_provisioning_data, + offer=offer, + profile=effective_profile, + ) + else: + instance_num = get_next_instance_num(taken_instance_nums) + instance_model = _create_instance_model_for_job( + project=context.project, + fleet_model=fleet_model, + compute_group_model=compute_group_model, + job_model=provisioned_job_model, + job_provisioning_data=job_provisioning_data, + offer=offer, + instance_num=instance_num, + profile=effective_profile, + ) + taken_instance_nums.add(instance_num) + session.add(instance_model) + provisioned_job_model.used_instance_id = instance_model.id + + instance_models.append(instance_model) + provisioned_job_model.job_runtime_data = _prepare_job_runtime_data( + offer, context.multinode + ).json() + events.emit( + session, + f"Instance provisioned for job. Instance status: {instance_model.status.upper()}", + actor=events.SystemActor(), + targets=[ + events.Target.from_model(instance_model), + events.Target.from_model(provisioned_job_model), + ], + ) + provisioned_job_model.last_processed_at = get_current_datetime() + return instance_models + + +async def _get_taken_instance_nums(session: AsyncSession, fleet_model: FleetModel) -> set[int]: + res = await session.execute( + select(InstanceModel.instance_num).where( + InstanceModel.fleet_id == fleet_model.id, + InstanceModel.deleted.is_(False), + ) + ) + return set(res.scalars().all()) + + +def _get_job_placeholder_instance( + context: _SubmittedJobContext, + job_model: JobModel, +) -> Optional[InstanceModel]: + """Return the placeholder instance for a job, or None. 
+ Only context.job_model has the instance relationship eagerly loaded, + so we match by id and return its instance. + """ + if job_model.id != context.job_model.id: + return None + instance = context.job_model.instance + if instance is not None and is_placeholder_instance(instance): + return instance + return None + + +def _create_instance_model_for_job( + project: ProjectModel, + fleet_model: FleetModel, + compute_group_model: Optional[ComputeGroupModel], + job_model: JobModel, + job_provisioning_data: JobProvisioningData, + offer: InstanceOfferWithAvailability, + instance_num: int, + profile: Profile, +) -> InstanceModel: + if not job_provisioning_data.dockerized: + termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + termination_idle_time = 0 + else: + termination_policy, termination_idle_time = get_termination( + profile, DEFAULT_RUN_TERMINATION_IDLE_TIME + ) + return InstanceModel( + id=uuid.uuid4(), + name=f"{fleet_model.name}-{instance_num}", + instance_num=instance_num, + project=project, + fleet=fleet_model, + compute_group=compute_group_model, + created_at=get_current_datetime(), + started_at=get_current_datetime(), + status=InstanceStatus.PROVISIONING, + unreachable=False, + job_provisioning_data=job_provisioning_data.json(), + offer=offer.json(), + termination_policy=termination_policy, + termination_idle_time=termination_idle_time, + jobs=[job_model], + backend=offer.backend, + price=offer.price, + region=offer.region, + volume_attachments=[], + total_blocks=1, + busy_blocks=1, + ) + + +def _promote_placeholder_instance( + instance_model: InstanceModel, + compute_group_model: Optional[ComputeGroupModel], + job_provisioning_data: JobProvisioningData, + offer: InstanceOfferWithAvailability, + profile: Profile, +) -> None: + """Promote a placeholder instance to a real provisioning instance + by filling in the fields that were unknown at placeholder creation time.""" + if not job_provisioning_data.dockerized: + termination_policy = 
async def _process_volume_attachments(
    item: JobSubmittedPipelineItem,
    project: ProjectModel,
    job_model: JobModel,
    prepared_job_volumes: _PreparedJobVolumes,
    job_provisioning_data: JobProvisioningData,
) -> _VolumeAttachmentResult:
    """Lock the job's volumes and attach one volume per mount point.

    Nothing is written to the job or instance here: the outcome (attachment
    payloads, ids of the volumes that were locked, and an optional termination
    message) is returned in a ``_VolumeAttachmentResult``.

    Args:
        item: Pipeline item; its id determines the related-volume lock owner.
        project: Project whose backend performs the attachment.
        job_model: Used for log formatting only.
        prepared_job_volumes: Volume model ids grouped per mount point.
        job_provisioning_data: Provisioning data of the target instance.

    Returns:
        A result whose ``termination_message`` is set when locking or attaching
        failed. Attachments made before a failure are still included.
    """
    # No volumes configured: nothing to lock or attach.
    if len(prepared_job_volumes.volume_model_ids) == 0:
        return _VolumeAttachmentResult(attachments=[], locked_volume_ids=[])

    backend = await get_project_backend_by_type_or_error(
        project=project,
        backend_type=job_provisioning_data.backend,
    )
    compute = backend.compute()
    assert isinstance(compute, ComputeWithVolumeSupport)

    volume_models = await _lock_related_volume_models(
        item=item, volume_model_ids=prepared_job_volumes.volume_model_ids
    )
    # None means the related volumes could not all be locked for this job.
    if volume_models is None:
        return _VolumeAttachmentResult(
            attachments=[],
            locked_volume_ids=[],
            termination_message="Failed to attach volume: Cannot attach a volume locked for processing",
        )

    # Every locked volume is reported for unlocking, whether or not it ends up attached.
    locked_volume_ids = sorted(
        {
            volume_model.id
            for mount_point_volume_models in volume_models
            for volume_model in mount_point_volume_models
        }
    )
    attachments: list[_VolumeAttachmentPayload] = []
    related_volume_lock_owner = _get_related_volume_lock_owner(item.id)
    for mount_point_volume_models in volume_models:
        for volume_model in mount_point_volume_models:
            volume = volume_model_to_volume(volume_model)
            try:
                if volume_model.deleted:
                    raise ServerClientError("Cannot attach a deleted volume")
                if volume_model.to_be_deleted:
                    raise ServerClientError("Cannot attach a volume marked for deletion")
                # A lock held under a different owner string counts as locked by someone else.
                if (
                    volume_model.lock_expires_at is not None
                    and volume_model.lock_owner != related_volume_lock_owner
                ):
                    raise ServerClientError("Cannot attach a volume locked for processing")
                # Skip candidates in another backend or region; try the next one for this mount point.
                if (
                    job_provisioning_data.get_base_backend() != volume.get_backend()
                    or job_provisioning_data.region.lower() != volume.get_region().lower()
                ):
                    continue
                # Skip candidates that have no provisioning data or are not attachable.
                if volume.provisioning_data is None or not volume.provisioning_data.attachable:
                    continue
                attachment_data = await run_async(
                    compute.attach_volume,
                    volume=volume,
                    provisioning_data=job_provisioning_data,
                )
                attachments.append(
                    _VolumeAttachmentPayload(
                        volume_id=volume_model.id,
                        attachment_data=attachment_data.json(),
                        volume_name=volume.name,
                    )
                )
                # One attached volume per mount point is enough; move on to the next mount point.
                break
            except ServerClientError as e:
                logger.info("%s: failed to attach volume: %s", fmt(job_model), repr(e))
                return _VolumeAttachmentResult(
                    attachments=attachments,
                    locked_volume_ids=locked_volume_ids,
                    termination_message=f"Failed to attach volume: {e.msg}",
                )
            except BackendError as e:
                logger.warning("%s: failed to attach volume: %s", fmt(job_model), repr(e))
                return _VolumeAttachmentResult(
                    attachments=attachments,
                    locked_volume_ids=locked_volume_ids,
                    termination_message=f"Failed to attach volume: {str(e)}",
                )
            except Exception:
                # Any other error is logged with its traceback and reported generically.
                logger.exception("%s: got exception when attaching volume", fmt(job_model))
                return _VolumeAttachmentResult(
                    attachments=attachments,
                    locked_volume_ids=locked_volume_ids,
                    termination_message="Failed to attach volume: unexpected error",
                )
    return _VolumeAttachmentResult(
        attachments=attachments,
        locked_volume_ids=locked_volume_ids,
    )
async def _unlock_related_volumes(
    session: AsyncSession,
    item: JobSubmittedPipelineItem,
    volume_ids: list[uuid.UUID],
) -> None:
    """Clear this job's related-volume locks on the given volumes.

    Only rows whose ``lock_owner`` is this job's related-volume owner string
    and whose ``lock_token`` equals ``item.lock_token`` are updated, so locks
    held under another owner or token are left untouched. An empty
    ``volume_ids`` issues no statement. The update is executed on ``session``
    but not committed here.
    """
    if volume_ids:
        expected_owner = _get_related_volume_lock_owner(item.id)
        release_locks = (
            update(VolumeModel)
            .where(
                VolumeModel.id.in_(volume_ids),
                VolumeModel.lock_owner == expected_owner,
                VolumeModel.lock_token == item.lock_token,
            )
            .values(
                lock_expires_at=None,
                lock_token=None,
                lock_owner=None,
            )
        )
        await session.execute(release_locks)
volume_attachment_result.attachments + ] + job_model.job_runtime_data = job_runtime_data.json() + + volume_ids = [attachment.volume_id for attachment in volume_attachment_result.attachments] + if volume_ids: + now = get_current_datetime() + await session.execute( + update(VolumeModel) + .where(VolumeModel.id.in_(volume_ids)) + .values(last_job_processed_at=now) + ) + for attachment in volume_attachment_result.attachments: + session.add( + VolumeAttachmentModel( + volume_id=attachment.volume_id, + instance=instance_model, + attachment_data=attachment.attachment_data, + ) + ) + + if volume_attachment_result.termination_message is None: + return + + job_model.termination_reason = JobTerminationReason.VOLUME_ERROR + job_model.termination_reason_message = volume_attachment_result.termination_message + switch_job_status(session, job_model, JobStatus.TERMINATING) + + +def _get_cluster_fleet_spec(fleet_model: FleetModel) -> Optional[FleetSpec]: + fleet_spec = get_fleet_spec(fleet_model) + if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER: + return None + return fleet_spec + + +def _should_refresh_related_cluster_master_fleet(context: _SubmittedJobContext) -> bool: + return ( + is_master_job(context.job) + and context.fleet_model is not None + and _get_cluster_fleet_spec(context.fleet_model) is not None + ) + + +@dataclass +class _ResolvedRelatedClusterMasterFleet: + fleet_model: FleetModel + locked_fleet_id: Optional[uuid.UUID] + + +async def _resolve_related_cluster_master_fleet( + item: JobSubmittedPipelineItem, + fleet_id: uuid.UUID, +) -> Optional[_ResolvedRelatedClusterMasterFleet]: + now = get_current_datetime() + related_fleet_lock_owner = _get_related_fleet_lock_owner(item.id) + fleet_lock, _ = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__) + async with fleet_lock: + async with get_session_ctx() as session: + # To avoid violating cluster placement during master provisioning, + # lock empty fleets and respect existing 
instances in non-empty fleets. + # Refetch the fleet under lock before deciding which case we are in. + res = await session.execute( + select(FleetModel) + .where( + FleetModel.id == fleet_id, + ) + .options( + joinedload(FleetModel.project).load_only(ProjectModel.id, ProjectModel.name) + ) + .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .execution_options(populate_existing=True) + .with_for_update(skip_locked=True, of=FleetModel) + ) + fleet_model = res.unique().scalar_one_or_none() + if fleet_model is None: + return None + # Placeholder reservations should not make an empty cluster fleet look + # non-empty; only real instances mean placement is already anchored. + if filter_non_placeholder_instances(fleet_model.instances): + return _ResolvedRelatedClusterMasterFleet( + fleet_model=fleet_model, + locked_fleet_id=None, + ) + if not ( + fleet_model.lock_expires_at is None + or ( + fleet_model.lock_owner == related_fleet_lock_owner + and fleet_model.lock_expires_at < now + ) + ): + return None + + fleet_model.lock_expires_at = item.lock_expires_at + fleet_model.lock_token = item.lock_token + fleet_model.lock_owner = related_fleet_lock_owner + await session.commit() + return _ResolvedRelatedClusterMasterFleet( + fleet_model=fleet_model, + locked_fleet_id=fleet_model.id, + ) + + +async def _unlock_related_fleet( + session: AsyncSession, + item: JobSubmittedPipelineItem, + fleet_id: Optional[uuid.UUID], +) -> None: + if fleet_id is None: + return + + await session.execute( + update(FleetModel) + .where( + FleetModel.id == fleet_id, + FleetModel.lock_owner == _get_related_fleet_lock_owner(item.id), + FleetModel.lock_token == item.lock_token, + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + ) + ) + + +def _get_fleet_master_provisioning_data( + fleet_model: Optional[FleetModel], + job: Job, +) -> Optional[JobProvisioningData]: + if not is_master_job(job) or fleet_model is None: + return None + + fleet_spec 
= _get_cluster_fleet_spec(fleet_model) + if fleet_spec is None: + return None + + return get_fleet_master_instance_provisioning_data( + fleet_model=fleet_model, + fleet_spec=fleet_spec, + ) + + +def _hint_pipelines_fetch( + pipeline_hinter: PipelineHinterProtocol, + result: Union[_AssignmentResult, _ProvisioningResult], +) -> None: + if not isinstance(result, _DeferSubmittedJobResult): + return + + if result.hint_fleet_pipeline: + pipeline_hinter.hint_fetch(FleetModel.__name__) + + +def _select_jobs_to_provision(job: Job, replica_jobs: list[Job], job_model: JobModel) -> list[Job]: + jobs_to_provision = [job] + if is_multinode_job(job) and is_master_job(job) and job_model.waiting_master_job is not None: + jobs_to_provision = replica_jobs + return jobs_to_provision + + +def _get_required_targeted_instance_offers(context: _SubmittedJobContext) -> int: + if is_multinode_job(context.job) and is_master_job(context.job): + return len(context.jobs_to_provision) + return 1 + + +def _release_replica_jobs_from_master_wait( + job_model: JobModel, + replica_job_models: list[JobModel], + jobs_to_provision: list[Job], +) -> None: + if len(jobs_to_provision) > 1: + logger.debug("%s: allow replica jobs to be provisioned one-by-one", fmt(job_model)) + for replica_job_model in replica_job_models: + replica_job_model.waiting_master_job = False + + +async def _provision_new_capacity( + project: ProjectModel, + fleet_model: FleetModel, + job_model: JobModel, + run: Run, + jobs: list[Job], + project_ssh_public_key: str, + project_ssh_private_key: str, + master_job_provisioning_data: Optional[JobProvisioningData] = None, + volumes: Optional[list[list[Volume]]] = None, +) -> Union[ + _TerminateSubmittedJobResult, _FailedNewCapacityProvisioning, _ProvisionNewCapacityResult +]: + secrets = await _load_project_secrets(project=project) + jobs = copy.deepcopy(jobs) + for job in jobs: + job.job_spec.image_name, job.job_spec.registry_auth = apply_server_docker_defaults( + job.job_spec.image_name, 
job.job_spec.registry_auth + ) + try: + interpolate_job_spec_secrets(job.job_spec, secrets) + except InterpolatorError as e: + return _TerminateSubmittedJobResult( + reason=JobTerminationReason.TERMINATED_BY_SERVER, + message=f"Secrets interpolation error: {e.args[0]}", + ) + job = jobs[0] + if volumes is None: + volumes = [] + # New-capacity provisioning is reached only for fleet-backed jobs. During + # the transition, legacy in-flight jobs may still have no attached + # instance; otherwise any attached instance here is expected to be the + # placeholder created during assignment. + effective_profile_and_requirements = _get_effective_profile_and_requirements( + job_model=job_model, + run=run, + job=job, + fleet_model=fleet_model, + ) + if effective_profile_and_requirements is None: + return _FailedNewCapacityProvisioning(placement_group_cleanup=None) + profile, requirements = effective_profile_and_requirements + + placement_group_models = await _load_fleet_placement_group_models(fleet_model.id) + new_placement_group_models: list[PlacementGroupModel] = [] + known_placement_group_ids = { + placement_group_model.id for placement_group_model in placement_group_models + } + placement_group_model = get_placement_group_model_for_job( + placement_group_models=placement_group_models, + fleet_model=fleet_model, + ) + multinode = requirements.multinode or is_multinode_job(job) + offers = await get_offers_by_requirements( + project=project, + profile=profile, + requirements=requirements, + exclude_not_available=True, + multinode=multinode, + master_job_provisioning_data=master_job_provisioning_data, + volumes=volumes, + privileged=job.job_spec.privileged, + instance_mounts=check_run_spec_requires_instance_mounts(run.run_spec), + placement_group=placement_group_model_to_placement_group_optional(placement_group_model), + ) + offers_iter = iter(offers) + offers_tried = 0 + while offers_tried < settings.MAX_OFFERS_TRIED: + backend_with_offer = next(offers_iter, None) + if 
backend_with_offer is None: + break + backend, offer = backend_with_offer + logger.debug( + "%s: trying %s in %s/%s for $%0.4f per hour", + fmt(job_model), + offer.instance.name, + offer.backend.value, + offer.region, + offer.price, + ) + offer_volumes = _get_offer_volumes(volumes, offer) + job_configurations = [ + JobConfiguration(job=job_to_run, volumes=offer_volumes) for job_to_run in jobs + ] + compute = backend.compute() + if master_job_provisioning_data is not None: + offer = get_instance_offer_with_restricted_az( + instance_offer=offer, + master_job_provisioning_data=master_job_provisioning_data, + ) + if ( + # The first real instance in an empty cluster fleet is responsible + # for creating/selecting the placement group. A placeholder alone + # must not suppress that path. + not filter_non_placeholder_instances(fleet_model.instances) + and is_cloud_cluster(fleet_model) + and offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT + and isinstance(compute, ComputeWithPlacementGroupSupport) + and ( + compute.are_placement_groups_compatible_with_reservations(offer.backend) + or job.job_spec.requirements.reservation is None + ) + ): + placement_group_model = await find_or_create_suitable_placement_group( + fleet_model=fleet_model, + placement_groups=placement_group_models, + instance_offer=offer, + compute=compute, + ) + if placement_group_model is None: + continue + if placement_group_model.id not in known_placement_group_ids: + new_placement_group_models.append(placement_group_model) + placement_group_models.append(placement_group_model) + known_placement_group_ids.add(placement_group_model.id) + offers_tried += 1 + try: + if len(jobs) > 1 and offer.backend in BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT: + assert isinstance(compute, ComputeWithGroupProvisioningSupport) + compute_group_provisioning_data = await run_async( + compute.run_jobs, + run, + job_configurations, + offer, + project_ssh_public_key, + project_ssh_private_key, + 
placement_group_model_to_placement_group_optional(placement_group_model), + ) + return _ProvisionNewCapacityResult( + provisioning_data=compute_group_provisioning_data, + offer=offer, + effective_profile=profile, + placement_group_cleanup=_build_placement_group_cleanup( + fleet_model=fleet_model, + offers_tried=offers_tried, + selected_placement_group_id=( + None if placement_group_model is None else placement_group_model.id + ), + new_placement_group_models=new_placement_group_models, + ), + ) + job_provisioning_data = await run_async( + compute.run_job, + run, + job, + offer, + project_ssh_public_key, + project_ssh_private_key, + offer_volumes, + placement_group_model_to_placement_group_optional(placement_group_model), + ) + return _ProvisionNewCapacityResult( + provisioning_data=job_provisioning_data, + offer=offer, + effective_profile=profile, + placement_group_cleanup=_build_placement_group_cleanup( + fleet_model=fleet_model, + offers_tried=offers_tried, + selected_placement_group_id=( + None if placement_group_model is None else placement_group_model.id + ), + new_placement_group_models=new_placement_group_models, + ), + ) + except SkipOffer as e: + offers_tried -= 1 + logger.info( + "%s: %s launch in %s/%s skipped: %s", + fmt(job_model), + offer.instance.name, + offer.backend.value, + offer.region, + e, + ) + continue + except BackendError as e: + logger.warning( + "%s: %s launch in %s/%s failed: %s", + fmt(job_model), + offer.instance.name, + offer.backend.value, + offer.region, + repr(e), + ) + continue + except Exception: + logger.exception( + "%s: got exception when launching %s in %s/%s", + fmt(job_model), + offer.instance.name, + offer.backend.value, + offer.region, + ) + continue + return _FailedNewCapacityProvisioning( + placement_group_cleanup=_build_placement_group_cleanup( + fleet_model=fleet_model, + offers_tried=offers_tried, + selected_placement_group_id=None, + new_placement_group_models=new_placement_group_models, + ) + ) + + +async def 
_load_project_secrets(project: ProjectModel) -> dict[str, str]: + async with get_session_ctx() as session: + return await get_project_secrets_mapping(session=session, project=project) + + +async def _load_fleet_placement_group_models(fleet_id: uuid.UUID) -> list["PlacementGroupModel"]: + async with get_session_ctx() as session: + res = await session.execute( + select(PlacementGroupModel) + .where( + and_( + PlacementGroupModel.fleet_id == fleet_id, + PlacementGroupModel.deleted == False, + PlacementGroupModel.fleet_deleted == False, + ) + ) + .options( + joinedload(PlacementGroupModel.project).load_only( + ProjectModel.id, + ProjectModel.name, + ) + ) + ) + return list(res.scalars().all()) + + +def _build_placement_group_cleanup( + fleet_model: FleetModel, + offers_tried: int, + selected_placement_group_id: Optional[uuid.UUID], + new_placement_group_models: list[PlacementGroupModel], +) -> Optional[_PlacementGroupCleanup]: + # Treat placeholder-only fleets as empty so a failed first-instance attempt + # still cleans up placement groups created for that attempt. 
+ if filter_non_placeholder_instances(fleet_model.instances) or offers_tried == 0: + return None + return _PlacementGroupCleanup( + fleet_id=fleet_model.id, + selected_placement_group_id=selected_placement_group_id, + new_placement_group_models=new_placement_group_models, + ) + + +async def _load_placement_group_cleanup_fleet( + session: AsyncSession, + fleet_id: uuid.UUID, +) -> FleetModel: + res = await session.execute( + select(FleetModel) + .where(FleetModel.id == fleet_id) + .options(joinedload(FleetModel.project).load_only(ProjectModel.id, ProjectModel.name)) + ) + return res.unique().scalar_one() + + +async def _persist_placement_group_cleanup( + session: AsyncSession, + fleet_model: FleetModel, + project: ProjectModel, + placement_group_cleanup: Optional[_PlacementGroupCleanup], +) -> None: + if placement_group_cleanup is None: + return + + assert fleet_model.id == placement_group_cleanup.fleet_id + except_placement_group_ids = () + if placement_group_cleanup.selected_placement_group_id is not None: + except_placement_group_ids = (placement_group_cleanup.selected_placement_group_id,) + await schedule_fleet_placement_groups_deletion( + session=session, + fleet_id=placement_group_cleanup.fleet_id, + except_placement_group_ids=except_placement_group_ids, + ) + for placement_group_model in placement_group_cleanup.new_placement_group_models: + placement_group_model.project = project + placement_group_model.fleet = fleet_model + placement_group_model.fleet_deleted = ( + placement_group_model.id != placement_group_cleanup.selected_placement_group_id + ) + session.add(placement_group_model) + + +def _get_effective_profile_and_requirements( + job_model: JobModel, + run: Run, + job: Job, + fleet_model: FleetModel, +) -> Optional[tuple[Profile, Requirements]]: + fleet_spec = get_fleet_spec(fleet_model) + try: + effective_profile, requirements = get_run_profile_and_requirements_in_fleet( + job=job, + run_spec=run.run_spec, + fleet_spec=fleet_spec, + ) + except 
ValueError as e: + logger.debug("%s: %s", fmt(job_model), e.args[0]) + return None + # TODO: Respect fleet provisioning properties such as tags. + return effective_profile, requirements + + +def _get_offer_volumes( + volumes: list[list[Volume]], + offer: InstanceOfferWithAvailability, +) -> list[Volume]: + return [ + _get_offer_mount_point_volume(mount_point_volumes, offer) + for mount_point_volumes in volumes + ] + + +def _get_offer_mount_point_volume( + volumes: list[Volume], + offer: InstanceOfferWithAvailability, +) -> Volume: + for volume in volumes: + if ( + volume.get_backend() != offer.backend + or volume.get_region().lower() != offer.region.lower() + ): + continue + return volume + raise ServerClientError("Failed to find an eligible volume for the mount point") + + +async def _defer_submitted_job( + session: AsyncSession, + job_model: JobModel, + log_message: str, +) -> None: + logger.debug("%s: %s", fmt(job_model), log_message) + await _mark_job_processed(session=session, job_model=job_model) + + +async def _terminate_submitted_job( + session: AsyncSession, + job_model: JobModel, + reason: JobTerminationReason, + message: Optional[str] = None, +) -> None: + job_model.termination_reason = reason + if message is not None: + job_model.termination_reason_message = message + switch_job_status(session, job_model, JobStatus.TERMINATING) + await _mark_job_processed(session=session, job_model=job_model) + + +async def _mark_job_processed(session: AsyncSession, job_model: JobModel) -> None: + job_model.last_processed_at = get_current_datetime() + job_model.lock_expires_at = None + job_model.lock_token = None + job_model.lock_owner = None + await session.commit() + + +async def _reset_job_lock_for_retry( + session: AsyncSession, + item: JobSubmittedPipelineItem, +) -> None: + res = await session.execute( + update(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + # Keep `lock_owner` so retry paths preserve submitted-jobs 
ownership intent + # while dropping only the stale token/expiry fields. + .values( + lock_expires_at=None, + lock_token=None, + last_processed_at=get_current_datetime(), + ) + .returning(JobModel.id) + ) + if res.scalar_one_or_none() is None: + log_lock_token_changed_on_reset(logger) + await session.commit() diff --git a/src/dstack/_internal/server/background/pipeline_tasks/jobs_terminating.py b/src/dstack/_internal/server/background/pipeline_tasks/jobs_terminating.py new file mode 100644 index 0000000000..be8d80948d --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/jobs_terminating.py @@ -0,0 +1,1027 @@ +import asyncio +import uuid +from collections.abc import Mapping +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Optional, Sequence, TypedDict + +import httpx +from sqlalchemy import and_, delete, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport +from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import BackendError, GatewayError, SSHError +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.runs import ( + JobProvisioningData, + JobRuntimeData, + JobSpec, + JobStatus, + JobTerminationReason, + RunTerminationReason, +) +from dstack._internal.server import settings +from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + UpdateMapDateTime, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_changed_on_reset, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) 
+from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + InstanceModel, + JobModel, + ProjectModel, + RunModel, + VolumeAttachmentModel, + VolumeModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events +from dstack._internal.server.services.gateways import get_or_add_gateway_connections +from dstack._internal.server.services.instances import ( + emit_instance_status_change_event, + get_instance_ssh_private_keys, + is_placeholder_instance, +) +from dstack._internal.server.services.jobs import ( + emit_job_status_change_event, + get_job_provisioning_data, + get_job_runtime_data, + get_job_spec, + stop_runner, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.runner import client +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from dstack._internal.server.services.volumes import ( + volume_model_to_volume, +) +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils import common +from dstack._internal.utils.common import get_current_datetime, get_or_error +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class JobTerminatingPipelineItem(PipelineItem): + volumes_detached_at: Optional[datetime] + + +class JobTerminatingPipeline(Pipeline[JobTerminatingPipelineItem]): + def __init__( + self, + workers_num: int = 20, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=2), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + 
super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[JobTerminatingPipelineItem]( + model_type=JobModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = JobTerminatingFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + JobTerminatingWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return JobModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[JobTerminatingPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[JobTerminatingPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["JobTerminatingWorker"]: + return self.__workers + + +class JobTerminatingFetcher(Fetcher[JobTerminatingPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobTerminatingPipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[JobTerminatingPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("JobTerminatingFetcher.fetch") + async def fetch(self, limit: int) -> list[JobTerminatingPipelineItem]: + job_lock, _ = 
get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) + async with job_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(JobModel) + .where( + JobModel.status == JobStatus.TERMINATING, + or_( + JobModel.remove_at.is_(None), + JobModel.remove_at < now, + ), + or_( + # Processing volumes detach can be less frequent since it may take time. + and_( + JobModel.last_processed_at <= now - self._min_processing_interval, + JobModel.volumes_detached_at.is_(None), + ), + and_( + JobModel.last_processed_at + <= now - self._min_processing_interval * 2, + JobModel.volumes_detached_at.is_not(None), + ), + JobModel.skip_min_processing_interval == True, + ), + or_( + JobModel.lock_expires_at.is_(None), + JobModel.lock_expires_at < now, + ), + or_( + JobModel.lock_owner.is_(None), + JobModel.lock_owner == JobTerminatingPipeline.__name__, + ), + ) + .order_by(JobModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=JobModel) + .options( + load_only( + JobModel.id, + JobModel.lock_token, + JobModel.lock_expires_at, + JobModel.volumes_detached_at, + JobModel.skip_min_processing_interval, + ) + ) + ) + job_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for job_model in job_models: + prev_lock_expired = job_model.lock_expires_at is not None + job_model.lock_expires_at = lock_expires_at + job_model.lock_token = lock_token + job_model.lock_owner = JobTerminatingPipeline.__name__ + job_model.skip_min_processing_interval = False + items.append( + JobTerminatingPipelineItem( + __tablename__=JobModel.__tablename__, + id=job_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + volumes_detached_at=job_model.volumes_detached_at, + ) + ) + await session.commit() + return items + + +class 
JobTerminatingWorker(Worker[JobTerminatingPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[JobTerminatingPipelineItem], + heartbeater: Heartbeater[JobTerminatingPipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("JobTerminatingWorker.process") + async def process(self, item: JobTerminatingPipelineItem): + async with get_session_ctx() as session: + job_model = await _refetch_locked_job(session=session, item=item) + if job_model is None: + log_lock_token_mismatch(logger, item) + return + + instance_model: Optional[InstanceModel] = None + if job_model.used_instance_id is not None: + instance_model = await _lock_related_instance( + session=session, + item=item, + instance_id=job_model.used_instance_id, + ) + if instance_model is None: + await _reset_job_lock_for_retry(session=session, item=item) + return + + if job_model.volumes_detached_at is None: + result = await _process_terminating_job( + job_model=job_model, + instance_model=instance_model, + ) + else: + result = await _process_job_volumes_detaching( + job_model=job_model, + instance_model=get_or_error(instance_model), + ) + + await _apply_process_result( + item=item, + job_model=job_model, + instance_model=instance_model, + result=result, + ) + if ( + result.instance_update_map is not None + and result.instance_update_map.get("status") == InstanceStatus.TERMINATING + ): + self._pipeline_hinter.hint_fetch(InstanceModel.__name__) + # TODO: Hint RunPipeline to quickly move run to TERMINATED. + # Currently not implemented since it also requires making run eligible for processing. + # (This pipeline cannot modify runs so it's not simple). 
+ + +class _JobUpdateMap(ItemUpdateMap, total=False): + status: JobStatus + termination_reason: Optional[JobTerminationReason] + termination_reason_message: Optional[str] + instance_id: Optional[uuid.UUID] + graceful_termination_attempts: int + volumes_detached_at: UpdateMapDateTime + registered: bool + remove_at: UpdateMapDateTime + + +class _InstanceUpdateMap(ItemUpdateMap, total=False): + status: InstanceStatus + termination_reason: Optional[InstanceTerminationReason] + termination_reason_message: Optional[str] + busy_blocks: int + last_job_processed_at: UpdateMapDateTime + skip_min_processing_interval: bool + + +class _VolumeUpdateRow(TypedDict): + id: uuid.UUID + last_job_processed_at: UpdateMapDateTime + + +@dataclass +class _UnregisterReplicaResult: + gateway_target: Optional[events.Target] # None = no gateway + + +@dataclass +class _ProcessResult: + job_update_map: _JobUpdateMap = field(default_factory=_JobUpdateMap) + instance_update_map: Optional[_InstanceUpdateMap] = None + volume_update_rows: list[_VolumeUpdateRow] = field(default_factory=list) + detached_volume_ids: set[uuid.UUID] = field(default_factory=set) + unassign_event_message: Optional[str] = None + graceful_stop_event_message: Optional[str] = None + replica_unregistration: Optional[_UnregisterReplicaResult] = ( + None # None = not unregistered yet + ) + + +@dataclass +class _VolumeDetachResult: + all_detached: bool + detached_volume_ids: set[uuid.UUID] = field(default_factory=set) + set_volumes_detached_at: bool = False + + +async def _refetch_locked_job( + session: AsyncSession, item: JobTerminatingPipelineItem +) -> Optional[JobModel]: + res = await session.execute( + select(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + .options( + joinedload(JobModel.run).load_only( + RunModel.id, + RunModel.project_id, + RunModel.run_name, + RunModel.gateway_id, + RunModel.termination_reason, + ), + joinedload(JobModel.run) + .joinedload(RunModel.project) + 
.load_only(ProjectModel.id, ProjectModel.name), + ) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one_or_none() + + +async def _lock_related_instance( + session: AsyncSession, + item: JobTerminatingPipelineItem, + instance_id: uuid.UUID, +) -> Optional[InstanceModel]: + lock_owner = _get_related_instance_lock_owner(item.id) + instance_lock, _ = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__) + async with instance_lock: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == instance_id, + or_( + InstanceModel.lock_expires_at.is_(None), + InstanceModel.lock_expires_at < get_current_datetime(), + ), + or_( + InstanceModel.lock_owner.is_(None), + InstanceModel.lock_owner == lock_owner, + ), + ) + .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) + .options( + joinedload(InstanceModel.volume_attachments).joinedload( + VolumeAttachmentModel.volume + ) + ) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id)) + .with_for_update(skip_locked=True, key_share=True, of=InstanceModel) + ) + instance_model = res.unique().scalar_one_or_none() + if instance_model is None: + return None + instance_model.lock_expires_at = item.lock_expires_at + instance_model.lock_token = item.lock_token + instance_model.lock_owner = lock_owner + return instance_model + + +async def _load_job_volume_models( + job_model: JobModel, + instance_model: Optional[InstanceModel], +) -> list[VolumeModel]: + if instance_model is None: + return [] + jrd = get_job_runtime_data(job_model) + volume_names = ( + jrd.volume_names + if jrd and jrd.volume_names + else [va.volume.name for va in instance_model.volume_attachments] + ) + if len(volume_names) == 0: + return [] + async with get_session_ctx() as session: + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.project_id == instance_model.project.id, + VolumeModel.name.in_(volume_names), + VolumeModel.deleted 
== False, + ) + .options(joinedload(VolumeModel.project)) + .options(joinedload(VolumeModel.user)) + .options( + joinedload(VolumeModel.attachments) + .joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + ) + ) + return list(res.unique().scalars().all()) + + +async def _reset_job_lock_for_retry(session: AsyncSession, item: JobTerminatingPipelineItem): + res = await session.execute( + update(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + # Keep `lock_owner` so that `InstancePipeline` can check that the job is being locked + # but unset `lock_expires_at` to process the item again ASAP (after `min_processing_interval`). + # Unset `lock_token` so that heartbeater can no longer update the item. + .values( + lock_expires_at=None, + lock_token=None, + last_processed_at=get_current_datetime(), + ) + .returning(JobModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_on_reset(logger) + + +async def _apply_process_result( + item: JobTerminatingPipelineItem, + job_model: JobModel, + instance_model: Optional[InstanceModel], + result: _ProcessResult, +) -> None: + set_processed_update_map_fields(result.job_update_map) + set_unlock_update_map_fields(result.job_update_map) + if instance_model is not None and result.instance_update_map is None: + result.instance_update_map = _InstanceUpdateMap() + if result.instance_update_map is not None: + set_processed_update_map_fields(result.instance_update_map) + set_unlock_update_map_fields(result.instance_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + related_instance_lock_owner = _get_related_instance_lock_owner(item.id) + instance_update_map = result.instance_update_map + if instance_model is None: + instance_update_map = None + resolve_now_placeholders(result.job_update_map, now=now) + if instance_update_map is not None: + resolve_now_placeholders(instance_update_map, 
now=now) + if result.volume_update_rows: + resolve_now_placeholders(result.volume_update_rows, now=now) + + res = await session.execute( + update(JobModel) + .where( + JobModel.id == item.id, + JobModel.lock_token == item.lock_token, + ) + .values(**result.job_update_map) + .returning(JobModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + if instance_model is not None: + await _unlock_related_instance( + session=session, + item=item, + instance_id=instance_model.id, + ) + return + + if instance_model is not None and instance_update_map is not None: + res = await session.execute( + update(InstanceModel) + .where( + InstanceModel.id == instance_model.id, + InstanceModel.lock_token == item.lock_token, + InstanceModel.lock_owner == related_instance_lock_owner, + ) + .values(**instance_update_map) + .returning(InstanceModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.error( + "Failed to update related instance %s for terminating job %s.", + instance_model.id, + item.id, + ) + + if result.volume_update_rows: + # Safe to update volumes without lock as long as no other pipeline/task + # updates active attached volumes and/or the races are accepted. 
+ await session.execute(update(VolumeModel), result.volume_update_rows) + + if result.detached_volume_ids and instance_model is not None: + await session.execute( + delete(VolumeAttachmentModel).where( + VolumeAttachmentModel.instance_id == instance_model.id, + VolumeAttachmentModel.volume_id.in_(result.detached_volume_ids), + ) + ) + + emit_job_status_change_event( + session=session, + job_model=job_model, + old_status=job_model.status, + new_status=result.job_update_map.get("status", job_model.status), + termination_reason=result.job_update_map.get( + "termination_reason", job_model.termination_reason + ), + termination_reason_message=result.job_update_map.get( + "termination_reason_message", + job_model.termination_reason_message, + ), + ) + + if instance_model is not None and instance_update_map is not None: + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + new_status=instance_update_map.get("status", instance_model.status), + termination_reason=instance_update_map.get( + "termination_reason", + instance_model.termination_reason, + ), + termination_reason_message=instance_update_map.get( + "termination_reason_message", + instance_model.termination_reason_message, + ), + ) + + if result.unassign_event_message is not None and instance_model is not None: + events.emit( + session, + result.unassign_event_message, + actor=events.SystemActor(), + targets=[ + events.Target.from_model(job_model), + events.Target.from_model(instance_model), + ], + ) + + if result.graceful_stop_event_message is not None and instance_model is not None: + events.emit( + session, + result.graceful_stop_event_message, + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + if result.replica_unregistration is not None: + targets = [events.Target.from_model(job_model)] + if result.replica_unregistration.gateway_target is not None: + 
targets.append(result.replica_unregistration.gateway_target) + events.emit( + session, + "Service replica unregistered from receiving requests", + actor=events.SystemActor(), + targets=targets, + ) + + +async def _unlock_related_instance( + session: AsyncSession, + item: JobTerminatingPipelineItem, + instance_id: uuid.UUID, +) -> None: + await session.execute( + update(InstanceModel) + .where( + InstanceModel.id == instance_id, + InstanceModel.lock_token == item.lock_token, + InstanceModel.lock_owner == _get_related_instance_lock_owner(item.id), + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + ) + ) + + +async def _process_terminating_job( + job_model: JobModel, + instance_model: Optional[InstanceModel], +) -> _ProcessResult: + """ + Terminates the job: + 1. tells the runner to stop the job's command + 2. tells the shim to stop the container + 3. detaches the job from the instance + 4. and detaches volumes from the instance. + """ + instance_update_map = None if instance_model is None else _InstanceUpdateMap() + result = _ProcessResult(instance_update_map=instance_update_map) + + if instance_model is None: + await _unregister_replica_and_update_result(result=result, job_model=job_model) + result.job_update_map["status"] = _get_job_termination_status(job_model) + return result + + if is_placeholder_instance(instance_model): + # Placeholder has no VM and no provisioning data. Skip graceful stop, + # container stop, and volume detach. 
+ instance_update_map = get_or_error(result.instance_update_map) + if instance_model.status != InstanceStatus.TERMINATING: + instance_update_map["status"] = InstanceStatus.TERMINATING + instance_update_map["skip_min_processing_interval"] = True + instance_update_map["termination_reason"] = InstanceTerminationReason.JOB_FINISHED + result.job_update_map["instance_id"] = None + await _unregister_replica_and_update_result(result=result, job_model=job_model) + result.job_update_map["status"] = _get_job_termination_status(job_model) + return result + + if job_model.graceful_termination_attempts == 0 and job_model.remove_at is None: + result.job_update_map = await _stop_job_gracefully(job_model, instance_model) + result.graceful_stop_event_message = "Graceful job stop requested" + return result + + jrd = get_job_runtime_data(job_model) + jpd = get_job_provisioning_data(job_model) + if jpd is not None and jpd.hostname is not None and jpd.ssh_port is not None: + logger.debug("%s: stopping container", fmt(job_model)) + ssh_private_keys = get_instance_ssh_private_keys(instance_model) + if not await _stop_container(job_model, jpd, ssh_private_keys): + # Dangling containers (tasks) are cleared periodically on instance checks by + # `remove_dangling_tasks_from_instance()` + logger.warning( + ( + "%s: could not stop container, possibly due to a communication error." + " See debug logs for details." 
+ " Ignoring, can attempt to remove the container later" + ), + fmt(job_model), + ) + + ( + result.volume_update_rows, + detach_result, + ) = await _detach_job_volumes( + job_model=job_model, + instance_model=instance_model, + job_provisioning_data=jpd, + ) + result.detached_volume_ids = detach_result.detached_volume_ids + if detach_result.set_volumes_detached_at: + result.job_update_map["volumes_detached_at"] = NOW_PLACEHOLDER + + instance_update_map = get_or_error(result.instance_update_map) + busy_blocks = instance_model.busy_blocks - _get_job_occupied_blocks(jrd) + instance_update_map["busy_blocks"] = busy_blocks + if instance_model.status != InstanceStatus.BUSY or jpd is None or not jpd.dockerized: + if instance_model.status not in InstanceStatus.finished_statuses(): + instance_update_map["termination_reason"] = InstanceTerminationReason.JOB_FINISHED + if instance_model.status != InstanceStatus.TERMINATING: + instance_update_map["status"] = InstanceStatus.TERMINATING + instance_update_map["skip_min_processing_interval"] = True + elif not [j for j in instance_model.jobs if j.id != job_model.id]: + instance_update_map["status"] = InstanceStatus.IDLE + + result.job_update_map["instance_id"] = None + instance_update_map["last_job_processed_at"] = NOW_PLACEHOLDER + result.unassign_event_message = ( + "Job unassigned from instance." + f" Instance blocks: {busy_blocks}/{instance_model.total_blocks} busy" + ) + + await _unregister_replica_and_update_result(result=result, job_model=job_model) + if detach_result.all_detached: + result.job_update_map["status"] = _get_job_termination_status(job_model) + return result + + +async def _stop_job_gracefully( + job_model: JobModel, instance_model: InstanceModel +) -> _JobUpdateMap: + """ + Tells the runner to stop the job's command. Records the first graceful-stop attempt and + sets `remove_at` so `_process_terminating_job()` stops the container on a later iteration. 
+ """ + job_update_map = _JobUpdateMap() + await stop_runner(job_model=job_model, instance_model=instance_model) + job_update_map["graceful_termination_attempts"] = 1 + job_update_map["remove_at"] = get_current_datetime() + timedelta(seconds=10) + return job_update_map + + +async def _process_job_volumes_detaching( + job_model: JobModel, + instance_model: InstanceModel, +) -> _ProcessResult: + """ + Called after job's volumes have been soft detached to check if they are detached. + Terminates the job when all the volumes are detached. + If the volumes fail to detach, force detaches them. + """ + result = _ProcessResult(instance_update_map=_InstanceUpdateMap()) + jpd = get_or_error(get_job_provisioning_data(job_model)) + ( + result.volume_update_rows, + detach_result, + ) = await _detach_job_volumes( + job_model=job_model, + instance_model=instance_model, + job_provisioning_data=jpd, + ) + result.detached_volume_ids = detach_result.detached_volume_ids + if detach_result.all_detached: + result.job_update_map["status"] = _get_job_termination_status(job_model) + return result + + +async def _detach_job_volumes( + job_model: JobModel, + instance_model: InstanceModel, + job_provisioning_data: Optional[JobProvisioningData], +) -> tuple[list[_VolumeUpdateRow], _VolumeDetachResult]: + volume_models = await _load_job_volume_models( + job_model=job_model, instance_model=instance_model + ) + volume_update_rows = _get_volume_update_rows(volume_models) + if len(volume_models) == 0: + return volume_update_rows, _VolumeDetachResult(all_detached=True) + + if job_provisioning_data is None: + return volume_update_rows, _VolumeDetachResult(all_detached=True) + + logger.info("Detaching volumes: %s", [v.name for v in volume_models]) + detach_result = await _detach_volumes_from_job_instance( + job_model=job_model, + instance_model=instance_model, + volume_models=volume_models, + jpd=job_provisioning_data, + run_termination_reason=job_model.run.termination_reason, + ) + return 
volume_update_rows, detach_result + + +async def _unregister_replica_and_update_result( + result: _ProcessResult, job_model: JobModel +) -> None: + gateway_target = await _unregister_replica(job_model=job_model) + if job_model.registered: + result.job_update_map["registered"] = False + result.replica_unregistration = _UnregisterReplicaResult(gateway_target=gateway_target) + + +async def _unregister_replica( + job_model: JobModel, +) -> Optional[events.Target]: + if not job_model.registered: + return None + gateway_target = None + run_model = job_model.run + if run_model.gateway_id is not None: + async with get_session_ctx() as session: + gateway, connections = await get_or_add_gateway_connections( + session, run_model.gateway_id + ) + gateway_target = events.Target.from_model(gateway) + for conn in connections: + try: + logger.debug( + "%s: unregistering replica from service %s on gateway replica %s", + fmt(job_model), + job_model.run_id.hex, + conn.ip_address, + ) + async with conn.client() as client: + await client.unregister_replica( + project=run_model.project.name, + run_name=run_model.run_name, + job_id=job_model.id, + ) + except GatewayError as e: + logger.warning( + "%s: unregistering replica from service on gateway replica %s: %s", + fmt(job_model), + conn.ip_address, + e, + ) + except (httpx.RequestError, SSHError) as e: + logger.debug("Gateway request failed", exc_info=True) + # FIXME: Unhandled exception raised. + # Handle and retry unregister with timeout. 
+ raise GatewayError(repr(e)) + return gateway_target + + +def _get_job_termination_status(job_model: JobModel) -> JobStatus: + if job_model.termination_reason is not None: + return job_model.termination_reason.to_status() + return JobStatus.FAILED + + +def _get_volume_update_rows(volume_models: list[VolumeModel]) -> list[_VolumeUpdateRow]: + return [ + { + "id": volume_model.id, + "last_job_processed_at": NOW_PLACEHOLDER, + } + for volume_model in volume_models + ] + + +def _get_job_occupied_blocks(jrd: Optional[JobRuntimeData]) -> int: + if jrd is not None and jrd.offer is not None: + return jrd.offer.blocks + return 1 + + +async def _stop_container( + job_model: JobModel, + job_provisioning_data: JobProvisioningData, + ssh_private_keys: tuple[str, Optional[str]], +) -> bool: + if job_provisioning_data.dockerized: + return await common.run_async( + _shim_submit_stop, + ssh_private_keys, + job_provisioning_data, + None, + job_model, + ) + return True + + +@runner_ssh_tunnel +def _shim_submit_stop(addresses: Mapping[int, client.LocalAddress], job_model: JobModel) -> bool: + shim_client = client.ShimClient.from_address(addresses[DSTACK_SHIM_HTTP_PORT]) + + resp = shim_client.healthcheck() + if resp is None: + logger.debug("%s: can't stop container, shim is not available yet", fmt(job_model)) + return False + + if shim_client.is_api_v2_supported(): + reason = ( + None if job_model.termination_reason is None else job_model.termination_reason.value + ) + shim_client.terminate_task( + task_id=job_model.id, + reason=reason, + message=job_model.termination_reason_message, + timeout=0, + ) + if not settings.SERVER_KEEP_SHIM_TASKS: + shim_client.remove_task(task_id=job_model.id) + else: + shim_client.stop(force=True) + return True + + +async def _detach_volumes_from_job_instance( + job_model: JobModel, + instance_model: InstanceModel, + volume_models: list[VolumeModel], + jpd: JobProvisioningData, + run_termination_reason: Optional[RunTerminationReason], +) -> 
_VolumeDetachResult: + job_spec = get_job_spec(job_model) + backend = await backends_services.get_project_backend_by_type( + project=instance_model.project, + backend_type=jpd.backend, + ) + if backend is None: + logger.error( + "Failed to detach volumes from %s. Backend not available.", instance_model.name + ) + return _VolumeDetachResult(all_detached=False) + + detached_volume_ids = set() + all_detached = True + for volume_model in volume_models: + detached = await _detach_volume_from_job_instance( + backend=backend, + job_model=job_model, + jpd=jpd, + job_spec=job_spec, + instance_model=instance_model, + volume_model=volume_model, + run_termination_reason=run_termination_reason, + ) + if detached: + detached_volume_ids.add(volume_model.id) + else: + all_detached = False + + return _VolumeDetachResult( + all_detached=all_detached, + detached_volume_ids=detached_volume_ids, + set_volumes_detached_at=job_model.volumes_detached_at is None, + ) + + +async def _detach_volume_from_job_instance( + backend: Backend, + job_model: JobModel, + jpd: JobProvisioningData, + job_spec: JobSpec, + instance_model: InstanceModel, + volume_model: VolumeModel, + run_termination_reason: Optional[RunTerminationReason], +) -> bool: + detached = True + volume = volume_model_to_volume(volume_model) + if volume.provisioning_data is None or not volume.provisioning_data.detachable: + return detached + compute = backend.compute() + assert isinstance(compute, ComputeWithVolumeSupport) + try: + if job_model.volumes_detached_at is None: + await common.run_async( + compute.detach_volume, + volume=volume, + provisioning_data=jpd, + force=False, + ) + detached = await common.run_async( + compute.is_volume_detached, + volume=volume, + provisioning_data=jpd, + ) + else: + detached = await common.run_async( + compute.is_volume_detached, + volume=volume, + provisioning_data=jpd, + ) + if not detached and _should_force_detach_volume( + job_model=job_model, + 
run_termination_reason=run_termination_reason, + stop_duration=job_spec.stop_duration, + ): + logger.info( + "Force detaching volume %s from %s", + volume_model.name, + instance_model.name, + ) + await common.run_async( + compute.detach_volume, + volume=volume, + provisioning_data=jpd, + force=True, + ) + except BackendError as e: + logger.error( + "Failed to detach volume %s from %s: %s", + volume_model.name, + instance_model.name, + repr(e), + ) + except Exception: + logger.exception( + "Got exception when detaching volume %s from instance %s", + volume_model.name, + instance_model.name, + ) + return detached + + +_MIN_FORCE_DETACH_WAIT_PERIOD = timedelta(seconds=60) + + +def _should_force_detach_volume( + job_model: JobModel, + run_termination_reason: Optional[RunTerminationReason], + stop_duration: Optional[int], +) -> bool: + now = get_current_datetime() + return ( + job_model.volumes_detached_at is not None + and now > job_model.volumes_detached_at + _MIN_FORCE_DETACH_WAIT_PERIOD + and ( + job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER + or run_termination_reason == RunTerminationReason.ABORTED_BY_USER + or stop_duration is not None + and now > job_model.volumes_detached_at + timedelta(seconds=stop_duration) + ) + ) + + +def _get_related_instance_lock_owner(job_id: uuid.UUID) -> str: + return f"{JobTerminatingPipeline.__name__}:{job_id}" diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py new file mode 100644 index 0000000000..e5160a8ea1 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -0,0 +1,281 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import 
ComputeWithPlacementGroupSupport +from dstack._internal.core.errors import PlacementGroupInUseError +from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + UpdateMapDateTime, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + PlacementGroupModel, + ProjectModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.placement import placement_group_model_to_placement_group +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class PlacementGroupPipeline(Pipeline[PipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[PipelineItem]( + model_type=PlacementGroupModel, + lock_timeout=self._lock_timeout, + 
heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = PlacementGroupFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + PlacementGroupWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return PlacementGroupModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[PipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[PipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["PlacementGroupWorker"]: + return self.__workers + + +class PlacementGroupFetcher(Fetcher[PipelineItem]): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[PipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("PlacementGroupFetcher.fetch") + async def fetch(self, limit: int) -> list[PipelineItem]: + placement_group_lock, _ = get_locker(get_db().dialect_name).get_lockset( + PlacementGroupModel.__tablename__ + ) + async with placement_group_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(PlacementGroupModel) + .where( + PlacementGroupModel.fleet_deleted == True, + PlacementGroupModel.deleted == False, + PlacementGroupModel.last_processed_at + <= now - self._min_processing_interval, + or_( + 
PlacementGroupModel.lock_expires_at.is_(None), + PlacementGroupModel.lock_expires_at < now, + ), + or_( + PlacementGroupModel.lock_owner.is_(None), + PlacementGroupModel.lock_owner == PlacementGroupPipeline.__name__, + ), + ) + .order_by(PlacementGroupModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=PlacementGroupModel) + .options( + load_only( + PlacementGroupModel.id, + PlacementGroupModel.lock_token, + PlacementGroupModel.lock_expires_at, + ) + ) + ) + placement_group_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for placement_group_model in placement_group_models: + prev_lock_expired = placement_group_model.lock_expires_at is not None + placement_group_model.lock_expires_at = lock_expires_at + placement_group_model.lock_token = lock_token + placement_group_model.lock_owner = PlacementGroupPipeline.__name__ + items.append( + PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=placement_group_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + ) + ) + await session.commit() + return items + + +class PlacementGroupWorker(Worker[PipelineItem]): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + heartbeater: Heartbeater[PipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("PlacementGroupWorker.process") + async def process(self, item: PipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(PlacementGroupModel) + .where( + PlacementGroupModel.id == item.id, + PlacementGroupModel.lock_token == item.lock_token, + ) + .options(joinedload(PlacementGroupModel.project).joinedload(ProjectModel.backends)) + ) + placement_group_model = 
res.unique().scalar_one_or_none() + if placement_group_model is None: + log_lock_token_mismatch(logger, item) + return + + result = await _delete_placement_group(placement_group_model) + update_map = result.update_map + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + if update_map.get("deleted", False): + logger.info("Deleted placement group %s", placement_group_model.name) + + async with get_session_ctx() as session: + resolve_now_placeholders(update_map, now=get_current_datetime()) + res = await session.execute( + update(PlacementGroupModel) + .where( + PlacementGroupModel.id == placement_group_model.id, + PlacementGroupModel.lock_token == placement_group_model.lock_token, + ) + .values(**update_map) + .returning(PlacementGroupModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + + +class _PlacementGroupUpdateMap(ItemUpdateMap, total=False): + deleted: bool + deleted_at: UpdateMapDateTime + + +@dataclass +class _DeleteResult: + update_map: _PlacementGroupUpdateMap = field(default_factory=_PlacementGroupUpdateMap) + + +async def _delete_placement_group( + placement_group_model: PlacementGroupModel, +) -> _DeleteResult: + placement_group = placement_group_model_to_placement_group(placement_group_model) + if placement_group.provisioning_data is None: + logger.error( + "Failed to delete placement group %s. provisioning_data is None.", placement_group.name + ) + return _get_deleted_result() + backend = await backends_services.get_project_backend_by_type( + project=placement_group_model.project, + backend_type=placement_group.provisioning_data.backend, + ) + if backend is None: + # TODO: Retry deletion + logger.error( + "Failed to delete placement group %s. Backend not available. 
Please delete it manually.", + placement_group.name, + ) + return _get_deleted_result() + compute = backend.compute() + assert isinstance(compute, ComputeWithPlacementGroupSupport) + try: + await run_async(compute.delete_placement_group, placement_group) + except PlacementGroupInUseError: + logger.info( + "Placement group %s is still in use. Skipping deletion for now.", placement_group.name + ) + return _DeleteResult() + except Exception: + # TODO: Retry deletion + logger.exception( + "Got exception when deleting placement group %s. Please delete it manually.", + placement_group.name, + ) + return _get_deleted_result() + + +def _get_deleted_result() -> _DeleteResult: + update_map = _PlacementGroupUpdateMap() + update_map["deleted"] = True + update_map["deleted_at"] = NOW_PLACEHOLDER + return _DeleteResult(update_map=update_map) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/runs/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/runs/__init__.py new file mode 100644 index 0000000000..071af9fbd3 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/runs/__init__.py @@ -0,0 +1,969 @@ +import asyncio +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Optional, Sequence + +from sqlalchemy import and_, func, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only + +import dstack._internal.server.background.pipeline_tasks.runs.active as active +import dstack._internal.server.background.pipeline_tasks.runs.pending as pending +import dstack._internal.server.background.pipeline_tasks.runs.terminating as terminating +from dstack._internal.core.models.runs import JobStatus, RunStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + 
log_lock_token_changed_on_reset, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel, RunModel +from dstack._internal.server.services import events +from dstack._internal.server.services.gateways import get_combined_gateway_stats +from dstack._internal.server.services.jobs import emit_job_status_change_event +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.prometheus.client_metrics import run_metrics +from dstack._internal.server.services.runs import emit_run_status_change_event, get_run_spec +from dstack._internal.server.services.secrets import get_project_secrets_mapping +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# No need to lock finished or terminating jobs since run processing does not update them. 
+JOB_STATUSES_EXCLUDED_FOR_LOCKING = JobStatus.finished_statuses() + [JobStatus.TERMINATING] + +RUN_STATUSES_WITH_MIN_PROCESSING_INTERVAL = [RunStatus.SUBMITTED, RunStatus.TERMINATING] + + +@dataclass +class RunPipelineItem(PipelineItem): + status: RunStatus + + +class RunPipeline(Pipeline[RunPipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=5), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + *, + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[RunPipelineItem]( + model_type=RunModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = RunFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + RunWorker( + queue=self._queue, + heartbeater=self._heartbeater, + pipeline_hinter=pipeline_hinter, + ) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return RunModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[RunPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[RunPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["RunWorker"]: + return self.__workers + + +class RunFetcher(Fetcher[RunPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[RunPipelineItem], + 
queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[RunPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("RunFetcher.fetch") + async def fetch(self, limit: int) -> list[RunPipelineItem]: + if limit <= 0: + return [] + + run_lock, _ = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__) + async with run_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(RunModel) + .where( + # Filter out runs that do not need processing. + # This is only to reduce unnecessary fetch/apply churn. + # Otherwise, we could fetch all active runs and filter them in the worker. + or_( + # Active non-pending runs. + RunModel.status.not_in( + RunStatus.finished_statuses() + [RunStatus.PENDING] + ), + # Retrying runs. + and_( + RunModel.status == RunStatus.PENDING, + RunModel.resubmission_attempt > 0, + ), + # Scheduled ready runs. + and_( + RunModel.status == RunStatus.PENDING, + RunModel.resubmission_attempt == 0, + RunModel.next_triggered_at.is_not(None), + RunModel.next_triggered_at < now, + ), + # Scaled-to-zero runs. + # Such runs cannot be scheduled, so we detect them via + # `next_triggered_at is None`. + # If scheduled services ever support downscaling to zero, + # this selector must be revisited. + and_( + RunModel.status == RunStatus.PENDING, + RunModel.resubmission_attempt == 0, + RunModel.next_triggered_at.is_(None), + ), + ), + or_( + # Process submitted and terminating runs quicker for low-latency state transition. + # Active run processing can be less frequent to minimize contention with `JobRunningPipeline`. 
+ and_( + RunModel.status.in_(RUN_STATUSES_WITH_MIN_PROCESSING_INTERVAL), + RunModel.last_processed_at <= now - self._min_processing_interval, + ), + and_( + RunModel.status.not_in(RUN_STATUSES_WITH_MIN_PROCESSING_INTERVAL), + RunModel.last_processed_at + <= now - self._min_processing_interval * 2, + ), + RunModel.last_processed_at == RunModel.submitted_at, + RunModel.skip_min_processing_interval == True, + ), + or_( + RunModel.lock_expires_at.is_(None), + RunModel.lock_expires_at < now, + ), + or_( + RunModel.lock_owner.is_(None), + RunModel.lock_owner == RunPipeline.__name__, + ), + ) + .order_by(RunModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=RunModel) + .options( + load_only( + RunModel.id, + RunModel.lock_token, + RunModel.lock_expires_at, + RunModel.status, + RunModel.skip_min_processing_interval, + ) + ) + ) + run_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for run_model in run_models: + prev_lock_expired = run_model.lock_expires_at is not None + run_model.lock_expires_at = lock_expires_at + run_model.lock_token = lock_token + run_model.lock_owner = RunPipeline.__name__ + run_model.skip_min_processing_interval = False + items.append( + RunPipelineItem( + __tablename__=RunModel.__tablename__, + id=run_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=run_model.status, + ) + ) + await session.commit() + return items + + +class RunWorker(Worker[RunPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[RunPipelineItem], + heartbeater: Heartbeater[RunPipelineItem], + pipeline_hinter: PipelineHinterProtocol, + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + pipeline_hinter=pipeline_hinter, + ) + + @sentry_utils.instrument_pipeline_task("RunWorker.process") + async def process(self, item: RunPipelineItem): + # 
Currently `dstack` supports runs with + # * one multi-node replica (multi-node tasks) + # * or multiple single-node replicas (services) + # The multiple multi-node replica is not supported but the most of the processing logic + # is written to be able to handle this generic case. + # + # Different run stats have completely separate load/process/apply phases + # due to distinct processing flows and related-row requirements. + if item.status == RunStatus.PENDING: + await _process_pending_item(item) + return + if item.status in { + RunStatus.SUBMITTED, + RunStatus.PROVISIONING, + RunStatus.RUNNING, + }: + await _process_active_item(item) + return + if item.status == RunStatus.TERMINATING: + await _process_terminating_item(item) + self._pipeline_hinter.hint_fetch(JobModel.__name__) + return + + logger.error("Skipping run %s with unexpected status %s", item.id, item.status) + + +async def _process_pending_item(item: RunPipelineItem) -> None: + async with get_session_ctx() as session: + context = await _load_pending_context(session=session, item=item) + if context is None: + return + + result = await pending.process_pending_run(context) + if result is None: + await _apply_noop_result( + item=item, + locked_job_ids=context.locked_job_ids, + ) + return + + await _apply_pending_result(item=item, context=context, result=result) + + +async def _load_pending_context( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[pending.PendingContext]: + locked_job_ids = await _lock_related_jobs(session=session, item=item) + if locked_job_ids is None: + return None + run_model = await _refetch_locked_run_for_pending(session=session, item=item) + if run_model is None: + log_lock_token_mismatch(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=locked_job_ids, + ) + await session.commit() + return None + secrets = await get_project_secrets_mapping(session=session, project=run_model.project) + run_spec = get_run_spec(run_model) + + 
gateway_stats = None + if run_spec.configuration.type == "service" and run_model.gateway_id is not None: + gateway_stats = await get_combined_gateway_stats( + session, run_model.gateway_id, run_model.project.name, run_model.run_name + ) + + return pending.PendingContext( + run_model=run_model, + run_spec=run_spec, + secrets=secrets, + locked_job_ids=locked_job_ids, + gateway_stats=gateway_stats, + ) + + +async def _refetch_locked_run_for_pending( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[RunModel]: + latest_sq = _build_latest_submissions_subquery(item.id) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .outerjoin(latest_sq, latest_sq.c.run_id == RunModel.id) + .outerjoin( + job_alias, + and_( + job_alias.run_id == latest_sq.c.run_id, + job_alias.replica_num == latest_sq.c.replica_num, + job_alias.job_num == latest_sq.c.job_num, + job_alias.submission_num == latest_sq.c.max_submission_num, + ), + ) + .options( + joinedload(RunModel.project).load_only( + ProjectModel.id, + ProjectModel.name, + ), + ) + .options(contains_eager(RunModel.jobs, alias=job_alias)) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one_or_none() + + +def _build_latest_submissions_subquery(run_id: uuid.UUID): + """Subquery selecting only the latest submission per (replica_num, job_num).""" + return ( + select( + JobModel.run_id.label("run_id"), + JobModel.replica_num.label("replica_num"), + JobModel.job_num.label("job_num"), + func.max(JobModel.submission_num).label("max_submission_num"), + ) + .where(JobModel.run_id == run_id) + .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) + .subquery() + ) + + +async def _apply_pending_result( + item: RunPipelineItem, + context: pending.PendingContext, + result: pending.PendingResult, +) -> None: + set_processed_update_map_fields(result.run_update_map) + 
set_unlock_update_map_fields(result.run_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(result.run_update_map, now=now) + + res = await session.execute( + update(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .values(**result.run_update_map) + .returning(RunModel.id) + ) + updated_run_ids = list(res.scalars().all()) + if len(updated_run_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=context.locked_job_ids, + ) + await session.commit() + return + + for job_model in result.new_job_models: + session.add(job_model) + events.emit( + session, + f"Job created on new submission. Status: {job_model.status.upper()}", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + emit_run_status_change_event( + session=session, + run_model=context.run_model, + old_status=context.run_model.status, + new_status=result.run_update_map.get("status", context.run_model.status), + ) + + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=context.locked_job_ids, + ) + await session.commit() + + +async def _apply_noop_result( + item: RunPipelineItem, + locked_job_ids: set[uuid.UUID], +) -> None: + """Unlock the run without changing state. 
Used when processing decides to skip.""" + async with get_session_ctx() as session: + now = get_current_datetime() + await session.execute( + update(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + last_processed_at=now, + ) + ) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=locked_job_ids, + ) + await session.commit() + + +async def _process_active_item(item: RunPipelineItem) -> None: + async with get_session_ctx() as session: + load_result = await _load_active_context(session=session, item=item) + if load_result is None: + return + context = load_result + + result = await active.process_active_run(context) + await _apply_active_result(item=item, context=context, result=result) + + +async def _load_active_context( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[active.ActiveContext]: + """Returns None on lock mismatch (already handled). + Returns context when processing should proceed. 
+ """ + locked_job_ids = await _lock_related_jobs(session=session, item=item) + if locked_job_ids is None: + return None + run_model = await _refetch_locked_run_for_active(session=session, item=item) + if run_model is None: + log_lock_token_mismatch(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=locked_job_ids, + ) + await session.commit() + return None + secrets = await get_project_secrets_mapping(session=session, project=run_model.project) + run_spec = get_run_spec(run_model) + + gateway_stats = None + if run_spec.configuration.type == "service" and run_model.gateway_id is not None: + gateway_stats = await get_combined_gateway_stats( + session, run_model.gateway_id, run_model.project.name, run_model.run_name + ) + + return active.ActiveContext( + run_model=run_model, + run_spec=run_spec, + secrets=secrets, + locked_job_ids=locked_job_ids, + gateway_stats=gateway_stats, + ) + + +async def _refetch_locked_run_for_active( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[RunModel]: + latest_sq = _build_latest_submissions_subquery(item.id) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .outerjoin(latest_sq, latest_sq.c.run_id == RunModel.id) + .outerjoin( + job_alias, + and_( + job_alias.run_id == latest_sq.c.run_id, + job_alias.replica_num == latest_sq.c.replica_num, + job_alias.job_num == latest_sq.c.job_num, + job_alias.submission_num == latest_sq.c.max_submission_num, + ), + ) + .options( + joinedload(RunModel.project).load_only( + ProjectModel.id, + ProjectModel.name, + ), + ) + .options( + contains_eager(RunModel.jobs, alias=job_alias) + .joinedload(JobModel.instance) + .load_only(InstanceModel.fleet_id), + ) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one_or_none() + + +async def _apply_active_result( + item: RunPipelineItem, + context: 
active.ActiveContext, + result: active.ActiveResult, +) -> None: + run_model = context.run_model + set_processed_update_map_fields(result.run_update_map) + set_unlock_update_map_fields(result.run_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(result.run_update_map, now=now) + job_update_rows = _build_active_job_update_rows( + job_id_to_update_map=result.job_id_to_update_map, + unlock_job_ids=context.locked_job_ids, + ) + if job_update_rows: + resolve_now_placeholders(job_update_rows, now=now) + + res = await session.execute( + update(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .values(**result.run_update_map) + .returning(RunModel.id) + ) + updated_run_ids = list(res.scalars().all()) + if len(updated_run_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=context.locked_job_ids, + ) + await session.commit() + return + + if job_update_rows: + await session.execute(update(JobModel), job_update_rows) + + for job_model in result.new_job_models: + session.add(job_model) + events.emit( + session, + f"Job created on retry. Status: {job_model.status.upper()}", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + old_status = run_model.status + new_status = result.run_update_map.get("status", old_status) + _emit_active_metrics(run_model, context.run_spec, old_status, new_status) + + _emit_active_job_status_change_events( + session=session, + context=context, + result=result, + ) + # Set termination_reason on the model so emit_run_status_change_event can read it. 
+ if "termination_reason" in result.run_update_map: + run_model.termination_reason = result.run_update_map["termination_reason"] + emit_run_status_change_event( + session=session, + run_model=run_model, + old_status=old_status, + new_status=new_status, + ) + await session.commit() + + +def _emit_active_metrics( + run_model: RunModel, + run_spec, + old_status: RunStatus, + new_status: RunStatus, +) -> None: + if old_status == new_status: + return + project_name = run_model.project.name + run_type = run_spec.configuration.type + if old_status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING: + duration = (get_current_datetime() - run_model.submitted_at).total_seconds() + run_metrics.log_submit_to_provision_duration(duration, project_name, run_type) + if new_status == RunStatus.PENDING: + run_metrics.increment_pending_runs(project_name, run_type) + + +class _ActiveRunJobUpdateRow(active.ActiveRunJobUpdateMap, total=False): + id: uuid.UUID + + +def _build_active_job_update_rows( + job_id_to_update_map: dict[uuid.UUID, active.ActiveRunJobUpdateMap], + unlock_job_ids: set[uuid.UUID], +) -> list[_ActiveRunJobUpdateRow]: + job_update_rows = [] + for job_id in sorted(job_id_to_update_map.keys() | unlock_job_ids): + update_row = _ActiveRunJobUpdateRow(id=job_id) + job_update_map = job_id_to_update_map.get(job_id) + if job_update_map is not None: + for key, value in job_update_map.items(): + update_row[key] = value + if job_id in unlock_job_ids: + set_unlock_update_map_fields(update_row) + set_processed_update_map_fields(update_row) + job_update_rows.append(update_row) + return job_update_rows + + +def _emit_active_job_status_change_events( + session: AsyncSession, + context: active.ActiveContext, + result: active.ActiveResult, +) -> None: + for job_model in context.run_model.jobs: + job_update_map = result.job_id_to_update_map.get(job_model.id) + if job_update_map is None: + continue + emit_job_status_change_event( + session=session, + job_model=job_model, + 
old_status=job_model.status, + new_status=job_update_map.get("status", job_model.status), + termination_reason=job_update_map.get( + "termination_reason", + job_model.termination_reason, + ), + termination_reason_message=job_update_map.get( + "termination_reason_message", + job_model.termination_reason_message, + ), + ) + + +async def _process_terminating_item(item: RunPipelineItem) -> None: + async with get_session_ctx() as session: + context = await _load_terminating_context(session=session, item=item) + if context is None: + return + + result = await terminating.process_terminating_run(context) + await _apply_terminating_result(item=item, context=context, result=result) + + +async def _load_terminating_context( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[terminating.TerminatingContext]: + locked_job_ids = await _lock_related_jobs( + session=session, + item=item, + ) + if locked_job_ids is None: + return None + run_model = await _refetch_locked_run_for_terminating(session=session, item=item) + if run_model is None: + log_lock_token_mismatch(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=locked_job_ids, + ) + await session.commit() + return None + return terminating.TerminatingContext( + run_model=run_model, + locked_job_ids=locked_job_ids, + ) + + +async def _refetch_locked_run_for_terminating( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[RunModel]: + latest_sq = _build_latest_submissions_subquery(item.id) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .outerjoin(latest_sq, latest_sq.c.run_id == RunModel.id) + .outerjoin( + job_alias, + and_( + job_alias.run_id == latest_sq.c.run_id, + job_alias.replica_num == latest_sq.c.replica_num, + job_alias.job_num == latest_sq.c.job_num, + job_alias.submission_num == latest_sq.c.max_submission_num, + ), + ) + .options( + 
joinedload(RunModel.project).load_only( + ProjectModel.id, + ProjectModel.name, + ), + ) + .options( + contains_eager(RunModel.jobs, alias=job_alias) + .joinedload(JobModel.instance) + .joinedload(InstanceModel.project) + .load_only( + ProjectModel.id, + ProjectModel.ssh_private_key, + ), + ) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one_or_none() + + +async def _lock_related_jobs( + session: AsyncSession, + item: RunPipelineItem, +) -> Optional[set[uuid.UUID]]: + now = get_current_datetime() + job_lock, _ = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) + async with job_lock: + res = await session.execute( + select(JobModel) + .where( + JobModel.run_id == item.id, + JobModel.status.not_in(JOB_STATUSES_EXCLUDED_FOR_LOCKING), + or_( + JobModel.lock_expires_at.is_(None), + JobModel.lock_expires_at < now, + ), + or_( + JobModel.lock_owner.is_(None), + JobModel.lock_owner == RunPipeline.__name__, + ), + ) + .order_by(JobModel.id) + .with_for_update(skip_locked=True, key_share=True, of=JobModel) + .options(load_only(JobModel.id)) + ) + locked_job_models = list(res.scalars().all()) + locked_job_ids = {job_model.id for job_model in locked_job_models} + + res = await session.execute( + select(JobModel.id).where( + JobModel.run_id == item.id, + JobModel.status.not_in(JOB_STATUSES_EXCLUDED_FOR_LOCKING), + ) + ) + current_job_ids = set(res.scalars().all()) + if current_job_ids != locked_job_ids: + logger.debug( + "Failed to lock run %s jobs. 
The run will be processed later.", + item.id, + ) + await _reset_run_lock_for_retry(session=session, item=item) + return None + for job_model in locked_job_models: + job_model.lock_expires_at = item.lock_expires_at + job_model.lock_token = item.lock_token + job_model.lock_owner = RunPipeline.__name__ + await session.commit() + return {jm.id for jm in locked_job_models} + + +async def _reset_run_lock_for_retry( + session: AsyncSession, + item: RunPipelineItem, +) -> None: + res = await session.execute( + update(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + # Keep `lock_owner` so the run remains owned by the run pipeline, + # but unset `lock_expires_at` to retry ASAP and unset `lock_token` + # so heartbeater can no longer update the item. + .values( + lock_expires_at=None, + lock_token=None, + last_processed_at=get_current_datetime(), + ) + .returning(RunModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_on_reset(logger) + + +async def _apply_terminating_result( + item: RunPipelineItem, + context: terminating.TerminatingContext, + result: terminating.TerminatingResult, +) -> None: + run_model = context.run_model + set_processed_update_map_fields(result.run_update_map) + set_unlock_update_map_fields(result.run_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(result.run_update_map, now=now) + job_update_rows = _build_terminating_job_update_rows( + job_id_to_update_map=result.job_id_to_update_map, + unlock_job_ids=context.locked_job_ids, + ) + if job_update_rows: + resolve_now_placeholders(job_update_rows, now=now) + res = await session.execute( + update(RunModel) + .where( + RunModel.id == item.id, + RunModel.lock_token == item.lock_token, + ) + .values(**result.run_update_map) + .returning(RunModel.id) + ) + updated_run_ids = list(res.scalars().all()) + if len(updated_run_ids) == 0: + # The only 
side-effects are runner stop signal and service deregistration, + # and they are idempotent, so no need for cleanup. + log_lock_token_changed_after_processing(logger, item) + await _unlock_related_jobs( + session=session, + item=item, + locked_job_ids=context.locked_job_ids, + ) + await session.commit() + return + + if job_update_rows: + await session.execute(update(JobModel), job_update_rows) + + if result.service_unregistration is not None: + targets = [events.Target.from_model(run_model)] + if result.service_unregistration.gateway_target is not None: + targets.append(result.service_unregistration.gateway_target) + events.emit( + session, + result.service_unregistration.event_message, + actor=events.SystemActor(), + targets=targets, + ) + + _emit_terminating_job_status_change_events( + session=session, + context=context, + result=result, + ) + emit_run_status_change_event( + session=session, + run_model=context.run_model, + old_status=context.run_model.status, + new_status=result.run_update_map.get("status", context.run_model.status), + ) + await session.commit() + + +class _TerminatingRunJobUpdateRow(terminating.TerminatingRunJobUpdateMap, total=False): + id: uuid.UUID + + +def _build_terminating_job_update_rows( + job_id_to_update_map: dict[uuid.UUID, terminating.TerminatingRunJobUpdateMap], + unlock_job_ids: set[uuid.UUID], +) -> list[_TerminatingRunJobUpdateRow]: + job_update_rows = [] + for job_id in sorted(job_id_to_update_map.keys() | unlock_job_ids): + update_row = _TerminatingRunJobUpdateRow(id=job_id) + job_update_map = job_id_to_update_map.get(job_id) + if job_update_map is not None: + for key, value in job_update_map.items(): + update_row[key] = value + if job_id in unlock_job_ids: + set_unlock_update_map_fields(update_row) + set_processed_update_map_fields(update_row) + job_update_rows.append(update_row) + return job_update_rows + + +def _emit_terminating_job_status_change_events( + session: AsyncSession, + context: terminating.TerminatingContext, + 
result: terminating.TerminatingResult, +) -> None: + for job_model in context.run_model.jobs: + job_update_map = result.job_id_to_update_map.get(job_model.id) + if job_update_map is None: + continue + emit_job_status_change_event( + session=session, + job_model=job_model, + old_status=job_model.status, + new_status=job_update_map.get("status", job_model.status), + termination_reason=job_update_map.get( + "termination_reason", + job_model.termination_reason, + ), + termination_reason_message=job_model.termination_reason_message, + ) + + +async def _unlock_related_jobs( + session: AsyncSession, + item: RunPipelineItem, + locked_job_ids: set[uuid.UUID], +) -> None: + if len(locked_job_ids) == 0: + return + await session.execute( + update(JobModel) + .where( + JobModel.id.in_(locked_job_ids), + JobModel.lock_token == item.lock_token, + JobModel.lock_owner == RunPipeline.__name__, + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + ) + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/runs/active.py b/src/dstack/_internal/server/background/pipeline_tasks/runs/active.py new file mode 100644 index 0000000000..8448c52c32 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/runs/active.py @@ -0,0 +1,739 @@ +import json +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Set, Tuple + +from sqlalchemy import select +from sqlalchemy.orm import load_only + +from dstack._internal.core.errors import ServerError +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.profiles import RetryEvent, StopCriteria +from dstack._internal.core.models.runs import ( + JobStatus, + JobTerminationReason, + RunSpec, + RunStatus, + RunTerminationReason, +) +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats +from dstack._internal.server.background.pipeline_tasks.base 
import ItemUpdateMap +from dstack._internal.server.background.pipeline_tasks.runs.common import ( + PerGroupDesiredCounts, + build_scale_up_job_models, + compute_desired_replica_counts, +) +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import JobModel, RunModel +from dstack._internal.server.services.jobs import ( + get_job_spec, + get_job_specs_from_run_spec, + get_jobs_from_run_spec, + group_jobs_by_replica_latest, +) +from dstack._internal.server.services.runs import create_job_model_for_new_submission +from dstack._internal.server.services.runs.replicas import ( + build_replica_lists, + get_group_rollout_state, + has_out_of_date_replicas, + job_belongs_to_group, +) +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +ROLLING_DEPLOYMENT_MAX_SURGE = 1 # at most one extra replica during rolling deployment + + +class ActiveRunUpdateMap(ItemUpdateMap, total=False): + status: RunStatus + termination_reason: Optional[RunTerminationReason] + fleet_id: Optional[uuid.UUID] + resubmission_attempt: int + desired_replica_count: int + desired_replica_counts: Optional[str] # JSON + + +class ActiveRunJobUpdateMap(ItemUpdateMap, total=False): + status: JobStatus + termination_reason: Optional[JobTerminationReason] + termination_reason_message: Optional[str] + deployment_num: int + + +@dataclass +class ActiveContext: + run_model: RunModel + run_spec: RunSpec + secrets: dict + locked_job_ids: set[uuid.UUID] + gateway_stats: Optional[PerWindowStats] = None + + +@dataclass +class ActiveResult: + run_update_map: ActiveRunUpdateMap + new_job_models: list[JobModel] + job_id_to_update_map: dict[uuid.UUID, ActiveRunJobUpdateMap] + + +@dataclass +class _ReplicaAnalysis: + """Per-replica classification of job states for determining the run's next status.""" + + replica_num: int + job_models: List[JobModel] + contributed_statuses: Set[RunStatus] 
= field(default_factory=set) + """`RunStatus` values derived from this replica's jobs. Merged into the run-level + analysis unless the replica is being retried as a whole.""" + termination_reasons: Set[RunTerminationReason] = field(default_factory=set) + """Why the replica failed. Only populated when `FAILED` is in `contributed_statuses`.""" + needs_retry: bool = False + """At least one job failed with a retryable reason and the retry duration hasn't been + exceeded. When `True`, the replica does not contribute its statuses to the run-level + analysis and is added to `replicas_to_retry` instead.""" + + +@dataclass +class _RunAnalysis: + """Aggregated replica analysis used to determine the run's next status. + + Each replica contributes `RunStatus` based on its jobs' statuses. + The run's new status is the highest-priority value across all + contributing replicas: FAILED > RUNNING > PROVISIONING > SUBMITTED > DONE. + Replicas that need full retry do not contribute and instead cause a PENDING transition. 
async def process_active_run(context: ActiveContext) -> ActiveResult:
    """Plan the next state of an active run and the job changes that go with it.

    The run's jobs are analyzed, the next run status is derived from the analysis, and
    then exactly one of the following is planned:

    - next status is `PENDING`: unfinished jobs of the replicas to retry are marked
      for termination;
    - next status is `TERMINATING`: no job changes are planned here;
    - some replicas need a retry: new submissions are built for them and their
      unfinished jobs are marked for termination;
    - the run is a service: scaling, in-place deployment bumps, rolling deployment and
      removed-group cleanup are combined;
    - anything else: only in-place deployment bumps are planned.

    Args:
        context: the run being processed together with its spec, secrets and stats.

    Returns:
        The run update map, the job models to create and the per-job update maps.
    """
    run_model = context.run_model
    run_spec = context.run_spec

    fleet_id = _detect_fleet_id_from_jobs(run_model)
    analysis = await _analyze_active_run(run_model)
    transition = _get_active_run_transition(run_spec, run_model, analysis)
    run_update_map = _build_run_update_map(run_model, run_spec, transition, fleet_id)

    next_status = transition.new_status
    retrying_replicas = analysis.replicas_to_retry
    created_jobs: list[JobModel] = []
    job_updates: Dict[uuid.UUID, ActiveRunJobUpdateMap] = {}

    if next_status == RunStatus.PENDING:
        job_updates = _build_terminate_retrying_jobs_map(retrying_replicas)
    elif next_status == RunStatus.TERMINATING:
        # Nothing to plan for jobs here when the run is going to terminate.
        pass
    elif retrying_replicas:
        created_jobs = await _build_retry_job_models(context, retrying_replicas)
        # In a multi-node replica, one job may fail while siblings are still running.
        # Terminate those siblings so the entire replica retries cleanly.
        job_updates = _build_terminate_retrying_jobs_map(retrying_replicas)
    elif run_spec.configuration.type == "service":
        per_group_desired = _apply_desired_counts_to_update_map(run_update_map, context)
        # Service processing has multiple stages that never conflict:
        # - scaling skips groups with out-of-date replicas (rolling in progress),
        #   so for those groups only rolling manages replica creation and teardown;
        # - cleanup only targets removed groups (not in configuration.replica_groups).
        created_jobs, job_updates = await _build_service_scaling_maps(
            context, per_group_desired
        )

        in_place_bumps = await _build_deployment_update_map(context)
        job_updates.update(in_place_bumps)

        surge_jobs, rolling_updates = await _build_rolling_deployment_maps(
            context,
            per_group_desired,
            in_place_bumped_job_ids=set(in_place_bumps.keys()),
        )
        created_jobs.extend(surge_jobs)
        job_updates.update(rolling_updates)

        job_updates.update(_build_removed_groups_cleanup_maps(context))
    else:
        job_updates = await _build_deployment_update_map(context)

    return ActiveResult(
        run_update_map=run_update_map,
        new_job_models=created_jobs,
        job_id_to_update_map=job_updates,
    )
def _get_non_terminal_replica_status(job_model: JobModel) -> Optional[RunStatus]:
    """Map a job that is still in progress to the run status it contributes.

    `RUNNING` and `SUBMITTED` jobs map to the run status of the same name, while
    `PROVISIONING` and `PULLING` jobs both count as `PROVISIONING`.

    Returns:
        The contributed `RunStatus`, or None for any other job status.
    """
    job_status = job_model.status
    contributed: Optional[RunStatus] = None
    if job_status in {JobStatus.PROVISIONING, JobStatus.PULLING}:
        contributed = RunStatus.PROVISIONING
    elif job_status == JobStatus.RUNNING:
        contributed = RunStatus.RUNNING
    elif job_status == JobStatus.SUBMITTED:
        contributed = RunStatus.SUBMITTED
    return contributed
def _get_active_run_transition(
    run_spec: RunSpec,
    run_model: RunModel,
    analysis: _RunAnalysis,
) -> _ActiveRunTransition:
    """Pick the run's next status from the aggregated replica analysis.

    Checks are made in priority order:

    1. any `FAILED` contribution terminates the run (`JOB_FAILED` takes precedence
       over `RETRY_LIMIT_EXCEEDED` as the reason);
    2. the master-done stop criteria terminates the run with `ALL_JOBS_DONE`;
    3. `RUNNING`, then `PROVISIONING`, then `SUBMITTED` is kept as the run status;
    4. `DONE` with nothing left to retry terminates the run with `ALL_JOBS_DONE`;
    5. no contributions at all, or only `DONE`, sends the run back to `PENDING`.

    Raises:
        ServerError: if `FAILED` was contributed without a known termination reason,
            or if none of the rules above applies.
    """
    statuses = analysis.contributed_statuses

    if RunStatus.FAILED in statuses:
        known_reasons = (
            RunTerminationReason.JOB_FAILED,
            RunTerminationReason.RETRY_LIMIT_EXCEEDED,
        )
        failure_reason = next(
            (reason for reason in known_reasons if reason in analysis.termination_reasons),
            None,
        )
        if failure_reason is None:
            raise ServerError(f"Unexpected termination reason {analysis.termination_reasons}")
        return _ActiveRunTransition(
            new_status=RunStatus.TERMINATING,
            termination_reason=failure_reason,
        )

    if _should_stop_on_master_done(run_spec, run_model):
        return _ActiveRunTransition(
            new_status=RunStatus.TERMINATING,
            termination_reason=RunTerminationReason.ALL_JOBS_DONE,
        )

    # Highest-priority in-progress status wins.
    for live_status in (RunStatus.RUNNING, RunStatus.PROVISIONING, RunStatus.SUBMITTED):
        if live_status in statuses:
            return _ActiveRunTransition(new_status=live_status)

    if RunStatus.DONE in statuses and not analysis.replicas_to_retry:
        return _ActiveRunTransition(
            new_status=RunStatus.TERMINATING,
            termination_reason=RunTerminationReason.ALL_JOBS_DONE,
        )

    if not statuses or statuses == {RunStatus.DONE}:
        # No active replicas remain — resubmit the entire run.
        # `contributed_statuses` is either empty (every replica is retrying) or contains
        # only DONE (some replicas finished, others need retry).
        return _ActiveRunTransition(new_status=RunStatus.PENDING)

    raise ServerError("Failed to determine run transition: unexpected active run state")
async def _build_deployment_update_map(
    context: ActiveContext,
) -> dict[uuid.UUID, ActiveRunJobUpdateMap]:
    """Bump deployment_num for jobs that do not require redeployment.

    For every replica that still has unfinished jobs and is behind the run's
    `deployment_num`, the job specs are regenerated from the current run spec. If each
    regenerated spec equals the spec of the corresponding existing job, all jobs of the
    replica get an update that sets `deployment_num` to the run's value.

    Returns:
        Update maps keyed by job id; empty when the run has no out-of-date replicas.
    """
    run_model = context.run_model
    run_spec = context.run_spec
    bumps: dict[uuid.UUID, ActiveRunJobUpdateMap] = {}

    if not has_out_of_date_replicas(run_model):
        return bumps

    target_deployment_num = run_model.deployment_num
    for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs):
        if all(job.status.is_finished() for job in replica_jobs):
            continue
        if all(job.deployment_num == run_model.deployment_num for job in replica_jobs):
            continue

        # Only services have replica groups; other run types regenerate without one.
        if run_spec.configuration.type == "service":
            replica_group_name = get_job_spec(replica_jobs[0]).replica_group
        else:
            replica_group_name = None

        regenerated_specs = await get_job_specs_from_run_spec(
            run_spec=run_spec,
            secrets=context.secrets,
            replica_num=replica_num,
            replica_group_name=replica_group_name,
        )
        assert len(regenerated_specs) == len(replica_jobs), (
            "Changing the number of jobs within a replica is not yet supported"
        )

        # Stops at the first job whose spec changed, as a changed spec means the
        # replica has to be redeployed rather than bumped in place.
        spec_changed = any(
            regenerated != get_job_spec(existing_job)
            for existing_job, regenerated in zip(replica_jobs, regenerated_specs)
        )
        if spec_changed:
            continue

        for job in replica_jobs:
            bumps[job.id] = ActiveRunJobUpdateMap(deployment_num=target_deployment_num)

    return bumps
async def _build_service_scaling_maps(
    context: ActiveContext,
    per_group_desired: PerGroupDesiredCounts,
) -> tuple[list[JobModel], dict[uuid.UUID, ActiveRunJobUpdateMap]]:
    """Build new jobs for scale-up and update maps for scale-down across all groups.

    Each configured replica group is compared against its desired count (0 when the
    group is missing from `per_group_desired`). Groups that already match, or that
    have out-of-date replicas, are left untouched.

    Returns:
        A tuple of (job models to create, update maps keyed by job id).
    """
    run_model = context.run_model
    configuration = context.run_spec.configuration
    assert isinstance(configuration, ServiceConfiguration)

    scale_up_jobs: list[JobModel] = []
    scale_down_updates: dict[uuid.UUID, ActiveRunJobUpdateMap] = {}
    next_replica_num = max((job.replica_num for job in run_model.jobs), default=-1) + 1

    for group in configuration.replica_groups:
        assert group.name is not None
        active_replicas, _ = build_replica_lists(run_model, group_filter=group.name)
        missing = per_group_desired.get(group.name, 0) - len(active_replicas)

        # During rolling deployment, skip the group entirely —
        # _build_rolling_deployment_maps handles both surge and teardown.
        if missing == 0 or has_out_of_date_replicas(run_model, group_filter=group.name):
            continue

        if missing < 0:
            scale_down_updates.update(
                _build_scale_down_job_update_maps(active_replicas, -missing)
            )
            continue

        group_jobs = await build_scale_up_job_models(
            run_model=run_model,
            run_spec=context.run_spec,
            secrets=context.secrets,
            replicas_diff=missing,
            group_name=group.name,
            replica_num_start=next_replica_num,
        )
        scale_up_jobs.extend(group_jobs)
        # Advance next_replica_num past any newly created replicas
        highest_new = max((job.replica_num for job in group_jobs), default=None)
        if highest_new is not None:
            next_replica_num = max(next_replica_num, highest_new + 1)

    return scale_up_jobs, scale_down_updates
def _build_removed_groups_cleanup_maps(
    context: ActiveContext,
) -> dict[uuid.UUID, ActiveRunJobUpdateMap]:
    """Terminate replicas from groups no longer in the configuration.

    A group counts as removed when at least one unfinished job of the run belongs to
    it but the service configuration no longer lists it. Both the active and the
    inactive replicas of such a group are passed in full to the scale-down builder.

    Returns:
        Update maps keyed by job id.
    """
    run_model = context.run_model
    configuration = context.run_spec.configuration
    assert isinstance(configuration, ServiceConfiguration)

    groups_with_unfinished_jobs: set[str] = {
        get_job_spec(job).replica_group
        for job in run_model.jobs
        if not job.status.is_finished()
    }
    configured_groups = {group.name for group in configuration.replica_groups}

    cleanup_updates: dict[uuid.UUID, ActiveRunJobUpdateMap] = {}
    for stale_group in groups_with_unfinished_jobs - configured_groups:
        active_replicas, inactive_replicas = build_replica_lists(
            run_model=run_model,
            group_filter=stale_group,
        )
        for replicas in (active_replicas, inactive_replicas):
            if replicas:
                cleanup_updates.update(
                    _build_scale_down_job_update_maps(replicas, len(replicas))
                )

    return cleanup_updates
async def build_scale_up_job_models(
    run_model: RunModel,
    run_spec: RunSpec,
    secrets: dict,
    replicas_diff: int,
    group_name: Optional[str] = None,
    replica_num_start: Optional[int] = None,
) -> list[JobModel]:
    """Build new JobModel instances for scaling up.

    Inactive replicas are resubmitted first: they keep their `replica_num`, take the
    replica group from their first job's spec, and get `submission_num` incremented.
    Brand-new replicas are created only for what is still missing after that.

    Args:
        run_model: the run to scale up.
        run_spec: spec the jobs are generated from.
        secrets: passed through to job generation.
        replicas_diff: number of replicas to add; non-positive values yield no jobs.
        group_name: filter for the inactive replicas to reuse, and the replica group
            of brand-new replicas.
        replica_num_start: number of the first brand-new replica. Defaults to one past
            the highest `replica_num` among the run's jobs.

    Returns:
        The newly built job models, all in `SUBMITTED` status.
    """
    if replicas_diff <= 0:
        return []

    _, inactive_replicas = build_replica_lists(run_model, group_filter=group_name)
    built: list[JobModel] = []

    # Retry inactive replicas first.
    revived_replicas = inactive_replicas[:replicas_diff]
    for _, _, replica_num, previous_jobs in revived_replicas:
        regenerated_jobs = await get_jobs_from_run_spec(
            run_spec=run_spec,
            secrets=secrets,
            replica_num=replica_num,
            replica_group_name=get_job_spec(previous_jobs[0]).replica_group,
        )
        for previous_job, regenerated_job in zip(previous_jobs, regenerated_jobs):
            resubmitted = create_job_model_for_new_submission(
                run_model=run_model,
                job=regenerated_job,
                status=JobStatus.SUBMITTED,
            )
            resubmitted.submission_num = previous_job.submission_num + 1
            built.append(resubmitted)

    # Create new replicas for the remainder
    still_missing = replicas_diff - len(revived_replicas)
    if still_missing > 0:
        if replica_num_start is None:
            first_replica_num = max((job.replica_num for job in run_model.jobs), default=-1) + 1
        else:
            first_replica_num = replica_num_start
        for new_replica_num in range(first_replica_num, first_replica_num + still_missing):
            fresh_jobs = await get_jobs_from_run_spec(
                run_spec=run_spec,
                secrets=secrets,
                replica_num=new_replica_num,
                replica_group_name=group_name,
            )
            for fresh_job in fresh_jobs:
                built.append(
                    create_job_model_for_new_submission(
                        run_model=run_model,
                        job=fresh_job,
                        status=JobStatus.SUBMITTED,
                    )
                )

    return built
dstack._internal.core.models.runs import RunSpec, RunStatus +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats +from dstack._internal.server.background.pipeline_tasks.base import ItemUpdateMap +from dstack._internal.server.background.pipeline_tasks.runs.common import ( + build_scale_up_job_models, + compute_desired_replica_counts, +) +from dstack._internal.server.models import JobModel, RunModel +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class PendingRunUpdateMap(ItemUpdateMap, total=False): + status: RunStatus + desired_replica_count: int + desired_replica_counts: Optional[str] + + +@dataclass +class PendingContext: + run_model: RunModel + run_spec: RunSpec + secrets: dict + locked_job_ids: set[uuid.UUID] + gateway_stats: Optional[PerWindowStats] = None + + +@dataclass +class PendingResult: + run_update_map: PendingRunUpdateMap + new_job_models: list[JobModel] + + +async def process_pending_run(context: PendingContext) -> Optional[PendingResult]: + """ + Returns None if the run is not ready for processing (retry delay not met, + zero-scaled service, etc.). Otherwise returns a result describing the + desired state change and pre-built job models. 
async def _process_pending_service(context: PendingContext) -> Optional[PendingResult]:
    """
    Service variant of pending-run processing.

    Asks `compute_desired_replica_counts` for the total and per-group replica
    counts, then builds job models group by group, handing each group a
    contiguous range of replica numbers that starts after the highest number
    already used by the run's jobs.

    Returns `None` when the computed total is zero.
    """
    run_model, run_spec = context.run_model, context.run_spec
    service_conf = run_spec.configuration
    assert isinstance(service_conf, ServiceConfiguration)

    total, desired_by_group = compute_desired_replica_counts(
        run_model=run_model,
        configuration=service_conf,
        gateway_stats=context.gateway_stats,
        last_scaled_at=None,
    )
    if total == 0:
        return None

    # First free replica number: one past the largest in use, or 0 with no jobs.
    replica_cursor = max((job.replica_num for job in run_model.jobs), default=-1) + 1
    created_job_models: list[JobModel] = []
    for group in service_conf.replica_groups:
        assert group.name is not None
        wanted = desired_by_group.get(group.name, 0)
        if wanted > 0:
            group_job_models = await build_scale_up_job_models(
                run_model=run_model,
                run_spec=run_spec,
                secrets=context.secrets,
                replicas_diff=wanted,
                group_name=group.name,
                replica_num_start=replica_cursor,
            )
            # Reserve the whole range for this group before moving on.
            replica_cursor += wanted
            created_job_models.extend(group_job_models)

    update_map = PendingRunUpdateMap(
        status=RunStatus.SUBMITTED,
        desired_replica_count=total,
        desired_replica_counts=json.dumps(desired_by_group),
    )
    return PendingResult(run_update_map=update_map, new_job_models=created_job_models)


def _is_ready_for_resubmission(run_model: RunModel) -> bool:
    """
    Tell whether enough time has passed since the run's jobs were last processed.

    The required wait comes from `_get_retry_delay` for the run's current
    `resubmission_attempt`; the reference point is the most recent
    `last_processed_at` among the run's jobs.
    """
    if len(run_model.jobs) == 0:
        # A resubmission without jobs is unexpected; do not block processing on it.
        return True
    latest_processed_at = max(job.last_processed_at for job in run_model.jobs)
    elapsed = get_current_datetime() - latest_processed_at
    required_delay = _get_retry_delay(run_model.resubmission_attempt)
    return elapsed >= required_delay
RunModel) -> bool: + if not run_model.jobs: + # No jobs yet — should not be possible for resubmission, but allow processing. + return True + last_processed_at = max(job.last_processed_at for job in run_model.jobs) + duration_since_processing = get_current_datetime() - last_processed_at + return duration_since_processing >= _get_retry_delay(run_model.resubmission_attempt) + + +# We use exponentially increasing retry delays for pending runs. +# This prevents creation of too many job submissions for runs stuck in pending, +# e.g. when users set retry for a long period without capacity. +_PENDING_RETRY_DELAYS = [ + timedelta(seconds=15), + timedelta(seconds=30), + timedelta(minutes=1), + timedelta(minutes=2), + timedelta(minutes=5), + timedelta(minutes=10), +] + + +def _get_retry_delay(resubmission_attempt: int) -> timedelta: + if resubmission_attempt - 1 < len(_PENDING_RETRY_DELAYS): + return _PENDING_RETRY_DELAYS[resubmission_attempt - 1] + return _PENDING_RETRY_DELAYS[-1] diff --git a/src/dstack/_internal/server/background/pipeline_tasks/runs/terminating.py b/src/dstack/_internal/server/background/pipeline_tasks/runs/terminating.py new file mode 100644 index 0000000000..c9a75e3c71 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/runs/terminating.py @@ -0,0 +1,185 @@ +import uuid +from dataclasses import dataclass, field +from datetime import datetime +from typing import Optional + +import httpx + +from dstack._internal.core.errors import GatewayError, SSHError +from dstack._internal.core.models.runs import ( + JobStatus, + JobTerminationReason, + RunStatus, + RunTerminationReason, +) +from dstack._internal.server import models +from dstack._internal.server.background.pipeline_tasks.base import ItemUpdateMap +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.services import events +from dstack._internal.server.services.gateways import get_or_add_gateway_connections +from 
logger = get_logger(__name__)


class TerminatingRunUpdateMap(ItemUpdateMap, total=False):
    # Run columns that processing a terminating run may write back.
    status: RunStatus
    next_triggered_at: Optional[datetime]
    fleet_id: Optional[uuid.UUID]
    resubmission_attempt: int


class TerminatingRunJobUpdateMap(ItemUpdateMap, total=False):
    # Job columns written when a job is switched to TERMINATING.
    status: JobStatus
    termination_reason: Optional[JobTerminationReason]
    graceful_termination_attempts: int
    skip_min_processing_interval: bool


@dataclass
class ServiceUnregistration:
    # Outcome of unregistering a service from its gateway.
    event_message: str
    gateway_target: Optional[events.Target]


@dataclass
class TerminatingContext:
    # The run being terminated and the ids of its jobs that the caller locked.
    run_model: models.RunModel
    locked_job_ids: set[uuid.UUID]


@dataclass
class TerminatingResult:
    # All parts are optional; an all-default result carries no changes.
    run_update_map: TerminatingRunUpdateMap = field(default_factory=TerminatingRunUpdateMap)
    job_id_to_update_map: dict[uuid.UUID, TerminatingRunJobUpdateMap] = field(default_factory=dict)
    service_unregistration: Optional[ServiceUnregistration] = None


async def process_terminating_run(context: TerminatingContext) -> TerminatingResult:
    """
    Compute the next step for a run that is being terminated.

    * If the caller locked any jobs, those jobs get a TERMINATING update and
      nothing else happens in this pass.
    * Otherwise, if some job is still unfinished, an empty result is returned.
    * Otherwise the service (if any) is unregistered on a best-effort basis and
      the run receives its final update from `_get_run_update_map`.

    The caller is expected to preload the run, take the job locks, and apply
    the returned result.
    """
    run_model = context.run_model
    assert run_model.termination_reason is not None
    job_termination_reason = run_model.termination_reason.to_job_termination_reason()

    if context.locked_job_ids:
        # RUNNING jobs go to the "delayed" bucket unless the reason is one of these.
        non_delaying_reasons = {
            JobTerminationReason.ABORTED_BY_USER,
            JobTerminationReason.DONE_BY_RUNNER,
        }
        delayed_job_ids = []
        regular_job_ids = []
        for job in run_model.jobs:
            if job.id not in context.locked_job_ids:
                continue
            is_delayed = (
                job.status == JobStatus.RUNNING
                and job_termination_reason not in non_delaying_reasons
            )
            bucket = delayed_job_ids if is_delayed else regular_job_ids
            bucket.append(job.id)
        job_updates = _get_job_id_to_update_map(
            delayed_job_ids=delayed_job_ids,
            regular_job_ids=regular_job_ids,
            job_termination_reason=job_termination_reason,
        )
        return TerminatingResult(job_id_to_update_map=job_updates)

    if not all(job.status.is_finished() for job in run_model.jobs):
        return TerminatingResult()

    unregistration = None
    if run_model.service_spec is not None:
        try:
            unregistration = await _unregister_service(run_model)
        except Exception as e:
            # Best effort: a failed unregistration must not block finishing the run.
            logger.warning("%s: failed to unregister service: %s", fmt(run_model), repr(e))

    return TerminatingResult(
        run_update_map=_get_run_update_map(run_model),
        service_unregistration=unregistration,
    )
def _get_job_id_to_update_map(
    delayed_job_ids: list[uuid.UUID],
    regular_job_ids: list[uuid.UUID],
    job_termination_reason: JobTerminationReason,
) -> dict[uuid.UUID, TerminatingRunJobUpdateMap]:
    """
    Build the per-job TERMINATING updates.

    Both buckets get the same status, reason and `skip_min_processing_interval`;
    delayed jobs additionally have `graceful_termination_attempts` reset to 0.
    Regular ids are inserted first, so an id present in both buckets ends up
    with the delayed variant.
    """
    updates: dict[uuid.UUID, TerminatingRunJobUpdateMap] = {
        job_id: TerminatingRunJobUpdateMap(
            status=JobStatus.TERMINATING,
            termination_reason=job_termination_reason,
            skip_min_processing_interval=True,
        )
        for job_id in regular_job_ids
    }
    updates.update(
        (
            job_id,
            TerminatingRunJobUpdateMap(
                status=JobStatus.TERMINATING,
                termination_reason=job_termination_reason,
                graceful_termination_attempts=0,
                skip_min_processing_interval=True,
            ),
        )
        for job_id in delayed_job_ids
    )
    return updates
def _get_run_update_map(run_model: models.RunModel) -> TerminatingRunUpdateMap:
    """
    Produce the final run update once all jobs are finished.

    A run whose merged profile has a schedule goes back to PENDING with a fresh
    `next_triggered_at`, unless the user aborted or stopped it. Every other run
    gets the status derived from its termination reason.
    """
    termination_reason = get_or_error(run_model.termination_reason)
    run_spec = get_run_spec(run_model)

    ended_by_user = termination_reason in {
        RunTerminationReason.ABORTED_BY_USER,
        RunTerminationReason.STOPPED_BY_USER,
    }
    if run_spec.merged_profile.schedule is None or ended_by_user:
        return TerminatingRunUpdateMap(status=termination_reason.to_status())

    return TerminatingRunUpdateMap(
        status=RunStatus.PENDING,
        next_triggered_at=_get_next_triggered_at(run_spec),
        fleet_id=None,
        resubmission_attempt=0,
    )


async def _unregister_service(run_model: models.RunModel) -> Optional[ServiceUnregistration]:
    """
    Unregister the run's service on every connection of its gateway.

    Returns:
        `None` when the run has no gateway (in-server proxy). Otherwise a
        `ServiceUnregistration` whose message either reports success or lists
        the gateway errors collected along the way.

    Raises:
        GatewayError: when a request fails with `httpx.RequestError` or `SSHError`.
    """
    if run_model.gateway_id is None:  # in-server proxy
        return None

    async with get_session_ctx() as session:
        gateway, connections = await get_or_add_gateway_connections(session, run_model.gateway_id)
        gateway_target = events.Target.from_model(gateway)

    failures = []
    for connection in connections:
        try:
            logger.debug(
                "%s: unregistering service on gateway replica %s",
                fmt(run_model),
                connection.ip_address,
            )
            async with connection.client() as gateway_client:
                await gateway_client.unregister_service(
                    project=run_model.project.name,
                    run_name=run_model.run_name,
                )
        except GatewayError as exc:
            # A gateway-side error on one replica is logged and remembered for
            # the event message; the remaining replicas are still processed.
            logger.warning(
                "%s: unregistering service on gateway replica %s: %s",
                fmt(run_model),
                connection.ip_address,
                exc,
            )
            failures.append(str(exc))
        except (httpx.RequestError, SSHError) as exc:
            logger.debug("Gateway request failed", exc_info=True)
            raise GatewayError(repr(exc))

    event_message = (
        f"Gateway error when unregistering service: {'; '.join(failures)}"
        if failures
        else "Service unregistered from gateway"
    )
    return ServiceUnregistration(
        event_message=event_message,
        gateway_target=gateway_target,
    )
+ logger.warning( + "%s: unregistering service on gateway replica %s: %s", + fmt(run_model), + conn.ip_address, + e, + ) + gateway_errors.append(str(e)) + except (httpx.RequestError, SSHError) as e: + logger.debug("Gateway request failed", exc_info=True) + raise GatewayError(repr(e)) + + if gateway_errors: + event_message = f"Gateway error when unregistering service: {'; '.join(gateway_errors)}" + else: + event_message = "Service unregistered from gateway" + return ServiceUnregistration( + event_message=event_message, + gateway_target=gateway_target, + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py b/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py new file mode 100644 index 0000000000..2b416fb823 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/service_router_worker_sync.py @@ -0,0 +1,297 @@ +import asyncio +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only, selectinload +from sqlalchemy.sql import false, true + +from dstack._internal.core.models.runs import JobStatus, RunStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + InstanceModel, + JobModel, + ProjectModel, + RunModel, + ServiceRouterWorkerSyncModel, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.runs.router_worker_sync import ( + 
logger = get_logger(__name__)


@dataclass
class ServiceRouterWorkerSyncPipelineItem(PipelineItem):
    """Queue item for one locked `ServiceRouterWorkerSyncModel` row."""

    # Id of the run the sync row belongs to; the worker uses it to load the run.
    run_id: uuid.UUID


class ServiceRouterWorkerSyncPipeline(Pipeline[ServiceRouterWorkerSyncPipelineItem]):
    """
    Pipeline that wires a heartbeater, a fetcher and a pool of workers for
    `ServiceRouterWorkerSyncModel` rows.
    """

    def __init__(
        self,
        workers_num: int = 8,
        queue_lower_limit_factor: float = 0.5,
        queue_upper_limit_factor: float = 2.0,
        min_processing_interval: timedelta = timedelta(seconds=5),
        lock_timeout: timedelta = timedelta(seconds=25),
        heartbeat_trigger: timedelta = timedelta(seconds=10),
        *,
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        """
        Build the pipeline components.

        All tuning parameters are forwarded to `Pipeline.__init__`;
        `pipeline_hinter` is keyword-only and handed to every worker.
        """
        super().__init__(
            workers_num=workers_num,
            queue_lower_limit_factor=queue_lower_limit_factor,
            queue_upper_limit_factor=queue_upper_limit_factor,
            min_processing_interval=min_processing_interval,
            lock_timeout=lock_timeout,
            heartbeat_trigger=heartbeat_trigger,
        )
        # NOTE(review): `self._queue`, `self._lock_timeout`, etc. are presumably set by
        # `Pipeline.__init__` above — confirm against the base class.
        # The heartbeater is created first because the fetcher and workers share it.
        self.__heartbeater = Heartbeater[ServiceRouterWorkerSyncPipelineItem](
            model_type=ServiceRouterWorkerSyncModel,
            lock_timeout=self._lock_timeout,
            heartbeat_trigger=self._heartbeat_trigger,
        )
        self.__fetcher = ServiceRouterWorkerSyncFetcher(
            queue=self._queue,
            queue_desired_minsize=self._queue_desired_minsize,
            min_processing_interval=self._min_processing_interval,
            lock_timeout=self._lock_timeout,
            heartbeater=self.__heartbeater,
        )
        # One worker instance per configured worker slot, all reading the same queue.
        self.__workers = [
            ServiceRouterWorkerSyncWorker(
                queue=self._queue,
                heartbeater=self.__heartbeater,
                pipeline_hinter=pipeline_hinter,
            )
            for _ in range(self._workers_num)
        ]

    @property
    def hint_fetch_model_name(self) -> str:
        """Class name of the model this pipeline fetches."""
        return ServiceRouterWorkerSyncModel.__name__

    @property
    def _heartbeater(self) -> Heartbeater[ServiceRouterWorkerSyncPipelineItem]:
        """The heartbeater created in `__init__`."""
        return self.__heartbeater

    @property
    def _fetcher(self) -> Fetcher[ServiceRouterWorkerSyncPipelineItem]:
        """The fetcher created in `__init__`."""
        return self.__fetcher

    @property
    def _workers(self) -> Sequence["ServiceRouterWorkerSyncWorker"]:
        """The workers created in `__init__`."""
        return self.__workers
class ServiceRouterWorkerSyncFetcher(Fetcher[ServiceRouterWorkerSyncPipelineItem]):
    """Selects sync rows that are due for processing and locks them for this pipeline."""

    @sentry_utils.instrument_pipeline_task("ServiceRouterWorkerSyncFetcher.fetch")
    async def fetch(self, limit: int) -> list[ServiceRouterWorkerSyncPipelineItem]:
        """
        Lock up to `limit` sync rows and return one pipeline item per row.

        A row qualifies when it is not deleted, its run is RUNNING, finished or
        deleted, it was last processed at least `min_processing_interval` ago
        (or `last_processed_at` still equals `created_at`), and it has no
        lock or an expired one. All rows of one call share the same lock token
        and expiry.
        """
        # NOTE(review): the first element of the lockset is used as an async context
        # manager guarding the whole select-and-lock sequence — confirm its exact
        # semantics against `get_locker`.
        sync_lock, _ = get_locker(get_db().dialect_name).get_lockset(
            ServiceRouterWorkerSyncModel.__tablename__
        )
        async with sync_lock:
            async with get_session_ctx() as session:
                now = get_current_datetime()
                res = await session.execute(
                    select(ServiceRouterWorkerSyncModel)
                    .join(RunModel, RunModel.id == ServiceRouterWorkerSyncModel.run_id)
                    .where(
                        ServiceRouterWorkerSyncModel.deleted == false(),
                        # Fetch RUNNING runs for normal processing, and finished/deleted runs so
                        # the worker can mark their sync rows deleted.
                        or_(
                            RunModel.status == RunStatus.RUNNING,
                            RunModel.status.in_(RunStatus.finished_statuses()),
                            RunModel.deleted == true(),
                        ),
                        # Due for processing: interval elapsed, or the timestamp was never
                        # advanced past the creation time.
                        or_(
                            ServiceRouterWorkerSyncModel.last_processed_at
                            <= now - self._min_processing_interval,
                            ServiceRouterWorkerSyncModel.last_processed_at
                            == ServiceRouterWorkerSyncModel.created_at,
                        ),
                        # Not locked, or the previous lock has expired.
                        or_(
                            ServiceRouterWorkerSyncModel.lock_expires_at.is_(None),
                            ServiceRouterWorkerSyncModel.lock_expires_at < now,
                        ),
                    )
                    # Least recently processed rows first.
                    .order_by(ServiceRouterWorkerSyncModel.last_processed_at.asc())
                    .limit(limit)
                    # Row-level lock on the sync table only; rows locked elsewhere are skipped.
                    .with_for_update(
                        skip_locked=True, key_share=True, of=ServiceRouterWorkerSyncModel
                    )
                    .options(
                        load_only(
                            ServiceRouterWorkerSyncModel.id,
                            ServiceRouterWorkerSyncModel.run_id,
                            ServiceRouterWorkerSyncModel.lock_token,
                            ServiceRouterWorkerSyncModel.lock_expires_at,
                        )
                    )
                )
                rows = list(res.scalars().all())
                lock_expires_at = get_current_datetime() + self._lock_timeout
                lock_token = uuid.uuid4()
                items: list[ServiceRouterWorkerSyncPipelineItem] = []
                for row in rows:
                    # The query only admits rows whose lock is absent or expired, so a
                    # non-null value here means an earlier lock ran out.
                    prev_lock_expired = row.lock_expires_at is not None
                    row.lock_expires_at = lock_expires_at
                    row.lock_token = lock_token
                    row.lock_owner = ServiceRouterWorkerSyncPipeline.__name__
                    items.append(
                        ServiceRouterWorkerSyncPipelineItem(
                            __tablename__=ServiceRouterWorkerSyncModel.__tablename__,
                            id=row.id,
                            lock_expires_at=lock_expires_at,
                            lock_token=lock_token,
                            prev_lock_expired=prev_lock_expired,
                            run_id=row.run_id,
                        )
                    )
                # Persist the new lock fields before the items are handed out.
                await session.commit()
        return items
class _SyncRowUpdateMap(ItemUpdateMap, total=False):
    # Set to True to mark the sync row as deleted.
    deleted: bool


class ServiceRouterWorkerSyncWorker(Worker[ServiceRouterWorkerSyncPipelineItem]):
    """Processes locked sync rows: syncs router workers or retires the row."""

    def __init__(
        self,
        queue: asyncio.Queue[ServiceRouterWorkerSyncPipelineItem],
        heartbeater: Heartbeater[ServiceRouterWorkerSyncPipelineItem],
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        """Forward all arguments unchanged to `Worker.__init__`."""
        super().__init__(
            queue=queue,
            heartbeater=heartbeater,
            pipeline_hinter=pipeline_hinter,
        )

    @sentry_utils.instrument_pipeline_task("ServiceRouterWorkerSyncWorker.process")
    async def process(self, item: ServiceRouterWorkerSyncPipelineItem) -> None:
        """
        Process one locked sync row.

        Steps, each in its own session:
        1. Re-read the row by id and lock token; bail out if it is gone.
        2. If the run is deleted, not RUNNING, or has no sglang router replica
           group, mark the row deleted and stop.
        3. Load the run with its RUNNING, registered jobs and call
           `sync_router_workers_for_run_model`.
        4. Mark the row processed and release the lock.
        """
        async with get_session_ctx() as session:
            res = await session.execute(
                select(ServiceRouterWorkerSyncModel)
                .where(
                    ServiceRouterWorkerSyncModel.id == item.id,
                    # Only match while this worker's lock token is still on the row.
                    ServiceRouterWorkerSyncModel.lock_token == item.lock_token,
                )
                .options(selectinload(ServiceRouterWorkerSyncModel.run))
            )
            sync_row = res.unique().scalar_one_or_none()
            if sync_row is None:
                log_lock_token_mismatch(logger, item)
                return
            run_model = sync_row.run
            if (
                run_model.deleted
                or run_model.status.is_finished()
                or run_model.status != RunStatus.RUNNING
                or not run_model_has_sglang_router_replica_group(run_model)
            ):
                # Nothing left to sync for this run: retire the row and unlock it.
                early_cleanup_update_map: _SyncRowUpdateMap = {"deleted": True}
                set_processed_update_map_fields(early_cleanup_update_map)
                set_unlock_update_map_fields(early_cleanup_update_map)
                now = get_current_datetime()
                resolve_now_placeholders(early_cleanup_update_map, now=now)
                await _update_sync_row_or_log_lock_token_changed(
                    session, item, early_cleanup_update_map
                )
                return

        async with get_session_ctx() as session:
            res = await session.execute(
                select(RunModel)
                .where(RunModel.id == item.run_id)
                .options(
                    load_only(RunModel.id, RunModel.run_spec),
                    # Only RUNNING jobs that are registered are loaded for the sync.
                    selectinload(
                        RunModel.jobs.and_(
                            JobModel.status == JobStatus.RUNNING,
                            JobModel.registered == True,
                        )
                    )
                    .load_only(
                        JobModel.id,
                        JobModel.status,
                        JobModel.registered,
                        JobModel.job_spec_data,
                        JobModel.job_provisioning_data,
                        JobModel.job_runtime_data,
                    )
                    .options(
                        joinedload(JobModel.project).load_only(
                            ProjectModel.id, ProjectModel.ssh_private_key
                        ),
                        joinedload(JobModel.instance)
                        .load_only(InstanceModel.id, InstanceModel.remote_connection_info)
                        .joinedload(InstanceModel.project)
                        .load_only(ProjectModel.id, ProjectModel.ssh_private_key),
                    ),
                )
            )
            run_for_sync = res.unique().scalar_one_or_none()

        if run_for_sync is None:
            # The run row no longer exists: retire the sync row.
            cleanup_update_map: _SyncRowUpdateMap = {"deleted": True}
            set_processed_update_map_fields(cleanup_update_map)
            set_unlock_update_map_fields(cleanup_update_map)
            async with get_session_ctx() as session:
                now = get_current_datetime()
                resolve_now_placeholders(cleanup_update_map, now=now)
                await _update_sync_row_or_log_lock_token_changed(session, item, cleanup_update_map)
            return

        # No database session is open while the sync itself runs.
        await sync_router_workers_for_run_model(run_for_sync)

        # Normal completion: record processing and release the lock.
        update_map: _SyncRowUpdateMap = {}
        set_processed_update_map_fields(update_map)
        set_unlock_update_map_fields(update_map)
        async with get_session_ctx() as session:
            now = get_current_datetime()
            resolve_now_placeholders(update_map, now=now)
            await _update_sync_row_or_log_lock_token_changed(session, item, update_map)
async def _update_sync_row_or_log_lock_token_changed(
    session,
    item: PipelineItem,
    update_map: ItemUpdateMap,
) -> None:
    """
    Apply `update_map` to the sync row, but only while `item` still owns it.

    The UPDATE matches on both the row id and the item's lock token. If no row
    comes back, the change of lock token is logged. The session is committed
    in either case.
    """
    statement = (
        update(ServiceRouterWorkerSyncModel)
        .where(
            ServiceRouterWorkerSyncModel.id == item.id,
            ServiceRouterWorkerSyncModel.lock_token == item.lock_token,
        )
        .values(**update_map)
        .returning(ServiceRouterWorkerSyncModel.id)
    )
    result = await session.execute(statement)
    updated_ids = list(result.scalars().all())
    if len(updated_ids) == 0:
        log_lock_token_changed_after_processing(logger, item)
    await session.commit()
update(ServiceRouterWorkerSyncModel) + .where( + ServiceRouterWorkerSyncModel.id == item.id, + ServiceRouterWorkerSyncModel.lock_token == item.lock_token, + ) + .values(**update_map) + .returning(ServiceRouterWorkerSyncModel.id) + ) + if not list(res.scalars().all()): + log_lock_token_changed_after_processing(logger, item) + await session.commit() diff --git a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py new file mode 100644 index 0000000000..cb8d6ae79d --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py @@ -0,0 +1,421 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Optional, Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.volumes import VolumeStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + UpdateMapDateTime, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + FleetModel, + InstanceModel, + ProjectModel, + UserModel, + VolumeAttachmentModel, + VolumeModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from 
logger = get_logger(__name__)


@dataclass
class VolumePipelineItem(PipelineItem):
    """Queue item for one locked `VolumeModel` row."""

    # Snapshot of the volume's status and deletion flag taken when it was fetched;
    # the worker dispatches on these values.
    status: VolumeStatus
    to_be_deleted: bool


class VolumePipeline(Pipeline[VolumePipelineItem]):
    """Pipeline that wires a heartbeater, a fetcher and a pool of workers for volumes."""

    def __init__(
        self,
        workers_num: int = 10,
        queue_lower_limit_factor: float = 0.5,
        queue_upper_limit_factor: float = 2.0,
        min_processing_interval: timedelta = timedelta(seconds=15),
        lock_timeout: timedelta = timedelta(seconds=30),
        heartbeat_trigger: timedelta = timedelta(seconds=15),
        *,
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        """
        Build the pipeline components.

        All tuning parameters are forwarded to `Pipeline.__init__`;
        `pipeline_hinter` is keyword-only and handed to every worker.
        """
        super().__init__(
            workers_num=workers_num,
            queue_lower_limit_factor=queue_lower_limit_factor,
            queue_upper_limit_factor=queue_upper_limit_factor,
            min_processing_interval=min_processing_interval,
            lock_timeout=lock_timeout,
            heartbeat_trigger=heartbeat_trigger,
        )
        # NOTE(review): `self._queue`, `self._lock_timeout`, etc. are presumably set by
        # `Pipeline.__init__` above — confirm against the base class.
        # Created first: the fetcher and workers below read it via `self._heartbeater`.
        self.__heartbeater = Heartbeater[VolumePipelineItem](
            model_type=VolumeModel,
            lock_timeout=self._lock_timeout,
            heartbeat_trigger=self._heartbeat_trigger,
        )
        self.__fetcher = VolumeFetcher(
            queue=self._queue,
            queue_desired_minsize=self._queue_desired_minsize,
            min_processing_interval=self._min_processing_interval,
            lock_timeout=self._lock_timeout,
            heartbeater=self._heartbeater,
        )
        # One worker instance per configured worker slot, all reading the same queue.
        self.__workers = [
            VolumeWorker(
                queue=self._queue,
                heartbeater=self._heartbeater,
                pipeline_hinter=pipeline_hinter,
            )
            for _ in range(self._workers_num)
        ]

    @property
    def hint_fetch_model_name(self) -> str:
        """Class name of the model this pipeline fetches."""
        return VolumeModel.__name__

    @property
    def _heartbeater(self) -> Heartbeater[VolumePipelineItem]:
        """The heartbeater created in `__init__`."""
        return self.__heartbeater

    @property
    def _fetcher(self) -> Fetcher[VolumePipelineItem]:
        """The fetcher created in `__init__`."""
        return self.__fetcher

    @property
    def _workers(self) -> Sequence["VolumeWorker"]:
        """The workers created in `__init__`."""
        return self.__workers
def _workers(self) -> Sequence["VolumeWorker"]: + return self.__workers + + +class VolumeFetcher(Fetcher[VolumePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[VolumePipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[VolumePipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_pipeline_task("VolumeFetcher.fetch") + async def fetch(self, limit: int) -> list[VolumePipelineItem]: + volume_lock, _ = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__) + async with volume_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(VolumeModel) + .where( + or_( + VolumeModel.status == VolumeStatus.SUBMITTED, + VolumeModel.to_be_deleted == True, + ), + VolumeModel.deleted == False, + or_( + VolumeModel.last_processed_at <= now - self._min_processing_interval, + VolumeModel.last_processed_at == VolumeModel.created_at, + ), + or_( + VolumeModel.lock_expires_at.is_(None), + VolumeModel.lock_expires_at < now, + ), + or_( + VolumeModel.lock_owner.is_(None), + VolumeModel.lock_owner == VolumePipeline.__name__, + ), + ) + .order_by(VolumeModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=VolumeModel) + .options( + load_only( + VolumeModel.id, + VolumeModel.lock_token, + VolumeModel.lock_expires_at, + VolumeModel.status, + VolumeModel.to_be_deleted, + ) + ) + ) + volume_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for volume_model in volume_models: + prev_lock_expired = 
class VolumeWorker(Worker[VolumePipelineItem]):
    """Worker that processes a single locked volume item taken from the pipeline queue."""

    def __init__(
        self,
        queue: asyncio.Queue[VolumePipelineItem],
        heartbeater: Heartbeater[VolumePipelineItem],
        pipeline_hinter: PipelineHinterProtocol,
    ) -> None:
        super().__init__(queue=queue, heartbeater=heartbeater, pipeline_hinter=pipeline_hinter)

    @sentry_utils.instrument_pipeline_task("VolumeWorker.process")
    async def process(self, item: VolumePipelineItem):
        """
        Process one volume item.

        The row is refetched by id and lock token first. If no row matches, the
        mismatch is logged and nothing else happens. Items flagged `to_be_deleted`
        go through deletion, items in `SUBMITTED` status go through provisioning,
        and any other item is ignored.
        """
        refetched = await _refetch_locked_volume(item)
        if refetched is None:
            log_lock_token_mismatch(logger, item)
            return

        # Deletion takes priority over the status-based branch.
        if item.to_be_deleted:
            handler = _process_to_be_deleted_volume
        elif item.status == VolumeStatus.SUBMITTED:
            handler = _process_submitted_volume
        else:
            return

        outcome = await handler(refetched)
        await _apply_process_result(item=item, volume_model=refetched, result=outcome)
async def _apply_process_result(
    item: VolumePipelineItem,
    volume_model: VolumeModel,
    result: "_ProcessResult",
):
    """
    Persist the outcome of processing a volume and emit the matching event.

    The update only applies while the row still carries the lock token of
    `volume_model`. If no row is updated, the change of lock token is logged
    and no event is emitted.
    """
    fields = _VolumeUpdateMap()
    fields.update(result.update_map)
    set_processed_update_map_fields(fields)
    set_unlock_update_map_fields(fields)

    async with get_session_ctx() as session:
        resolve_now_placeholders(fields, now=get_current_datetime())
        stmt = (
            update(VolumeModel)
            .where(
                VolumeModel.id == volume_model.id,
                VolumeModel.lock_token == volume_model.lock_token,
            )
            .values(**fields)
            .returning(VolumeModel.id)
        )
        res = await session.execute(stmt)
        if not list(res.scalars().all()):
            log_lock_token_changed_after_processing(logger, item)
            # TODO: Clean up volume when item.status == VolumeStatus.SUBMITTED.
            return

        if not item.to_be_deleted:
            emit_volume_status_change_event(
                session=session,
                volume_model=volume_model,
                old_status=volume_model.status,
                new_status=fields.get("status", volume_model.status),
                status_message=fields.get("status_message", volume_model.status_message),
            )
            return

        events.emit(
            session,
            "Volume deleted",
            actor=events.SystemActor(),
            targets=[events.Target.from_model(volume_model)],
        )
async def _process_to_be_deleted_volume(volume_model: VolumeModel) -> _ProcessResult:
    """
    Delete the backend resource behind a volume marked `to_be_deleted`.

    Deletion is best-effort: every path returns the "deleted" result, and
    failures are only logged so that the resource can be cleaned up manually.

    Args:
        volume_model: The volume row being processed. Its `project` is passed
            to the backend lookup.

    Returns:
        The result produced by `_get_deleted_result()`.
    """
    volume = volume_model_to_volume(volume_model)
    if volume.external:
        # External volumes are marked deleted without calling the backend
        return _get_deleted_result()
    if volume.provisioning_data is None:
        # The volume wasn't provisioned so there is nothing to delete
        return _get_deleted_result()
    backend_type = volume.provisioning_data.backend
    if backend_type is None:
        logger.error(
            "Failed to delete volume %s. volume.provisioning_data.backend is None.",
            volume_model.name,
        )
        return _get_deleted_result()
    try:
        backend = await backends_services.get_project_backend_by_type_or_error(
            project=volume_model.project,
            backend_type=backend_type,
        )
    except BackendNotAvailable:
        # TODO: Retry deletion
        # Report the backend that was actually looked up (from provisioning data),
        # not the one from the volume configuration.
        logger.error(
            "Failed to delete volume %s. Backend %s not available."
            " Please terminate it manually to avoid unexpected charges.",
            volume_model.name,
            backend_type,
        )
        return _get_deleted_result()

    compute = backend.compute()
    assert isinstance(compute, ComputeWithVolumeSupport)
    try:
        await run_async(
            compute.delete_volume,
            volume=volume,
        )
    except Exception:
        # TODO: Retry deletion
        logger.exception(
            "Got exception when deleting volume %s. Please terminate it manually to avoid unexpected charges.",
            volume.name,
        )
    return _get_deleted_result()
def start_scheduled_tasks() -> AsyncIOScheduler:
    """
    Start periodic tasks triggered by `apscheduler` at specific times/intervals.
    Suitable for tasks that run infrequently and don't need to lock rows for a long time.

    All jobs are registered on the module-level `_scheduler`, which is then started.
    The Prometheus metrics jobs are registered only when
    `settings.ENABLE_PROMETHEUS_METRICS` is truthy.

    Returns:
        The started module-level scheduler (the same object `get_scheduler()` returns).
    """
    # DateTrigger() to run one-time init tasks immediately.
    _scheduler.add_job(init_gateways_in_background, DateTrigger(), max_instances=1)
    # Pre-load catalog offers both on server start and before catalog needs reload (15m).
    _scheduler.add_job(preload_offers_catalog, DateTrigger(), max_instances=1)
    _scheduler.add_job(preload_offers_catalog, IntervalTrigger(minutes=10), max_instances=1)
    _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
    _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
    _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
    _scheduler.add_job(delete_events, IntervalTrigger(minutes=7), max_instances=1)
    _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
    _scheduler.add_job(
        process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
    )
    _scheduler.add_job(delete_instance_healthchecks, IntervalTrigger(minutes=5), max_instances=1)
    # Prometheus metrics collection and cleanup are opt-in via server settings.
    if settings.ENABLE_PROMETHEUS_METRICS:
        _scheduler.add_job(
            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
        )
        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    _scheduler.start()
    return _scheduler
async def _remove_inactive_connections():
    """
    Drop pooled gateway connections whose IP no longer belongs to an active gateway compute.

    Reads the IP addresses of all `GatewayComputeModel` rows with `active == True`,
    then removes from `gateway_connections_pool` every connection whose
    `ip_address` is not in that set.
    """
    async with get_session_ctx() as session:
        res = await session.execute(
            select(GatewayComputeModel.ip_address).where(GatewayComputeModel.active == True)
        )
        # A set gives constant-time membership checks in the loop below
        active_connection_ips = set(res.scalars().all())
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed source;
    # the loop is assumed to run after the session context exits - confirm.
    for conn in await gateway_connections_pool.all():
        if conn.ip_address not in active_connection_ips:
            await gateway_connections_pool.remove(conn.ip_address)
@sentry_utils.instrument_scheduled_task
async def process_idle_volumes():
    """
    Mark idle volumes for deletion.

    Selects up to 10 active, non-deleted volumes that are not locked by a
    pipeline (`lock_expires_at` is NULL), are not in the in-memory lockset and
    do not have auto cleanup disabled, ordered by `last_processed_at`. The
    selected ids are reserved in the lockset, the volumes are refetched with
    their relationships, and those accepted by `_should_delete_volume` are
    passed to `_delete_idle_volumes`.
    """
    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
    async with get_session_ctx() as session:
        async with lock:
            res = await session.execute(
                select(VolumeModel.id)
                .where(
                    VolumeModel.status == VolumeStatus.ACTIVE,
                    VolumeModel.auto_cleanup_enabled.is_not(False),
                    VolumeModel.deleted == False,
                    VolumeModel.lock_expires_at.is_(None),
                    VolumeModel.id.not_in(lockset),
                )
                .order_by(VolumeModel.last_processed_at.asc())
                .limit(10)
                .with_for_update(skip_locked=True, key_share=True)
            )
            volume_ids = list(res.scalars().all())
            if not volume_ids:
                return
            lockset.update(volume_ids)

        # Everything after the ids are reserved runs under try/finally, so a failing
        # refetch cannot leave the ids stuck in the in-memory lockset.
        try:
            res = await session.execute(
                select(VolumeModel)
                .where(VolumeModel.id.in_(volume_ids))
                .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
                .options(joinedload(VolumeModel.user).load_only(UserModel.name))
                .options(joinedload(VolumeModel.attachments))
                .execution_options(populate_existing=True)
            )
            volume_models = list(res.unique().scalars().all())
            volumes_to_delete = [v for v in volume_models if _should_delete_volume(v)]
            if not volumes_to_delete:
                return
            await _delete_idle_volumes(session, volumes_to_delete)
        finally:
            lockset.difference_update(volume_ids)
@sentry_utils.instrument_scheduled_task
async def delete_instance_healthchecks():
    """Delete instance health check rows collected before the configured TTL cutoff."""
    ttl = timedelta(seconds=settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS)
    cutoff = get_current_datetime() - ttl
    stmt = delete(InstanceHealthCheckModel).where(InstanceHealthCheckModel.collected_at < cutoff)
    async with get_session_ctx() as session:
        await session.execute(stmt)
        await session.commit()
@sentry_utils.instrument_scheduled_task
async def collect_metrics():
    """
    Collect metrics for running jobs.

    Loads up to `MAX_JOBS_FETCHED` jobs in `RUNNING` status, least recently
    processed first, and passes them to `_collect_jobs_metrics` in batches of
    `BATCH_SIZE`.
    """
    async with get_session_ctx() as session:
        res = await session.execute(
            select(JobModel)
            .where(JobModel.status.in_([JobStatus.RUNNING]))
            .options(
                # Only the project's SSH private key column is loaded from the project
                joinedload(JobModel.instance)
                .joinedload(InstanceModel.project)
                .load_only(ProjectModel.ssh_private_key)
            )
            .order_by(JobModel.last_processed_at.asc())
            .limit(MAX_JOBS_FETCHED)
        )
        job_models = res.unique().scalars().all()

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed source;
    # the batch loop is assumed to run after the session context exits - confirm.
    for batch in batched(job_models, BATCH_SIZE):
        await _collect_jobs_metrics(batch)
async def _filter_recently_collected_jobs(job_models: List[JobModel]) -> List[JobModel]:
    """
    Return the jobs that have no metric point newer than the recent-collection cutoff.

    Args:
        job_models: Candidate jobs for metrics collection.

    Returns:
        The subset of `job_models`, in the original order, with no
        `JobMetricsPoint` whose `timestamp_micro` is greater than
        `_get_recently_collected_metric_cutoff()`.
    """
    # Skip metrics collection if another replica collected it recently.
    # Two replicas can still collect metrics simultaneously – that's fine since
    # we'll just store some extra metric points in the db.
    if not job_models:
        # Nothing to filter; avoid issuing a query with an empty IN list
        return []
    async with get_session_ctx() as session:
        res = await session.execute(
            # Only the job ids are needed, so don't load full metric rows
            select(JobMetricsPoint.job_id)
            .where(
                JobMetricsPoint.job_id.in_([j.id for j in job_models]),
                JobMetricsPoint.timestamp_micro > _get_recently_collected_metric_cutoff(),
            )
            .distinct()
        )
        recent_job_ids = set(res.scalars().all())
    return [j for j in job_models if j.id not in recent_job_ids]
async def preload_offers_catalog():
    """Pre-load the `gpuhunt` offers catalog so the get offer requests do not pay the catalog download cost."""
    logger.debug("Pre-loading offers catalog")
    # `load` is handed to `run_async` rather than called directly here
    await run_async(gpuhunt.default_catalog().load)
    logger.debug("Pre-loaded offers catalog")
AsyncIOScheduler +from sqlalchemy import select, update +from sqlalchemy.orm import joinedload + +from dstack._internal.core.errors import SSHError +from dstack._internal.core.models.runs import JobStatus, ProbeSpec +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import InstanceModel, JobModel, ProbeModel +from dstack._internal.server.services.jobs import get_job_spec +from dstack._internal.server.services.jobs.job_replica_http_client import ( + SSH_CONNECT_TIMEOUT, + get_service_replica_client, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) +BATCH_SIZE = 100 +PROCESSING_OVERHEAD_TIMEOUT = timedelta(minutes=1) +PROBES_SCHEDULER = AsyncIOScheduler() + + +async def process_probes(): + probe_lock, probe_lockset = get_locker(get_db().dialect_name).get_lockset( + ProbeModel.__tablename__ + ) + async with get_session_ctx() as session: + async with probe_lock: + res = await session.execute( + select(ProbeModel.id) + .where(ProbeModel.id.not_in(probe_lockset)) + .where(ProbeModel.active == True) + .where(ProbeModel.due <= get_current_datetime()) + .order_by(ProbeModel.due.asc()) + .limit(BATCH_SIZE) + .with_for_update(skip_locked=True, key_share=True) + ) + probe_ids = res.unique().scalars().all() + probe_lockset.update(probe_ids) + + try: + # Refetch to load all attributes. + # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE. 
+ res = await session.execute( + select(ProbeModel) + .where(ProbeModel.id.in_(probe_ids)) + .options( + joinedload(ProbeModel.job) + .joinedload(JobModel.instance) + .joinedload(InstanceModel.project) + ) + .options(joinedload(ProbeModel.job).joinedload(JobModel.project)) + .execution_options(populate_existing=True) + ) + probes = res.unique().scalars().all() + for probe in probes: + if probe.job.status != JobStatus.RUNNING: + probe.active = False + else: + job_spec = get_job_spec(probe.job) + probe_spec = job_spec.probes[probe.probe_num] + if probe_spec.until_ready and probe.success_streak >= probe_spec.ready_after: + probe.active = False + else: + # Schedule the next probe execution in case this execution is interrupted + probe.due = get_current_datetime() + _get_probe_async_processing_timeout( + probe_spec + ) + # Execute the probe asynchronously outside of the DB session + PROBES_SCHEDULER.add_job(partial(_process_probe_async, probe, probe_spec)) + await session.commit() + finally: + probe_lockset.difference_update(probe_ids) + + +async def _process_probe_async(probe: ProbeModel, probe_spec: ProbeSpec) -> None: + start = get_current_datetime() + logger.debug("%s: processing probe", fmt(probe)) + success = await _execute_probe(probe, probe_spec) + + async with get_session_ctx() as session: + async with get_locker(get_db().dialect_name).lock_ctx( + ProbeModel.__tablename__, [probe.id] + ): + await session.execute( + update(ProbeModel) + .where(ProbeModel.id == probe.id) + .values( + success_streak=0 if not success else ProbeModel.success_streak + 1, + due=get_current_datetime() + timedelta(seconds=probe_spec.interval), + ) + ) + logger.debug( + "%s: probe processing took %ss", + fmt(probe), + (get_current_datetime() - start).total_seconds(), + ) + + +async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool: + """ + Returns: + Whether probe execution was successful. 
+ """ + + try: + async with get_service_replica_client(probe.job) as client: + resp = await client.request( + method=probe_spec.method, + url="http://dstack" + probe_spec.url, + headers=[(h.name, h.value) for h in probe_spec.headers], + content=probe_spec.body, + timeout=probe_spec.timeout, + follow_redirects=False, + ) + logger.debug("%s: probe status code: %s", fmt(probe), resp.status_code) + return resp.is_success + except (SSHError, httpx.RequestError) as e: + logger.debug("%s: probe failed: %r", fmt(probe), e) + return False + + +def _get_probe_async_processing_timeout(probe_spec: ProbeSpec) -> timedelta: + return ( + timedelta(seconds=probe_spec.timeout) + + SSH_CONNECT_TIMEOUT + + PROCESSING_OVERHEAD_TIMEOUT # slow db queries and other unforeseen conditions + ) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py b/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py new file mode 100644 index 0000000000..96b8cb7742 --- /dev/null +++ b/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py @@ -0,0 +1,153 @@ +import uuid +from collections.abc import Mapping +from datetime import datetime, timedelta +from typing import Optional + +import sqlalchemy.exc +from sqlalchemy import delete, or_, select, update +from sqlalchemy.orm import joinedload + +from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import ( + InstanceModel, + JobModel, + JobPrometheusMetrics, + ProjectModel, +) +from dstack._internal.server.services.instances import get_instance_ssh_private_keys +from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data +from dstack._internal.server.services.runner import client +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from 
dstack._internal.server.utils import sentry_utils +from dstack._internal.server.utils.common import gather_map_async +from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +MAX_JOBS_FETCHED = 100 +BATCH_SIZE = 10 +MIN_COLLECT_INTERVAL_SECONDS = 9 +# 10 minutes should be more than enough to scrape metrics, and, in any case, +# 10 minutes old metrics has little to no value +METRICS_TTL_SECONDS = 600 + + +@sentry_utils.instrument_scheduled_task +async def collect_prometheus_metrics(): + now = get_current_datetime() + cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS) + async with get_session_ctx() as session: + res = await session.execute( + select(JobModel) + .join(JobPrometheusMetrics, isouter=True) + .where( + JobModel.status.in_([JobStatus.RUNNING]), + or_( + JobPrometheusMetrics.job_id.is_(None), + JobPrometheusMetrics.collected_at < cutoff, + ), + ) + .options( + joinedload(JobModel.instance) + .joinedload(InstanceModel.project) + .load_only(ProjectModel.ssh_private_key) + ) + .order_by(JobModel.last_processed_at.asc()) + .limit(MAX_JOBS_FETCHED) + ) + job_models = res.unique().scalars().all() + for batch in batched(job_models, BATCH_SIZE): + await _collect_jobs_metrics(batch, now) + + +@sentry_utils.instrument_scheduled_task +async def delete_prometheus_metrics(): + now = get_current_datetime() + cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS) + async with get_session_ctx() as session: + await session.execute( + delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff) + ) + await session.commit() + + +async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime): + results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True) + async with get_session_ctx() as session: + for job_model, result in results: + if result is None: + continue + if 
isinstance(result, BaseException): + logger.error( + "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result + ) + continue + res = await session.execute( + update(JobPrometheusMetrics) + .where(JobPrometheusMetrics.job_id == job_model.id) + .values( + collected_at=collected_at, + text=result, + ) + .returning(JobPrometheusMetrics) + ) + metrics = res.scalar() + if metrics is None: + metrics = JobPrometheusMetrics( + job_id=job_model.id, + collected_at=collected_at, + text=result, + ) + try: + async with session.begin_nested(): + session.add(metrics) + except sqlalchemy.exc.IntegrityError: + # Concurrent server replica already committed, ignoring + pass + await session.commit() + + +async def _collect_job_metrics(job_model: JobModel) -> Optional[str]: + jpd = get_job_provisioning_data(job_model) + if jpd is None: + return None + if not jpd.dockerized: + # Container-based backend, no shim + return None + ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance)) + jrd = get_job_runtime_data(job_model) + try: + res = await run_async( + _pull_job_metrics, + ssh_private_keys, + jpd, + jrd, + job_model.id, + ) + except Exception: + logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name) + return None + + if isinstance(res, bool): + logger.warning( + "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name + ) + return None + + if res is None: + # Either not supported by shim or exporter is not available + return None + + return res + + +@runner_ssh_tunnel +def _pull_job_metrics( + addresses: Mapping[int, client.LocalAddress], task_id: uuid.UUID +) -> Optional[str]: + shim_client = client.ShimClient.from_address(addresses[DSTACK_SHIM_HTTP_PORT]) + return shim_client.get_task_metrics(task_id) diff --git a/src/dstack/_internal/server/background/tasks/process_gateways.py b/src/dstack/_internal/server/background/tasks/process_gateways.py deleted file mode 100644 index 
4007bb19a4..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_gateways.py +++ /dev/null @@ -1,154 +0,0 @@ -import asyncio -from uuid import UUID - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -from dstack._internal.core.errors import BackendError, BackendNotAvailable, SSHError -from dstack._internal.core.models.gateways import GatewayStatus -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import GatewayModel -from dstack._internal.server.services import backends as backends_services -from dstack._internal.server.services import gateways as gateways_services -from dstack._internal.server.services.gateways import ( - PROCESSING_GATEWAYS_IDS, - PROCESSING_GATEWAYS_LOCK, - GatewayConnection, - create_gateway_compute, - gateway_connections_pool, -) -from dstack._internal.utils.common import get_current_datetime -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def process_gateways_connections(): - # TODO(egor-s): distribute the load evenly - connections = await gateway_connections_pool.all() - await asyncio.gather(*(_process_connection(conn) for conn in connections)) - - -async def process_submitted_gateways(): - async with get_session_ctx() as session: - async with PROCESSING_GATEWAYS_LOCK: - res = await session.execute( - select(GatewayModel) - .where( - GatewayModel.status == GatewayStatus.SUBMITTED, - GatewayModel.id.not_in(PROCESSING_GATEWAYS_IDS), - ) - .order_by(GatewayModel.last_processed_at.asc()) - .limit(1) - ) - gateway_model = res.scalar() - if gateway_model is None: - return - - PROCESSING_GATEWAYS_IDS.add(gateway_model.id) - - try: - await _process_gateway(gateway_id=gateway_model.id) - finally: - PROCESSING_GATEWAYS_IDS.remove(gateway_model.id) - - -async def _process_connection(conn: GatewayConnection): - try: - await conn.check_or_restart() - await 
conn.try_collect_stats() - except SSHError as e: - logger.error("Connection to gateway %s failed: %s", conn.ip_address, e) - - -async def _process_gateway(gateway_id: UUID): - async with get_session_ctx() as session: - res = await session.execute( - select(GatewayModel) - .where(GatewayModel.id == gateway_id) - .options(joinedload(GatewayModel.project)) - ) - gateway_model = res.scalar_one() - await _process_submitted_gateway( - session=session, - gateway_model=gateway_model, - ) - - -async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel): - logger.info("Started gateway %s provisioning", gateway_model.name) - configuration = gateways_services.get_gateway_configuration(gateway_model) - try: - ( - backend_model, - backend, - ) = await backends_services.get_project_backend_with_model_by_type_or_error( - project=gateway_model.project, backend_type=configuration.backend - ) - except BackendNotAvailable: - gateway_model.status = GatewayStatus.FAILED - gateway_model.status_message = "Backend not available" - gateway_model.last_processed_at = get_current_datetime() - await session.commit() - return - - try: - gateway_model.gateway_compute = await create_gateway_compute( - backend_compute=backend.compute(), - project_name=gateway_model.project.name, - configuration=configuration, - backend_id=backend_model.id, - ) - session.add(gateway_model) - gateway_model.status = GatewayStatus.PROVISIONING - await session.commit() - await session.refresh(gateway_model) - except BackendError as e: - logger.info( - "Failed to create gateway compute for gateway %s: %s", gateway_model.name, repr(e) - ) - gateway_model.status = GatewayStatus.FAILED - status_message = f"Backend error: {repr(e)}" - if len(e.args) > 0: - status_message = str(e.args[0]) - gateway_model.status_message = status_message - gateway_model.last_processed_at = get_current_datetime() - await session.commit() - return - except Exception as e: - logger.exception( - "Got exception when 
creating gateway compute for gateway %s", gateway_model.name - ) - gateway_model.status = GatewayStatus.FAILED - gateway_model.status_message = f"Unexpected error: {repr(e)}" - gateway_model.last_processed_at = get_current_datetime() - await session.commit() - return - - connection = await gateways_services.connect_to_gateway_with_retry( - gateway_model.gateway_compute - ) - if connection is None: - gateway_model.status = GatewayStatus.FAILED - gateway_model.status_message = "Failed to connect to gateway" - gateway_model.last_processed_at = get_current_datetime() - gateway_model.gateway_compute.deleted = True - await session.commit() - return - - try: - await gateways_services.configure_gateway(connection) - except Exception: - logger.exception("Failed to configure gateway %s", gateway_model.name) - gateway_model.status = GatewayStatus.FAILED - gateway_model.status_message = "Failed to configure gateway" - gateway_model.last_processed_at = get_current_datetime() - await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address) - gateway_model.gateway_compute.active = False - await session.commit() - return - - gateway_model.status = GatewayStatus.RUNNING - gateway_model.last_processed_at = get_current_datetime() - await session.commit() diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py deleted file mode 100644 index 9751dd33b9..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ /dev/null @@ -1,811 +0,0 @@ -import asyncio -import datetime -from datetime import timedelta -from typing import Any, Dict, List, Optional, Tuple, Union, cast -from uuid import UUID - -import requests -from paramiko.pkey import PKey -from pydantic import ValidationError -from sqlalchemy import select -from sqlalchemy.orm import joinedload - -from dstack._internal import settings -from dstack._internal.core.backends import 
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT -from dstack._internal.core.backends.base.compute import ( - DSTACK_WORKING_DIR, - get_dstack_runner_version, - get_shim_env, - get_shim_pre_start_commands, -) -from dstack._internal.core.backends.remote.provisioning import ( - get_host_info, - get_paramiko_connection, - get_shim_healthcheck, - host_info_to_instance_type, - run_pre_start_commands, - run_shim_as_systemd_service, - upload_envs, -) -from dstack._internal.core.errors import BackendError, ProvisioningError -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, - InstanceRuntime, - RemoteConnectionInfo, -) -from dstack._internal.core.models.profiles import ( - Profile, - RetryEvent, - TerminationPolicy, -) -from dstack._internal.core.models.runs import ( - InstanceStatus, - JobProvisioningData, - Requirements, - Retry, -) -from dstack._internal.core.services.profiles import get_retry -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import InstanceModel, ProjectModel -from dstack._internal.server.schemas.runner import HealthcheckResponse -from dstack._internal.server.services import backends as backends_services -from dstack._internal.server.services.jobs import ( - PROCESSING_POOL_IDS, - PROCESSING_POOL_LOCK, - terminate_job_provisioning_data_instance, -) -from dstack._internal.server.services.pools import get_instance_provisioning_data -from dstack._internal.server.services.runner import client as runner_client -from dstack._internal.server.services.runner.client import HealthStatus -from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel -from dstack._internal.server.services.runs import get_create_instance_offers -from dstack._internal.server.utils.common import run_async -from dstack._internal.utils.common import get_current_datetime -from 
dstack._internal.utils.logging import get_logger -from dstack._internal.utils.network import get_ip_from_network -from dstack._internal.utils.ssh import ( - rsa_pkey_from_str, -) - -PENDING_JOB_RETRY_INTERVAL = timedelta(seconds=60) - -TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20) - -PROVISIONING_TIMEOUT_SECONDS = 10 * 60 # 10 minutes in seconds - - -logger = get_logger(__name__) - - -async def process_instances() -> None: - async with get_session_ctx() as session: - async with PROCESSING_POOL_LOCK: - res = await session.scalars( - select(InstanceModel).where( - InstanceModel.status.in_( - [ - InstanceStatus.PENDING, - InstanceStatus.PROVISIONING, - InstanceStatus.BUSY, - InstanceStatus.IDLE, - InstanceStatus.TERMINATING, - ] - ), - InstanceModel.id.not_in(PROCESSING_POOL_IDS), - ) - ) - instances = res.all() - if not instances: - return - - unprocessed_instances_ids = set(i.id for i in instances) - PROCESSING_POOL_IDS.update(unprocessed_instances_ids) - - try: - futures = [process_instance(i) for i in instances] - for future in asyncio.as_completed(futures): - instance_id = await future - PROCESSING_POOL_IDS.remove(instance_id) - unprocessed_instances_ids.remove(instance_id) - finally: - PROCESSING_POOL_IDS.difference_update(unprocessed_instances_ids) - - -async def process_instance(instance: InstanceModel) -> UUID: - if ( - instance.status == InstanceStatus.IDLE - and instance.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE - and instance.job_id is None - ): - await terminate_idle_instance(instance.id) - - if instance.status == InstanceStatus.PENDING and instance.remote_connection_info is not None: - await add_remote(instance.id) - - if instance.status == InstanceStatus.PENDING and instance.remote_connection_info is None: - await create_instance(instance.id) - - if instance.status in ( - InstanceStatus.PROVISIONING, - InstanceStatus.IDLE, - InstanceStatus.BUSY, - ): - await check_instance(instance.id) - - if instance.status == 
InstanceStatus.TERMINATING: - await terminate(instance.id) - - return instance.id - - -def deploy_instance( - remote_details: RemoteConnectionInfo, pkeys: List[PKey] -) -> Tuple[HealthStatus, Dict[str, Any]]: - with get_paramiko_connection( - remote_details.ssh_user, remote_details.host, remote_details.port, pkeys - ) as client: - logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}") - - runner_build = get_dstack_runner_version() - - # Execute pre start commands - shim_pre_start_commands = get_shim_pre_start_commands(runner_build) - run_pre_start_commands( - client, - shim_pre_start_commands, - authorized_keys=[pk.public.strip() for pk in remote_details.ssh_keys], - ) - logger.debug("The script for installing dstack has been executed") - - # Upload envs - shim_envs = get_shim_env( - runner_build, authorized_keys=[sk.public for sk in remote_details.ssh_keys] - ) - upload_envs(client, DSTACK_WORKING_DIR, shim_envs) - logger.debug("The dstack-shim environment variables have been installed") - - # Run dstack-shim as a systemd service - run_shim_as_systemd_service( - client=client, - working_dir=DSTACK_WORKING_DIR, - dev=settings.DSTACK_VERSION is None, - ) - - # Get host info - host_info = get_host_info(client, DSTACK_WORKING_DIR) - logger.debug("Received a host_info %s", host_info) - - raw_health = get_shim_healthcheck(client) - try: - health_response = HealthcheckResponse.__response__.parse_raw(raw_health) - except ValueError as e: - raise ProvisioningError("Cannot read HealthcheckResponse") from e - health = runner_client.health_response_to_health_status(health_response) - - return health, host_info - - -async def add_remote(instance_id: UUID) -> None: - async with get_session_ctx() as session: - instance = ( - await session.scalars( - select(InstanceModel) - .where(InstanceModel.id == instance_id) - .options(joinedload(InstanceModel.project)) - ) - ).one() - - logger.debug("Adding remote instance %s...", instance.name) - - if instance.status 
== InstanceStatus.PENDING: - instance.status = InstanceStatus.PROVISIONING - await session.commit() - - retry_duration_deadline = instance.created_at.replace( - tzinfo=datetime.timezone.utc - ) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS) - if retry_duration_deadline < get_current_datetime(): - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = "The proivisioning timeout expired" - await session.commit() - logger.warning( - "Failed to start the instance in %s seconds. Terminate instance %s", - PROVISIONING_TIMEOUT_SECONDS, - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - return - - try: - remote_details = RemoteConnectionInfo.parse_raw( - cast(str, instance.remote_connection_info) - ) - - # Prepare connection key - pkeys = [ - rsa_pkey_from_str(sk.private) - for sk in remote_details.ssh_keys - if sk.private is not None - ] - if not pkeys: - logger.error("There are no ssh private key") - raise ProvisioningError("The SSH private key is not provided") - - try: - future = asyncio.get_running_loop().run_in_executor( - None, deploy_instance, remote_details, pkeys - ) - deploy_timeout = 20 * 60 # 20 minutes - result = await asyncio.wait_for(future, timeout=deploy_timeout) - health, host_info = result - except (asyncio.TimeoutError, TimeoutError) as e: - raise ProvisioningError(f"Deploy timeout {e}") from e - except Exception as e: - logger.debug("deploy_instance raise an error: %s", e) - raise ProvisioningError(f"Deploy instance raise an error {e}") from e - else: - logger.info( - "The instance %s (%s) was successfully added", - instance.name, - remote_details.host, - ) - - except ProvisioningError as e: - logger.warning( - "Provisioning the instance '%s' could not be completed because of the error: %s", - instance.name, - e, - ) - instance.status = InstanceStatus.PENDING - 
instance.last_retry_at = get_current_datetime() - await session.commit() - return - - instance_type = host_info_to_instance_type(host_info) - - instance_network = None - try: - default_jpd = JobProvisioningData.__response__.parse_raw( - instance.job_provisioning_data - ) - instance_network = default_jpd.instance_network - except ValidationError: - pass - - internal_ip = get_ip_from_network( - network=instance_network, - addresses=host_info.get("addresses", []), - ) - if instance_network is not None and internal_ip is None: - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = ( - "Unable to locate the internal ip-address for the given network" - ) - await session.commit() - logger.warning( - "Failed to configure internal ip-address on instance %s. Terminate it", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - return - - region = instance.region - - jpd = JobProvisioningData( - backend=BackendType.REMOTE, - instance_type=instance_type, - instance_id="instance_id", - hostname=remote_details.host, - region=region, - price=0, - internal_ip=internal_ip, - instance_network=instance_network, - username=remote_details.ssh_user, - ssh_port=22, - dockerized=True, - backend_data=None, - ssh_proxy=None, - ) - - instance.status = InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING - instance.backend = BackendType.REMOTE - - instance_offer = InstanceOfferWithAvailability( - backend=BackendType.REMOTE, - instance=instance_type, - region=region, - price=0, - availability=InstanceAvailability.AVAILABLE, - instance_runtime=InstanceRuntime.SHIM, - ) - - instance.price = 0 - instance.offer = instance_offer.json() - instance.job_provisioning_data = jpd.json() - - instance.started_at = get_current_datetime() - instance.last_retry_at = get_current_datetime() - - await session.commit() - - -async def 
create_instance(instance_id: UUID) -> None: - async with get_session_ctx() as session: - instance = ( - await session.scalars( - select(InstanceModel) - .where(InstanceModel.id == instance_id) - .options(joinedload(InstanceModel.project)) - ) - ).one() - - if instance.last_retry_at is not None: - last_retry = instance.last_retry_at.replace(tzinfo=datetime.timezone.utc) - if get_current_datetime() < last_retry + timedelta(minutes=1): - return - - if ( - instance.profile is None - or instance.requirements is None - or instance.instance_configuration is None - ): - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = "Empty profile, requirements or instance_configuration" - instance.last_retry_at = get_current_datetime() - await session.commit() - logger.warning( - "Empty profile, requirements or instance_configuration. Terminate instance: %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - return - - try: - profile: Profile = Profile.__response__.parse_raw(instance.profile) - requirements: Requirements = Requirements.__response__.parse_raw(instance.requirements) - instance_configuration: InstanceConfiguration = ( - InstanceConfiguration.__response__.parse_raw(instance.instance_configuration) - ) - except ValidationError as e: - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = ( - f"Error to parse profile, requirements or instance_configuration: {e}" - ) - instance.last_retry_at = get_current_datetime() - logger.warning( - "Error to parse profile, requirements or instance_configuration. 
Terminate instance: %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - await session.commit() - return - - retry = get_retry(profile) - should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events - - if retry is not None: - retry_duration_deadline = _get_retry_duration_deadline(instance, retry) - if get_current_datetime() > retry_duration_deadline: - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = "Retry duration expired" - await session.commit() - logger.warning( - "Retry duration expired. Terminate instance %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - return - - offers = await get_create_instance_offers( - project=instance.project, - profile=profile, - requirements=requirements, - exclude_not_available=True, - ) - - if not offers and should_retry: - instance.last_retry_at = get_current_datetime() - await session.commit() - logger.debug( - "No offers for instance %s. 
Next retry", - instance.name, - extra={"instance_name": instance.name}, - ) - return - - for backend, instance_offer in offers: - if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT: - continue - logger.debug( - "Trying %s in %s/%s for $%0.4f per hour", - instance_offer.instance.name, - instance_offer.backend.value, - instance_offer.region, - instance_offer.price, - ) - try: - job_provisioning_data = await run_async( - backend.compute().create_instance, - instance_offer, - instance_configuration, - ) - except BackendError as e: - logger.warning( - "%s launch in %s/%s failed: %s", - instance_offer.instance.name, - instance_offer.backend.value, - instance_offer.region, - repr(e), - extra={"instance_name": instance.name}, - ) - continue - except NotImplementedError: - # skip a backend without create_instance support, continue with next backend and offer - continue - - instance.status = InstanceStatus.PROVISIONING - instance.backend = backend.TYPE - instance.region = instance_offer.region - instance.price = instance_offer.price - instance.job_provisioning_data = job_provisioning_data.json() - instance.offer = instance_offer.json() - instance.started_at = get_current_datetime() - instance.last_retry_at = get_current_datetime() - - logger.info( - "Created instance %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.PROVISIONING.value, - }, - ) - await session.commit() - return - - instance.last_retry_at = get_current_datetime() - - if not should_retry: - instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = "No offers found" - logger.info( - "No offers found. 
Terminated instance %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - - await session.commit() - - -async def check_instance(instance_id: UUID) -> None: - async with get_session_ctx() as session: - instance = ( - await session.scalars( - select(InstanceModel) - .where(InstanceModel.id == instance_id) - .options(joinedload(InstanceModel.project)) - ) - ).one() - - job_provisioning_data = JobProvisioningData.__response__.parse_raw( - instance.job_provisioning_data - ) - - if job_provisioning_data.hostname is None: - await wait_for_instance_provisioning_data( - project=instance.project, - instance=instance, - job_provisioning_data=job_provisioning_data, - ) - await session.commit() - return - - if not job_provisioning_data.dockerized: - return - - ssh_private_key = instance.project.ssh_private_key - if instance.remote_connection_info is not None: - remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw( - instance.remote_connection_info - ) - ssh_private_key = remote_conn_info.ssh_keys[0].private - - # May return False if fails to establish ssh connection - health_status_response: Union[Optional[HealthStatus], bool] = await run_async( - instance_healthcheck, ssh_private_key, job_provisioning_data - ) - if isinstance(health_status_response, bool) or health_status_response is None: - health_status = HealthStatus(healthy=False, reason="SSH or tunnel error") - else: - health_status = health_status_response - - logger.debug( - "Check instance %s status. 
shim health: %s", - instance.name, - health_status, - extra={"instance_name": instance.name, "shim_health": health_status}, - ) - - if health_status.healthy: - instance.termination_deadline = None - instance.health_status = None - instance.unreachable = False - - if instance.status == InstanceStatus.PROVISIONING: - instance.status = ( - InstanceStatus.IDLE if instance.job_id is None else InstanceStatus.BUSY - ) - logger.info( - "Instance %s has switched to %s status", - instance.name, - instance.status.value, - extra={ - "instance_name": instance.name, - "instance_status": instance.status.value, - }, - ) - await session.commit() - return - - if instance.termination_deadline is None: - instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET - - instance.health_status = health_status.reason - instance.unreachable = True - - if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None: - provisioning_deadline = _get_provisioning_deadline(instance) - if get_current_datetime() > provisioning_deadline: - instance.status = InstanceStatus.TERMINATING - logger.warning( - "Instance %s has not started in time. Marked as TERMINATING", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATING.value, - }, - ) - elif instance.status in (InstanceStatus.IDLE, InstanceStatus.BUSY): - logger.warning( - "Instance %s shim is not available", - instance.name, - extra={"instance_name": instance.name}, - ) - deadline = instance.termination_deadline.replace(tzinfo=datetime.timezone.utc) - if get_current_datetime() > deadline: - instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Termination deadline" - logger.warning( - "Instance %s shim waiting timeout. 
Marked as TERMINATING", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATING.value, - }, - ) - - await session.commit() - - -async def wait_for_instance_provisioning_data( - project: ProjectModel, - instance: InstanceModel, - job_provisioning_data: JobProvisioningData, -): - logger.debug( - "Waiting for instance %s to become running", - instance.name, - ) - provisioning_deadline = _get_provisioning_deadline(instance) - if get_current_datetime() > provisioning_deadline: - logger.warning( - "Instance %s failed because instance has not become running in time", instance.name - ) - instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Instance has not become running in time" - return - - backend = await backends_services.get_project_backend_by_type( - project=project, - backend_type=job_provisioning_data.backend, - ) - if backend is None: - logger.warning( - "Instance %s failed because instance's backend is not available", - instance.name, - ) - instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Backend not available" - return - try: - await run_async( - backend.compute().update_provisioning_data, - job_provisioning_data, - project.ssh_public_key, - project.ssh_private_key, - ) - instance.job_provisioning_data = job_provisioning_data.json() - except ProvisioningError as e: - logger.warning( - "Error while waiting for instance %s to become running: %s", - instance.name, - repr(e), - ) - instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Error while waiting for instance to become running" - except Exception: - logger.exception( - "Got exception when updating instance %s provisioning data", instance.name - ) - - -@runner_ssh_tunnel(ports=[runner_client.REMOTE_SHIM_PORT], retries=1) -def instance_healthcheck(*, ports: Dict[int, int]) -> HealthStatus: - shim_client = runner_client.ShimClient(port=ports[runner_client.REMOTE_SHIM_PORT]) - try: - 
resp = shim_client.healthcheck(unmask_exeptions=True) - if resp is None: - return HealthStatus(healthy=False, reason="Unknown reason") - return runner_client.health_response_to_health_status(resp) - except requests.RequestException as e: - return HealthStatus(healthy=False, reason=f"Can't request shim: {e}") - except Exception as e: - logger.exception("Unknown exception from shim.healthcheck: %s", e) - return HealthStatus( - healthy=False, reason=f"Unknown exception ({e.__class__.__name__}): {e}" - ) - - -async def terminate(instance_id: UUID) -> None: - async with get_session_ctx() as session: - instance = ( - await session.scalars( - select(InstanceModel) - .where(InstanceModel.id == instance_id) - .options(joinedload(InstanceModel.project)) - ) - ).one() - - jpd = get_instance_provisioning_data(instance) - if jpd is not None: - if jpd.backend != BackendType.REMOTE: - backend = await backends_services.get_project_backend_by_type( - project=instance.project, backend_type=jpd.backend - ) - if backend is None: - logger.error( - "Failed to terminate instance %s. 
Backend not available.", instance.name - ) - else: - try: - await run_async( - backend.compute().terminate_instance, - jpd.instance_id, - jpd.region, - jpd.backend_data, - ) - except BackendError as e: - logger.error( - "Failed to terminate instance %s: %s", - instance.name, - repr(e), - ) - except Exception: - logger.exception( - "Got exception when terminating instance %s", - instance.name, - ) - - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.finished_at = get_current_datetime() - instance.status = InstanceStatus.TERMINATED - - logger.info( - "Instance %s terminated", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - - await session.commit() - - -async def terminate_idle_instance(instance_id: UUID): - async with get_session_ctx() as session: - instance = ( - await session.scalars( - select(InstanceModel) - .where(InstanceModel.id == instance_id) - .options(joinedload(InstanceModel.project)) - ) - ).one() - current_time = get_current_datetime() - idle_duration = _get_instance_idle_duration(instance) - idle_seconds = instance.termination_idle_time - delta = datetime.timedelta(seconds=idle_seconds) - if idle_duration > delta: - jpd = get_instance_provisioning_data(instance) - if jpd is None: - logger.error( - "Failed to terminate idle instance %s. 
provisioning_data is None.", - instance.name, - ) - else: - await terminate_job_provisioning_data_instance( - project=instance.project, job_provisioning_data=jpd - ) - instance.deleted = True - instance.deleted_at = current_time - instance.finished_at = current_time - instance.status = InstanceStatus.TERMINATED - instance.termination_reason = "Idle timeout" - logger.info( - "Instance %s terminated by termination policy: idle time %ss", - instance.name, - str(idle_duration.seconds), - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - await session.commit() - - -def _get_instance_idle_duration(instance: InstanceModel) -> datetime.timedelta: - last_time = instance.created_at.replace(tzinfo=datetime.timezone.utc) - if instance.last_job_processed_at is not None: - last_time = instance.last_job_processed_at.replace(tzinfo=datetime.timezone.utc) - return get_current_datetime() - last_time - - -def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime: - return instance.created_at.replace(tzinfo=datetime.timezone.utc) + timedelta( - seconds=retry.duration - ) - - -def _get_provisioning_deadline(instance: InstanceModel) -> datetime.datetime: - timeout_interval = _get_instance_timeout_interval(backend_type=instance.backend) - return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval - - -def _get_instance_timeout_interval(backend_type: BackendType) -> timedelta: - if backend_type == BackendType.RUNPOD: - return timedelta(seconds=1200) - return timedelta(seconds=600) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py deleted file mode 100644 index e0ecc6fb19..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ /dev/null @@ -1,576 +0,0 @@ -from datetime import timedelta -from typing import Dict, List, Optional -from uuid 
import UUID - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -import dstack._internal.server.services.gateways as gateways -from dstack._internal.core.errors import GatewayError -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.configurations import RegistryAuth -from dstack._internal.core.models.instances import RemoteConnectionInfo -from dstack._internal.core.models.repos import RemoteRepoCreds -from dstack._internal.core.models.runs import ( - ClusterInfo, - InstanceStatus, - Job, - JobSpec, - JobStatus, - JobTerminationReason, - Run, -) -from dstack._internal.core.models.volumes import Volume -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import ( - JobModel, - ProjectModel, - RepoModel, - RunModel, -) -from dstack._internal.server.services import logs as logs_services -from dstack._internal.server.services.jobs import ( - RUNNING_PROCESSING_JOBS_IDS, - RUNNING_PROCESSING_JOBS_LOCK, - find_job, - job_model_to_job_submission, -) -from dstack._internal.server.services.logging import fmt -from dstack._internal.server.services.repos import get_code_model, repo_model_to_repo_head -from dstack._internal.server.services.runner import client -from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel -from dstack._internal.server.services.runs import ( - PROCESSING_RUNS_IDS, - PROCESSING_RUNS_LOCK, - get_run_volumes, - run_model_to_run, -) -from dstack._internal.server.services.storage import get_default_storage -from dstack._internal.server.utils.common import run_async -from dstack._internal.utils import common as common_utils -from dstack._internal.utils.interpolator import VariablesInterpolator -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def process_running_jobs(): - async with get_session_ctx() as session: - async with 
PROCESSING_RUNS_LOCK, RUNNING_PROCESSING_JOBS_LOCK: - res = await session.execute( - select(JobModel) - .where( - JobModel.status.in_( - [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING] - ), - JobModel.id.not_in(RUNNING_PROCESSING_JOBS_IDS), - JobModel.run_id.not_in( - PROCESSING_RUNS_IDS - ), # runs processing has higher priority - ) - .order_by(JobModel.last_processed_at.asc()) - .limit(1) # TODO process multiple at once - ) - job_model = res.scalar() - if job_model is None: - return - - RUNNING_PROCESSING_JOBS_IDS.add(job_model.id) - - try: - await _process_job(job_id=job_model.id) - finally: - RUNNING_PROCESSING_JOBS_IDS.remove(job_model.id) - - -async def _process_job(job_id: UUID): - async with get_session_ctx() as session: - res = await session.execute( - select(JobModel).where(JobModel.id == job_id).options(joinedload(JobModel.instance)) - ) - job_model = res.scalar_one() - res = await session.execute( - select(RunModel) - .where(RunModel.id == job_model.run_id) - .options(joinedload(RunModel.project)) - .options(joinedload(RunModel.user)) - .options(joinedload(RunModel.repo)) - ) - run_model = res.scalar_one() - repo_model = run_model.repo - project = run_model.project - run = run_model_to_run(run_model) - job_submission = job_model_to_job_submission(job_model) - job_provisioning_data = job_submission.job_provisioning_data - if job_provisioning_data is None: - logger.error("%s: job_provisioning_data of an active job is None", fmt(job_model)) - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - job_model.last_processed_at = common_utils.get_current_datetime() - return - - job = find_job(run.jobs, job_model.replica_num, job_model.job_num) - - # Wait until all other jobs in the replica are provisioned - for other_job in run.jobs: - if ( - other_job.job_spec.replica_num == job.job_spec.replica_num - and other_job.job_submissions[-1].status == JobStatus.SUBMITTED - ): - 
job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return - - master_job = find_job(run.jobs, job_model.replica_num, 0) - cluster_info = ClusterInfo( - master_job_ip=master_job.job_submissions[-1].job_provisioning_data.internal_ip or "", - gpus_per_job=len(job_provisioning_data.instance_type.resources.gpus), - ) - - volumes = await get_run_volumes( - session=session, - project=project, - run_spec=run.run_spec, - ) - - server_ssh_private_key = project.ssh_private_key - if ( - job_model.instance is not None - and job_model.instance.remote_connection_info is not None - and job_provisioning_data.dockerized - ): - remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw( - job_model.instance.remote_connection_info - ) - server_ssh_private_key = remote_conn_info.ssh_keys[0].private - - secrets = {} # TODO secrets - repo_creds = repo_model_to_repo_head(repo_model, include_creds=True).repo_creds - - initial_status = job_model.status - if initial_status == JobStatus.PROVISIONING: - if job_provisioning_data.hostname is None: - await _wait_for_instance_provisioning_data(job_model=job_model) - else: - # fails are acceptable until timeout is exceeded - if job_provisioning_data.dockerized: - logger.debug( - "%s: process provisioning job with shim, age=%s", - fmt(job_model), - job_submission.age, - ) - ssh_user = job_provisioning_data.username - user_ssh_key = run.run_spec.ssh_key_pub.strip() - public_keys = [project.ssh_public_key.strip(), user_ssh_key] - if job_provisioning_data.backend == BackendType.LOCAL: - # No need to update ~/.ssh/authorized_keys when running shim localy - user_ssh_key = "" - success = await run_async( - _process_provisioning_with_shim, - server_ssh_private_key, - job_provisioning_data, - run, - job_model, - volumes, - secrets, - job.job_spec.registry_auth, - public_keys, - ssh_user, - user_ssh_key, - ) - else: - logger.debug( - "%s: process provisioning job without shim, age=%s", - 
fmt(job_model), - job_submission.age, - ) - code = await _get_job_code( - session=session, - project=project, - repo=repo_model, - code_hash=run.run_spec.repo_code_hash, - ) - success = await run_async( - _process_provisioning_no_shim, - server_ssh_private_key, - job_provisioning_data, - run, - job_model, - job, - cluster_info, - code, - secrets, - repo_creds, - ) - - if not success: - # check timeout - if job_submission.age > _get_runner_timeout_interval( - job_provisioning_data.backend, job_provisioning_data.instance_type.name - ): - logger.warning( - "%s: failed because runner has not become available in time, age=%s", - fmt(job_model), - job_submission.age, - ) - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = ( - JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED - ) - # instance will be emptied by process_terminating_jobs - - else: # fails are not acceptable - if initial_status == JobStatus.PULLING: - logger.debug( - "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age - ) - code = await _get_job_code( - session=session, - project=project, - repo=repo_model, - code_hash=run.run_spec.repo_code_hash, - ) - success = await run_async( - _process_pulling_with_shim, - server_ssh_private_key, - job_provisioning_data, - run, - job_model, - job, - cluster_info, - code, - secrets, - repo_creds, - ) - elif initial_status == JobStatus.RUNNING: - logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age) - success = await run_async( - _process_running, - server_ssh_private_key, - job_provisioning_data, - run_model, - job_model, - ) - if not success: - job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY - - if not success: # kill the job - logger.warning( - "%s: failed because runner is not available or return an error, age=%s", - fmt(job_model), - job_submission.age, - ) - job_model.status = JobStatus.TERMINATING - if not job_model.termination_reason: - 
job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY - # job will be terminated and instance will be emptied by process_terminating_jobs - - if ( - initial_status != job_model.status - and job_model.status == JobStatus.RUNNING - and job_model.job_num == 0 # gateway connects only to the first node - and run.run_spec.configuration.type == "service" - ): - try: - await gateways.register_replica(session, run_model.gateway_id, run, job_model) - except GatewayError as e: - logger.warning( - "%s: failed to register service replica: %s, age=%s", - fmt(job_model), - e, - job_submission.age, - ) - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR - - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - - -async def _wait_for_instance_provisioning_data(job_model: JobModel): - """ - This function will be called until instance IP address appears - in `job_model.instance.job_provisioning_data` or instance is terminated on timeout. - """ - if job_model.instance is None: - logger.error( - "%s: cannot update job_provisioning_data. job_model.instance is None.", - fmt(job_model), - ) - return - if job_model.instance.job_provisioning_data is None: - logger.error( - "%s: cannot update job_provisioning_data. 
job_model.job_provisioning_data is None.", - fmt(job_model), - ) - return - - if job_model.instance.status == InstanceStatus.TERMINATED: - job_model.status = JobStatus.TERMINATING - # TODO use WAITING_INSTANCE_LIMIT_EXCEEDED after 0.19.x - job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - return - - job_model.job_provisioning_data = job_model.instance.job_provisioning_data - - -@runner_ssh_tunnel(ports=[client.REMOTE_RUNNER_PORT], retries=1) -def _process_provisioning_no_shim( - run: Run, - job_model: JobModel, - job: Job, - cluster_info: ClusterInfo, - code: bytes, - secrets: Dict[str, str], - repo_credentials: Optional[RemoteRepoCreds], - *, - ports: Dict[int, int], -) -> bool: - """ - Possible next states: - - JobStatus.RUNNING if runner is available - - JobStatus.TERMINATING if timeout is exceeded - - Returns: - is successful - """ - - runner_client = client.RunnerClient(port=ports[client.REMOTE_RUNNER_PORT]) - resp = runner_client.healthcheck() - if resp is None: - return False # runner is not available yet - _submit_job_to_runner( - runner_client=runner_client, - run=run, - job_model=job_model, - job=job, - cluster_info=cluster_info, - code=code, - secrets=secrets, - repo_credentials=repo_credentials, - ) - return True - - -@runner_ssh_tunnel(ports=[client.REMOTE_SHIM_PORT], retries=1) -def _process_provisioning_with_shim( - run: Run, - job_model: JobModel, - volumes: List[Volume], - secrets: Dict[str, str], - registry_auth: Optional[RegistryAuth], - public_keys: List[str], - ssh_user: str, - ssh_key: str, - *, - ports: Dict[int, int], -) -> bool: - """ - Possible next states: - - JobStatus.PULLING if shim is available - - JobStatus.TERMINATING if timeout is exceeded - - Returns: - is successful - """ - job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data) - - shim_client = client.ShimClient(port=ports[client.REMOTE_SHIM_PORT]) - - resp = shim_client.healthcheck() - if resp is None: - logger.debug("%s: shim 
is not available yet", fmt(job_model)) - return False # shim is not available yet - - username = "" - password = "" - if registry_auth is not None: - logger.debug("%s: authenticating to the registry...", fmt(job_model)) - interpolate = VariablesInterpolator({"secrets": secrets}).interpolate - username = interpolate(registry_auth.username) - password = interpolate(registry_auth.password) - - shim_client.submit( - username=username, - password=password, - image_name=job_spec.image_name, - container_name=job_model.job_name, - shm_size=job_spec.requirements.resources.shm_size, - public_keys=public_keys, - ssh_user=ssh_user, - ssh_key=ssh_key, - mounts=run.run_spec.configuration.volumes, - volumes=volumes, - ) - - job_model.status = JobStatus.PULLING - logger.info("%s: now is %s", fmt(job_model), job_model.status.name) - return True - - -@runner_ssh_tunnel(ports=[client.REMOTE_SHIM_PORT, client.REMOTE_RUNNER_PORT]) -def _process_pulling_with_shim( - run: Run, - job_model: JobModel, - job: Job, - cluster_info: ClusterInfo, - code: bytes, - secrets: Dict[str, str], - repo_credentials: Optional[RemoteRepoCreds], - *, - ports: Dict[int, int], -) -> bool: - """ - Possible next states: - - JobStatus.RUNNING if runner is available - - JobStatus.TERMINATING if shim is not available - - Returns: - is successful - """ - shim_client = client.ShimClient(port=ports[client.REMOTE_SHIM_PORT]) - shim_status = shim_client.pull() # raises error if shim is down, causes retry - - # If shim goes to pending before the job is submitted to runner, then an error occured - if ( - shim_status.state == "pending" - and shim_status.result is not None - and shim_status.result.reason != "" - ): - logger.warning( - "shim failed to execute job %s: %s (%s)", - job_model.job_name, - shim_status.result.reason, - shim_status.result.reason_message, - ) - logger.debug("shim status: %s", shim_status.dict()) - job_model.termination_reason = JobTerminationReason[shim_status.result.reason.upper()] - 
job_model.termination_reason_message = shim_status.result.reason_message - return False - - if shim_status.state in ("pulling", "creating"): - return True - - runner_client = client.RunnerClient(port=ports[client.REMOTE_RUNNER_PORT]) - resp = runner_client.healthcheck() - if resp is None: - return True # runner is not available yet - - # Expect shim_status.state == "running" - _submit_job_to_runner( - runner_client=runner_client, - run=run, - job_model=job_model, - job=job, - cluster_info=cluster_info, - code=code, - secrets=secrets, - repo_credentials=repo_credentials, - ) - return True - - -@runner_ssh_tunnel(ports=[client.REMOTE_RUNNER_PORT]) -def _process_running( - run_model: RunModel, - job_model: JobModel, - *, - ports: Dict[int, int], -) -> bool: - """ - Possible next states: - - JobStatus.TERMINATING if runner is not available - - Any status received from runner - - Returns: - is successful - """ - runner_client = client.RunnerClient(port=ports[client.REMOTE_RUNNER_PORT]) - timestamp = 0 - if job_model.runner_timestamp is not None: - timestamp = job_model.runner_timestamp - resp = runner_client.pull(timestamp) # raises error if runner is down, causes retry - job_model.runner_timestamp = resp.last_updated - logs_services.write_logs( - project=run_model.project, - run_name=run_model.run_name, - job_submission_id=job_model.id, - runner_logs=resp.runner_logs, - job_logs=resp.job_logs, - ) - if len(resp.job_states) > 0: - latest_status = resp.job_states[-1].state - # TODO(egor-s): refactor dstack-runner to return compatible statuses and reasons - if latest_status == JobStatus.DONE: - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.DONE_BY_RUNNER - # let the CLI pull logs? 
- # delay_job_instance_termination(job_model) - elif latest_status in {JobStatus.FAILED, JobStatus.ABORTED, JobStatus.TERMINATED}: - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.CONTAINER_EXITED_WITH_ERROR - # let the CLI pull logs? - # delay_job_instance_termination(job_model) - logger.info("%s: now is %s", fmt(job_model), job_model.status.name) - return True - - -async def _get_job_code( - session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: str -) -> bytes: - code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash) - if code_model is None: - return b"" - storage = get_default_storage() - if storage is None or code_model.blob is not None: - return code_model.blob - blob = await run_async( - storage.get_code, - project.name, - repo.name, - code_hash, - ) - return blob - - -def _submit_job_to_runner( - runner_client: client.RunnerClient, - run: Run, - job_model: JobModel, - job: Job, - cluster_info: ClusterInfo, - code: bytes, - secrets: Dict[str, str], - repo_credentials: Optional[RemoteRepoCreds], -): - logger.debug("%s: submitting job spec", fmt(job_model)) - logger.debug( - "%s: repo credentials are %s", - fmt(job_model), - None if repo_credentials is None else repo_credentials.protocol.value, - ) - runner_client.submit_job( - run_spec=run.run_spec, - job_spec=job.job_spec, - cluster_info=cluster_info, - secrets=secrets, - repo_credentials=repo_credentials, - ) - logger.debug("%s: uploading code", fmt(job_model)) - runner_client.upload_code(code) - logger.debug("%s: starting job", fmt(job_model)) - runner_client.run_job() - - job_model.status = JobStatus.RUNNING - # do not log here, because the runner will send a new status - - -def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta: - if backend_type == BackendType.LAMBDA: - return timedelta(seconds=1200) - if backend_type == BackendType.KUBERNETES: - return 
timedelta(seconds=1200) - if backend_type == BackendType.OCI and instance_type_name.startswith("BM."): - return timedelta(seconds=1200) - return timedelta(seconds=600) diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py deleted file mode 100644 index 027f9427e0..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ /dev/null @@ -1,402 +0,0 @@ -import asyncio -import datetime -import itertools -import uuid -from typing import List, Optional, Set, Tuple - -import sqlalchemy as sa -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -import dstack._internal.server.services.gateways as gateways -import dstack._internal.server.services.gateways.autoscalers as autoscalers -from dstack._internal.core.errors import ServerError -from dstack._internal.core.models.profiles import RetryEvent -from dstack._internal.core.models.runs import ( - Job, - JobStatus, - JobTerminationReason, - Run, - RunSpec, - RunStatus, - RunTerminationReason, -) -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import JobModel, RunModel -from dstack._internal.server.services.jobs import ( - RUNNING_PROCESSING_JOBS_IDS, - RUNNING_PROCESSING_JOBS_LOCK, - SUBMITTED_PROCESSING_JOBS_IDS, - SUBMITTED_PROCESSING_JOBS_LOCK, - TERMINATING_PROCESSING_JOBS_IDS, - TERMINATING_PROCESSING_JOBS_LOCK, - find_job, - get_jobs_from_run_spec, - group_jobs_by_replica_latest, -) -from dstack._internal.server.services.runs import ( - PROCESSING_RUNS_IDS, - PROCESSING_RUNS_LOCK, - create_job_model_for_new_submission, - fmt, - process_terminating_run, - retry_run_replica_jobs, - run_model_to_run, - scale_run_replicas, -) -from dstack._internal.server.utils.common import wait_unlock -from dstack._internal.utils import common -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) -PROCESSING_INTERVAL = 
datetime.timedelta(seconds=2) -RETRY_DELAY = datetime.timedelta(seconds=15) - - -async def process_runs(): - async with get_session_ctx() as session: - async with PROCESSING_RUNS_LOCK: - res = await session.execute( - sa.select(RunModel).where( - RunModel.status.not_in(RunStatus.finished_statuses()), - RunModel.last_processed_at - < common.get_current_datetime() - PROCESSING_INTERVAL, - RunModel.id.not_in(PROCESSING_RUNS_IDS), - ) - ) - runs = res.scalars().all() - unprocessed_runs_ids = set(run.id for run in runs) - PROCESSING_RUNS_IDS.update(unprocessed_runs_ids) - - futures = [process_single_run(run.id, [job.id for job in run.jobs]) for run in runs] - try: - for future in asyncio.as_completed(futures): - run_id = await future - # Unlock job processing as soon as possible. - PROCESSING_RUNS_IDS.remove(run_id) - unprocessed_runs_ids.remove(run_id) - finally: - # Ensure that all runs are unlocked. - # Note that runs should not be unlocked twice! - PROCESSING_RUNS_IDS.difference_update(unprocessed_runs_ids) - - -async def process_single_run(run_id: uuid.UUID, job_ids: List[uuid.UUID]) -> uuid.UUID: - jobs_ids_set = set(job_ids) - await wait_unlock(SUBMITTED_PROCESSING_JOBS_LOCK, SUBMITTED_PROCESSING_JOBS_IDS, jobs_ids_set) - await wait_unlock(RUNNING_PROCESSING_JOBS_LOCK, RUNNING_PROCESSING_JOBS_IDS, jobs_ids_set) - await wait_unlock( - TERMINATING_PROCESSING_JOBS_LOCK, TERMINATING_PROCESSING_JOBS_IDS, jobs_ids_set - ) - - async with get_session_ctx() as session: - res = await session.execute( - sa.select(RunModel) - .where(RunModel.id == run_id) - .execution_options(populate_existing=True) - .options(joinedload(RunModel.project)) - .options(joinedload(RunModel.user)) - .options(joinedload(RunModel.repo)) - ) - run = res.scalar() - if run is None: - logger.error(f"Run {run_id} not found") - return run_id - - try: - if run.status == RunStatus.PENDING: - await process_pending_run(session, run) - elif run.status in {RunStatus.SUBMITTED, RunStatus.PROVISIONING, 
RunStatus.RUNNING}: - await process_active_run(session, run) - elif run.status == RunStatus.TERMINATING: - await process_terminating_run(session, run) - else: - logger.error("%s: unexpected status %s", fmt(run), run.status.name) - run.status = RunStatus.TERMINATING - run.termination_reason = RunTerminationReason.SERVER_ERROR - except ServerError as e: - logger.error("%s: run processing error: %s", fmt(run), e) - run.status = RunStatus.TERMINATING - run.termination_reason = RunTerminationReason.SERVER_ERROR - - run.last_processed_at = common.get_current_datetime() - await session.commit() - - return run_id - - -async def process_pending_run(session: AsyncSession, run_model: RunModel): - """Jobs are not created yet""" - run = run_model_to_run(run_model) - if run.latest_job_submission is None: - logger.error("%s: failed to retry: pending run has no job submissions.") - run_model.status = RunStatus.FAILED - run_model.termination_reason = RunTerminationReason.SERVER_ERROR - return - - if common.get_current_datetime() - run.latest_job_submission.last_processed_at < RETRY_DELAY: - logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model)) - return - - # TODO(egor-s) consolidate with `scale_run_replicas` if possible - replicas = 1 - if run.run_spec.configuration.type == "service": - replicas = run.run_spec.configuration.replicas.min or 0 # new default - scaler = autoscalers.get_service_autoscaler(run.run_spec.configuration) - if scaler is not None: - conn = await gateways.get_gateway_connection(session, run_model.gateway_id) - stats = await conn.get_stats(run_model.id) - if stats: - # replicas info doesn't matter for now - replicas = scaler.scale([], stats) - if replicas == 0: - # stay zero scaled - return - - scheduled_replicas = 0 - # Resubmit existing replicas - for replica_num, replica_jobs in itertools.groupby( - run.jobs, key=lambda j: j.job_spec.replica_num - ): - if scheduled_replicas >= replicas: - break - scheduled_replicas += 1 - for job 
in replica_jobs: - new_job_model = create_job_model_for_new_submission( - run_model=run_model, - job=job, - status=JobStatus.SUBMITTED, - ) - session.add(new_job_model) - # Create missing replicas - for replica_num in range(scheduled_replicas, replicas): - jobs = await get_jobs_from_run_spec(run.run_spec, replica_num=replica_num) - for job in jobs: - job_model = create_job_model_for_new_submission( - run_model=run_model, - job=job, - status=JobStatus.SUBMITTED, - ) - session.add(job_model) - - run_model.status = RunStatus.SUBMITTED - logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model)) - - -async def process_active_run(session: AsyncSession, run_model: RunModel): - """ - Run is submitted, provisioning, or running. - We handle fails, scaling, and status changes. - """ - run = run_model_to_run(run_model) - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - retry_single_job = can_retry_single_job(run_spec) - - run_statuses: Set[RunStatus] = set() - run_termination_reasons: Set[RunTerminationReason] = set() - replicas_to_retry: List[Tuple[int, List[JobModel]]] = [] - - replicas_info: List[autoscalers.ReplicaInfo] = [] - for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs): - replica_statuses: Set[RunStatus] = set() - replica_needs_retry = False - - replica_active = True - for job_model in job_models: - job = find_job(run.jobs, job_model.replica_num, job_model.job_num) - if job_model.status == JobStatus.DONE or ( - job_model.status == JobStatus.TERMINATING - and job_model.termination_reason == JobTerminationReason.DONE_BY_RUNNER - ): - # the job is done or going to be done - replica_statuses.add(RunStatus.DONE) - # for some reason the replica is done, it's not active - replica_active = False - elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN: - # the job was scaled down - replica_active = False - elif job_model.status == JobStatus.RUNNING: - # the job is running - 
replica_statuses.add(RunStatus.RUNNING) - elif job_model.status in {JobStatus.PROVISIONING, JobStatus.PULLING}: - # the job is provisioning - replica_statuses.add(RunStatus.PROVISIONING) - elif job_model.status == JobStatus.SUBMITTED: - # the job is submitted - replica_statuses.add(RunStatus.SUBMITTED) - elif job_model.status == JobStatus.FAILED or ( - job_model.status == JobStatus.TERMINATING - and job_model.termination_reason - not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN} - ): - current_duration = should_retry_job(run, job, job_model) - if current_duration is None: - replica_statuses.add(RunStatus.FAILED) - run_termination_reasons.add(RunTerminationReason.JOB_FAILED) - else: - if is_retry_duration_exceeded(job, current_duration): - replica_statuses.add(RunStatus.FAILED) - run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED) - else: - replica_needs_retry = True - elif job_model.status in { - JobStatus.TERMINATING, - JobStatus.TERMINATED, - JobStatus.ABORTED, - }: - pass # unexpected, but let's ignore it - else: - raise ValueError(f"Unexpected job status {job_model.status}") - - if RunStatus.FAILED in replica_statuses: - run_statuses.add(RunStatus.FAILED) - else: - if replica_needs_retry: - replicas_to_retry.append((replica_num, job_models)) - if not replica_needs_retry or retry_single_job: - run_statuses.update(replica_statuses) - - if replica_active: - # submitted_at = replica created - replicas_info.append( - autoscalers.ReplicaInfo( - active=True, - timestamp=min(job.submitted_at for job in job_models).replace( - tzinfo=datetime.timezone.utc - ), - ) - ) - else: - # last_processed_at = replica scaled down - replicas_info.append( - autoscalers.ReplicaInfo( - active=False, - timestamp=max(job.last_processed_at for job in job_models).replace( - tzinfo=datetime.timezone.utc - ), - ) - ) - - termination_reason: Optional[RunTerminationReason] = None - if RunStatus.FAILED in run_statuses: - new_status = 
RunStatus.TERMINATING - if RunTerminationReason.JOB_FAILED in run_termination_reasons: - termination_reason = RunTerminationReason.JOB_FAILED - elif RunTerminationReason.RETRY_LIMIT_EXCEEDED in run_termination_reasons: - termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED - else: - raise ValueError(f"Unexpected termination reason {run_termination_reasons}") - elif RunStatus.RUNNING in run_statuses: - new_status = RunStatus.RUNNING - elif RunStatus.PROVISIONING in run_statuses: - new_status = RunStatus.PROVISIONING - elif RunStatus.SUBMITTED in run_statuses: - new_status = RunStatus.SUBMITTED - elif RunStatus.DONE in run_statuses and not replicas_to_retry: - new_status = RunStatus.TERMINATING - termination_reason = RunTerminationReason.ALL_JOBS_DONE - else: - new_status = RunStatus.PENDING - - # Terminate active jobs if the run is to be resubmitted - if new_status == RunStatus.PENDING and not retry_single_job: - for _, replica_jobs in replicas_to_retry: - for job_model in replica_jobs: - if not ( - job_model.status.is_finished() or job_model.status == JobStatus.TERMINATING - ): - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - - if new_status not in {RunStatus.TERMINATING, RunStatus.PENDING}: - # No need to retry if the run is terminating, - # pending run will retry replicas in `process_pending_run` - for _, replica_jobs in replicas_to_retry: - await retry_run_replica_jobs( - session, run_model, replica_jobs, only_failed=retry_single_job - ) - - if run_spec.configuration.type == "service": - scaler = autoscalers.get_service_autoscaler(run_spec.configuration) - if scaler is not None: - conn = await gateways.get_gateway_connection(session, run_model.gateway_id) - stats = await conn.get_stats(run_model.id) - if stats: - # use replicas_info from before retrying - replicas_diff = scaler.scale(replicas_info, stats) - if replicas_diff != 0: - await session.flush() - await 
session.refresh(run_model) - await scale_run_replicas(session, run_model, replicas_diff) - - if run_model.status != new_status: - logger.info( - "%s: run status has changed %s -> %s", - fmt(run_model), - run_model.status.name, - new_status.name, - ) - run_model.status = new_status - run_model.termination_reason = termination_reason - - -def should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]: - """ - Checks if the job should be retried. - Returns the current duration of retrying if retry is enabled. - """ - if job.job_spec.retry is None: - return None - - last_provisioned_submission = None - for job_submission in reversed(job.job_submissions): - if job_submission.job_provisioning_data is not None: - last_provisioned_submission = job_submission - break - - if ( - job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - and last_provisioned_submission is None - and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events - ): - return common.get_current_datetime() - run.submitted_at - - if last_provisioned_submission is None: - return None - - if ( - last_provisioned_submission.termination_reason - == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY - and RetryEvent.INTERRUPTION in job.job_spec.retry.on_events - ): - return common.get_current_datetime() - last_provisioned_submission.last_processed_at - - if ( - last_provisioned_submission.termination_reason - in [ - JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, - JobTerminationReason.CREATING_CONTAINER_ERROR, - JobTerminationReason.EXECUTOR_ERROR, - JobTerminationReason.GATEWAY_ERROR, - JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED, - JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED, - JobTerminationReason.PORTS_BINDING_FAILED, - ] - and RetryEvent.ERROR in job.job_spec.retry.on_events - ): - return common.get_current_datetime() - last_provisioned_submission.last_processed_at - - return None - - -def is_retry_duration_exceeded(job: Job, 
current_duration: datetime.timedelta) -> bool: - if job.job_spec.retry is None: - return True - return current_duration > datetime.timedelta(seconds=job.job_spec.retry.duration) - - -def can_retry_single_job(run_spec: RunSpec) -> bool: - # TODO: Currently, we terminate and retry the entire replica if one of the job fails. - # We could make partial retry in some multi-node cases. - # E.g. restarting a worker node, independent jobs. - return False diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py deleted file mode 100644 index 2f57f3579c..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ /dev/null @@ -1,459 +0,0 @@ -import uuid -from typing import List, Optional, Tuple -from uuid import UUID - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.errors import BackendError, ServerClientError -from dstack._internal.core.models.instances import ( - InstanceOfferWithAvailability, -) -from dstack._internal.core.models.profiles import ( - CreationPolicy, - TerminationPolicy, -) -from dstack._internal.core.models.runs import ( - InstanceStatus, - Job, - JobProvisioningData, - JobStatus, - JobTerminationReason, - Requirements, - Run, - RunSpec, -) -from dstack._internal.core.models.volumes import Volume -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import ( - InstanceModel, - JobModel, - PoolModel, - ProjectModel, - RunModel, - VolumeModel, -) -from dstack._internal.server.services.backends import get_project_backend_by_type_or_error -from dstack._internal.server.services.jobs import ( - PROCESSING_POOL_LOCK, - SUBMITTED_PROCESSING_JOBS_IDS, - SUBMITTED_PROCESSING_JOBS_LOCK, - find_job, -) -from 
dstack._internal.server.services.logging import fmt -from dstack._internal.server.services.pools import ( - filter_pool_instances, - get_or_create_pool_by_name, - get_pool_instances, -) -from dstack._internal.server.services.runs import ( - PROCESSING_RUNS_IDS, - PROCESSING_RUNS_LOCK, - check_can_attach_run_volumes, - get_offers_by_requirements, - get_run_volume_models, - run_model_to_run, -) -from dstack._internal.server.services.volumes import ( - PROCESSING_VOLUMES_IDS, - PROCESSING_VOLUMES_LOCK, - volume_model_to_volume, -) -from dstack._internal.server.utils.common import run_async, wait_to_lock_many -from dstack._internal.utils import common as common_utils -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def process_submitted_jobs(): - async with get_session_ctx() as session: - async with PROCESSING_RUNS_LOCK, SUBMITTED_PROCESSING_JOBS_LOCK: - res = await session.execute( - select(JobModel) - .where( - JobModel.status == JobStatus.SUBMITTED, - JobModel.id.not_in(SUBMITTED_PROCESSING_JOBS_IDS), - JobModel.run_id.not_in(PROCESSING_RUNS_IDS), - ) - .order_by(JobModel.last_processed_at.asc()) - .limit(1) # TODO process multiple at once - ) - job_model = res.scalar() - if job_model is None: - return - - SUBMITTED_PROCESSING_JOBS_IDS.add(job_model.id) - - try: - await _process_job(job_id=job_model.id) - finally: - SUBMITTED_PROCESSING_JOBS_IDS.remove(job_model.id) - - -async def _process_job(job_id: UUID): - async with get_session_ctx() as session: - res = await session.execute(select(JobModel).where(JobModel.id == job_id)) - job_model = res.scalar_one() - await _process_submitted_job( - session=session, - job_model=job_model, - ) - - -async def _process_submitted_job(session: AsyncSession, job_model: JobModel): - logger.debug("%s: provisioning has started", fmt(job_model)) - res = await session.execute( - select(RunModel) - .where(RunModel.id == job_model.run_id) - .options(joinedload(RunModel.project)) - 
.options(joinedload(RunModel.user)) - ) - run_model = res.scalar_one() - project_model = run_model.project - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - profile = run_spec.merged_profile - - run = run_model_to_run(run_model) - job = find_job(run.jobs, job_model.replica_num, job_model.job_num) - - master_job = find_job(run.jobs, job_model.replica_num, 0) - master_job_provisioning_data = None - # Wait until the master job is provisioned to provision in the same cluster - if job.job_spec.job_num != 0: - if master_job.job_submissions[-1].job_provisioning_data is None: - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return - master_job_provisioning_data = JobProvisioningData.__response__.parse_obj( - master_job.job_submissions[-1].job_provisioning_data - ) - - try: - volume_models = await get_run_volume_models( - session=session, - project=project_model, - run_spec=run_spec, - ) - volumes = [volume_model_to_volume(v) for v in volume_models] - check_can_attach_run_volumes(run_spec=run_spec, volumes=volumes) - except ServerClientError as e: - logger.error("%s: ", fmt(job_model)) - job_model.status = JobStatus.TERMINATING - # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19 - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - job_model.termination_reason_message = e.msg - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return - - # Try to provision on an instance from the pool - pool = await get_or_create_pool_by_name( - session=session, - project=project_model, - pool_name=profile.pool_name, - ) - instance = await _run_job_on_pool_instance( - session=session, - pool=pool, - run_spec=run_spec, - job_model=job_model, - job=job, - master_job_provisioning_data=master_job_provisioning_data, - volumes=volumes, - ) - if instance is None: - if profile.creation_policy == CreationPolicy.REUSE: - logger.debug("%s: reuse instance failed", 
fmt(job_model)) - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return - - # Create a new cloud instance - run_job_result = await _run_job_on_new_instance( - project_model=project_model, - job_model=job_model, - run=run, - job=job, - project_ssh_public_key=project_model.ssh_public_key, - project_ssh_private_key=project_model.ssh_private_key, - master_job_provisioning_data=master_job_provisioning_data, - volumes=volumes, - ) - if run_job_result is None: - logger.debug("%s: provisioning failed", fmt(job_model)) - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return - - logger.info("%s: now is provisioning a new instance", fmt(job_model)) - job_provisioning_data, offer = run_job_result - job_model.job_provisioning_data = job_provisioning_data.json() - job_model.status = JobStatus.PROVISIONING - instance = _create_instance_model_for_job( - project_model=project_model, - pool=pool, - run_spec=run_spec, - job_model=job_model, - job=job, - job_provisioning_data=job_provisioning_data, - offer=offer, - ) - logger.info( - "The job %s created the new instance %s", - job_model.job_name, - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.PROVISIONING.value, - }, - ) - session.add(instance) - await session.flush() # to get im.id - job_model.used_instance_id = instance.id - - if len(volume_models) > 0: - await _attach_volumes( - session=session, - project=project_model, - job_model=job_model, - instance=instance, - volume_models=volume_models, - ) - - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - - -async def _run_job_on_pool_instance( - 
session: AsyncSession, - pool: PoolModel, - run_spec: RunSpec, - job_model: JobModel, - job: Job, - master_job_provisioning_data: Optional[JobProvisioningData] = None, - volumes: Optional[List[Volume]] = None, -) -> Optional[InstanceModel]: - profile = run_spec.merged_profile - async with PROCESSING_POOL_LOCK: - pool_instances = get_pool_instances(pool) - requirements = Requirements( - resources=run_spec.configuration.resources, - max_price=profile.max_price, - spot=job.job_spec.requirements.spot, - ) - relevant_instances = filter_pool_instances( - pool_instances=pool_instances, - profile=profile, - requirements=requirements, - status=InstanceStatus.IDLE, - multinode=job.job_spec.jobs_per_replica > 1, - master_job_provisioning_data=master_job_provisioning_data, - volumes=volumes, - ) - if len(relevant_instances) == 0: - return None - sorted_instances = sorted(relevant_instances, key=lambda instance: instance.name) - instance = sorted_instances[0] - # Reload InstanceModel with volumes - res = await session.execute( - select(InstanceModel) - .where(InstanceModel.id == instance.id) - .options(joinedload(InstanceModel.volumes)) - ) - instance = res.unique().scalar_one() - instance.status = InstanceStatus.BUSY - instance.job = job_model - logger.info( - "The job %s switched instance %s status to BUSY", - job_model.job_name, - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.BUSY.value, - }, - ) - logger.info("%s: now is provisioning on '%s'", fmt(job_model), instance.name) - job_model.job_provisioning_data = instance.job_provisioning_data - job_model.used_instance_id = instance.id - job_model.status = JobStatus.PROVISIONING - job_model.last_processed_at = common_utils.get_current_datetime() - await session.commit() - return instance - - -async def _run_job_on_new_instance( - project_model: ProjectModel, - job_model: JobModel, - run: Run, - job: Job, - project_ssh_public_key: str, - project_ssh_private_key: str, - 
master_job_provisioning_data: Optional[JobProvisioningData] = None, - volumes: Optional[List[Volume]] = None, -) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]: - if volumes is None: - volumes = [] - offers = await get_offers_by_requirements( - project=project_model, - profile=run.run_spec.merged_profile, - requirements=job.job_spec.requirements, - exclude_not_available=True, - multinode=job.job_spec.jobs_per_replica > 1, - master_job_provisioning_data=master_job_provisioning_data, - volumes=volumes, - ) - # Limit number of offers tried to prevent long-running processing - # in case all offers fail. - for backend, offer in offers[:15]: - logger.debug( - "%s: trying %s in %s/%s for $%0.4f per hour", - fmt(job_model), - offer.instance.name, - offer.backend.value, - offer.region, - offer.price, - ) - try: - job_provisioning_data = await run_async( - backend.compute().run_job, - run, - job, - offer, - project_ssh_public_key, - project_ssh_private_key, - volumes, - ) - return job_provisioning_data, offer - except BackendError as e: - logger.warning( - "%s: %s launch in %s/%s failed: %s", - fmt(job_model), - offer.instance.name, - offer.backend.value, - offer.region, - repr(e), - ) - continue - except Exception: - logger.exception( - "%s: got exception when launching %s in %s/%s", - fmt(job_model), - offer.instance.name, - offer.backend.value, - offer.region, - ) - continue - return None - - -def _create_instance_model_for_job( - project_model: ProjectModel, - pool: PoolModel, - run_spec: RunSpec, - job_model: JobModel, - job: Job, - job_provisioning_data: JobProvisioningData, - offer: InstanceOfferWithAvailability, -) -> InstanceModel: - profile = run_spec.merged_profile - termination_policy = profile.termination_policy - termination_idle_time = profile.termination_idle_time - if not job_provisioning_data.dockerized: - # terminate vastai/k8s instances immediately - termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE - termination_idle_time 
= 0 - instance = InstanceModel( - id=uuid.uuid4(), - name=job.job_spec.job_name, # TODO: make new name - project=project_model, - pool=pool, - created_at=common_utils.get_current_datetime(), - started_at=common_utils.get_current_datetime(), - status=InstanceStatus.PROVISIONING, - unreachable=False, - job_provisioning_data=job_provisioning_data.json(), - offer=offer.json(), - termination_policy=termination_policy, - termination_idle_time=termination_idle_time, - job=job_model, - backend=offer.backend, - price=offer.price, - region=offer.region, - volumes=[], - ) - return instance - - -async def _attach_volumes( - session: AsyncSession, - project: ProjectModel, - job_model: JobModel, - instance: InstanceModel, - volume_models: List[VolumeModel], -): - job_provisioning_data = JobProvisioningData.__response__.parse_raw( - instance.job_provisioning_data - ) - backend = await get_project_backend_by_type_or_error( - project=project, - backend_type=job_provisioning_data.backend, - ) - volumes_ids = sorted([v.id for v in volume_models]) - logger.info("Attaching volumes: %s", [v.name for v in volume_models]) - # Take lock to prevent attaching deleted volumes. - # If the volume was deleted before the lock, fail the job. 
- await wait_to_lock_many(PROCESSING_VOLUMES_LOCK, PROCESSING_VOLUMES_IDS, volumes_ids) - try: - for volume_model in volume_models: - try: - await _attach_volume( - session=session, - backend=backend, - volume_model=volume_model, - instance=instance, - instance_id=job_provisioning_data.instance_id, - ) - except (ServerClientError, BackendError) as e: - logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e)) - job_model.status = JobStatus.TERMINATING - # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19 - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - job_model.termination_reason_message = "Failed to attach volume" - except Exception: - logger.exception( - "%s: got exception when attaching volume", - fmt(job_model), - ) - job_model.status = JobStatus.TERMINATING - # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19 - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - job_model.termination_reason_message = "Failed to attach volume" - finally: - PROCESSING_VOLUMES_IDS.difference_update(volumes_ids) - - -async def _attach_volume( - session: AsyncSession, - backend: Backend, - volume_model: VolumeModel, - instance: InstanceModel, - instance_id: str, -): - await session.refresh(volume_model) - if volume_model.deleted: - raise ServerClientError("Cannot attach a deleted volume") - volume = volume_model_to_volume(volume_model) - attachment_data = await run_async( - backend.compute().attach_volume, - volume=volume, - instance_id=instance_id, - ) - volume_model.volume_attachment_data = attachment_data.json() - instance.volumes.append(volume_model) diff --git a/src/dstack/_internal/server/background/tasks/process_terminating_jobs.py b/src/dstack/_internal/server/background/tasks/process_terminating_jobs.py deleted file mode 100644 index fee6733f20..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_terminating_jobs.py +++ /dev/null @@ -1,50 +0,0 @@ -import uuid - 
-from sqlalchemy import or_, select - -from dstack._internal.core.models.runs import JobStatus -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import JobModel -from dstack._internal.server.services.jobs import ( - TERMINATING_PROCESSING_JOBS_IDS, - TERMINATING_PROCESSING_JOBS_LOCK, - process_terminating_job, -) -from dstack._internal.server.services.runs import PROCESSING_RUNS_IDS, PROCESSING_RUNS_LOCK -from dstack._internal.utils.common import get_current_datetime -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def process_terminating_jobs(): - async with get_session_ctx() as session: - async with PROCESSING_RUNS_LOCK, TERMINATING_PROCESSING_JOBS_LOCK: - res = await session.execute( - select(JobModel) - .where( - JobModel.id.not_in(TERMINATING_PROCESSING_JOBS_IDS), - JobModel.status == JobStatus.TERMINATING, - JobModel.run_id.not_in(PROCESSING_RUNS_IDS), - or_(JobModel.remove_at.is_(None), JobModel.remove_at < get_current_datetime()), - ) - .order_by(JobModel.last_processed_at.asc()) - .limit(1) - ) - job_model = res.scalar() - if job_model is None: - return - TERMINATING_PROCESSING_JOBS_IDS.add(job_model.id) - try: - await _process_job(job_id=job_model.id) - finally: - TERMINATING_PROCESSING_JOBS_IDS.remove(job_model.id) - - -async def _process_job(job_id: uuid.UUID): - async with get_session_ctx() as session: - res = await session.execute(select(JobModel).where(JobModel.id == job_id)) - job_model = res.scalar_one() - await process_terminating_job(session, job_model) - job_model.last_processed_at = get_current_datetime() - await session.commit() diff --git a/src/dstack/_internal/server/background/tasks/process_volumes.py b/src/dstack/_internal/server/background/tasks/process_volumes.py deleted file mode 100644 index 4230f1cdd3..0000000000 --- a/src/dstack/_internal/server/background/tasks/process_volumes.py +++ /dev/null @@ -1,120 +0,0 @@ -from uuid import UUID - -from 
sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -from dstack._internal.core.errors import BackendError, BackendNotAvailable -from dstack._internal.core.models.volumes import VolumeStatus -from dstack._internal.server.db import get_session_ctx -from dstack._internal.server.models import VolumeModel -from dstack._internal.server.services import backends as backends_services -from dstack._internal.server.services import volumes as volumes_services -from dstack._internal.server.services.volumes import ( - PROCESSING_VOLUMES_IDS, - PROCESSING_VOLUMES_LOCK, -) -from dstack._internal.server.utils.common import run_async -from dstack._internal.utils.common import get_current_datetime -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def process_submitted_volumes(): - async with get_session_ctx() as session: - async with PROCESSING_VOLUMES_LOCK: - res = await session.execute( - select(VolumeModel) - .where( - VolumeModel.status == VolumeStatus.SUBMITTED, - VolumeModel.id.not_in(PROCESSING_VOLUMES_IDS), - ) - .order_by(VolumeModel.last_processed_at.asc()) - .limit(1) - ) - volume_model = res.scalar() - if volume_model is None: - return - - PROCESSING_VOLUMES_IDS.add(volume_model.id) - - try: - await _process_volume(volume_id=volume_model.id) - finally: - PROCESSING_VOLUMES_IDS.remove(volume_model.id) - - -async def _process_volume(volume_id: UUID): - async with get_session_ctx() as session: - res = await session.execute( - select(VolumeModel) - .where(VolumeModel.id == volume_id) - .options(joinedload(VolumeModel.project)) - ) - volume_model = res.scalar_one() - await _process_submitted_volume( - session=session, - volume_model=volume_model, - ) - - -async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel): - logger.info("Started submitted volume %s processing", volume_model.name) - - volume = 
volumes_services.volume_model_to_volume(volume_model) - try: - backend = await backends_services.get_project_backend_by_type_or_error( - project=volume_model.project, - backend_type=volume.configuration.backend, - overrides=True, - ) - except BackendNotAvailable: - logger.error( - "Failed to process volume %s. Backend %s not available.", - volume.name, - volume.configuration.backend.value, - ) - volume_model.status = VolumeStatus.FAILED - volume_model.status_message = "Backend not available" - volume_model.last_processed_at = get_current_datetime() - await session.commit() - return - - try: - if volume.configuration.volume_id is not None: - logger.info("Registering external volume %s", volume_model.name) - vpd = await run_async( - backend.compute().register_volume, - volume=volume, - ) - else: - logger.info("Provisioning new volume %s", volume_model.name) - vpd = await run_async( - backend.compute().create_volume, - volume=volume, - ) - except BackendError as e: - logger.info("Failed to create volume %s: %s", volume_model.name, repr(e)) - volume_model.status = VolumeStatus.FAILED - status_message = f"Backend error: {repr(e)}" - if len(e.args) > 0: - status_message = str(e.args[0]) - volume_model.status_message = status_message - volume_model.last_processed_at = get_current_datetime() - await session.commit() - return - except Exception as e: - logger.exception("Got exception when creating volume %s", volume_model.name) - volume_model.status = VolumeStatus.FAILED - volume_model.status_message = f"Unexpected error: {repr(e)}" - volume_model.last_processed_at = get_current_datetime() - await session.commit() - return - - # Provisioned volumes marked as active since they become available almost immediately in AWS - # TODO: Consider checking volume state - volume_model.volume_provisioning_data = vpd.json() - volume_model.status = VolumeStatus.ACTIVE - volume_model.last_processed_at = get_current_datetime() - await session.commit() diff --git 
a/src/dstack/_internal/server/compatibility/__init__.py b/src/dstack/_internal/server/compatibility/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/compatibility/common.py b/src/dstack/_internal/server/compatibility/common.py new file mode 100644 index 0000000000..ce982b6730 --- /dev/null +++ b/src/dstack/_internal/server/compatibility/common.py @@ -0,0 +1,33 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.profiles import ProfileParams + + +def patch_profile_params(params: ProfileParams, client_version: Optional[Version]) -> None: + if client_version is None: + return + # Clients prior to 0.20.14 only support `list[str]` in `fleets` + if client_version < Version("0.20.14") and params.fleets is not None: + params.fleets = [ + fleet_ref.format() if isinstance(fleet_ref, EntityReference) else fleet_ref + for fleet_ref in params.fleets + ] + + +def patch_offers_list( + offers: list[InstanceOfferWithAvailability], client_version: Optional[Version] +) -> None: + if client_version is None: + return + # CLIs prior to 0.20.4 incorrectly display the `no_balance` availability in the run/fleet plan + if client_version < Version("0.20.4"): + for offer in offers: + if offer.availability == InstanceAvailability.NO_BALANCE: + offer.availability = InstanceAvailability.NOT_AVAILABLE diff --git a/src/dstack/_internal/server/compatibility/fleets.py b/src/dstack/_internal/server/compatibility/fleets.py new file mode 100644 index 0000000000..ddd90d14d7 --- /dev/null +++ b/src/dstack/_internal/server/compatibility/fleets.py @@ -0,0 +1,23 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.fleets import Fleet, FleetPlan, FleetSpec +from 
dstack._internal.server.compatibility.common import patch_offers_list, patch_profile_params + + +def patch_fleet_plan(fleet_plan: FleetPlan, client_version: Optional[Version]) -> None: + patch_fleet_spec(fleet_plan.spec, client_version) + if fleet_plan.effective_spec is not None: + patch_fleet_spec(fleet_plan.effective_spec, client_version) + if fleet_plan.current_resource is not None: + patch_fleet(fleet_plan.current_resource, client_version) + patch_offers_list(fleet_plan.offers, client_version) + + +def patch_fleet(fleet: Fleet, client_version: Optional[Version]) -> None: + patch_fleet_spec(fleet.spec, client_version) + + +def patch_fleet_spec(fleet_spec: FleetSpec, client_version: Optional[Version]) -> None: + patch_profile_params(fleet_spec.profile, client_version) diff --git a/src/dstack/_internal/server/compatibility/gateways.py b/src/dstack/_internal/server/compatibility/gateways.py new file mode 100644 index 0000000000..653f3d6e22 --- /dev/null +++ b/src/dstack/_internal/server/compatibility/gateways.py @@ -0,0 +1,15 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.gateways import Gateway + + +def patch_gateway(gateway: Gateway, client_version: Optional[Version]) -> None: + if client_version is None: + return + if client_version < Version("0.20.25"): + gateway.instance_id = "" + gateway.ip_address = "\n".join(r.hostname for r in gateway.replicas) + if gateway.hostname is None: + gateway.hostname = gateway.ip_address diff --git a/src/dstack/_internal/server/compatibility/gpus.py b/src/dstack/_internal/server/compatibility/gpus.py new file mode 100644 index 0000000000..8548e58bf9 --- /dev/null +++ b/src/dstack/_internal/server/compatibility/gpus.py @@ -0,0 +1,22 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.instances import InstanceAvailability +from dstack._internal.server.schemas.gpus import ListGpusResponse + + +def 
patch_list_gpus_response( + response: ListGpusResponse, client_version: Optional[Version] +) -> None: + if client_version is None: + return + # CLIs prior to 0.20.4 incorrectly display the `no_balance` availability in `dstack offer --group-by gpu` + if client_version < Version("0.20.4"): + for gpu in response.gpus: + if InstanceAvailability.NO_BALANCE in gpu.availability: + gpu.availability = [ + a for a in gpu.availability if a != InstanceAvailability.NO_BALANCE + ] + if InstanceAvailability.NOT_AVAILABLE not in gpu.availability: + gpu.availability.append(InstanceAvailability.NOT_AVAILABLE) diff --git a/src/dstack/_internal/server/compatibility/runs.py b/src/dstack/_internal/server/compatibility/runs.py new file mode 100644 index 0000000000..752f5f784b --- /dev/null +++ b/src/dstack/_internal/server/compatibility/runs.py @@ -0,0 +1,54 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.configurations import SERVICE_HTTPS_DEFAULT, ServiceConfiguration +from dstack._internal.core.models.runs import Run, RunPlan, RunSpec +from dstack._internal.server.compatibility.common import patch_offers_list, patch_profile_params + + +def patch_run_plan(run_plan: RunPlan, client_version: Optional[Version]) -> None: + if client_version is None: + return + patch_run_spec(run_plan.run_spec, client_version) + if run_plan.effective_run_spec is not None: + patch_run_spec(run_plan.effective_run_spec, client_version) + if run_plan.current_resource is not None: + patch_run(run_plan.current_resource, client_version) + for job_plan in run_plan.job_plans: + patch_offers_list(job_plan.offers, client_version) + + +def patch_run(run: Run, client_version: Optional[Version]) -> None: + if client_version is None: + return + patch_run_spec(run.run_spec, client_version) + + +def patch_run_spec(run_spec: RunSpec, client_version: Optional[Version]) -> None: + if client_version is 
None: + return + # Clients prior to 0.20.8 do not support probes = None + if client_version < Version("0.20.8") and isinstance( + run_spec.configuration, ServiceConfiguration + ): + if run_spec.configuration.probes is None: + run_spec.configuration.probes = [] + # Clients prior to 0.20.12 do not support https = None + if ( + client_version < Version("0.20.12") + and isinstance(run_spec.configuration, ServiceConfiguration) + and run_spec.configuration.https is None + ): + run_spec.configuration.https = SERVICE_HTTPS_DEFAULT + patch_profile_params(run_spec.configuration, client_version) + if run_spec.profile is not None: + patch_profile_params(run_spec.profile, client_version) + # Clients prior to 0.20.20 do not support `EntityReference` in `gateway` + if ( + client_version < Version("0.20.20") + and isinstance(run_spec.configuration, ServiceConfiguration) + and isinstance(run_spec.configuration.gateway, EntityReference) + ): + run_spec.configuration.gateway = run_spec.configuration.gateway.format() diff --git a/src/dstack/_internal/server/const.py b/src/dstack/_internal/server/const.py new file mode 100644 index 0000000000..0ebf4643cb --- /dev/null +++ b/src/dstack/_internal/server/const.py @@ -0,0 +1,5 @@ +GLOBAL_EXPORTS_LOCK_NAMESPACE = "global_exports" +""" +Lock used to avoid race conditions between promoting an export to global and creating new projects. +Ensures that all projects always import all global exports. 
+""" diff --git a/src/dstack/_internal/server/db.py b/src/dstack/_internal/server/db.py index 155cdba40a..2eb18a3f3c 100644 --- a/src/dstack/_internal/server/db.py +++ b/src/dstack/_internal/server/db.py @@ -1,58 +1,95 @@ from contextlib import asynccontextmanager +from typing import Optional from alembic import command, config -from sqlalchemy import event +from sqlalchemy import AsyncAdaptedQueuePool, event from sqlalchemy.engine.interfaces import DBAPIConnection -from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine -from sqlalchemy.orm import sessionmaker +from sqlalchemy.ext.asyncio import ( + AsyncEngine, + AsyncSession, + async_sessionmaker, + create_async_engine, +) from sqlalchemy.pool import ConnectionPoolEntry from dstack._internal.server import settings +from dstack._internal.server.services.locking import advisory_lock_ctx from dstack._internal.server.settings import DATABASE_URL class Database: - def __init__(self, url: str): + def __init__(self, url: str, engine: Optional[AsyncEngine] = None): self.url = url - self.engine = create_async_engine(self.url, echo=settings.SQL_ECHO_ENABLED) - self.session_maker = sessionmaker( - bind=self.engine, + if engine is not None: + self.engine = engine + else: + self.engine = create_async_engine( + self.url, + echo=settings.SQL_ECHO_ENABLED, + poolclass=AsyncAdaptedQueuePool, + pool_size=settings.DB_POOL_SIZE, + max_overflow=settings.DB_MAX_OVERFLOW, + ) + self.session_maker = async_sessionmaker( + bind=self.engine, # type: ignore[assignment] expire_on_commit=False, + # Disable autoflush to avoid accidental long write transactions on SQLite. 
+ autoflush=False, class_=AsyncSession, ) - if self.get_dialect_name() == "sqlite": + if self.dialect_name == "sqlite": @event.listens_for(self.engine.sync_engine, "connect") def set_sqlite_pragma(dbapi_connection: DBAPIConnection, _: ConnectionPoolEntry): cursor = dbapi_connection.cursor() cursor.execute("PRAGMA journal_mode=WAL;") cursor.execute("PRAGMA foreign_keys=ON;") - cursor.execute("PRAGMA busy_timeout=5000;") + cursor.execute("PRAGMA synchronous=NORMAL;") + cursor.execute("PRAGMA busy_timeout=30000;") cursor.close() - def get_dialect_name(self) -> str: + @property + def dialect_name(self) -> str: return self.engine.dialect.name def get_session(self) -> AsyncSession: return self.session_maker() -db = Database(url=DATABASE_URL) +def get_new_db() -> Database: + """ + Creates a new Database with a new Engine. + Use this when you need to access the DB in a new thread instead of calling Database directly + since it's easier to monkey-patch. + """ + return Database(url=DATABASE_URL) + + +_db = get_new_db() + + +def get_db() -> Database: + return _db def override_db(new_db: Database): - global db - db = new_db + global _db + _db = new_db async def migrate(): - async with db.engine.connect() as connection: - await connection.run_sync(_run_alembic_upgrade) + async with _db.engine.connect() as connection: + async with advisory_lock_ctx( + bind=connection, + dialect_name=_db.dialect_name, + resource="migrations", + ): + await connection.run_sync(_run_alembic_upgrade) async def get_session(): - async with db.get_session() as session: + async with _db.get_session() as session: yield session await session.commit() @@ -68,8 +105,26 @@ async def new_func(*args, **kwargs): return new_func +def is_db_sqlite() -> bool: + return get_db().dialect_name == "sqlite" + + +def is_db_postgres() -> bool: + return get_db().dialect_name == "postgresql" + + +async def sqlite_commit(session: AsyncSession): + """ + Commit an sqlite transaction. 
+ Should be used before taking locks in active sessions to see committed changes. + """ + if is_db_sqlite(): + await session.commit() + + def _run_alembic_upgrade(connection): alembic_cfg = config.Config() alembic_cfg.set_main_option("script_location", settings.ALEMBIC_MIGRATIONS_LOCATION) + alembic_cfg.set_main_option("recursive_version_locations", "true") alembic_cfg.attributes["connection"] = connection command.upgrade(alembic_cfg, "head") diff --git a/src/dstack/_internal/server/migrations/env.py b/src/dstack/_internal/server/migrations/env.py index 7a80c7464e..c7c27f1f8b 100644 --- a/src/dstack/_internal/server/migrations/env.py +++ b/src/dstack/_internal/server/migrations/env.py @@ -5,12 +5,12 @@ from alembic import context from sqlalchemy import Connection, MetaData, text -from dstack._internal.server.db import db -from dstack._internal.server.models import BaseModel +from dstack._internal.server.db import get_db +from dstack._internal.server.models import BaseModel, EnumAsString config = context.config -if config.config_file_name is not None: +if config.config_file_name is not None and config.attributes.get("configure_logging", True): fileConfig(config.config_file_name) target_metadata = BaseModel.metadata @@ -21,6 +21,14 @@ def set_target_metadata(metadata: MetaData): target_metadata = metadata +def render_item(type_, obj, autogen_context): + """Apply custom rendering for selected items.""" + if type_ == "type" and isinstance(obj, EnumAsString): + return f"sa.String(length={obj.length})" + # default rendering for other objects + return False + + def run_migrations_offline(): """Run migrations in 'offline' mode. This configures the context with just a URL @@ -31,12 +39,12 @@ def run_migrations_offline(): script output. 
""" context.configure( - url=db.url, + url=get_db().url, target_metadata=target_metadata, literal_binds=True, dialect_opts={"paramstyle": "named"}, + render_item=render_item, ) - with context.begin_transaction(): context.run_migrations() @@ -59,25 +67,35 @@ def run_migrations(connection: Connection): # Temporarily disable foreign keys, # so that sqlite batch table migrations are performed without data loss: # https://alembic.sqlalchemy.org/en/latest/batch.html#dealing-with-referencing-foreign-keys - if db.get_dialect_name() == "sqlite": + if connection.dialect.name == "sqlite": connection.execute(text("PRAGMA foreign_keys=OFF;")) + elif connection.dialect.name == "postgresql": + # lock_timeout is needed so that migrations that acquire locks + # do not wait for locks forever, blocking live queries. + # Better to fail and retry a deployment. + connection.execute(text("SET lock_timeout='15s';")) connection.commit() context.configure( connection=connection, target_metadata=target_metadata, compare_type=True, render_as_batch=True, + render_item=render_item, + # Running each migration in a separate transaction. + # Running all migrations in one transaction may lead to deadlocks in HA deployments + # because lock ordering is not respected across all migrations. 
+ transaction_per_migration=True, ) with context.begin_transaction(): context.run_migrations() - if db.get_dialect_name() == "sqlite": + if connection.dialect.name == "sqlite": connection.execute(text("PRAGMA foreign_keys=ON;")) connection.commit() async def run_async_migrations(): - engine = db.engine - async with db.engine.connect() as connection: + engine = get_db().engine + async with engine.connect() as connection: await connection.run_sync(run_migrations) await engine.dispose() diff --git a/src/dstack/_internal/server/migrations/script.py.mako b/src/dstack/_internal/server/migrations/script.py.mako index 90e99f0635..79c96c3a3d 100644 --- a/src/dstack/_internal/server/migrations/script.py.mako +++ b/src/dstack/_internal/server/migrations/script.py.mako @@ -6,9 +6,11 @@ Create Date: ${create_date} """ -from alembic import op import sqlalchemy as sa import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models ${imports if imports else ""} # revision identifiers, used by Alembic. 
diff --git a/src/dstack/_internal/server/migrations/versions/a060e2440936_.py b/src/dstack/_internal/server/migrations/versions/2023/09_20_1634_a060e2440936_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/a060e2440936_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_20_1634_a060e2440936_.py diff --git a/src/dstack/_internal/server/migrations/versions/bfba43f6def2_.py b/src/dstack/_internal/server/migrations/versions/2023/09_22_1052_bfba43f6def2_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/bfba43f6def2_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_22_1052_bfba43f6def2_.py diff --git a/src/dstack/_internal/server/migrations/versions/252d3743b641_.py b/src/dstack/_internal/server/migrations/versions/2023/09_25_1609_252d3743b641_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/252d3743b641_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_25_1609_252d3743b641_.py diff --git a/src/dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py b/src/dstack/_internal/server/migrations/versions/2023/09_27_1742_fe72c4de8376_add_gateways.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py rename to src/dstack/_internal/server/migrations/versions/2023/09_27_1742_fe72c4de8376_add_gateways.py diff --git a/src/dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py b/src/dstack/_internal/server/migrations/versions/2023/11_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py rename to src/dstack/_internal/server/migrations/versions/2023/11_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py diff --git 
a/src/dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py b/src/dstack/_internal/server/migrations/versions/2023/11_01_1135_112753bc17dd_remove_nullable_fields.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py rename to src/dstack/_internal/server/migrations/versions/2023/11_01_1135_112753bc17dd_remove_nullable_fields.py diff --git a/src/dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py b/src/dstack/_internal/server/migrations/versions/2023/11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py rename to src/dstack/_internal/server/migrations/versions/2023/11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py diff --git a/src/dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py b/src/dstack/_internal/server/migrations/versions/2023/11_06_1613_23e01c56279a_make_blob_nullable.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py rename to src/dstack/_internal/server/migrations/versions/2023/11_06_1613_23e01c56279a_make_blob_nullable.py diff --git a/src/dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py b/src/dstack/_internal/server/migrations/versions/2023/11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py rename to src/dstack/_internal/server/migrations/versions/2023/11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py diff --git a/src/dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py b/src/dstack/_internal/server/migrations/versions/2023/11_14_1609_686fb8341ea5_add_user_emails.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py rename to src/dstack/_internal/server/migrations/versions/2023/11_14_1609_686fb8341ea5_add_user_emails.py diff --git a/src/dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py b/src/dstack/_internal/server/migrations/versions/2023/12_11_1034_e6391ca6a264_separate_gateways_from_compute.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py rename to src/dstack/_internal/server/migrations/versions/2023/12_11_1034_e6391ca6a264_separate_gateways_from_compute.py diff --git a/src/dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py b/src/dstack/_internal/server/migrations/versions/2023/12_19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py rename to src/dstack/_internal/server/migrations/versions/2023/12_19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py diff --git a/src/dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py b/src/dstack/_internal/server/migrations/versions/2024/01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py rename to src/dstack/_internal/server/migrations/versions/2024/01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py diff --git a/src/dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py b/src/dstack/_internal/server/migrations/versions/2024/02_12_1427_27d3e55759fa_add_pools.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py rename to src/dstack/_internal/server/migrations/versions/2024/02_12_1427_27d3e55759fa_add_pools.py diff --git 
a/src/dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py b/src/dstack/_internal/server/migrations/versions/2024/02_14_1139_29c08c6a8cb3_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py rename to src/dstack/_internal/server/migrations/versions/2024/02_14_1139_29c08c6a8cb3_.py diff --git a/src/dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py b/src/dstack/_internal/server/migrations/versions/2024/02_19_1139_9eea6af28e10_added_fail_reason_for_instancemodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py rename to src/dstack/_internal/server/migrations/versions/2024/02_19_1139_9eea6af28e10_added_fail_reason_for_instancemodel.py diff --git a/src/dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py b/src/dstack/_internal/server/migrations/versions/2024/02_21_1011_1a48dfe44a40_rework_termination_handling.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py rename to src/dstack/_internal/server/migrations/versions/2024/02_21_1011_1a48dfe44a40_rework_termination_handling.py diff --git a/src/dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py b/src/dstack/_internal/server/migrations/versions/2024/02_28_0547_ed0ca30e13bb_migrate_instancestatus_provisioning.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py rename to src/dstack/_internal/server/migrations/versions/2024/02_28_0547_ed0ca30e13bb_migrate_instancestatus_provisioning.py diff --git a/src/dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py b/src/dstack/_internal/server/migrations/versions/2024/02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py 
similarity index 100% rename from src/dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py rename to src/dstack/_internal/server/migrations/versions/2024/02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py diff --git a/src/dstack/_internal/server/migrations/versions/2024/03_01_1430_4b4319398164_introduce_runs_processing.py b/src/dstack/_internal/server/migrations/versions/2024/03_01_1430_4b4319398164_introduce_runs_processing.py new file mode 100644 index 0000000000..bfb1a98411 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/03_01_1430_4b4319398164_introduce_runs_processing.py @@ -0,0 +1,144 @@ +"""Introduce runs processing + +Revision ID: 4b4319398164 +Revises: b88d55c2a07d +Create Date: 2024-03-01 14:30:28.918255 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "4b4319398164" +down_revision = "b88d55c2a07d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + # last_processed_at is nullable=False later + batch_op.add_column(sa.Column("last_processed_at", sa.DateTime(), nullable=True)) + batch_op.add_column( + sa.Column( + "gateway_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + run_termination_reason_enum = sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ) + run_termination_reason_enum.create(op.get_bind(), checkfirst=True) + batch_op.add_column( + sa.Column( + "termination_reason", + run_termination_reason_enum, + nullable=True, + ) + ) + batch_op.add_column(sa.Column("service_spec", sa.String(length=4000), nullable=True)) + batch_op.create_foreign_key( + batch_op.f("fk_runs_gateway_id_gateways"), + "gateways", + ["gateway_id"], + ["id"], + ondelete="SET NULL", + ) + op.execute("UPDATE 
runs SET last_processed_at = submitted_at") + op.execute( + "UPDATE runs SET " + " status = 'TERMINATED' " + "WHERE id NOT IN ( " + " SELECT run_id FROM jobs " + " WHERE status NOT IN ('TERMINATED', 'ABORTED', 'FAILED', 'DONE') " + ")" + ) + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column("last_processed_at", nullable=False) + + job_termination_reason_enum = sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + name="jobterminationreason", + ) + job_termination_reason_enum.create(op.get_bind(), checkfirst=True) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "error_code", + new_column_name="termination_reason", + type_=job_termination_reason_enum, + postgresql_using=("error_code::VARCHAR::jobterminationreason"), + ) + # replica_num is nullable=False later + batch_op.add_column(sa.Column("replica_num", sa.Integer(), nullable=True)) + batch_op.drop_column("removed") + batch_op.execute("UPDATE jobs SET replica_num = 0") + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column("replica_num", nullable=False) + + +def downgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + new_column_name="error_code", + type_=sa.VARCHAR(length=34), + ) + batch_op.add_column( + # all jobs will get not removed + sa.Column("removed", sa.BOOLEAN(), server_default=sa.false(), nullable=False) + ) + batch_op.drop_column("replica_num") + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_runs_gateway_id_gateways"), type_="foreignkey") + batch_op.drop_column("service_spec") + 
batch_op.drop_column("termination_reason") + batch_op.drop_column("gateway_id") + batch_op.drop_column("last_processed_at") + op.execute("UPDATE runs SET status = 'SUBMITTED'") + op.execute("UPDATE jobs SET removed = TRUE") + + run_termination_reason_enum = sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ) + run_termination_reason_enum.drop(op.get_bind(), checkfirst=True) + + job_termination_reason_enum = sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + name="jobterminationreason", + ) + job_termination_reason_enum.drop(op.get_bind(), checkfirst=True) diff --git a/src/dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py b/src/dstack/_internal/server/migrations/versions/2024/03_07_1721_0e33559e16ed_update_instancestatus.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py rename to src/dstack/_internal/server/migrations/versions/2024/03_07_1721_0e33559e16ed_update_instancestatus.py diff --git a/src/dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py b/src/dstack/_internal/server/migrations/versions/2024/03_12_1717_555138b1f77f_change_instancemodel_for_asynchronous_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py rename to src/dstack/_internal/server/migrations/versions/2024/03_12_1717_555138b1f77f_change_instancemodel_for_asynchronous_.py diff --git a/src/dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py 
b/src/dstack/_internal/server/migrations/versions/2024/03_13_1048_5ec538b70e71_replace_instansestatus.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py rename to src/dstack/_internal/server/migrations/versions/2024/03_13_1048_5ec538b70e71_replace_instansestatus.py diff --git a/src/dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py b/src/dstack/_internal/server/migrations/versions/2024/03_18_1216_4ae1a5b0e7f1_add_run_list_index.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py rename to src/dstack/_internal/server/migrations/versions/2024/03_18_1216_4ae1a5b0e7f1_add_run_list_index.py diff --git a/src/dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py b/src/dstack/_internal/server/migrations/versions/2024/03_29_0637_99b4c8c954ea_add_termination_reason_message.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py rename to src/dstack/_internal/server/migrations/versions/2024/03_29_0637_99b4c8c954ea_add_termination_reason_message.py diff --git a/src/dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py b/src/dstack/_internal/server/migrations/versions/2024/04_02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py rename to src/dstack/_internal/server/migrations/versions/2024/04_02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py diff --git a/src/dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py b/src/dstack/_internal/server/migrations/versions/2024/04_08_0802_1e3fb39ef74b_add_remote_connection_details.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py rename to src/dstack/_internal/server/migrations/versions/2024/04_08_0802_1e3fb39ef74b_add_remote_connection_details.py diff --git a/src/dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py b/src/dstack/_internal/server/migrations/versions/2024/05_15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py rename to src/dstack/_internal/server/migrations/versions/2024/05_15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py diff --git a/src/dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py b/src/dstack/_internal/server/migrations/versions/2024/05_16_1418_c154eece89da_add_fields_for_async_gateway_creation.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py rename to src/dstack/_internal/server/migrations/versions/2024/05_16_1418_c154eece89da_add_fields_for_async_gateway_creation.py diff --git a/src/dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py b/src/dstack/_internal/server/migrations/versions/2024/05_22_1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py rename to src/dstack/_internal/server/migrations/versions/2024/05_22_1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py diff --git a/src/dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py b/src/dstack/_internal/server/migrations/versions/2024/05_29_1040_29826f417010_remove_instancemodel_retry_policy.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py rename to src/dstack/_internal/server/migrations/versions/2024/05_29_1040_29826f417010_remove_instancemodel_retry_policy.py diff --git a/src/dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py b/src/dstack/_internal/server/migrations/versions/2024/05_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py rename to src/dstack/_internal/server/migrations/versions/2024/05_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py diff --git a/src/dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py b/src/dstack/_internal/server/migrations/versions/2024/06_26_1122_98cd9c8b5927_add_volumemodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py rename to src/dstack/_internal/server/migrations/versions/2024/06_26_1122_98cd9c8b5927_add_volumemodel.py diff --git a/src/dstack/_internal/server/migrations/versions/2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py b/src/dstack/_internal/server/migrations/versions/2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py new file mode 100644 index 0000000000..5d0fa4a506 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py @@ -0,0 +1,329 @@ +"""Fixes for psql + +Revision ID: 5ad8debc8fe6 +Revises: 98cd9c8b5927 +Create Date: 2024-07-04 17:26:01.937112 + +""" + +import sqlalchemy as sa +from alembic import op +from alembic_postgresql_enum import TableReference +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "5ad8debc8fe6" +down_revision = "98cd9c8b5927" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).create(op.get_bind()) + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.alter_column( + "config", + existing_type=sa.VARCHAR(length=2000), + type_=sa.String(length=20000), + existing_nullable=False, + ) + batch_op.alter_column( + "auth", + existing_type=sa.VARCHAR(length=2000), + type_=sa.String(length=20000), + existing_nullable=False, + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "runner_timestamp", + existing_type=sa.INTEGER(), + type_=sa.BigInteger(), + existing_nullable=True, + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + type_=sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::runstatus", + ) + + sa.Enum( + "NO_INSTANCE_MATCHING_REQUIREMENTS", + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_TERMINATED", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + name="joberrorcode", + ).drop(op.get_bind()) + op.sync_enum_values( + "public", + "backendtype", + [ + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + ], + [ + TableReference(table_schema="public", table_name="instances", column_name="backend"), + TableReference(table_schema="public", table_name="backends", column_name="type"), + ], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "repotype", + 
["REMOTE", "LOCAL", "VIRTUAL"], + [TableReference(table_schema="public", table_name="repos", column_name="type")], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "jobstatus", + [ + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + ], + [TableReference(table_schema="public", table_name="jobs", column_name="status")], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "jobterminationreason", + [ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + ], + [ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "instancestatus", + ["PENDING", "PROVISIONING", "IDLE", "BUSY", "TERMINATING", "TERMINATED"], + [TableReference(table_schema="public", table_name="instances", column_name="status")], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.sync_enum_values( + "public", + "instancestatus", + [ + "PENDING", + "CREATING", + "STARTING", + "READY", + "BUSY", + "TERMINATING", + "TERMINATED", + "FAILED", + ], + [TableReference(table_schema="public", table_name="instances", column_name="status")], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "jobterminationreason", + [ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + ], + [ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "jobstatus", + [ + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + ], + [TableReference(table_schema="public", table_name="jobs", column_name="status")], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "repotype", + ["REMOTE", "LOCAL"], + [TableReference(table_schema="public", table_name="repos", column_name="type")], + enum_values_to_rename=[], + ) + op.sync_enum_values( + "public", + "backendtype", + ["AWS", "AZURE", "GCP", "LAMBDA"], + [ + TableReference(table_schema="public", table_name="instances", column_name="backend"), + TableReference(table_schema="public", table_name="backends", column_name="type"), + ], + enum_values_to_rename=[], + ) + sa.Enum( + "NO_INSTANCE_MATCHING_REQUIREMENTS", + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_TERMINATED", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + name="joberrorcode", + ).create(op.get_bind()) + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.Enum( + "PENDING", + "SUBMITTED", + 
"PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + type_=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::jobstatus", + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "runner_timestamp", + existing_type=sa.BigInteger(), + type_=sa.INTEGER(), + existing_nullable=True, + ) + + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.alter_column( + "auth", + existing_type=sa.String(length=20000), + type_=sa.VARCHAR(length=2000), + existing_nullable=False, + ) + batch_op.alter_column( + "config", + existing_type=sa.String(length=20000), + type_=sa.VARCHAR(length=2000), + existing_nullable=False, + ) + + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).drop(op.get_bind()) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py b/src/dstack/_internal/server/migrations/versions/2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py new file mode 100644 index 0000000000..62e8a32ab9 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py @@ -0,0 +1,36 @@ +"""Extend repos.creds column + +Revision ID: 91ac5e543037 +Revises: 5ad8debc8fe6 +Create Date: 2024-07-14 21:43:03.242059 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "91ac5e543037" +down_revision = "5ad8debc8fe6" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "creds", + existing_type=sa.VARCHAR(length=2000), + type_=sa.String(length=5000), + existing_nullable=True, + ) + + +def downgrade() -> None: + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "creds", + existing_type=sa.String(length=5000), + type_=sa.VARCHAR(length=2000), + existing_nullable=True, + ) diff --git a/src/dstack/_internal/server/migrations/versions/2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py b/src/dstack/_internal/server/migrations/versions/2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py new file mode 100644 index 0000000000..1deb2fb546 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py @@ -0,0 +1,85 @@ +"""Store repo clone URL + +Revision ID: 3cf77fb8bcf1 +Revises: 91ac5e543037 +Create Date: 2024-07-15 23:09:40.150763 + +""" + +import json +import uuid + +import sqlalchemy as sa +from alembic import op +from sqlalchemy_utils import UUIDType + +# revision identifiers, used by Alembic. 
+revision = "3cf77fb8bcf1" +down_revision = "91ac5e543037" +branch_labels = None +depends_on = None + +repos_table = sa.Table( + "repos", + sa.MetaData(), + # partial description - only columns affected by this migration + sa.Column("id", UUIDType(binary=False), primary_key=True, default=uuid.uuid4), + sa.Column("info", sa.String(2000), nullable=False), + sa.Column("creds", sa.String(5000), nullable=True), +) + + +def upgrade() -> None: + select_stmt = sa.select(repos_table.c.id, repos_table.c.info, repos_table.c.creds).where( + repos_table.c.creds.isnot(None) + ) + + batch_update_params = [] + + for row in op.get_bind().execute(select_stmt).all(): + creds = json.loads(row.creds) + info = json.loads(row.info) + + repo_host_name = info["repo_host_name"] + repo_port = info.get("repo_port") + repo_user_name = info["repo_user_name"] + repo_name = info["repo_name"] + + netloc = f"{repo_host_name}:{repo_port}" if repo_port else repo_host_name + + if creds["protocol"] == "ssh": + clone_url = f"ssh://git@{netloc}/{repo_user_name}/{repo_name}.git" + else: + clone_url = f"https://{netloc}/{repo_user_name}/{repo_name}.git" + + creds["clone_url"] = clone_url + batch_update_params.append({"_id": row.id, "creds": json.dumps(creds)}) + + update_stmt = ( + repos_table.update() + .where(repos_table.c.id == sa.bindparam("_id")) + .values(creds=sa.bindparam("creds")) + ) + if batch_update_params: + op.get_bind().execute(update_stmt, batch_update_params) + + +def downgrade() -> None: + select_stmt = sa.select(repos_table.c.id, repos_table.c.creds).where( + repos_table.c.creds.isnot(None) + ) + + batch_update_params = [] + + for row in op.get_bind().execute(select_stmt).all(): + creds = json.loads(row.creds) + creds.pop("clone_url", None) + batch_update_params.append({"_id": row.id, "creds": json.dumps(creds)}) + + update_stmt = ( + repos_table.update() + .where(repos_table.c.id == sa.bindparam("_id")) + .values(creds=sa.bindparam("creds")) + ) + if batch_update_params: + 
op.get_bind().execute(update_stmt, batch_update_params) diff --git a/src/dstack/_internal/server/migrations/versions/2024/07_17_1543_c00090eaef21_support_fleets.py b/src/dstack/_internal/server/migrations/versions/2024/07_17_1543_c00090eaef21_support_fleets.py new file mode 100644 index 0000000000..5cb368e8fd --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/07_17_1543_c00090eaef21_support_fleets.py @@ -0,0 +1,108 @@ +"""Support fleets + +Revision ID: c00090eaef21 +Revises: 3cf77fb8bcf1 +Create Date: 2024-07-17 15:43:24.476764 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op +from sqlalchemy.dialects import postgresql + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "c00090eaef21" +down_revision = "3cf77fb8bcf1" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + sa.Enum( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ).create(op.get_bind()) + op.create_table( + "fleets", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column( + "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False + ), + sa.Column("deleted", sa.Boolean(), nullable=False), + sa.Column("deleted_at", dstack._internal.server.models.NaiveDateTime(), nullable=True), + sa.Column( + "status", + postgresql.ENUM( + "SUBMITTED", + "ACTIVE", + "TERMINATING", + "TERMINATED", + "FAILED", + name="fleetstatus", + create_type=False, + ), + nullable=False, + ), + sa.Column("status_message", sa.Text(), nullable=True), + sa.Column("spec", sa.Text(), nullable=False), + 
sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_fleets_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_fleets")), + ) + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column(sa.Column("instance_num", sa.Integer(), nullable=True)) + batch_op.add_column( + sa.Column( + "fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_instances_fleet_id_fleets"), "fleets", ["fleet_id"], ["id"] + ) + + batch_op.execute("UPDATE instances SET instance_num = 0") + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column("instance_num", nullable=False) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_runs_fleet_id_fleets"), "fleets", ["fleet_id"], ["id"] + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_runs_fleet_id_fleets"), type_="foreignkey") + batch_op.drop_column("fleet_id") + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_instances_fleet_id_fleets"), type_="foreignkey") + batch_op.drop_column("fleet_id") + batch_op.drop_column("instance_num") + + op.drop_table("fleets") + sa.Enum("SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus").drop( + op.get_bind() + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/08_15_1024_710e5b3fac8f_add_encryption.py b/src/dstack/_internal/server/migrations/versions/2024/08_15_1024_710e5b3fac8f_add_encryption.py new file mode 100644 index 0000000000..105292043c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/08_15_1024_710e5b3fac8f_add_encryption.py @@ -0,0 +1,54 @@ +"""Add encryption + +Revision ID: 710e5b3fac8f +Revises: c00090eaef21 +Create Date: 2024-08-15 10:24:30.113834 + +""" + +import hashlib + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "710e5b3fac8f" +down_revision = "c00090eaef21" +branch_labels = None +depends_on = None + + +ENCODED_PREFIX = "enc:identity:noname:" + + +def upgrade() -> None: + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.add_column(sa.Column("token_hash", sa.String(length=2000), nullable=True)) + batch_op.create_unique_constraint(batch_op.f("uq_users_token_hash"), ["token_hash"]) + + batch_update_params = [] + result = op.get_bind().execute(sa.text("SELECT id, token FROM users")) + for row in result: + token_hash = hashlib.sha256(row.token.encode()).hexdigest() + batch_update_params.append({"token_hash": token_hash, "id": row.id}) + if batch_update_params: + op.get_bind().execute( + sa.text("UPDATE users SET token_hash = :token_hash WHERE id = :id"), + batch_update_params, + ) + + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column("token_hash", nullable=False) + + op.execute(f"UPDATE backends SET auth = '{ENCODED_PREFIX}' || auth") + op.execute(f"UPDATE users SET token = '{ENCODED_PREFIX}' || token") + + +def downgrade() -> None: + # Assumes every users.token and backends.auth value starts with ENCODED_PREFIX ('enc:identity:noname:') before downgrading; the prefix is stripped by position, not matched + op.execute(f"UPDATE users SET token = SUBSTRING(token, {len(ENCODED_PREFIX) + 1})") + op.execute(f"UPDATE backends SET auth = SUBSTRING(auth, {len(ENCODED_PREFIX) + 1})") + + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("uq_users_token_hash"), type_="unique") + batch_op.drop_column("token_hash") diff --git a/src/dstack/_internal/server/migrations/versions/2024/08_16_1425_54a77e19c64c_add_manager_project_role.py b/src/dstack/_internal/server/migrations/versions/2024/08_16_1425_54a77e19c64c_add_manager_project_role.py new file mode 100644 index 0000000000..f8e8cf8f8e --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/08_16_1425_54a77e19c64c_add_manager_project_role.py @@ -0,0 +1,67 @@ +"""Add Manager project role + +Revision ID: 
54a77e19c64c +Revises: 710e5b3fac8f +Create Date: 2024-08-16 14:25:52.125915 + +""" + +import sqlalchemy as sa +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. +revision = "54a77e19c64c" +down_revision = "710e5b3fac8f" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + if op.get_context().dialect.name == "postgresql": + op.sync_enum_values( + "public", + "projectrole", + ["ADMIN", "MANAGER", "USER"], + [ + TableReference( + table_schema="public", table_name="members", column_name="project_role" + ) + ], + enum_values_to_rename=[], + ) + else: + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=sa.VARCHAR(length=5), + type_=sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole"), + existing_nullable=False, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + if op.get_context().dialect.name == "postgresql": + op.sync_enum_values( + "public", + "projectrole", + ["ADMIN", "USER"], + [ + TableReference( + table_schema="public", table_name="members", column_name="project_role" + ) + ], + enum_values_to_rename=[], + ) + else: + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole"), + type_=sa.VARCHAR(length=5), + existing_nullable=False, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/08_19_1510_d6b11105f659_add_usermodel_active.py b/src/dstack/_internal/server/migrations/versions/2024/08_19_1510_d6b11105f659_add_usermodel_active.py new file mode 100644 index 0000000000..80fd4de55b --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/08_19_1510_d6b11105f659_add_usermodel_active.py @@ -0,0 +1,36 @@ +"""Add UserModel.active + +Revision ID: d6b11105f659 +Revises: 54a77e19c64c +Create Date: 2024-08-19 15:10:25.751199 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d6b11105f659" +down_revision = "54a77e19c64c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.add_column(sa.Column("active", sa.Boolean(), nullable=True)) + + op.execute(sa.sql.text("UPDATE users SET active = TRUE")) + + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column("active", nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_column("active") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py b/src/dstack/_internal/server/migrations/versions/2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py new file mode 100644 index 0000000000..7ec511b7ef --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py @@ -0,0 +1,32 @@ +"""Add MemberModel.member_num + +Revision ID: ea60480f82bb +Revises: d6b11105f659 +Create Date: 2024-08-21 14:20:34.605661 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "ea60480f82bb" +down_revision = "d6b11105f659" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.add_column(sa.Column("member_num", sa.Integer(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.drop_column("member_num") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py b/src/dstack/_internal/server/migrations/versions/2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py new file mode 100644 index 0000000000..d9f6965ca1 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py @@ -0,0 +1,68 @@ +"""Add InstanceModel.last_processed_at + +Revision ID: 7b24b1c8eba7 +Revises: ea60480f82bb +Create Date: 2024-08-30 13:42:08.961792 + +""" + +import uuid + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models +from dstack._internal.utils.common import get_current_datetime + +# revision identifiers, used by Alembic. +revision = "7b24b1c8eba7" +down_revision = "ea60480f82bb" +branch_labels = None +depends_on = None + + +instance_table = sa.Table( + "instances", + sa.MetaData(), + # partial description - only columns affected by this migration + sa.Column("id", sqlalchemy_utils.UUIDType(binary=False), primary_key=True, default=uuid.uuid4), + sa.Column("last_processed_at", dstack._internal.server.models.NaiveDateTime()), +) + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + + op.get_bind().execute( + (instance_table.update().values(last_processed_at=get_current_datetime())) + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column("last_processed_at", nullable=False) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("instance_assigned", sa.Boolean(), nullable=True)) + + op.execute(sa.sql.text("UPDATE jobs SET instance_assigned = FALSE")) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column("instance_assigned", nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("instance_assigned") + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("last_processed_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/09_10_1107_c83d45f9a971_replace_string_with_text.py b/src/dstack/_internal/server/migrations/versions/2024/09_10_1107_c83d45f9a971_replace_string_with_text.py new file mode 100644 index 0000000000..3c55dd1c3d --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/09_10_1107_c83d45f9a971_replace_string_with_text.py @@ -0,0 +1,150 @@ +"""Replace String with Text + +Revision ID: c83d45f9a971 +Revises: 7b24b1c8eba7 +Create Date: 2024-09-10 11:07:45.618938 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c83d45f9a971" +down_revision = "7b24b1c8eba7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "requirements", + existing_type=sa.VARCHAR(length=10000), + type_=sa.Text(), + existing_nullable=True, + ) + batch_op.alter_column( + "termination_policy", + existing_type=sa.VARCHAR(length=50), + type_=sa.String(length=100), + existing_nullable=True, + ) + batch_op.alter_column( + "backend_data", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=True, + ) + batch_op.alter_column( + "offer", existing_type=sa.VARCHAR(length=4000), type_=sa.Text(), existing_nullable=True + ) + batch_op.alter_column( + "job_provisioning_data", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=True, + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "job_spec_data", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=False, + ) + batch_op.alter_column( + "job_provisioning_data", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=True, + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "info", existing_type=sa.VARCHAR(length=2000), type_=sa.Text(), existing_nullable=False + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "run_spec", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=False, + ) + batch_op.alter_column( + "service_spec", + existing_type=sa.VARCHAR(length=4000), + type_=sa.Text(), + existing_nullable=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "service_spec", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=True, + ) + batch_op.alter_column( + "run_spec", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=False, + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "info", existing_type=sa.Text(), type_=sa.VARCHAR(length=2000), existing_nullable=False + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "job_provisioning_data", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=True, + ) + batch_op.alter_column( + "job_spec_data", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=False, + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "job_provisioning_data", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=True, + ) + batch_op.alter_column( + "offer", existing_type=sa.Text(), type_=sa.VARCHAR(length=4000), existing_nullable=True + ) + batch_op.alter_column( + "backend_data", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=4000), + existing_nullable=True, + ) + batch_op.alter_column( + "termination_policy", + existing_type=sa.String(length=100), + type_=sa.VARCHAR(length=50), + existing_nullable=True, + ) + batch_op.alter_column( + "requirements", + existing_type=sa.Text(), + type_=sa.VARCHAR(length=10000), + existing_nullable=True, + ) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py b/src/dstack/_internal/server/migrations/versions/2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py new file mode 100644 index 0000000000..90e39531ff --- /dev/null +++ 
b/src/dstack/_internal/server/migrations/versions/2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py @@ -0,0 +1,61 @@ +"""Add GatewayComputeModel.app_updated_at + +Revision ID: e3b7db07727f +Revises: c83d45f9a971 +Create Date: 2024-09-17 12:23:34.808431 + +""" + +import uuid +from datetime import timedelta + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models +from dstack._internal.utils.common import get_current_datetime + +# revision identifiers, used by Alembic. +revision = "e3b7db07727f" +down_revision = "c83d45f9a971" +branch_labels = None +depends_on = None + + +gateway_computes_table = sa.Table( + "gateway_computes", + sa.MetaData(), + # partial description - only columns affected by this migration + sa.Column("id", sqlalchemy_utils.UUIDType(binary=False), primary_key=True, default=uuid.uuid4), + sa.Column("app_updated_at", dstack._internal.server.models.NaiveDateTime()), +) + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateway_computes", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "app_updated_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + + # Should be in the past to trigger gateway update after migrations + default_app_updated_at = get_current_datetime() - timedelta(minutes=60) + op.get_bind().execute( + (gateway_computes_table.update().values(app_updated_at=default_app_updated_at)) + ) + + with op.batch_alter_table("gateway_computes", schema=None) as batch_op: + batch_op.alter_column("app_updated_at", nullable=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("gateway_computes", schema=None) as batch_op: + batch_op.drop_column("app_updated_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py b/src/dstack/_internal/server/migrations/versions/2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py new file mode 100644 index 0000000000..291b0b7e6e --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py @@ -0,0 +1,58 @@ +"""Add PlacementGroupModel + +Revision ID: a7b46c073fa1 +Revises: e3b7db07727f +Create Date: 2024-09-25 13:52:28.701586 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "a7b46c073fa1" +down_revision = "e3b7db07727f" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "placement_groups", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("fleet_deleted", sa.Boolean(), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column( + "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False + ), + sa.Column("deleted", sa.Boolean(), nullable=False), + sa.Column("deleted_at", dstack._internal.server.models.NaiveDateTime(), nullable=True), + sa.Column("configuration", sa.Text(), nullable=False), + sa.Column("provisioning_data", sa.Text(), nullable=True), + sa.ForeignKeyConstraint( + ["fleet_id"], ["fleets.id"], name=op.f("fk_placement_groups_fleet_id_fleets") + ), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_placement_groups_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_placement_groups")), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("placement_groups") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py b/src/dstack/_internal/server/migrations/versions/2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py new file mode 100644 index 0000000000..c273d8743a --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py @@ -0,0 +1,43 @@ +"""Add JobMetricsPoint + +Revision ID: c20626d03cfb +Revises: a7b46c073fa1 +Create Date: 2024-10-14 11:26:08.776260 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c20626d03cfb" +down_revision = "a7b46c073fa1" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "job_metrics_points", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("timestamp_micro", sa.BigInteger(), nullable=False), + sa.Column("cpu_usage_micro", sa.BigInteger(), nullable=False), + sa.Column("memory_usage_bytes", sa.BigInteger(), nullable=False), + sa.Column("memory_working_set_bytes", sa.BigInteger(), nullable=False), + sa.Column("gpus_memory_usage_bytes", sa.Text(), nullable=False), + sa.Column("gpus_util_percent", sa.Text(), nullable=False), + sa.ForeignKeyConstraint( + ["job_id"], ["jobs.id"], name=op.f("fk_job_metrics_points_job_id_jobs") + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_job_metrics_points")), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("job_metrics_points") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py b/src/dstack/_internal/server/migrations/versions/2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py new file mode 100644 index 0000000000..084dbe9e56 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py @@ -0,0 +1,102 @@ +"""Add created_at to UserModel and ProjectModel + +Revision ID: afbc600ff2b2 +Revises: c20626d03cfb +Create Date: 2024-10-16 14:31:49.040804 + +""" + +import uuid +from datetime import timedelta + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models +from dstack._internal.utils.common import get_current_datetime + +# revision identifiers, used by Alembic. +revision = "afbc600ff2b2" +down_revision = "c20626d03cfb" +branch_labels = None +depends_on = None + + +users_table = sa.Table( + "users", + sa.MetaData(), + # partial description - only columns affected by this migration + sa.Column("id", sqlalchemy_utils.UUIDType(binary=False), primary_key=True, default=uuid.uuid4), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=True), +) + + +projects_table = sa.Table( + "projects", + sa.MetaData(), + # partial description - only columns affected by this migration + sa.Column("id", sqlalchemy_utils.UUIDType(binary=False), primary_key=True, default=uuid.uuid4), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=True), +) + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.add_column( + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=True) + ) + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.add_column( + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=True) + ) + + # Set created_at on existing rows. + # The absolute value does not matter since it cannot be recovered. + # Just ensure that created_at order matches the insertion order. + # NOTE: relies on SELECT without ORDER BY returning rows in insertion order; SQL does not guarantee this, so the ordering is best-effort. + last_created_at = get_current_datetime() + + users_update_params = [] + users = op.get_bind().execute(sa.select(users_table)) + for i, row in enumerate(reversed(users.all())): + created_at = last_created_at - timedelta(seconds=i) + users_update_params.append({"_id": row.id, "created_at": created_at}) + update_stmt = ( + users_table.update() + .where(users_table.c.id == sa.bindparam("_id")) + .values(created_at=sa.bindparam("created_at")) + ) + if users_update_params: + op.get_bind().execute(update_stmt, users_update_params) + + projects_update_params = [] + projects = op.get_bind().execute(sa.select(projects_table)) + for i, row in enumerate(reversed(projects.all())): + created_at = last_created_at - timedelta(seconds=i) + projects_update_params.append({"_id": row.id, "created_at": created_at}) + update_stmt = ( + projects_table.update() + .where(projects_table.c.id == sa.bindparam("_id")) + .values(created_at=sa.bindparam("created_at")) + ) + if projects_update_params: + op.get_bind().execute(update_stmt, projects_update_params) + + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.alter_column("created_at", nullable=False) + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column("created_at", nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### 
commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_column("created_at") + + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.drop_column("created_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/11_04_1546_82b32a135ea2_.py b/src/dstack/_internal/server/migrations/versions/2024/11_04_1546_82b32a135ea2_.py new file mode 100644 index 0000000000..a3fcfdeca7 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/11_04_1546_82b32a135ea2_.py @@ -0,0 +1,58 @@ +"""Add user_id to volumes + +Revision ID: 82b32a135ea2 +Revises: afbc600ff2b2 +Create Date: 2024-11-04 15:46:37.719531 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "82b32a135ea2" +down_revision = "afbc600ff2b2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.add_column( + sa.Column("user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True) + ) + batch_op.create_foreign_key( + batch_op.f("fk_volumes_user_id_users"), + "users", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + # ### end Alembic commands ### + + # Backfill user_id on existing volumes with the owner_id of the project each volume belongs to + op.execute(""" + UPDATE volumes AS v + SET user_id = ( + SELECT owner_id FROM projects + JOIN volumes ON projects.id = volumes.project_id + WHERE volumes.id = v.id + ) + WHERE user_id IS NULL + """) + + # set volumes.user_id to non-nullable + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.alter_column("user_id", nullable=False) + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_volumes_user_id_users"), type_="foreignkey") + batch_op.drop_column("user_id") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py b/src/dstack/_internal/server/migrations/versions/2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py new file mode 100644 index 0000000000..884fe3ea79 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py @@ -0,0 +1,43 @@ +"""Add RepoCredsModel + +Revision ID: 91a12fff6c76 +Revises: 82b32a135ea2 +Create Date: 2024-11-14 10:31:07.112472 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "91a12fff6c76" +down_revision = "82b32a135ea2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "repo_creds", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("repo_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("creds", sa.String(length=10000), nullable=False), + sa.ForeignKeyConstraint( + ["repo_id"], ["repos.id"], name=op.f("fk_repo_creds_repo_id_repos"), ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["user_id"], ["users.id"], name=op.f("fk_repo_creds_user_id_users"), ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_repo_creds")), + sa.UniqueConstraint("repo_id", "user_id", name="uq_repo_creds_repo_id_user_id"), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("repo_creds") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py b/src/dstack/_internal/server/migrations/versions/2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py new file mode 100644 index 0000000000..33d41aa087 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py @@ -0,0 +1,81 @@ +"""Add VULTR to backendtype enum + +Revision ID: 065588ec72b8 +Revises: 91a12fff6c76 +Create Date: 2024-12-24 12:56:57.018776 + +""" + +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. +revision = "065588ec72b8" +down_revision = "91a12fff6c76" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + # Add the new enum value "VULTR" to the backendtype enum + op.sync_enum_values( + "public", + "backendtype", + [ + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", # New value + ], + [ + TableReference(table_schema="public", table_name="instances", column_name="backend"), + TableReference(table_schema="public", table_name="backends", column_name="type"), + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + # Remove the new enum value "VULTR" from the backendtype enum + op.sync_enum_values( + "public", + "backendtype", + [ + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + ], # Without "VULTR" + [ + TableReference(table_schema="public", table_name="instances", column_name="backend"), + TableReference(table_schema="public", table_name="backends", column_name="type"), + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py b/src/dstack/_internal/server/migrations/versions/2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py new file mode 100644 index 0000000000..23b4c25407 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py @@ -0,0 +1,32 @@ +"""Add JobModel.job_runtime_data + +Revision ID: 803c7e9ed85d +Revises: c48df7985d57 +Create Date: 2025-01-10 14:17:24.029983 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "803c7e9ed85d" +down_revision = "c48df7985d57" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("job_runtime_data", sa.Text(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("job_runtime_data") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py b/src/dstack/_internal/server/migrations/versions/2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py new file mode 100644 index 0000000000..b932ad6cfd --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py @@ -0,0 +1,38 @@ +"""Add instance termination retries + +Revision ID: c48df7985d57 +Revises: 065588ec72b8 +Create Date: 2025-01-14 13:33:17.722284 + +""" + +import sqlalchemy as sa +from alembic import op + +from dstack._internal.server.models import NaiveDateTime + +# revision identifiers, used by Alembic. +revision = "c48df7985d57" +down_revision = "065588ec72b8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column("first_termination_retry_at", NaiveDateTime(), nullable=True) + ) + batch_op.add_column(sa.Column("last_termination_retry_at", NaiveDateTime(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("last_termination_retry_at") + batch_op.drop_column("first_termination_retry_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py b/src/dstack/_internal/server/migrations/versions/2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py new file mode 100644 index 0000000000..bbf52c8326 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py @@ -0,0 +1,71 @@ +"""Reverse Job-Instance relationship + +Revision ID: 1338b788b612 +Revises: 51d45659d574 +Create Date: 2025-01-16 14:59:19.113534 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "1338b788b612" +down_revision = "51d45659d574" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_jobs_instance_id_instances"), + "instances", + ["instance_id"], + ["id"], + ondelete="CASCADE", + ) + + op.execute(""" + UPDATE jobs AS j + SET instance_id = ( + SELECT i.id + FROM instances AS i + WHERE i.job_id = j.id + ) + """) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_constraint("fk_instances_job_id_jobs", type_="foreignkey") + batch_op.drop_column("job_id") + + +def downgrade() -> None: + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True) + ) + batch_op.create_foreign_key("fk_instances_job_id_jobs", "jobs", ["job_id"], ["id"]) + + # This 
migration is not fully reversible - we cannot assign multiple jobs to a single instance, + # thus LIMIT 1 + op.execute(""" + UPDATE instances AS i + SET job_id = ( + SELECT j.id + FROM jobs j + WHERE j.instance_id = i.id + ORDER by j.submitted_at DESC + LIMIT 1 + ) + """) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_jobs_instance_id_instances"), type_="foreignkey") + batch_op.drop_column("instance_id") diff --git a/src/dstack/_internal/server/migrations/versions/2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py b/src/dstack/_internal/server/migrations/versions/2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py new file mode 100644 index 0000000000..15e0da5a54 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py @@ -0,0 +1,81 @@ +"""Add JobTerminationReason.MAX_DURATION_EXCEEDED + +Revision ID: ffa99edd1988 +Revises: 803c7e9ed85d +Create Date: 2025-01-21 10:53:22.338540 + +""" + +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. +revision = "ffa99edd1988" +down_revision = "803c7e9ed85d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.sync_enum_values( + "public", + "jobterminationreason", + [ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + [ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.sync_enum_values( + "public", + "jobterminationreason", + [ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + ], + [ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py b/src/dstack/_internal/server/migrations/versions/2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py new file mode 100644 index 0000000000..029f94bb92 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py @@ -0,0 +1,40 @@ +"""Add JobModel.volumes_detached_at + +Revision ID: da574e93fee0 +Revises: ffa99edd1988 +Create Date: 2025-01-29 11:52:26.941513 + +""" + +import 
sqlalchemy as sa +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "da574e93fee0" +down_revision = "ffa99edd1988" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "volumes_detached_at", + dstack._internal.server.models.NaiveDateTime(), + nullable=True, + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("volumes_detached_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py b/src/dstack/_internal/server/migrations/versions/2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py new file mode 100644 index 0000000000..3667e508b7 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py @@ -0,0 +1,43 @@ +"""Add InstanceModel blocks fields + +Revision ID: 51d45659d574 +Revises: da574e93fee0 +Create Date: 2025-02-04 11:10:41.626273 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "51d45659d574" +down_revision = "da574e93fee0" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column(sa.Column("total_blocks", sa.Integer(), nullable=True)) + batch_op.add_column(sa.Column("busy_blocks", sa.Integer(), nullable=True)) + + op.execute(""" + UPDATE instances + SET total_blocks = 1 + """) + op.execute(""" + UPDATE instances + SET busy_blocks = CASE + WHEN job_id IS NOT NULL THEN 1 + ELSE 0 + END + """) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column("busy_blocks", existing_type=sa.INTEGER(), nullable=False) + + +def downgrade() -> None: + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("busy_blocks") + batch_op.drop_column("total_blocks") diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py b/src/dstack/_internal/server/migrations/versions/2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py new file mode 100644 index 0000000000..659ac9e097 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py @@ -0,0 +1,83 @@ +"""Add JobTerminationReason.INACTIVITY_DURATION_EXCEEDED + +Revision ID: 63c3f19cb184 +Revises: 1338b788b612 +Create Date: 2025-02-11 22:30:47.289393 + +""" + +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. +revision = "63c3f19cb184" +down_revision = "1338b788b612" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py b/src/dstack/_internal/server/migrations/versions/2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py new file mode 100644 index 0000000000..17c99e9546 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py @@ -0,0 +1,32 @@ 
+"""Add JobModel.inactivity_secs + +Revision ID: 1e76fb0dde87 +Revises: 63c3f19cb184 +Create Date: 2025-02-11 23:37:58.823710 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "1e76fb0dde87" +down_revision = "63c3f19cb184" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("inactivity_secs", sa.Integer(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("inactivity_secs") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py b/src/dstack/_internal/server/migrations/versions/2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py new file mode 100644 index 0000000000..591f74ef53 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py @@ -0,0 +1,34 @@ +"""Move attachment_data to volumes_attachments + +Revision ID: a751ef183f27 +Revises: 1e76fb0dde87 +Create Date: 2025-02-12 13:19:57.569591 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a751ef183f27" +down_revision = "1e76fb0dde87" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("volumes_attachments", schema=None) as batch_op: + batch_op.alter_column("instace_id", new_column_name="instance_id") + batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes_attachments", schema=None) as batch_op: + batch_op.drop_column("attachment_data") + batch_op.alter_column("instance_id", new_column_name="instace_id") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py b/src/dstack/_internal/server/migrations/versions/2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py new file mode 100644 index 0000000000..232098099c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py @@ -0,0 +1,40 @@ +"""Add JobPrometheusMetrics + +Revision ID: 60e444118b6d +Revises: a751ef183f27 +Create Date: 2025-02-21 10:59:26.339353 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "60e444118b6d" +down_revision = "a751ef183f27" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "job_prometheus_metrics", + sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("text", sa.Text(), nullable=False), + sa.ForeignKeyConstraint( + ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs") + ), + sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("job_prometheus_metrics") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py b/src/dstack/_internal/server/migrations/versions/2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py new file mode 100644 index 0000000000..76543d8fa3 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py @@ -0,0 +1,140 @@ +"""Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY + +Revision ID: 98d1b92988bc +Revises: 60e444118b6d +Create Date: 2025-02-28 15:12:37.649876 + +""" + +import sqlalchemy as sa +from alembic import op +from alembic_postgresql_enum import TableReference + +# revision identifiers, used by Alembic. 
+revision = "98d1b92988bc" +down_revision = "60e444118b6d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # SQLite + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.VARCHAR(length=34), + type_=sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + existing_nullable=True, + ) + # PostgreSQL + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + + +def downgrade() -> None: + # SQLite + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + 
"VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + type_=sa.VARCHAR(length=34), + existing_nullable=True, + ) + # PostgreSQL + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) diff --git a/src/dstack/_internal/server/migrations/versions/2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py b/src/dstack/_internal/server/migrations/versions/2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py new file mode 100644 index 0000000000..4690d4a323 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py @@ -0,0 +1,171 @@ +"""Store BackendType as string + +Revision ID: bc8ca4a505c6 +Revises: 98d1b92988bc +Create Date: 2025-03-10 14:49:06.837118 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "bc8ca4a505c6" +down_revision = "98d1b92988bc" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=postgresql.ENUM( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "backend", + existing_type=postgresql.ENUM( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ), + type_=sa.String(length=100), + existing_nullable=True, + ) + + sa.Enum( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ).drop(op.get_bind()) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + sa.Enum( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ).create(op.get_bind()) + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "backend", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ), + existing_nullable=True, + postgresql_using="backend::VARCHAR::backendtype", + ) + + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "AWS", + "AZURE", + "CUDO", + "DATACRUNCH", + "DSTACK", + "GCP", + "KUBERNETES", + "LAMBDA", + "LOCAL", + "REMOTE", + "NEBIUS", + "OCI", + "RUNPOD", + "TENSORDOCK", + "VASTAI", + "VULTR", + name="backendtype", + ), + existing_nullable=False, + postgresql_using="type::VARCHAR::backendtype", + ) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py b/src/dstack/_internal/server/migrations/versions/2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py new file mode 100644 index 0000000000..9f935b5a9c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py @@ -0,0 +1,36 @@ +"""Make InstanceModel.pool_id optional + +Revision ID: 7bc2586e8b9e +Revises: bc8ca4a505c6 +Create Date: 2025-03-13 11:13:39.748303 + +""" + +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "7bc2586e8b9e" +down_revision = "bc8ca4a505c6" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=True + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=False + ) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py b/src/dstack/_internal/server/migrations/versions/2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py new file mode 100644 index 0000000000..ddb7a30ed8 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py @@ -0,0 +1,35 @@ +"""Add RunModel.resubmission_attempt + +Revision ID: 7ba3b59d7ca6 +Revises: 7bc2586e8b9e +Create Date: 2025-04-15 18:00:35.320906 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "7ba3b59d7ca6" +down_revision = "7bc2586e8b9e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column(sa.Column("resubmission_attempt", sa.Integer(), nullable=True)) + batch_op.execute("UPDATE runs SET resubmission_attempt = 0") + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column("resubmission_attempt", nullable=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("resubmission_attempt") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py b/src/dstack/_internal/server/migrations/versions/2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py new file mode 100644 index 0000000000..68ddd93693 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py @@ -0,0 +1,26 @@ +"""Add JobModel.exit_status + +Revision ID: 6c1a9d6530ee +Revises: 7ba3b59d7ca6 +Create Date: 2025-05-09 10:25:19.715852 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "6c1a9d6530ee" +down_revision = "7ba3b59d7ca6" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True)) + + +def downgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("exit_status") diff --git a/src/dstack/_internal/server/migrations/versions/2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py b/src/dstack/_internal/server/migrations/versions/2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py new file mode 100644 index 0000000000..cc1c33e254 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py @@ -0,0 +1,100 @@ +"""Add JobModel.disconnected_at + +Revision ID: 20166748b60c +Revises: 6c1a9d6530ee +Create Date: 2025-05-13 16:24:32.496578 + +""" + +import sqlalchemy as sa +from alembic import op +from alembic_postgresql_enum import TableReference + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "20166748b60c" +down_revision = "6c1a9d6530ee" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.sync_enum_values( + enum_schema="public", + enum_name="jobterminationreason", + new_values=[ + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + ], + affected_columns=[ + TableReference( + table_schema="public", table_name="jobs", column_name="termination_reason" + ) + ], + enum_values_to_rename=[], + ) + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("disconnected_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py b/src/dstack/_internal/server/migrations/versions/2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py new file mode 100644 index 0000000000..4c51278534 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py @@ -0,0 +1,34 @@ +"""Add RunModel.priority + +Revision ID: bca2fdf130bf +Revises: 20166748b60c +Create Date: 2025-05-14 15:24:21.269775 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "bca2fdf130bf" +down_revision = "20166748b60c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column(sa.Column("priority", sa.Integer(), nullable=True)) + batch_op.execute("UPDATE runs SET priority = 0") + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column("priority", nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("priority") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py b/src/dstack/_internal/server/migrations/versions/2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py new file mode 100644 index 0000000000..e26206222f --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py @@ -0,0 +1,42 @@ +"""Add rolling deployment fields + +Revision ID: 35e90e1b0d3e +Revises: 35f732ee4cf5 +Create Date: 2025-05-29 15:30:27.878569 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "35e90e1b0d3e" +down_revision = "35f732ee4cf5" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True)) + batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True)) + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.execute("UPDATE runs SET deployment_num = 0") + batch_op.execute("UPDATE runs SET desired_replica_count = 1") + batch_op.alter_column("deployment_num", nullable=False) + batch_op.alter_column("desired_replica_count", nullable=False) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True)) + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.execute("UPDATE jobs SET deployment_num = 0") + batch_op.alter_column("deployment_num", nullable=False) + + +def downgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("deployment_num") + batch_op.drop_column("desired_replica_count") + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("deployment_num") diff --git a/src/dstack/_internal/server/migrations/versions/2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py b/src/dstack/_internal/server/migrations/versions/2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py new file mode 100644 index 0000000000..ed736384e4 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py @@ -0,0 +1,39 @@ +"""Add ProjectModel.is_public + +Revision ID: 35f732ee4cf5 +Revises: bca2fdf130bf +Create Date: 2025-06-06 13:04:02.912032 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "35f732ee4cf5" +down_revision = "bca2fdf130bf" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + # Add is_public column as nullable first + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.add_column(sa.Column("is_public", sa.Boolean(), nullable=True)) + + # Set is_public to False for existing projects + op.execute(sa.sql.text("UPDATE projects SET is_public = FALSE")) + + # Make is_public non-nullable with default value + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.alter_column("is_public", nullable=False, server_default=sa.false()) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + # Remove is_public column + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.drop_column("is_public") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py b/src/dstack/_internal/server/migrations/versions/2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py new file mode 100644 index 0000000000..a73d9db250 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py @@ -0,0 +1,39 @@ +"""Add FileArchiveModel + +Revision ID: 5f1707c525d2 +Revises: 35e90e1b0d3e +Create Date: 2025-06-12 12:28:26.678380 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "5f1707c525d2" +down_revision = "35e90e1b0d3e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "file_archives", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("blob_hash", sa.Text(), nullable=False), + sa.Column("blob", sa.LargeBinary(), nullable=True), + sa.ForeignKeyConstraint( + ["user_id"], + ["users.id"], + name=op.f("fk_file_archives_user_id_users"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_file_archives")), + sa.UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"), + ) + + +def downgrade() -> None: + op.drop_table("file_archives") diff --git a/src/dstack/_internal/server/migrations/versions/2025/06_30_1100_644b8a114187_add_secretmodel.py b/src/dstack/_internal/server/migrations/versions/2025/06_30_1100_644b8a114187_add_secretmodel.py new file mode 100644 index 0000000000..6563e92dc9 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/06_30_1100_644b8a114187_add_secretmodel.py @@ -0,0 +1,49 @@ +"""Add SecretModel + +Revision ID: 644b8a114187 +Revises: 5f1707c525d2 +Create Date: 2025-06-30 11:00:04.326290 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "644b8a114187" +down_revision = "5f1707c525d2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "secrets", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("updated_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("name", sa.String(length=200), nullable=False), + sa.Column("value", dstack._internal.server.models.EncryptedString(), nullable=False), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_secrets_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_secrets")), + sa.UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("secrets") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py b/src/dstack/_internal/server/migrations/versions/2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py new file mode 100644 index 0000000000..1dc883e05e --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py @@ -0,0 +1,40 @@ +"""Add VolumeModel.last_job_processed_at + +Revision ID: d5863798bf41 +Revises: 644b8a114187 +Create Date: 2025-07-15 14:26:22.981687 + +""" + +import sqlalchemy as sa +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "d5863798bf41" +down_revision = "644b8a114187" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "last_job_processed_at", + dstack._internal.server.models.NaiveDateTime(), + nullable=True, + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.drop_column("last_job_processed_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py b/src/dstack/_internal/server/migrations/versions/2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py new file mode 100644 index 0000000000..c3ea4e09c4 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py @@ -0,0 +1,38 @@ +"""Add RunModel.next_triggered_at + +Revision ID: ec02a26a256c +Revises: d5863798bf41 +Create Date: 2025-07-17 15:47:00.443217 + +""" + +import sqlalchemy as sa +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "ec02a26a256c" +down_revision = "d5863798bf41" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "next_triggered_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("next_triggered_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/07_25_1036_50dd7ea98639_index_status_columns.py b/src/dstack/_internal/server/migrations/versions/2025/07_25_1036_50dd7ea98639_index_status_columns.py new file mode 100644 index 0000000000..52be9c7795 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/07_25_1036_50dd7ea98639_index_status_columns.py @@ -0,0 +1,55 @@ +"""Index status columns + +Revision ID: 50dd7ea98639 +Revises: ec02a26a256c +Create Date: 2025-07-25 10:36:25.127923 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "50dd7ea98639" +down_revision = "ec02a26a256c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_runs_status"), ["status"], unique=False) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_jobs_status"), ["status"], unique=False) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_fleets_status"), ["status"], unique=False) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_instances_status"), ["status"], unique=False) + + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_volumes_status"), ["status"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_runs_status")) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_jobs_status")) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_fleets_status")) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_instances_status")) + + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_volumes_status")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_01_1456_728b1488b1b4_add_instance_health.py b/src/dstack/_internal/server/migrations/versions/2025/08_01_1456_728b1488b1b4_add_instance_health.py new file mode 100644 index 0000000000..79065fccb1 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_01_1456_728b1488b1b4_add_instance_health.py @@ -0,0 +1,50 @@ +"""Add instance health + +Revision ID: 728b1488b1b4 +Revises: 25479f540245 +Create Date: 2025-08-01 14:56:20.466990 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. 
+revision = "728b1488b1b4" +down_revision = "25479f540245" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "instance_health_checks", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("status", sa.VARCHAR(length=100), nullable=False), + sa.Column("response", sa.Text(), nullable=False), + sa.ForeignKeyConstraint( + ["instance_id"], + ["instances.id"], + name=op.f("fk_instance_health_checks_instance_id_instances"), + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_instance_health_checks")), + ) + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column(sa.Column("health", sa.VARCHAR(length=100), nullable=True)) + op.execute("UPDATE instances SET health = 'HEALTHY'") + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column("health", existing_type=sa.VARCHAR(length=100), nullable=False) + + +def downgrade() -> None: + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("health") + + op.drop_table("instance_health_checks") diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_03_1951_25479f540245_add_probes.py b/src/dstack/_internal/server/migrations/versions/2025/08_03_1951_25479f540245_add_probes.py new file mode 100644 index 0000000000..d6251e64bb --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_03_1951_25479f540245_add_probes.py @@ -0,0 +1,43 @@ +"""Add probes + +Revision ID: 25479f540245 +Revises: 50dd7ea98639 +Create Date: 2025-08-03 19:51:07.722217 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. 
+revision = "25479f540245" +down_revision = "50dd7ea98639" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "probes", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("probe_num", sa.Integer(), nullable=False), + sa.Column("due", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("success_streak", sa.BigInteger(), nullable=False), + sa.Column("active", sa.Boolean(), nullable=False), + sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], name=op.f("fk_probes_job_id_jobs")), + sa.PrimaryKeyConstraint("id", "job_id", name=op.f("pk_probes")), + sa.UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("probes") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py b/src/dstack/_internal/server/migrations/versions/2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py new file mode 100644 index 0000000000..f98934cf3a --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py @@ -0,0 +1,484 @@ +"""Store enums as strings + +Revision ID: 74a1f55209bd +Revises: 728b1488b1b4 +Create Date: 2025-08-06 13:49:28.785378 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "74a1f55209bd" +down_revision = "728b1488b1b4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column( + "global_role", + existing_type=postgresql.ENUM("ADMIN", "USER", name="globalrole"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + batch_op.alter_column( + "termination_reason", + existing_type=postgresql.ENUM( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ), + type_=sa.String(length=100), + existing_nullable=True, + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + batch_op.alter_column( + "termination_reason", + existing_type=postgresql.ENUM( + 
"FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + type_=sa.String(length=100), + existing_nullable=True, + ) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + sa.Enum("ADMIN", "USER", name="globalrole").drop(op.get_bind()) + sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + 
name="runterminationreason", + ).drop(op.get_bind()) + sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").drop( + op.get_bind() + ) + sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").drop( + op.get_bind() + ) + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).drop(op.get_bind()) + sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").drop(op.get_bind()) + sa.Enum( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ).drop(op.get_bind()) + sa.Enum( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ).drop(op.get_bind()) + sa.Enum("SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus").drop( + op.get_bind() + ) + sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").drop(op.get_bind()) + sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ).drop(op.get_bind()) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ).create(op.get_bind()) + sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").create(op.get_bind()) + sa.Enum( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ).create(op.get_bind()) + sa.Enum( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ).create(op.get_bind()) + sa.Enum( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ).create(op.get_bind()) + sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").create(op.get_bind()) + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).create(op.get_bind()) + sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").create( + op.get_bind() + ) + sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").create( + op.get_bind() + ) + sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ).create(op.get_bind()) + sa.Enum("ADMIN", "USER", name="globalrole").create(op.get_bind()) + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + 
type_=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus" + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::volumestatus", + ) + + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column( + "global_role", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("ADMIN", "USER", name="globalrole"), + existing_nullable=False, + postgresql_using="global_role::VARCHAR::globalrole", + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ), + existing_nullable=True, + postgresql_using="termination_reason::VARCHAR::runterminationreason", + ) + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::runstatus", + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"), + existing_nullable=False, + postgresql_using="type::VARCHAR::repotype", + ) + + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"), + existing_nullable=False, + postgresql_using="project_role::VARCHAR::projectrole", + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.String(length=100), + 
type_=postgresql.ENUM( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + existing_nullable=True, + postgresql_using="termination_reason::VARCHAR::jobterminationreason", + ) + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::jobstatus", + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::instancestatus", + ) + + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus" + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::gatewaystatus", + ) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ), + 
existing_nullable=False, + postgresql_using="status::VARCHAR::fleetstatus", + ) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py b/src/dstack/_internal/server/migrations/versions/2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py new file mode 100644 index 0000000000..3bafa94ff5 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py @@ -0,0 +1,28 @@ +"""Add JobModel.registered + +Revision ID: 3d7f6c2ec000 +Revises: 74a1f55209bd +Create Date: 2025-08-11 13:23:39.530103 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "3d7f6c2ec000" +down_revision = "74a1f55209bd" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column("registered", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + + +def downgrade() -> None: + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("registered") diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py b/src/dstack/_internal/server/migrations/versions/2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py new file mode 100644 index 0000000000..6136b7843c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py @@ -0,0 +1,41 @@ +"""Add JobModel.fleet + +Revision ID: e2d08cd1b8d9 +Revises: 3d7f6c2ec000 +Create Date: 2025-08-15 11:26:05.670591 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "e2d08cd1b8d9" +down_revision = "3d7f6c2ec000" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_jobs_fleet_id_fleets"), "fleets", ["fleet_id"], ["id"] + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("fk_jobs_fleet_id_fleets"), type_="foreignkey") + batch_op.drop_column("fleet_id") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py b/src/dstack/_internal/server/migrations/versions/2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py new file mode 100644 index 0000000000..534dacaba7 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py @@ -0,0 +1,44 @@ +"""Add FleetModel.consolidation_attempt and FleetModel.last_consolidated_at + +Revision ID: 2498ab323443 +Revises: e2d08cd1b8d9 +Create Date: 2025-08-29 16:08:48.686595 + +""" + +import sqlalchemy as sa +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "2498ab323443" +down_revision = "e2d08cd1b8d9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.add_column( + sa.Column("consolidation_attempt", sa.Integer(), server_default="0", nullable=False) + ) + batch_op.add_column( + sa.Column( + "last_consolidated_at", + dstack._internal.server.models.NaiveDateTime(), + nullable=True, + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_column("last_consolidated_at") + batch_op.drop_column("consolidation_attempt") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/10_09_2031_ff1d94f65b08_user_ssh_key.py b/src/dstack/_internal/server/migrations/versions/2025/10_09_2031_ff1d94f65b08_user_ssh_key.py new file mode 100644 index 0000000000..fc79b58b08 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/10_09_2031_ff1d94f65b08_user_ssh_key.py @@ -0,0 +1,34 @@ +"""user.ssh_key + +Revision ID: ff1d94f65b08 +Revises: 2498ab323443 +Create Date: 2025-10-09 20:31:31.166786 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "ff1d94f65b08" +down_revision = "2498ab323443" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.add_column(sa.Column("ssh_private_key", sa.Text(), nullable=True)) + batch_op.add_column(sa.Column("ssh_public_key", sa.Text(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_column("ssh_public_key") + batch_op.drop_column("ssh_private_key") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py b/src/dstack/_internal/server/migrations/versions/2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py new file mode 100644 index 0000000000..e10a6113be --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py @@ -0,0 +1,91 @@ +"""Add ComputeGroupModel + +Revision ID: 7d1ec2b920ac +Revises: ff1d94f65b08 +Create Date: 2025-10-21 16:01:23.739395 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "7d1ec2b920ac" +down_revision = "ff1d94f65b08" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("waiting_master_job", sa.Boolean(), nullable=True)) + op.create_table( + "compute_groups", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("status", sa.String(length=100), nullable=False), + sa.Column( + "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False + ), + sa.Column("deleted", sa.Boolean(), nullable=False), + sa.Column("deleted_at", dstack._internal.server.models.NaiveDateTime(), nullable=True), + sa.Column("provisioning_data", sa.Text(), nullable=False), + sa.Column( + "first_termination_retry_at", + dstack._internal.server.models.NaiveDateTime(), + nullable=True, + ), + sa.Column( + "last_termination_retry_at", + dstack._internal.server.models.NaiveDateTime(), + nullable=True, + ), + sa.ForeignKeyConstraint( + ["fleet_id"], ["fleets.id"], name=op.f("fk_compute_groups_fleet_id_fleets") + ), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_compute_groups_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_compute_groups")), + ) + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "compute_group_id", + sqlalchemy_utils.types.uuid.UUIDType(binary=False), + nullable=True, + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_instances_compute_group_id_compute_groups"), + "compute_groups", + ["compute_group_id"], + ["id"], + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("waiting_master_job") + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_constraint( + batch_op.f("fk_instances_compute_group_id_compute_groups"), type_="foreignkey" + ) + batch_op.drop_column("compute_group_id") + + op.drop_table("compute_groups") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py b/src/dstack/_internal/server/migrations/versions/2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py new file mode 100644 index 0000000000..27c6543d8c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py @@ -0,0 +1,45 @@ +"""Add UserModel.deleted and original_name + +Revision ID: 06e977bc61c7 +Revises: 7d1ec2b920ac +Create Date: 2025-11-26 11:43:34.825686 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "06e977bc61c7" +down_revision = "7d1ec2b920ac" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.add_column( + sa.Column("deleted", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + batch_op.add_column(sa.Column("original_name", sa.String(length=50), nullable=True)) + + # For postgres, this was moved to a new migration to avoid deadlocks. + if op.get_context().dialect.name == "sqlite": + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.add_column(sa.Column("original_name", sa.String(length=50), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_column("original_name") + batch_op.drop_column("deleted") + + # For postgres, this was moved to a new migration to avoid deadlocks. + if op.get_context().dialect.name == "sqlite": + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.drop_column("original_name") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/11_27_1511_006512f572b4_add_projects_original_name.py b/src/dstack/_internal/server/migrations/versions/2025/11_27_1511_006512f572b4_add_projects_original_name.py new file mode 100644 index 0000000000..04e1545b63 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/11_27_1511_006512f572b4_add_projects_original_name.py @@ -0,0 +1,38 @@ +"""Add projects.original_name + +Revision ID: 006512f572b4 +Revises: 06e977bc61c7 +Create Date: 2025-11-27 15:11:21.249079 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "006512f572b4" +down_revision = "06e977bc61c7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + # For postgres, this was moved from a previous migration to avoid deadlocks. + # Keep SQLite in previous migration since it doesn't have deadlock issues and + # does not support if_not_exists. + if op.get_context().dialect.name != "sqlite": + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.add_column( + sa.Column("original_name", sa.String(length=50), nullable=True), + if_not_exists=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + if op.get_context().dialect.name != "sqlite": + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.drop_column("original_name") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py new file mode 100644 index 0000000000..b3d485a075 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py @@ -0,0 +1,31 @@ +"""Add ix_jobs_run_id + +Revision ID: d4d9dc26cf58 +Revises: 006512f572b4 +Create Date: 2025-12-04 20:48:10.543248 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d4d9dc26cf58" +down_revision = "006512f572b4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_jobs_run_id"), ["run_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_jobs_run_id")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py new file mode 100644 index 0000000000..4e9467a7cf --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py @@ -0,0 +1,31 @@ +"""Add ix_instances_fleet_id + +Revision ID: 5fd659afca82 +Revises: d4d9dc26cf58 +Create Date: 2025-12-04 20:52:07.015334 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "5fd659afca82" +down_revision = "d4d9dc26cf58" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_instances_fleet_id"), ["fleet_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_instances_fleet_id")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py new file mode 100644 index 0000000000..87a48deba0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py @@ -0,0 +1,99 @@ +"""Add events and event_targets + +Revision ID: 22d74df9897e +Revises: 5fd659afca82 +Create Date: 2025-12-04 20:56:08.003504 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "22d74df9897e" +down_revision = "5fd659afca82" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "events", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("message", sa.Text(), nullable=False), + sa.Column("recorded_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column( + "actor_user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ), + sa.ForeignKeyConstraint( + ["actor_user_id"], + ["users.id"], + name=op.f("fk_events_actor_user_id_users"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_events")), + ) + with op.batch_alter_table("events", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_events_actor_user_id"), ["actor_user_id"], unique=False + ) + batch_op.create_index(batch_op.f("ix_events_recorded_at"), ["recorded_at"], unique=False) + + op.create_table( + "event_targets", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("event_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "entity_project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ), + sa.Column("entity_type", sa.String(length=100), nullable=False), + sa.Column("entity_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("entity_name", sa.String(length=200), nullable=False), + sa.ForeignKeyConstraint( + ["entity_project_id"], + ["projects.id"], + name=op.f("fk_event_targets_entity_project_id_projects"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["event_id"], + ["events.id"], + name=op.f("fk_event_targets_event_id_events"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_event_targets")), + ) + with op.batch_alter_table("event_targets", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_event_targets_entity_id"), ["entity_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_event_targets_entity_project_id"), 
["entity_project_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_event_targets_entity_type"), ["entity_type"], unique=False + ) + batch_op.create_index(batch_op.f("ix_event_targets_event_id"), ["event_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("event_targets", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_event_targets_event_id")) + batch_op.drop_index(batch_op.f("ix_event_targets_entity_type")) + batch_op.drop_index(batch_op.f("ix_event_targets_entity_project_id")) + batch_op.drop_index(batch_op.f("ix_event_targets_entity_id")) + + op.drop_table("event_targets") + with op.batch_alter_table("events", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_events_recorded_at")) + batch_op.drop_index(batch_op.f("ix_events_actor_user_id")) + + op.drop_table("events") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py b/src/dstack/_internal/server/migrations/versions/2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py new file mode 100644 index 0000000000..e993df7bec --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py @@ -0,0 +1,26 @@ +"""add runmodel desired_replica_counts + +Revision ID: 706e0acc3a7d +Revises: 903c91e24634 +Create Date: 2025-12-18 10:54:13.508297 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "706e0acc3a7d" +down_revision = "903c91e24634" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column(sa.Column("desired_replica_counts", sa.Text(), nullable=True)) + + +def downgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("desired_replica_counts") diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_21_2208_1aa9638ad963_added_email_index.py b/src/dstack/_internal/server/migrations/versions/2025/12_21_2208_1aa9638ad963_added_email_index.py new file mode 100644 index 0000000000..3b5a9d8b5c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_21_2208_1aa9638ad963_added_email_index.py @@ -0,0 +1,31 @@ +"""Added email index + +Revision ID: 1aa9638ad963 +Revises: 22d74df9897e +Create Date: 2025-12-21 22:08:27.331645 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "1aa9638ad963" +down_revision = "22d74df9897e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_users_email"), ["email"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_users_email")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py b/src/dstack/_internal/server/migrations/versions/2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py new file mode 100644 index 0000000000..ff025fa2ba --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py @@ -0,0 +1,34 @@ +"""Add instances.termination_reason_message + +Revision ID: 903c91e24634 +Revises: 1aa9638ad963 +Create Date: 2025-12-22 12:17:58.573457 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "903c91e24634" +down_revision = "1aa9638ad963" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column("termination_reason_message", sa.String(length=4000), nullable=True) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("termination_reason_message") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py new file mode 100644 index 0000000000..e341b3b4a4 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add ComputeGroupModel pipeline columns + +Revision ID: 57cff3ec86ce +Revises: 706e0acc3a7d +Create Date: 2026-02-18 11:07:48.686185 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "57cff3ec86ce" +down_revision = "706e0acc3a7d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("compute_groups", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("compute_groups", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py new file mode 100644 index 0000000000..56297fde36 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add PlacementGroupModel pipeline columns + +Revision ID: 9c2a227b0154 +Revises: 57cff3ec86ce +Create Date: 2026-02-18 11:08:57.860277 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "9c2a227b0154" +down_revision = "57cff3ec86ce" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("placement_groups", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("placement_groups", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py new file mode 100644 index 0000000000..44cc2846ec --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py @@ -0,0 +1,74 @@ +"""Add pipeline indexes for compute and placement groups + +Revision ID: a8ed24fd7f90 +Revises: 9c2a227b0154 +Create Date: 2026-02-18 11:22:25.972000 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a8ed24fd7f90" +down_revision = "9c2a227b0154" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + if op.get_context().dialect.name == "postgresql": + # Concurrent index ops can fail midway, leaving invalid indexes behind. + # Use DROP INDEX IF EXISTS so the migration can be retried safely. 
+ op.drop_index( + "ix_compute_groups_pipeline_fetch_q", + table_name="compute_groups", + if_exists=True, + postgresql_concurrently=True, + ) + op.drop_index( + "ix_placement_groups_pipeline_fetch_q", + table_name="placement_groups", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_compute_groups_pipeline_fetch_q", + "compute_groups", + [sa.literal_column("last_processed_at ASC")], + unique=False, + postgresql_where=sa.text("(status NOT IN ('TERMINATED'))"), + sqlite_where=sa.text("(status NOT IN ('TERMINATED'))"), + postgresql_concurrently=True, + ) + op.create_index( + "ix_placement_groups_pipeline_fetch_q", + "placement_groups", + [sa.literal_column("last_processed_at ASC")], + unique=False, + postgresql_where=sa.text("deleted IS FALSE"), + sqlite_where=sa.text("deleted = 0"), + postgresql_concurrently=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_placement_groups_pipeline_fetch_q", + "placement_groups", + if_exists=True, + postgresql_concurrently=True, + ) + op.drop_index( + "ix_compute_groups_pipeline_fetch_q", + "compute_groups", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py b/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py new file mode 100644 index 0000000000..fa3c8ce30c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py @@ -0,0 +1,51 @@ +"""Add GatewayModel pipeline and to_be_deleted columns + +Revision ID: 140331002ece +Revises: a8ed24fd7f90 +Create Date: 2026-02-23 05:48:55.948838+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils 
+from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "140331002ece" +down_revision = "a8ed24fd7f90" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.add_column( + sa.Column("to_be_deleted", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + batch_op.drop_column("to_be_deleted") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py new file mode 100644 index 0000000000..4034f227e0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py @@ -0,0 +1,53 @@ +"""Add VolumeModel pipeline columns + +Revision ID: ccfac6ac7924 +Revises: 140331002ece +Create Date: 2026-02-23 11:34:24.731339+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. 
+revision = "ccfac6ac7924" +down_revision = "140331002ece" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.add_column( + sa.Column("to_be_deleted", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + batch_op.add_column(sa.Column("auto_cleanup_enabled", sa.Boolean(), nullable=True)) + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + batch_op.drop_column("auto_cleanup_enabled") + batch_op.drop_column("to_be_deleted") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..1d729dbbdc --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py @@ -0,0 +1,50 @@ +"""Add ix_volumes_pipeline_fetch_q index + +Revision ID: 9a363c3cbe04 +Revises: ccfac6ac7924 +Create Date: 2026-02-24 09:45:54.068288+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "9a363c3cbe04" +down_revision = "ccfac6ac7924" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_volumes_pipeline_fetch_q", + table_name="volumes", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_volumes_pipeline_fetch_q", + "volumes", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_volumes_pipeline_fetch_q", + table_name="volumes", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py new file mode 100644 index 0000000000..fad3da7909 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add FleetModel pipeline columns + +Revision ID: d21d3e61de27 +Revises: 9a363c3cbe04 +Create Date: 2026-02-27 12:18:01.768776+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "d21d3e61de27" +down_revision = "9a363c3cbe04" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..365aac41cf --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py @@ -0,0 +1,49 @@ +"""Add ix_fleets_pipeline_fetch_q index + +Revision ID: 46150101edec +Revises: d21d3e61de27 +Create Date: 2026-03-02 05:30:07.196407+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "46150101edec" +down_revision = "d21d3e61de27" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_pipeline_fetch_q", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_fleets_pipeline_fetch_q", + "fleets", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_pipeline_fetch_q", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py b/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py new file mode 100644 index 0000000000..05a022f7ff --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py @@ -0,0 +1,118 @@ +"""Add exports + +Revision ID: 5e8c7a9202bc +Revises: 46150101edec +Create Date: 2026-03-04 22:21:54.971260+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "5e8c7a9202bc" +down_revision = "46150101edec" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "exports", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_exports_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_exports")), + sa.UniqueConstraint("project_id", "name", name="uq_exports_project_id_name"), + ) + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_exports_project_id"), ["project_id"], unique=False) + + op.create_table( + "exported_fleets", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("export_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.ForeignKeyConstraint( + ["export_id"], + ["exports.id"], + name=op.f("fk_exported_fleets_export_id_exports"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["fleet_id"], + ["fleets.id"], + name=op.f("fk_exported_fleets_fleet_id_fleets"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_exported_fleets")), + sa.UniqueConstraint("export_id", "fleet_id", name="uq_exported_fleets_export_id_fleet_id"), + ) + with op.batch_alter_table("exported_fleets", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_exported_fleets_export_id"), ["export_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_exported_fleets_fleet_id"), ["fleet_id"], unique=False + ) + + op.create_table( + "imports", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "project_id", 
sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("export_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["export_id"], + ["exports.id"], + name=op.f("fk_imports_export_id_exports"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_imports_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_imports")), + sa.UniqueConstraint("project_id", "export_id", name="uq_imports_project_id_export_id"), + ) + with op.batch_alter_table("imports", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_imports_export_id"), ["export_id"], unique=False) + batch_op.create_index(batch_op.f("ix_imports_project_id"), ["project_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("imports", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_imports_project_id")) + batch_op.drop_index(batch_op.f("ix_imports_export_id")) + + op.drop_table("imports") + with op.batch_alter_table("exported_fleets", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_exported_fleets_fleet_id")) + batch_op.drop_index(batch_op.f("ix_exported_fleets_export_id")) + + op.drop_table("exported_fleets") + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_exports_project_id")) + + op.drop_table("exports") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py new file mode 100644 index 0000000000..f1c2b1217a --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add InstanceModel pipeline columns + +Revision ID: 8e8647f20aa4 +Revises: 5e8c7a9202bc +Create Date: 2026-03-05 05:47:39.307013+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "8e8647f20aa4" +down_revision = "5e8c7a9202bc" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..e629de0950 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py @@ -0,0 +1,49 @@ +"""Add ix_instances_pipeline_fetch_q index + +Revision ID: 297c68450cc8 +Revises: 8e8647f20aa4 +Create Date: 2026-03-05 07:51:02.855596+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "297c68450cc8" +down_revision = "8e8647f20aa4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_instances_pipeline_fetch_q", + table_name="instances", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_instances_pipeline_fetch_q", + "instances", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_instances_pipeline_fetch_q", + table_name="instances", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py new file mode 100644 index 0000000000..2049236267 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py @@ -0,0 +1,37 @@ +"""Add FleetModel current master instance + +Revision ID: 9cb8e4e4d986 +Revises: 297c68450cc8 +Create Date: 2026-03-05 10:15:00.000000+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "9cb8e4e4d986" +down_revision = "297c68450cc8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "current_master_instance_id", + sqlalchemy_utils.types.uuid.UUIDType(binary=False), + nullable=True, + ) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_column("current_master_instance_id") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py new file mode 100644 index 0000000000..e1cb938750 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py @@ -0,0 +1,42 @@ +"""Add ix_fleets_current_master_instance_id index + +Revision ID: c7b0a8e57294 +Revises: 9cb8e4e4d986 +Create Date: 2026-03-05 10:45:00.000000+00:00 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "c7b0a8e57294" +down_revision = "9cb8e4e4d986" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_current_master_instance_id", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_fleets_current_master_instance_id", + "fleets", + ["current_master_instance_id"], + unique=False, + postgresql_concurrently=True, + ) + + +def downgrade() -> None: + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_current_master_instance_id", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_06_1200_a13f5b55af01_add_projectmodel_templates_repo.py b/src/dstack/_internal/server/migrations/versions/2026/03_06_1200_a13f5b55af01_add_projectmodel_templates_repo.py new file mode 100644 index 0000000000..f5271ddc11 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_06_1200_a13f5b55af01_add_projectmodel_templates_repo.py @@ -0,0 +1,26 @@ +"""Add ProjectModel.templates_repo + +Revision ID: a13f5b55af01 +Revises: c7b0a8e57294 +Create Date: 2026-03-06 12:00:00.000000 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "a13f5b55af01" +down_revision = "c7b0a8e57294" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.add_column(sa.Column("templates_repo", sa.Text(), nullable=True)) + + +def downgrade() -> None: + with op.batch_alter_table("projects", schema=None) as batch_op: + batch_op.drop_column("templates_repo") diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_09_0928_6026b29d78c7_add_jobmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/03_09_0928_6026b29d78c7_add_jobmodel_pipeline_columns.py new file mode 100644 index 0000000000..84126aa1ca --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_09_0928_6026b29d78c7_add_jobmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add JobModel pipeline columns + +Revision ID: 6026b29d78c7 +Revises: a13f5b55af01 +Create Date: 2026-03-09 09:28:17.993416+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "6026b29d78c7" +down_revision = "a13f5b55af01" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_10_1130_8b6d5d8c1b9a_add_ix_jobs_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_10_1130_8b6d5d8c1b9a_add_ix_jobs_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..b6a1ca924f --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_10_1130_8b6d5d8c1b9a_add_ix_jobs_pipeline_fetch_q_index.py @@ -0,0 +1,51 @@ +"""Add ix_jobs_pipeline_fetch_q index + +Revision ID: 8b6d5d8c1b9a +Revises: 6026b29d78c7 +Create Date: 2026-03-10 11:30:00.000000+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "8b6d5d8c1b9a" +down_revision = "6026b29d78c7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_jobs_pipeline_fetch_q", + table_name="jobs", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_jobs_pipeline_fetch_q", + "jobs", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("(status NOT IN ('TERMINATED', 'ABORTED', 'FAILED', 'DONE'))"), + postgresql_where=sa.text( + "(status NOT IN ('TERMINATED', 'ABORTED', 'FAILED', 'DONE'))" + ), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_jobs_pipeline_fetch_q", + table_name="jobs", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_19_0924_7099b48e72a9_add_runmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/03_19_0924_7099b48e72a9_add_runmodel_pipeline_columns.py new file mode 100644 index 0000000000..353dbadeea --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_19_0924_7099b48e72a9_add_runmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add RunModel pipeline columns + +Revision ID: 7099b48e72a9 +Revises: 8b6d5d8c1b9a +Create Date: 2026-03-19 09:24:29.042905+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "7099b48e72a9" +down_revision = "8b6d5d8c1b9a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_24_0528_c1c2ecaee45c_add_ix_runs_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_24_0528_c1c2ecaee45c_add_ix_runs_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..eb47db4e40 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_24_0528_c1c2ecaee45c_add_ix_runs_pipeline_fetch_q_index.py @@ -0,0 +1,49 @@ +"""Add ix_runs_pipeline_fetch_q index + +Revision ID: c1c2ecaee45c +Revises: 7099b48e72a9 +Create Date: 2026-03-24 05:28:50.925623+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c1c2ecaee45c" +down_revision = "7099b48e72a9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_runs_pipeline_fetch_q", + table_name="runs", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_runs_pipeline_fetch_q", + "runs", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("(status NOT IN ('TERMINATED', 'FAILED', 'DONE'))"), + postgresql_where=sa.text("(status NOT IN ('TERMINATED', 'FAILED', 'DONE'))"), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_runs_pipeline_fetch_q", + table_name="runs", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_24_1145_59e328ced74c_add_userpublickeymodel.py b/src/dstack/_internal/server/migrations/versions/2026/03_24_1145_59e328ced74c_add_userpublickeymodel.py new file mode 100644 index 0000000000..6a5e30afbf --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_24_1145_59e328ced74c_add_userpublickeymodel.py @@ -0,0 +1,50 @@ +"""Add UserPublicKeyModel + +Revision ID: 59e328ced74c +Revises: c1c2ecaee45c +Create Date: 2026-03-24 11:45:13.560594+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "59e328ced74c" +down_revision = "c1c2ecaee45c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "user_public_keys", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column("user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column("type", sa.String(length=100), nullable=False), + sa.Column("fingerprint", sa.String(length=100), nullable=False), + sa.Column("key", sa.Text(), nullable=False), + sa.ForeignKeyConstraint( + ["user_id"], + ["users.id"], + name=op.f("fk_user_public_keys_user_id_users"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_user_public_keys")), + sa.UniqueConstraint( + "user_id", "fingerprint", name="uq_user_public_keys_user_id_fingerprint" + ), + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("user_public_keys") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_30_0841_e9d81c97c042_add_jobmodel_graceful_termination_.py b/src/dstack/_internal/server/migrations/versions/2026/03_30_0841_e9d81c97c042_add_jobmodel_graceful_termination_.py new file mode 100644 index 0000000000..2e744e05d2 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_30_0841_e9d81c97c042_add_jobmodel_graceful_termination_.py @@ -0,0 +1,34 @@ +"""Add JobModel.graceful_termination_attempts + +Revision ID: e9d81c97c042 +Revises: 59e328ced74c +Create Date: 2026-03-30 08:41:29.308250+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "e9d81c97c042" +down_revision = "59e328ced74c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column("graceful_termination_attempts", sa.Integer(), nullable=True) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("graceful_termination_attempts") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_03_1043_ad8c50120507_add_jobmetricspoint_job_id_index.py b/src/dstack/_internal/server/migrations/versions/2026/04_03_1043_ad8c50120507_add_jobmetricspoint_job_id_index.py new file mode 100644 index 0000000000..858b176d13 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_03_1043_ad8c50120507_add_jobmetricspoint_job_id_index.py @@ -0,0 +1,31 @@ +"""Add JobMetricsPoint.job_id index + +Revision ID: ad8c50120507 +Revises: e9d81c97c042 +Create Date: 2026-04-03 10:43:18.440334+00:00 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "ad8c50120507" +down_revision = "e9d81c97c042" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("job_metrics_points", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_job_metrics_points_job_id"), ["job_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("job_metrics_points", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_job_metrics_points_job_id")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_10_1200_1b9e2e7e7d35_add_backends_source_config_and_auth.py b/src/dstack/_internal/server/migrations/versions/2026/04_10_1200_1b9e2e7e7d35_add_backends_source_config_and_auth.py new file mode 100644 index 0000000000..04e773f182 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_10_1200_1b9e2e7e7d35_add_backends_source_config_and_auth.py @@ -0,0 +1,36 @@ +"""Add BackendModel.source_config and BackendModel.source_auth + +Revision ID: 1b9e2e7e7d35 +Revises: ad8c50120507 +Create Date: 2026-04-10 12:00:00.000000+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "1b9e2e7e7d35" +down_revision = "ad8c50120507" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.add_column(sa.Column("source_config", sa.String(length=20000), nullable=True)) + batch_op.add_column( + sa.Column( + "source_auth", + dstack._internal.server.models.EncryptedString(20000), + nullable=True, + ) + ) + + +def downgrade() -> None: + with op.batch_alter_table("backends", schema=None) as batch_op: + batch_op.drop_column("source_auth") + batch_op.drop_column("source_config") diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_15_0515_94fcd7e38b7e_add_service_router_worker_sync_for_.py b/src/dstack/_internal/server/migrations/versions/2026/04_15_0515_94fcd7e38b7e_add_service_router_worker_sync_for_.py new file mode 100644 index 0000000000..4f5486adb8 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_15_0515_94fcd7e38b7e_add_service_router_worker_sync_for_.py @@ -0,0 +1,72 @@ +"""Add 
service_router_worker_sync for router-worker reconcile pipeline. + +Revision ID: 94fcd7e38b7e +Revises: 1b9e2e7e7d35 +Create Date: 2026-04-15 05:15:50.107554+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "94fcd7e38b7e" +down_revision = "1b9e2e7e7d35" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "service_router_worker_sync", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("run_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("deleted", sa.Boolean(), server_default=sa.false(), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.Column( + "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False + ), + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ), + sa.Column("lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True), + sa.Column("lock_owner", sa.String(length=100), nullable=True), + sa.ForeignKeyConstraint( + ["run_id"], + ["runs.id"], + name=op.f("fk_service_router_worker_sync_run_id_runs"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_service_router_worker_sync")), + ) + with op.batch_alter_table("service_router_worker_sync", schema=None) as batch_op: + batch_op.create_index( + "ix_service_router_worker_sync_pipeline_fetch_q", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + ) + batch_op.create_index( + batch_op.f("ix_service_router_worker_sync_run_id"), ["run_id"], unique=True + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: 
+ # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("service_router_worker_sync", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_service_router_worker_sync_run_id")) + batch_op.drop_index( + "ix_service_router_worker_sync_pipeline_fetch_q", + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + ) + + op.drop_table("service_router_worker_sync") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_18_1822_f48b23790053_add_jobmodel_image_pull_progress.py b/src/dstack/_internal/server/migrations/versions/2026/04_18_1822_f48b23790053_add_jobmodel_image_pull_progress.py new file mode 100644 index 0000000000..33b46a73f7 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_18_1822_f48b23790053_add_jobmodel_image_pull_progress.py @@ -0,0 +1,32 @@ +"""Add JobModel.image_pull_progress + +Revision ID: f48b23790053 +Revises: 94fcd7e38b7e +Create Date: 2026-04-18 18:22:47.121819+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "f48b23790053" +down_revision = "94fcd7e38b7e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column(sa.Column("image_pull_progress", sa.Text(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("image_pull_progress") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_24_0542_82b671d9c5ab_add_instancemodel_provisioning_job_id_.py b/src/dstack/_internal/server/migrations/versions/2026/04_24_0542_82b671d9c5ab_add_instancemodel_provisioning_job_id_.py new file mode 100644 index 0000000000..38b5b78d41 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_24_0542_82b671d9c5ab_add_instancemodel_provisioning_job_id_.py @@ -0,0 +1,39 @@ +"""Add InstanceModel.provisioning_job_id for placeholder instances + +Revision ID: 82b671d9c5ab +Revises: f48b23790053 +Create Date: 2026-04-24 05:42:14.856254+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "82b671d9c5ab" +down_revision = "f48b23790053" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "provisioning_job_id", + sqlalchemy_utils.types.uuid.UUIDType(binary=False), + nullable=True, + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("provisioning_job_id") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_24_0715_d9f6d27f0c41_add_jobmodel_skip_min_processing_.py b/src/dstack/_internal/server/migrations/versions/2026/04_24_0715_d9f6d27f0c41_add_jobmodel_skip_min_processing_.py new file mode 100644 index 0000000000..9272470773 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_24_0715_d9f6d27f0c41_add_jobmodel_skip_min_processing_.py @@ -0,0 +1,37 @@ +"""Add JobModel.skip_min_processing_interval + +Revision ID: d9f6d27f0c41 +Revises: 82b671d9c5ab +Create Date: 2026-04-24 07:15:00.000000+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d9f6d27f0c41" +down_revision = "82b671d9c5ab" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "skip_min_processing_interval", + sa.Boolean(), + server_default=sa.false(), + nullable=False, + ) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.drop_column("skip_min_processing_interval") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_27_0720_8c1f8f4fcb47_add_runmodel_skip_min_processing_.py b/src/dstack/_internal/server/migrations/versions/2026/04_27_0720_8c1f8f4fcb47_add_runmodel_skip_min_processing_.py new file mode 100644 index 0000000000..a8f4abac0a --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_27_0720_8c1f8f4fcb47_add_runmodel_skip_min_processing_.py @@ -0,0 +1,37 @@ +"""Add RunModel.skip_min_processing_interval + +Revision ID: 8c1f8f4fcb47 +Revises: d9f6d27f0c41 +Create Date: 2026-04-27 07:20:00.000000+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "8c1f8f4fcb47" +down_revision = "d9f6d27f0c41" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "skip_min_processing_interval", + sa.Boolean(), + server_default=sa.false(), + nullable=False, + ) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("skip_min_processing_interval") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_27_1030_05c351d08f6b_add_instancemodel_skip_min_processing_.py b/src/dstack/_internal/server/migrations/versions/2026/04_27_1030_05c351d08f6b_add_instancemodel_skip_min_processing_.py new file mode 100644 index 0000000000..d44ee41d50 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_27_1030_05c351d08f6b_add_instancemodel_skip_min_processing_.py @@ -0,0 +1,37 @@ +"""Add InstanceModel.skip_min_processing_interval + +Revision ID: 05c351d08f6b +Revises: 8c1f8f4fcb47 +Create Date: 2026-04-27 10:30:00.000000+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "05c351d08f6b" +down_revision = "8c1f8f4fcb47" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "skip_min_processing_interval", + sa.Boolean(), + server_default=sa.false(), + nullable=False, + ) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("skip_min_processing_interval") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/04_29_1700_db3679abd063_add_exportedgatewaymodel.py b/src/dstack/_internal/server/migrations/versions/2026/04_29_1700_db3679abd063_add_exportedgatewaymodel.py new file mode 100644 index 0000000000..85fa9c94b0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/04_29_1700_db3679abd063_add_exportedgatewaymodel.py @@ -0,0 +1,64 @@ +"""Add ExportedGatewayModel + +Revision ID: db3679abd063 +Revises: 05c351d08f6b +Create Date: 2026-04-29 17:00:29.551669+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "db3679abd063" +down_revision = "05c351d08f6b" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "exported_gateways", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("export_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "gateway_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.ForeignKeyConstraint( + ["export_id"], + ["exports.id"], + name=op.f("fk_exported_gateways_export_id_exports"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["gateway_id"], + ["gateways.id"], + name=op.f("fk_exported_gateways_gateway_id_gateways"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_exported_gateways")), + sa.UniqueConstraint( + "export_id", "gateway_id", name="uq_exported_gateways_export_id_gateway_id" + ), + ) + with op.batch_alter_table("exported_gateways", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_exported_gateways_export_id"), ["export_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_exported_gateways_gateway_id"), ["gateway_id"], unique=False + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("exported_gateways", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_exported_gateways_gateway_id")) + batch_op.drop_index(batch_op.f("ix_exported_gateways_export_id")) + + op.drop_table("exported_gateways") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/05_07_1721_205690dfeec2_add_gatewaymodel_forbid_new_services.py b/src/dstack/_internal/server/migrations/versions/2026/05_07_1721_205690dfeec2_add_gatewaymodel_forbid_new_services.py new file mode 100644 index 0000000000..d164fb9fd0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/05_07_1721_205690dfeec2_add_gatewaymodel_forbid_new_services.py @@ -0,0 +1,36 @@ +"""Add GatewayModel.forbid_new_services + +Revision ID: 205690dfeec2 +Revises: db3679abd063 +Create Date: 2026-05-07 17:21:23.415019+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "205690dfeec2" +down_revision = "db3679abd063" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "forbid_new_services", sa.Boolean(), server_default=sa.false(), nullable=False + ) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.drop_column("forbid_new_services") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/05_13_0724_201cb7ccd0d3_add_exportmodel_is_global.py b/src/dstack/_internal/server/migrations/versions/2026/05_13_0724_201cb7ccd0d3_add_exportmodel_is_global.py new file mode 100644 index 0000000000..677dac54a0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/05_13_0724_201cb7ccd0d3_add_exportmodel_is_global.py @@ -0,0 +1,34 @@ +"""Add ExportModel.is_global + +Revision ID: 201cb7ccd0d3 +Revises: 205690dfeec2 +Create Date: 2026-05-13 07:24:06.321892+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "201cb7ccd0d3" +down_revision = "205690dfeec2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.add_column( + sa.Column("is_global", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.drop_column("is_global") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/06_01_1911_b7609b94ea4d_add_gatewaycomputemodel_gateway_id.py b/src/dstack/_internal/server/migrations/versions/2026/06_01_1911_b7609b94ea4d_add_gatewaycomputemodel_gateway_id.py new file mode 100644 index 0000000000..2729699af1 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/06_01_1911_b7609b94ea4d_add_gatewaycomputemodel_gateway_id.py @@ -0,0 +1,52 @@ +"""Add GatewayComputeModel.gateway_id + +Revision ID: b7609b94ea4d +Revises: 201cb7ccd0d3 +Create Date: 2026-06-01 19:11:30.641417+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "b7609b94ea4d" +down_revision = "201cb7ccd0d3" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateway_computes", schema=None) as batch_op: + batch_op.add_column( + sa.Column("replica_num", sa.Integer(), server_default="0", nullable=False) + ) + batch_op.add_column( + sa.Column( + "gateway_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.create_foreign_key( + batch_op.f("fk_gateway_computes_gateway_id_gateways"), + "gateways", + ["gateway_id"], + ["id"], + ondelete="SET NULL", + use_alter=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("gateway_computes", schema=None) as batch_op: + batch_op.drop_constraint( + batch_op.f("fk_gateway_computes_gateway_id_gateways"), type_="foreignkey" + ) + batch_op.drop_column("gateway_id") + batch_op.drop_column("replica_num") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py b/src/dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py deleted file mode 100644 index 070191b70b..0000000000 --- a/src/dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Introduce runs processing - -Revision ID: 4b4319398164 -Revises: b88d55c2a07d -Create Date: 2024-03-01 14:30:28.918255 - -""" - -import sqlalchemy as sa -import sqlalchemy_utils -from alembic import op - -# revision identifiers, used by Alembic. -revision = "4b4319398164" -down_revision = "b88d55c2a07d" -branch_labels = None -depends_on = None - - -def upgrade() -> None: - with op.batch_alter_table("runs", schema=None) as batch_op: - # last_processed_at is nullable=False later - batch_op.add_column(sa.Column("last_processed_at", sa.DateTime(), nullable=True)) - batch_op.add_column( - sa.Column( - "gateway_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True - ) - ) - run_termination_reason_enum = sa.Enum( - "ALL_JOBS_DONE", - "JOB_FAILED", - "RETRY_LIMIT_EXCEEDED", - "STOPPED_BY_USER", - "ABORTED_BY_USER", - "SERVER_ERROR", - name="runterminationreason", - ) - run_termination_reason_enum.create(op.get_bind(), checkfirst=True) - batch_op.add_column( - sa.Column( - "termination_reason", - run_termination_reason_enum, - nullable=True, - ) - ) - batch_op.add_column(sa.Column("service_spec", sa.String(length=4000), nullable=True)) - batch_op.create_foreign_key( - batch_op.f("fk_runs_gateway_id_gateways"), - "gateways", - ["gateway_id"], - ["id"], - ondelete="SET NULL", - ) - 
op.execute("UPDATE runs SET last_processed_at = submitted_at") - op.execute( - "UPDATE runs SET " - " status = 'TERMINATED' " - "WHERE id NOT IN ( " - " SELECT run_id FROM jobs " - " WHERE status NOT IN ('TERMINATED', 'ABORTED', 'FAILED', 'DONE') " - ")" - ) - with op.batch_alter_table("runs", schema=None) as batch_op: - batch_op.alter_column("last_processed_at", nullable=False) - - job_termination_reason_enum = sa.Enum( - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "WAITING_RUNNER_LIMIT_EXCEEDED", - "TERMINATED_BY_USER", - "GATEWAY_ERROR", - "SCALED_DOWN", - "DONE_BY_RUNNER", - "ABORTED_BY_USER", - "TERMINATED_BY_SERVER", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - name="jobterminationreason", - ) - job_termination_reason_enum.create(op.get_bind(), checkfirst=True) - - with op.batch_alter_table("jobs", schema=None) as batch_op: - batch_op.alter_column( - "error_code", - new_column_name="termination_reason", - type_=job_termination_reason_enum, - postgresql_using=("error_code::VARCHAR::jobterminationreason"), - ) - # replica_num is nullable=False later - batch_op.add_column(sa.Column("replica_num", sa.Integer(), nullable=True)) - batch_op.drop_column("removed") - batch_op.execute("UPDATE jobs SET replica_num = 0") - with op.batch_alter_table("jobs", schema=None) as batch_op: - batch_op.alter_column("replica_num", nullable=False) - - -def downgrade() -> None: - with op.batch_alter_table("jobs", schema=None) as batch_op: - batch_op.alter_column( - "termination_reason", - new_column_name="error_code", - type_=sa.VARCHAR(length=34), - ) - batch_op.add_column( - # all jobs will get not removed - sa.Column("removed", sa.BOOLEAN(), server_default=sa.false(), nullable=False) - ) - batch_op.drop_column("replica_num") - - with op.batch_alter_table("runs", schema=None) as batch_op: - batch_op.drop_constraint(batch_op.f("fk_runs_gateway_id_gateways"), type_="foreignkey") - batch_op.drop_column("service_spec") - 
batch_op.drop_column("termination_reason") - batch_op.drop_column("gateway_id") - batch_op.drop_column("last_processed_at") - op.execute("UPDATE runs SET status = 'SUBMITTED'") - op.execute("UPDATE jobs SET removed = 1") - - run_termination_reason_enum = sa.Enum( - "ALL_JOBS_DONE", - "JOB_FAILED", - "RETRY_LIMIT_EXCEEDED", - "STOPPED_BY_USER", - "ABORTED_BY_USER", - "SERVER_ERROR", - name="runterminationreason", - ) - run_termination_reason_enum.drop(op.get_bind(), checkfirst=True) - - job_termination_reason_enum = sa.Enum( - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "WAITING_RUNNER_LIMIT_EXCEEDED", - "TERMINATED_BY_USER", - "GATEWAY_ERROR", - "SCALED_DOWN", - "DONE_BY_RUNNER", - "ABORTED_BY_USER", - "TERMINATED_BY_SERVER", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - name="jobterminationreason", - ) - job_termination_reason_enum.drop(op.get_bind(), checkfirst=True) diff --git a/src/dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py b/src/dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py deleted file mode 100644 index dc201d020e..0000000000 --- a/src/dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +++ /dev/null @@ -1,328 +0,0 @@ -"""Fixes for psql - -Revision ID: 5ad8debc8fe6 -Revises: 98cd9c8b5927 -Create Date: 2024-07-04 17:26:01.937112 - -""" - -import sqlalchemy as sa -from alembic import op -from alembic_postgresql_enum import TableReference -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision = "5ad8debc8fe6" -down_revision = "98cd9c8b5927" -branch_labels = None -depends_on = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - sa.Enum( - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "FAILED", - "DONE", - name="runstatus", - ).create(op.get_bind()) - with op.batch_alter_table("backends", schema=None) as batch_op: - batch_op.alter_column( - "config", - existing_type=sa.VARCHAR(length=2000), - type_=sa.String(length=20000), - existing_nullable=False, - ) - batch_op.alter_column( - "auth", - existing_type=sa.VARCHAR(length=2000), - type_=sa.String(length=20000), - existing_nullable=False, - ) - - with op.batch_alter_table("jobs", schema=None) as batch_op: - batch_op.alter_column( - "runner_timestamp", - existing_type=sa.INTEGER(), - type_=sa.BigInteger(), - existing_nullable=True, - ) - - with op.batch_alter_table("runs", schema=None) as batch_op: - batch_op.alter_column( - "status", - existing_type=postgresql.ENUM( - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "ABORTED", - "FAILED", - "DONE", - name="jobstatus", - ), - type_=sa.Enum( - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "FAILED", - "DONE", - name="runstatus", - ), - existing_nullable=False, - postgresql_using="status::VARCHAR::runstatus", - ) - - sa.Enum( - "NO_INSTANCE_MATCHING_REQUIREMENTS", - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "INSTANCE_TERMINATED", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - name="joberrorcode", - ).drop(op.get_bind()) - op.sync_enum_values( - "public", - "backendtype", - [ - "AWS", - "AZURE", - "CUDO", - "DATACRUNCH", - "DSTACK", - "GCP", - "KUBERNETES", - "LAMBDA", - "LOCAL", - "REMOTE", - "NEBIUS", - "OCI", - "RUNPOD", - "TENSORDOCK", - "VASTAI", - ], - [ - TableReference(table_schema="public", table_name="instances", column_name="backend"), - TableReference(table_schema="public", table_name="backends", column_name="type"), - ], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "repotype", - 
["REMOTE", "LOCAL", "VIRTUAL"], - [TableReference(table_schema="public", table_name="repos", column_name="type")], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "jobstatus", - [ - "SUBMITTED", - "PROVISIONING", - "PULLING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "ABORTED", - "FAILED", - "DONE", - ], - [TableReference(table_schema="public", table_name="jobs", column_name="status")], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "jobterminationreason", - [ - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "WAITING_INSTANCE_LIMIT_EXCEEDED", - "WAITING_RUNNER_LIMIT_EXCEEDED", - "TERMINATED_BY_USER", - "VOLUME_ERROR", - "GATEWAY_ERROR", - "SCALED_DOWN", - "DONE_BY_RUNNER", - "ABORTED_BY_USER", - "TERMINATED_BY_SERVER", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - "CREATING_CONTAINER_ERROR", - "EXECUTOR_ERROR", - ], - [ - TableReference( - table_schema="public", table_name="jobs", column_name="termination_reason" - ) - ], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "instancestatus", - ["PENDING", "PROVISIONING", "IDLE", "BUSY", "TERMINATING", "TERMINATED"], - [TableReference(table_schema="public", table_name="instances", column_name="status")], - enum_values_to_rename=[], - ) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.sync_enum_values( - "public", - "instancestatus", - [ - "PENDING", - "CREATING", - "STARTING", - "READY", - "BUSY", - "TERMINATING", - "TERMINATED", - "FAILED", - ], - [TableReference(table_schema="public", table_name="instances", column_name="status")], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "jobterminationreason", - [ - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "WAITING_RUNNER_LIMIT_EXCEEDED", - "TERMINATED_BY_USER", - "GATEWAY_ERROR", - "SCALED_DOWN", - "DONE_BY_RUNNER", - "ABORTED_BY_USER", - "TERMINATED_BY_SERVER", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - ], - [ - TableReference( - table_schema="public", table_name="jobs", column_name="termination_reason" - ) - ], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "jobstatus", - [ - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "ABORTED", - "FAILED", - "DONE", - ], - [TableReference(table_schema="public", table_name="jobs", column_name="status")], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "repotype", - ["REMOTE", "LOCAL"], - [TableReference(table_schema="public", table_name="repos", column_name="type")], - enum_values_to_rename=[], - ) - op.sync_enum_values( - "public", - "backendtype", - ["AWS", "AZURE", "GCP", "LAMBDA"], - [ - TableReference(table_schema="public", table_name="instances", column_name="backend"), - TableReference(table_schema="public", table_name="backends", column_name="type"), - ], - enum_values_to_rename=[], - ) - sa.Enum( - "NO_INSTANCE_MATCHING_REQUIREMENTS", - "FAILED_TO_START_DUE_TO_NO_CAPACITY", - "INTERRUPTED_BY_NO_CAPACITY", - "INSTANCE_TERMINATED", - "CONTAINER_EXITED_WITH_ERROR", - "PORTS_BINDING_FAILED", - name="joberrorcode", - ).create(op.get_bind()) - with op.batch_alter_table("runs", schema=None) as batch_op: - batch_op.alter_column( - "status", - existing_type=sa.Enum( - "PENDING", - "SUBMITTED", - 
"PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "FAILED", - "DONE", - name="runstatus", - ), - type_=postgresql.ENUM( - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "ABORTED", - "FAILED", - "DONE", - name="jobstatus", - ), - existing_nullable=False, - ) - - with op.batch_alter_table("jobs", schema=None) as batch_op: - batch_op.alter_column( - "runner_timestamp", - existing_type=sa.BigInteger(), - type_=sa.INTEGER(), - existing_nullable=True, - ) - - with op.batch_alter_table("backends", schema=None) as batch_op: - batch_op.alter_column( - "auth", - existing_type=sa.String(length=20000), - type_=sa.VARCHAR(length=2000), - existing_nullable=False, - ) - batch_op.alter_column( - "config", - existing_type=sa.String(length=20000), - type_=sa.VARCHAR(length=2000), - existing_nullable=False, - ) - - sa.Enum( - "PENDING", - "SUBMITTED", - "PROVISIONING", - "RUNNING", - "TERMINATING", - "TERMINATED", - "FAILED", - "DONE", - name="runstatus", - ).drop(op.get_bind()) - # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 61d7d5258b..8d6f3c512c 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -1,13 +1,12 @@ +import enum import uuid -from datetime import datetime -from typing import List, Optional +from datetime import datetime, timezone +from typing import Callable, Generic, List, Optional, TypeVar, Union from sqlalchemy import ( BigInteger, Boolean, - Column, DateTime, - Enum, Float, ForeignKey, Index, @@ -15,7 +14,6 @@ LargeBinary, MetaData, String, - Table, Text, TypeDecorator, UniqueConstraint, @@ -24,15 +22,21 @@ from sqlalchemy.sql import false from sqlalchemy_utils import UUIDType +from dstack._internal.core.errors import DstackError from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model 
+from dstack._internal.core.models.compute_groups import ComputeGroupStatus +from dstack._internal.core.models.events import EventTargetType +from dstack._internal.core.models.fleets import FleetStatus from dstack._internal.core.models.gateways import GatewayStatus +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.core.models.profiles import ( - DEFAULT_POOL_TERMINATION_IDLE_TIME, + DEFAULT_FLEET_TERMINATION_IDLE_TIME, TerminationPolicy, ) from dstack._internal.core.models.repos.base import RepoType from dstack._internal.core.models.runs import ( - InstanceStatus, JobStatus, JobTerminationReason, RunStatus, @@ -42,17 +46,23 @@ from dstack._internal.core.models.volumes import VolumeStatus from dstack._internal.server import settings from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) +# Default options (save-update, merge) + delete-orphan + delete (required by delete-orphan) +# delete-orphan allows to automatically delete entities removed from the relationship +CASCADE_DEFAULT_WITH_DELETE_ORPHAN = "save-update, merge, delete-orphan, delete" class NaiveDateTime(TypeDecorator): """ - The custom type decorator that ensures datetime objects are offset-naive when stored in the database. - This is needed because we use datetimes in UTC only and store them as offset-naive. - Some databases (e.g. Postgres) throw an error if the timezone is set. + A custom type decorator that ensures datetime objects are offset-naive when stored in the database + and offset-aware with UTC timezone when loaded from the database. + This is because we use datetimes in UTC everywhere, and + some databases (e.g. Postgres) throw an error if the timezone is set. 
""" impl = DateTime - cache_ok = True def process_bind_param(self, value, dialect): @@ -61,7 +71,121 @@ def process_bind_param(self, value, dialect): return value def process_result_value(self, value, dialect): - return value + if value is None: + return None + return value.replace(tzinfo=timezone.utc) + + +class DecryptedStringConfig(CoreConfig): + arbitrary_types_allowed = True + + +class DecryptedString(generate_dual_core_model(DecryptedStringConfig)): + """ + A type for representing plaintext strings encrypted with `EncryptedString`. + Besides the string, stores information if the decryption was successful. + This is useful so that application code can have custom handling of failed decrypts (e.g. ignoring). + """ + + plaintext: Optional[str] + """ + `plaintext` should not be read directly to avoid ignoring errors accidentally. + Unpack with `get_plaintext_or_error()`. + """ + decrypted: bool = True + exc: Optional[Exception] = None + + def get_plaintext_or_error(self) -> str: + if self.decrypted and self.plaintext is not None: + return self.plaintext + exc = DstackError("Failed to access plaintext") + if self.exc is not None: + raise exc from self.exc + raise exc + + +class EncryptedString(TypeDecorator): + """ + A custom type decorator that encrypts and decrypts strings for storing in the db. + """ + + impl = String + cache_ok = True + + _encrypt_func: Callable[[str], str] + _decrypt_func: Callable[[str], str] + + @classmethod + def set_encrypt_decrypt( + cls, + encrypt_func: Callable[[str], str], + decrypt_func: Callable[[str], str], + ): + cls._encrypt_func = encrypt_func + cls._decrypt_func = decrypt_func + + def process_bind_param( + self, value: Optional[Union[DecryptedString, str]], dialect + ) -> Optional[str]: + if value is None: + return None + if isinstance(value, str): + # Passing string allows binding an encrypted value directly + # e.g. 
for comparisons + return value + return EncryptedString._encrypt_func(value.get_plaintext_or_error()) + + def process_result_value(self, value: Optional[str], dialect) -> Optional[DecryptedString]: + if value is None: + return value + try: + plaintext = EncryptedString._decrypt_func(value) + return DecryptedString(plaintext=plaintext, decrypted=True) + except Exception as e: + logger.debug("Failed to decrypt encrypted string: %s", repr(e)) + return DecryptedString(plaintext=None, decrypted=False, exc=e) + + +E = TypeVar("E", bound=enum.Enum) + + +class EnumAsString(TypeDecorator, Generic[E]): + """ + A custom type decorator that stores enums as strings in the DB. + """ + + impl = String + cache_ok = True + + def __init__( + self, + enum_class: type[E], + *args, + fallback_deserializer: Optional[Callable[[str], E]] = None, + **kwargs, + ): + """ + Args: + enum_class: The enum class to be stored. + fallback_deserializer: An optional function used when the string + from the DB does not match any enum member name. If not + provided, an exception will be raised in such cases. 
+ """ + self.enum_class = enum_class + self.fallback_deserializer = fallback_deserializer + super().__init__(*args, **kwargs) + + def process_bind_param(self, value: Optional[E], dialect) -> Optional[str]: + if value is None: + return None + return value.name + + def process_result_value(self, value: Optional[str], dialect) -> Optional[E]: + if value is None: + return None + if value not in self.enum_class.__members__ and self.fallback_deserializer is not None: + return self.fallback_deserializer(value) + return self.enum_class[value] constraint_naming_convention = { @@ -77,6 +201,12 @@ class BaseModel(DeclarativeBase): metadata = MetaData(naming_convention=constraint_naming_convention) +class PipelineModelMixin: + lock_expires_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + lock_token: Mapped[Optional[uuid.UUID]] = mapped_column(UUIDType(binary=False)) + lock_owner: Mapped[Optional[str]] = mapped_column(String(100)) + + class UserModel(BaseModel): __tablename__ = "users" @@ -84,10 +214,30 @@ class UserModel(BaseModel): UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) name: Mapped[str] = mapped_column(String(50), unique=True) - token: Mapped[str] = mapped_column(String(200), unique=True) - global_role: Mapped[GlobalRole] = mapped_column(Enum(GlobalRole)) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + token: Mapped[DecryptedString] = mapped_column(EncryptedString(200), unique=True) + token_hash: Mapped[str] = mapped_column(String(2000), unique=True) + """`token_hash` is used for fast token lookup when the stored token is encrypted.""" + global_role: Mapped[GlobalRole] = mapped_column(EnumAsString(GlobalRole, 100)) + active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` controls whether the user can access the API.""" + deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) + original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + 
"""`original_name` stores the deleted user's original name while `name` is changed to a unique + generated value. + """ + + # TODO: make these keys required in a future version. + ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_private_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. + """ + ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_public_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. + """ - email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True) + email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True, index=True) projects_quota: Mapped[int] = mapped_column( Integer, default=settings.USER_PROJECT_DEFAULT_QUOTA @@ -101,29 +251,43 @@ class ProjectModel(BaseModel): UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) name: Mapped[str] = mapped_column(String(50), unique=True) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + is_public: Mapped[bool] = mapped_column(Boolean, default=False) + templates_repo: Mapped[Optional[str]] = mapped_column(Text, nullable=True) deleted: Mapped[bool] = mapped_column(Boolean, default=False) + original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + """`original_name` stores the deleted project's original name while `name` is changed to a unique + generated value. 
+ """ owner_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) owner: Mapped[UserModel] = relationship(lazy="joined") - members: Mapped[List["MemberModel"]] = relationship(back_populates="project", lazy="selectin") + members: Mapped[List["MemberModel"]] = relationship( + back_populates="project", order_by="MemberModel.member_num" + ) ssh_private_key: Mapped[str] = mapped_column(Text) ssh_public_key: Mapped[str] = mapped_column(Text) - backends: Mapped[List["BackendModel"]] = relationship( - back_populates="project", lazy="selectin" - ) + backends: Mapped[List["BackendModel"]] = relationship(back_populates="project") default_gateway_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("gateways.id", use_alter=True, ondelete="SET NULL"), nullable=True ) - default_gateway: Mapped["GatewayModel"] = relationship( - foreign_keys=[default_gateway_id], lazy="selectin" - ) + """ + **NOTE**: default_gateway_id may point to a previously imported gateway that the project is no + longer authorized to use. Check access before using the gateway. + """ + # TODO: drop `default_pool_id` after the release without pools. default_pool_id: Mapped[Optional[UUIDType]] = mapped_column( - ForeignKey("pools.id", use_alter=True, ondelete="SET NULL"), nullable=True + ForeignKey("pools.id", use_alter=True, ondelete="SET NULL"), + nullable=True, + deferred=True, # Not loaded so it can be deleted in the next releases ) + """`default_pool_id` exists because multi-replica deployments can break when upgrading from an + old version that uses pools to the version that drops pools from the database. 
+ """ default_pool: Mapped[Optional["PoolModel"]] = relationship(foreign_keys=[default_pool_id]) @@ -137,7 +301,9 @@ class MemberModel(BaseModel): project: Mapped["ProjectModel"] = relationship() user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped[UserModel] = relationship(lazy="joined") - project_role: Mapped[ProjectRole] = mapped_column(Enum(ProjectRole)) + project_role: Mapped[ProjectRole] = mapped_column(EnumAsString(ProjectRole, 100)) + member_num: Mapped[Optional[int]] = mapped_column(Integer) + """`member_num` defines member ordering.""" class BackendModel(BaseModel): @@ -148,10 +314,20 @@ class BackendModel(BaseModel): ) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - type: Mapped[BackendType] = mapped_column(Enum(BackendType)) + type: Mapped[BackendType] = mapped_column(EnumAsString(BackendType, 100)) config: Mapped[str] = mapped_column(String(20000)) - auth: Mapped[str] = mapped_column(String(20000)) + auth: Mapped[DecryptedString] = mapped_column(EncryptedString(20000)) + source_config: Mapped[Optional[str]] = mapped_column(String(20000), nullable=True) + """`source_config` stores the original non-sensitive backend config from user input + before configurators materialize defaults or generated values. + """ + source_auth: Mapped[Optional[DecryptedString]] = mapped_column( + EncryptedString(20000), nullable=True + ) + """`source_auth` stores the original sensitive backend config from user input + before configurators materialize defaults or generated values. 
+ """ gateways: Mapped[List["GatewayModel"]] = relationship(back_populates="backend") @@ -165,12 +341,35 @@ class RepoModel(BaseModel): ) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - # RepoModel.name stores repo_id name: Mapped[str] = mapped_column(String(100)) - type: Mapped[RepoType] = mapped_column(Enum(RepoType)) + """`name` stores `repo_id`.""" + type: Mapped[RepoType] = mapped_column(EnumAsString(RepoType, 100)) - info: Mapped[str] = mapped_column(String(2000)) - creds: Mapped[Optional[str]] = mapped_column(String(2000)) + info: Mapped[str] = mapped_column(Text) + + creds: Mapped[Optional[str]] = mapped_column(String(5000)) + """ + `creds` is deprecated. Newly initialized repos should use per-user `RepoCredsModel` instead. + As of 0.18.25 there is no plan to remove this field; it is used as a fallback when + `RepoCredsModel` associated with the user is not found. + """ + + +class RepoCredsModel(BaseModel): + __tablename__ = "repo_creds" + __table_args__ = ( + UniqueConstraint("repo_id", "user_id", name="uq_repo_creds_repo_id_user_id"), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) + repo: Mapped["RepoModel"] = relationship() + user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) + user: Mapped["UserModel"] = relationship() + + creds: Mapped[DecryptedString] = mapped_column(EncryptedString(10000)) class CodeModel(BaseModel): @@ -183,72 +382,222 @@ class CodeModel(BaseModel): repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) repo: Mapped["RepoModel"] = relationship() blob_hash: Mapped[str] = mapped_column(String(4000)) - blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3 + blob: 
Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" + + +class FileArchiveModel(BaseModel): + __tablename__ = "file_archives" + __table_args__ = ( + UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"), + ) + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) + user: Mapped["UserModel"] = relationship() + blob_hash: Mapped[str] = mapped_column(Text) + blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" -class RunModel(BaseModel): + +class RunModel(PipelineModelMixin, BaseModel): __tablename__ = "runs" id: Mapped[uuid.UUID] = mapped_column( UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) deleted: Mapped[bool] = mapped_column(Boolean, default=False) + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) - repo: Mapped["RepoModel"] = relationship() + user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped["UserModel"] = relationship() - submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) + + repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) + repo: Mapped["RepoModel"] = relationship() + + fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps runs attached to fleets so the fleets cannot be deleted while they are used. + A fleet can have no busy instances but still be used by a run, for example a service with + zero replicas. 
+ """ + fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="runs") + run_name: Mapped[str] = mapped_column(String(100)) - status: Mapped[RunStatus] = mapped_column(Enum(RunStatus)) - run_spec: Mapped[str] = mapped_column(String(4000)) - jobs: Mapped[List["JobModel"]] = relationship(back_populates="run", lazy="selectin") + submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) + skip_min_processing_interval: Mapped[bool] = mapped_column( + Boolean, default=False, server_default=false() + ) + next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + status: Mapped[RunStatus] = mapped_column(EnumAsString(RunStatus, 100), index=True) + """`status` must be changed only via `switch_run_status()`.""" + termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column( + EnumAsString(RunTerminationReason, 100) + ) + resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0) + """`resubmission_attempt` counts consecutive transitions to pending without provisioning. + It can be used to choose a retry delay based on the attempt number. 
+ """ + run_spec: Mapped[str] = mapped_column(Text) + service_spec: Mapped[Optional[str]] = mapped_column(Text) + priority: Mapped[int] = mapped_column(Integer, default=0) + deployment_num: Mapped[int] = mapped_column(Integer) + desired_replica_count: Mapped[int] = mapped_column(Integer) + desired_replica_counts: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + jobs: Mapped[List["JobModel"]] = relationship( + back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]" + ) + gateway_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("gateways.id", ondelete="SET NULL") ) gateway: Mapped[Optional["GatewayModel"]] = relationship() - termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column( - Enum(RunTerminationReason) + + service_router_worker_sync: Mapped[Optional["ServiceRouterWorkerSyncModel"]] = relationship( + back_populates="run", uselist=False ) - service_spec: Mapped[Optional[str]] = mapped_column(String(4000)) - __table_args__ = (Index("ix_submitted_at_id", submitted_at.desc(), id),) + __table_args__ = ( + Index("ix_submitted_at_id", submitted_at.desc(), id), + Index( + "ix_runs_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=status.not_in(RunStatus.finished_statuses()), + sqlite_where=status.not_in(RunStatus.finished_statuses()), + ), + ) -class JobModel(BaseModel): +class ServiceRouterWorkerSyncModel(PipelineModelMixin, BaseModel): + """ + Row processed by ServiceRouterWorkerSyncPipeline: sync router /workers with worker replicas. + At most one per run that uses replica-group routers. 
+ """ + + __tablename__ = "service_router_worker_sync" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + run_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("runs.id", ondelete="CASCADE"), unique=True, index=True + ) + run: Mapped["RunModel"] = relationship(back_populates="service_router_worker_sync") + deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + last_processed_at: Mapped[datetime] = mapped_column( + NaiveDateTime, default=get_current_datetime + ) + + __table_args__ = ( + Index( + "ix_service_router_worker_sync_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + + +class JobModel(PipelineModelMixin, BaseModel): __tablename__ = "jobs" id: Mapped[uuid.UUID] = mapped_column( UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - run_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("runs.id", ondelete="CASCADE")) + + run_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("runs.id", ondelete="CASCADE"), index=True + ) run: Mapped["RunModel"] = relationship() + + fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps jobs attached to fleets because we may choose an optimal fleet for a master + job but not yet create an instance for it. 
+ """ + fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="jobs") + run_name: Mapped[str] = mapped_column(String(100)) job_num: Mapped[int] = mapped_column(Integer) job_name: Mapped[str] = mapped_column(String(100)) submission_num: Mapped[int] = mapped_column(Integer) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) - status: Mapped[JobStatus] = mapped_column(Enum(JobStatus)) + skip_min_processing_interval: Mapped[bool] = mapped_column( + Boolean, default=False, server_default=false() + ) + status: Mapped[JobStatus] = mapped_column(EnumAsString(JobStatus, 100), index=True) + """`status` must be changed only via `switch_job_status()`.""" termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column( - Enum(JobTerminationReason) + EnumAsString(JobTerminationReason, 100) ) termination_reason_message: Mapped[Optional[str]] = mapped_column(Text) - job_spec_data: Mapped[str] = mapped_column(String(4000)) - job_provisioning_data: Mapped[Optional[str]] = mapped_column(String(4000)) + disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`disconnected_at` stores the first time connectivity issues were seen with the instance. + It resets every time connectivity is restored. + """ + exit_status: Mapped[Optional[int]] = mapped_column(Integer) + job_spec_data: Mapped[str] = mapped_column(Text) + job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger) - # `removed` is used to ensure that the instance is killed after the job is finished + inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer) + """`inactivity_secs` uses `0` for active jobs and `None` when inactivity is not applicable.""" + graceful_termination_attempts: Mapped[Optional[int]] = mapped_column(Integer) + """`graceful_termination_attempts` is used for terminating jobs. 
+ * `None` means graceful termination is not needed + * `0` means it is needed but not attempted, + * `>= 1` means at least one graceful stop attempt was sent. + """ remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="job") + """`remove_at` is used to ensure the container/instance is killed after the job is gracefully finished. + Cannot kill the container/instance until `remove_at` is set. + """ + volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False) + """`instance_assigned` shows whether instance assignment has already been attempted. + If `instance_assigned` is `True` and `instance` is `None`, no instance was assigned. + """ + instance_id: Mapped[Optional[uuid.UUID]] = mapped_column( + ForeignKey("instances.id", ondelete="CASCADE") + ) + instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="jobs") used_instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(UUIDType(binary=False)) replica_num: Mapped[int] = mapped_column(Integer) + deployment_num: Mapped[int] = mapped_column(Integer) + job_runtime_data: Mapped[Optional[str]] = mapped_column(Text) + probes: Mapped[list["ProbeModel"]] = relationship( + back_populates="job", order_by="ProbeModel.probe_num" + ) + registered: Mapped[bool] = mapped_column(Boolean, server_default=false()) + """`registered` shows whether the replica is registered to receive service requests. + It is always `False` for non-service runs. + """ + waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean) + """`waiting_master_job` is `True` for non-master jobs that have to wait for master processing before + they can be processed. This allows updating all replica jobs even when only master is locked, + for example to provision instances for all jobs when processing master. 
If not set, all jobs + should be processed only one-by-one. + """ + image_pull_progress: Mapped[Optional[str]] = mapped_column(Text) + + __table_args__ = ( + Index( + "ix_jobs_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=status.not_in(JobStatus.finished_statuses()), + sqlite_where=status.not_in(JobStatus.finished_statuses()), + ), + ) -class GatewayModel(BaseModel): +class GatewayModel(PipelineModelMixin, BaseModel): __tablename__ = "gateways" id: Mapped[uuid.UUID] = mapped_column( @@ -256,56 +605,113 @@ class GatewayModel(BaseModel): ) name: Mapped[str] = mapped_column(String(100)) region: Mapped[str] = mapped_column(String(100)) - wildcard_domain: Mapped[str] = mapped_column(String(100), nullable=True) + wildcard_domain: Mapped[Optional[str]] = mapped_column(String(100)) configuration: Mapped[Optional[str]] = mapped_column(Text) + """`configuration` is Optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_configuration` to construct `configuration` for old gateways. + """ created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) - status: Mapped[GatewayStatus] = mapped_column(Enum(GatewayStatus)) + status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100)) status_message: Mapped[Optional[str]] = mapped_column(Text) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) + to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) + forbid_new_services: Mapped[bool] = mapped_column(Boolean, server_default=false()) + """ + `forbid_new_services` is useful when migrating off the gateway or doing maintenance. + For now, it can only be set by server admins via an SQL query. 
+ """ project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) backend_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("backends.id", ondelete="CASCADE")) - backend: Mapped["BackendModel"] = relationship(lazy="selectin") + backend: Mapped["BackendModel"] = relationship() gateway_compute_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("gateway_computes.id", ondelete="CASCADE") ) - gateway_compute: Mapped[Optional["GatewayComputeModel"]] = relationship(lazy="joined") + gateway_compute: Mapped[Optional["GatewayComputeModel"]] = relationship( + foreign_keys=[gateway_compute_id] + ) + """ + Relationship with gateway computes for pre-0.20.25 gateways. + Use `get_gateway_compute_models()` for version-agnostic gateway compute retrieval. + """ + gateway_computes: Mapped[List["GatewayComputeModel"]] = relationship( + back_populates="gateway", + foreign_keys="GatewayComputeModel.gateway_id", + ) + """ + Relationship with gateway computes for 0.20.25+ gateways. + Use `get_gateway_compute_models()` for version-agnostic gateway compute retrieval. + """ runs: Mapped[List["RunModel"]] = relationship(back_populates="gateway") __table_args__ = (UniqueConstraint("project_id", "name", name="uq_gateways_project_id_name"),) + # TODO: Add pipeline index ("ix_gateways_pipeline_fetch_q") if gateways become soft-deleted. + class GatewayComputeModel(BaseModel): + """A single gateway replica. + **TODO**: consider renaming to `GatewayReplicaModel`. 
+ """ + __tablename__ = "gateway_computes" id: Mapped[uuid.UUID] = mapped_column( UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + replica_num: Mapped[int] = mapped_column(Integer, server_default="0") instance_id: Mapped[str] = mapped_column(String(100)) ip_address: Mapped[str] = mapped_column(String(100)) + """Gateway replica IP address or domain name (e.g., k8s can use domain names). + **TODO**: rename. + """ hostname: Mapped[Optional[str]] = mapped_column(String(100)) + """Hostname of the gateway's load balancer. + **TODO**: move to `GatewayModel`. + """ configuration: Mapped[Optional[str]] = mapped_column(Text) + """`configuration` is optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_compute_configuration` to construct `configuration` for old gateways. + """ backend_data: Mapped[Optional[str]] = mapped_column(Text) region: Mapped[str] = mapped_column(String(100)) + gateway_id: Mapped[Optional[uuid.UUID]] = mapped_column( + ForeignKey( + "gateways.id", + ondelete="SET NULL", + use_alter=True, + ) + ) + gateway: Mapped[Optional["GatewayModel"]] = relationship( + back_populates="gateway_computes", + foreign_keys=[gateway_id], + ) + """ + Gateway. Can be None for pre-0.20.25 gateways, which use GatewayModel.gateway_compute_id to + establish the relationship. + """ + backend_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("backends.id", ondelete="CASCADE") ) backend: Mapped[Optional["BackendModel"]] = relationship() - # The key to authorize the server with the gateway ssh_private_key: Mapped[str] = mapped_column(Text) + """`ssh_private_key` is the key used to authorize the server with the gateway.""" ssh_public_key: Mapped[str] = mapped_column(Text) - # active means the server should maintain connection to gateway. 
active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` means the server should maintain a connection to the gateway.""" deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) + app_updated_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) +# TODO: Drop after the release without pools class PoolModel(BaseModel): __tablename__ = "pools" @@ -323,7 +729,58 @@ class PoolModel(BaseModel): instances: Mapped[List["InstanceModel"]] = relationship(back_populates="pool", lazy="selectin") -class InstanceModel(BaseModel): +class FleetModel(PipelineModelMixin, BaseModel): + __tablename__ = "fleets" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + name: Mapped[str] = mapped_column(String(100)) + + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) + project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) + + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + last_processed_at: Mapped[datetime] = mapped_column( + NaiveDateTime, default=get_current_datetime + ) + deleted: Mapped[bool] = mapped_column(Boolean, default=False) + deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + + status: Mapped[FleetStatus] = mapped_column(EnumAsString(FleetStatus, 100), index=True) + """`status` must be changed only via `switch_fleet_status()`.""" + status_message: Mapped[Optional[str]] = mapped_column(Text) + + spec: Mapped[str] = mapped_column(Text) + + runs: Mapped[List["RunModel"]] = relationship(back_populates="fleet") + jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet") + instances: Mapped[List["InstanceModel"]] = relationship( + back_populates="fleet", + foreign_keys="InstanceModel.fleet_id", + ) + + current_master_instance_id: Mapped[Optional[uuid.UUID]] = mapped_column( + UUIDType(binary=False), index=True + ) + + 
consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0") + """`consolidation_attempt` counts how many times in a row the fleet needed consolidation. + It allows increasing delays between attempts. + """ + last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + + __table_args__ = ( + Index( + "ix_fleets_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + + +class InstanceModel(PipelineModelMixin, BaseModel): __tablename__ = "instances" id: Mapped[uuid.UUID] = mapped_column( @@ -331,73 +788,145 @@ class InstanceModel(BaseModel): ) name: Mapped[str] = mapped_column(String(50)) - # instance + instance_num: Mapped[int] = mapped_column(Integer, default=0) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + last_processed_at: Mapped[datetime] = mapped_column( + NaiveDateTime, default=get_current_datetime + ) + skip_min_processing_interval: Mapped[bool] = mapped_column( + Boolean, default=False, server_default=false() + ) deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) - pool_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("pools.id")) - pool: Mapped["PoolModel"] = relationship(back_populates="instances") + # TODO: drop `pool_id` after the release without pools. 
+ pool_id: Mapped[Optional[uuid.UUID]] = mapped_column( + ForeignKey("pools.id"), + deferred=True, # Not loaded so it can be deleted in the next releases + ) + pool: Mapped[Optional["PoolModel"]] = relationship(back_populates="instances") + + fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"), index=True) + fleet: Mapped[Optional["FleetModel"]] = relationship( + back_populates="instances", + foreign_keys=[fleet_id], + ) + """`fleet` can be `None` only for legacy instances created before fleets.""" + + compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id")) + compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances") - status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus)) + status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True) + """`status` must be changed only via `switch_instance_status()`.""" unreachable: Mapped[bool] = mapped_column(Boolean) - # VM started_at: Mapped[Optional[datetime]] = mapped_column( NaiveDateTime, default=get_current_datetime ) + """`started_at` is used only for VM instances.""" finished_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # create instance - # TODO: Introduce a field that would store all resolved instance profile parameters, etc, (similar to job_spec). - # Currently, profile parameters are parsed every time they are accessed (e.g. see profile.retry). + # TODO: introduce a field that stores all resolved instance profile parameters, similar to `job_spec`. profile: Mapped[Optional[str]] = mapped_column(Text) - requirements: Mapped[Optional[str]] = mapped_column(String(10_000)) + """`profile` stores raw profile data. Profile parameters are currently parsed every time they are + accessed, for example through `profile.retry`. 
+ """ + requirements: Mapped[Optional[str]] = mapped_column(Text) instance_configuration: Mapped[Optional[str]] = mapped_column(Text) - # temination policy - termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(50)) + termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100)) + # TODO: consider not assigning `DEFAULT_FLEET_TERMINATION_IDLE_TIME` here and making this optional. termination_idle_time: Mapped[int] = mapped_column( - Integer, default=DEFAULT_POOL_TERMINATION_IDLE_TIME + Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME ) + """`termination_idle_time` stores the idle timeout used for termination decisions.""" - # retry policy - last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True) + """`last_retry_at` is deprecated.""" - # instance termination handling termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - termination_reason: Mapped[Optional[str]] = mapped_column(String(4000)) - health_status: Mapped[Optional[str]] = mapped_column(String(4000)) + """`termination_deadline` is used for instance termination handling.""" + termination_reason: Mapped[Optional[InstanceTerminationReason]] = mapped_column( + EnumAsString( + InstanceTerminationReason, + 4000, + fallback_deserializer=InstanceTerminationReason.from_legacy_str, + ) + ) + """`termination_reason` may need legacy deserialization because dstack versions prior to 0.20.1 represented instance termination + reasons as raw strings. Such strings may still be stored in the database, so this uses a + wide column and a fallback deserializer to convert them to relevant enum members. 
+ """ + termination_reason_message: Mapped[Optional[str]] = mapped_column(String(4000)) + health_status: Mapped[Optional[str]] = mapped_column(String(4000), deferred=True) + """`health_status` is deprecated since 0.19.22 and is no longer used.""" + health: Mapped[HealthStatus] = mapped_column( + EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY + ) + first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # backend - backend: Mapped[Optional[BackendType]] = mapped_column(Enum(BackendType)) - backend_data: Mapped[Optional[str]] = mapped_column(String(4000)) + backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100)) + backend_data: Mapped[Optional[str]] = mapped_column(Text) - # offer - offer: Mapped[Optional[str]] = mapped_column(String(4000)) + offer: Mapped[Optional[str]] = mapped_column(Text) + """`offer` is not set for cloud fleets that have not started provisioning.""" region: Mapped[Optional[str]] = mapped_column(String(2000)) price: Mapped[Optional[float]] = mapped_column(Float) - job_provisioning_data: Mapped[Optional[str]] = mapped_column(String(4000)) + job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) + provisioning_job_id: Mapped[Optional[uuid.UUID]] = mapped_column( + UUIDType(binary=False), default=None + ) + """When set, records the job that triggered this instance's creation. + A PENDING instance with this field set is a placeholder managed by + `JobSubmittedPipeline` and is not touched by `InstancePipeline`. 
+ """ remote_connection_info: Mapped[Optional[str]] = mapped_column(Text) - # current job - job_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("jobs.id")) - job: Mapped[Optional["JobModel"]] = relationship(back_populates="instance", lazy="joined") + total_blocks: Mapped[Optional[int]] = mapped_column(Integer) + """`total_blocks` uses `NULL` to mean `auto` during provisioning; once ready it is not `NULL`.""" + busy_blocks: Mapped[int] = mapped_column(Integer, default=0) + + jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance") last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # volumes attached to the instance - volumes: Mapped[List["VolumeModel"]] = relationship( - secondary="volumes_attachments", - back_populates="instances", + volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship( + back_populates="instance", + cascade=CASCADE_DEFAULT_WITH_DELETE_ORPHAN, ) + __table_args__ = ( + Index( + "ix_instances_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + + +class InstanceHealthCheckModel(BaseModel): + __tablename__ = "instance_health_checks" -class VolumeModel(BaseModel): + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + + instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id")) + instance: Mapped["InstanceModel"] = relationship() + + collected_at: Mapped[datetime] = mapped_column(NaiveDateTime) + status: Mapped[HealthStatus] = mapped_column(EnumAsString(HealthStatus, 100)) + response: Mapped[str] = mapped_column(Text) + + +class VolumeModel(PipelineModelMixin, BaseModel): __tablename__ = "volumes" id: Mapped[uuid.UUID] = mapped_column( @@ -405,6 +934,9 @@ class VolumeModel(BaseModel): ) name: Mapped[str] = mapped_column(String(100)) + user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) + 
user: Mapped["UserModel"] = relationship() + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) @@ -412,26 +944,338 @@ class VolumeModel(BaseModel): last_processed_at: Mapped[datetime] = mapped_column( NaiveDateTime, default=get_current_datetime ) + last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`last_job_processed_at` records the last time the volume was used by a job. + Updated when a job terminates and used to delete volumes on `auto_cleanup_duration`. + """ deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) - status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus)) + status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) + """`status` must be changed only via `switch_volume_status()`.""" status_message: Mapped[Optional[str]] = mapped_column(Text) configuration: Mapped[str] = mapped_column(Text) volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) + auto_cleanup_enabled: Mapped[Optional[bool]] = mapped_column(Boolean) + """`auto_cleanup_enabled` is set for all new models, but old models may not have it.""" + + attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume") + volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text) + """`volume_attachment_data` is deprecated in favor of `VolumeAttachmentModel.attachment_data`.""" + + __table_args__ = ( + Index( + "ix_volumes_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) - # instances the volume is attached to - instances: Mapped[List["InstanceModel"]] = relationship( - secondary="volumes_attachments", - 
back_populates="volumes", + +class VolumeAttachmentModel(BaseModel): + __tablename__ = "volumes_attachments" + + volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True) + volume: Mapped[VolumeModel] = relationship(back_populates="attachments") + instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True) + instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments") + attachment_data: Mapped[Optional[str]] = mapped_column(Text) + + +class PlacementGroupModel(PipelineModelMixin, BaseModel): + __tablename__ = "placement_groups" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 ) + name: Mapped[str] = mapped_column(String(100)) + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) + project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) -volumes_attachments_table = Table( - "volumes_attachments", - BackendModel.metadata, - Column("volume_id", ForeignKey("volumes.id"), primary_key=True), - Column("instace_id", ForeignKey("instances.id"), primary_key=True), -) + fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id")) + fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id]) + # TODO: rename `fleet_deleted` to `to_be_deleted`. 
+ fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False) + + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + last_processed_at: Mapped[datetime] = mapped_column( + NaiveDateTime, default=get_current_datetime + ) + deleted: Mapped[bool] = mapped_column(Boolean, default=False) + deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + + configuration: Mapped[str] = mapped_column(Text) + provisioning_data: Mapped[Optional[str]] = mapped_column(Text) + + __table_args__ = ( + Index( + "ix_placement_groups_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + + +class ComputeGroupModel(PipelineModelMixin, BaseModel): + __tablename__ = "compute_groups" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) + project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) + + fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id")) + fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id]) + + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + status: Mapped[ComputeGroupStatus] = mapped_column(EnumAsString(ComputeGroupStatus, 100)) + last_processed_at: Mapped[datetime] = mapped_column( + NaiveDateTime, default=get_current_datetime + ) + deleted: Mapped[bool] = mapped_column(Boolean, default=False) + deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + + provisioning_data: Mapped[str] = mapped_column(Text) + + first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + + instances: Mapped[List["InstanceModel"]] = relationship(back_populates="compute_group") + + 
__table_args__ = ( + Index( + "ix_compute_groups_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=status.not_in(ComputeGroupStatus.finished_statuses()), + sqlite_where=status.not_in(ComputeGroupStatus.finished_statuses()), + ), + ) + + +class JobMetricsPoint(BaseModel): + __tablename__ = "job_metrics_points" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + + job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), index=True) + job: Mapped["JobModel"] = relationship() + + timestamp_micro: Mapped[int] = mapped_column(BigInteger) + cpu_usage_micro: Mapped[int] = mapped_column(BigInteger) + memory_usage_bytes: Mapped[int] = mapped_column(BigInteger) + memory_working_set_bytes: Mapped[int] = mapped_column(BigInteger) + + gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text) + """`gpus_memory_usage_bytes` stores a JSON-encoded list of metric values with length `len(gpus)`.""" + gpus_util_percent: Mapped[str] = mapped_column(Text) + """`gpus_util_percent` stores a JSON-encoded list of metric values with length `len(gpus)`.""" + + +class JobPrometheusMetrics(BaseModel): + __tablename__ = "job_prometheus_metrics" + + job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True) + job: Mapped["JobModel"] = relationship() + + collected_at: Mapped[datetime] = mapped_column(NaiveDateTime) + text: Mapped[str] = mapped_column(Text) + """`text` stores the raw Prometheus text response.""" + + +class ProbeModel(BaseModel): + __tablename__ = "probes" + __table_args__ = (UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + name: Mapped[str] = mapped_column(String(100)) + + job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True) + job: Mapped["JobModel"] = relationship(back_populates="probes") + + probe_num: 
Mapped[int] = mapped_column(Integer) + """`probe_num` is the index in `JobSpec.probes`.""" + due: Mapped[datetime] = mapped_column(NaiveDateTime) + success_streak: Mapped[int] = mapped_column(BigInteger) + active: Mapped[bool] = mapped_column(Boolean) + + +class SecretModel(BaseModel): + __tablename__ = "secrets" + __table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + + project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) + project: Mapped["ProjectModel"] = relationship() + + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + updated_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + + name: Mapped[str] = mapped_column(String(200)) + value: Mapped[DecryptedString] = mapped_column(EncryptedString()) + + +class EventModel(BaseModel): + __tablename__ = "events" + + id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), primary_key=True) + message: Mapped[str] = mapped_column(Text) + recorded_at: Mapped[datetime] = mapped_column(NaiveDateTime, index=True) + + actor_user_id: Mapped[Optional[uuid.UUID]] = mapped_column( + ForeignKey("users.id", ondelete="CASCADE"), nullable=True, index=True + ) + actor_user: Mapped[Optional["UserModel"]] = relationship() + + targets: Mapped[List["EventTargetModel"]] = relationship(back_populates="event") + + +class EventTargetModel(BaseModel): + __tablename__ = "event_targets" + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + + event_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("events.id", ondelete="CASCADE"), index=True + ) + event: Mapped["EventModel"] = relationship() + + entity_project_id: Mapped[Optional[uuid.UUID]] = mapped_column( + ForeignKey("projects.id", ondelete="CASCADE"), nullable=True, 
index=True + ) + entity_project: Mapped[Optional["ProjectModel"]] = relationship() + + entity_type: Mapped[EventTargetType] = mapped_column( + EnumAsString(EventTargetType, 100), index=True + ) + entity_id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), index=True) + entity_name: Mapped[str] = mapped_column(String(200)) + + +class ExportModel(BaseModel): + __tablename__ = "exports" + __table_args__ = (UniqueConstraint("project_id", "name", name="uq_exports_project_id_name"),) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + name: Mapped[str] = mapped_column(String(100)) + project_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("projects.id", ondelete="CASCADE"), index=True + ) + project: Mapped["ProjectModel"] = relationship() + is_global: Mapped[bool] = mapped_column(Boolean, default=False, server_default=false()) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + imports: Mapped[List["ImportModel"]] = relationship( + back_populates="export", + cascade=CASCADE_DEFAULT_WITH_DELETE_ORPHAN, + ) + exported_fleets: Mapped[List["ExportedFleetModel"]] = relationship( + back_populates="export", + cascade=CASCADE_DEFAULT_WITH_DELETE_ORPHAN, + ) + exported_gateways: Mapped[List["ExportedGatewayModel"]] = relationship( + back_populates="export", + cascade=CASCADE_DEFAULT_WITH_DELETE_ORPHAN, + ) + + +class ImportModel(BaseModel): + __tablename__ = "imports" + __table_args__ = ( + UniqueConstraint( + "project_id", + "export_id", + name="uq_imports_project_id_export_id", + ), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + project_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("projects.id", ondelete="CASCADE"), index=True + ) + project: Mapped["ProjectModel"] = relationship() + export_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("exports.id", ondelete="CASCADE"), index=True + ) + 
export: Mapped["ExportModel"] = relationship() + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + + +class ExportedFleetModel(BaseModel): + __tablename__ = "exported_fleets" + __table_args__ = ( + UniqueConstraint("export_id", "fleet_id", name="uq_exported_fleets_export_id_fleet_id"), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + export_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("exports.id", ondelete="CASCADE"), index=True + ) + export: Mapped["ExportModel"] = relationship() + fleet_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("fleets.id", ondelete="CASCADE"), index=True + ) + fleet: Mapped["FleetModel"] = relationship() + + +class ExportedGatewayModel(BaseModel): + __tablename__ = "exported_gateways" + __table_args__ = ( + UniqueConstraint( + "export_id", "gateway_id", name="uq_exported_gateways_export_id_gateway_id" + ), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + export_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("exports.id", ondelete="CASCADE"), index=True + ) + export: Mapped["ExportModel"] = relationship() + gateway_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("gateways.id", ondelete="CASCADE"), index=True + ) + gateway: Mapped["GatewayModel"] = relationship() + + +class UserPublicKeyModel(BaseModel): + __tablename__ = "user_public_keys" + __table_args__ = ( + UniqueConstraint("user_id", "fingerprint", name="uq_user_public_keys_user_id_fingerprint"), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) + user: Mapped["UserModel"] = relationship() + name: Mapped[str] = mapped_column(String(100)) + type: 
Mapped[str] = mapped_column(String(100)) + """`type` is a key type identifier used by OpenSSH, e.g., `ssh-rsa`, `ecdsa-sha2-nistp521`.""" + fingerprint: Mapped[str] = mapped_column(String(100)) + """`fingerprint` stores a key digest in the format used by OpenSSH: `SHA256:`.""" + key: Mapped[str] = mapped_column(Text) + """`key` stores a public key in the OpenSSH disk (ASCII-armored) format.""" diff --git a/src/dstack/_internal/server/routers/auth.py b/src/dstack/_internal/server/routers/auth.py new file mode 100644 index 0000000000..e44fb67f53 --- /dev/null +++ b/src/dstack/_internal/server/routers/auth.py @@ -0,0 +1,40 @@ +from fastapi import APIRouter + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.server.schemas.auth import ( + OAuthGetNextRedirectRequest, + OAuthGetNextRedirectResponse, +) +from dstack._internal.server.services import auth as auth_services +from dstack._internal.server.utils.routers import CustomORJSONResponse + +router = APIRouter(prefix="/api/auth", tags=["authentication"]) + + +@router.post( + "/list_providers", summary="List OAuth providers", response_model=list[OAuthProviderInfo] +) +async def list_providers(): + """ + Returns OAuth2 providers registered on the server. + """ + return CustomORJSONResponse(auth_services.list_providers()) + + +@router.post( + "/get_next_redirect", + summary="Get next redirect URL", + response_model=OAuthGetNextRedirectResponse, +) +async def get_next_redirect(body: OAuthGetNextRedirectRequest): + """ + A helper endpoint that returns the next redirect URL in case the state encodes it. + Can be used by the UI after the redirect from the provider + to determine if the user needs to be redirected further (CLI login) + or the auth callback endpoint needs to be called directly (UI login). 
+ """ + return CustomORJSONResponse( + OAuthGetNextRedirectResponse( + redirect_url=auth_services.get_next_redirect_url(code=body.code, state=body.state) + ) + ) diff --git a/src/dstack/_internal/server/routers/backends.py b/src/dstack/_internal/server/routers/backends.py index 91a6659c64..4fd84a4477 100644 --- a/src/dstack/_internal/server/routers/backends.py +++ b/src/dstack/_internal/server/routers/backends.py @@ -3,13 +3,12 @@ from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.errors import ResourceNotExistsError -from dstack._internal.core.models.backends import ( - AnyConfigInfoWithCreds, - AnyConfigInfoWithCredsPartial, - AnyConfigValues, +import dstack._internal.core.backends.configurators +from dstack._internal.core.backends.models import ( + AnyBackendConfigWithCreds, BackendInfoYAML, ) +from dstack._internal.core.errors import ResourceNotExistsError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.server import settings from dstack._internal.server.db import get_session @@ -19,85 +18,96 @@ DeleteBackendsRequest, UpdateBackendYAMLRequest, ) -from dstack._internal.server.security.permissions import Authenticated, ProjectAdmin +from dstack._internal.server.security.permissions import ProjectAdmin from dstack._internal.server.services import backends +from dstack._internal.server.services.backends import handlers as backends_handlers from dstack._internal.server.services.config import ( ServerConfigManager, create_backend_config_yaml, get_backend_config_yaml, update_backend_config_yaml, ) +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) -root_router = APIRouter(prefix="/api/backends", tags=["backends"]) -project_router = APIRouter(prefix="/api/project/{project_name}/backends", tags=["backends"]) - - -@root_router.post("/list_types") -async def list_backend_types() -> List[BackendType]: - 
return backends.list_available_backend_types() +root_router = APIRouter( + prefix="/api/backends", + tags=["backends"], + responses=get_base_api_additional_responses(), +) +project_router = APIRouter( + prefix="/api/project/{project_name}/backends", + tags=["backends"], + responses=get_base_api_additional_responses(), +) -@root_router.post("/config_values") -async def get_backend_config_values( - body: AnyConfigInfoWithCredsPartial, - user: UserModel = Depends(Authenticated()), -) -> AnyConfigValues: - return await backends.get_backend_config_values(config=body) +@root_router.post("/list_types", summary="List backend types", response_model=List[BackendType]) +async def list_backend_types(): + return CustomORJSONResponse( + dstack._internal.core.backends.configurators.list_available_backend_types() + ) -@project_router.post("/create") +@project_router.post("/create", summary="Create backend", response_model=AnyBackendConfigWithCreds) async def create_backend( - body: AnyConfigInfoWithCreds, + body: AnyBackendConfigWithCreds, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> AnyConfigInfoWithCreds: +): _, project = user_project config = await backends.create_backend(session=session, project=project, config=body) if settings.SERVER_CONFIG_ENABLED: await ServerConfigManager().sync_config(session=session) - return config + return CustomORJSONResponse(config) -@project_router.post("/update") +@project_router.post("/update", summary="Update backend", response_model=AnyBackendConfigWithCreds) async def update_backend( - body: AnyConfigInfoWithCreds, + body: AnyBackendConfigWithCreds, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> AnyConfigInfoWithCreds: +): _, project = user_project config = await backends.update_backend(session=session, project=project, config=body) if settings.SERVER_CONFIG_ENABLED: await 
ServerConfigManager().sync_config(session=session) - return config + return CustomORJSONResponse(config) -@project_router.post("/delete") +@project_router.post("/delete", summary="Delete backends") async def delete_backends( body: DeleteBackendsRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): _, project = user_project - await backends.delete_backends( - session=session, project=project, backends_types=body.backends_names + await backends_handlers.delete_backends_safe( + session=session, project=project, backends_types=body.backends_names, error=True ) if settings.SERVER_CONFIG_ENABLED: await ServerConfigManager().sync_config(session=session) -@project_router.post("/{backend_name}/config_info") +@project_router.post( + "/{backend_name}/config_info", + summary="Get backend config info", + response_model=AnyBackendConfigWithCreds, +) async def get_backend_config_info( backend_name: BackendType, user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> AnyConfigInfoWithCreds: +): _, project = user_project - config_info = await backends.get_config_info(project=project, backend_type=backend_name) - if config_info is None: + config = await backends.get_backend_config(project=project, backend_type=backend_name) + if config is None: raise ResourceNotExistsError() - return config_info + return CustomORJSONResponse(config) -@project_router.post("/create_yaml") +@project_router.post("/create_yaml", summary="Create backend YAML") async def create_backend_yaml( body: CreateBackendYAMLRequest, session: AsyncSession = Depends(get_session), @@ -111,7 +121,7 @@ async def create_backend_yaml( ) -@project_router.post("/update_yaml") +@project_router.post("/update_yaml", summary="Update backend YAML") async def update_backend_yaml( body: UpdateBackendYAMLRequest, session: AsyncSession = Depends(get_session), @@ -125,10 +135,14 @@ async def update_backend_yaml( ) 
-@project_router.post("/{backend_name}/get_yaml") +@project_router.post( + "/{backend_name}/get_yaml", summary="Get backend YAML", response_model=BackendInfoYAML +) async def get_backend_yaml( backend_name: BackendType, user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> BackendInfoYAML: +): _, project = user_project - return await get_backend_config_yaml(project=project, backend_type=backend_name) + return CustomORJSONResponse( + await get_backend_config_yaml(project=project, backend_type=backend_name) + ) diff --git a/src/dstack/_internal/server/routers/events.py b/src/dstack/_internal/server/routers/events.py new file mode 100644 index 0000000000..1d6d80b671 --- /dev/null +++ b/src/dstack/_internal/server/routers/events.py @@ -0,0 +1,64 @@ +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +import dstack._internal.server.services.events as events_services +from dstack._internal.core.models.events import Event +from dstack._internal.server.db import get_session +from dstack._internal.server.models import UserModel +from dstack._internal.server.schemas.events import ListEventsRequest +from dstack._internal.server.security.permissions import Authenticated +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) + +root_router = APIRouter( + prefix="/api/events", + tags=["events"], + responses=get_base_api_additional_responses(), +) + + +@root_router.post("/list", summary="List events", response_model=list[Event]) +async def list_events( + body: ListEventsRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + """ + Returns events visible to the current user. + + Regular users can see events related to themselves and to projects they are members of. + Global admins can see all events. + + The results are paginated. 
To get the next page, pass `recorded_at` and `id` of + the last event from the previous page as `prev_recorded_at` and `prev_id`. + + NOTE: Some events may become available in the API with a delay after their `recorded_at`. + This should be taken into account when using the API to monitor recent events, + so that delayed events are not missed during pagination. + """ + return CustomORJSONResponse( + await events_services.list_events( + session=session, + user=user, + target_projects=body.target_projects, + target_users=body.target_users, + target_fleets=body.target_fleets, + target_instances=body.target_instances, + target_runs=body.target_runs, + target_jobs=body.target_jobs, + target_volumes=body.target_volumes, + target_gateways=body.target_gateways, + target_secrets=body.target_secrets, + within_projects=body.within_projects, + within_fleets=body.within_fleets, + within_runs=body.within_runs, + include_target_types=body.include_target_types, + actors=body.actors, + prev_recorded_at=body.prev_recorded_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) diff --git a/src/dstack/_internal/server/routers/exports.py b/src/dstack/_internal/server/routers/exports.py new file mode 100644 index 0000000000..710fe17ce2 --- /dev/null +++ b/src/dstack/_internal/server/routers/exports.py @@ -0,0 +1,90 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.exports import Export +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.exports import ( + CreateExportRequest, + DeleteExportRequest, + UpdateExportRequest, +) +from dstack._internal.server.security.permissions import ProjectAdmin, ProjectMember +from dstack._internal.server.services import exports as exports_services +from dstack._internal.server.utils.routers import 
get_base_api_additional_responses + +project_router = APIRouter( + prefix="/api/project/{project_name}/exports", + tags=["exports"], + responses=get_base_api_additional_responses(), +) + + +@project_router.post("/create", summary="Create export", response_model=Export) +async def create_export( + body: CreateExportRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectAdmin())], +): + user, project = user_project + return await exports_services.create_export( + session=session, + project=project, + user=user, + name=body.name, + is_global=body.is_global, + importer_project_names=body.importer_projects, + exported_fleet_names=body.exported_fleets, + exported_gateway_names=body.exported_gateways, + ) + + +@project_router.post("/update", summary="Update export", response_model=Export) +async def update_export( + body: UpdateExportRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectAdmin())], +): + user, project = user_project + return await exports_services.update_export( + session=session, + project=project, + user=user, + name=body.name, + set_global=body.set_global, + unset_global=body.unset_global, + add_importer_project_names=body.add_importer_projects, + remove_importer_project_names=body.remove_importer_projects, + add_exported_fleet_names=body.add_exported_fleets, + remove_exported_fleet_names=body.remove_exported_fleets, + add_exported_gateway_names=body.add_exported_gateways, + remove_exported_gateway_names=body.remove_exported_gateways, + ) + + +@project_router.post("/delete", summary="Delete export") +async def delete_export( + body: DeleteExportRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectAdmin())], +): + _, project = user_project + await exports_services.delete_export( + session=session, + 
project=project, + name=body.name, + ) + + +@project_router.post("/list", summary="List exports", response_model=list[Export]) +async def list_exports( + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], +): + _, project = user_project + return await exports_services.list_exports( + session=session, + project=project, + ) diff --git a/src/dstack/_internal/server/routers/files.py b/src/dstack/_internal/server/routers/files.py new file mode 100644 index 0000000000..5456d3d5d5 --- /dev/null +++ b/src/dstack/_internal/server/routers/files.py @@ -0,0 +1,70 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends, Request, UploadFile +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError +from dstack._internal.core.models.files import FileArchive +from dstack._internal.server.db import get_session +from dstack._internal.server.models import UserModel +from dstack._internal.server.schemas.files import GetFileArchiveByHashRequest +from dstack._internal.server.security.permissions import Authenticated +from dstack._internal.server.services import files +from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, + get_request_size, +) +from dstack._internal.utils.common import sizeof_fmt + +router = APIRouter( + prefix="/api/files", + tags=["files"], + responses=get_base_api_additional_responses(), +) + + +@router.post( + "/get_archive_by_hash", summary="Get file archive by hash", response_model=FileArchive +) +async def get_archive_by_hash( + body: GetFileArchiveByHashRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], +): + archive = await files.get_archive_by_hash( + session=session, + 
user=user, + hash=body.hash, + ) + if archive is None: + raise ResourceNotExistsError() + return CustomORJSONResponse(archive) + + +@router.post("/upload_archive", summary="Upload file archive", response_model=FileArchive) +async def upload_archive( + request: Request, + file: UploadFile, + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], +): + request_size = get_request_size(request) + if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT: + diff_size_fmt = sizeof_fmt(request_size) + limit_fmt = sizeof_fmt(SERVER_CODE_UPLOAD_LIMIT) + if diff_size_fmt == limit_fmt: + diff_size_fmt = f"{request_size}B" + limit_fmt = f"{SERVER_CODE_UPLOAD_LIMIT}B" + raise ServerClientError( + f"Archive size is {diff_size_fmt}, which exceeds the limit of {limit_fmt}." + " Use .gitignore/.dstackignore to exclude large files." + " This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable." 
+ ) + archive = await files.upload_archive( + session=session, + user=user, + file=file, + ) + return CustomORJSONResponse(archive) diff --git a/src/dstack/_internal/server/routers/fleets.py b/src/dstack/_internal/server/routers/fleets.py new file mode 100644 index 0000000000..b4131ee022 --- /dev/null +++ b/src/dstack/_internal/server/routers/fleets.py @@ -0,0 +1,236 @@ +from typing import List, Optional, Tuple + +from fastapi import APIRouter, Depends +from packaging.version import Version +from sqlalchemy.ext.asyncio import AsyncSession + +import dstack._internal.server.services.fleets as fleets_services +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.fleets import Fleet, FleetPlan +from dstack._internal.server.compatibility.fleets import patch_fleet, patch_fleet_plan +from dstack._internal.server.db import get_session +from dstack._internal.server.deps import Project +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.fleets import ( + ApplyFleetPlanRequest, + CreateFleetRequest, + DeleteFleetInstancesRequest, + DeleteFleetsRequest, + GetFleetPlanRequest, + GetFleetRequest, + ListFleetsRequest, + ListProjectFleetsRequest, +) +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectMember, + check_can_access_fleet, +) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, + get_client_version, +) + +root_router = APIRouter( + prefix="/api/fleets", + tags=["fleets"], + responses=get_base_api_additional_responses(), +) +project_router = APIRouter( + prefix="/api/project/{project_name}/fleets", + tags=["fleets"], + responses=get_base_api_additional_responses(), +) + + +@root_router.post("/list", summary="List fleets", response_model=List[Fleet]) +async def list_fleets( + body: 
ListFleetsRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Returns all fleets and instances within them visible to user sorted by descending `created_at`. + `project_name` and `only_active` can be specified as filters. + Includes only active fleet instances. To list all fleet instances, use `/api/instances/list`. + + The results are paginated. To get the next page, pass `created_at` and `id` of + the last fleet from the previous page as `prev_created_at` and `prev_id`. + """ + fleet_list = await fleets_services.list_fleets( + session=session, + user=user, + project_name=body.project_name, + only_active=body.only_active, + include_imported=body.include_imported, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + for fleet in fleet_list: + patch_fleet(fleet, client_version) + return CustomORJSONResponse(fleet_list) + + +@project_router.post("/list", summary="List project fleets", response_model=List[Fleet]) +async def list_project_fleets( + body: Optional[ListProjectFleetsRequest] = None, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Returns all fleets in the project. + Includes only active fleet instances. To list all fleet instances, use `/api/instances/list`. 
+ """ + _, project = user_project + if body is None: + body = ListProjectFleetsRequest() + fleet_list = await fleets_services.list_project_fleets( + session=session, + project=project, + include_imported=body.include_imported, + ) + for fleet in fleet_list: + patch_fleet(fleet, client_version) + return CustomORJSONResponse(fleet_list) + + +@project_router.post("/get", summary="Get fleet", response_model=Fleet) +async def get_fleet( + body: GetFleetRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), + project: ProjectModel = Depends(Project()), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Returns a fleet given `name` or `id`. + If given `name`, does not return deleted fleets. + If given `id`, returns deleted fleets. + """ + await check_can_access_fleet( + session=session, user=user, fleet_project=project, fleet_name_or_id=body.get_name_or_id() + ) + fleet = await fleets_services.get_fleet( + session=session, project=project, name_or_id=body.get_name_or_id() + ) + if fleet is None: + raise ResourceNotExistsError() + patch_fleet(fleet, client_version) + return CustomORJSONResponse(fleet) + + +@project_router.post("/get_plan", summary="Get fleet plan", response_model=FleetPlan) +async def get_plan( + body: GetFleetPlanRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Returns a fleet plan for the given fleet configuration. 
+ """ + user, project = user_project + plan = await fleets_services.get_plan( + session=session, + project=project, + user=user, + spec=body.spec, + ) + patch_fleet_plan(plan, client_version) + return CustomORJSONResponse(plan) + + +@project_router.post("/apply", summary="Apply fleet plan", response_model=Fleet) +async def apply_plan( + body: ApplyFleetPlanRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Creates a new fleet or updates an existing fleet. + Errors if the expected current resource from the plan does not match the current resource. + Use `force: true` to apply even if the current resource does not match. + """ + user, project = user_project + fleet = await fleets_services.apply_plan( + session=session, + user=user, + project=project, + plan=body.plan, + force=body.force, + pipeline_hinter=pipeline_hinter, + ) + patch_fleet(fleet, client_version) + return CustomORJSONResponse(fleet) + + +@project_router.post("/create", summary="Create fleet", response_model=Fleet, deprecated=True) +async def create_fleet( + body: CreateFleetRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Creates a fleet given a fleet configuration. 
+ """ + user, project = user_project + fleet = await fleets_services.create_fleet( + session=session, + project=project, + user=user, + spec=body.spec, + pipeline_hinter=pipeline_hinter, + ) + patch_fleet(fleet, client_version) + return CustomORJSONResponse(fleet) + + +@project_router.post("/delete", summary="Delete fleets") +async def delete_fleets( + body: DeleteFleetsRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), +): + """ + Deletes one or more fleets. + """ + user, project = user_project + await fleets_services.delete_fleets( + session=session, + project=project, + user=user, + names=body.names, + pipeline_hinter=pipeline_hinter, + ) + + +@project_router.post("/delete_instances", summary="Delete fleet instances") +async def delete_fleet_instances( + body: DeleteFleetInstancesRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), +): + """ + Deletes one or more instances within the fleet. 
+ """ + user, project = user_project + await fleets_services.delete_fleets( + session=session, + project=project, + user=user, + names=[body.name], + instance_nums=body.instance_nums, + pipeline_hinter=pipeline_hinter, + ) diff --git a/src/dstack/_internal/server/routers/gateways.py b/src/dstack/_internal/server/routers/gateways.py index 4215fa90a2..6b9a6718dd 100644 --- a/src/dstack/_internal/server/routers/gateways.py +++ b/src/dstack/_internal/server/routers/gateways.py @@ -1,82 +1,140 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple from fastapi import APIRouter, Depends +from packaging.version import Version from sqlalchemy.ext.asyncio import AsyncSession import dstack._internal.core.models.gateways as models import dstack._internal.server.schemas.gateways as schemas import dstack._internal.server.services.gateways as gateways from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.common import EntityReference +from dstack._internal.server.compatibility.gateways import patch_gateway from dstack._internal.server.db import get_session +from dstack._internal.server.deps import Project from dstack._internal.server.models import ProjectModel, UserModel -from dstack._internal.server.security.permissions import ProjectAdmin, ProjectMember +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectAdmin, + ProjectMemberOrPublicAccess, + check_can_access_gateway, +) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, + get_client_version, +) -router = APIRouter(prefix="/api/project/{project_name}/gateways", tags=["gateways"]) +router = APIRouter( + prefix="/api/project/{project_name}/gateways", + tags=["gateways"], + responses=get_base_api_additional_responses(), +) -@router.post("/list") +@router.post("/list", 
summary="List gateways", response_model=List[models.Gateway]) async def list_gateways( + body: Optional[schemas.ListGatewaysRequest] = None, session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> List[models.Gateway]: + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()), + client_version: Optional[Version] = Depends(get_client_version), +): _, project = user_project - return await gateways.list_project_gateways(session=session, project=project) + if body is None: + body = schemas.ListGatewaysRequest() + gateway_list = await gateways.list_project_gateways( + session=session, + project=project, + include_imported=body.include_imported, + ) + for gateway in gateway_list: + patch_gateway(gateway, client_version) + return CustomORJSONResponse(gateway_list) -@router.post("/get") +@router.post("/get", summary="Get gateway", response_model=models.Gateway) async def get_gateway( body: schemas.GetGatewayRequest, session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> models.Gateway: - _, project = user_project + user: UserModel = Depends(Authenticated()), + project: ProjectModel = Depends(Project()), + client_version: Optional[Version] = Depends(get_client_version), +): + await check_can_access_gateway( + session=session, user=user, gateway_project=project, gateway_name=body.name + ) gateway = await gateways.get_gateway_by_name(session=session, project=project, name=body.name) if gateway is None: raise ResourceNotExistsError() - return gateway + patch_gateway(gateway, client_version) + return CustomORJSONResponse(gateway) -@router.post("/create") +@router.post("/create", summary="Create gateway", response_model=models.Gateway) async def create_gateway( body: schemas.CreateGatewayRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = 
Depends(ProjectAdmin()), -) -> models.Gateway: - _, project = user_project - return await gateways.create_gateway( + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), + client_version: Optional[Version] = Depends(get_client_version), +): + user, project = user_project + gateway = await gateways.create_gateway( session=session, + user=user, project=project, configuration=body.configuration, + pipeline_hinter=pipeline_hinter, ) + patch_gateway(gateway, client_version) + return CustomORJSONResponse(gateway) -@router.post("/delete") +@router.post("/delete", summary="Delete gateways") async def delete_gateways( body: schemas.DeleteGatewaysRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project - await gateways.delete_gateways(session=session, project=project, gateways_names=body.names) + user, project = user_project + await gateways.delete_gateways( + session=session, + project=project, + gateways_names=body.names, + user=user, + ) -@router.post("/set_default") +@router.post("/set_default", summary="Set default gateway") async def set_default_gateway( body: schemas.SetDefaultGatewayRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project - await gateways.set_default_gateway(session=session, project=project, name=body.name) + user, project = user_project + await gateways.set_default_gateway( + session=session, + project=project, + ref=EntityReference(name=body.name, project=body.gateway_project), + user=user, + ) -@router.post("/set_wildcard_domain") +@router.post("/set_wildcard_domain", summary="Set wildcard domain", response_model=models.Gateway) async def set_gateway_wildcard_domain( body: schemas.SetWildcardDomainRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> 
models.Gateway: - _, project = user_project - return await gateways.set_gateway_wildcard_domain( - session=session, project=project, name=body.name, wildcard_domain=body.wildcard_domain + client_version: Optional[Version] = Depends(get_client_version), +): + user, project = user_project + gateway = await gateways.set_gateway_wildcard_domain( + session=session, + project=project, + name=body.name, + wildcard_domain=body.wildcard_domain, + user=user, ) + patch_gateway(gateway, client_version) + return CustomORJSONResponse(gateway) diff --git a/src/dstack/_internal/server/routers/gpus.py b/src/dstack/_internal/server/routers/gpus.py new file mode 100644 index 0000000000..7c244e327e --- /dev/null +++ b/src/dstack/_internal/server/routers/gpus.py @@ -0,0 +1,42 @@ +from typing import Annotated, Optional, Tuple + +from fastapi import APIRouter, Depends +from packaging.version import Version +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.compatibility.gpus import patch_list_gpus_response +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse +from dstack._internal.server.security.permissions import ProjectMember +from dstack._internal.server.services.gpus import list_gpus_grouped +from dstack._internal.server.utils.routers import ( + get_base_api_additional_responses, + get_client_version, +) + +project_router = APIRouter( + prefix="/api/project/{project_name}/gpus", + tags=["gpus"], + responses=get_base_api_additional_responses(), +) + + +@project_router.post( + "/list", summary="List GPUs", response_model=ListGpusResponse, response_model_exclude_none=True +) +async def list_gpus( + body: ListGpusRequest, + session: Annotated[AsyncSession, Depends(get_session)], + client_version: Annotated[Optional[Version], Depends(get_client_version)], + user_project: Tuple[UserModel, ProjectModel] = 
Depends(ProjectMember()), +) -> ListGpusResponse: + _, project = user_project + resp = await list_gpus_grouped( + session=session, + project=project, + run_spec=body.run_spec, + group_by=body.group_by, + ) + patch_list_gpus_response(resp, client_version) + return resp diff --git a/src/dstack/_internal/server/routers/imports.py b/src/dstack/_internal/server/routers/imports.py new file mode 100644 index 0000000000..fd64eb0357 --- /dev/null +++ b/src/dstack/_internal/server/routers/imports.py @@ -0,0 +1,45 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.imports import Import +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.imports import DeleteImportRequest +from dstack._internal.server.security.permissions import ProjectAdmin, ProjectMember +from dstack._internal.server.services import imports as imports_services +from dstack._internal.server.utils.routers import get_base_api_additional_responses + +project_router = APIRouter( + prefix="/api/project/{project_name}/imports", + tags=["exports"], + responses=get_base_api_additional_responses(), +) + + +@project_router.post("/delete", summary="Delete import") +async def delete_import( + body: DeleteImportRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectAdmin())], +): + _, project = user_project + await imports_services.delete_import( + session=session, + project=project, + export_name=body.export_name, + export_project_name=body.export_project_name, + ) + + +@project_router.post("/list", summary="List imports", response_model=list[Import]) +async def list_imports( + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], +): + _, 
project = user_project + return await imports_services.list_imports( + session=session, + project=project, + ) diff --git a/src/dstack/_internal/server/routers/instances.py b/src/dstack/_internal/server/routers/instances.py new file mode 100644 index 0000000000..4b96d422c2 --- /dev/null +++ b/src/dstack/_internal/server/routers/instances.py @@ -0,0 +1,106 @@ +from typing import Annotated, List + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +import dstack._internal.server.services.instances as instances_services +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.instances import Instance +from dstack._internal.server.db import get_session +from dstack._internal.server.deps import Project +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.instances import ( + GetInstanceHealthChecksRequest, + GetInstanceHealthChecksResponse, + GetInstanceRequest, + ListInstancesRequest, +) +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectMember, + check_can_access_instance, +) +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) + +root_router = APIRouter( + prefix="/api/instances", + tags=["fleets"], + responses=get_base_api_additional_responses(), +) +project_router = APIRouter( + prefix="/api/project/{project_name}/instances", + tags=["fleets"], + responses=get_base_api_additional_responses(), +) + + +@root_router.post("/list", response_model=List[Instance]) +async def list_instances( + body: ListInstancesRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + """ + Returns all instances visible to user sorted by descending `created_at`. + `project_names` and `fleet_ids` can be specified as filters. + + The results are paginated. 
To get the next page, pass `created_at` and `id` of + the last instance from the previous page as `prev_created_at` and `prev_id`. + """ + return CustomORJSONResponse( + await instances_services.list_user_instances( + session=session, + user=user, + project_names=body.project_names, + fleet_ids=body.fleet_ids, + only_active=body.only_active, + include_imported=body.include_imported, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) + + +@project_router.post("/get_instance_health_checks", response_model=GetInstanceHealthChecksResponse) +async def get_instance_health_checks( + body: GetInstanceHealthChecksRequest, + session: AsyncSession = Depends(get_session), + user_project: tuple[UserModel, ProjectModel] = Depends(ProjectMember()), +): + _, project = user_project + health_checks = await instances_services.get_instance_health_checks( + session=session, + project=project, + fleet_name=body.fleet_name, + instance_num=body.instance_num, + after=body.after, + before=body.before, + limit=body.limit, + ) + return CustomORJSONResponse(GetInstanceHealthChecksResponse(health_checks=health_checks)) + + +@project_router.post("/get", response_model=Instance) +async def get_instance( + body: GetInstanceRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], + project: Annotated[ProjectModel, Depends(Project())], +): + """ + Returns an instance given its ID. 
+ """ + await check_can_access_instance( + session=session, user=user, instance_project=project, instance_id=body.id + ) + instance = await instances_services.get_instance( + session=session, project=project, instance_id=body.id + ) + if instance is None: + raise ResourceNotExistsError() + return CustomORJSONResponse(instance) diff --git a/src/dstack/_internal/server/routers/logs.py b/src/dstack/_internal/server/routers/logs.py index 1410554751..540c4ec125 100644 --- a/src/dstack/_internal/server/routers/logs.py +++ b/src/dstack/_internal/server/routers/logs.py @@ -7,19 +7,29 @@ from dstack._internal.server.schemas.logs import PollLogsRequest from dstack._internal.server.security.permissions import ProjectMember from dstack._internal.server.services import logs +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) router = APIRouter( prefix="/api/project/{project_name}/logs", tags=["logs"], + responses=get_base_api_additional_responses(), ) @router.post( "/poll", + summary="Poll logs", + response_model=JobSubmissionLogs, ) async def poll_logs( body: PollLogsRequest, user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> JobSubmissionLogs: +): _, project = user_project - return logs.poll_logs(project=project, request=body) + # The runner guarantees logs have different timestamps if throughput < 1k logs / sec. + # Otherwise, some logs with duplicated timestamps may be filtered out. + # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution. 
+ return CustomORJSONResponse(await logs.poll_logs_async(project=project, request=body)) diff --git a/src/dstack/_internal/server/routers/metrics.py b/src/dstack/_internal/server/routers/metrics.py new file mode 100644 index 0000000000..14d0eb6fb5 --- /dev/null +++ b/src/dstack/_internal/server/routers/metrics.py @@ -0,0 +1,83 @@ +from datetime import datetime +from typing import Optional, Tuple +from uuid import UUID + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.metrics import JobMetrics +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.security.permissions import ProjectMember +from dstack._internal.server.services import metrics +from dstack._internal.server.services.jobs import get_run_job_model +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) + +router = APIRouter( + prefix="/api/project/{project_name}/metrics", + tags=["metrics"], + responses=get_base_api_additional_responses(), +) + + +@router.get( + "/job/{run_name}", + summary="Get metrics", + response_model=JobMetrics, +) +async def get_job_metrics( + run_name: str, + run_id: Optional[UUID] = None, + replica_num: int = 0, + job_num: int = 0, + limit: int = 1, + after: Optional[datetime] = None, + before: Optional[datetime] = None, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), +): + """ + Returns job-level metrics such as hardware utilization + given `run_name`, `run_id`, `replica_num`, and `job_num`. + If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)` + of the latest run with the given name. + By default, returns one latest sample. 
To control time window/number of samples, use + `limit`, `after`, `before`. + + Supported metrics (all optional): + * `cpus_detected_num` + * `cpu_usage_percent` + * `memory_total_bytes` + * `memory_usage_bytes` + * `memory_working_set_bytes` + * `gpus_detected_num` + * `gpu_memory_total_bytes` + * `gpu_memory_usage_bytes_gpu{i}` + * `gpu_util_percent_gpu{i}` + """ + _, project = user_project + + job_model = await get_run_job_model( + session=session, + project=project, + run_name=run_name, + run_id=run_id, + replica_num=replica_num, + job_num=job_num, + ) + if job_model is None: + raise ResourceNotExistsError("Found no job with given parameters") + + return CustomORJSONResponse( + await metrics.get_job_metrics( + session=session, + job_model=job_model, + limit=limit, + after=after, + before=before, + ) + ) diff --git a/src/dstack/_internal/server/routers/pools.py b/src/dstack/_internal/server/routers/pools.py deleted file mode 100644 index 55fcb9b612..0000000000 --- a/src/dstack/_internal/server/routers/pools.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import List, Tuple - -from fastapi import APIRouter, Depends -from sqlalchemy.ext.asyncio import AsyncSession - -import dstack._internal.core.models.pools as models -import dstack._internal.server.schemas.pools as schemas -import dstack._internal.server.services.pools as pools -from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.models.pools import Instance -from dstack._internal.server.db import get_session -from dstack._internal.server.models import ProjectModel, UserModel -from dstack._internal.server.schemas.pools import ListPoolsRequest -from dstack._internal.server.schemas.runs import AddRemoteInstanceRequest -from dstack._internal.server.security.permissions import Authenticated, ProjectMember - -root_router = APIRouter(prefix="/api/pools", tags=["pool"]) -router = APIRouter(prefix="/api/project/{project_name}/pool", tags=["pool"]) - - 
-@root_router.post("/list_instances") -async def list_pool_instances( - body: ListPoolsRequest, - session: AsyncSession = Depends(get_session), - user: UserModel = Depends(Authenticated()), -) -> List[Instance]: - """ - Returns all instances visible to user sorted by descending created_at. - A **project_name** and **pool_name** can be specified as filters. - - The results are paginated. To get the next page, pass created_at and id of - the last run from the previous page as **prev_created_at** and **prev_id**. - """ - return await pools.list_user_pool_instances( - session=session, - user=user, - project_name=body.project_name, - pool_name=body.pool_name, - only_active=body.only_active, - prev_created_at=body.prev_created_at, - prev_id=body.prev_id, - limit=body.limit, - ascending=body.ascending, - ) - - -@router.post("/list") -async def list_pool( - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> List[models.Pool]: - _, project = user_project - return await pools.list_project_pools(session=session, project=project) - - -@router.post("/create") -async def create_pool( - body: schemas.CreatePoolRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> None: - _, project = user_project - await pools.create_pool(session=session, project=project, name=body.name) - - -@router.post("/set_default") -async def set_default_pool( - body: schemas.SetDefaultPoolRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -): - _, project_model = user_project - await pools.set_default_pool(session, project_model, body.pool_name) - - -@router.post("/delete") -async def delete_pool( - body: schemas.DeletePoolRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> None: - _, 
project = user_project - await pools.delete_pool(session, project, body.name) - - -@router.post("/show") -async def show_pool( - body: schemas.ShowPoolRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> models.PoolInstances: - _, project = user_project - return await pools.show_pool_instances(session, project, pool_name=body.name) - - -@router.post("/remove") -async def remove_instance( - body: schemas.RemoveInstanceRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> None: - _, project_model = user_project - await pools.remove_instance( - session, project_model, body.pool_name, body.instance_name, body.force - ) - - -@router.post("/add_remote") -async def add_instance( - body: AddRemoteInstanceRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Instance: - if not body.host.strip() or not body.ssh_user.strip() or not body.ssh_keys: - raise ConfigurationError("Host, user or ssh keys are empty") - - _, project = user_project - result = await pools.add_remote( - session, - project=project, - pool_name=body.pool_name, - instance_name=body.instance_name, - instance_network=body.instance_network, - region=body.region, - host=body.host, - port=body.port or 22, - ssh_user=body.ssh_user, - ssh_keys=body.ssh_keys, - ) - return result diff --git a/src/dstack/_internal/server/routers/projects.py b/src/dstack/_internal/server/routers/projects.py index f946c521f3..f7d9098dfc 100644 --- a/src/dstack/_internal/server/routers/projects.py +++ b/src/dstack/_internal/server/routers/projects.py @@ -1,44 +1,114 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.projects import Project 
+from dstack._internal.core.models.projects import Project, ProjectsInfoListOrProjectsList from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.projects import ( + AddProjectMemberRequest, CreateProjectRequest, DeleteProjectsRequest, + ListProjectsRequest, + RemoveProjectMemberRequest, SetProjectMembersRequest, + UpdateProjectRequest, +) +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectAdmin, + ProjectManager, + ProjectManagerOrPublicProject, + ProjectManagerOrSelfLeave, + ProjectMemberOrPublicAccess, +) +from dstack._internal.server.services import fleets, projects +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, ) -from dstack._internal.server.security.permissions import Authenticated, ProjectAdmin, ProjectMember -from dstack._internal.server.services import projects -router = APIRouter(prefix="/api/projects", tags=["projects"]) +router = APIRouter( + prefix="/api/projects", + tags=["projects"], + responses=get_base_api_additional_responses(), +) -@router.post("/list") +@router.post("/list", summary="List projects", response_model=ProjectsInfoListOrProjectsList) async def list_projects( + body: Optional[ListProjectsRequest] = None, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> List[Project]: - return await projects.list_user_projects(session=session, user=user) +): + """ + Returns projects visible to the user. + + Returns all accessible projects (member projects for regular users, all non-deleted + projects for global admins, plus public projects if `include_not_joined` is `True`). + + `members` and `backends` are always empty - call `/api/projects/{project_name}/get` to retrieve them. 
+ """ + if body is None: + # For backward compatibility + body = ListProjectsRequest() + return CustomORJSONResponse( + await projects.list_user_accessible_projects( + session=session, + user=user, + include_not_joined=body.include_not_joined, + return_total_count=body.return_total_count, + name_pattern=body.name_pattern, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) + + +@router.post( + "/list_only_no_fleets", + summary="List projects with no active fleets", + response_model=List[Project], +) +async def list_only_no_fleets( + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + """ + Returns only projects where the user is a member and that have no active fleets, + neither owned nor imported, sorted by ascending `created_at`. + + Active fleets are those with `deleted == False`. Projects with deleted fleets + (but no active fleets) are included. + + `members` and `backends` are always empty - call `/api/projects/{project_name}/get` to retrieve them. 
+ """ + return CustomORJSONResponse( + await fleets.list_projects_with_no_active_fleets(session=session, user=user) + ) -@router.post("/create") +@router.post("/create", summary="Create project", response_model=Project) async def create_project( body: CreateProjectRequest, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> Project: - return await projects.create_project( - session=session, - user=user, - project_name=body.project_name, +): + return CustomORJSONResponse( + await projects.create_project( + session=session, + user=user, + project_name=body.project_name, + is_public=body.is_public, + templates_repo=body.templates_repo, + ) ) -@router.post("/delete") +@router.post("/delete", summary="Delete projects") async def delete_projects( body: DeleteProjectsRequest, session: AsyncSession = Depends(get_session), @@ -51,28 +121,96 @@ async def delete_projects( ) -@router.post("/{project_name}/get") +@router.post("/{project_name}/get", summary="Get project", response_model=Project) async def get_project( session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Project: + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()), +): _, project = user_project - return projects.project_model_to_project(project) + return CustomORJSONResponse(projects.project_model_to_project(project)) @router.post( "/{project_name}/set_members", + summary="Set members", + response_model=Project, ) async def set_project_members( body: SetProjectMembersRequest, session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), -) -> Project: - _, project = user_project + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManager()), +): + user, project = user_project await projects.set_project_members( session=session, + user=user, + project=project, + members=body.members, + ) + 
await session.refresh(project) + return CustomORJSONResponse(projects.project_model_to_project(project)) + + +@router.post( + "/{project_name}/add_members", + summary="Add members", + response_model=Project, +) +async def add_project_members( + body: AddProjectMemberRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrPublicProject()), +): + user, project = user_project + await projects.add_project_members( + session=session, + user=user, project=project, members=body.members, ) await session.refresh(project) - return projects.project_model_to_project(project) + return CustomORJSONResponse(projects.project_model_to_project(project)) + + +@router.post( + "/{project_name}/remove_members", + summary="Remove members", + response_model=Project, +) +async def remove_project_members( + body: RemoveProjectMemberRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrSelfLeave()), +): + user, project = user_project + await projects.remove_project_members( + session=session, + user=user, + project=project, + usernames=body.usernames, + ) + await session.refresh(project) + return CustomORJSONResponse(projects.project_model_to_project(project)) + + +@router.post( + "/{project_name}/update", + summary="Update project", + response_model=Project, +) +async def update_project( + body: UpdateProjectRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), +): + user, project = user_project + await projects.update_project( + session=session, + user=user, + project=project, + is_public=body.is_public, + templates_repo=body.templates_repo, + reset_templates_repo=body.reset_templates_repo, + ) + await session.refresh(project) + return CustomORJSONResponse(projects.project_model_to_project(project)) diff --git a/src/dstack/_internal/server/routers/prometheus.py 
b/src/dstack/_internal/server/routers/prometheus.py new file mode 100644 index 0000000000..347ea947a8 --- /dev/null +++ b/src/dstack/_internal/server/routers/prometheus.py @@ -0,0 +1,35 @@ +import os +from typing import Annotated + +import prometheus_client +from fastapi import APIRouter, Depends +from fastapi.responses import PlainTextResponse +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server import settings +from dstack._internal.server.db import get_session +from dstack._internal.server.security.permissions import OptionalServiceAccount +from dstack._internal.server.services.prometheus import custom_metrics +from dstack._internal.server.utils.routers import error_not_found + +_auth = OptionalServiceAccount(os.getenv("DSTACK_PROMETHEUS_AUTH_TOKEN")) + +router = APIRouter( + tags=["metrics"], + default_response_class=PlainTextResponse, + dependencies=[Depends(_auth)], +) + + +@router.get("/metrics", summary="Get Prometheus metrics") +async def get_prometheus_metrics( + session: Annotated[AsyncSession, Depends(get_session)], +) -> str: + # Note: Prometheus warns against storing high cardinality values in labels, + # yet both client and custom metrics have labels like project, run, fleet, etc. + # This may require a very big Prometheus server with lots of storage. 
+ if not settings.ENABLE_PROMETHEUS_METRICS: + raise error_not_found() + custom_metrics_ = await custom_metrics.get_metrics(session=session) + client_metrics = prometheus_client.generate_latest().decode() + return custom_metrics_ + client_metrics diff --git a/src/dstack/_internal/server/routers/public_keys.py b/src/dstack/_internal/server/routers/public_keys.py new file mode 100644 index 0000000000..e846d15415 --- /dev/null +++ b/src/dstack/_internal/server/routers/public_keys.py @@ -0,0 +1,54 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.keys import PublicKeyInfo +from dstack._internal.server.db import get_session +from dstack._internal.server.models import UserModel +from dstack._internal.server.schemas.public_keys import ( + AddPublicKeyRequest, + DeletePublicKeysRequest, +) +from dstack._internal.server.security.permissions import Authenticated +from dstack._internal.server.services import public_keys as public_keys_services +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) + +router = APIRouter( + prefix="/api/users/public_keys", + tags=["users"], + responses=get_base_api_additional_responses(), +) + + +@router.post("/list", summary="List SSH keys", response_model=list[PublicKeyInfo]) +async def list_user_public_keys( + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], +): + public_keys = await public_keys_services.list_user_public_keys(session=session, user=user) + return CustomORJSONResponse(public_keys) + + +@router.post("/add", summary="Add SSH key", response_model=PublicKeyInfo) +async def add_user_public_key( + body: AddPublicKeyRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], +): + public_key = await public_keys_services.add_user_public_key( 
+ session=session, user=user, key=body.key, name=body.name + ) + return CustomORJSONResponse(public_key) + + +@router.post("/delete", summary="Delete SSH keys") +async def delete_user_public_keys( + body: DeletePublicKeysRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user: Annotated[UserModel, Depends(Authenticated())], +): + await public_keys_services.delete_user_public_keys(session=session, user=user, ids=body.ids) diff --git a/src/dstack/_internal/server/routers/repos.py b/src/dstack/_internal/server/routers/repos.py index 94b8cb25a5..95307bd15c 100644 --- a/src/dstack/_internal/server/routers/repos.py +++ b/src/dstack/_internal/server/routers/repos.py @@ -14,55 +14,72 @@ ) from dstack._internal.server.security.permissions import ProjectMember from dstack._internal.server.services import repos -from dstack._internal.server.utils.routers import request_size_exceeded +from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, + get_request_size, +) +from dstack._internal.utils.common import sizeof_fmt -router = APIRouter(prefix="/api/project/{project_name}/repos", tags=["repos"]) +router = APIRouter( + prefix="/api/project/{project_name}/repos", + tags=["repos"], + responses=get_base_api_additional_responses(), +) -@router.post("/list") +@router.post("/list", summary="List repos", response_model=List[RepoHead]) async def list_repos( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> List[RepoHead]: +): _, project = user_project - return await repos.list_repos(session=session, project=project) + return CustomORJSONResponse(await repos.list_repos(session=session, project=project)) -@router.post("/get") +@router.post("/get", summary="Get repo", response_model=RepoHeadWithCreds) async def get_repo( body: GetRepoRequest, session: AsyncSession = 
Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> RepoHeadWithCreds: - _, project = user_project +): + user, project = user_project repo = await repos.get_repo( session=session, project=project, + user=user, repo_id=body.repo_id, include_creds=body.include_creds, ) if repo is None: raise ResourceNotExistsError() - return repo + return CustomORJSONResponse(repo) -@router.post("/init") +@router.post("/init", summary="Initialize repo") async def init_repo( body: SaveRepoCredsRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ): - _, project = user_project + """ + Creates or updates a repo with the repo info and repo creds. + Runs belong to repos, so this endpoint must be called before applying run configurations. + You can create `virtual` repos if you don't use git repos. + """ + user, project = user_project await repos.init_repo( session=session, project=project, + user=user, repo_id=body.repo_id, repo_info=body.repo_info, repo_creds=body.repo_creds, ) -@router.post("/delete") +@router.post("/delete", summary="Delete repos") async def delete_repos( body: DeleteReposRequest, session: AsyncSession = Depends(get_session), @@ -72,7 +89,7 @@ async def delete_repos( await repos.delete_repos(session=session, project=project, repos_ids=body.repos_ids) -@router.post("/upload_code") +@router.post("/upload_code", summary="Upload code") async def upload_code( request: Request, repo_id: str, @@ -80,10 +97,17 @@ async def upload_code( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ): - if request_size_exceeded(request, limit=2 * 2**20): + request_size = get_request_size(request) + if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT: + diff_size_fmt = sizeof_fmt(request_size) + limit_fmt = sizeof_fmt(SERVER_CODE_UPLOAD_LIMIT) + if diff_size_fmt == limit_fmt: + 
diff_size_fmt = f"{request_size}B" + limit_fmt = f"{SERVER_CODE_UPLOAD_LIMIT}B" raise ServerClientError( - "Repo diff size exceeds the limit of 2MB. " - "Use .gitignore to exclude large files from the repo." + f"Repo diff size is {diff_size_fmt}, which exceeds the limit of {limit_fmt}." + " Use .gitignore to exclude large files from the repo." + " This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable." ) _, project = user_project await repos.upload_code( diff --git a/src/dstack/_internal/server/routers/runs.py b/src/dstack/_internal/server/routers/runs.py index 2a8af76e83..a6523a1baa 100644 --- a/src/dstack/_internal/server/routers/runs.py +++ b/src/dstack/_internal/server/routers/runs.py @@ -1,17 +1,17 @@ -from typing import List, Tuple +from typing import Annotated, List, Optional, Tuple from fastapi import APIRouter, Depends +from packaging.version import Version from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.errors import ComputeError, ResourceNotExistsError, ServerClientError -from dstack._internal.core.models.pools import Instance -from dstack._internal.core.models.runs import PoolInstanceOffers, Run, RunPlan +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.runs import Run, RunPlan +from dstack._internal.server.compatibility.runs import patch_run, patch_run_plan from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.runs import ( - CreateInstanceRequest, + ApplyRunPlanRequest, DeleteRunsRequest, - GetOffersRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, @@ -19,156 +19,205 @@ SubmitRunRequest, ) from dstack._internal.server.security.permissions import Authenticated, ProjectMember -from dstack._internal.server.services import runs -from dstack._internal.server.services.pools import ( - get_or_create_pool_by_name, +from 
dstack._internal.server.services import runs, users +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, + get_client_version, ) root_router = APIRouter( prefix="/api/runs", tags=["runs"], + responses=get_base_api_additional_responses(), ) project_router = APIRouter( prefix="/api/project/{project_name}/runs", tags=["runs"], + responses=get_base_api_additional_responses(), ) -@root_router.post("/list") +def use_legacy_repo_dir( + client_version: Annotated[Optional[Version], Depends(get_client_version)], +) -> bool: + return client_version is not None and client_version < Version("0.19.27") + + +@root_router.post( + "/list", + summary="List runs", + response_model=List[Run], +) async def list_runs( body: ListRunsRequest, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> List[Run]: + client_version: Optional[Version] = Depends(get_client_version), +): """ - Returns all runs visible to user sorted by descending submitted_at. - A **project_name**, **repo_id**, and **username** can be specified as filters. - Specifying **repo_id** without **project_name** returns no runs. + Returns all runs visible to user sorted by descending `submitted_at`. + `project_name`, `repo_id`, `username`, and `only_active` can be specified as filters. + Setting `only_active` to `true` excludes finished runs and deleted runs. + Specifying `repo_id` without `project_name` returns no runs. - The results are paginated. To get the next page, pass submitted_at and id of - the last run from the previous page as **prev_submitted_at** and **prev_run_id**. + The results are paginated. To get the next page, pass `submitted_at` and `id` of + the last run from the previous page as `prev_submitted_at` and `prev_run_id`. 
""" - return await runs.list_user_runs( + run_list = await runs.list_user_runs( session=session, user=user, project_name=body.project_name, repo_id=body.repo_id, username=body.username, only_active=body.only_active, + include_jobs=body.include_jobs, + job_submissions_limit=body.job_submissions_limit, prev_submitted_at=body.prev_submitted_at, prev_run_id=body.prev_run_id, limit=body.limit, ascending=body.ascending, ) + for run in run_list: + patch_run(run, client_version) + return CustomORJSONResponse(run_list) -@project_router.post("/get") +@project_router.post("/get", response_model=Run, summary="Get run") async def get_run( body: GetRunRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Run: + client_version: Optional[Version] = Depends(get_client_version), +): + """ + Returns a run given `run_name` or `id`. + If given `run_name`, does not return deleted runs. + If given `id`, returns deleted runs. + """ _, project = user_project run = await runs.get_run( session=session, project=project, run_name=body.run_name, + run_id=body.id, ) if run is None: raise ResourceNotExistsError("Run not found") - return run + patch_run(run, client_version) + return CustomORJSONResponse(run) -@project_router.post("/get_plan") -async def get_run_plan( +@project_router.post( + "/get_plan", + summary="Get run plan", + response_model=RunPlan, +) +async def get_plan( body: GetRunPlanRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> RunPlan: + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], + client_version: Annotated[Optional[Version], Depends(get_client_version)], + legacy_repo_dir: Annotated[bool, Depends(use_legacy_repo_dir)], +): + """ + Returns a run plan for the given run spec. 
+ This is an optional step before calling `/apply`. + """ user, project = user_project - run_plan = await runs.get_run_plan( + if not user.ssh_public_key and not body.run_spec.ssh_key_pub: + await users.refresh_ssh_key(session=session, actor=user) + run_plan = await runs.get_plan( session=session, project=project, user=user, run_spec=body.run_spec, + max_offers=body.max_offers, + legacy_repo_dir=legacy_repo_dir, ) - return run_plan - - -@project_router.post("/submit") -async def submit_run( - body: SubmitRunRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Run: + patch_run_plan(run_plan, client_version) + return CustomORJSONResponse(run_plan) + + +@project_router.post("/apply", response_model=Run, summary="Apply run plan") +async def apply_plan( + body: ApplyRunPlanRequest, + session: Annotated[AsyncSession, Depends(get_session)], + user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], + pipeline_hinter: Annotated[PipelineHinterProtocol, Depends(get_pipeline_hinter)], + legacy_repo_dir: Annotated[bool, Depends(use_legacy_repo_dir)], + client_version: Annotated[Optional[Version], Depends(get_client_version)], +): + """ + Creates a new run or updates an existing run. + Errors if the expected current resource from the plan does not match the current resource. + Use `force: true` to apply even if the current resource does not match. + If the existing run is active and cannot be updated, it must be stopped first. 
+ """ user, project = user_project - return await runs.submit_run( + if not user.ssh_public_key and not body.plan.run_spec.ssh_key_pub: + await users.refresh_ssh_key(session=session, actor=user) + run = await runs.apply_plan( session=session, user=user, project=project, - run_spec=body.run_spec, + plan=body.plan, + force=body.force, + pipeline_hinter=pipeline_hinter, + legacy_repo_dir=legacy_repo_dir, ) + patch_run(run, client_version) + return CustomORJSONResponse(run) -@project_router.post("/stop") +@project_router.post("/stop", summary="Stop runs") async def stop_runs( body: StopRunsRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), ): - _, project = user_project + """ + Stop one or more runs. + """ + user, project = user_project await runs.stop_runs( session=session, + user=user, project=project, runs_names=body.runs_names, abort=body.abort, + pipeline_hinter=pipeline_hinter, ) -@project_router.post("/delete") +@project_router.post("/delete", summary="Delete runs") async def delete_runs( body: DeleteRunsRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ): - _, project = user_project - await runs.delete_runs(session=session, project=project, runs_names=body.runs_names) + """ + Delete one or more runs. The runs must be stopped before they can be deleted. + """ + user, project = user_project + await runs.delete_runs(session=session, user=user, project=project, runs_names=body.runs_names) -# FIXME: get_offers and create_instance semantically belong to pools, not runs -@project_router.post("/get_offers") -async def get_offers( - body: GetOffersRequest, +# apply_plan replaces submit_run since it can create new runs. 
+@project_router.post("/submit", deprecated=True) +async def submit_run( + body: SubmitRunRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> PoolInstanceOffers: - _, project = user_project - pool = await get_or_create_pool_by_name(session, project, body.profile.pool_name) - offers = await runs.get_create_instance_offers( + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), +) -> Run: + user, project = user_project + return await runs.submit_run( + session=session, + user=user, project=project, - profile=body.profile, - requirements=body.requirements, + run_spec=body.run_spec, + pipeline_hinter=pipeline_hinter, ) - instances = [instance for _, instance in offers] - return PoolInstanceOffers(pool_name=pool.name, instances=instances) - - -# FIXME: get_offers and create_instance semantically belong to pools, not runs -@project_router.post("/create_instance") -async def create_instance( - body: CreateInstanceRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Instance: - user, project = user_project - try: - instance = await runs.create_instance( - session=session, - project=project, - user=user, - profile=body.profile, - requirements=body.requirements, - ) - except ComputeError as e: - raise ServerClientError(str(e)) - return instance diff --git a/src/dstack/_internal/server/routers/secrets.py b/src/dstack/_internal/server/routers/secrets.py index 3732a0b1ca..db4d364b36 100644 --- a/src/dstack/_internal/server/routers/secrets.py +++ b/src/dstack/_internal/server/routers/secrets.py @@ -1,15 +1,20 @@ -from typing import List +from typing import List, Tuple -from fastapi import APIRouter +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.runs import Run +from dstack._internal.core.errors import 
ResourceNotExistsError from dstack._internal.core.models.secrets import Secret +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.secrets import ( - AddSecretRequest, + CreateOrUpdateSecretRequest, DeleteSecretsRequest, - GetSecretsRequest, - ListSecretsRequest, + GetSecretRequest, ) +from dstack._internal.server.security.permissions import ProjectManager +from dstack._internal.server.services import secrets as secrets_services +from dstack._internal.server.utils.routers import CustomORJSONResponse router = APIRouter( prefix="/api/project/{project_name}/secrets", @@ -17,21 +22,67 @@ ) -@router.post("/list") -async def list_secrets(project_name: str, body: ListSecretsRequest) -> List[Run]: - pass +@router.post("/list", summary="List secrets", response_model=List[Secret]) +async def list_secrets( + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManager()), +): + user, project = user_project + return CustomORJSONResponse( + await secrets_services.list_secrets( + session=session, + project=project, + user=user, + ) + ) -@router.post("/get") -async def get_secret(project_name: str, body: GetSecretsRequest) -> Secret: - pass +@router.post("/get", summary="Get secret", response_model=Secret) +async def get_secret( + body: GetSecretRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManager()), +): + user, project = user_project + secret = await secrets_services.get_secret( + session=session, + project=project, + name=body.name, + user=user, + ) + if secret is None: + raise ResourceNotExistsError() + return CustomORJSONResponse(secret) -@router.post("/add") -async def add_or_update_secret(project_name: str, body: AddSecretRequest) -> Secret: - pass +@router.post("/create_or_update", summary="Create or update secret", response_model=Secret) 
+async def create_or_update_secret( + body: CreateOrUpdateSecretRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManager()), +): + user, project = user_project + return CustomORJSONResponse( + await secrets_services.create_or_update_secret( + session=session, + project=project, + name=body.name, + value=body.value, + user=user, + ) + ) -@router.post("/delete") -async def delete_secrets(project_name: str, body: DeleteSecretsRequest): - pass +@router.post("/delete", summary="Delete secrets") +async def delete_secrets( + body: DeleteSecretsRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManager()), +): + user, project = user_project + await secrets_services.delete_secrets( + session=session, + project=project, + names=body.secrets_names, + user=user, + ) diff --git a/src/dstack/_internal/server/routers/server.py b/src/dstack/_internal/server/routers/server.py new file mode 100644 index 0000000000..8fd3d77d46 --- /dev/null +++ b/src/dstack/_internal/server/routers/server.py @@ -0,0 +1,19 @@ +from fastapi import APIRouter + +from dstack._internal import settings +from dstack._internal.core.models.server import ServerInfo +from dstack._internal.server.utils.routers import CustomORJSONResponse + +router = APIRouter( + prefix="/api/server", + tags=["server"], +) + + +@router.post("/get_info", summary="Get server info", response_model=ServerInfo) +async def get_server_info(): + return CustomORJSONResponse( + ServerInfo( + server_version=settings.DSTACK_VERSION, + ) + ) diff --git a/src/dstack/_internal/server/routers/sshproxy.py b/src/dstack/_internal/server/routers/sshproxy.py new file mode 100644 index 0000000000..0baeb0f0ed --- /dev/null +++ b/src/dstack/_internal/server/routers/sshproxy.py @@ -0,0 +1,39 @@ +from typing import Annotated + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + 
+from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.server import settings +from dstack._internal.server.db import get_session +from dstack._internal.server.schemas.sshproxy import GetUpstreamRequest, GetUpstreamResponse +from dstack._internal.server.security.permissions import AlwaysForbidden, ServiceAccount +from dstack._internal.server.services.sshproxy.handlers import get_upstream_response +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) + +if settings.SSHPROXY_API_TOKEN is not None: + _auth = ServiceAccount(settings.SSHPROXY_API_TOKEN) +else: + _auth = AlwaysForbidden() + + +router = APIRouter( + prefix="/api/sshproxy", + tags=["proxy"], + responses=get_base_api_additional_responses(), + dependencies=[Depends(_auth)], +) + + +@router.post("/get_upstream", summary="Get upstream", response_model=GetUpstreamResponse) +async def get_upstream( + body: GetUpstreamRequest, + session: Annotated[AsyncSession, Depends(get_session)], +): + response = await get_upstream_response(session=session, upstream_id=body.id) + if response is None: + raise ResourceNotExistsError() + return CustomORJSONResponse(response) diff --git a/src/dstack/_internal/server/routers/templates.py b/src/dstack/_internal/server/routers/templates.py new file mode 100644 index 0000000000..99af9f273c --- /dev/null +++ b/src/dstack/_internal/server/routers/templates.py @@ -0,0 +1,22 @@ +from typing import List, Tuple + +from fastapi import APIRouter, Depends + +from dstack._internal.core.models.templates import UITemplate +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.security.permissions import ProjectMember +from dstack._internal.server.services import templates as templates_service +from dstack._internal.server.utils.routers import CustomORJSONResponse + +router = APIRouter( + prefix="/api/project/{project_name}/templates", + tags=["templates"], +) 
+ + +@router.post("/list", summary="List templates", response_model=List[UITemplate]) +async def list_templates( + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), +): + _, project = user_project + return CustomORJSONResponse(await templates_service.list_templates(project)) diff --git a/src/dstack/_internal/server/routers/users.py b/src/dstack/_internal/server/routers/users.py index 0ee2a64631..e26ce6dfbe 100644 --- a/src/dstack/_internal/server/routers/users.py +++ b/src/dstack/_internal/server/routers/users.py @@ -1,99 +1,151 @@ -from typing import List +from typing import Optional from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.errors import ResourceNotExistsError -from dstack._internal.core.models.users import User, UserWithCreds +from dstack._internal.core.models.users import User, UsersInfoListOrUsersList, UserWithCreds from dstack._internal.server.db import get_session from dstack._internal.server.models import UserModel from dstack._internal.server.schemas.users import ( CreateUserRequest, DeleteUsersRequest, GetUserRequest, + ListUsersRequest, RefreshTokenRequest, UpdateUserRequest, ) from dstack._internal.server.security.permissions import Authenticated, GlobalAdmin -from dstack._internal.server.services import users +from dstack._internal.server.services import events, users +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, +) -router = APIRouter(prefix="/api/users", tags=["users"]) +router = APIRouter( + prefix="/api/users", + tags=["users"], + responses=get_base_api_additional_responses(), +) -@router.post("/list") +@router.post("/list", summary="List users", response_model=UsersInfoListOrUsersList) async def list_users( + body: Optional[ListUsersRequest] = None, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> List[User]: - return await 
users.list_users_for_user(session=session, user=user) +): + """ + Returns users visible to the user, sorted by descending `created_at`. + + Admins see all non-deleted users. Non-admins only see themselves. + + The results are paginated. To get the next page, pass `created_at` and `id` of + the last user from the previous page as `prev_created_at` and `prev_id`. + """ + if body is None: + # For backward compatibility + body = ListUsersRequest() + return CustomORJSONResponse( + await users.list_users_for_user( + session=session, + user=user, + return_total_count=body.return_total_count, + name_pattern=body.name_pattern, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) -@router.post("/get_my_user") +@router.post("/get_my_user", summary="Get my user", response_model=UserWithCreds) async def get_my_user( + session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> User: - return users.user_model_to_user(user) +): + if user.ssh_private_key is None or user.ssh_public_key is None: + # Generate keys for pre-0.19.33 users + await users.refresh_ssh_key(session=session, actor=user) + return CustomORJSONResponse(users.user_model_to_user_with_creds(user)) -@router.post("/get_user") +@router.post("/get_user", summary="Get user", response_model=UserWithCreds) async def get_user( body: GetUserRequest, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), -) -> UserWithCreds: +): res = await users.get_user_with_creds_by_name( session=session, current_user=user, username=body.username ) if res is None: raise ResourceNotExistsError() - return res + return CustomORJSONResponse(res) -@router.post("/create") +@router.post("/create", summary="Create user", response_model=User) async def create_user( body: CreateUserRequest, session: AsyncSession = Depends(get_session), user: UserModel = Depends(GlobalAdmin()), -) -> User: +): res = await 
users.create_user( session=session, username=body.username, global_role=body.global_role, email=body.email, + active=body.active, + creator=user, ) - return users.user_model_to_user(res) + return CustomORJSONResponse(users.user_model_to_user(res)) -@router.post("/update") +@router.post("/update", summary="Update user", response_model=User) async def update_user( body: UpdateUserRequest, session: AsyncSession = Depends(get_session), user: UserModel = Depends(GlobalAdmin()), -) -> User: +): res = await users.update_user( session=session, + actor=events.UserActor.from_user(user), username=body.username, global_role=body.global_role, email=body.email, + active=body.active, ) if res is None: raise ResourceNotExistsError() - return users.user_model_to_user(res) + return CustomORJSONResponse(users.user_model_to_user(res)) + + +@router.post("/refresh_ssh_key", summary="Refresh SSH key", response_model=UserWithCreds) +async def refresh_ssh_key( + body: RefreshTokenRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + res = await users.refresh_ssh_key(session=session, actor=user, username=body.username) + if res is None: + raise ResourceNotExistsError() + return CustomORJSONResponse(users.user_model_to_user_with_creds(res)) -@router.post("/refresh_token") +@router.post("/refresh_token", summary="Refresh token", response_model=UserWithCreds) async def refresh_token( body: RefreshTokenRequest, session: AsyncSession = Depends(get_session), - user: UserModel = Depends(GlobalAdmin()), -) -> UserWithCreds: - res = await users.refresh_user_token(session=session, username=body.username) + user: UserModel = Depends(Authenticated()), +): + res = await users.refresh_user_token(session=session, actor=user, username=body.username) if res is None: raise ResourceNotExistsError() - return users.user_model_to_user_with_creds(res) + return CustomORJSONResponse(users.user_model_to_user_with_creds(res)) -@router.post("/delete") 
+@router.post("/delete", summary="Delete users") async def delete_users( body: DeleteUsersRequest, session: AsyncSession = Depends(get_session), @@ -101,6 +153,6 @@ async def delete_users( ): await users.delete_users( session=session, - user=user, + actor=user, usernames=body.users, ) diff --git a/src/dstack/_internal/server/routers/volumes.py b/src/dstack/_internal/server/routers/volumes.py index 28aea74fc6..47db390118 100644 --- a/src/dstack/_internal/server/routers/volumes.py +++ b/src/dstack/_internal/server/routers/volumes.py @@ -12,55 +12,114 @@ CreateVolumeRequest, DeleteVolumesRequest, GetVolumeRequest, + ListVolumesRequest, +) +from dstack._internal.server.security.permissions import Authenticated, ProjectMember +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter +from dstack._internal.server.utils.routers import ( + CustomORJSONResponse, + get_base_api_additional_responses, ) -from dstack._internal.server.security.permissions import ProjectMember -router = APIRouter(prefix="/api/project/{project_name}/volumes", tags=["volumes"]) +root_router = APIRouter( + prefix="/api/volumes", + tags=["volumes"], + responses=get_base_api_additional_responses(), +) +project_router = APIRouter(prefix="/api/project/{project_name}/volumes", tags=["volumes"]) -@router.post("/list") +@root_router.post("/list", summary="List volumes", response_model=List[Volume]) async def list_volumes( + body: ListVolumesRequest, + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + """ + Returns all volumes visible to user sorted by descending `created_at`. + `project_name` and `only_active` can be specified as filters. + + The results are paginated. To get the next page, pass `created_at` and `id` of + the last volume from the previous page as `prev_created_at` and `prev_id`.
+ """ + return CustomORJSONResponse( + await volumes_services.list_volumes( + session=session, + user=user, + project_name=body.project_name, + only_active=body.only_active, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) + + +@project_router.post("/list", summary="List project volumes", response_model=List[Volume]) +async def list_project_volumes( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> List[Volume]: +): + """ + Returns all volumes in the project. + """ _, project = user_project - return await volumes_services.list_project_volumes(session=session, project=project) + return CustomORJSONResponse( + await volumes_services.list_project_volumes(session=session, project=project) + ) -@router.post("/get") +@project_router.post("/get", summary="Get volume", response_model=Volume) async def get_volume( body: GetVolumeRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Volume: +): + """ + Returns a volume given a volume name. + """ _, project = user_project volume = await volumes_services.get_volume_by_name( session=session, project=project, name=body.name ) if volume is None: raise ResourceNotExistsError() - return volume + return CustomORJSONResponse(volume) -@router.post("/create") +@project_router.post("/create", summary="Create volume", response_model=Volume) async def create_volume( body: CreateVolumeRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> Volume: - _, project = user_project - return await volumes_services.create_volume( - session=session, - project=project, - configuration=body.configuration, + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), +): + """ + Creates a volume given a volume configuration. 
+ """ + user, project = user_project + return CustomORJSONResponse( + await volumes_services.create_volume( + session=session, + project=project, + user=user, + configuration=body.configuration, + pipeline_hinter=pipeline_hinter, + ) ) -@router.post("/delete") +@project_router.post("/delete", summary="Delete volumes") async def delete_volumes( body: DeleteVolumesRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ): - _, project = user_project - await volumes_services.delete_volumes(session=session, project=project, names=body.names) + """ + Deletes one or more volumes. + """ + user, project = user_project + await volumes_services.delete_volumes( + session=session, project=project, names=body.names, user=user + ) diff --git a/src/dstack/_internal/server/schemas/auth.py b/src/dstack/_internal/server/schemas/auth.py new file mode 100644 index 0000000000..942f1fb388 --- /dev/null +++ b/src/dstack/_internal/server/schemas/auth.py @@ -0,0 +1,83 @@ +from typing import Annotated, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class OAuthInfoResponse(CoreModel): + enabled: Annotated[ + bool, Field(description="Whether the OAuth2 provider is configured on the server.") + ] + + +class OAuthAuthorizeRequest(CoreModel): + local_port: Annotated[ + Optional[int], + Field( + description="If specified, the user is redirected to localhost:local_port after the redirect from the provider.", + ge=1, + le=65535, + ), + ] = None + base_url: Annotated[ + Optional[str], + Field( + description=( + "The server base URL used to access the dstack server, e.g. `http://localhost:3000`." + " Used to build redirect URLs when the dstack server is available on multiple domains." 
+ ) + ), + ] = None + + +class OAuthAuthorizeResponse(CoreModel): + authorization_url: Annotated[str, Field(description="An OAuth2 authorization URL.")] + + +class OAuthCallbackRequest(CoreModel): + code: Annotated[ + str, + Field( + description="The OAuth2 authorization code received from the provider in the redirect URL." + ), + ] + state: Annotated[ + str, + Field(description="The state parameter received from the provider in the redirect URL."), + ] + base_url: Annotated[ + Optional[str], + Field( + description=( + "The server base URL used to access the dstack server, e.g. `http://localhost:3000`." + " Used to build redirect URLs when the dstack server is available on multiple domains." + " It must match the base URL specified when generating the authorization URL." + ) + ), + ] = None + + +class OAuthGetNextRedirectRequest(CoreModel): + code: Annotated[ + str, + Field( + description="The OAuth2 authorization code received from the provider in the redirect URL." + ), + ] + state: Annotated[ + str, + Field(description="The state parameter received from the provider in the redirect URL."), + ] + + +class OAuthGetNextRedirectResponse(CoreModel): + redirect_url: Annotated[ + Optional[str], + Field( + description=( + "The URL that the user needs to be redirected to." + " If `null`, there is no next redirect." 
+ ) + ), + ] diff --git a/src/dstack/_internal/server/schemas/common.py b/src/dstack/_internal/server/schemas/common.py index 17ee117df8..2bd4bae23f 100644 --- a/src/dstack/_internal/server/schemas/common.py +++ b/src/dstack/_internal/server/schemas/common.py @@ -1,5 +1,9 @@ +from typing import Annotated + +from pydantic import Field + from dstack._internal.core.models.common import CoreModel class RepoRequest(CoreModel): - repo_id: str + repo_id: Annotated[str, Field(description="A unique identifier of the repo")] diff --git a/src/dstack/_internal/server/schemas/events.py b/src/dstack/_internal/server/schemas/events.py new file mode 100644 index 0000000000..3899b1f398 --- /dev/null +++ b/src/dstack/_internal/server/schemas/events.py @@ -0,0 +1,211 @@ +import uuid +from datetime import datetime +from typing import Annotated, Optional +from uuid import UUID + +from pydantic import Field, root_validator + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.events import EventTargetType + +MIN_FILTER_ITEMS = 1 +MAX_FILTER_ITEMS = 16 # Conservative limit to prevent overly complex db queries +LIST_EVENTS_DEFAULT_LIMIT = 100 + + +class ListEventsRequest(CoreModel): + target_projects: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of project IDs." + " The response will only include events that target the specified projects" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_users: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of user IDs." + " The response will only include events that target the specified users" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_fleets: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of fleet IDs." 
+ " The response will only include events that target the specified fleets" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_instances: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of instance IDs." + " The response will only include events that target the specified instances" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_runs: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of run IDs." + " The response will only include events that target the specified runs" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_jobs: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of job IDs." + " The response will only include events that target the specified jobs" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_volumes: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of volume IDs." + " The response will only include events that target the specified volumes" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_gateways: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of gateway IDs." + " The response will only include events that target the specified gateways" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + target_secrets: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of secret IDs." + " The response will only include events that target the specified secrets" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + within_projects: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of project IDs." 
+ " The response will only include events that target the specified projects" + " or any entities within those projects" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + within_fleets: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of fleet IDs." + " The response will only include events that target the specified fleets" + " or instances within those fleets" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + within_runs: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of run IDs." + " The response will only include events that target the specified runs" + " or jobs within those runs" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + include_target_types: Annotated[ + Optional[list[EventTargetType]], + Field( + description=( + "List of target types." + " The response will only include events that have a target" + " of one of the specified types" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + actors: Annotated[ + Optional[list[Optional[uuid.UUID]]], + Field( + description=( + "List of user IDs or `null` values." + " The response will only include events about actions" + " performed by the specified users," + " or performed by the system if `null` is specified" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None + prev_recorded_at: Optional[datetime] = None + prev_id: Optional[UUID] = None + limit: int = Field(LIST_EVENTS_DEFAULT_LIMIT, ge=1, le=100) + ascending: bool = False + + @root_validator + def _validate_target_filters(cls, values): + """ + Raise an error if more than one target_* filter is set. Setting multiple + target_* filters would always result in an empty response, which might confuse users. 
+ """ + + target_filters = [name for name in cls.__fields__ if name.startswith("target_")] + set_filters = [f for f in target_filters if values.get(f) is not None] + if len(set_filters) > 1: + raise ValueError( + f"At most one target_* filter can be set at a time. Got {', '.join(set_filters)}" + ) + return values + + @root_validator + def _validate_within_filters(cls, values): + """ + Raise an error if more than one within_* filter is set. Setting multiple + within_* filters is either redundant or incorrect. Each within_* filter + may also lead to additional db queries, causing unnecessary load. + """ + + within_filters = [name for name in cls.__fields__ if name.startswith("within_")] + set_filters = [f for f in within_filters if values.get(f) is not None] + if len(set_filters) > 1: + raise ValueError( + f"At most one within_* filter can be set at a time. Got {', '.join(set_filters)}" + ) + return values diff --git a/src/dstack/_internal/server/schemas/exports.py b/src/dstack/_internal/server/schemas/exports.py new file mode 100644 index 0000000000..74828fb455 --- /dev/null +++ b/src/dstack/_internal/server/schemas/exports.py @@ -0,0 +1,25 @@ +from dstack._internal.core.models.common import CoreModel + + +class CreateExportRequest(CoreModel): + name: str + is_global: bool = False + importer_projects: list[str] = [] + exported_fleets: list[str] = [] + exported_gateways: list[str] = [] + + +class UpdateExportRequest(CoreModel): + name: str + set_global: bool = False + unset_global: bool = False + add_importer_projects: list[str] = [] + remove_importer_projects: list[str] = [] + add_exported_fleets: list[str] = [] + remove_exported_fleets: list[str] = [] + add_exported_gateways: list[str] = [] + remove_exported_gateways: list[str] = [] + + +class DeleteExportRequest(CoreModel): + name: str diff --git a/src/dstack/_internal/server/schemas/files.py b/src/dstack/_internal/server/schemas/files.py new file mode 100644 index 0000000000..8cab50c9cf --- /dev/null +++ 
b/src/dstack/_internal/server/schemas/files.py @@ -0,0 +1,5 @@ +from dstack._internal.core.models.common import CoreModel + + +class GetFileArchiveByHashRequest(CoreModel): + hash: str diff --git a/src/dstack/_internal/server/schemas/fleets.py b/src/dstack/_internal/server/schemas/fleets.py new file mode 100644 index 0000000000..4bb25d50bb --- /dev/null +++ b/src/dstack/_internal/server/schemas/fleets.py @@ -0,0 +1,64 @@ +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID + +from pydantic import Field + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec +from dstack._internal.utils.common import EntityID, EntityName, EntityNameOrID + + +class ListFleetsRequest(CoreModel): + project_name: Optional[str] = None + only_active: bool = False + include_imported: bool = False + prev_created_at: Optional[datetime] = None + prev_id: Optional[UUID] = None + limit: int = Field(100, ge=0, le=100) + ascending: bool = False + + +class ListProjectFleetsRequest(CoreModel): + include_imported: bool = False + + +class GetFleetRequest(CoreModel): + name: Optional[str] + id: Optional[UUID] = None + + def get_name_or_id(self) -> EntityNameOrID: + if self.id is not None: + return EntityID(id=self.id) + elif self.name is not None: + return EntityName(name=self.name) + else: + raise ServerClientError("name or id must be specified") + + +class GetFleetPlanRequest(CoreModel): + spec: FleetSpec + + +class ApplyFleetPlanRequest(CoreModel): + plan: ApplyFleetPlanInput + force: Annotated[ + bool, + Field( + description="Use `force: true` to apply even if the expected resource does not match." 
+ ), + ] + + +class CreateFleetRequest(CoreModel): + spec: FleetSpec + + +class DeleteFleetsRequest(CoreModel): + names: List[str] + + +class DeleteFleetInstancesRequest(CoreModel): + name: str + instance_nums: List[int] diff --git a/src/dstack/_internal/server/schemas/gateways.py b/src/dstack/_internal/server/schemas/gateways.py index 99930b95e9..4357c30430 100644 --- a/src/dstack/_internal/server/schemas/gateways.py +++ b/src/dstack/_internal/server/schemas/gateways.py @@ -1,34 +1,21 @@ -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional -from pydantic import root_validator - -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.common import CoreConfig, CoreModel, generate_dual_core_model from dstack._internal.core.models.gateways import GatewayConfiguration -class CreateGatewayRequest(CoreModel): - name: Optional[str] - backend_type: Optional[BackendType] - region: Optional[str] - configuration: Optional[GatewayConfiguration] - - @root_validator - def fill_configuration(cls, values: Dict) -> Dict: - if values.get("configuration", None) is not None: - return values - backend_type = values.get("backend_type", None) - region = values.get("region", None) - if backend_type is None: - raise ValueError("backend_type must be specified") - if region is None: - raise ValueError("region must be specified") - values["configuration"] = GatewayConfiguration( - name=values.get("name", None), - backend=backend_type, - region=region, - ) - return values +class CreateGatewayRequestConfig(CoreConfig): + @staticmethod + def schema_extra(schema: Dict[str, Any]): + pass + + +class CreateGatewayRequest(generate_dual_core_model(CreateGatewayRequestConfig)): + configuration: GatewayConfiguration + + +class ListGatewaysRequest(CoreModel): + include_imported: bool = False class GetGatewayRequest(CoreModel): @@ -41,6 +28,7 @@ class 
DeleteGatewaysRequest(CoreModel): class SetDefaultGatewayRequest(CoreModel): name: str + gateway_project: Optional[str] = None class SetWildcardDomainRequest(CoreModel): diff --git a/src/dstack/_internal/server/schemas/gpus.py b/src/dstack/_internal/server/schemas/gpus.py new file mode 100644 index 0000000000..16d1191483 --- /dev/null +++ b/src/dstack/_internal/server/schemas/gpus.py @@ -0,0 +1,26 @@ +from typing import List, Literal, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.gpus import GpuGroup +from dstack._internal.core.models.runs import RunSpec + + +class ListGpusRequest(CoreModel): + """Request for listing GPUs with optional grouping.""" + + run_spec: RunSpec + group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( + default=None, + description="List of fields to group by. Valid values: 'backend', 'region', 'count'. " + "Note: 'region' can only be used together with 'backend'.", + ) + + +class ListGpusResponse(CoreModel): + """Response containing GPU specifications.""" + + gpus: List[GpuGroup] = Field( + description="List of GPU specifications, grouped according to the group_by parameter" + ) diff --git a/src/dstack/_internal/server/schemas/health/__init__.py b/src/dstack/_internal/server/schemas/health/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/schemas/health/dcgm.py b/src/dstack/_internal/server/schemas/health/dcgm.py new file mode 100644 index 0000000000..cf8f5ce506 --- /dev/null +++ b/src/dstack/_internal/server/schemas/health/dcgm.py @@ -0,0 +1,59 @@ +from enum import IntEnum + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.health import HealthStatus + + +class DCGMHealthResult(IntEnum): + """ + `dcgmHealthWatchResult_enum` + + See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/const.go#L1020-L1026 + 
""" + + DCGM_HEALTH_RESULT_PASS = 0 + DCGM_HEALTH_RESULT_WARN = 10 + DCGM_HEALTH_RESULT_FAIL = 20 + + def to_health_status(self) -> HealthStatus: + if self == self.DCGM_HEALTH_RESULT_PASS: + return HealthStatus.HEALTHY + if self == self.DCGM_HEALTH_RESULT_WARN: + return HealthStatus.WARNING + if self == self.DCGM_HEALTH_RESULT_FAIL: + return HealthStatus.FAILURE + raise AssertionError("should not reach here") + + +class DCGMHealthIncident(CoreModel): + """ + Flattened `dcgmIncidentInfo_t` + + See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73 + """ + + system: int + """`system` comes from `dcgmIncidentInfo_t`.""" + health: DCGMHealthResult + """`health` comes from `dcgmIncidentInfo_t`.""" + + error_message: str + """`error_message` comes from `dcgmDiagErrorDetail_t`.""" + error_code: int + """`error_code` comes from `dcgmDiagErrorDetail_t`.""" + + entity_group_id: int + """`entity_group_id` comes from `dcgmGroupEntityPair_t`.""" + entity_id: int + """`entity_id` comes from `dcgmGroupEntityPair_t`.""" + + +class DCGMHealthResponse(CoreModel): + """ + `dcgmHealthResponse_v5` + + See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L75-L78 + """ + + overall_health: DCGMHealthResult + incidents: list[DCGMHealthIncident] diff --git a/src/dstack/_internal/server/schemas/imports.py b/src/dstack/_internal/server/schemas/imports.py new file mode 100644 index 0000000000..3f2d71243f --- /dev/null +++ b/src/dstack/_internal/server/schemas/imports.py @@ -0,0 +1,10 @@ +from dstack._internal.core.models.common import CoreModel + + +class DeleteImportRequest(CoreModel): + """ + Imports are unnamed, so they are deleted using the name and project of their export. 
+ """ + + export_name: str + export_project_name: str diff --git a/src/dstack/_internal/server/schemas/instances.py b/src/dstack/_internal/server/schemas/instances.py new file mode 100644 index 0000000000..8f87935b92 --- /dev/null +++ b/src/dstack/_internal/server/schemas/instances.py @@ -0,0 +1,52 @@ +from datetime import datetime +from typing import Optional +from uuid import UUID + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.health import HealthCheck, HealthStatus +from dstack._internal.server.schemas.runner import InstanceHealthResponse + + +class GetInstanceRequest(CoreModel): + id: UUID + + +class ListInstancesRequest(CoreModel): + project_names: Optional[list[str]] = None + fleet_ids: Optional[list[UUID]] = None + only_active: bool = False + include_imported: bool = False + prev_created_at: Optional[datetime] = None + prev_id: Optional[UUID] = None + limit: int = 1000 + ascending: bool = False + + +class InstanceCheck(CoreModel): + reachable: bool + message: Optional[str] = None + health_response: Optional[InstanceHealthResponse] = None + + def get_health_status(self) -> HealthStatus: + if self.health_response is None: + return HealthStatus.HEALTHY + if self.health_response.dcgm is None: + return HealthStatus.HEALTHY + return self.health_response.dcgm.overall_health.to_health_status() + + def has_health_checks(self) -> bool: + if self.health_response is None: + return False + return self.health_response.dcgm is not None + + +class GetInstanceHealthChecksRequest(CoreModel): + fleet_name: str + instance_num: int + after: Optional[datetime] = None + before: Optional[datetime] = None + limit: Optional[int] = None + + +class GetInstanceHealthChecksResponse(CoreModel): + health_checks: list[HealthCheck] diff --git a/src/dstack/_internal/server/schemas/logs.py b/src/dstack/_internal/server/schemas/logs.py index 0d6c0a02b0..fd84ba6ff7 100644 --- a/src/dstack/_internal/server/schemas/logs.py +++ 
b/src/dstack/_internal/server/schemas/logs.py @@ -9,8 +9,9 @@ class PollLogsRequest(CoreModel): run_name: str job_submission_id: UUID4 - start_time: Optional[datetime] - end_time: Optional[datetime] + start_time: Optional[datetime] = None + end_time: Optional[datetime] = None descending: bool = False + next_token: Optional[str] = None limit: int = Field(100, ge=0, le=1000) diagnose: bool = False diff --git a/src/dstack/_internal/server/schemas/pools.py b/src/dstack/_internal/server/schemas/pools.py deleted file mode 100644 index f2d42e8c75..0000000000 --- a/src/dstack/_internal/server/schemas/pools.py +++ /dev/null @@ -1,38 +0,0 @@ -from datetime import datetime -from typing import Optional -from uuid import UUID - -from dstack._internal.core.models.common import CoreModel - - -class DeletePoolRequest(CoreModel): - name: str - force: bool - - -class CreatePoolRequest(CoreModel): - name: str - - -class ShowPoolRequest(CoreModel): - name: Optional[str] - - -class RemoveInstanceRequest(CoreModel): - pool_name: str - instance_name: str - force: bool = False - - -class SetDefaultPoolRequest(CoreModel): - pool_name: str - - -class ListPoolsRequest(CoreModel): - project_name: Optional[str] - pool_name: Optional[str] - only_active: bool = False - prev_created_at: Optional[datetime] - prev_id: Optional[UUID] - limit: int = 1000 - ascending: bool = False diff --git a/src/dstack/_internal/server/schemas/projects.py b/src/dstack/_internal/server/schemas/projects.py index e51528bf06..c45624f668 100644 --- a/src/dstack/_internal/server/schemas/projects.py +++ b/src/dstack/_internal/server/schemas/projects.py @@ -1,11 +1,63 @@ -from typing import List +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID + +from pydantic import Field from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.users import ProjectRole +class ListProjectsRequest(CoreModel): + include_not_joined: Annotated[ + bool, 
Field(description="Include public projects where user is not a member.") + ] = True + return_total_count: Annotated[ + bool, Field(description="Return `total_count` with the total number of projects.") + ] = False + name_pattern: Annotated[ + Optional[str], + Field( + description="Include only projects with the name containing `name_pattern`.", + regex="^[a-zA-Z0-9-_]*$", + ), + ] = None + prev_created_at: Annotated[ + Optional[datetime], + Field( + description="Paginate projects by specifying `created_at` of the last (first) project in previous batch for descending (ascending)." + ), + ] = None + prev_id: Annotated[ + Optional[UUID], + Field( + description=( + "Paginate projects by specifying `id` of the last (first) project in previous batch for descending (ascending)." + " Must be used together with `prev_created_at`." + ) + ), + ] = None + limit: Annotated[ + int, Field(ge=0, le=2000, description="Limit number of projects returned.") + ] = 2000 + ascending: Annotated[ + bool, + Field( + description="Return projects sorted by `created_at` in ascending order. Defaults to descending." 
+ ), + ] = False + + class CreateProjectRequest(CoreModel): project_name: str + is_public: bool = False + templates_repo: Optional[str] = None + + +class UpdateProjectRequest(CoreModel): + is_public: Optional[bool] = None + templates_repo: Optional[str] = None + reset_templates_repo: bool = False class DeleteProjectsRequest(CoreModel): @@ -13,9 +65,20 @@ class DeleteProjectsRequest(CoreModel): class MemberSetting(CoreModel): - username: str + username: Annotated[ + str, + Field(description="The username or email of the user"), + ] project_role: ProjectRole class SetProjectMembersRequest(CoreModel): members: List[MemberSetting] + + +class AddProjectMemberRequest(CoreModel): + members: List[MemberSetting] + + +class RemoveProjectMemberRequest(CoreModel): + usernames: List[str] diff --git a/src/dstack/_internal/server/schemas/public_keys.py b/src/dstack/_internal/server/schemas/public_keys.py new file mode 100644 index 0000000000..97fcee11e4 --- /dev/null +++ b/src/dstack/_internal/server/schemas/public_keys.py @@ -0,0 +1,13 @@ +import uuid +from typing import Optional + +from dstack._internal.core.models.common import CoreModel + + +class AddPublicKeyRequest(CoreModel): + key: str + name: Optional[str] = None + + +class DeletePublicKeysRequest(CoreModel): + ids: list[uuid.UUID] diff --git a/src/dstack/_internal/server/schemas/repos.py b/src/dstack/_internal/server/schemas/repos.py index 6bef0294e5..b58a33d199 100644 --- a/src/dstack/_internal/server/schemas/repos.py +++ b/src/dstack/_internal/server/schemas/repos.py @@ -1,4 +1,6 @@ -from typing import List, Optional +from typing import Annotated, List, Optional + +from pydantic import Field from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.repos import AnyRepoInfo @@ -12,7 +14,10 @@ class GetRepoRequest(RepoRequest): class SaveRepoCredsRequest(RepoRequest): repo_info: AnyRepoInfo - repo_creds: Optional[RemoteRepoCreds] + repo_creds: Annotated[ + Optional[RemoteRepoCreds], + 
Field(description="The repo creds for accessing private remote repo"), + ] class DeleteReposRequest(CoreModel): diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py index a13a7371ba..c1ad0407d0 100644 --- a/src/dstack/_internal/server/schemas/runner.py +++ b/src/dstack/_internal/server/schemas/runner.py @@ -1,22 +1,36 @@ from base64 import b64decode +from enum import Enum from typing import Dict, List, Optional, Union from pydantic import Field, validator from typing_extensions import Annotated -from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.common import CoreModel, NetworkMode from dstack._internal.core.models.repos.remote import RemoteRepoCreds -from dstack._internal.core.models.runs import ClusterInfo, JobSpec, JobStatus, RunSpec -from dstack._internal.core.models.volumes import VolumeMountPoint +from dstack._internal.core.models.runs import ( + ClusterInfo, + ImagePullProgress, + JobSpec, + JobStatus, + JobSubmission, + Run, + RunSpec, +) +from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint +from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse class JobStateEvent(CoreModel): timestamp: int state: JobStatus + termination_reason: Optional[str] = None + termination_message: Optional[str] = None + exit_status: Optional[int] = None class LogEvent(CoreModel): - timestamp: int # nanoseconds + timestamp: int + """`timestamp` is stored in milliseconds.""" message: bytes @validator("message", pre=True) @@ -31,18 +45,28 @@ class PullResponse(CoreModel): job_logs: List[LogEvent] runner_logs: List[LogEvent] last_updated: int + no_connections_secs: Optional[int] = None + """`no_connections_secs` is optional for compatibility with old runners.""" + + +class JobInfoResponse(CoreModel): + working_dir: str + username: str class SubmitBody(CoreModel): - run_spec: Annotated[ - RunSpec, + run: Annotated[ + Run, Field( include={ 
- "run_name", - "repo_id", - "repo_data", - "configuration", - "configuration_path", + "id": True, + "run_spec": { + "run_name", + "repo_id", + "repo_data", + "configuration", + "configuration_path", + }, } ), ] @@ -53,18 +77,49 @@ class SubmitBody(CoreModel): "replica_num", "job_num", "jobs_per_replica", + "user", "commands", "entrypoint", "env", "gateway", + "single_branch", "max_duration", + "ssh_key", "working_dir", + "repo_dir", + "repo_data", + "repo_exists_action", + "file_archives", + } + ), + ] + job_submission: Annotated[ + JobSubmission, + Field( + include={ + "id", } ), ] cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)] secrets: Annotated[Optional[Dict[str, str]], Field(include=True)] repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)] + log_quota_hour: Annotated[Optional[int], Field(include=True)] = None + """Maximum bytes of log output per hour. None means unlimited.""" + # TODO: remove `run_spec` once instances deployed with 0.19.8 or earlier are no longer supported. 
+ run_spec: Annotated[ + RunSpec, + Field( + include={ + "run_name", + "repo_id", + "repo_data", + "configuration", + "configuration_path", + }, + ), + ] + """`run_spec` is deprecated in favor of `run.run_spec`.""" class HealthcheckResponse(CoreModel): @@ -72,26 +127,151 @@ class HealthcheckResponse(CoreModel): version: str +class InstanceHealthResponse(CoreModel): + dcgm: Optional[DCGMHealthResponse] = None + + +class ShutdownRequest(CoreModel): + force: bool + + +class ComponentName(str, Enum): + RUNNER = "dstack-runner" + SHIM = "dstack-shim" + + +class ComponentStatus(str, Enum): + NOT_INSTALLED = "not-installed" + INSTALLED = "installed" + INSTALLING = "installing" + ERROR = "error" + + +class ComponentInfo(CoreModel): + name: str + """`name` does not use `ComponentName` so newer shim versions remain compatible with the older server.""" + version: str + status: ComponentStatus + + +class ComponentListResponse(CoreModel): + components: list[ComponentInfo] + + +class ComponentInstallRequest(CoreModel): + name: ComponentName + url: str + + +class GPUMetrics(CoreModel): + gpu_memory_usage_bytes: int + gpu_util_percent: int + + +class MetricsResponse(CoreModel): + timestamp_micro: int + cpu_usage_micro: int + memory_usage_bytes: int + memory_working_set_bytes: int + gpus: List[GPUMetrics] + + class ShimVolumeInfo(CoreModel): + backend: str name: str volume_id: str init_fs: bool + device_name: Optional[str] = None + + +class PortMapping(CoreModel): + host: int + container: int + + +class TaskStatus(str, Enum): + PENDING = "pending" + PREPARING = "preparing" + PULLING = "pulling" + CREATING = "creating" + RUNNING = "running" + TERMINATED = "terminated" + + +class GPUDevice(CoreModel): + path_on_host: str + path_in_container: str + + +class TaskListItem(CoreModel): + id: str + status: TaskStatus + + +class TaskListResponse(CoreModel): + ids: Optional[list[str]] = None + """`ids` is returned by pre-0.19.26 shim versions.""" + tasks: Optional[list[TaskListItem]] = None 
+ """`tasks` is returned by shim versions 0.19.26 and newer.""" + + +class TaskInfoResponse(CoreModel): + id: str + status: TaskStatus + termination_reason: str + termination_message: str + ports: Optional[list[PortMapping]] = [] + """`ports` uses a default value for backward compatibility with 0.18.34. + It can be removed after a few releases. + """ + image_pull_progress: Optional[ImagePullProgress] = None + + +class TaskSubmitRequest(CoreModel): + id: str + name: str + registry_username: str + registry_password: str + image_name: str + container_user: str + privileged: bool + gpu: int + cpu: float + memory: int + shm_size: int + network_mode: NetworkMode + volumes: list[ShimVolumeInfo] + volume_mounts: list[VolumeMountPoint] + instance_mounts: list[InstanceMountPoint] + gpu_devices: list[GPUDevice] + host_ssh_user: str + host_ssh_keys: list[str] + container_ssh_keys: list[str] + + +class TaskTerminateRequest(CoreModel): + termination_reason: str + termination_message: str + timeout: int -class TaskConfigBody(CoreModel): +class LegacySubmitBody(CoreModel): username: str password: str image_name: str + privileged: bool container_name: str + container_user: str shm_size: int public_keys: List[str] ssh_user: str ssh_key: str mounts: List[VolumeMountPoint] volumes: List[ShimVolumeInfo] + instance_mounts: List[InstanceMountPoint] -class StopBody(CoreModel): +class LegacyStopBody(CoreModel): force: bool = False @@ -100,14 +280,6 @@ class JobResult(CoreModel): reason_message: str -class PullBody(CoreModel): +class LegacyPullResponse(CoreModel): state: str - executor_error: Optional[str] - container_name: Optional[str] - status: Optional[str] - running: Optional[bool] - oom_killed: Optional[bool] - dead: Optional[bool] - exit_code: Optional[int] - error: Optional[str] result: Optional[JobResult] diff --git a/src/dstack/_internal/server/schemas/runs.py b/src/dstack/_internal/server/schemas/runs.py index eec4cc322a..8447243715 100644 --- 
a/src/dstack/_internal/server/schemas/runs.py +++ b/src/dstack/_internal/server/schemas/runs.py @@ -1,60 +1,65 @@ from datetime import datetime -from typing import List, Optional +from typing import Annotated, List, Optional from uuid import UUID +from pydantic import Field + from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.instances import SSHKey -from dstack._internal.core.models.profiles import Profile -from dstack._internal.core.models.runs import Requirements, RunSpec +from dstack._internal.core.models.runs import ApplyRunPlanInput, RunSpec class ListRunsRequest(CoreModel): - project_name: Optional[str] - repo_id: Optional[str] - username: Optional[str] + project_name: Optional[str] = None + repo_id: Optional[str] = None + username: Optional[str] = None only_active: bool = False - prev_submitted_at: Optional[datetime] - prev_run_id: Optional[UUID] - limit: int = 1000 + include_jobs: bool = Field( + True, + description=("Whether to include `jobs` in the response"), + ) + job_submissions_limit: Optional[int] = Field( + None, + ge=0, + description=( + "Limit number of job submissions returned per job to avoid large responses." + "Drops older job submissions. 
No effect with `include_jobs: false`" + ), + ) + prev_submitted_at: Optional[datetime] = None + prev_run_id: Optional[UUID] = None + limit: int = Field(100, ge=0, le=100) ascending: bool = False class GetRunRequest(CoreModel): - run_name: str + run_name: Optional[str] = None + id: Optional[UUID] = None class GetRunPlanRequest(CoreModel): run_spec: RunSpec - - -class GetOffersRequest(CoreModel): - profile: Profile - requirements: Requirements - - -class CreateInstanceRequest(CoreModel): - profile: Profile - requirements: Requirements - - -class AddRemoteInstanceRequest(CoreModel): - pool_name: Optional[str] - instance_name: Optional[str] - instance_network: Optional[str] - region: Optional[str] - host: str - port: int - ssh_user: str - ssh_keys: List[SSHKey] + max_offers: Optional[int] = Field( + description="The maximum number of offers to return", ge=1, le=10000 + ) class SubmitRunRequest(CoreModel): run_spec: RunSpec +class ApplyRunPlanRequest(CoreModel): + plan: ApplyRunPlanInput + force: Annotated[ + bool, + Field( + description="Use `force: true` to apply even if the expected resource does not match." 
+ ), + ] + + class StopRunsRequest(CoreModel): runs_names: List[str] - abort: bool + abort: Annotated[bool, Field(description="Do not wait for a graceful shutdown.")] class DeleteRunsRequest(CoreModel): diff --git a/src/dstack/_internal/server/schemas/secrets.py b/src/dstack/_internal/server/schemas/secrets.py index 769c87052c..a8d78ea071 100644 --- a/src/dstack/_internal/server/schemas/secrets.py +++ b/src/dstack/_internal/server/schemas/secrets.py @@ -1,20 +1,16 @@ from typing import List -from dstack._internal.core.models.secrets import Secret -from dstack._internal.server.schemas.common import RepoRequest +from dstack._internal.core.models.common import CoreModel -class ListSecretsRequest(RepoRequest): - pass +class GetSecretRequest(CoreModel): + name: str -class GetSecretsRequest(RepoRequest): - pass +class CreateOrUpdateSecretRequest(CoreModel): + name: str + value: str -class AddSecretRequest(RepoRequest): - secret: Secret - - -class DeleteSecretsRequest(RepoRequest): +class DeleteSecretsRequest(CoreModel): secrets_names: List[str] diff --git a/src/dstack/_internal/server/schemas/sshproxy.py b/src/dstack/_internal/server/schemas/sshproxy.py new file mode 100644 index 0000000000..10c9297d88 --- /dev/null +++ b/src/dstack/_internal/server/schemas/sshproxy.py @@ -0,0 +1,27 @@ +from typing import Annotated + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class GetUpstreamRequest(CoreModel): + # The format of id is intentionally not limited to UUID to allow further extensions + id: str + + +class UpstreamHost(CoreModel): + host: Annotated[str, Field(description="The hostname or IP address")] + port: Annotated[int, Field(description="The SSH port")] + user: Annotated[str, Field(description="The user to log in")] + private_key: Annotated[str, Field(description="The private key in OpenSSH file format")] + + +class GetUpstreamResponse(CoreModel): + hosts: Annotated[ + list[UpstreamHost], + Field(description="The chain of 
SSH hosts, the jump host(s) first, the target host last"), + ] + authorized_keys: Annotated[ + list[str], Field(description="The list of authorized public keys in OpenSSH file format") + ] diff --git a/src/dstack/_internal/server/schemas/users.py b/src/dstack/_internal/server/schemas/users.py index 1298bd0ed2..574d5b093e 100644 --- a/src/dstack/_internal/server/schemas/users.py +++ b/src/dstack/_internal/server/schemas/users.py @@ -1,9 +1,55 @@ -from typing import List, Optional +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID + +from pydantic import Field from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.users import GlobalRole +class ListUsersRequest(CoreModel): + return_total_count: Annotated[ + bool, Field(description="Return `total_count` with the total number of users.") + ] = False + name_pattern: Annotated[ + Optional[str], + Field( + description="Include only users with the name containing `name_pattern`.", + regex="^[a-zA-Z0-9-_]*$", + ), + ] = None + prev_created_at: Annotated[ + Optional[datetime], + Field( + description=( + "Paginate users by specifying `created_at` of the last (first) user in previous " + "batch for descending (ascending)." + ) + ), + ] = None + prev_id: Annotated[ + Optional[UUID], + Field( + description=( + "Paginate users by specifying `id` of the last (first) user in previous batch " + "for descending (ascending). Must be used together with `prev_created_at`." + ) + ), + ] = None + limit: Annotated[int, Field(ge=0, le=2000, description="Limit number of users returned.")] = ( + 2000 + ) + ascending: Annotated[ + bool, + Field( + description=( + "Return users sorted by `created_at` in ascending order. Defaults to descending." 
+ ) + ), + ] = False + + class GetUserRequest(CoreModel): username: str @@ -12,6 +58,7 @@ class CreateUserRequest(CoreModel): username: str global_role: GlobalRole email: Optional[str] + active: bool = True UpdateUserRequest = CreateUserRequest diff --git a/src/dstack/_internal/server/schemas/volumes.py b/src/dstack/_internal/server/schemas/volumes.py index 1ca82467c7..ff1b106f9f 100644 --- a/src/dstack/_internal/server/schemas/volumes.py +++ b/src/dstack/_internal/server/schemas/volumes.py @@ -1,7 +1,20 @@ -from typing import List +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID + +from pydantic import Field from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.volumes import VolumeConfiguration +from dstack._internal.core.models.volumes import AnyVolumeConfiguration + + +class ListVolumesRequest(CoreModel): + project_name: Optional[str] + only_active: bool = False + prev_created_at: Optional[datetime] + prev_id: Optional[UUID] + limit: int = Field(100, ge=0, le=100) + ascending: bool = False class GetVolumeRequest(CoreModel): @@ -9,7 +22,7 @@ class GetVolumeRequest(CoreModel): class CreateVolumeRequest(CoreModel): - configuration: VolumeConfiguration + configuration: Annotated[AnyVolumeConfiguration, Field(discriminator="backend")] class DeleteVolumesRequest(CoreModel): diff --git a/src/dstack/_internal/server/security/permissions.py b/src/dstack/_internal/server/security/permissions.py index b5555bc107..6a4269a256 100644 --- a/src/dstack/_internal/server/security/permissions.py +++ b/src/dstack/_internal/server/security/permissions.py @@ -1,20 +1,37 @@ -from typing import Tuple +from secrets import compare_digest +from typing import Annotated, Optional, Tuple +from uuid import UUID -from fastapi import Depends, Security +from fastapi import Depends, HTTPException, Security from fastapi.security import HTTPBearer from fastapi.security.http import HTTPAuthorizationCredentials 
+from sqlalchemy import exists, func, select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.users import GlobalRole, ProjectRole from dstack._internal.server.db import get_session -from dstack._internal.server.models import ProjectModel, UserModel -from dstack._internal.server.services.projects import get_project_model_by_name -from dstack._internal.server.services.users import get_user_model_by_token +from dstack._internal.server.models import ( + ExportedFleetModel, + ExportedGatewayModel, + FleetModel, + GatewayModel, + ImportModel, + InstanceModel, + MemberModel, + ProjectModel, + UserModel, +) +from dstack._internal.server.services.projects import ( + get_project_model_by_name, + get_user_project_role, +) +from dstack._internal.server.services.users import log_in_with_token from dstack._internal.server.utils.routers import ( error_forbidden, error_invalid_token, error_not_found, ) +from dstack._internal.utils.common import EntityName, EntityNameOrID class Authenticated: @@ -23,7 +40,7 @@ async def __call__( session: AsyncSession = Depends(get_session), token: HTTPAuthorizationCredentials = Security(HTTPBearer()), ) -> UserModel: - user = await get_user_model_by_token(session=session, token=token.credentials) + user = await log_in_with_token(session=session, token=token.credentials) if user is None: raise error_invalid_token() return user @@ -35,7 +52,7 @@ async def __call__( session: AsyncSession = Depends(get_session), token: HTTPAuthorizationCredentials = Security(HTTPBearer()), ) -> UserModel: - user = await get_user_model_by_token(session=session, token=token.credentials) + user = await log_in_with_token(session=session, token=token.credentials) if user is None: raise error_invalid_token() if user.global_role == GlobalRole.ADMIN: @@ -50,20 +67,45 @@ async def __call__( session: AsyncSession = Depends(get_session), token: HTTPAuthorizationCredentials = Security(HTTPBearer()), ) -> Tuple[UserModel, ProjectModel]: - user = 
await get_user_model_by_token(session=session, token=token.credentials) + user = await log_in_with_token(session=session, token=token.credentials) if user is None: raise error_invalid_token() project = await get_project_model_by_name(session=session, project_name=project_name) if project is None: - raise error_forbidden() + raise error_not_found() if user.global_role == GlobalRole.ADMIN: return user, project - for member in project.members: - if member.user_id == user.id: - if member.project_role == ProjectRole.ADMIN: - return user, project - else: - raise error_forbidden() + project_role = get_user_project_role(user=user, project=project) + if project_role == ProjectRole.ADMIN: + return user, project + raise error_forbidden() + + +class ProjectManager: + """ + Allows project admins and managers to manage projects. + """ + + async def __call__( + self, + project_name: str, + session: AsyncSession = Depends(get_session), + token: HTTPAuthorizationCredentials = Security(HTTPBearer()), + ) -> Tuple[UserModel, ProjectModel]: + user = await log_in_with_token(session=session, token=token.credentials) + if user is None: + raise error_invalid_token() + project = await get_project_model_by_name(session=session, project_name=project_name) + if project is None: + raise error_not_found() + + if user.global_role == GlobalRole.ADMIN: + return user, project + + project_role = get_user_project_role(user=user, project=project) + if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]: + return user, project + raise error_forbidden() @@ -75,15 +117,249 @@ async def __call__( project_name: str, token: HTTPAuthorizationCredentials = Security(HTTPBearer()), ) -> Tuple[UserModel, ProjectModel]: - user = await get_user_model_by_token(session=session, token=token.credentials) + return await get_project_member(session, project_name, token.credentials) + + +class ProjectMemberOrPublicAccess: + """ + Allows access to project for: + - Global admins + - Project members + - Any 
authenticated user if the project is public + """ + + async def __call__( + self, + *, + session: AsyncSession = Depends(get_session), + project_name: str, + token: HTTPAuthorizationCredentials = Security(HTTPBearer()), + ) -> Tuple[UserModel, ProjectModel]: + user = await log_in_with_token(session=session, token=token.credentials) + if user is None: + raise error_invalid_token() + + project = await get_project_model_by_name(session=session, project_name=project_name) + if project is None: + raise error_not_found() + + if user.global_role == GlobalRole.ADMIN: + return user, project + + project_role = get_user_project_role(user=user, project=project) + if project_role is not None: + return user, project + + if project.is_public: + return user, project + + raise error_forbidden() + + +class ProjectManagerOrPublicProject: + """ + Allows: + 1. Project managers to perform member management operations + 2. Access to public projects for any authenticated user + """ + + def __init__(self): + self.project_manager = ProjectManager() + + async def __call__( + self, + project_name: str, + session: AsyncSession = Depends(get_session), + token: HTTPAuthorizationCredentials = Security(HTTPBearer()), + ) -> Tuple[UserModel, ProjectModel]: + user = await log_in_with_token(session=session, token=token.credentials) + if user is None: + raise error_invalid_token() + project = await get_project_model_by_name(session=session, project_name=project_name) + if project is None: + raise error_not_found() + + if user.global_role == GlobalRole.ADMIN: + return user, project + + project_role = get_user_project_role(user=user, project=project) + if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]: + return user, project + + if project.is_public: + return user, project + + raise error_forbidden() + + +class ProjectManagerOrSelfLeave: + """ + Allows: + 1. Project managers to remove any members + 2. 
Any project member to leave (remove themselves) + """ + + async def __call__( + self, + project_name: str, + session: AsyncSession = Depends(get_session), + token: HTTPAuthorizationCredentials = Security(HTTPBearer()), + ) -> Tuple[UserModel, ProjectModel]: + user = await log_in_with_token(session=session, token=token.credentials) if user is None: raise error_invalid_token() project = await get_project_model_by_name(session=session, project_name=project_name) if project is None: raise error_not_found() + if user.global_role == GlobalRole.ADMIN: return user, project - for member in project.members: - if member.user_id == user.id: - return user, project + + project_role = get_user_project_role(user=user, project=project) + if project_role is not None: + return user, project + + raise error_forbidden() + + +class ServiceAccount: + def __init__(self, token: str) -> None: + self._token = token.encode() + + async def __call__( + self, token: Annotated[HTTPAuthorizationCredentials, Security(HTTPBearer())] + ) -> None: + if not compare_digest(token.credentials.encode(), self._token): + raise error_invalid_token() + + +class OptionalServiceAccount(ServiceAccount): + _token: Optional[bytes] = None + + def __init__(self, token: Optional[str]) -> None: + if token is not None: + super().__init__(token) + + async def __call__( + self, + token: Annotated[ + Optional[HTTPAuthorizationCredentials], Security(HTTPBearer(auto_error=False)) + ], + ) -> None: + if self._token is None: + return + if token is None: + raise error_forbidden() + await super().__call__(token) + + +class AlwaysForbidden: + async def __call__(self) -> None: + raise error_forbidden() + + +async def get_project_member( + session: AsyncSession, project_name: str, token: str +) -> Tuple[UserModel, ProjectModel]: + user = await log_in_with_token(session=session, token=token) + if user is None: + raise error_invalid_token() + project = await get_project_model_by_name(session=session, project_name=project_name) + if 
project is None: + raise error_not_found() + if user.global_role == GlobalRole.ADMIN: + return user, project + project_role = get_user_project_role(user=user, project=project) + if project_role is not None: + return user, project + raise error_forbidden() + + +async def is_project_member(session: AsyncSession, project_name: str, token: str) -> bool: + try: + await get_project_member(session, project_name, token) + return True + except HTTPException: + return False + + +async def check_can_access_fleet( + session: AsyncSession, + user: UserModel, + fleet_project: ProjectModel, + fleet_name_or_id: EntityNameOrID, +) -> None: + if ( + user.global_role == GlobalRole.ADMIN + or get_user_project_role(user=user, project=fleet_project) is not None + ): + return + filters = [ + FleetModel.project_id == fleet_project.id, + exists().where( + MemberModel.user_id == user.id, + MemberModel.project_id == ImportModel.project_id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == FleetModel.id, + ), + ] + if isinstance(fleet_name_or_id, EntityName): + filters.extend([FleetModel.name == fleet_name_or_id.name, FleetModel.deleted == False]) + else: + filters.append(FleetModel.id == fleet_name_or_id.id) + res = await session.execute(select(func.count()).select_from(FleetModel).where(*filters)) + if res.scalar_one() == 0: + raise error_forbidden() + + +async def check_can_access_gateway( + session: AsyncSession, + user: UserModel, + gateway_project: ProjectModel, + gateway_name: str, +) -> None: + if ( + user.global_role == GlobalRole.ADMIN + or gateway_project.is_public + or get_user_project_role(user=user, project=gateway_project) is not None + ): + return + filters = [ + GatewayModel.project_id == gateway_project.id, + GatewayModel.name == gateway_name, + exists().where( + MemberModel.user_id == user.id, + MemberModel.project_id == ImportModel.project_id, + ImportModel.export_id == ExportedGatewayModel.export_id, + 
ExportedGatewayModel.gateway_id == GatewayModel.id, + ), + ] + res = await session.execute(select(func.count()).select_from(GatewayModel).where(*filters)) + if res.scalar_one() == 0: + raise error_forbidden() + + +async def check_can_access_instance( + session: AsyncSession, + user: UserModel, + instance_project: ProjectModel, + instance_id: UUID, +) -> None: + if ( + user.global_role == GlobalRole.ADMIN + or get_user_project_role(user=user, project=instance_project) is not None + ): + return + filters = [ + InstanceModel.project_id == instance_project.id, + InstanceModel.id == instance_id, + exists().where( + MemberModel.user_id == user.id, + MemberModel.project_id == ImportModel.project_id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ), + ] + res = await session.execute(select(func.count()).select_from(InstanceModel).where(*filters)) + if res.scalar_one() == 0: raise error_forbidden() diff --git a/src/dstack/_internal/server/services/auth.py b/src/dstack/_internal/server/services/auth.py new file mode 100644 index 0000000000..8ea40994f3 --- /dev/null +++ b/src/dstack/_internal/server/services/auth.py @@ -0,0 +1,77 @@ +import secrets +import urllib.parse +from base64 import b64decode, b64encode +from typing import Optional + +from fastapi import Request, Response + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.auth import OAuthProviderInfo, OAuthState +from dstack._internal.server import settings +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +_OAUTH_STATE_COOKIE_KEY = "oauth-state" + +_OAUTH_PROVIDERS: list[OAuthProviderInfo] = [] + + +def register_provider(provider_info: OAuthProviderInfo): + """ + Registers an OAuth2 provider supported on the server. + If the provider is supported but not configured, it should be registered with `enabled=False`. 
+ The provider must register endpoints `/api/auth/{provider}/authorize` and `/api/auth/{provider}/callback` + as defined by the client (see `dstack.api.server._auth.AuthAPIClient`). + """ + _OAUTH_PROVIDERS.append(provider_info) + + +def list_providers() -> list[OAuthProviderInfo]: + return _OAUTH_PROVIDERS + + +def generate_oauth_state(local_port: Optional[int] = None) -> str: + value = str(secrets.token_hex(16)) + state = OAuthState(value=value, local_port=local_port) + return b64encode(state.json().encode()).decode() + + +def set_state_cookie(response: Response, state: str): + response.set_cookie( + key=_OAUTH_STATE_COOKIE_KEY, + value=state, + secure=settings.SERVER_URL.startswith("https://"), + samesite="strict", + httponly=True, + ) + + +def get_validated_state(request: Request, state: str) -> OAuthState: + state_cookie = request.cookies.get(_OAUTH_STATE_COOKIE_KEY) + if state != state_cookie: + raise ServerClientError("Invalid state token") + decoded_state = _decode_state(state) + if decoded_state is None: + raise ServerClientError("Invalid state token") + return decoded_state + + +def get_next_redirect_url(code: str, state: str) -> Optional[str]: + decoded_state = _decode_state(state) + if decoded_state is None: + raise ServerClientError("Invalid state token") + if decoded_state.local_port is None: + return None + params = {"code": code, "state": state} + redirect_url = f"http://localhost:{decoded_state.local_port}/auth/callback?{urllib.parse.urlencode(params)}" + return redirect_url + + +def _decode_state(state: str) -> Optional[OAuthState]: + try: + return OAuthState.parse_raw(b64decode(state, validate=True).decode()) + except Exception as e: + logger.debug("Exception when decoding OAuth2 state parameter: %s", repr(e)) + return None diff --git a/src/dstack/_internal/server/services/backends/__init__.py b/src/dstack/_internal/server/services/backends/__init__.py index 837ebdb375..f118261796 100644 --- 
a/src/dstack/_internal/server/services/backends/__init__.py +++ b/src/dstack/_internal/server/services/backends/__init__.py @@ -1,202 +1,101 @@ import asyncio -import heapq -from typing import Callable, Coroutine, List, Optional, Tuple, Type, Union +import json +import time +from collections.abc import Iterable, Iterator +from typing import Callable, Coroutine, Dict, List, Optional, Tuple from uuid import UUID +from cachetools import TTLCache +from pydantic import Field, ValidationError from sqlalchemy import delete, update from sqlalchemy.ext.asyncio import AsyncSession +from typing_extensions import Annotated -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.local import LocalBackend +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.base.configurator import ( + Configurator, + StoredBackendRecord, +) +from dstack._internal.core.backends.configurators import ( + get_configurator, + list_available_backend_types, +) +from dstack._internal.core.backends.models import ( + AnyBackendConfigWithCreds, + AnyBackendConfigWithoutCreds, +) from dstack._internal.core.errors import ( + BackendAuthError, BackendError, BackendInvalidCredentialsError, BackendNotAvailable, ResourceExistsError, + ResourceNotExistsError, ServerClientError, ) -from dstack._internal.core.models.backends import ( - AnyConfigInfoWithCreds, - AnyConfigInfoWithCredsPartial, - AnyConfigValues, -) from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.instances import ( InstanceOfferWithAvailability, ) from dstack._internal.core.models.runs import Requirements -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import Configurator -from dstack._internal.server.settings import LOCAL_BACKEND_ENABLED -from 
dstack._internal.server.utils.common import run_async +from dstack._internal.server import settings +from dstack._internal.server.models import BackendModel, DecryptedString, ProjectModel +from dstack._internal.server.services.offers import merge_offer_iterables +from dstack._internal.utils.common import run_async from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) -_CONFIGURATOR_CLASSES: List[Type[Configurator]] = [] - -try: - from dstack._internal.server.services.backends.configurators.aws import AWSConfigurator - - _CONFIGURATOR_CLASSES.append(AWSConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.azure import AzureConfigurator - - _CONFIGURATOR_CLASSES.append(AzureConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.cudo import ( - CudoConfigurator, - ) - - _CONFIGURATOR_CLASSES.append(CudoConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.datacrunch import ( - DataCrunchConfigurator, - ) - - _CONFIGURATOR_CLASSES.append(DataCrunchConfigurator) -except ImportError: - pass -try: - from dstack._internal.server.services.backends.configurators.gcp import GCPConfigurator +class _BackendConfigWithCreds(CoreModel): + __root__: Annotated[AnyBackendConfigWithCreds, Field(..., discriminator="type")] - _CONFIGURATOR_CLASSES.append(GCPConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.kubernetes import ( - KubernetesConfigurator, - ) - - _CONFIGURATOR_CLASSES.append(KubernetesConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.lambdalabs import ( - LambdaConfigurator, - ) - - _CONFIGURATOR_CLASSES.append(LambdaConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.nebius import 
NebiusConfigurator - - _CONFIGURATOR_CLASSES.append(NebiusConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.oci import OCIConfigurator - - _CONFIGURATOR_CLASSES.append(OCIConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.runpod import RunpodConfigurator - - _CONFIGURATOR_CLASSES.append(RunpodConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.tensordock import ( - TensorDockConfigurator, - ) - _CONFIGURATOR_CLASSES.append(TensorDockConfigurator) -except ImportError: - pass - -try: - from dstack._internal.server.services.backends.configurators.vastai import VastAIConfigurator - - _CONFIGURATOR_CLASSES.append(VastAIConfigurator) -except ImportError: - pass - - -_BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP = {c.TYPE: c for c in _CONFIGURATOR_CLASSES} - - -def register_configurator(configurator: Type[Configurator]): - _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP[configurator.TYPE] = configurator - - -def get_configurator(backend_type: Union[BackendType, str]) -> Optional[Configurator]: - backend_type = BackendType(backend_type) - configurator_class = _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP.get(backend_type) - if configurator_class is None: - return None - return configurator_class() - - -def list_available_backend_types() -> List[BackendType]: - available_backend_types = [] - for configurator_class in _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP.values(): - available_backend_types.append(configurator_class.TYPE) - return available_backend_types - - -async def get_backend_config_values( - config: AnyConfigInfoWithCredsPartial, -) -> AnyConfigValues: - configurator = get_configurator(config.type) - if configurator is None: - raise BackendNotAvailable() - config_values = await run_async(configurator.get_config_values, config) - return config_values +def serialize_source_backend_config( + config: 
AnyBackendConfigWithCreds, +) -> Tuple[str, Optional[str]]: + """Split user-intent backend config into non-sensitive and sensitive JSON blobs.""" + source_config_dict = config.dict() + source_auth = source_config_dict.pop("creds", None) + source_auth_json = None if source_auth is None else json.dumps(source_auth) + return json.dumps(source_config_dict), source_auth_json async def create_backend( session: AsyncSession, project: ProjectModel, - config: AnyConfigInfoWithCreds, -) -> AnyConfigInfoWithCreds: + config: AnyBackendConfigWithCreds, +) -> AnyBackendConfigWithCreds: configurator = get_configurator(config.type) if configurator is None: raise BackendNotAvailable() backend = await get_project_backend_by_type(project=project, backend_type=configurator.TYPE) if backend is not None: raise ResourceExistsError() - await run_async(configurator.get_config_values, config) - backend = await run_async(configurator.create_backend, project=project, config=config) + backend = await validate_and_create_backend_model( + project=project, configurator=configurator, config=config + ) session.add(backend) await session.commit() - clear_backend_cache(project.id) return config async def update_backend( session: AsyncSession, project: ProjectModel, - config: AnyConfigInfoWithCreds, -) -> AnyConfigInfoWithCreds: + config: AnyBackendConfigWithCreds, +) -> AnyBackendConfigWithCreds: configurator = get_configurator(config.type) if configurator is None: raise BackendNotAvailable() - config_info = await get_config_info( - project=project, - backend_type=configurator.TYPE, + backend_exists = any(configurator.TYPE == b.type for b in project.backends) + if not backend_exists: + raise ResourceNotExistsError() + backend = await validate_and_create_backend_model( + project=project, configurator=configurator, config=config ) - if config_info is None: - raise ServerClientError("Backend does not exist") - await run_async(configurator.get_config_values, config) - backend = await 
run_async(configurator.create_backend, project=project, config=config) + # FIXME: potentially long write transaction await session.execute( update(BackendModel) .where( @@ -206,25 +105,137 @@ async def update_backend( .values( config=backend.config, auth=backend.auth, + source_config=backend.source_config, + source_auth=backend.source_auth, ) ) - clear_backend_cache(project.id) return config -async def get_config_info( +async def validate_and_create_backend_model( + project: ProjectModel, + configurator: Configurator, + config: AnyBackendConfigWithCreds, +) -> BackendModel: + # Configurators may mutate `config` while building the effective stored backend config, + # so capture the user-intent payload before validation/create_backend runs. + source_config, source_auth = serialize_source_backend_config(config) + await run_async( + configurator.validate_config, config, default_creds_enabled=settings.DEFAULT_CREDS_ENABLED + ) + backend_record = await run_async( + configurator.create_backend, + project_name=project.name, + config=config, + ) + return BackendModel( + project_id=project.id, + type=configurator.TYPE, + config=backend_record.config, + auth=DecryptedString(plaintext=backend_record.auth), + source_config=source_config, + source_auth=None if source_auth is None else DecryptedString(plaintext=source_auth), + ) + + +async def get_backend_config( project: ProjectModel, backend_type: BackendType, -) -> Optional[AnyConfigInfoWithCreds]: +) -> Optional[AnyBackendConfigWithCreds]: configurator = get_configurator(backend_type) if configurator is None: raise BackendNotAvailable() - for b in project.backends: - if b.type == backend_type: - return configurator.get_config_info(b, include_creds=True) + for backend_model in project.backends: + if not backend_model.auth.decrypted: + logger.warning( + "Failed to decrypt creds for %s backend. 
Backend will be ignored.", + backend_model.type.value, + ) + continue + if backend_model.type == backend_type: + return get_backend_config_with_creds_from_backend_model(configurator, backend_model) return None +async def get_source_backend_config( + project: ProjectModel, + backend_type: BackendType, +) -> Optional[AnyBackendConfigWithCreds]: + backend_model = await get_project_backend_model_by_type(project, backend_type) + if backend_model is None: + return None + return get_source_backend_config_from_backend_model(backend_model) + + +def get_backend_config_with_creds_from_backend_model( + configurator: Configurator, + backend_model: BackendModel, +) -> AnyBackendConfigWithCreds: + backend_record = get_stored_backend_record(backend_model) + backend_config = configurator.get_backend_config_with_creds(backend_record) + return backend_config + + +def get_backend_config_without_creds_from_backend_model( + configurator: Configurator, + backend_model: BackendModel, +) -> AnyBackendConfigWithoutCreds: + backend_record = get_stored_backend_record(backend_model) + backend_config = configurator.get_backend_config_without_creds(backend_record) + return backend_config + + +def get_source_backend_config_from_backend_model( + backend_model: BackendModel, +) -> Optional[AnyBackendConfigWithCreds]: + """Reconstruct user-intent backend config from `source_config`/`source_auth`.""" + + if backend_model.source_config is None: + return None + try: + source_config_dict = json.loads(backend_model.source_config) + except ValueError: + logger.warning( + "Failed to parse source config for %s backend. Falling back to stored config.", + backend_model.type.value, + ) + return None + if backend_model.source_auth is not None: + if not backend_model.source_auth.decrypted: + logger.warning( + "Failed to decrypt source creds for %s backend. 
Falling back to stored config.", + backend_model.type.value, + ) + return None + try: + source_config_dict["creds"] = json.loads( + backend_model.source_auth.get_plaintext_or_error() + ) + except ValueError: + logger.warning( + "Failed to parse source creds for %s backend. Falling back to stored config.", + backend_model.type.value, + ) + return None + try: + return _BackendConfigWithCreds.parse_obj(source_config_dict).__root__ + except ValidationError: + logger.warning( + "Failed to validate source config for %s backend. Falling back to stored config.", + backend_model.type.value, + ) + return None + + +def get_stored_backend_record(backend_model: BackendModel) -> StoredBackendRecord: + return StoredBackendRecord( + config=backend_model.config, + auth=backend_model.auth.get_plaintext_or_error(), + project_id=backend_model.project_id, + backend_id=backend_model.id, + ) + + async def delete_backends( session: AsyncSession, project: ProjectModel, @@ -232,46 +243,111 @@ async def delete_backends( ): if BackendType.DSTACK in backends_types: raise ServerClientError("Cannot delete dstack backend") + current_backends_types = set(b.type for b in project.backends) + deleted_backends_types = current_backends_types.intersection(backends_types) + if len(deleted_backends_types) == 0: + return + # FIXME: potentially long write transaction + # Not urgent since backend deletion is a rare operation await session.execute( delete(BackendModel).where( - BackendModel.type.in_(backends_types), + BackendModel.type.in_(deleted_backends_types), BackendModel.project_id == project.id, ) ) - clear_backend_cache(project.id) - + logger.info( + "Deleted backends %s in project %s", + [b.value for b in deleted_backends_types], + project.name, + ) -_BACKENDS_CACHE = {} BackendTuple = Tuple[BackendModel, Backend] -async def get_project_backends_with_models(project: ProjectModel) -> List[BackendTuple]: - key = project.id - backends = _BACKENDS_CACHE.get(key) - if backends is not None: - return 
backends +_BACKENDS_CACHE_LOCKS: Dict[UUID, asyncio.Lock] = {} +_BACKENDS_CACHE = TTLCache[UUID, Dict[BackendType, BackendTuple]](maxsize=1000, ttl=300) - backends = [] - for backend_model in project.backends: - configurator = get_configurator(backend_model.type) - if configurator is None: - logger.warning( - "Missing dependencies for %s backend. Backend will be ignored.", backend_model.type - ) - continue - try: - backend = await run_async(configurator.get_backend, backend_model) - except BackendInvalidCredentialsError: - logger.warning( - "Credentials for %s backend are invalid. Backend will be ignored.", - backend_model.type, + +def _get_project_cache_lock(project_id: UUID) -> asyncio.Lock: + return _BACKENDS_CACHE_LOCKS.setdefault(project_id, asyncio.Lock()) + + +async def get_project_backends_with_models(project: ProjectModel) -> List[BackendTuple]: + async with _get_project_cache_lock(project.id): + key = project.id + project_backends = _BACKENDS_CACHE.get(key, {}) + to_init: List[Tuple[BackendModel, Configurator, StoredBackendRecord]] = [] + for backend_model in project.backends: + cached_backend = project_backends.get(backend_model.type) + if ( + cached_backend is not None + and cached_backend[0].config == backend_model.config + and cached_backend[0].auth == backend_model.auth + ): + continue + configurator = get_configurator(backend_model.type) + if configurator is None: + logger.warning( + "Missing dependencies for %s backend. Backend will be ignored.", + backend_model.type.value, + ) + continue + if not backend_model.auth.decrypted: + logger.warning( + "Failed to decrypt creds for %s backend. 
Backend will be ignored.", + backend_model.type.value, + ) + continue + backend_record = get_stored_backend_record(backend_model) + to_init.append((backend_model, configurator, backend_record)) + + if to_init: + t0 = time.time() + tasks = [ + _get_backend_tracked(configurator, backend_record) + for _, configurator, backend_record in to_init + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + initialized_results = [] + for (backend_model, _, _), result in zip(to_init, results): + if isinstance(result, BaseException): + if isinstance(result, (BackendInvalidCredentialsError, BackendAuthError)): + logger.warning( + "Credentials for %s backend are invalid. Backend will be ignored.", + backend_model.type.value, + ) + else: + logger.error( + "Failed to initialize %s backend. Backend will be ignored.", + backend_model.type.value, + exc_info=result, + ) + else: + backend, duration = result + project_backends[backend_model.type] = (backend_model, backend) + initialized_results.append(f"{backend_model.type.value}={duration:.1f}s") + logger.debug( + "Initialized %d backends in %.1fs: %s", + len(initialized_results), + time.time() - t0, + ", ".join(initialized_results), ) - continue - backends.append((backend_model, backend)) - _BACKENDS_CACHE[key] = backends - return _BACKENDS_CACHE[key] + # `__setitem__()` will also expire the cache. + # Note that there is no global cache lock so a race condition is possible: + # one coroutine updates/re-assigns backends expired by another coroutine. + # This is ok since the only effect is that project's cache gets restored. 
+ _BACKENDS_CACHE[key] = project_backends + return list(project_backends.values()) + + +async def _get_backend_tracked( + configurator: Configurator, backend_record: StoredBackendRecord +) -> Tuple[Backend, float]: + t = time.time() + backend = await run_async(configurator.get_backend, backend_record) + return backend, time.time() - t _get_project_backend_with_model_by_type = None @@ -322,8 +398,6 @@ async def get_project_backend_with_model_by_type_or_error( async def get_project_backends(project: ProjectModel) -> List[Backend]: backends_with_models = await get_project_backends_with_models(project) backends = [b for _, b in backends_with_models] - if LOCAL_BACKEND_ENABLED: - backends.append(LocalBackend()) return backends @@ -357,11 +431,6 @@ async def get_project_backend_by_type_or_error( return backend -def clear_backend_cache(project_id: UUID): - if project_id in _BACKENDS_CACHE: - del _BACKENDS_CACHE[project_id] - - async def get_project_backend_model_by_type( project: ProjectModel, backend_type: BackendType ) -> Optional[BackendModel]: @@ -382,14 +451,26 @@ async def get_project_backend_model_by_type_or_error( return backend_model -async def get_instance_offers( - backends: List[Backend], requirements: Requirements, exclude_not_available: bool = False -) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: +async def get_backend_offers( + backends: List[Backend], + requirements: Requirements, + exclude_not_available: bool = False, +) -> Iterable[Tuple[Backend, InstanceOfferWithAvailability]]: """ - Returns list of instances satisfying minimal resource requirements sorted by price + Yields backend offers satisfying `requirements` sorted by price. 
""" - tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends] - offers_by_backend = [] + + def get_filtered_offers_with_backends( + backend: Backend, + offers: Iterable[InstanceOfferWithAvailability], + ) -> Iterator[Tuple[Backend, InstanceOfferWithAvailability]]: + for offer in offers: + if not exclude_not_available or offer.availability.is_available(): + yield (backend, offer) + + logger.debug("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends]) + tasks = [run_async(get_offers_tracked, backend, requirements) for backend in backends] + offers_by_backend: list[Iterable[tuple[Backend, InstanceOfferWithAvailability]]] = [] for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)): if isinstance(result, BackendError): logger.warning( @@ -405,14 +486,24 @@ async def get_instance_offers( exc_info=result, ) continue - offers_by_backend.append( - [ - (backend, offer) - for offer in result - if not exclude_not_available or offer.availability.is_available() - ] + offers_by_backend.append(get_filtered_offers_with_backends(backend, result)) + return merge_offer_iterables(*offers_by_backend) + + +def check_backend_type_available(backend_type: BackendType): + if backend_type not in list_available_backend_types(): + raise BackendNotAvailable( + f"Backend {backend_type.value} not available." + " Ensure that backend dependencies are installed." + f" Available backends: {[b.value for b in list_available_backend_types()]}." 
) - # Merge preserving order for every backend - offers = heapq.merge(*offers_by_backend, key=lambda i: i[1].price) - # Put NOT_AVAILABLE, NO_QUOTA, and BUSY instances at the end, do not sort by price - return sorted(offers, key=lambda i: not i[1].availability.is_available()) + + +def get_offers_tracked( + backend: Backend, requirements: Requirements +) -> Iterator[InstanceOfferWithAvailability]: + start = time.time() + res = backend.compute().get_offers(requirements) + duration = time.time() - start + logger.debug("Got offers from %s in %.6fs", backend.TYPE.value, duration) + return res diff --git a/src/dstack/_internal/server/services/backends/configurators/aws.py b/src/dstack/_internal/server/services/backends/configurators/aws.py deleted file mode 100644 index eb6f7467a8..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/aws.py +++ /dev/null @@ -1,178 +0,0 @@ -import concurrent.futures -import json -from typing import List - -from boto3.session import Session - -from dstack._internal.core.backends.aws import AWSBackend, auth, compute -from dstack._internal.core.backends.aws.config import AWSConfig -from dstack._internal.core.errors import BackendAuthError, ComputeError, ServerClientError -from dstack._internal.core.models.backends.aws import ( - AnyAWSConfigInfo, - AWSAccessKeyCreds, - AWSConfigInfo, - AWSConfigInfoWithCreds, - AWSConfigInfoWithCredsPartial, - AWSConfigValues, - AWSCreds, - AWSDefaultCreds, - AWSStoredConfig, -) -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.common import is_core_model_instance -from dstack._internal.server import settings -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -REGIONS = [ - ("US East, N. 
Virginia", "us-east-1"), - ("US East, Ohio", "us-east-2"), - ("US West, N. California", "us-west-1"), - ("US West, Oregon", "us-west-2"), - ("Asia Pacific, Singapore", "ap-southeast-1"), - ("Canada, Central", "ca-central-1"), - ("Europe, Frankfurt", "eu-central-1"), - ("Europe, Ireland", "eu-west-1"), - ("Europe, London", "eu-west-2"), - ("Europe, Paris", "eu-west-3"), - ("Europe, Stockholm", "eu-north-1"), -] -REGION_VALUES = [r[1] for r in REGIONS] -DEFAULT_REGIONS = REGION_VALUES -MAIN_REGION = "us-east-1" - - -class AWSConfigurator(Configurator): - TYPE: BackendType = BackendType.AWS - - def get_default_configs(self) -> List[AWSConfigInfoWithCreds]: - if not auth.default_creds_available(): - return [] - try: - auth.authenticate(creds=AWSDefaultCreds(), region=MAIN_REGION) - except BackendAuthError: - return [] - return [ - AWSConfigInfoWithCreds( - regions=DEFAULT_REGIONS, - creds=AWSDefaultCreds(), - ) - ] - - def get_config_values(self, config: AWSConfigInfoWithCredsPartial) -> AWSConfigValues: - config_values = AWSConfigValues(regions=None) - config_values.default_creds = ( - settings.DEFAULT_CREDS_ENABLED and auth.default_creds_available() - ) - if config.creds is None: - return config_values - if ( - is_core_model_instance(config.creds, AWSDefaultCreds) - and not settings.DEFAULT_CREDS_ENABLED - ): - raise_invalid_credentials_error(fields=[["creds"]]) - try: - session = auth.authenticate(creds=config.creds, region=MAIN_REGION) - except Exception: - if is_core_model_instance(config.creds, AWSAccessKeyCreds): - raise_invalid_credentials_error( - fields=[ - ["creds", "access_key"], - ["creds", "secret_key"], - ] - ) - else: - raise_invalid_credentials_error(fields=[["creds"]]) - config_values.regions = self._get_regions_element( - selected=config.regions or DEFAULT_REGIONS - ) - self._check_vpc_config( - session=session, - config=config, - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: AWSConfigInfoWithCreds - ) -> 
BackendModel: - if config.regions is None: - config.regions = DEFAULT_REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=AWSStoredConfig(**AWSConfigInfo.__response__.parse_obj(config).dict()).json(), - auth=AWSCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyAWSConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return AWSConfigInfoWithCreds.__response__.parse_obj(config) - return AWSConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> AWSBackend: - config = self._get_backend_config(model) - return AWSBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> AWSConfig: - return AWSConfig.__response__( - **json.loads(model.config), - creds=AWSCreds.parse_raw(model.auth).__root__, - ) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGION_VALUES: - element.values.append(ConfigElementValue(value=r, label=r)) - return element - - def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial): - allocate_public_ip = config.public_ips if config.public_ips is not None else True - use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True - if config.vpc_name is not None and config.vpc_ids is not None: - raise ServerClientError(msg="Only one of `vpc_name` and `vpc_ids` can be specified") - if not use_default_vpcs and config.vpc_name is None and config.vpc_ids is None: - raise ServerClientError( - msg="`vpc_name` or `vpc_ids` must be specified if `default_vpcs: false`." 
- ) - regions = config.regions - if regions is None: - regions = DEFAULT_REGIONS - if config.vpc_ids is not None and not use_default_vpcs: - vpc_ids_regions = list(config.vpc_ids.keys()) - not_configured_regions = [r for r in regions if r not in vpc_ids_regions] - if len(not_configured_regions) > 0: - if config.regions is None: - raise ServerClientError( - f"`vpc_ids` not configured for regions {not_configured_regions}. " - "Configure `vpc_ids` for all regions or specify `regions`." - ) - raise ServerClientError( - f"`vpc_ids` not configured for regions {not_configured_regions}. " - "Configure `vpc_ids` for all regions specified in `regions`." - ) - # The number of workers should be >= the number of regions - with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor: - futures = [] - for region in regions: - ec2_client = session.client("ec2", region_name=region) - future = executor.submit( - compute.get_vpc_id_subnet_id_or_error, - ec2_client=ec2_client, - config=AWSConfig.parse_obj(config), - region=region, - allocate_public_ip=allocate_public_ip, - ) - futures.append(future) - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except ComputeError as e: - raise ServerClientError(e.args[0]) diff --git a/src/dstack/_internal/server/services/backends/configurators/azure.py b/src/dstack/_internal/server/services/backends/configurators/azure.py deleted file mode 100644 index e52a85ad5c..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/azure.py +++ /dev/null @@ -1,418 +0,0 @@ -import json -from concurrent.futures import ThreadPoolExecutor -from typing import List, Optional, Tuple - -from azure.core.credentials import TokenCredential -from azure.mgmt import network as network_mgmt -from azure.mgmt import resource as resource_mgmt -from azure.mgmt import subscription as subscription_mgmt -from azure.mgmt.network.models import ( - AddressSpace, - NetworkSecurityGroup, - SecurityRule, - 
SecurityRuleAccess, - SecurityRuleDirection, - SecurityRuleProtocol, - Subnet, - VirtualNetwork, -) -from azure.mgmt.resource.resources.models import ResourceGroup - -from dstack._internal.core.backends.azure import AzureBackend, auth -from dstack._internal.core.backends.azure import utils as azure_utils -from dstack._internal.core.backends.azure.config import AzureConfig -from dstack._internal.core.errors import BackendAuthError, ServerClientError -from dstack._internal.core.models.backends.azure import ( - AnyAzureConfigInfo, - AzureClientCreds, - AzureConfigInfo, - AzureConfigInfoWithCreds, - AzureConfigInfoWithCredsPartial, - AzureConfigValues, - AzureCreds, - AzureDefaultCreds, - AzureStoredConfig, -) -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElement, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.common import is_core_model_instance -from dstack._internal.server import settings -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -LOCATIONS = [ - ("(US) Central US", "centralus"), - ("(US) East US, Virginia", "eastus"), - ("(US) East US 2, Virginia", "eastus2"), - ("(US) South Central US, Texas", "southcentralus"), - ("(US) West US 2, Washington", "westus2"), - ("(US) West US 3, Phoenix", "westus3"), - ("(Canada) Canada Central, Toronto", "canadacentral"), - ("(Europe) France Central, Paris", "francecentral"), - ("(Europe) Germany West Central, Frankfurt", "germanywestcentral"), - ("(Europe) North Europe, Ireland", "northeurope"), - ("(Europe) Sweden Central, Gävle", "swedencentral"), - ("(Europe) UK South, London", "uksouth"), - ("(Europe) West Europe", "westeurope"), - ("(Asia Pacific) Southeast Asia, Singapore", "southeastasia"), - ("(Asia Pacific) East Asia", "eastasia"), - ("(South America) Brazil South", "brazilsouth"), -] 
-LOCATION_VALUES = [loc[1] for loc in LOCATIONS] -DEFAULT_LOCATIONS = LOCATION_VALUES -MAIN_LOCATION = "eastus" - - -class AzureConfigurator(Configurator): - TYPE: BackendType = BackendType.AZURE - - def get_default_configs(self) -> List[AzureConfigInfoWithCreds]: - if not auth.default_creds_available(): - return [] - try: - credential, _ = auth.authenticate(AzureDefaultCreds()) - except BackendAuthError: - return [] - tenant_id_element = self._get_tenant_id_element(credential=credential) - tenant_ids = [v.value for v in tenant_id_element.values] - subscription_id_element = self._get_subscription_id_element(credential=credential) - subscription_ids = [v.value for v in subscription_id_element.values] - configs = [] - for tenant_id in tenant_ids: - for subscription_id in subscription_ids: - config = AzureConfigInfoWithCreds( - tenant_id=tenant_id, - subscription_id=subscription_id, - locations=DEFAULT_LOCATIONS, - creds=AzureDefaultCreds(), - ) - configs.append(config) - return configs - - def get_config_values(self, config: AzureConfigInfoWithCredsPartial) -> AzureConfigValues: - config_values = AzureConfigValues() - config_values.default_creds = ( - settings.DEFAULT_CREDS_ENABLED and auth.default_creds_available() - ) - if config.creds is None: - return config_values - if ( - is_core_model_instance(config.creds, AzureDefaultCreds) - and not settings.DEFAULT_CREDS_ENABLED - ): - raise_invalid_credentials_error(fields=[["creds"]]) - if is_core_model_instance(config.creds, AzureClientCreds): - self._set_client_creds_tenant_id(config.creds, config.tenant_id) - try: - credential, creds_tenant_id = auth.authenticate(config.creds) - except BackendAuthError: - if is_core_model_instance(config.creds, AzureClientCreds): - raise_invalid_credentials_error( - fields=[ - ["creds", "tenant_id"], - ["creds", "client_id"], - ["creds", "client_secret"], - ] - ) - else: - raise_invalid_credentials_error(fields=[["creds"]]) - config_values.tenant_id = self._get_tenant_id_element( - 
credential=credential, - selected=config.tenant_id or creds_tenant_id, - ) - if config_values.tenant_id.selected is None: - return config_values - config_values.subscription_id = self._get_subscription_id_element( - credential=credential, - selected=config.subscription_id, - ) - if config_values.subscription_id.selected is None: - return config_values - config_values.locations = self._get_locations_element( - selected=config.locations or DEFAULT_LOCATIONS - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: AzureConfigInfoWithCreds - ) -> BackendModel: - if config.locations is None: - config.locations = DEFAULT_LOCATIONS - if is_core_model_instance(config.creds, AzureClientCreds): - self._set_client_creds_tenant_id(config.creds, config.tenant_id) - credential, _ = auth.authenticate(config.creds) - resource_group = self._create_resource_group( - credential=credential, - subscription_id=config.subscription_id, - location=MAIN_LOCATION, - project_name=project.name, - ) - self._create_network_resources( - credential=credential, - subscription_id=config.subscription_id, - resource_group=resource_group, - locations=config.locations, - ) - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=AzureStoredConfig( - **AzureConfigInfo.__response__.parse_obj(config).dict(), - resource_group=resource_group, - ).json(), - auth=AzureCreds.parse_obj(config.creds).__root__.json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyAzureConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return AzureConfigInfoWithCreds.__response__.parse_obj(config) - return AzureConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> AzureBackend: - config = self._get_backend_config(model) - return AzureBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> AzureConfig: - return AzureConfig.__response__( - 
**json.loads(model.config), - creds=AzureCreds.parse_raw(model.auth).__root__, - ) - - def _set_client_creds_tenant_id( - self, - creds: AzureClientCreds, - tenant_id: Optional[str], - ): - if creds.tenant_id is not None: - return - if tenant_id is None: - raise_invalid_credentials_error( - fields=[ - ["creds", "tenant_id"], - ["tenant_id"], - ] - ) - creds.tenant_id = tenant_id - - def _get_tenant_id_element( - self, - credential: auth.AzureCredential, - selected: Optional[str] = None, - ) -> ConfigElement: - subscription_client = subscription_mgmt.SubscriptionClient(credential=credential) - element = ConfigElement(selected=selected) - tenant_ids = [] - for tenant in subscription_client.tenants.list(): - tenant_ids.append(tenant.tenant_id) - element.values.append( - ConfigElementValue(value=tenant.tenant_id, label=tenant.tenant_id) - ) - if selected is not None and selected not in tenant_ids: - raise ServerClientError( - "Invalid tenant_id", - fields=[["tenant_id"]], - ) - if len(tenant_ids) == 1: - element.selected = tenant_ids[0] - return element - - def _get_subscription_id_element( - self, - credential: auth.AzureCredential, - selected: Optional[str] = None, - ) -> ConfigElement: - subscription_client = subscription_mgmt.SubscriptionClient(credential=credential) - element = ConfigElement(selected=selected) - subscription_ids = [] - for subscription in subscription_client.subscriptions.list(): - subscription_ids.append(subscription.subscription_id) - element.values.append( - ConfigElementValue( - value=subscription.subscription_id, - label=f"{subscription.display_name} ({subscription.subscription_id})", - ) - ) - if selected is not None and selected not in subscription_ids: - raise ServerClientError( - "Invalid subscription_id", - fields=[["subscription_id"]], - ) - if len(subscription_ids) == 1: - element.selected = subscription_ids[0] - if len(subscription_ids) == 0: - # Credentials without granted roles don't see any subscriptions - raise ServerClientError( 
- msg="No Azure subscriptions found for provided credentials. Ensure the account has enough permissions.", - ) - return element - - def _get_locations_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement() - for loc in LOCATION_VALUES: - element.values.append(ConfigElementValue(value=loc, label=loc)) - element.selected = selected - return element - - def _create_resource_group( - self, - credential: auth.AzureCredential, - subscription_id: str, - location: str, - project_name: str, - ) -> str: - resource_manager = ResourceManager( - credential=credential, - subscription_id=subscription_id, - ) - return resource_manager.create_resource_group( - name=_get_resource_group_name(project_name), - location=location, - ) - - def _create_network_resources( - self, - credential: auth.AzureCredential, - subscription_id: str, - resource_group: str, - locations: List[str], - ): - def func(location: str): - network_manager = NetworkManager( - credential=credential, subscription_id=subscription_id - ) - network_manager.create_virtual_network( - resource_group=resource_group, - location=location, - name=azure_utils.get_default_network_name(resource_group, location), - subnet_name=azure_utils.get_default_subnet_name(resource_group, location), - ) - network_manager.create_network_security_group( - resource_group=resource_group, - location=location, - name=azure_utils.get_default_network_security_group_name(resource_group, location), - ) - network_manager.create_gateway_network_security_group( - resource_group=resource_group, - location=location, - name=azure_utils.get_gateway_network_security_group_name(resource_group, location), - ) - - with ThreadPoolExecutor(max_workers=8) as executor: - for location in locations: - executor.submit(func, location) - - -class ResourceManager: - def __init__(self, credential: TokenCredential, subscription_id: str): - self.resource_client = resource_mgmt.ResourceManagementClient( - credential=credential, 
subscription_id=subscription_id - ) - - def create_resource_group( - self, - name: str, - location: str, - ) -> str: - resource_group: ResourceGroup = self.resource_client.resource_groups.create_or_update( - resource_group_name=name, - parameters=ResourceGroup( - location=location, - ), - ) - return resource_group.name - - -def _get_resource_group_name(project_name: str) -> str: - return f"dstack-{project_name}" - - -class NetworkManager: - def __init__(self, credential: TokenCredential, subscription_id: str): - self.network_client = network_mgmt.NetworkManagementClient( - credential=credential, subscription_id=subscription_id - ) - - def create_virtual_network( - self, - resource_group: str, - name: str, - subnet_name: str, - location: str, - ) -> Tuple[str, str]: - network: VirtualNetwork = self.network_client.virtual_networks.begin_create_or_update( - resource_group_name=resource_group, - virtual_network_name=name, - parameters=VirtualNetwork( - location=location, - address_space=AddressSpace(address_prefixes=["10.0.0.0/16"]), - subnets=[ - Subnet( - name=subnet_name, - address_prefix="10.0.0.0/20", - ) - ], - ), - ).result() - return network.name, subnet_name - - def create_network_security_group( - self, - resource_group: str, - location: str, - name: str, - ): - self.network_client.network_security_groups.begin_create_or_update( - resource_group_name=resource_group, - network_security_group_name=name, - parameters=NetworkSecurityGroup( - location=location, - security_rules=[ - SecurityRule( - name="runner_ssh", - protocol=SecurityRuleProtocol.TCP, - source_address_prefix="Internet", - source_port_range="*", - destination_address_prefix="*", - destination_port_range="22", - access=SecurityRuleAccess.ALLOW, - priority=100, - direction=SecurityRuleDirection.INBOUND, - ), - ], - ), - ).result() - - def create_gateway_network_security_group( - self, - resource_group: str, - location: str, - name: str, - ): - 
self.network_client.network_security_groups.begin_create_or_update( - resource_group_name=resource_group, - network_security_group_name=name, - parameters=NetworkSecurityGroup( - location=location, - security_rules=[ - SecurityRule( - name="gateway_all", - protocol=SecurityRuleProtocol.TCP, - source_address_prefix="Internet", - source_port_range="*", - destination_address_prefix="*", - destination_port_ranges=["22", "80", "443"], - access=SecurityRuleAccess.ALLOW, - priority=101, - direction=SecurityRuleDirection.INBOUND, - ) - ], - ), - ).result() diff --git a/src/dstack/_internal/server/services/backends/configurators/base.py b/src/dstack/_internal/server/services/backends/configurators/base.py deleted file mode 100644 index 03667736d6..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/base.py +++ /dev/null @@ -1,68 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, List, Optional - -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.errors import BackendInvalidCredentialsError -from dstack._internal.core.models.backends import ( - AnyConfigInfo, - AnyConfigInfoWithCreds, - AnyConfigInfoWithCredsPartial, - AnyConfigValues, -) -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.server.models import BackendModel, ProjectModel - - -class Configurator(ABC): - TYPE: BackendType - - def get_default_configs(self) -> List[AnyConfigInfoWithCreds]: - """ - Tries to detect backend creds on the machine and - automatically construct backend configs from the creds. - """ - return [] - - @abstractmethod - def get_config_values(self, config: AnyConfigInfoWithCredsPartial) -> AnyConfigValues: - """ - Validates backend config and returns possible values for unfilled config parameters. 
- """ - pass - - @abstractmethod - def create_backend( - self, project: ProjectModel, config: AnyConfigInfoWithCreds - ) -> BackendModel: - """ - Creates BackendModel given backend config and creds. - It may perform backend initialization, create - cloud resources such as networks and managed identites, and - save additional configuration parameters. - """ - pass - - @abstractmethod - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyConfigInfo: - """ - Constructs backend's ConfigInfo to be returned in API responses. - Project admins may need to see backend's creds. In this case `include_creds` will be True. - Otherwise, no sensitive information should be included. - """ - pass - - @abstractmethod - def get_backend(self, model: BackendModel) -> Backend: - """ - Returns Backend instance from config and creds stored in `model`. - """ - pass - - -def raise_invalid_credentials_error( - fields: Optional[List[List[str]]] = None, details: Optional[Any] = None -): - msg = BackendInvalidCredentialsError.msg - if details: - msg += f": {details}" - raise BackendInvalidCredentialsError(fields=fields, msg=msg) diff --git a/src/dstack/_internal/server/services/backends/configurators/cudo.py b/src/dstack/_internal/server/services/backends/configurators/cudo.py deleted file mode 100644 index 9715441688..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/cudo.py +++ /dev/null @@ -1,87 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.cudo import CudoBackend, CudoConfig, api_client -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.cudo import ( - CudoConfigInfo, - CudoConfigInfoWithCreds, - CudoConfigInfoWithCredsPartial, - CudoConfigValues, - CudoCreds, - CudoStoredConfig, -) -from dstack._internal.server.models import 
BackendModel, ProjectModel -from dstack._internal.server.services.backends import Configurator -from dstack._internal.server.services.backends.configurators.base import ( - raise_invalid_credentials_error, -) - -REGIONS = [ - "no-luster-1", - "se-smedjebacken-1", - "gb-london-1", - "se-stockholm-1", - "us-newyork-1", - "us-santaclara-1", -] - -DEFAULT_REGION = "no-luster-1" - - -class CudoConfigurator(Configurator): - TYPE: BackendType = BackendType.CUDO - - def get_config_values(self, config: CudoConfigInfoWithCredsPartial) -> CudoConfigValues: - config_values = CudoConfigValues() - if config.creds is None: - return config_values - self._validate_cudo_api_key(config.creds.api_key) - config_values.regions = self._get_regions_element( - selected=config.regions or [DEFAULT_REGION] - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: CudoConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=CudoStoredConfig(**CudoConfigInfo.__response__.parse_obj(config).dict()).json(), - auth=CudoCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> CudoConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return CudoConfigInfoWithCreds.__response__.parse_obj(config) - return CudoConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> Backend: - config = self._get_backend_config(model) - return CudoBackend(config=config) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element - - def _get_backend_config(self, model: BackendModel) -> CudoConfig: - return CudoConfig.__response__( - **json.loads(model.config), - 
creds=CudoCreds.parse_raw(model.auth), - ) - - def _validate_cudo_api_key(self, api_key: str): - client = api_client.CudoApiClient(api_key=api_key) - if not client.validate_api_key(): - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/server/services/backends/configurators/datacrunch.py b/src/dstack/_internal/server/services/backends/configurators/datacrunch.py deleted file mode 100644 index dd03e0fab0..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/datacrunch.py +++ /dev/null @@ -1,79 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.datacrunch import DataCrunchBackend -from dstack._internal.core.backends.datacrunch.config import DataCrunchConfig -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.datacrunch import ( - AnyDataCrunchConfigInfo, - DataCrunchConfigInfo, - DataCrunchConfigInfoWithCreds, - DataCrunchConfigInfoWithCredsPartial, - DataCrunchConfigValues, - DataCrunchCreds, - DataCrunchStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import Configurator - -REGIONS = [ - "FIN-01", - "ICE-01", -] - -DEFAULT_REGION = "FIN-01" - - -class DataCrunchConfigurator(Configurator): - TYPE: BackendType = BackendType.DATACRUNCH - - def get_config_values( - self, config: DataCrunchConfigInfoWithCredsPartial - ) -> DataCrunchConfigValues: - config_values = DataCrunchConfigValues() - if config.creds is None: - return config_values - config_values.regions = self._get_regions_element( - selected=config.regions or [DEFAULT_REGION] - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: DataCrunchConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - return BackendModel( - 
project_id=project.id, - type=self.TYPE.value, - config=DataCrunchStoredConfig( - **DataCrunchConfigInfo.__response__.parse_obj(config).dict() - ).json(), - auth=DataCrunchCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyDataCrunchConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return DataCrunchConfigInfoWithCreds.__response__.parse_obj(config) - return DataCrunchConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> DataCrunchBackend: - config = self._get_backend_config(model) - return DataCrunchBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> DataCrunchConfig: - return DataCrunchConfig.__response__( - **json.loads(model.config), - creds=DataCrunchCreds.parse_raw(model.auth), - ) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element diff --git a/src/dstack/_internal/server/services/backends/configurators/gcp.py b/src/dstack/_internal/server/services/backends/configurators/gcp.py deleted file mode 100644 index 285b5497d9..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/gcp.py +++ /dev/null @@ -1,245 +0,0 @@ -import json -from typing import List - -import google.cloud.compute_v1 as compute_v1 - -from dstack._internal.core.backends.gcp import GCPBackend, auth, resources -from dstack._internal.core.backends.gcp.config import GCPConfig -from dstack._internal.core.errors import BackendAuthError, ComputeError, ServerClientError -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElement, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.gcp import ( - AnyGCPConfigInfo, - GCPConfigInfo, - GCPConfigInfoWithCreds, - 
GCPConfigInfoWithCredsPartial, - GCPConfigValues, - GCPCreds, - GCPDefaultCreds, - GCPServiceAccountCreds, - GCPStoredConfig, -) -from dstack._internal.core.models.common import is_core_model_instance -from dstack._internal.server import settings -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -LOCATIONS = [ - { - "name": "North America", - "regions": [ - "northamerica-northeast1", - "northamerica-northeast2", - "us-central1", - "us-east1", - "us-east4", - "us-east5", - "us-south1", - "us-west1", - "us-west2", - "us-west3", - "us-west4", - ], - "default_region": "us-west1", - "default_zone": "us-west1-b", - }, - { - "name": "South America", - "regions": [ - "southamerica-east1", - "southamerica-west1", - ], - "default_region": "southamerica-east1", - "default_zone": "southamerica-east1-b", - }, - { - "name": "Europe", - "regions": [ - "europe-central2", - "europe-north1", - "europe-southwest1", - "europe-west1", - "europe-west2", - "europe-west3", - "europe-west4", - "europe-west6", - "europe-west8", - "europe-west9", - ], - "default_region": "europe-west4", - "default_zone": "europe-west4-a", - }, - { - "name": "Asia", - "regions": [ - "asia-east1", - "asia-east2", - "asia-northeast1", - "asia-northeast2", - "asia-northeast3", - "asia-south1", - "asia-south2", - "asia-southeast1", - "asia-southeast2", - ], - "default_region": "asia-southeast1", - "default_zone": "asia-southeast1-b", - }, - { - "name": "Middle East", - "regions": [ - "me-west1", - ], - "default_region": "me-west1", - "default_zone": "me-west1-b", - }, - { - "name": "Australia", - "regions": [ - "australia-southeast1", - "australia-southeast2", - ], - "default_region": "australia-southeast1", - "default_zone": "australia-southeast1-c", - }, -] -REGIONS = [r for loc in LOCATIONS for r in loc["regions"]] -DEFAULT_REGIONS = REGIONS -MAIN_REGION = 
"us-east1" - - -class GCPConfigurator(Configurator): - TYPE: BackendType = BackendType.GCP - - def get_default_configs(self) -> List[GCPConfigInfoWithCreds]: - if not auth.default_creds_available(): - return [] - try: - _, project_id = auth.authenticate(GCPDefaultCreds()) - except BackendAuthError: - return [] - - if project_id is None: - return [] - - return [ - GCPConfigInfoWithCreds( - project_id=project_id, - regions=DEFAULT_REGIONS, - creds=GCPDefaultCreds(), - ) - ] - - def get_config_values(self, config: GCPConfigInfoWithCredsPartial) -> GCPConfigValues: - config_values = GCPConfigValues(project_id=None, regions=None) - config_values.default_creds = ( - settings.DEFAULT_CREDS_ENABLED and auth.default_creds_available() - ) - if config.creds is None: - return config_values - if ( - is_core_model_instance(config.creds, GCPDefaultCreds) - and not settings.DEFAULT_CREDS_ENABLED - ): - raise_invalid_credentials_error(fields=[["creds"]]) - try: - credentials, project_id = auth.authenticate(creds=config.creds) - except BackendAuthError: - if is_core_model_instance(config.creds, GCPServiceAccountCreds): - raise_invalid_credentials_error(fields=[["creds", "data"]]) - else: - raise_invalid_credentials_error(fields=[["creds"]]) - if ( - project_id is not None - and config.project_id is not None - and config.project_id != project_id - ): - raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]]) - config_values.project_id = self._get_project_id_element(selected=project_id) - config_values.regions = self._get_regions_element( - selected=config.regions or DEFAULT_REGIONS - ) - if config.project_id is None: - return config_values - network_client = compute_v1.NetworksClient(credentials=credentials) - routers_client = compute_v1.RoutersClient(credentials=credentials) - self._check_vpc_config( - network_client=network_client, - routers_client=routers_client, - config=config, - ) - return config_values - - def create_backend( - self, project: ProjectModel, 
config: GCPConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = DEFAULT_REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=GCPStoredConfig( - **GCPConfigInfo.__response__.parse_obj(config).dict(), - ).json(), - auth=GCPCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyGCPConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return GCPConfigInfoWithCreds.__response__.parse_obj(config) - return GCPConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> GCPBackend: - config = self._get_backend_config(model) - return GCPBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> GCPConfig: - return GCPConfig.__response__( - **json.loads(model.config), - creds=GCPCreds.parse_raw(model.auth).__root__, - ) - - def _get_project_id_element( - self, - selected: str, - ) -> ConfigElement: - element = ConfigElement(selected=selected) - element.values.append(ConfigElementValue(value=selected, label=selected)) - return element - - def _get_regions_element( - self, - selected: List[str], - ) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for region_name in REGIONS: - element.values.append(ConfigElementValue(value=region_name, label=region_name)) - return element - - def _check_vpc_config( - self, - network_client: compute_v1.NetworksClient, - routers_client: compute_v1.RoutersClient, - config: GCPConfigInfoWithCredsPartial, - ): - allocate_public_ip = config.public_ips if config.public_ips is not None else True - try: - resources.check_vpc( - network_client=network_client, - routers_client=routers_client, - project_id=config.project_id, - regions=config.regions or DEFAULT_REGIONS, - vpc_name=config.vpc_name, - shared_vpc_project_id=config.vpc_project_id, - allocate_public_ip=allocate_public_ip, - ) - except ComputeError as e: - 
raise ServerClientError(e.args[0]) diff --git a/src/dstack/_internal/server/services/backends/configurators/kubernetes.py b/src/dstack/_internal/server/services/backends/configurators/kubernetes.py deleted file mode 100644 index b8b04477ad..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/kubernetes.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import List - -from dstack._internal.core.backends.kubernetes import KubernetesBackend -from dstack._internal.core.backends.kubernetes.config import KubernetesConfig -from dstack._internal.core.backends.kubernetes.utils import get_api_from_config_data -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.backends.kubernetes import ( - AnyKubernetesConfigInfo, - KubernetesConfigInfo, - KubernetesConfigInfoWithCreds, - KubernetesConfigInfoWithCredsPartial, - KubernetesConfigValues, - KubernetesStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class KubernetesConfigurator(Configurator): - TYPE: BackendType = BackendType.KUBERNETES - - def get_default_configs(self) -> List[KubernetesConfigInfoWithCreds]: - # TODO: automatically pick up kubernetes config - return [] - - def get_config_values( - self, config: KubernetesConfigInfoWithCredsPartial - ) -> KubernetesConfigValues: - try: - api = get_api_from_config_data(config.kubeconfig.data) - api.list_node() - except Exception as e: - logger.debug("Invalid kubeconfig: %s", str(e)) - raise_invalid_credentials_error(fields=[["kubeconfig"]]) - return KubernetesConfigValues() - - def create_backend( - self, project: ProjectModel, config: KubernetesConfigInfoWithCreds - ) -> BackendModel: - return BackendModel( - project_id=project.id, - 
type=self.TYPE.value, - config=KubernetesStoredConfig.__response__.parse_obj(config).json(), - auth="", - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyKubernetesConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return KubernetesConfigInfoWithCreds.__response__.parse_obj(config) - return KubernetesConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> KubernetesBackend: - return KubernetesBackend(self._get_backend_config(model)) - - def _get_backend_config(self, model: BackendModel) -> KubernetesConfig: - return KubernetesConfig.__response__.parse_raw(model.config) diff --git a/src/dstack/_internal/server/services/backends/configurators/lambdalabs.py b/src/dstack/_internal/server/services/backends/configurators/lambdalabs.py deleted file mode 100644 index e21ca1a06d..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/lambdalabs.py +++ /dev/null @@ -1,96 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.lambdalabs import LambdaBackend, api_client -from dstack._internal.core.backends.lambdalabs.config import LambdaConfig -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.lambdalabs import ( - AnyLambdaConfigInfo, - LambdaConfigInfo, - LambdaConfigInfoWithCreds, - LambdaConfigInfoWithCredsPartial, - LambdaConfigValues, - LambdaCreds, - LambdaStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -REGIONS = [ - "us-south-1", - "us-west-2", - "us-west-1", - "us-midwest-1", - "us-west-3", - "us-east-1", - "australia-southeast-1", - "europe-central-1", - "asia-south-1", - "me-west-1", - "europe-south-1", - "asia-northeast-1", -] - 
-DEFAULT_REGION = "us-east-1" - - -class LambdaConfigurator(Configurator): - TYPE: BackendType = BackendType.LAMBDA - - def get_config_values(self, config: LambdaConfigInfoWithCredsPartial) -> LambdaConfigValues: - config_values = LambdaConfigValues() - if config.creds is None: - return config_values - self._validate_lambda_api_key(config.creds.api_key) - config_values.regions = self._get_regions_element( - selected=config.regions or [DEFAULT_REGION] - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: LambdaConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=LambdaStoredConfig( - **LambdaConfigInfo.__response__.parse_obj(config).dict() - ).json(), - auth=LambdaCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyLambdaConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return LambdaConfigInfoWithCreds.__response__.parse_obj(config) - return LambdaConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> LambdaBackend: - config = self._get_backend_config(model) - return LambdaBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> LambdaConfig: - return LambdaConfig.__response__( - **json.loads(model.config), - creds=LambdaCreds.parse_raw(model.auth), - ) - - def _validate_lambda_api_key(self, api_key: str): - client = api_client.LambdaAPIClient(api_key=api_key) - if not client.validate_api_key(): - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element diff --git 
a/src/dstack/_internal/server/services/backends/configurators/nebius.py b/src/dstack/_internal/server/services/backends/configurators/nebius.py deleted file mode 100644 index a01efc052e..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/nebius.py +++ /dev/null @@ -1,85 +0,0 @@ -import json -from typing import List - -import requests - -import dstack._internal.core.backends.nebius.api_client as api_client -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.nebius import NebiusBackend -from dstack._internal.core.backends.nebius.config import NebiusConfig -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.nebius import ( - NebiusConfigInfo, - NebiusConfigInfoWithCreds, - NebiusConfigInfoWithCredsPartial, - NebiusConfigValues, - NebiusCreds, - NebiusStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends import Configurator -from dstack._internal.server.services.backends.configurators.base import ( - raise_invalid_credentials_error, -) - -REGIONS = ["eu-north1-c"] - - -class NebiusConfigurator(Configurator): - TYPE: BackendType = BackendType.NEBIUS - - def get_config_values(self, config: NebiusConfigInfoWithCredsPartial) -> NebiusConfigValues: - config_values = NebiusConfigValues() - if config.creds is None: - return config_values - self._validate_nebius_creds(config.creds) - # TODO(egor-s) cloud_id - # TODO(egor-s) folder_id - # TODO(egor-s) network_id - config_values.regions = self._get_regions_element(selected=config.regions or []) - return config_values - - def create_backend( - self, project: ProjectModel, config: NebiusConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - self._validate_nebius_creds(config.creds) - return BackendModel( - project_id=project.id, - 
type=self.TYPE.value, - config=NebiusStoredConfig.__response__.parse_obj(config).json(), - auth=NebiusCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> NebiusConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return NebiusConfigInfoWithCreds.__response__.parse_obj(config) - return NebiusConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> Backend: - config = self._get_backend_config(model) - return NebiusBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> NebiusConfig: - return NebiusConfig.__response__( - **json.loads(model.config), - creds=NebiusCreds.parse_raw(model.auth), - ) - - def _validate_nebius_creds(self, creds: NebiusCreds): - try: - api_client.NebiusAPIClient(json.loads(creds.data)).get_token() - except requests.HTTPError: - raise_invalid_credentials_error(fields=[["creds", "data"]]) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element diff --git a/src/dstack/_internal/server/services/backends/configurators/oci.py b/src/dstack/_internal/server/services/backends/configurators/oci.py deleted file mode 100644 index 03335b97e6..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/oci.py +++ /dev/null @@ -1,198 +0,0 @@ -import json -from typing import Dict, Iterable, List, Set, Tuple - -from dstack._internal.core.backends.oci import OCIBackend, auth, resources -from dstack._internal.core.backends.oci.config import OCIConfig -from dstack._internal.core.backends.oci.exceptions import any_oci_exception -from dstack._internal.core.backends.oci.region import ( - get_subscribed_regions, - make_region_client, - make_region_clients_map, -) -from dstack._internal.core.errors import ServerClientError -from 
dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.oci import ( - AnyOCIConfigInfo, - OCIConfigInfo, - OCIConfigInfoWithCreds, - OCIConfigInfoWithCredsPartial, - OCIConfigValues, - OCICreds, - OCIDefaultCreds, - OCIStoredConfig, -) -from dstack._internal.core.models.common import is_core_model_instance -from dstack._internal.server import settings -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -# where dstack images are published -SUPPORTED_REGIONS = frozenset( - [ - "eu-frankfurt-1", - "eu-milan-1", - "me-dubai-1", - "uk-london-1", - "us-ashburn-1", - "us-chicago-1", - "us-phoenix-1", - ] -) - - -class OCIConfigurator(Configurator): - TYPE: BackendType = BackendType.OCI - - def get_default_configs(self) -> List[OCIConfigInfoWithCreds]: - creds = OCIDefaultCreds() - try: - subscribed_regions = get_subscribed_regions(creds).names - except any_oci_exception: - return [] - return [ - OCIConfigInfoWithCreds( - regions=list(subscribed_regions & SUPPORTED_REGIONS), - creds=creds, - ) - ] - - def get_config_values(self, config: OCIConfigInfoWithCredsPartial) -> OCIConfigValues: - config_values = OCIConfigValues(regions=None) - config_values.default_creds = ( - settings.DEFAULT_CREDS_ENABLED and auth.default_creds_available() - ) - if config.creds is None: - return config_values - if ( - is_core_model_instance(config.creds, OCIDefaultCreds) - and not settings.DEFAULT_CREDS_ENABLED - ): - raise_invalid_credentials_error( - fields=[["creds"]], - details="Default credentials are forbidden by dstack settings", - ) - - try: - available_regions = get_subscribed_regions(config.creds).names & SUPPORTED_REGIONS - except any_oci_exception as e: - raise_invalid_credentials_error(fields=[["creds"]], details=e) - - if 
config.regions: - selected_regions = [r for r in config.regions if r in available_regions] - else: - selected_regions = list(available_regions) - - config_values.regions = self._get_regions_element( - available=available_regions, - selected=selected_regions, - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: OCIConfigInfoWithCreds - ) -> BackendModel: - try: - subscribed_regions = get_subscribed_regions(config.creds) - except any_oci_exception as e: - raise_invalid_credentials_error(fields=[["creds"]], details=e) - - if config.regions is None: - config.regions = _filter_supported_regions(subscribed_regions.names) - else: - _raise_if_regions_unavailable(config.regions, subscribed_regions.names) - - compartment_id, subnet_ids_per_region = _create_resources( - project, config, subscribed_regions.home_region_name - ) - config.compartment_id = compartment_id - stored_config = OCIStoredConfig.__response__( - **config.dict(), subnet_ids_per_region=subnet_ids_per_region - ) - - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=stored_config.json(), - auth=OCICreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyOCIConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return OCIConfigInfoWithCreds.__response__.parse_obj(config) - return OCIConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> OCIBackend: - config = self._get_backend_config(model) - return OCIBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> OCIConfig: - return OCIConfig.__response__( - **json.loads(model.config), - creds=OCICreds.parse_raw(model.auth).__root__, - ) - - def _get_regions_element( - self, available: Iterable[str], selected: List[str] - ) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for region in available: - 
element.values.append(ConfigElementValue(value=region, label=region)) - return element - - -def _filter_supported_regions(subscribed_region_names: Set[str]) -> List[str]: - available_regions = subscribed_region_names & SUPPORTED_REGIONS - if not available_regions: - msg = ( - f"None of your subscribed regions {subscribed_region_names} are supported " - "by dstack yet. Please subscribe to a supported region in OCI Console or " - "contact dstack if you need a specific region to become supported. " - f"Currently supported regions are: {set(SUPPORTED_REGIONS)}" - ) - raise ServerClientError(msg) - return list(available_regions) - - -def _raise_if_regions_unavailable( - region_names: Iterable[str], subscribed_region_names: Set[str] -) -> None: - region_names = set(region_names) - if unsupported_regions := region_names - SUPPORTED_REGIONS: - msg = ( - f"Regions {unsupported_regions} are configured but not supported by dstack yet. " - f"Only these regions are supported: {set(SUPPORTED_REGIONS)}. " - "Please contact dstack if a region you need is missing." 
- ) - raise ServerClientError(msg, fields=[["regions"]]) - if unsubscribed_regions := region_names - subscribed_region_names: - msg = f"Regions {unsubscribed_regions} are configured but not subscribed to in OCI" - raise ServerClientError(msg, fields=[["regions"]]) - - -def _create_resources( - project: ProjectModel, config: OCIConfigInfoWithCreds, home_region: str -) -> Tuple[str, Dict[str, str]]: - compartment_id = config.compartment_id - if not compartment_id: - home_region_client = make_region_client(home_region, config.creds) - compartment_id = resources.get_or_create_compartment( - f"dstack-{project.name}", - home_region_client.client_config["tenancy"], - home_region_client.identity_client, - ).id - - region_clients = make_region_clients_map(config.regions, config.creds) - resources.wait_until_compartment_active(compartment_id, region_clients) - subnets_per_region = resources.set_up_network_resources( - compartment_id, project.name, region_clients - ) - - return compartment_id, subnets_per_region diff --git a/src/dstack/_internal/server/services/backends/configurators/runpod.py b/src/dstack/_internal/server/services/backends/configurators/runpod.py deleted file mode 100644 index f0482559be..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/runpod.py +++ /dev/null @@ -1,91 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.backends.runpod import RunpodBackend, RunpodConfig, api_client -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.runpod import ( - RunpodConfigInfo, - RunpodConfigInfoWithCreds, - RunpodConfigInfoWithCredsPartial, - RunpodConfigValues, - RunpodCreds, - RunpodStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends import Configurator -from 
dstack._internal.server.services.backends.configurators.base import ( - raise_invalid_credentials_error, -) - -REGIONS = [ - "CA-MTL-1", - "EU-NL-1", - "EU-RO-1", - "EU-SE-1", - "EUR-IS-1", - "EUR-IS-2", - "EUR-NO-1", - "US-OR-1", -] - -DEFAULT_REGION = "CA-MTL-1" - - -class RunpodConfigurator(Configurator): - TYPE: BackendType = BackendType.RUNPOD - - def get_config_values(self, config: RunpodConfigInfoWithCredsPartial) -> RunpodConfigValues: - config_values = RunpodConfigValues() - if config.creds is None: - return config_values - self._validate_runpod_api_key(config.creds.api_key) - config_values.regions = self._get_regions_element( - selected=config.regions or [DEFAULT_REGION] - ) - return config_values - - def create_backend( - self, project: ProjectModel, config: RunpodConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=RunpodStoredConfig( - **RunpodConfigInfo.__response__.parse_obj(config).dict() - ).json(), - auth=RunpodCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> RunpodConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return RunpodConfigInfoWithCreds.__response__.parse_obj(config) - return RunpodConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> Backend: - config = self._get_backend_config(model) - return RunpodBackend(config=config) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element - - def _get_backend_config(self, model: BackendModel) -> RunpodConfig: - return RunpodConfig( - **json.loads(model.config), - creds=RunpodCreds.parse_raw(model.auth), - ) - - def _validate_runpod_api_key(self, api_key: str): - client = 
api_client.RunpodApiClient(api_key=api_key) - if not client.validate_api_key(): - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/server/services/backends/configurators/tensordock.py b/src/dstack/_internal/server/services/backends/configurators/tensordock.py deleted file mode 100644 index c6616918ac..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/tensordock.py +++ /dev/null @@ -1,82 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.tensordock import TensorDockBackend, api_client -from dstack._internal.core.backends.tensordock.config import TensorDockConfig -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - ConfigMultiElement, -) -from dstack._internal.core.models.backends.tensordock import ( - AnyTensorDockConfigInfo, - TensorDockConfigInfo, - TensorDockConfigInfoWithCreds, - TensorDockConfigInfoWithCredsPartial, - TensorDockConfigValues, - TensorDockCreds, - TensorDockStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -# TensorDock regions are dynamic, currently we don't offer any filtering -REGIONS = [] - - -class TensorDockConfigurator(Configurator): - TYPE: BackendType = BackendType.TENSORDOCK - - def get_config_values( - self, config: TensorDockConfigInfoWithCredsPartial - ) -> TensorDockConfigValues: - config_values = TensorDockConfigValues() - if config.creds is None: - return config_values - self._validate_tensordock_creds(config.creds.api_key, config.creds.api_token) - config_values.regions = self._get_regions_element(selected=config.regions or []) - return config_values - - def create_backend( - self, project: ProjectModel, config: TensorDockConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = 
REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=TensorDockStoredConfig( - **TensorDockConfigInfo.__response__.parse_obj(config).dict() - ).json(), - auth=TensorDockCreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyTensorDockConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return TensorDockConfigInfoWithCreds.__response__.parse_obj(config) - return TensorDockConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> TensorDockBackend: - config = self._get_backend_config(model) - return TensorDockBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> TensorDockConfig: - return TensorDockConfig.__response__( - **json.loads(model.config), - creds=TensorDockCreds.parse_raw(model.auth), - ) - - def _validate_tensordock_creds(self, api_key: str, api_token: str): - client = api_client.TensorDockAPIClient(api_key=api_key, api_token=api_token) - if not client.auth_test(): - raise_invalid_credentials_error(fields=[["creds", "api_key"], ["creds", "api_token"]]) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element diff --git a/src/dstack/_internal/server/services/backends/configurators/vastai.py b/src/dstack/_internal/server/services/backends/configurators/vastai.py deleted file mode 100644 index 072e39b122..0000000000 --- a/src/dstack/_internal/server/services/backends/configurators/vastai.py +++ /dev/null @@ -1,80 +0,0 @@ -import json -from typing import List - -from dstack._internal.core.backends.vastai import VastAIBackend, api_client -from dstack._internal.core.backends.vastai.config import VastAIConfig -from dstack._internal.core.models.backends.base import ( - BackendType, - ConfigElementValue, - 
ConfigMultiElement, -) -from dstack._internal.core.models.backends.vastai import ( - AnyVastAIConfigInfo, - VastAIConfigInfo, - VastAIConfigInfoWithCreds, - VastAIConfigInfoWithCredsPartial, - VastAIConfigValues, - VastAICreds, - VastAIStoredConfig, -) -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.backends.configurators.base import ( - Configurator, - raise_invalid_credentials_error, -) - -# VastAI regions are dynamic, currently we don't offer any filtering -REGIONS = [] - - -class VastAIConfigurator(Configurator): - TYPE: BackendType = BackendType.VASTAI - - def get_config_values(self, config: VastAIConfigInfoWithCredsPartial) -> VastAIConfigValues: - config_values = VastAIConfigValues() - if config.creds is None: - return config_values - self._validate_vastai_creds(config.creds.api_key) - config_values.regions = self._get_regions_element(selected=config.regions or []) - return config_values - - def create_backend( - self, project: ProjectModel, config: VastAIConfigInfoWithCreds - ) -> BackendModel: - if config.regions is None: - config.regions = REGIONS - return BackendModel( - project_id=project.id, - type=self.TYPE.value, - config=VastAIStoredConfig( - **VastAIConfigInfo.__response__.parse_obj(config).dict() - ).json(), - auth=VastAICreds.parse_obj(config.creds).json(), - ) - - def get_config_info(self, model: BackendModel, include_creds: bool) -> AnyVastAIConfigInfo: - config = self._get_backend_config(model) - if include_creds: - return VastAIConfigInfoWithCreds.__response__.parse_obj(config) - return VastAIConfigInfo.__response__.parse_obj(config) - - def get_backend(self, model: BackendModel) -> VastAIBackend: - config = self._get_backend_config(model) - return VastAIBackend(config=config) - - def _get_backend_config(self, model: BackendModel) -> VastAIConfig: - return VastAIConfig.__response__( - **json.loads(model.config), - creds=VastAICreds.parse_raw(model.auth), - ) - - def 
_validate_vastai_creds(self, api_key: str): - client = api_client.VastAIAPIClient(api_key=api_key) - if not client.auth_test(): - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) - - def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: - element = ConfigMultiElement(selected=selected) - for r in REGIONS: - element.values.append(ConfigElementValue(value=r, label=r)) - return element diff --git a/src/dstack/_internal/server/services/backends/handlers.py b/src/dstack/_internal/server/services/backends/handlers.py new file mode 100644 index 0000000000..f3f5bab68c --- /dev/null +++ b/src/dstack/_internal/server/services/backends/handlers.py @@ -0,0 +1,105 @@ +from typing import List + +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.services.backends import delete_backends +from dstack._internal.server.services.fleets import list_project_fleet_models +from dstack._internal.server.services.volumes import list_project_volumes +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def delete_backends_safe( + session: AsyncSession, + project: ProjectModel, + backends_types: List[BackendType], + error: bool = True, +): + try: + # FIXME: The checks are not under lock, + # so there can be dangling active resources due to race conditions. 
+ await _check_active_instances( + session=session, + project=project, + backends_types=backends_types, + error=error, + ) + await _check_active_volumes( + session=session, + project=project, + backends_types=backends_types, + error=error, + ) + except ServerClientError as e: + if error: + raise + logger.warning("%s", e.msg) + await delete_backends( + session=session, + project=project, + backends_types=backends_types, + ) + + +async def _check_active_instances( + session: AsyncSession, + project: ProjectModel, + backends_types: List[BackendType], + error: bool, +): + fleet_models = await list_project_fleet_models( + session=session, + project=project, + ) + for fleet_model in fleet_models: + for instance in fleet_model.instances: + if ( + instance.status.is_active() + and instance.backend is not None + and instance.backend in backends_types + ): + if error: + msg = ( + f"Backend {instance.backend.value} has active instances." + " Delete instances before deleting the backend." + ) + else: + msg = ( + f"Backend {instance.backend.value} has active instances." + " The backend will be deleted but instances may be left hanging." + ) + raise ServerClientError(msg) + + +async def _check_active_volumes( + session: AsyncSession, + project: ProjectModel, + backends_types: List[BackendType], + error: bool, +): + volume_models = await list_project_volumes( + session=session, + project=project, + ) + for volume_model in volume_models: + if ( + volume_model.status.is_active() + and volume_model.provisioning_data is not None + and volume_model.provisioning_data.backend is not None + and volume_model.provisioning_data.backend in backends_types + ): + if error: + msg = ( + f"Backend {volume_model.provisioning_data.backend.value} has active volumes." + " Delete volumes before deleting the backend." + ) + else: + msg = ( + f"Backend {volume_model.provisioning_data.backend.value} has active volumes." + " The backend will be deleted but volumes may be left hanging." 
+ ) + raise ServerClientError(msg) diff --git a/src/dstack/_internal/server/services/backends/provisioning.py b/src/dstack/_internal/server/services/backends/provisioning.py new file mode 100644 index 0000000000..435cca53ac --- /dev/null +++ b/src/dstack/_internal/server/services/backends/provisioning.py @@ -0,0 +1,135 @@ +import re +from typing import Optional + +from dstack._internal import settings +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import RegistryAuth +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.core.models.volumes import InstanceMountPoint +from dstack._internal.server.schemas.runner import GPUDevice +from dstack._internal.server.services.docker import apply_server_docker_defaults +from dstack._internal.utils.docker import parse_image_name + +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types +_AWS_EFA_ENABLED_INSTANCE_TYPE_PATTERNS = [ + r"^p6-b300\.(48xlarge)$", + r"^p6-b200\.(48xlarge)$", + r"^p5\.(4xlarge|48xlarge)$", + r"^p5e\.(48xlarge)$", + r"^p5en\.(48xlarge)$", + r"^p4d\.(24xlarge)$", + r"^p4de\.(24xlarge)$", + r"^g7e\.(8xlarge|12xlarge|24xlarge|48xlarge)$", + r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^gr6\.8xlarge$", + r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$", + r"^p3dn\.(24xlarge)$", +] + + +def get_instance_specific_mounts( + backend_type: BackendType, + instance_type_name: str, +) -> list[InstanceMountPoint]: + if backend_type == BackendType.GCP: + if instance_type_name == "a3-megagpu-8g": + return [ + InstanceMountPoint( + instance_path="/dev/aperture_devices", + path="/dev/aperture_devices", + ), + InstanceMountPoint( + instance_path="/var/lib/tcpxo/lib64", + path="/var/lib/tcpxo/lib64", + ), + InstanceMountPoint( + 
instance_path="/var/lib/fastrak/lib64", + path="/var/lib/fastrak/lib64", + ), + ] + if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]: + return [ + InstanceMountPoint( + instance_path="/var/lib/nvidia/lib64", + path="/usr/local/nvidia/lib64", + ), + InstanceMountPoint( + instance_path="/var/lib/nvidia/bin", + path="/usr/local/nvidia/bin", + ), + InstanceMountPoint( + instance_path="/var/lib/tcpx/lib64", + path="/usr/local/tcpx/lib64", + ), + InstanceMountPoint( + instance_path="/run/tcpx", + path="/run/tcpx", + ), + ] + return [] + + +def get_instance_specific_gpu_devices( + backend_type: BackendType, + instance_type_name: str, +) -> list[GPUDevice]: + gpu_devices = [] + if backend_type == BackendType.GCP and instance_type_name in [ + "a3-edgegpu-8g", + "a3-highgpu-8g", + ]: + for i in range(8): + gpu_devices.append( + GPUDevice(path_on_host=f"/dev/nvidia{i}", path_in_container=f"/dev/nvidia{i}") + ) + gpu_devices.append( + GPUDevice(path_on_host="/dev/nvidia-uvm", path_in_container="/dev/nvidia-uvm") + ) + gpu_devices.append( + GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl") + ) + return gpu_devices + + +def resolve_provisioning_image( + image_name: str, + registry_auth: Optional[RegistryAuth], + job_provisioning_data: JobProvisioningData, +) -> tuple[str, Optional[RegistryAuth]]: + image_name, registry_auth = apply_server_docker_defaults(image_name, registry_auth) + if job_provisioning_data.backend == BackendType.AWS: + image_name = _patch_base_image_for_aws_efa( + image_name, + job_provisioning_data.instance_type.name, + ) + return image_name, registry_auth + + +def _patch_base_image_for_aws_efa( + image_name: str, + instance_type_name: str, +) -> str: + is_efa_enabled = any( + re.match(pattern, instance_type_name) + for pattern in _AWS_EFA_ENABLED_INSTANCE_TYPE_PATTERNS + ) + if not is_efa_enabled: + return image_name + + if parse_image_name(image_name).repo != settings.DSTACK_DOCKER_BASE_IMAGE: + return image_name + + if 
image_name.endswith(f"-base-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}"): + return ( + image_name[:-17] + + f"-devel-efa-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + if image_name.endswith(f"-devel-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}"): + return ( + image_name[:-18] + + f"-devel-efa-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + + return image_name diff --git a/src/dstack/_internal/server/services/compute_groups.py b/src/dstack/_internal/server/services/compute_groups.py new file mode 100644 index 0000000000..4d759e0d21 --- /dev/null +++ b/src/dstack/_internal/server/services/compute_groups.py @@ -0,0 +1,22 @@ +from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData +from dstack._internal.server.models import ComputeGroupModel + + +def compute_group_model_to_compute_group(compute_group_model: ComputeGroupModel) -> ComputeGroup: + provisioning_data = get_compute_group_provisioning_data(compute_group_model) + return ComputeGroup( + id=compute_group_model.id, + project_name=compute_group_model.project.name, + status=compute_group_model.status, + name=provisioning_data.compute_group_name, + created_at=compute_group_model.created_at, + provisioning_data=provisioning_data, + ) + + +def get_compute_group_provisioning_data( + compute_group_model: ComputeGroupModel, +) -> ComputeGroupProvisioningData: + return ComputeGroupProvisioningData.__response__.parse_raw( + compute_group_model.provisioning_data + ) diff --git a/src/dstack/_internal/server/services/config.py b/src/dstack/_internal/server/services/config.py index 8808897519..0bc201677e 100644 --- a/src/dstack/_internal/server/services/config.py +++ b/src/dstack/_internal/server/services/config.py @@ -1,33 +1,35 @@ -from pathlib import Path -from typing import Dict, List, Literal, Optional, Union +from typing import List, Optional import yaml -from pydantic import BaseModel, Field, ValidationError, 
root_validator +from pydantic import Field, ValidationError from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import Annotated +import dstack._internal.core.backends.configurators +from dstack._internal.core.backends.models import ( + AnyBackendConfigWithCreds, + AnyBackendFileConfigWithCreds, + BackendInfoYAML, +) from dstack._internal.core.errors import ( + BackendNotAvailable, ResourceNotExistsError, ServerClientError, ) -from dstack._internal.core.models.backends import AnyConfigInfoWithCreds, BackendInfoYAML -from dstack._internal.core.models.backends.aws import AnyAWSCreds -from dstack._internal.core.models.backends.azure import AnyAzureCreds from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.backends.cudo import AnyCudoCreds -from dstack._internal.core.models.backends.datacrunch import AnyDataCrunchCreds -from dstack._internal.core.models.backends.kubernetes import KubernetesNetworkingConfig -from dstack._internal.core.models.backends.lambdalabs import AnyLambdaCreds -from dstack._internal.core.models.backends.oci import AnyOCICreds -from dstack._internal.core.models.backends.runpod import AnyRunpodCreds -from dstack._internal.core.models.backends.tensordock import AnyTensorDockCreds -from dstack._internal.core.models.backends.vastai import AnyVastAICreds from dstack._internal.core.models.common import CoreModel from dstack._internal.server import settings from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import encryption as encryption_services from dstack._internal.server.services import projects as projects_services -from dstack._internal.server.utils.common import run_async +from dstack._internal.server.services.backends.handlers import delete_backends_safe +from dstack._internal.server.services.encryption import AnyEncryptionKeyConfig +from 
dstack._internal.server.services.permissions import ( + DefaultPermissions, + set_default_permissions, +) +from dstack._internal.server.services.plugins import load_plugins from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) @@ -36,8 +38,8 @@ # By default, PyYAML chooses the style of a collection depending on whether it has nested collections. # If a collection has nested collections, it will be assigned the block style. Otherwise it will have the flow style. # -# We want mapping to always be display in block-style but lists without nested objects in flow-style. -# So we define a custom representeter +# We want mapping to always be displayed in block-style but lists without nested objects in flow-style. +# So we define a custom representer. def seq_representer(dumper, sequence): @@ -48,370 +50,147 @@ def seq_representer(dumper, sequence): yaml.add_representer(list, seq_representer) -# Below we define pydantic models for configs allowed in server/config.yml and YAML-based API. -# There are some differences between the two, e.g. server/config.yml fills file-based -# credentials by looking for a file, while YAML-based API doesn't do this. -# So for some backends there are two sets of config models. - - -class AWSConfig(CoreModel): - type: Annotated[Literal["aws"], Field(description="The type of the backend")] = "aws" - regions: Annotated[Optional[List[str]], Field(description="The list of AWS regions")] = None - vpc_name: Annotated[ - Optional[str], - Field(description="The VPC name. All configured regions must have a VPC with this name"), - ] = None - vpc_ids: Annotated[ - Optional[Dict[str, str]], - Field( - description="The mapping from AWS regions to VPC IDs. If `default_vpcs: true`, omitted regions will use default VPCs" - ), - ] = None - default_vpcs: Annotated[ - Optional[bool], - Field( - description=( - "A flag to enable/disable using default VPCs in regions not configured by `vpc_ids`." 
- " Set to `false` if default VPCs should never be used." - " Defaults to `true`" - ) - ), - ] = None - public_ips: Annotated[ - Optional[bool], - Field( - description="A flag to enable/disable public IP assigning on instances. Defaults to `true`" - ), - ] = None - creds: AnyAWSCreds = Field(..., description="The credentials", discriminator="type") - - -class AzureConfig(CoreModel): - type: Annotated[Literal["azure"], Field(description="The type of the backend")] = "azure" - tenant_id: Annotated[str, Field(description="The tenant ID")] - subscription_id: Annotated[str, Field(description="The subscription ID")] - regions: Optional[List[str]] = None - creds: AnyAzureCreds = Field(..., description="The credentials", discriminator="type") - - -class CudoConfig(CoreModel): - type: Annotated[Literal["cudo"], Field(description="The type of backend")] = "cudo" - regions: Optional[List[str]] = None - project_id: Annotated[str, Field(description="The project ID")] - creds: Annotated[AnyCudoCreds, Field(description="The credentials")] - - -class DataCrunchConfig(CoreModel): - type: Annotated[Literal["datacrunch"], Field(description="The type of backend")] = "datacrunch" - regions: Optional[List[str]] = None - creds: Annotated[AnyDataCrunchCreds, Field(description="The credentials")] - - -class GCPServiceAccountCreds(CoreModel): - type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( - "service_account" - ) - filename: Annotated[str, Field(description="The path to the service account file")] - # If data is None, it is read from the file - data: Annotated[ - Optional[str], Field(description="The contents of the service account file") - ] = None - - @root_validator - def fill_data(cls, values): - return _fill_data(values) - - -class GCPServiceAccountAPICreds(CoreModel): - type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( - "service_account" - ) - filename: Annotated[ - Optional[str], 
Field(description="The path to the service account file") - ] = "" - data: Annotated[str, Field(description="The contents of the service account file")] - - -class GCPDefaultCreds(CoreModel): - type: Annotated[Literal["default"], Field(description="The type of credentials")] = "default" - - -AnyGCPCreds = Union[GCPServiceAccountCreds, GCPDefaultCreds] -AnyGCPAPICreds = Union[GCPServiceAccountAPICreds, GCPDefaultCreds] - - -class GCPConfig(CoreModel): - type: Annotated[Literal["gcp"], Field(description="The type of backend")] = "gcp" - project_id: Annotated[str, Field(description="The project ID")] - regions: Optional[List[str]] = None - vpc_name: Annotated[Optional[str], Field(description="The VPC name")] = None - vpc_project_id: Annotated[ - Optional[str], - Field(description="The shared VPC hosted project ID. Required for shared VPC only"), - ] = None - public_ips: Annotated[ - Optional[bool], - Field( - description="A flag to enable/disable public IP assigning on instances. Defaults to `true`" - ), - ] = None - creds: AnyGCPCreds = Field(..., description="The credentials", discriminator="type") +BackendFileConfigWithCreds = Annotated[ + AnyBackendFileConfigWithCreds, Field(..., discriminator="type") +] -class GCPAPIConfig(CoreModel): - type: Annotated[Literal["gcp"], Field(description="The type of backend")] = "gcp" - project_id: Annotated[str, Field(description="The project ID")] - regions: Optional[List[str]] = None - vpc_name: Annotated[Optional[str], Field(description="The VPC name")] = None - vpc_project_id: Annotated[ - Optional[str], - Field(description="The shared VPC hosted project ID. 
Required for shared VPC only"), +class ProjectConfig(CoreModel): + name: Annotated[str, Field(description="The name of the project")] + backends: Annotated[ + Optional[List[BackendFileConfigWithCreds]], Field(description="The list of backends") ] = None - creds: AnyGCPAPICreds = Field(..., description="The credentials", discriminator="type") - - -class KubeconfigConfig(CoreModel): - filename: Annotated[str, Field(description="The path to the kubeconfig file")] - data: Annotated[Optional[str], Field(description="The contents of the kubeconfig file")] = None - - @root_validator - def fill_data(cls, values): - return _fill_data(values) - -class KubeconfigAPIConfig(CoreModel): - filename: Annotated[str, Field(description="The path to the kubeconfig file")] = "" - data: Annotated[str, Field(description="The contents of the kubeconfig file")] +EncryptionKeyConfig = Annotated[AnyEncryptionKeyConfig, Field(..., discriminator="type")] -class KubernetesConfig(CoreModel): - type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes" - kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")] - networking: Annotated[ - Optional[KubernetesNetworkingConfig], Field(description="The networking configuration") - ] +class EncryptionConfig(CoreModel): + keys: Annotated[List[EncryptionKeyConfig], Field(description="The encryption keys")] -class KubernetesAPIConfig(CoreModel): - type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes" - kubeconfig: Annotated[KubeconfigAPIConfig, Field(description="The kubeconfig configuration")] - networking: Annotated[ - Optional[KubernetesNetworkingConfig], Field(description="The networking configuration") - ] - -class LambdaConfig(CoreModel): - type: Annotated[Literal["lambda"], Field(description="The type of backend")] = "lambda" - regions: Optional[List[str]] = None - creds: Annotated[AnyLambdaCreds, Field(description="The credentials")] - - 
-class NebiusServiceAccountCreds(CoreModel): - type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( - "service_account" - ) - filename: Annotated[str, Field(description="The path to the service account file")] - data: Annotated[ - Optional[str], Field(description="The contents of the service account file") +class ServerConfig(CoreModel): + projects: Annotated[List[ProjectConfig], Field(description="The list of projects")] + encryption: Annotated[ + Optional[EncryptionConfig], Field(description="The encryption config") ] = None - - @root_validator - def fill_data(cls, values): - return _fill_data(values) - - -class NebiusServiceAccountAPICreds(CoreModel): - type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = ( - "service_account" - ) - filename: Annotated[str, Field(description="The path to the service account file")] - data: Annotated[str, Field(description="The contents of the service account file")] - - -AnyNebiusCreds = NebiusServiceAccountCreds -AnyNebiusAPICreds = NebiusServiceAccountAPICreds - - -class NebiusConfig(CoreModel): - type: Literal["nebius"] = "nebius" - cloud_id: str - folder_id: str - network_id: str - regions: Optional[List[str]] = None - creds: AnyNebiusCreds - - -class NebiusAPIConfig(CoreModel): - type: Literal["nebius"] = "nebius" - cloud_id: str - folder_id: str - network_id: str - regions: Optional[List[str]] = None - creds: AnyNebiusAPICreds - - -class OCIConfig(CoreModel): - type: Annotated[Literal["oci"], Field(description="The type of backend")] = "oci" - creds: Annotated[AnyOCICreds, Field(description="The credentials", discriminator="type")] - regions: Annotated[ - Optional[List[str]], - Field( - description="List of region names for running `dstack` jobs. 
Omit to use all regions" - ), + default_permissions: Annotated[ + Optional[DefaultPermissions], Field(description="The default user permissions") ] = None - compartment_id: Annotated[ - Optional[str], - Field( - description=( - "Compartment where `dstack` will create all resources. " - "Omit to instruct `dstack` to create a new compartment" - ) - ), + plugins: Annotated[ + Optional[List[str]], Field(description="The server-side plugins to enable") ] = None -class RunpodConfig(CoreModel): - type: Literal["runpod"] = "runpod" - regions: Optional[List[str]] = None - creds: AnyRunpodCreds - - -class TensorDockConfig(CoreModel): - type: Annotated[Literal["tensordock"], Field(description="The type of backend")] = "tensordock" - regions: Optional[List[str]] = None - creds: Annotated[AnyTensorDockCreds, Field(description="The credentials")] - - -class VastAIConfig(CoreModel): - type: Annotated[Literal["vastai"], Field(description="The type of backend")] = "vastai" - regions: Optional[List[str]] = None - creds: Annotated[AnyVastAICreds, Field(description="The credentials")] - - -class DstackConfig(CoreModel): - type: Annotated[Literal["dstack"], Field(description="The type of backend")] = "dstack" - - -AnyBackendConfig = Union[ - AWSConfig, - AzureConfig, - CudoConfig, - DataCrunchConfig, - GCPConfig, - KubernetesConfig, - LambdaConfig, - NebiusConfig, - OCIConfig, - RunpodConfig, - TensorDockConfig, - VastAIConfig, - DstackConfig, -] - -BackendConfig = Annotated[AnyBackendConfig, Field(..., discriminator="type")] - - -class _BackendConfig(BaseModel): - __root__: BackendConfig - - -AnyBackendAPIConfig = Union[ - AWSConfig, - AzureConfig, - CudoConfig, - DataCrunchConfig, - GCPAPIConfig, - KubernetesAPIConfig, - LambdaConfig, - NebiusAPIConfig, - OCIConfig, - RunpodConfig, - TensorDockConfig, - VastAIConfig, - DstackConfig, -] - - -BackendAPIConfig = Annotated[AnyBackendAPIConfig, Field(..., discriminator="type")] - - -class _BackendAPIConfig(BaseModel): - __root__: 
BackendAPIConfig - - -class ProjectConfig(CoreModel): - name: Annotated[str, Field(description="The name of the project")] - backends: Annotated[List[BackendConfig], Field(description="The list of backends")] - - -class ServerConfig(CoreModel): - projects: Annotated[List[ProjectConfig], Field(description="The list of projects")] - - class ServerConfigManager: def load_config(self) -> bool: self.config = self._load_config() return self.config is not None async def init_config(self, session: AsyncSession): - self.config = await self._init_config(session=session, init_backends=True) + """ + Initializes the default server/config.yml. + The default config is empty or contains an existing `main` project config. + """ + self.config = await self._init_config(session) if self.config is not None: self._save_config(self.config) async def sync_config(self, session: AsyncSession): # Disable config.yml sync for https://github.com/dstackai/dstack/issues/815. return - # self.config = await self._init_config(session=session, init_backends=False) - # if self.config is not None: - # self._save_config(self.config) + + async def apply_encryption(self): + if self.config is None: + logger.info("No server/config.yml. 
Skipping encryption configuration.") + return + if self.config.encryption is not None: + encryption_services.init_encryption_keys(self.config.encryption.keys) async def apply_config(self, session: AsyncSession, owner: UserModel): if self.config is None: raise ValueError("Config is not loaded") + if self.config.default_permissions is not None: + set_default_permissions(self.config.default_permissions) for project_config in self.config.projects: - project = await projects_services.get_project_model_by_name( - session=session, - project_name=project_config.name, + await self._apply_project_config( + session=session, owner=owner, project_config=project_config ) - if not project: - await projects_services.create_project_model( - session=session, owner=owner, project_name=project_config.name + load_plugins(enabled_plugins=self.config.plugins or []) + + async def _apply_project_config( + self, + session: AsyncSession, + owner: UserModel, + project_config: ProjectConfig, + ): + project = await projects_services.get_project_model_by_name( + session=session, + project_name=project_config.name, + ) + if not project: + await projects_services.create_project_model( + session=session, owner=owner, project_name=project_config.name + ) + project = await projects_services.get_project_model_by_name_or_error( + session=session, project_name=project_config.name + ) + backends_to_delete = set( + dstack._internal.core.backends.configurators.list_available_backend_types() + ) + for backend_file_config in project_config.backends or []: + backend_config = file_config_to_config(backend_file_config) + backend_type = BackendType(backend_config.type) + backends_to_delete.difference_update([backend_type]) + backend_exists = any(backend_type == b.type for b in project.backends) + try: + current_backend_config = await backends_services.get_backend_config( + project=project, + backend_type=backend_type, ) - project = await projects_services.get_project_model_by_name_or_error( - session=session, 
project_name=project_config.name + except BackendNotAvailable: + logger.warning( + "Backend %s not available and won't be configured." + " Check that backend dependencies are installed.", + backend_type.value, ) - backends_to_delete = backends_services.list_available_backend_types() - for backend_config in project_config.backends: - config_info = config_to_internal_config(backend_config) - backend_type = BackendType(config_info.type) - try: - backends_to_delete.remove(backend_type) - except ValueError: - continue - current_config_info = await backends_services.get_config_info( + continue + if current_backend_config is not None: + current_source_backend_config = await backends_services.get_source_backend_config( project=project, backend_type=backend_type, ) - if config_info == current_config_info: + # current_source_backend_config may be missing for old backend records + comparable_backend_config = current_source_backend_config or current_backend_config + if backend_config == comparable_backend_config: continue - try: - if current_config_info is None: - await backends_services.create_backend( - session=session, project=project, config=config_info - ) - else: - await backends_services.update_backend( - session=session, project=project, config=config_info - ) - except Exception as e: - logger.warning("Failed to configure backend %s: %s", config_info.type, e) - await backends_services.delete_backends( - session=session, project=project, backends_types=backends_to_delete - ) + # current_backend_config may be None if backend exists + # but it's config is invalid (e.g. cannot be decrypted). + # Update backend in this case. 
+ if current_backend_config is None and not backend_exists: + apply_action = "create" + apply_func = backends_services.create_backend + else: + apply_action = "update" + apply_func = backends_services.update_backend + try: + await apply_func(session=session, project=project, config=backend_config) + except Exception as e: + logger.warning( + "Failed to %s backend %s in project %s: %s", + apply_action, + backend_config.type, + project.name, + e, + ) + await delete_backends_safe( + session=session, + project=project, + backends_types=list(backends_to_delete), + error=False, + ) - async def _init_config( - self, session: AsyncSession, init_backends: bool - ) -> Optional[ServerConfig]: + async def _init_config(self, session: AsyncSession) -> Optional[ServerConfig]: project = await projects_services.get_project_model_by_name( session=session, project_name=settings.DEFAULT_PROJECT_NAME, @@ -421,38 +200,20 @@ async def _init_config( # Force project reload to reflect updates when syncing await session.refresh(project) backends = [] - for backend_type in backends_services.list_available_backend_types(): - config_info = await backends_services.get_config_info( + for ( + backend_type + ) in dstack._internal.core.backends.configurators.list_available_backend_types(): + backend_config = await backends_services.get_backend_config( project=project, backend_type=backend_type ) - if config_info is not None: - backends.append(internal_config_to_config(config_info)) - if init_backends and len(backends) == 0: - backends = await self._init_backends(session=session, project=project) + if backend_config is not None: + backends.append(backend_config) return ServerConfig( - projects=[ProjectConfig(name=settings.DEFAULT_PROJECT_NAME, backends=backends)] + projects=[ProjectConfig(name=settings.DEFAULT_PROJECT_NAME, backends=backends)], + encryption=EncryptionConfig(keys=[]), + default_permissions=None, ) - async def _init_backends( - self, session: AsyncSession, project: ProjectModel - ) -> 
List[AnyConfigInfoWithCreds]: - backends = [] - for backend_type in backends_services.list_available_backend_types(): - configurator = backends_services.get_configurator(backend_type) - if configurator is None: - continue - config_infos = await run_async(configurator.get_default_configs) - for config_info in config_infos: - try: - await backends_services.create_backend( - session=session, project=project, config=config_info - ) - backends.append(internal_config_to_config(config_info)) - break - except Exception as e: - logger.debug("Failed to configure backend %s: %s", config_info.type, e) - return backends - def _load_config(self) -> Optional[ServerConfig]: try: with open(settings.SERVER_CONFIG_FILE_PATH) as f: @@ -470,13 +231,12 @@ def _save_config(self, config: ServerConfig): async def get_backend_config_yaml( project: ProjectModel, backend_type: BackendType ) -> BackendInfoYAML: - config_info = await backends_services.get_config_info( + backend_config = await backends_services.get_backend_config( project=project, backend_type=backend_type ) - if config_info is None: + if backend_config is None: raise ResourceNotExistsError() - config = internal_config_to_config(config_info) - config_yaml = config_to_yaml(config) + config_yaml = config_to_yaml(backend_config) return BackendInfoYAML( name=backend_type, config_yaml=config_yaml, @@ -488,9 +248,8 @@ async def create_backend_config_yaml( project: ProjectModel, config_yaml: str, ): - backend_config = config_yaml_to_backend_config(config_yaml) - config_info = config_to_internal_config(backend_config) - await backends_services.create_backend(session=session, project=project, config=config_info) + config = config_yaml_to_backend_config(config_yaml) + await backends_services.create_backend(session=session, project=project, config=config) async def update_backend_config_yaml( @@ -498,64 +257,35 @@ async def update_backend_config_yaml( project: ProjectModel, config_yaml: str, ): - backend_config = 
config_yaml_to_backend_config(config_yaml) - config_info = config_to_internal_config(backend_config) - await backends_services.update_backend(session=session, project=project, config=config_info) - - -server_config_manager = ServerConfigManager() - - -def internal_config_to_config(config_info: AnyConfigInfoWithCreds) -> BackendConfig: - backend_config = _BackendConfig.parse_obj(config_info.dict(exclude={"locations"})) - if config_info.type == "azure": - backend_config.__root__.regions = config_info.locations - return backend_config.__root__ + config = config_yaml_to_backend_config(config_yaml) + await backends_services.update_backend(session=session, project=project, config=config) -class _ConfigInfoWithCreds(CoreModel): - __root__: Annotated[AnyConfigInfoWithCreds, Field(..., discriminator="type")] +class _BackendConfigWithCreds(CoreModel): + """ + Model for parsing API and file YAML configs. + """ + __root__: Annotated[AnyBackendConfigWithCreds, Field(..., discriminator="type")] -def config_to_internal_config( - backend_config: Union[BackendConfig, BackendAPIConfig], -) -> AnyConfigInfoWithCreds: - backend_config_dict = backend_config.dict() - # Allow to not specify networking - if backend_config.type == "kubernetes": - if backend_config.networking is None: - backend_config_dict["networking"] = {} - if backend_config.type == "azure": - backend_config_dict["locations"] = backend_config_dict["regions"] - del backend_config_dict["regions"] - config_info = _ConfigInfoWithCreds.parse_obj(backend_config_dict) - return config_info.__root__ - -def config_yaml_to_backend_config(config_yaml: str) -> BackendAPIConfig: +def config_yaml_to_backend_config(config_yaml: str) -> AnyBackendConfigWithCreds: try: config_dict = yaml.load(config_yaml, yaml.FullLoader) except yaml.YAMLError: raise ServerClientError("Error parsing YAML") try: - backend_config = _BackendAPIConfig.parse_obj(config_dict).__root__ + backend_config = _BackendConfigWithCreds.parse_obj(config_dict).__root__ 
except ValidationError as e: raise ServerClientError(str(e)) return backend_config -def config_to_yaml(config: CoreModel) -> str: - return yaml.dump(config.dict(exclude_none=True), sort_keys=False) +def file_config_to_config(file_config: AnyBackendFileConfigWithCreds) -> AnyBackendConfigWithCreds: + backend_config_dict = file_config.dict() + backend_config = _BackendConfigWithCreds.parse_obj(backend_config_dict) + return backend_config.__root__ -def _fill_data(values: dict): - if values.get("data") is not None: - return values - if "filename" not in values: - raise ValueError() - try: - with open(Path(values["filename"]).expanduser()) as f: - values["data"] = f.read() - except OSError: - raise ValueError(f"No such file {values['filename']}") - return values +def config_to_yaml(config: CoreModel) -> str: + return yaml.dump(config.dict(exclude_none=True), sort_keys=False) diff --git a/src/dstack/_internal/server/services/docker.py b/src/dstack/_internal/server/services/docker.py index 13cc0ee46c..41580e473b 100644 --- a/src/dstack/_internal/server/services/docker.py +++ b/src/dstack/_internal/server/services/docker.py @@ -9,12 +9,16 @@ from typing_extensions import Annotated from dstack._internal.core.errors import DockerRegistryError -from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.configurations import RegistryAuth +from dstack._internal.core.models.common import CoreModel, RegistryAuth +from dstack._internal.server import settings as server_settings from dstack._internal.server.utils.common import join_byte_stream_checked +from dstack._internal.utils.docker import ( + LEGACY_DEFAULT_REGISTRY, + is_default_registry, + parse_image_name, +) DEFAULT_PLATFORM = "linux/amd64" -DEFAULT_REGISTRY = "index.docker.io" MAX_CONFIG_OBJECT_SIZE = 2**22 # 4 MiB REGISTRY_REQUEST_TIMEOUT = 20 @@ -31,21 +35,18 @@ def __call__(self, dxf: DXF, response: requests.Response) -> None: ) -class DockerImage(CoreModel): - class Config: - frozen = 
True - - image: str - registry: Optional[str] - repo: str - tag: str - digest: Optional[str] - - class ImageConfig(CoreModel): + user: Annotated[Optional[str], Field(alias="User")] = None entrypoint: Annotated[Optional[List[str]], Field(alias="Entrypoint")] = None cmd: Annotated[Optional[List[str]], Field(alias="Cmd")] = None + @validator("user") + def normalize_user(cls, v: Optional[str]) -> Optional[str]: + # If USER is not set, the corresponding field may be missing or set to an empty string + if v == "": + return None + return v + class ImageConfigObject(CoreModel): config: ImageConfig = ImageConfig() @@ -66,10 +67,14 @@ class ImageManifest(CoreModel): def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) -> ImageConfigObject: image = parse_image_name(image_name) + registry = image.registry + if registry is None or is_default_registry(registry): + registry = LEGACY_DEFAULT_REGISTRY + registry_client = DXF( - host=image.registry or DEFAULT_REGISTRY, + host=registry, repo=image.repo, - auth=DXFAuthAdapter(registry_auth), + auth=DXFAuthAdapter(registry_auth), # type: ignore[assignment] timeout=REGISTRY_REQUEST_TIMEOUT, ) @@ -80,11 +85,10 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) -> ) manifest = ImageManifest.__response__.parse_raw(manifest_resp) config_stream = registry_client.pull_blob(manifest.config.digest) - config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE) + config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE) # type: ignore[arg-type] if config_resp is None: raise DockerRegistryError( - "Image config object exceeds the size limit of " - f"{MAX_CONFIG_OBJECT_SIZE} bytes" + f"Image config object exceeds the size limit of {MAX_CONFIG_OBJECT_SIZE} bytes" ) return ImageConfigObject.__response__.parse_raw(config_resp) @@ -92,56 +96,24 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) -> raise DockerRegistryError(e) -def 
parse_image_name(image: str) -> DockerImage: - """ - :param image: docker image name - :return: registry host, repo, tag, digest - - >>> parse_image_name("ubuntu:22.04") - DockerImage(registry=None, repo='library/ubuntu', tag='22.04', digest=None) - >>> parse_image_name("dstackai/miniforge:py3.9-0.2") - DockerImage(registry=None, repo='dstackai/miniforge', tag='py3.9-0.2', digest=None) - >>> parse_image_name("ghcr.io/dstackai/miniforge") - DockerImage(registry='ghcr.io', repo='dstackai/miniforge', tag='latest', digest=None) - >>> parse_image_name("dstackai/miniforge@sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15") - DockerImage(registry=None, repo='dstackai/miniforge', tag='latest', digest='sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15') - """ - - digest = None - if "@" in image.split("/")[-1]: - image, digest = image.rsplit("@", maxsplit=1) - - tag = "latest" - if ":" in image.split("/")[-1]: # avoid detecting port as a tag - image, tag = image.rsplit(":", maxsplit=1) - - registry = None - components = image.split("/") - if len(components) == 1: # default registry, official image - repo = "library/" + components[0] - elif not is_host(components[0]): # default registry, custom image - repo = "/".join(components) - else: # custom registry - registry = components[0] - repo = "/".join(components[1:]) - - return DockerImage(image=image, registry=registry, repo=repo, tag=tag, digest=digest) - - -def is_host(s: str) -> bool: - """ - >>> is_host("localhost") - True - >>> is_host("localhost:5000") - True - >>> is_host("ghcr.io") - True - >>> is_host("127.0.0.1") - True - >>> is_host("dstackai") - False - """ - return s == "localhost" or ":" in s or "." 
in s +def apply_server_docker_defaults( + image_name: str, + registry_auth: Optional[RegistryAuth], +) -> tuple[str, Optional[RegistryAuth]]: + if parse_image_name(image_name).registry is not None: + return image_name, registry_auth + if server_settings.SERVER_DEFAULT_DOCKER_REGISTRY is not None: + image_name = f"{server_settings.SERVER_DEFAULT_DOCKER_REGISTRY}/{image_name}" + if ( + registry_auth is None + and server_settings.SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME is not None + and server_settings.SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD is not None + ): + registry_auth = RegistryAuth( + username=server_settings.SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME, + password=server_settings.SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD, + ) + return image_name, registry_auth DOCKER_TARGET_PATH_PATTERN = re.compile(r"^(/[^/\0]*)+/?$") diff --git a/src/dstack/_internal/server/services/encryption/__init__.py b/src/dstack/_internal/server/services/encryption/__init__.py new file mode 100644 index 0000000000..fd678a497c --- /dev/null +++ b/src/dstack/_internal/server/services/encryption/__init__.py @@ -0,0 +1,102 @@ +from contextlib import contextmanager +from typing import List, Tuple, Union + +from dstack._internal.core.errors import DstackError +from dstack._internal.server.models import EncryptedString +from dstack._internal.server.services.encryption.keys.aes import ( + AESEncryptionKey, + AESEncryptionKeyConfig, +) +from dstack._internal.server.services.encryption.keys.base import EncryptionKey +from dstack._internal.server.services.encryption.keys.identity import ( + IdentityEncryptionKey, + IdentityEncryptionKeyConfig, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class EncryptionError(DstackError): + pass + + +AnyEncryptionKeyConfig = Union[ + AESEncryptionKeyConfig, + IdentityEncryptionKeyConfig, +] + + +_ENCRYPTION_KEY_CLASSES = [ + IdentityEncryptionKey, + AESEncryptionKey, +] +_ENCRYPTION_KEY_TYPE_TO_ENCRYPTION_KEY_CLASS = 
{c.TYPE: c for c in _ENCRYPTION_KEY_CLASSES} + + +# TODO: Introduce EncryptionKeyConfigurator to support external providers +def get_encryption_key(config: AnyEncryptionKeyConfig) -> EncryptionKey: + return _ENCRYPTION_KEY_TYPE_TO_ENCRYPTION_KEY_CLASS[config.type](config) + + +def get_identity_encryption_key() -> IdentityEncryptionKey: + return IdentityEncryptionKey(IdentityEncryptionKeyConfig()) + + +_encryption_keys = [get_identity_encryption_key()] + + +def init_encryption_keys(encryption_key_configs: List[AnyEncryptionKeyConfig]): + global _encryption_keys + _encryption_keys = [get_encryption_key(c) for c in encryption_key_configs] + if not any(isinstance(key, IdentityEncryptionKey) for key in _encryption_keys): + _encryption_keys.append(get_identity_encryption_key()) + + +@contextmanager +def encryption_keys_context(encryption_keys: List[EncryptionKey]): + """ + A helper context manager to be used in tests. It's not concurrency-safe. + """ + global _encryption_keys + prev_encryption_keys = _encryption_keys + _encryption_keys = encryption_keys + try: + yield + finally: + _encryption_keys = prev_encryption_keys + + +def encrypt(plaintext: str) -> str: + key = _encryption_keys[0] + ciphertext = key.encrypt(plaintext) + packed_ciphertext = _pack_ciphertext(ciphertext, key_type=key.TYPE, key_name=key.name) + return packed_ciphertext + + +def decrypt(ciphertext: str) -> str: + key_type, _, ciphertext = _unpack_ciphertext(ciphertext) + # Ignore key_name when decrypting + for i, key in enumerate(_encryption_keys): + if key.TYPE != key_type: + continue + try: + return key.decrypt(ciphertext) + except Exception: + logger.debug(f"Attempt to decrypt ciphertext with key #{i} failed") + raise EncryptionError("All keys failed to decrypt ciphertext") + + +def _pack_ciphertext(ciphertext: str, key_type: str, key_name: str) -> str: + return f"enc:{key_type}:{key_name}:{ciphertext}" + + +def _unpack_ciphertext(packed_ciphertext: str) -> Tuple[str, str, str]: + _, key_type, 
key_name, ciphertext = packed_ciphertext.split(":", maxsplit=3) + return key_type, key_name, ciphertext + + +EncryptedString.set_encrypt_decrypt( + encrypt_func=encrypt, + decrypt_func=decrypt, +) diff --git a/src/dstack/_internal/server/services/encryption/keys/__init__.py b/src/dstack/_internal/server/services/encryption/keys/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/encryption/keys/aes.py b/src/dstack/_internal/server/services/encryption/keys/aes.py new file mode 100644 index 0000000000..4c6e08064e --- /dev/null +++ b/src/dstack/_internal/server/services/encryption/keys/aes.py @@ -0,0 +1,68 @@ +import os +from base64 import b64decode, b64encode +from typing import Literal + +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +from pydantic import Field, validator +from typing_extensions import Annotated + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.server.services.encryption.keys.base import EncryptionKey + + +class AESEncryptionKeyConfig(CoreModel): + type: Annotated[Literal["aes"], Field(description="The type of the key")] = "aes" + name: Annotated[str, Field(description="The key name for key identification")] + secret: Annotated[str, Field(description="Base64-encoded AES-256 key")] + + @validator("name") + def validate_name(cls, v): + if not v.isalnum(): + raise ValueError("Key name must be alphanumeric") + return v + + @validator("secret") + def validate_secret(cls, v): + try: + key = b64decode(v, validate=True) + except Exception as e: + raise ValueError("Failed to decode secret from base64") from e + if len(key) != 32: + raise ValueError(f"AES key must be 32 bytes. 
Got {len(key)} bytes") + return v + + +class AESEncryptionKey(EncryptionKey): + TYPE = "aes" + + def __init__(self, config: AESEncryptionKeyConfig) -> None: + self.config = config + self.key = b64decode(config.secret) + + @property + def name(self) -> str: + return self.config.name + + def encrypt(self, plaintext: str) -> str: + # Generate a random 12-byte (96-bit) nonce (recommended size for GCM) + nonce = os.urandom(12) + + # Create an AESGCM object and encrypt the plaintext + aesgcm = AESGCM(self.key) + ciphertext = aesgcm.encrypt(nonce, plaintext.encode("utf-8"), None) + + # base64-encode the nonce and ciphertext for storage + return b64encode(nonce + ciphertext).decode("utf-8") + + def decrypt(self, ciphertext: str) -> str: + data = b64decode(ciphertext) + + # Extract the nonce and ciphertext + nonce = data[:12] + decoded_ciphertext = data[12:] + + # Create an AESGCM object and decrypt the ciphertext + aesgcm = AESGCM(self.key) + plaintext = aesgcm.decrypt(nonce, decoded_ciphertext, None) + + return plaintext.decode("utf-8") diff --git a/src/dstack/_internal/server/services/encryption/keys/base.py b/src/dstack/_internal/server/services/encryption/keys/base.py new file mode 100644 index 0000000000..91a38dadd6 --- /dev/null +++ b/src/dstack/_internal/server/services/encryption/keys/base.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod +from typing import ClassVar + + +class EncryptionKey(ABC): + TYPE: ClassVar[str] + + @property + @abstractmethod + def name(self) -> str: + pass + + @abstractmethod + def encrypt(self, plaintext: str) -> str: + pass + + @abstractmethod + def decrypt(self, ciphertext: str) -> str: + pass diff --git a/src/dstack/_internal/server/services/encryption/keys/identity.py b/src/dstack/_internal/server/services/encryption/keys/identity.py new file mode 100644 index 0000000000..040a68ed22 --- /dev/null +++ b/src/dstack/_internal/server/services/encryption/keys/identity.py @@ -0,0 +1,28 @@ +from typing import Literal + +from pydantic 
import Field +from typing_extensions import Annotated + +from dstack._internal.core.models.common import CoreModel +from dstack._internal.server.services.encryption.keys.base import EncryptionKey + + +class IdentityEncryptionKeyConfig(CoreModel): + type: Annotated[Literal["identity"], Field(description="The type of the key")] = "identity" + + +class IdentityEncryptionKey(EncryptionKey): + TYPE = "identity" + + def __init__(self, config: IdentityEncryptionKeyConfig) -> None: + pass + + @property + def name(self) -> str: + return "noname" + + def encrypt(self, plaintext: str) -> str: + return plaintext + + def decrypt(self, ciphertext: str) -> str: + return ciphertext diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py new file mode 100644 index 0000000000..dd7b33dc7f --- /dev/null +++ b/src/dstack/_internal/server/services/events.py @@ -0,0 +1,491 @@ +import uuid +from dataclasses import dataclass +from datetime import datetime +from typing import Optional, Union + +from sqlalchemy import and_, exists, or_, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.models.events import Event, EventTarget, EventTargetType +from dstack._internal.core.models.users import GlobalRole +from dstack._internal.server import settings +from dstack._internal.server.models import ( + EventModel, + EventTargetModel, + FleetModel, + GatewayModel, + InstanceModel, + JobModel, + MemberModel, + ProjectModel, + RunModel, + SecretModel, + UserModel, + VolumeModel, +) +from dstack._internal.server.services.logging import fmt_entity +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class SystemActor: + """Represents the system as the actor of an event""" + + def fmt(self) -> str: + return "system" + + +@dataclass +class UserActor: + """ + Represents a user as 
the actor of an event. + + **NOTE**: Prefer using `UserActor.from_user` to create `UserActor` instances, + unless you don't have a complete `UserModel` available. + """ + + user_id: uuid.UUID + user_name: str + + @staticmethod + def from_user(user: UserModel) -> "UserActor": + return UserActor(user_id=user.id, user_name=user.name) + + def fmt(self) -> str: + return fmt_entity("user", self.user_id, self.user_name) + + +AnyActor = Union[SystemActor, UserActor] + + +@dataclass( + frozen=True, # to enforce the __post_init__ invariant +) +class Target: + """ + Target specification for event emission. + + **NOTE**: Prefer using `Target.from_model` to create `Target` instances, + unless you don't have a complete model available. + """ + + type: EventTargetType + project_id: Optional[uuid.UUID] + id: uuid.UUID + name: str + + def __post_init__(self): + if self.type == EventTargetType.USER and self.project_id is not None: + raise ValueError("User target cannot have project_id") + if self.type != EventTargetType.USER and self.project_id is None: + raise ValueError(f"{self.type} target must have project_id") + if self.type == EventTargetType.PROJECT and self.id != self.project_id: + raise ValueError("Project target id must be equal to project_id") + + @staticmethod + def from_model( + model: Union[ + FleetModel, + GatewayModel, + InstanceModel, + JobModel, + ProjectModel, + RunModel, + SecretModel, + UserModel, + VolumeModel, + ], + ) -> "Target": + if isinstance(model, FleetModel): + return Target( + type=EventTargetType.FLEET, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) + if isinstance(model, GatewayModel): + return Target( + type=EventTargetType.GATEWAY, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) + if isinstance(model, InstanceModel): + return Target( + type=EventTargetType.INSTANCE, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) + if 
isinstance(model, JobModel): + return Target( + type=EventTargetType.JOB, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.job_name, + ) + if isinstance(model, ProjectModel): + return Target( + type=EventTargetType.PROJECT, + project_id=model.id, + id=model.id, + name=model.name, + ) + if isinstance(model, RunModel): + return Target( + type=EventTargetType.RUN, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.run_name, + ) + if isinstance(model, SecretModel): + return Target( + type=EventTargetType.SECRET, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) + if isinstance(model, UserModel): + return Target( + type=EventTargetType.USER, + project_id=None, + id=model.id, + name=model.name, + ) + if isinstance(model, VolumeModel): + return Target( + type=EventTargetType.VOLUME, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) + raise ValueError(f"Unsupported model type: {type(model)}") + + def fmt(self) -> str: + return fmt_entity(self.type.value, self.id, self.name) + + +def emit(session: AsyncSession, message: str, actor: AnyActor, targets: list[Target]) -> None: + """ + Emit an event - add it to the current session without committing. + + Usage guidelines: + - Message: + - Use past tense - events should describe completed actions. + Bad: "Creating project" + Good: "Project created" + - Do not duplicate target and actor names in the message. + Bad: "User John created project MyProject" + Good: "Project created" + - Actor: + - Pass `UserActor` for events about user actions, e.g., in API handlers. + - Pass `SystemActor` for system-generated events, e.g., in background jobs. + - Targets: + - Link the event to one or more entities affected by it. + E.g., for a "Job assigned to instance" event, link it to the job and the instance. + - Do not link the event to parent entities of the affected entities. 
+ E.g., the "Instance created" event should be linked to the instance only, + not to the fleet or project. Transitive relationships with parent entities + are inferred automatically when listing events using the within_* filters. + - **Important**: If linking the event to multiple targets with different access scopes + (e.g., entities in different projects, or different users), ensure that this does not + leak sensitive information. If a user has access to at least one of the targets, + they will see the entire event with all targets. If this is not desired, + consider emitting multiple separate events instead. + """ + if not targets: + raise ValueError("At least one target must be specified") + message = message.strip().rstrip(".").replace("\n", " ") + if not message: + raise ValueError("Message cannot be empty") + + logger.info( + "Emitting event: %s. Event targets: %s. Actor: %s", + message, + ", ".join(target.fmt() for target in targets), + actor.fmt(), + ) + + if settings.SERVER_EVENTS_TTL_SECONDS <= 0: + return + event = EventModel( + id=uuid.uuid4(), + message=message, + actor_user_id=actor.user_id if isinstance(actor, UserActor) else None, + recorded_at=get_current_datetime(), + targets=[], + ) + for target in targets: + event.targets.append( + EventTargetModel( + entity_type=target.type, + entity_project_id=target.project_id, + entity_id=target.id, + entity_name=target.name, + ) + ) + session.add(event) + + +async def list_events( + session: AsyncSession, + user: UserModel, # the user requesting the events + target_projects: Optional[list[uuid.UUID]], + target_users: Optional[list[uuid.UUID]], + target_fleets: Optional[list[uuid.UUID]], + target_instances: Optional[list[uuid.UUID]], + target_runs: Optional[list[uuid.UUID]], + target_jobs: Optional[list[uuid.UUID]], + target_volumes: Optional[list[uuid.UUID]], + target_gateways: Optional[list[uuid.UUID]], + target_secrets: Optional[list[uuid.UUID]], + within_projects: Optional[list[uuid.UUID]], + 
within_fleets: Optional[list[uuid.UUID]], + within_runs: Optional[list[uuid.UUID]], + include_target_types: Optional[list[EventTargetType]], + actors: Optional[list[Optional[uuid.UUID]]], + prev_recorded_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> list[Event]: + target_visibility_filters = [] + if user.global_role != GlobalRole.ADMIN: + query = select(MemberModel.project_id).where(MemberModel.user_id == user.id) + res = await session.execute(query) + # In Postgres, fetching project IDs separately is orders of magnitude faster + # than using a subquery. + project_ids = list(res.unique().scalars().all()) + target_visibility_filters.append( + or_( + EventTargetModel.entity_project_id.in_(project_ids), + and_( + EventTargetModel.entity_project_id.is_(None), + EventTargetModel.entity_type == EventTargetType.USER, + EventTargetModel.entity_id == user.id, + ), + ) + ) + target_filters = [] + if target_projects is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.PROJECT, + EventTargetModel.entity_id.in_(target_projects), + ) + ) + if target_users is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.USER, + EventTargetModel.entity_id.in_(target_users), + ) + ) + if target_fleets is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.FLEET, + EventTargetModel.entity_id.in_(target_fleets), + ) + ) + if target_instances is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.INSTANCE, + EventTargetModel.entity_id.in_(target_instances), + ) + ) + if target_runs is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.RUN, + EventTargetModel.entity_id.in_(target_runs), + ) + ) + if target_jobs is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.JOB, + 
EventTargetModel.entity_id.in_(target_jobs), + ) + ) + if target_volumes is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.VOLUME, + EventTargetModel.entity_id.in_(target_volumes), + ) + ) + if target_gateways is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.GATEWAY, + EventTargetModel.entity_id.in_(target_gateways), + ) + ) + if target_secrets is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.SECRET, + EventTargetModel.entity_id.in_(target_secrets), + ) + ) + if within_projects is not None: + target_filters.append(EventTargetModel.entity_project_id.in_(within_projects)) + if within_fleets is not None: + query = select(InstanceModel.id).where(InstanceModel.fleet_id.in_(within_fleets)) + res = await session.execute(query) + # In Postgres, fetching instance IDs separately is orders of magnitude faster + # than using a subquery. + instance_ids = list(res.unique().scalars().all()) + target_filters.append( + or_( + and_( + EventTargetModel.entity_type == EventTargetType.FLEET, + EventTargetModel.entity_id.in_(within_fleets), + ), + and_( + EventTargetModel.entity_type == EventTargetType.INSTANCE, + EventTargetModel.entity_id.in_(instance_ids), + ), + ) + ) + if within_runs is not None: + query = select(JobModel.id).where(JobModel.run_id.in_(within_runs)) + res = await session.execute(query) + # In Postgres, fetching job IDs separately is orders of magnitude faster + # than using a subquery. 
+ job_ids = list(res.unique().scalars().all()) + target_filters.append( + or_( + and_( + EventTargetModel.entity_type == EventTargetType.RUN, + EventTargetModel.entity_id.in_(within_runs), + ), + and_( + EventTargetModel.entity_type == EventTargetType.JOB, + EventTargetModel.entity_id.in_(job_ids), + ), + ) + ) + if include_target_types is not None: + target_filters.append(EventTargetModel.entity_type.in_(include_target_types)) + + event_filters = [] + if actors is not None: + event_filters.append( + or_( + EventModel.actor_user_id.is_(None) if None in actors else False, + EventModel.actor_user_id.in_( + [actor_id for actor_id in actors if actor_id is not None] + ), + ) + ) + if prev_recorded_at is not None: + if ascending: + if prev_id is None: + event_filters.append(EventModel.recorded_at > prev_recorded_at) + else: + event_filters.append( + or_( + EventModel.recorded_at > prev_recorded_at, + and_(EventModel.recorded_at == prev_recorded_at, EventModel.id < prev_id), + ) + ) + else: + if prev_id is None: + event_filters.append(EventModel.recorded_at < prev_recorded_at) + else: + event_filters.append( + or_( + EventModel.recorded_at < prev_recorded_at, + and_(EventModel.recorded_at == prev_recorded_at, EventModel.id > prev_id), + ) + ) + order_by = (EventModel.recorded_at.desc(), EventModel.id) + if ascending: + order_by = (EventModel.recorded_at.asc(), EventModel.id.desc()) + query = ( + select(EventModel) + .order_by(*order_by) + .limit(limit) + .options( + ( + joinedload(EventModel.targets) + .joinedload(EventTargetModel.entity_project) + .load_only(ProjectModel.name, ProjectModel.original_name, ProjectModel.deleted) + .noload(ProjectModel.owner) + ), + joinedload(EventModel.actor_user).load_only( + UserModel.name, UserModel.original_name, UserModel.deleted + ), + ) + ) + if event_filters: + query = query.where(*event_filters) + if target_filters: + # Each returned event should reference at least one target the user **wants** to see + # (as defined by 
user-provided filters). + query = query.where( + exists().where( + and_( + EventTargetModel.event_id == EventModel.id, + *target_filters, + ) + ) + ) + if target_visibility_filters: + # Each returned event should reference at least one target the user **can** see + # (as defined by project membership). + query = query.where( + exists().where( + and_( + EventTargetModel.event_id == EventModel.id, + *target_visibility_filters, + ) + ) + ) + res = await session.execute(query) + event_models = res.unique().scalars().all() + return list(map(event_model_to_event, event_models)) + + +def event_target_model_to_event_target(model: EventTargetModel) -> EventTarget: + project_name = None + is_project_deleted = None + if model.entity_project is not None: + project_name = model.entity_project.name + is_project_deleted = model.entity_project.deleted + if is_project_deleted and model.entity_project.original_name is not None: + project_name = model.entity_project.original_name + return EventTarget( + type=model.entity_type.value, + project_id=model.entity_project_id, + project_name=project_name, + is_project_deleted=is_project_deleted, + id=model.entity_id, + name=model.entity_name, + ) + + +def event_model_to_event(event_model: EventModel) -> Event: + actor_user_name = None + is_actor_user_deleted = None + if event_model.actor_user is not None: + actor_user_name = event_model.actor_user.name + is_actor_user_deleted = event_model.actor_user.deleted + if is_actor_user_deleted and event_model.actor_user.original_name is not None: + actor_user_name = event_model.actor_user.original_name + targets = list(map(event_target_model_to_event_target, event_model.targets)) + return Event( + id=event_model.id, + message=event_model.message, + recorded_at=event_model.recorded_at, + actor_user_id=event_model.actor_user_id, + actor_user=actor_user_name, + is_actor_user_deleted=is_actor_user_deleted, + targets=targets, + ) diff --git a/src/dstack/_internal/server/services/exports.py 
b/src/dstack/_internal/server/services/exports.py new file mode 100644 index 0000000000..276374646c --- /dev/null +++ b/src/dstack/_internal/server/services/exports.py @@ -0,0 +1,482 @@ +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager, nullcontext +from typing import Optional + +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from dstack._internal.core.errors import ( + ForbiddenError, + ResourceExistsError, + ResourceNotExistsError, + ServerClientError, +) +from dstack._internal.core.models.exports import ( + Export, + ExportedFleet, + ExportedGateway, + ExportImport, +) +from dstack._internal.core.models.users import GlobalRole +from dstack._internal.core.services import validate_dstack_resource_name +from dstack._internal.server.const import GLOBAL_EXPORTS_LOCK_NAMESPACE +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite +from dstack._internal.server.models import ( + ExportedFleetModel, + ExportedGatewayModel, + ExportModel, + FleetModel, + GatewayModel, + ImportModel, + ProjectModel, + ProjectRole, + UserModel, +) +from dstack._internal.server.services.fleets import get_fleet_spec, list_project_fleet_models +from dstack._internal.server.services.gateways import list_project_gateway_models +from dstack._internal.server.services.locking import get_locker, string_to_lock_id +from dstack._internal.server.services.projects import ( + get_user_project_role, + list_project_models, + list_user_project_models, +) + + +@asynccontextmanager +async def get_export_model_by_name_for_update( + session: AsyncSession, project: ProjectModel, name: str +) -> AsyncGenerator[Optional[ExportModel], None]: + """ + Fetch export from the database and lock it for update. + + **NOTE**: commit changes to the database before exiting from this context manager, + so that in-memory locks are only released after commit. 
+ """ + filters = [ + ExportModel.project_id == project.id, + ExportModel.name == name, + ] + res = await session.execute(select(ExportModel.id).where(*filters)) + export_id = res.scalars().one_or_none() + if not export_id: + yield None + else: + async with get_locker(get_db().dialect_name).lock_ctx( + ExportModel.__tablename__, [export_id] + ): + # Refetch after lock + res = await session.execute( + select(ExportModel) + .where(ExportModel.id == export_id, *filters) + .options( + selectinload( + ExportModel.imports.and_( + ImportModel.project.has(ProjectModel.deleted == False) + ) + ) + .joinedload(ImportModel.project) + .load_only(ProjectModel.name), + selectinload( + ExportModel.exported_fleets.and_( + ExportedFleetModel.fleet.has(FleetModel.deleted == False) + ) + ) + .joinedload(ExportedFleetModel.fleet) + .load_only(FleetModel.name), + selectinload(ExportModel.exported_gateways) + .joinedload(ExportedGatewayModel.gateway) + .load_only(GatewayModel.name), + ) + .with_for_update(key_share=True) + ) + yield res.scalars().one_or_none() + + +async def export_exists(session: AsyncSession, project: ProjectModel, name: str) -> bool: + res = await session.execute( + select(func.count()) + .select_from(ExportModel) + .where(ExportModel.project_id == project.id, ExportModel.name == name) + ) + return res.scalar_one() > 0 + + +async def create_export( + session: AsyncSession, + project: ProjectModel, + user: UserModel, + name: str, + is_global: bool, + importer_project_names: list[str], + exported_fleet_names: list[str], + exported_gateway_names: list[str], +) -> Export: + validate_dstack_resource_name(name) + if is_global and importer_project_names: + raise ServerClientError( + "Do not specify any importer projects when creating a global export." 
async def update_export(
    session: AsyncSession,
    project: ProjectModel,
    user: UserModel,
    name: str,
    set_global: bool,
    unset_global: bool,
    add_importer_project_names: list[str],
    remove_importer_project_names: list[str],
    add_exported_fleet_names: list[str],
    remove_exported_fleet_names: list[str],
    add_exported_gateway_names: list[str],
    remove_exported_gateway_names: list[str],
) -> Export:
    """
    Apply a set of changes to an existing export and return the updated export.

    The export is looked up by `name` within `project` and locked through
    `get_export_model_by_name_for_update`. When `set_global` is requested, the
    `GLOBAL_EXPORTS_LOCK_NAMESPACE` lock is taken first, as `set_as_global` requires.

    Args:
        session: DB session used for locking, loading and committing.
        project: Project that owns the export.
        user: User performing the update; passed to the importer and global checks.
        name: Name of the export to update.
        set_global: Make the export global.
        unset_global: Make the export non-global.
        add_importer_project_names: Projects to add as importers.
        remove_importer_project_names: Projects to remove from importers.
        add_exported_fleet_names: Fleets to start exporting.
        remove_exported_fleet_names: Fleets to stop exporting.
        add_exported_gateway_names: Gateways to start exporting.
        remove_exported_gateway_names: Gateways to stop exporting.

    Returns:
        The updated export converted with `export_model_to_export`.

    Raises:
        ResourceNotExistsError: If the export is not found in the project.
        ServerClientError: If no changes are specified, if `set_global` and
            `unset_global` are both set, if a global-status change is combined with
            importer changes, or if the same name is listed for both addition and
            removal. The add/remove helpers may raise further errors.
    """
    if set_global:
        if is_db_sqlite():
            # Start new transaction to see committed changes after lock
            await session.commit()
        elif is_db_postgres():
            # Transaction-level advisory lock keyed by the global exports namespace.
            await session.execute(
                select(
                    func.pg_advisory_xact_lock(string_to_lock_id(GLOBAL_EXPORTS_LOCK_NAMESPACE))
                )
            )
        global_exports_lock, _ = get_locker(get_db().dialect_name).get_lockset(
            GLOBAL_EXPORTS_LOCK_NAMESPACE
        )
    else:
        # No global lock needed; use a no-op context manager instead.
        global_exports_lock = nullcontext()

    async with (
        global_exports_lock,
        get_export_model_by_name_for_update(session, project, name) as export,
    ):
        if export is None:
            raise ResourceNotExistsError(f"Export {name!r} not found in project {project.name!r}")

        # Reject a request that would change nothing.
        if (
            not set_global
            and not unset_global
            and not add_importer_project_names
            and not remove_importer_project_names
            and not add_exported_fleet_names
            and not remove_exported_fleet_names
            and not add_exported_gateway_names
            and not remove_exported_gateway_names
        ):
            raise ServerClientError("No changes specified")
        if set_global and unset_global:
            raise ServerClientError("Cannot set and unset global at the same time")
        if (set_global or unset_global) and (
            add_importer_project_names or remove_importer_project_names
        ):
            raise ServerClientError(
                "Cannot change global status and add/remove importers at the same time"
            )

        # Project names are compared case-insensitively; fleet and gateway names are not.
        add_importer_project_names = list(map(str.lower, add_importer_project_names))
        remove_importer_project_names = list(map(str.lower, remove_importer_project_names))

        # The same name must not appear in both the add and the remove list.
        add_remove_conflict_projects = set(add_importer_project_names) & set(
            remove_importer_project_names
        )
        if add_remove_conflict_projects:
            raise ServerClientError(
                f"Projects {add_remove_conflict_projects} are listed for both addition and removal."
                " Cannot add and remove at the same time"
            )
        add_remove_conflict_fleets = set(add_exported_fleet_names) & set(
            remove_exported_fleet_names
        )
        if add_remove_conflict_fleets:
            raise ServerClientError(
                f"Fleets {add_remove_conflict_fleets} are listed for both addition and removal."
                " Cannot add and remove at the same time"
            )
        add_remove_conflict_gateways = set(add_exported_gateway_names) & set(
            remove_exported_gateway_names
        )
        if add_remove_conflict_gateways:
            raise ServerClientError(
                f"Gateways {add_remove_conflict_gateways} are listed for both addition and removal."
                " Cannot add and remove at the same time"
            )

        # Additions are applied before removals; the global flag is changed last.
        await add_importer_projects(session, user, export, add_importer_project_names)
        await add_exported_fleets(session, export, add_exported_fleet_names)
        await add_exported_gateways(session, export, add_exported_gateway_names)
        await remove_importer_projects(export, remove_importer_project_names)
        await remove_exported_fleets(export, remove_exported_fleet_names)
        await remove_exported_gateways(export, remove_exported_gateway_names)
        if unset_global:
            await unset_as_global(export)
        if set_global:
            await set_as_global(session, export, user)
        await session.commit()
        return export_model_to_export(export)
async def add_importer_projects(
    session: AsyncSession, user: UserModel, export: ExportModel, names: list[str]
) -> None:
    """
    Register the named projects as importers of `export`.

    Names are matched case-insensitively. Does nothing when `names` is empty.

    Raises:
        ServerClientError: If a name is listed more than once, if a project is
            already importing the export, if the exporting project itself is
            listed, or if a project is not found among the projects returned for
            `user` (restricted to projects where the user is a project admin
            unless the user is a global admin).
    """
    if not names:
        return

    names = [project_name.lower() for project_name in names]
    if len(set(names)) != len(names):
        raise ServerClientError("Some importer projects are listed for addition more than once")

    current_importers = {imp.project.name.lower() for imp in export.imports}
    already_importing = current_importers & set(names)
    if already_importing:
        raise ServerClientError(
            f"Projects {already_importing} are already importing export {export.name!r}"
        )

    if export.project.name.lower() in names:
        raise ServerClientError(f"Project {export.project.name!r} cannot import from itself")

    user_projects = await list_user_project_models(
        session, user, only_names=True, include_members=True
    )
    is_global_admin = user.global_role == GlobalRole.ADMIN
    projects = []
    for candidate in user_projects:
        if candidate.name.lower() not in names:
            continue
        # Non-global-admins may only add projects they administer.
        if not is_global_admin and get_user_project_role(user, candidate) != ProjectRole.ADMIN:
            continue
        projects.append(candidate)

    missing = set(names) - {p.name.lower() for p in projects}
    if missing:
        raise ServerClientError(
            f"Projects {missing} not found or you are not allowed to add them as importers."
            " Only project admins can add a project as importer"
        )

    for importer in projects:
        export.imports.append(ImportModel(project=importer))
async def remove_exported_fleets(export: ExportModel, names: list[str]) -> None:
    """
    Stop exporting the named fleets through `export`.

    Does nothing when `names` is empty, matching `remove_importer_projects` and
    the `add_*` helpers. Previously an empty list still rebuilt and reassigned
    `export.exported_fleets`.

    Args:
        export: Export whose `exported_fleets` collection is modified in place.
        names: Names of fleets to remove from the export.

    Raises:
        ServerClientError: If a name is listed more than once, or if a named fleet
            is not currently exported by `export`.
    """
    if not names:
        # Nothing to remove: leave the relationship collection untouched.
        return
    if len(names) != len(set(names)):
        raise ServerClientError("Some fleets are listed for removal more than once")
    existing = {ef.fleet.name for ef in export.exported_fleets}
    if missing := set(names) - existing:
        raise ServerClientError(f"Fleets {missing} are not exported by export {export.name!r}")
    export.exported_fleets = [ef for ef in export.exported_fleets if ef.fleet.name not in names]
async def remove_exported_gateways(export: ExportModel, names: list[str]) -> None:
    """
    Stop exporting the named gateways through `export`.

    Does nothing when `names` is empty, matching `remove_importer_projects` and
    the `add_*` helpers. Previously an empty list still rebuilt and reassigned
    `export.exported_gateways`.

    Args:
        export: Export whose `exported_gateways` collection is modified in place.
        names: Names of gateways to remove from the export.

    Raises:
        ServerClientError: If a name is listed more than once, or if a named
            gateway is not currently exported by `export`.
    """
    if not names:
        # Nothing to remove: leave the relationship collection untouched.
        return
    if len(names) != len(set(names)):
        raise ServerClientError("Some gateways are listed for removal more than once")
    existing = {eg.gateway.name for eg in export.exported_gateways}
    if missing := set(names) - existing:
        raise ServerClientError(f"Gateways {missing} are not exported by export {export.name!r}")
    export.exported_gateways = [
        eg for eg in export.exported_gateways if eg.gateway.name not in names
    ]
async def get_archive_model(
    session: AsyncSession,
    id: uuid.UUID,
    user: Optional[UserModel] = None,
) -> Optional[FileArchiveModel]:
    """
    Fetch a file archive row by id.

    When `user` is given, the lookup is additionally restricted to archives whose
    `user_id` matches that user. Returns the result of `scalar()` on the query,
    i.e. `None` when no row matches.
    """
    conditions = [FileArchiveModel.id == id]
    if user is not None:
        conditions.append(FileArchiveModel.user_id == user.id)
    result = await session.execute(select(FileArchiveModel).where(*conditions))
    return result.scalar()
async def get_archive_by_hash(
    session: AsyncSession,
    user: UserModel,
    hash: str,
) -> Optional[FileArchive]:
    """
    Return the user's archive with the given blob hash as a `FileArchive`.

    Delegates the lookup to `get_archive_model_by_hash` and returns `None` when
    that lookup finds nothing.
    """
    model = await get_archive_model_by_hash(session=session, user=user, hash=hash)
    return None if model is None else archive_model_to_archive(model)
def archive_model_to_archive(archive_model: FileArchiveModel) -> FileArchive:
    """Convert a `FileArchiveModel` row into a `FileArchive` (id and blob hash only)."""
    archive_id = archive_model.id
    blob_hash = archive_model.blob_hash
    return FileArchive(id=archive_id, hash=blob_hash)
Requirements, + RunStatus, + get_policy_map, +) +from dstack._internal.core.models.users import GlobalRole +from dstack._internal.core.services import validate_dstack_resource_name +from dstack._internal.core.services.diff import ModelDiff, copy_model, diff_models +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite, sqlite_commit +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceModel, + JobModel, + MemberModel, + ProjectModel, + RunModel, + UserModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services import instances as instances_services +from dstack._internal.server.services import offers as offers_services +from dstack._internal.server.services.instances import ( + get_instance_remote_connection_info, + is_placeholder_instance, + list_active_remote_instances, + switch_instance_status, +) +from dstack._internal.server.services.locking import ( + get_locker, + string_to_lock_id, +) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.plugins import apply_plugin_policies +from dstack._internal.server.services.projects import ( + get_member, + get_member_permissions, + list_user_project_models, + project_model_to_project, +) +from dstack._internal.server.services.resources import set_resources_defaults +from dstack._internal.utils import random_names +from dstack._internal.utils.common import ( + EntityID, + EntityName, + EntityNameOrID, + get_current_datetime, +) +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.ssh import pkey_from_str + +logger = get_logger(__name__) + + +def switch_fleet_status( + session: AsyncSession, + fleet_model: FleetModel, + new_status: FleetStatus, + actor: events.AnyActor = events.SystemActor(), +): + """ + Switch fleet status. 
def get_fleet_status_change_message(
    old_status: FleetStatus,
    new_status: FleetStatus,
    status_message: Optional[str],
) -> str:
    """
    Build the event text for a fleet status transition.

    Both statuses are upper-cased. When `status_message` is not `None` it is
    appended in parentheses (an empty string still yields ` ()`).
    """
    parts = [f"Fleet status changed {old_status.upper()} -> {new_status.upper()}"]
    if status_message is not None:
        parts.append(f" ({status_message})")
    return "".join(parts)
async def list_projects_fleet_models(
    session: AsyncSession,
    projects: List[ProjectModel],
    only_active: bool,
    include_imported: bool,
    prev_created_at: Optional[datetime],
    prev_id: Optional[uuid.UUID],
    limit: int,
    ascending: bool,
) -> List[FleetModel]:
    """
    List fleet models of the given projects, one page at a time.

    A fleet matches when it belongs to one of `projects` or, if `include_imported`
    is set, when it is exported through an export that one of `projects` imports.
    With `only_active`, fleets marked as deleted are excluded.

    Paging is cursor-based on `(created_at, id)`: rows are ordered by `created_at`
    (ascending or descending per `ascending`) with `id` as the tie-breaker in the
    opposite direction, and `prev_created_at` / `prev_id` select rows that follow
    the cursor in that order. At most `limit` rows are returned, with the project
    name and non-deleted instances loaded.
    """
    project_ids = {p.id for p in projects}

    owned_by_projects = FleetModel.project_id.in_(project_ids)
    if include_imported:
        imported_by_projects = exists().where(
            ImportModel.project_id.in_(project_ids),
            ImportModel.export_id == ExportedFleetModel.export_id,
            ExportedFleetModel.fleet_id == FleetModel.id,
        )
    else:
        imported_by_projects = false()
    conditions = [or_(owned_by_projects, imported_by_projects)]

    if only_active:
        conditions.append(FleetModel.deleted == False)

    if prev_created_at is not None:
        if ascending:
            past_cursor = FleetModel.created_at > prev_created_at
        else:
            past_cursor = FleetModel.created_at < prev_created_at
        if prev_id is not None:
            # Same timestamp as the cursor: fall back to the id tie-breaker.
            if ascending:
                id_tiebreak = FleetModel.id < prev_id
            else:
                id_tiebreak = FleetModel.id > prev_id
            past_cursor = or_(
                past_cursor,
                and_(FleetModel.created_at == prev_created_at, id_tiebreak),
            )
        conditions.append(past_cursor)

    if ascending:
        ordering = (FleetModel.created_at.asc(), FleetModel.id.desc())
    else:
        ordering = (FleetModel.created_at.desc(), FleetModel.id)

    query = (
        select(FleetModel)
        .where(*conditions)
        .order_by(*ordering)
        .limit(limit)
        .options(
            joinedload(FleetModel.project).load_only(ProjectModel.name),
            selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)),
        )
    )
    result = await session.execute(query)
    return list(result.unique().scalars().all())
async def get_project_fleet_model_by_name(
    session: AsyncSession,
    project: ProjectModel,
    name: str,
    include_deleted: bool = False,
) -> Optional[FleetModel]:
    """
    Load the fleet called `name` that belongs to `project`.

    Args:
        session: DB session used to run the query.
        project: only fleets whose `project_id` equals `project.id` match.
        name: exact fleet name to look up.
        include_deleted: when false (the default) rows with `deleted` set are excluded.

    Returns:
        The fleet model with its non-deleted instances and the project name
        eagerly loaded, or `None` when no row matches.
    """
    conditions = [
        FleetModel.name == name,
        FleetModel.project_id == project.id,
    ]
    if not include_deleted:
        # `== False` is intentional: it builds a SQL expression, not a Python bool.
        conditions.append(FleetModel.deleted == False)
    stmt = (
        select(FleetModel)
        .where(*conditions)
        .options(
            joinedload(FleetModel.instances.and_(InstanceModel.deleted == False)),
            joinedload(FleetModel.project).load_only(ProjectModel.name),
        )
    )
    result = await session.execute(stmt)
    # `unique()` is applied before reading the row because collections are joined-loaded.
    return result.unique().scalar_one_or_none()
async def get_fleet_offers(
    project: ProjectModel,
    profile: Profile,
    requirements: Requirements,
    placement_group: Optional[PlacementGroup] = None,
    fleet_spec: Optional[FleetSpec] = None,
    fleet_model: Optional[FleetModel] = None,
    blocks: Union[int, Literal["auto"]] = 1,
    exclude_not_available: bool = False,
    master_job_provisioning_data: Optional[JobProvisioningData] = None,
    infer_master_job_provisioning_data_from_fleet_instances: bool = True,
    include_only_create_instance_supported_backends: bool = True,
) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
    """
    Return offers for fleet planning and provisioning.

    `multinode` is derived from the placement of `fleet_spec`; when `fleet_model`
    is given, the spec stored in the model takes precedence. With a `fleet_model`
    and no explicit `master_job_provisioning_data`, the provisioning data of the
    first fleet instance that has any is used as the master, unless
    `infer_master_job_provisioning_data_from_fleet_instances` is false.

    By default only offers from backends listed in
    `BACKENDS_WITH_CREATE_INSTANCE_SUPPORT` are returned. Pass
    `include_only_create_instance_supported_backends=False` to get every
    matching offer.
    """
    cluster = InstanceGroupPlacement.CLUSTER
    multinode = fleet_spec is not None and fleet_spec.configuration.placement == cluster
    master_jpd = master_job_provisioning_data
    if fleet_model is not None:
        multinode = get_fleet_spec(fleet_model).configuration.placement == cluster
        # The caller may override the current cluster master explicitly instead
        # of inferring placement restrictions from the loaded fleet instances.
        may_infer = infer_master_job_provisioning_data_from_fleet_instances
        if master_jpd is None and may_infer:
            candidates = map(
                instances_services.get_instance_provisioning_data, fleet_model.instances
            )
            # Lazily stops at the first instance that has provisioning data.
            master_jpd = next((jpd for jpd in candidates if jpd is not None), None)

    offers = await offers_services.get_offers_by_requirements(
        project=project,
        profile=profile,
        requirements=requirements,
        exclude_not_available=exclude_not_available,
        multinode=multinode,
        master_job_provisioning_data=master_jpd,
        placement_group=placement_group,
        blocks=blocks,
    )
    if not include_only_create_instance_supported_backends:
        return offers
    supported_offers = []
    for backend, offer in offers:
        if offer.backend in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
            supported_offers.append((backend, offer))
    return supported_offers
pipeline_hinter=pipeline_hinter, + ) + + fleet_model = await get_project_fleet_model_by_name( + session=session, + project=project, + name=configuration.name, + ) + if fleet_model is None: + return await _create_fleet( + session=session, + project=project, + user=user, + spec=spec, + pipeline_hinter=pipeline_hinter, + ) + + instances_ids = sorted(i.id for i in fleet_model.instances if not i.deleted) + await session.commit() + async with ( + get_locker(get_db().dialect_name).lock_ctx(FleetModel.__tablename__, [fleet_model.id]), + get_locker(get_db().dialect_name).lock_ctx(InstanceModel.__tablename__, instances_ids), + ): + # Refetch after lock + # TODO: Lock instances with FOR UPDATE? + # We do not respect InstanceModel.lock_* fields here because FleetPipeline does not update SSH instances. + # TODO: Respect InstanceModel.lock_* fields if FleetPipeline and apply update the same instances. + res = await session.execute( + select(FleetModel) + .where( + FleetModel.project_id == project.id, + FleetModel.id == fleet_model.id, + FleetModel.deleted == False, + ) + .options( + selectinload(FleetModel.instances) + .joinedload(InstanceModel.jobs) + .load_only(JobModel.id) + ) + # `is_fleet_in_use()` only needs active run presence/status. + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.id, RunModel.status) + ) + .execution_options(populate_existing=True) + .order_by(FleetModel.id) # take locks in order + .with_for_update(key_share=True) + ) + fleet_model = res.scalars().unique().one_or_none() + if fleet_model is not None: + if fleet_model.lock_expires_at is not None: + # TODO: Make the endpoint fully async so we don't need to lock and error: + # put the request in queue and process in the background. + raise ServerClientError( + "Failed to update fleet: fleet is being processed currently. Try again later." 
async def create_fleet_ssh_instance_model(
    project: ProjectModel,
    spec: FleetSpec,
    ssh_params: SSHParams,
    env: Env,
    blocks: Union[int, Literal["auto"]],
    instance_num: int,
    host: Union[SSHHostParams, str],
) -> InstanceModel:
    """
    Build the instance model for one host of an SSH fleet.

    Connection settings start from the fleet-wide `ssh_params`. When `host` is an
    `SSHHostParams` object, each truthy per-host value (user, key, port, proxy jump)
    replaces the fleet-wide one, `internal_ip` is taken from the host, and a
    non-`None` `host.blocks` replaces `blocks`. Ports default to 22.

    Raises:
        ServerClientError: if neither the host nor `ssh_params` provides both a
            user and a key.
    """
    # Fleet-wide defaults; a plain string host uses them as is.
    hostname = host
    ssh_user = ssh_params.user
    ssh_key = ssh_params.ssh_key
    port = ssh_params.port
    proxy_jump = ssh_params.proxy_jump
    internal_ip = None
    effective_blocks = blocks
    if not isinstance(host, str):
        hostname = host.hostname
        ssh_user = host.user or ssh_user
        ssh_key = host.ssh_key or ssh_key
        port = host.port or port
        proxy_jump = host.proxy_jump or proxy_jump
        internal_ip = host.internal_ip
        if host.blocks is not None:
            effective_blocks = host.blocks

    if ssh_user is None or ssh_key is None:
        # This should not be reachable but checked by fleet spec validation
        raise ServerClientError("ssh key or user not specified")

    ssh_proxy = None
    ssh_proxy_keys = None
    if proxy_jump is not None:
        assert proxy_jump.ssh_key is not None
        ssh_proxy = SSHConnectionParams(
            hostname=proxy_jump.hostname,
            port=proxy_jump.port or 22,
            username=proxy_jump.user,
        )
        ssh_proxy_keys = [proxy_jump.ssh_key]

    return await instances_services.create_ssh_instance_model(
        project=project,
        instance_name=f"{spec.configuration.name}-{instance_num}",
        instance_num=instance_num,
        region="remote",
        host=hostname,
        ssh_user=ssh_user,
        ssh_keys=[ssh_key],
        ssh_proxy=ssh_proxy,
        ssh_proxy_keys=ssh_proxy_keys,
        env=env,
        internal_ip=internal_ip,
        instance_network=ssh_params.network,
        port=port or 22,
        blocks=effective_blocks,
    )
not None: + stmt = stmt.where(InstanceModel.instance_num.in_(instance_nums)) + res = await session.execute(stmt) + instances_ids = list(res.scalars().unique().all()) + await sqlite_commit(session) + async with ( + get_locker(get_db().dialect_name).lock_ctx(FleetModel.__tablename__, fleets_ids), + get_locker(get_db().dialect_name).lock_ctx(InstanceModel.__tablename__, instances_ids), + ): + # Retry locking fleets to increase lock acquisition chances. + # This hack is needed until requests are queued. + fleet_models = [] + for i in range(10): + res = await session.execute( + select(FleetModel) + .where( + FleetModel.project_id == project.id, + FleetModel.id.in_(fleets_ids), + FleetModel.deleted == False, + FleetModel.lock_expires_at.is_(None), + ) + .options( + selectinload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) + .selectinload(InstanceModel.jobs) + .load_only(JobModel.id) + ) + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.status) + ) + .order_by(FleetModel.id) # take locks in order + .with_for_update(key_share=True, of=FleetModel) + .execution_options(populate_existing=True) + ) + fleet_models = res.scalars().unique().all() + if len(fleet_models) == len(fleets_ids): + break + await asyncio.sleep(0.5) + if len(fleet_models) != len(fleets_ids): + # TODO: Make the endpoint fully async so we don't need to lock and error. + msg = ( + "Failed to delete fleets: fleets are being processed currently. Try again later." + if instance_nums is None + else "Failed to delete fleet instances: fleets are being processed currently. Try again later." + ) + raise ServerClientError(msg) + # Retry locking instances to increase lock acquisition chances. + # This hack is needed until requests are queued. 
+ instances_left_to_lock = set(instances_ids) + for i in range(10): + res = await session.execute( + select(InstanceModel.id) + .where( + InstanceModel.id.in_(instances_left_to_lock), + InstanceModel.deleted == False, + InstanceModel.lock_expires_at.is_(None), + ) + .order_by(InstanceModel.id) # take locks in order + .with_for_update(key_share=True, of=InstanceModel) + .execution_options(populate_existing=True) + ) + instances_left_to_lock.difference_update(res.scalars().unique().all()) + if len(instances_left_to_lock) == 0: + break + await asyncio.sleep(0.5) + if len(instances_left_to_lock) > 0: + msg = ( + "Failed to delete fleets: fleet instances are being processed currently. Try again later." + if instance_nums is None + else "Failed to delete fleet instances: fleet instances are being processed currently. Try again later." + ) + raise ServerClientError(msg) + for fleet_model in fleet_models: + fleet_spec = get_fleet_spec(fleet_model) + if fleet_spec.configuration.ssh_config is not None: + _check_can_manage_ssh_fleets(user=user, project=project) + if instance_nums is None: + logger.info("Deleting fleets: %s", [f.name for f in fleet_models]) + else: + logger.info( + "Deleting fleets %s instances %s", [f.name for f in fleet_models], instance_nums + ) + hint_instance_pipeline = False + for fleet_model in fleet_models: + hint_instance_pipeline |= _terminate_fleet_instances( + session=session, + fleet_model=fleet_model, + instance_nums=instance_nums, + actor=user, + ) + # TERMINATING fleets are deleted by FleetPipeline after instances are terminated + if instance_nums is None: + switch_fleet_status( + session, + fleet_model, + FleetStatus.TERMINATING, + actor=events.UserActor.from_user(user), + ) + await session.commit() + if hint_instance_pipeline and pipeline_hinter is not None: + pipeline_hinter.hint_fetch(InstanceModel.__name__) + + +def fleet_model_to_fleet( + fleet_model: FleetModel, + include_deleted_instances: bool = False, + include_sensitive: bool = 
def is_fleet_in_use(fleet_model: FleetModel, instance_nums: Optional[List[int]] = None) -> bool:
    """
    Tell whether the fleet, or a subset of its instances, is currently in use.

    An instance counts as busy when it is not deleted and has at least one job.

    Args:
        fleet_model: fleet with `instances` (each with `jobs`) and `runs` loaded.
        instance_nums: when given, only busy instances with these numbers are
            considered and the fleet's runs are ignored.

    Returns:
        With `instance_nums`: `True` if any selected instance is busy.
        Without: `True` if any instance is busy or any run is not finished.
    """
    busy_instances = []
    for instance in fleet_model.instances:
        if instance.jobs and not instance.deleted:
            busy_instances.append(instance)
    # Evaluated for every run up front, regardless of which branch is taken below.
    unfinished_runs = sum(1 for run in fleet_model.runs if not run.status.is_finished())
    if instance_nums is not None:
        return any(instance.instance_num in instance_nums for instance in busy_instances)
    if busy_instances:
        return True
    return unfinished_runs > 0
def get_next_instance_num(taken_instance_nums: set[int]) -> int:
    """
    Pick an instance number that is not in `taken_instance_nums`.

    Returns 0 when the set is empty or its smallest member is greater than 0.
    Otherwise returns the first number above the smallest member that is not taken.
    """
    if not taken_instance_nums:
        return 0
    lowest_taken = min(taken_instance_nums)
    if lowest_taken > 0:
        return 0
    candidate = lowest_taken + 1
    while candidate in taken_instance_nums:
        candidate += 1
    return candidate
def check_can_create_new_cloud_instance_in_fleet(fleet_model: FleetModel, fleet_spec: FleetSpec):
    """
    Raising counterpart of `can_create_new_cloud_instance_in_fleet`.

    Raises:
        ValueError: if `can_create_new_cloud_instance_in_fleet` reports that the
            fleet cannot take another cloud instance.
    """
    fits = can_create_new_cloud_instance_in_fleet(fleet_model, fleet_spec)
    if fits:
        return
    raise ValueError("Cannot fit new cloud instance into fleet")
Status: {fleet_model.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_model)], + ) + if spec.configuration.ssh_config is not None: + for i, host in enumerate(spec.configuration.ssh_config.hosts): + instance_model = await create_fleet_ssh_instance_model( + project=project, + spec=spec, + ssh_params=spec.configuration.ssh_config, + env=spec.configuration.env, + blocks=spec.configuration.blocks, + instance_num=i, + host=host, + ) + events.emit( + session, + ( + "Instance created on fleet submission." + f" Status: {instance_model.status.upper()}" + ), + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_model)], + ) + fleet_model.instances.append(instance_model) + else: + for i in range(_get_fleet_nodes_to_provision(spec)): + instance_model = create_fleet_instance_model( + session=session, + project=project, + username=user.name, + spec=spec, + instance_num=i, + ) + events.emit( + session, + ( + "Instance created on fleet submission." + f" Status: {instance_model.status.upper()}" + ), + # Set `SystemActor` for consistency with other places where cloud instances can be + # created (fleet spec consolidation, job provisioning, etc). Think of the fleet as being + # created by the user, while the cloud instance is created by the system to satisfy the + # fleet spec. 
+ actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) + fleet_model.instances.append(instance_model) + await session.commit() + if spec.configuration.ssh_config is None: + pipeline_hinter.hint_fetch(FleetModel.__name__) + pipeline_hinter.hint_fetch(InstanceModel.__name__) + return fleet_model_to_fleet(fleet_model) + + +async def _update_fleet( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + spec: FleetSpec, + current_resource: Optional[Fleet], + force: bool, + fleet_model: FleetModel, +) -> Fleet: + fleet = fleet_model_to_fleet(fleet_model) + _set_fleet_spec_defaults(fleet.spec) + fleet_sensitive = fleet_model_to_fleet(fleet_model, include_sensitive=True) + _set_fleet_spec_defaults(fleet_sensitive.spec) + + if not force: + if current_resource is not None: + _set_fleet_spec_defaults(current_resource.spec) + if ( + current_resource is None + or current_resource.id != fleet.id + or current_resource.spec != fleet.spec + ): + raise ServerClientError( + "Failed to apply plan. Resource has been changed. Try again or use force apply." + ) + + _check_can_update_fleet_spec(fleet_sensitive.spec, spec) + + fleet_model.spec = spec.json() + # Reset consolidation attempt so the next pipeline pass picks up the spec change promptly. 
+ fleet_model.consolidation_attempt = 0 + + if ( + fleet_sensitive.spec.configuration.ssh_config is not None + and spec.configuration.ssh_config is not None + ): + added_hosts, removed_hosts, changed_hosts = _calculate_ssh_hosts_changes( + current=fleet_sensitive.spec.configuration.ssh_config.hosts, + new=spec.configuration.ssh_config.hosts, + ) + # `_check_can_update_fleet_spec` ensures hosts are not changed + assert not changed_hosts, changed_hosts + active_instance_nums: set[int] = set() + removed_instance_nums: list[int] = [] + if removed_hosts or added_hosts: + for instance_model in fleet_model.instances: + if instance_model.deleted: + continue + active_instance_nums.add(instance_model.instance_num) + rci = get_instance_remote_connection_info(instance_model) + if rci is None: + logger.error( + "Cloud instance %s in SSH fleet %s", + instance_model.id, + fleet_model.id, + ) + continue + if rci.host in removed_hosts: + removed_instance_nums.append(instance_model.instance_num) + if added_hosts: + await _check_ssh_hosts_not_yet_added(session, spec, fleet.id) + for host in added_hosts.values(): + instance_num = get_next_instance_num(active_instance_nums) + instance_model = await create_fleet_ssh_instance_model( + project=project, + spec=spec, + ssh_params=spec.configuration.ssh_config, + env=spec.configuration.env, + blocks=spec.configuration.blocks, + instance_num=instance_num, + host=host, + ) + events.emit( + session, + f"Instance created on fleet update. 
def _can_update_fleet_spec(current_fleet_spec: FleetSpec, new_fleet_spec: FleetSpec) -> bool:
    """
    Non-raising wrapper around `_check_can_update_fleet_spec`.

    Args:
        current_fleet_spec: spec of the existing fleet.
        new_fleet_spec: spec the caller wants to apply.

    Returns:
        `True` if `_check_can_update_fleet_spec` accepts the change, `False` if it
        raises `ServerClientError`. The reason is logged at debug level.
    """
    try:
        _check_can_update_fleet_spec(current_fleet_spec, new_fleet_spec)
    except ServerClientError as e:
        # This checks a fleet spec, so the log must name the fleet (it used to say "Run").
        logger.debug("Fleet cannot be updated: %s", repr(e))
        return False
    return True
+ if "configuration" in diff: + _check_can_update_fleet_configuration(current.configuration, new.configuration) + + +def _check_can_update_fleet_configuration(current: FleetConfiguration, new: FleetConfiguration): + diff = diff_models(current, new) + current_ssh_config = current.ssh_config + new_ssh_config = new.ssh_config + if current_ssh_config is None: + if new_ssh_config is not None: + raise ServerClientError("Fleet type changed from Cloud to SSH, cannot update") + # TODO: Support best-effort `nodes.target` apply semantics: + # create missing instances and terminate extra idle instances. + # Current in-place update only persists `target`; FleetPipeline reconciles `min`/`max`. + # + # For `reservation` and `tags`, update affects only future provisioning. + _check_can_update_inner( + current, + new, + ( + "nodes", + "reservation", + "tags", + "resources", + "backends", + "regions", + "availability_zones", + "instance_types", + "spot_policy", + "max_price", + ), + ) + return + + if new_ssh_config is None: + raise ServerClientError("Fleet type changed from SSH to Cloud, cannot update") + + _check_can_update_inner(current, new, ("ssh_config",)) + if "ssh_config" in diff: + _check_can_update_ssh_config(current_ssh_config, new_ssh_config) + + +@_check_can_update("hosts") +def _check_can_update_ssh_config(current: SSHParams, new: SSHParams, diff: ModelDiff): + if "hosts" in diff: + _, _, changed_hosts = _calculate_ssh_hosts_changes(current.hosts, new.hosts) + if changed_hosts: + raise ServerClientError( + f"Hosts configuration changed, cannot update: {list(changed_hosts)}" + ) + + +def _calculate_ssh_hosts_changes( + current: list[Union[SSHHostParams, str]], new: list[Union[SSHHostParams, str]] +) -> tuple[dict[str, Union[SSHHostParams, str]], set[str], set[str]]: + current_hosts = {h if isinstance(h, str) else h.hostname: h for h in current} + new_hosts = {h if isinstance(h, str) else h.hostname: h for h in new} + added_hosts = {h: new_hosts[h] for h in 
new_hosts.keys() - current_hosts} + removed_hosts = current_hosts.keys() - new_hosts + changed_hosts: set[str] = set() + for host in current_hosts.keys() & new_hosts: + current_host = current_hosts[host] + new_host = new_hosts[host] + if isinstance(current_host, str) or isinstance(new_host, str): + if current_host != new_host: + changed_hosts.add(host) + else: + current_host = copy_model(current_host, reset={"identity_file"}) + new_host = copy_model(new_host, reset={"identity_file"}) + # XXX: cannot use copy_model() or diff_models() with + # `reset={..., "proxy_jump": {"identity_file"}}` + # as SSHProxyParams.identity_file has no default value + if current_host.proxy_jump is not None: + current_host.proxy_jump.identity_file = "" + if new_host.proxy_jump is not None: + new_host.proxy_jump.identity_file = "" + if diff_models(current_host, new_host): + changed_hosts.add(host) + return added_hosts, removed_hosts, changed_hosts + + +def _check_can_manage_ssh_fleets(user: UserModel, project: ProjectModel): + if user.global_role == GlobalRole.ADMIN: + return + member = get_member(user=user, project=project) + if member is None: + raise ForbiddenError() + permissions = get_member_permissions(member) + if permissions.can_manage_ssh_fleets: + return + raise ForbiddenError() + + +async def _check_ssh_hosts_not_yet_added( + session: AsyncSession, spec: FleetSpec, current_fleet_id: Optional[uuid.UUID] = None +): + if spec.configuration.ssh_config and spec.configuration.ssh_config.hosts: + # there are manually listed hosts, need to check them for existence + active_instances = await list_active_remote_instances(session=session) + + existing_hosts = set() + for instance in active_instances: + # ignore instances belonging to the same fleet -- in-place update/recreate + if current_fleet_id is not None and instance.fleet_id == current_fleet_id: + continue + instance_conn_info = get_instance_remote_connection_info(instance) + assert instance_conn_info is not None + 
def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
    """
    Validate a fleet spec and then fill in its defaults.

    Steps, in order: validate the fleet name (when given), validate the fields specific
    to the fleet subtype, run the SSH-specific validations when `ssh_config` is present,
    and finally call `_set_fleet_spec_defaults()`.

    Any exception raised by the individual validators propagates to the caller.
    """
    conf = spec.configuration
    if conf.name is not None:
        validate_dstack_resource_name(conf.name)
    _validate_fleet_configuration_subtype_specific_fields(conf)

    ssh_config = conf.ssh_config
    if ssh_config is not None:
        _validate_all_ssh_params_specified(ssh_config)
        # The fleet-wide key is checked first, then every key set on an individual host.
        if ssh_config.ssh_key is not None:
            _validate_ssh_key(ssh_config.ssh_key)
        per_host_keys = (
            host.ssh_key for host in ssh_config.hosts if isinstance(host, SSHHostParams)
        )
        for host_key in per_host_keys:
            if host_key is not None:
                _validate_ssh_key(host_key)
        _validate_internal_ips(ssh_config)

    _set_fleet_spec_defaults(spec)
def _validate_all_ssh_params_specified(ssh_config: SSHParams):
    """
    Ensure every host has an SSH key and an SSH user available.

    A host given as a plain string can only rely on the fleet-wide `ssh_key` and `user`.
    A host given as an object may supply its own `ssh_key` / `user` instead.

    Raises:
        ServerClientError: for the first host that has no key or no user; the key is
            checked before the user.
    """
    fleet_key_missing = ssh_config.ssh_key is None
    fleet_user_missing = ssh_config.user is None
    for host in ssh_config.hosts:
        if isinstance(host, str):
            # Bare hostnames carry no per-host settings, so fleet-wide values are required.
            if fleet_key_missing:
                raise ServerClientError(f"No ssh key specified for host {host}")
            if fleet_user_missing:
                raise ServerClientError(f"No ssh user specified for host {host}")
            continue
        if fleet_key_missing and host.ssh_key is None:
            raise ServerClientError(f"No ssh key specified for host {host.hostname}")
        if fleet_user_missing and host.user is None:
            raise ServerClientError(f"No ssh user specified for host {host.hostname}")
def _validate_internal_ips(ssh_config: SSHParams):
    """
    Validate the use of `internal_ip` across the hosts of an SSH fleet.

    `internal_ip` must be set either on no host or on every host, and it cannot be
    combined with a fleet-level `network`. Hosts given as plain strings never count as
    having an `internal_ip`.

    Raises:
        ServerClientError: if only some hosts set `internal_ip`, or if `internal_ip`
            is used together with `network`.
    """
    hosts_with_internal_ip = sum(
        1
        for host in ssh_config.hosts
        if not isinstance(host, str) and host.internal_ip is not None
    )
    if hosts_with_internal_ip == 0:
        # Nothing to cross-check when no host uses internal_ip.
        return
    if hosts_with_internal_ip != len(ssh_config.hosts):
        raise ServerClientError("internal_ip must be specified for all hosts")
    if ssh_config.network is not None:
        raise ServerClientError("internal_ip is mutually exclusive with network")
Fleet is in use.") + for instance in fleet_model.instances: + if instance_nums is not None and instance.instance_num not in instance_nums: + continue + if is_placeholder_instance(instance): + raise ServerClientError("Failed to delete instance while the job is provisioning.") + if instance.status == InstanceStatus.TERMINATED: + instance.deleted = True + else: + instance.termination_reason = InstanceTerminationReason.TERMINATED_BY_USER + if instance.status != InstanceStatus.TERMINATING: + instance.skip_min_processing_interval = True + hint_instance_pipeline = True + switch_instance_status( + session, + instance, + InstanceStatus.TERMINATING, + actor=events.UserActor.from_user(actor), + ) + return hint_instance_pipeline diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index e98283c4ca..e81dbf7044 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -1,26 +1,28 @@ import asyncio +import datetime import uuid -from datetime import timezone +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager +from datetime import timedelta +from functools import partial from typing import List, Optional, Sequence -from urllib.parse import urlparse import httpx -import sqlalchemy.orm as sa_orm -from sqlalchemy import select, update +from sqlalchemy import exists, func, or_, select, update from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, selectinload -import dstack._internal.server.services.jobs as jobs_services import dstack._internal.utils.random_names as random_names -from dstack._internal.core.backends import ( - BACKENDS_WITH_GATEWAY_SUPPORT, - BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT, -) -from dstack._internal.core.backends.base import Backend from dstack._internal.core.backends.base.compute import ( Compute, + ComputeWithGatewaySupport, 
get_dstack_gateway_wheel, get_dstack_runner_version, ) +from dstack._internal.core.backends.features import ( + BACKENDS_WITH_GATEWAY_SUPPORT, + BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT, +) from dstack._internal.core.errors import ( GatewayError, ResourceNotExistsError, @@ -28,99 +30,175 @@ SSHError, ) from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import EntityReference from dstack._internal.core.models.gateways import ( + GATEWAY_REPLICAS_DEFAULT, + AnyGatewayRouterConfig, Gateway, GatewayComputeConfiguration, GatewayConfiguration, + GatewayReplica, + GatewaySpec, GatewayStatus, LetsEncryptGatewayCertificate, ) -from dstack._internal.core.models.runs import ( - Run, - RunSpec, - ServiceModelSpec, - ServiceSpec, -) from dstack._internal.core.services import validate_dstack_resource_name +from dstack._internal.proxy.gateway.const import SERVICE_SCALING_WINDOWS +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, Stat from dstack._internal.server import settings +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite from dstack._internal.server.models import ( + BackendModel, + ExportedGatewayModel, GatewayComputeModel, GatewayModel, - JobModel, + ImportModel, ProjectModel, - RunModel, + UserModel, ) +from dstack._internal.server.services import events from dstack._internal.server.services.backends import ( - get_project_backend_by_type_or_error, + check_backend_type_available, get_project_backend_with_model_by_type_or_error, ) from dstack._internal.server.services.gateways.connection import GatewayConnection -from dstack._internal.server.services.gateways.options import get_service_options from dstack._internal.server.services.gateways.pool import gateway_connections_pool -from dstack._internal.server.services.logging import fmt -from dstack._internal.server.utils.common import ( - gather_map_async, +from dstack._internal.server.services.locking import ( + 
def get_gateway_status_change_message(
    old_status: GatewayStatus, new_status: GatewayStatus, status_message: Optional[str]
) -> str:
    """
    Build the event text describing a gateway status transition.

    Args:
        old_status: Status before the change; rendered via `.upper()`.
        new_status: Status after the change; rendered via `.upper()`.
        status_message: Optional detail appended in parentheses.

    Returns:
        `"Gateway status changed OLD -> NEW"`, followed by `" (<status_message>)"`
        when a non-empty status message is provided.
    """
    msg = f"Gateway status changed {old_status.upper()} -> {new_status.upper()}"
    # An empty message is treated like a missing one so the text never ends with " ()".
    if status_message:
        msg += f" ({status_message})"
    return msg
async def get_gateway_by_name(
    session: AsyncSession, project: ProjectModel, name: str
) -> Optional[Gateway]:
    """
    Look up a gateway by name and convert it to the API model.

    The lookup uses a reference with `project=None` and requests the gateway compute
    and backend type to be loaded together with the gateway.

    Returns:
        The converted gateway, with the project's `default_gateway_id` passed to the
        conversion, or `None` if the lookup finds nothing.
    """
    gateway_ref = EntityReference(name=name, project=None)
    gateway_model = await get_project_gateway_model_by_reference(
        session=session,
        project=project,
        ref=gateway_ref,
        load_gateway_compute=True,
        load_backend_type=True,
    )
    if gateway_model is not None:
        return gateway_model_to_gateway(
            gateway_model, default_gateway_id=project.default_gateway_id
        )
    return None
Compute, configuration: GatewayConfiguration, + replica_num: int, + gateway_id: Optional[uuid.UUID] = None, backend_id: Optional[uuid.UUID] = None, ) -> GatewayComputeModel: + assert isinstance(backend_compute, ComputeWithGatewaySupport) + assert configuration.name is not None + private_bytes, public_bytes = generate_rsa_key_pair_bytes() gateway_ssh_private_key = private_bytes.decode() gateway_ssh_public_key = public_bytes.decode() compute_configuration = GatewayComputeConfiguration( project_name=project_name, - instance_name=configuration.name, + instance_name=f"{configuration.name}-{replica_num}", backend=configuration.backend, region=configuration.region, + instance_type=configuration.instance_type, public_ip=configuration.public_ip, ssh_key_pub=gateway_ssh_public_key, certificate=configuration.certificate, + tags=configuration.tags, + router=configuration.router, ) gpd = await run_async( @@ -129,7 +207,9 @@ async def create_gateway_compute( ) return GatewayComputeModel( + gateway_id=gateway_id, backend_id=backend_id, + replica_num=replica_num, region=gpd.region, ip_address=gpd.ip_address, instance_id=gpd.instance_id, @@ -143,35 +223,78 @@ async def create_gateway_compute( async def create_gateway( session: AsyncSession, + user: UserModel, project: ProjectModel, configuration: GatewayConfiguration, + pipeline_hinter: PipelineHinterProtocol, ) -> Gateway: + spec = await apply_plugin_policies( + user=user.name, + project=project.name, + # Create pseudo spec until the gateway API is updated to accept spec + spec=GatewaySpec(configuration=configuration), + ) + configuration = spec.configuration _validate_gateway_configuration(configuration) backend_model, _ = await get_project_backend_with_model_by_type_or_error( project=project, backend_type=configuration.backend ) - if configuration.name is None: - configuration.name = await generate_gateway_name(session=session, project=project) - - gateway = GatewayModel( - name=configuration.name, - region=configuration.region, 
- project_id=project.id, - backend_id=backend_model.id, - wildcard_domain=configuration.domain, - configuration=configuration.json(), - status=GatewayStatus.SUBMITTED, - last_processed_at=get_current_datetime(), - ) - session.add(gateway) - await session.commit() - - if project.default_gateway is None or configuration.default: - await set_default_gateway(session=session, project=project, name=configuration.name) - - return gateway_model_to_gateway(gateway) + lock_namespace = f"gateway_names_{project.name}" + if is_db_sqlite(): + # Start new transaction to see committed changes after lock + await session.commit() + elif is_db_postgres(): + await session.execute( + select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace))) + ) + lock, _ = get_locker(get_db().dialect_name).get_lockset(lock_namespace) + async with lock: + if configuration.name is None: + configuration.name = await generate_gateway_name(session=session, project=project) + + now = get_current_datetime() + gateway = GatewayModel( + id=uuid.uuid4(), + name=configuration.name, + region=configuration.region, + project_id=project.id, + backend_id=backend_model.id, + wildcard_domain=configuration.domain, + configuration=configuration.json(), + status=GatewayStatus.SUBMITTED, + created_at=now, + last_processed_at=now, + ) + session.add(gateway) + events.emit( + session, + f"Gateway created. 
Status: {gateway.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway)], + ) + await session.commit() + + default_gateway = await get_project_default_gateway_model(session=session, project=project) + if default_gateway is None or configuration.default: + await set_default_gateway( + session=session, + project=project, + ref=EntityReference(name=configuration.name, project=None), + user=user, + ) + default_gateway = gateway + pipeline_hinter.hint_fetch(GatewayModel.__name__) + gateway = await get_project_gateway_model_by_reference( + session=session, + project=project, + ref=EntityReference(name=configuration.name, project=None), + load_gateway_compute=True, + load_backend_type=True, + ) + assert gateway is not None + return gateway_model_to_gateway(gateway, default_gateway_id=default_gateway.id) async def connect_to_gateway_with_retry( @@ -188,7 +311,7 @@ async def connect_to_gateway_with_retry( for attempt in range(GATEWAY_CONNECT_ATTEMPTS): try: - connection = await gateway_connections_pool.add( + connection = await gateway_connections_pool.get_or_add( gateway_compute.ip_address, gateway_compute.ssh_private_key ) break @@ -202,92 +325,100 @@ async def connect_to_gateway_with_retry( return connection -async def delete_gateways(session: AsyncSession, project: ProjectModel, gateways_names: List[str]): - tasks = [] - gateways = [] - for gateway in await list_project_gateway_models(session=session, project=project): - if gateway.backend.type == BackendType.DSTACK: - continue - if gateway.name not in gateways_names: - continue - backend = await get_project_backend_by_type_or_error(project, gateway.backend.type) - tasks.append(_terminate_gateway(session=session, gateway=gateway, backend=backend)) - gateways.append(gateway) - logger.info("Deleting gateways: %s", [g.name for g in gateways]) - # terminate in parallel - # FIXME: not safe to share session between tasks – sqlalchemy can error - terminate_results = await 
asyncio.gather(*tasks, return_exceptions=True) - for gateway, error in zip(gateways, terminate_results): - if isinstance(error, Exception): - logger.exception( - "Error when deleting gateway compute for %s", - gateway.name, - exc_info=(type(error), error, error.__traceback__), - ) - continue # keep gateway - if gateway.gateway_compute is not None: - await gateway_connections_pool.remove(gateway.gateway_compute.ip_address) - gateway.gateway_compute.active = False - gateway.gateway_compute.deleted = True - session.add(gateway.gateway_compute) - await session.delete(gateway) - for gateway in gateways: - PROCESSING_GATEWAYS_IDS.remove(gateway.id) - await session.commit() - - -async def _terminate_gateway(session: AsyncSession, gateway: GatewayModel, backend: Backend): - await wait_to_lock(PROCESSING_GATEWAYS_LOCK, PROCESSING_GATEWAYS_IDS, gateway.id) - await session.refresh(gateway) - gateway_compute_configuration = get_gateway_compute_configuration(gateway) - if gateway.gateway_compute is not None and gateway_compute_configuration is not None: - logger.info("Deleting gateway compute for %s...", gateway.name) - await run_async( - backend.compute().terminate_gateway, - gateway.gateway_compute.instance_id, - gateway_compute_configuration, - gateway.gateway_compute.backend_data, +async def delete_gateways( + session: AsyncSession, + project: ProjectModel, + gateways_names: List[str], + user: UserModel, +): + res = await session.execute( + select(GatewayModel).where( + GatewayModel.project_id == project.id, + GatewayModel.name.in_(gateways_names), ) - logger.info("Deleted gateway compute for %s", gateway.name) + ) + gateway_models = res.scalars().all() + gateways_ids = sorted([g.id for g in gateway_models]) + await session.commit() + logger.info("Deleting gateways: %s", [g.name for g in gateway_models]) + async with get_locker(get_db().dialect_name).lock_ctx( + GatewayModel.__tablename__, gateways_ids + ): + # Retry locking gateways to increase lock acquisition chances. 
+ # This hack is needed until requests are queued. + gateway_models = [] + for i in range(10): + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id.in_(gateways_ids), + GatewayModel.project_id == project.id, + GatewayModel.lock_expires_at.is_(None), + ) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + .order_by(GatewayModel.id) # take locks in order + .with_for_update(key_share=True, of=GatewayModel) + .execution_options(populate_existing=True) + ) + gateway_models = res.scalars().all() + if len(gateway_models) == len(gateways_ids): + break + await asyncio.sleep(0.5) + if len(gateway_models) != len(gateways_ids): + # TODO: Make the endpoint fully async so we don't need to lock and error. + raise ServerClientError( + "Failed to delete gateways: gateways are being processed currently. Try again later." + ) + for gateway_model in gateway_models: + if not gateway_model.to_be_deleted: + gateway_model.to_be_deleted = True + events.emit( + session, + "Gateway marked for deletion", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway_model)], + ) + await session.commit() async def set_gateway_wildcard_domain( - session: AsyncSession, project: ProjectModel, name: str, wildcard_domain: Optional[str] + session: AsyncSession, + project: ProjectModel, + name: str, + wildcard_domain: Optional[str], + user: UserModel, ) -> Gateway: - gateway = await get_project_gateway_model_by_name( - session=session, - project=project, - name=name, - ) - if gateway is None: - raise ResourceNotExistsError() - if gateway.backend.type == BackendType.DSTACK: - raise ServerClientError("Custom domains for dstack Sky gateway are not supported") - await session.execute( - update(GatewayModel) - .where( - GatewayModel.project_id == project.id, - GatewayModel.name == name, - ) - .values( - wildcard_domain=wildcard_domain, - ) - ) - await session.commit() - gateway = await get_project_gateway_model_by_name( - 
session=session, - project=project, - name=name, - ) - if gateway is None: - raise ResourceNotExistsError() - return gateway_model_to_gateway(gateway) + async with get_project_gateway_model_by_name_for_update( + session=session, project=project, name=name + ) as gateway: + if gateway is None: + raise ResourceNotExistsError() + old_domain = gateway.wildcard_domain + if old_domain != wildcard_domain: + gateway.wildcard_domain = wildcard_domain + events.emit( + session, + f"Gateway wildcard domain changed {old_domain!r} -> {gateway.wildcard_domain!r}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway)], + ) + await session.commit() + return gateway_model_to_gateway(gateway, default_gateway_id=project.default_gateway_id) -async def set_default_gateway(session: AsyncSession, project: ProjectModel, name: str): - gateway = await get_project_gateway_model_by_name(session=session, project=project, name=name) +async def set_default_gateway( + session: AsyncSession, project: ProjectModel, ref: EntityReference, user: Optional[UserModel] +): + gateway = await get_project_gateway_model_by_reference( + session=session, project=project, ref=ref + ) if gateway is None: raise ResourceNotExistsError() + if gateway.to_be_deleted: + raise ServerClientError("Cannot set gateway marked for deletion as default") + previous_gateway = await get_project_default_gateway_model(session, project) + if previous_gateway is not None and previous_gateway.id == gateway.id: + return await session.execute( update(ProjectModel) .where( @@ -297,202 +428,233 @@ async def set_default_gateway(session: AsyncSession, project: ProjectModel, name default_gateway_id=gateway.id, ) ) + if previous_gateway is not None: + events.emit( + session, + "Gateway unset as project default", + actor=events.UserActor.from_user(user) if user is not None else events.SystemActor(), + targets=[ + events.Target.from_model(previous_gateway), + events.Target.from_model(project), + ], + ) + 
async def list_project_gateway_models(
    session: AsyncSession,
    project: ProjectModel,
    include_imported: bool = False,
    load_gateway_compute: bool = False,
    load_backend_type: bool = False,
) -> Sequence[GatewayModel]:
    """
    Select the gateway models visible to `project`.

    Args:
        session: Database session used to run the query.
        project: Project whose gateways are listed.
        include_imported: Also match gateways linked to the project through
            `ImportModel` / `ExportedGatewayModel` rows; in that case the owning
            project's `id` and `name` are loaded with each gateway.
        load_gateway_compute: Eagerly load `gateway_compute` and `gateway_computes`.
        load_backend_type: Eagerly load the backend, restricted to its `type` column.

    Returns:
        The matching gateway models, de-duplicated via `unique()`.
    """
    owned_by_project = GatewayModel.project_id == project.id
    loader_options = []

    if include_imported:
        imported_into_project = exists().where(
            ImportModel.project_id == project.id,
            ImportModel.export_id == ExportedGatewayModel.export_id,
            ExportedGatewayModel.gateway_id == GatewayModel.id,
        )
        visibility = or_(owned_by_project, imported_into_project)
        loader_options.append(
            joinedload(GatewayModel.project).load_only(ProjectModel.id, ProjectModel.name)
        )
    else:
        visibility = owned_by_project

    if load_gateway_compute:
        loader_options.append(joinedload(GatewayModel.gateway_compute))
        loader_options.append(selectinload(GatewayModel.gateway_computes))
    if load_backend_type:
        loader_options.append(joinedload(GatewayModel.backend).load_only(BackendModel.type))

    stmt = select(GatewayModel).where(visibility)
    if loader_options:
        stmt = stmt.options(*loader_options)
    res = await session.execute(stmt)
    return res.unique().scalars().all()
ref.project is None or ref.project == project.name: + stmt = stmt.where(GatewayModel.project_id == project.id) + else: + stmt = stmt.where( + exists().where( + ImportModel.project_id == project.id, + ImportModel.export_id == ExportedGatewayModel.export_id, + ExportedGatewayModel.gateway_id == GatewayModel.id, + GatewayModel.project_id == ProjectModel.id, + ProjectModel.name == ref.project, + ) ) - ) + if load_gateway_compute: + stmt = stmt.options(joinedload(GatewayModel.gateway_compute)) + stmt = stmt.options(selectinload(GatewayModel.gateway_computes)) + if load_backend_type: + stmt = stmt.options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + res = await session.execute(stmt) return res.scalar() -async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str: - gateways = await list_project_gateway_models(session=session, project=project) - names = {g.name for g in gateways} - while True: - name = random_names.generate_name() - if name not in names: - return name - +@asynccontextmanager +async def get_project_gateway_model_by_name_for_update( + session: AsyncSession, project: ProjectModel, name: str +) -> AsyncGenerator[Optional[GatewayModel], None]: + """ + Fetch the gateway from the database and lock it for update. -async def register_service(session: AsyncSession, run_model: RunModel): - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) + **NOTE**: commit changes to the database before exiting from this context manager, + so that in-memory locks are only released after commit. 
+ """ - # TODO(egor-s): allow to configure gateway name - gateway_name: Optional[str] = None - if gateway_name is None: - gateway = run_model.project.default_gateway - if gateway is None: - raise ResourceNotExistsError("Default gateway is not set") + filters = [ + GatewayModel.project_id == project.id, + GatewayModel.name == name, + ] + res = await session.execute(select(GatewayModel.id).where(*filters)) + gateway_id = res.scalar_one_or_none() + if gateway_id is None: + yield None else: - gateway = await get_project_gateway_model_by_name( - session=session, project=run_model.project, name=gateway_name - ) - if gateway is None: - raise ResourceNotExistsError("Gateway does not exist") - if gateway.gateway_compute is None: - raise ServerClientError("Gateway has no instance associated with it") - - if gateway.status != GatewayStatus.RUNNING: - raise ServerClientError("Gateway status is not running") - - gateway_configuration = None - if gateway.configuration is not None: - gateway_configuration = GatewayConfiguration.__response__.parse_raw(gateway.configuration) - - service_https = _get_service_https(run_spec, gateway_configuration) - service_protocol = "https" if service_https else "http" - - if ( - service_https - and gateway_configuration is not None - and gateway_configuration.certificate is None - ): - raise ServerClientError( - "Cannot run HTTPS service on gateway with no SSL cerfificates configured" - ) - - gateway_https = _get_gateway_https(run_spec, gateway_configuration) - gateway_protocol = "https" if gateway_https else "http" - - wildcard_domain = gateway.wildcard_domain.lstrip("*.") if gateway.wildcard_domain else None - if wildcard_domain is None: - raise ServerClientError("Domain is required for gateway") - service_spec = ServiceSpec(url=f"{service_protocol}://{run_model.run_name}.{wildcard_domain}") - if run_spec.configuration.model is not None: - service_spec.model = ServiceModelSpec( - name=run_spec.configuration.model.name, - 
base_url=f"{gateway_protocol}://gateway.{wildcard_domain}", - type=run_spec.configuration.model.type, - ) - service_spec.options = get_service_options(run_spec.configuration) - - run_model.gateway = gateway - run_model.service_spec = service_spec.json() - - conn = await gateway_connections_pool.get(gateway.gateway_compute.ip_address) - if conn is None: - raise ServerClientError("Gateway is not connected") - - try: - logger.debug("%s: registering service as %s", fmt(run_model), service_spec.url) - async with conn.client() as client: - await client.register_service( - project=run_model.project.name, - run_id=run_model.id, - domain=urlparse(service_spec.url).hostname, - service_https=service_https, - gateway_https=gateway_https, - auth=run_spec.configuration.auth, - options=service_spec.options, - ssh_private_key=run_model.project.ssh_private_key, + async with get_locker(get_db().dialect_name).lock_ctx( + GatewayModel.__tablename__, [gateway_id] + ): + # Refetch after lock + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id.in_([gateway_id]), *filters) + .options(joinedload(GatewayModel.gateway_compute)) + .options(selectinload(GatewayModel.gateway_computes)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + .with_for_update(key_share=True, of=GatewayModel) ) - logger.info("%s: service is registered as %s", fmt(run_model), service_spec.url) - except SSHError: - raise ServerClientError("Gateway tunnel is not working") - except httpx.RequestError as e: - logger.debug("Gateway request failed", exc_info=True) - raise GatewayError(f"Gateway is not working: {e!r}") + yield res.scalar_one_or_none() -async def register_replica( - session: AsyncSession, gateway_id: uuid.UUID, run: Run, job_model: JobModel -): - conn = await get_gateway_connection(session, gateway_id) - job_submission = jobs_services.job_model_to_job_submission(job_model) - try: - logger.debug("%s: registering replica for service %s", fmt(job_model), 
run.id.hex) - async with conn.client() as client: - await client.register_replica( - run=run, - job_submission=job_submission, - ) - logger.info("%s: replica is registered for service %s", fmt(job_model), run.id.hex) - except (httpx.RequestError, SSHError) as e: - logger.debug("Gateway request failed", exc_info=True) - raise GatewayError(repr(e)) +async def get_project_default_gateway_model( + session: AsyncSession, + project: ProjectModel, + load_gateway_compute: bool = False, + load_backend_type: bool = False, +) -> Optional[GatewayModel]: + stmt = select(GatewayModel).where( + GatewayModel.id == project.default_gateway_id, + GatewayModel.to_be_deleted == False, + or_( + GatewayModel.project_id == project.id, + exists().where( + ImportModel.project_id == project.id, + ImportModel.export_id == ExportedGatewayModel.export_id, + ExportedGatewayModel.gateway_id == GatewayModel.id, + ), + ), + ) + if load_gateway_compute: + stmt = stmt.options(joinedload(GatewayModel.gateway_compute)) + stmt = stmt.options(selectinload(GatewayModel.gateway_computes)) + if load_backend_type: + stmt = stmt.options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + res = await session.execute(stmt) + return res.scalar_one_or_none() -async def unregister_service(session: AsyncSession, run_model: RunModel): - conn = await get_gateway_connection(session, run_model.gateway_id) - project = await session.get(ProjectModel, run_model.project_id) - try: - logger.debug("%s: unregistering service", fmt(run_model)) - async with conn.client() as client: - await client.unregister_service( - project=project.name, - run_id=run_model.id, - ) - logger.debug("%s: service is unregistered", fmt(run_model)) - except GatewayError as e: - # ignore if service is not registered - logger.warning("%s: unregistering service: %s", fmt(run_model), e) - except (httpx.RequestError, SSHError) as e: - logger.debug("Gateway request failed", exc_info=True) - raise GatewayError(repr(e)) +async def 
generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str: + gateways = await list_project_gateway_models(session=session, project=project) + names = {g.name for g in gateways} + while True: + name = random_names.generate_name() + if name not in names: + return name -async def unregister_replica(session: AsyncSession, job_model: JobModel): +# TODO: Connect to gateway outside session +async def get_or_add_gateway_connections( + session: AsyncSession, gateway_id: uuid.UUID +) -> tuple[GatewayModel, List[GatewayConnection]]: res = await session.execute( - select(RunModel) - .where(RunModel.id == job_model.run_id) - .options(sa_orm.joinedload(RunModel.project)) + select(GatewayModel) + .where(GatewayModel.id == gateway_id) + .options(joinedload(GatewayModel.gateway_compute)) + .options(selectinload(GatewayModel.gateway_computes)) ) - run_model = res.scalar() - if run_model.gateway_id is None: - return - - conn = await get_gateway_connection(session, run_model.gateway_id) - try: - logger.debug( - "%s: unregistering replica from service %s", fmt(job_model), job_model.run_id.hex - ) - async with conn.client() as client: - await client.unregister_replica( - project=run_model.project.name, - run_id=run_model.id, - job_id=job_model.id, + gateway = res.scalar_one_or_none() + if gateway is None: + raise GatewayError("Gateway not found") + computes = get_gateway_compute_models(gateway) + if not computes: + raise GatewayError("Gateway compute not found") + connections: List[GatewayConnection] = [] + for compute in computes: + try: + conn = await gateway_connections_pool.get_or_add( + hostname=compute.ip_address, + id_rsa=compute.ssh_private_key, ) - logger.info( - "%s: replica is unregistered from service %s", fmt(job_model), job_model.run_id.hex - ) - except GatewayError as e: - # ignore if replica is not registered - logger.warning("%s: unregistering replica from service: %s", fmt(job_model), e) - except (httpx.RequestError, SSHError) as e: - 
logger.debug("Gateway request failed", exc_info=True) - raise GatewayError(repr(e)) + connections.append(conn) + except Exception as e: + logger.warning("Failed to connect to gateway %s: %s", compute.ip_address, e) + raise GatewayError("Failed to connect to gateway") + return gateway, connections -async def get_gateway_connection( - session: AsyncSession, gateway_id: uuid.UUID -) -> GatewayConnection: - gateway = await session.get(GatewayModel, gateway_id) - if gateway is None: - raise GatewayError("Gateway doesn't exist") - if gateway.gateway_compute is None: - raise GatewayError("Gateway is broken, no compute") - conn = await gateway_connections_pool.get(gateway.gateway_compute.ip_address) - if conn is None: - raise GatewayError("Gateway is not connected") - return conn +async def get_combined_gateway_stats( + session: AsyncSession, + gateway_id: uuid.UUID, + project_name: str, + run_name: str, +) -> Optional[PerWindowStats]: + """ + Return stats for *run_name* aggregated across all replicas of *gateway_id*. + """ + try: + _, connections = await get_or_add_gateway_connections(session, gateway_id) + except GatewayError: + return None + per_replica: list[PerWindowStats] = [] + for conn in connections: + stats = await conn.get_stats(project_name, run_name) + if stats is None: # Stats not fetched yet + # TODO: find a way to make service scaling decisions even if some gateway replicas are + # unavailable for fetching stats. 
+ return None + per_replica.append(stats) + return _merge_per_window_stats(per_replica) if per_replica else None + + +def _merge_per_window_stats(stats_per_gateway_replica: list[PerWindowStats]) -> PerWindowStats: + merged: PerWindowStats = {} + for window in SERVICE_SCALING_WINDOWS: + total_requests = 0 + total_time_of_all_requests = 0.0 + for gateway_replica_stats in stats_per_gateway_replica: + stat = gateway_replica_stats[window] + total_requests += stat.requests + total_time_of_all_requests += stat.requests * stat.request_time + merged[window] = Stat( + requests=total_requests, + request_time=(total_time_of_all_requests / total_requests if total_requests else 0.0), + ) + return merged async def init_gateways(session: AsyncSession): res = await session.execute( select(GatewayComputeModel).where( + # FIXME: should not include computes related to gateways in the `provisioning` status. + # Causes warnings and delays when restarting the server during gateway provisioning. GatewayComputeModel.active == True, GatewayComputeModel.deleted == False, ) @@ -501,46 +663,99 @@ async def init_gateways(session: AsyncSession): if len(gateway_computes) > 0: logger.info(f"Connecting to {len(gateway_computes)} gateways...", {"show_path": False}) - for gateway, error in await gather_map_async( - gateway_computes, - lambda g: gateway_connections_pool.add(g.ip_address, g.ssh_private_key), - return_exceptions=True, - ): - if isinstance(error, Exception): - logger.warning("Failed to connect to gateway %s: %s", gateway.ip_address, error) - - if settings.SKIP_GATEWAY_UPDATE: - logger.debug("Skipping gateway update due to DSTACK_SKIP_GATEWAY_UPDATE env variable") - else: - build = get_dstack_runner_version() - for conn, error in await gather_map_async( + async with advisory_lock_ctx( + bind=session, + dialect_name=get_db().dialect_name, + resource="gateway_tunnels", + ): + for gateway, error in await gather_map_async( + gateway_computes, + lambda g: 
gateway_connections_pool.get_or_add(g.ip_address, g.ssh_private_key, True), + return_exceptions=True, + ): + if isinstance(error, Exception): + logger.warning("Failed to connect to gateway %s: %s", gateway.ip_address, error) + + if settings.SKIP_GATEWAY_UPDATE: + logger.debug("Skipping gateways update due to DSTACK_SKIP_GATEWAY_UPDATE env variable") + else: + build = get_dstack_runner_version() or "latest" + + for gateway_compute, res in await gather_map_async( + gateway_computes, + lambda c: _update_gateway(c, build), + return_exceptions=True, + ): + if isinstance(res, Exception): + logger.warning( + "Failed to update gateway %s: %s", gateway_compute.ip_address, res + ) + elif isinstance(res, bool) and res: + gateway_compute.app_updated_at = get_current_datetime() + + for gateway_compute, error in await gather_map_async( await gateway_connections_pool.all(), - lambda c: _update_gateway(c, build), + # Need several attempts to handle short gateway downtime after update + partial(configure_gateway, attempts=7), return_exceptions=True, ): if isinstance(error, Exception): - logger.warning("Failed to update gateway %s: %s", conn.ip_address, error) - - for conn, error in await gather_map_async( - await gateway_connections_pool.all(), - configure_gateway, - return_exceptions=True, - ): - if isinstance(error, Exception): - logger.warning("Failed to configure gateway %s: %r", conn.ip_address, error) + logger.warning( + "Failed to configure gateway %s: %r", gateway_compute.ip_address, error + ) -async def _update_gateway(connection: GatewayConnection, build: str): - logger.debug("Updating gateway %s", connection.ip_address) - stdout = await connection.tunnel.exec( - f"/bin/sh dstack/update.sh {get_dstack_gateway_wheel(build)} {build}" +async def _update_gateway(gateway_compute_model: GatewayComputeModel, build: str) -> bool: + if _recently_updated(gateway_compute_model): + logger.debug( + "Skipping gateway %s update. 
Gateway was recently updated.", + gateway_compute_model.ip_address, + ) + return False + connection = await gateway_connections_pool.get_or_add( + gateway_compute_model.ip_address, + gateway_compute_model.ssh_private_key, ) + logger.debug("Updating gateway %s", connection.ip_address) + router = _get_gateway_compute_router_config(gateway_compute_model) + + # Build package spec with extras and wheel URL + gateway_package = get_dstack_gateway_wheel(build, router) + commands = [ + # prevent update.sh from overwriting itself during execution + "cp dstack/update.sh dstack/_update.sh", + f'sh dstack/_update.sh "{gateway_package}" {build}', + "rm dstack/_update.sh", + ] + stdout = await connection.tunnel.aexec("/bin/sh -c '" + " && ".join(commands) + "'") if "Update successfully completed" in stdout: logger.info("Gateway %s updated", connection.ip_address) + return True + return False + + +def _recently_updated(gateway_compute_model: GatewayComputeModel) -> bool: + return gateway_compute_model.app_updated_at.replace( + tzinfo=datetime.timezone.utc + ) > get_current_datetime() - timedelta(seconds=60) + + +def _get_gateway_compute_router_config( + compute: GatewayComputeModel, +) -> Optional[AnyGatewayRouterConfig]: + if compute.configuration is None: # pre-0.18.2 gateway + return None # gateway routers introduced in 0.19.38 + compute_config: GatewayComputeConfiguration = ( + GatewayComputeConfiguration.__response__.parse_raw(compute.configuration) + ) + return compute_config.router -async def configure_gateway(connection: GatewayConnection) -> None: +async def configure_gateway( + connection: GatewayConnection, + attempts: int = GATEWAY_CONFIGURE_ATTEMPTS, +) -> None: """ Try submitting gateway config several times in case gateway's HTTP server is not running yet @@ -548,7 +763,7 @@ async def configure_gateway(connection: GatewayConnection) -> None: logger.debug("Configuring gateway %s", connection.ip_address) - for attempt in range(GATEWAY_CONFIGURE_ATTEMPTS - 1): + for 
attempt in range(attempts - 1): try: async with connection.client() as client: await client.submit_gateway_config() @@ -557,7 +772,7 @@ async def configure_gateway(connection: GatewayConnection) -> None: logger.debug( "Failed attempt %s/%s at configuring gateway %s: %r", attempt + 1, - GATEWAY_CONFIGURE_ATTEMPTS, + attempts, connection.ip_address, e, ) @@ -569,6 +784,14 @@ async def configure_gateway(connection: GatewayConnection) -> None: logger.info("Gateway %s configured", connection.ip_address) +def get_gateway_compute_models(gateway_model: GatewayModel) -> List[GatewayComputeModel]: + if gateway_model.gateway_computes: # 0.20.25+ gateway + return list(gateway_model.gateway_computes) + if gateway_model.gateway_compute is not None: # pre-0.20.25 gateway + return [gateway_model.gateway_compute] + return [] + + def get_gateway_configuration(gateway_model: GatewayModel) -> GatewayConfiguration: if gateway_model.configuration is not None: return GatewayConfiguration.__response__.parse_raw(gateway_model.configuration) @@ -583,74 +806,106 @@ def get_gateway_configuration(gateway_model: GatewayModel) -> GatewayConfigurati def get_gateway_compute_configuration( + gateway_compute: GatewayComputeModel, gateway_model: GatewayModel, -) -> Optional[GatewayComputeConfiguration]: - if gateway_model.gateway_compute is None: - return None - if gateway_model.gateway_compute.configuration is not None: - return GatewayComputeConfiguration.__response__.parse_raw( - gateway_model.gateway_compute.configuration - ) +) -> GatewayComputeConfiguration: + if gateway_compute.configuration is not None: + return GatewayComputeConfiguration.__response__.parse_raw(gateway_compute.configuration) # Handle gateways created before GatewayComputeConfiguration was introduced return GatewayComputeConfiguration( project_name=gateway_model.project.name, - instance_name=gateway_model.gateway_compute.instance_id, + instance_name=gateway_compute.instance_id, backend=gateway_model.backend.type, - 
region=gateway_model.gateway_compute.region, + region=gateway_compute.region, public_ip=True, - ssh_key_pub=gateway_model.gateway_compute.ssh_public_key, + ssh_key_pub=gateway_compute.ssh_public_key, certificate=LetsEncryptGatewayCertificate(), ) -def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway: - ip_address = "" - instance_id = "" - hostname = "" - if gateway_model.gateway_compute is not None: - ip_address = gateway_model.gateway_compute.ip_address - instance_id = gateway_model.gateway_compute.instance_id - hostname = gateway_model.gateway_compute.hostname - if hostname is None: - hostname = ip_address - backend_type = gateway_model.backend.type - if gateway_model.backend.type == BackendType.DSTACK: - backend_type = BackendType.AWS +def gateway_model_to_gateway( + gateway_model: GatewayModel, default_gateway_id: Optional[uuid.UUID] +) -> Gateway: + """ + Args: + gateway_model: Gateway model to convert + default_gateway_id: ID of the default gateway in the project where `gateway_model` is being + viewed. Can be different from `gateway_model.project` if the gateway is imported. 
+ """ + is_default = default_gateway_id == gateway_model.id configuration = get_gateway_configuration(gateway_model) - configuration.default = gateway_model.project.default_gateway_id == gateway_model.id + configuration.default = is_default + + compute_models = sorted(get_gateway_compute_models(gateway_model), key=lambda c: c.replica_num) + gateway_hostname = None + replicas = [] + for compute in compute_models: + compute_configuration = get_gateway_compute_configuration(compute, gateway_model) + replicas.append( + GatewayReplica( + hostname=compute.ip_address, + replica_num=compute.replica_num, + backend=compute_configuration.backend, + region=compute_configuration.region, + created_at=compute.created_at, + ) + ) + gateway_hostname = compute.hostname + return Gateway( + id=gateway_model.id, name=gateway_model.name, - ip_address=ip_address, - instance_id=instance_id, - hostname=hostname, - backend=backend_type, + project_name=gateway_model.project.name, + hostname=gateway_hostname, + backend=gateway_model.backend.type, region=gateway_model.region, wildcard_domain=gateway_model.wildcard_domain, - default=gateway_model.project.default_gateway_id == gateway_model.id, - created_at=gateway_model.created_at.replace(tzinfo=timezone.utc), + default=is_default, + created_at=gateway_model.created_at, status=gateway_model.status, status_message=gateway_model.status_message, configuration=configuration, + replicas=replicas, ) def _validate_gateway_configuration(configuration: GatewayConfiguration): + check_backend_type_available(configuration.backend) if configuration.backend not in BACKENDS_WITH_GATEWAY_SUPPORT: raise ServerClientError( - f"Gateways are not supported for {configuration.backend.value} backend. " - f"Supported backends: {[b.value for b in BACKENDS_WITH_GATEWAY_SUPPORT]}." + f"Gateways are not supported for {configuration.backend.value} backend." + " Available backends with gateway support:" + f" {[b.value for b in BACKENDS_WITH_GATEWAY_SUPPORT]}." 
) if configuration.name is not None: validate_dstack_resource_name(configuration.name) + if configuration.domain is not None: + # validate that domain can be interpolated + interpolate_gateway_domain( + domain=configuration.domain, + run_project_name="example", + exception_type=ServerClientError, + ) + if ( not configuration.public_ip and configuration.backend not in BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT ): raise ServerClientError( f"Private gateways are not supported for {configuration.backend.value} backend. " - f"Supported backends: {[b.value for b in BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT]}." + " Available backends with private gateway support:" + f" {[b.value for b in BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT]}." + ) + + replicas = ( + configuration.replicas if configuration.replicas is not None else GATEWAY_REPLICAS_DEFAULT + ) + + if replicas > GATEWAY_MAX_REPLICAS: + raise ServerClientError( + f"Cannot provision {replicas} gateway replicas. This server allows at most {GATEWAY_MAX_REPLICAS}" ) if configuration.certificate is not None: @@ -660,23 +915,13 @@ def _validate_gateway_configuration(configuration: GatewayConfiguration): ) if configuration.certificate.type == "acm" and configuration.backend != BackendType.AWS: raise ServerClientError("acm certificate type is supported for aws backend only") + if replicas > 1: + raise ServerClientError( + "Replicated gateways do not support certificates." 
+ " Set either `certificate: null` or `replicas: 1` in the gateway configuration" + ) - -def _get_service_https(run_spec: RunSpec, configuration: Optional[GatewayConfiguration]) -> bool: - if not run_spec.configuration.https: - return False - if configuration is None: - return True - if configuration.certificate is not None and configuration.certificate.type == "acm": - return False - return True - - -def _get_gateway_https(run_spec: RunSpec, configuration: Optional[GatewayConfiguration]) -> bool: - if configuration is None: - return True - if configuration.certificate is not None and configuration.certificate.type == "acm": - return False - if configuration.certificate is not None and configuration.certificate.type == "lets-encrypt": - return True - return False + if configuration.router is not None and replicas > 1: + raise ServerClientError( + "The deprecated `router` property is not supported for multi-replica gateways" + ) diff --git a/src/dstack/_internal/server/services/gateways/autoscalers.py b/src/dstack/_internal/server/services/gateways/autoscalers.py deleted file mode 100644 index eb2f4a1229..0000000000 --- a/src/dstack/_internal/server/services/gateways/autoscalers.py +++ /dev/null @@ -1,97 +0,0 @@ -import datetime -import math -from abc import ABC, abstractmethod -from typing import Dict, List, Optional - -from pydantic import BaseModel - -import dstack._internal.utils.common as common_utils -from dstack._internal.core.models.configurations import ServiceConfiguration -from dstack._internal.server.services.gateways.client import Stat - - -class ReplicaInfo(BaseModel): - """ - Attributes: - active (bool): starting/running/retrying or downscaled - timestamp (datetime.datetime): `submitted_at` for active, `last_processed_at` for inactive - """ - - active: bool - timestamp: datetime.datetime - - -class BaseAutoscaler(ABC): - @abstractmethod - def scale(self, replicas: List[ReplicaInfo], stats: Dict[int, Stat]) -> int: - """ - Args: - replicas: list of all 
replicas - stats: service stats from the gateway - - Returns: - diff: number of replicas to add or remove - """ - pass - - -class RPSAutoscaler(BaseAutoscaler): - def __init__( - self, - min_replicas: int, - max_replicas: int, - target: float, - scale_up_delay: int, - scale_down_delay: int, - ): - self.min_replicas = min_replicas - self.max_replicas = max_replicas - self.target = target - self.scale_up_delay = scale_up_delay - self.scale_down_delay = scale_down_delay - - def scale(self, replicas: List[ReplicaInfo], stats: Dict[int, Stat]) -> int: - now = common_utils.get_current_datetime() - active_replicas = [r for r in replicas if r.active] - last_scaled_at = max((r.timestamp for r in replicas), default=None) - - # calculate the average RPS over the last minute - rps = stats[60].requests / 60 - target_replicas = math.ceil(rps / self.target) - # clip the target replicas to the min and max values - target_replicas = min(max(target_replicas, self.min_replicas), self.max_replicas) - - if target_replicas > len(active_replicas): - if len(active_replicas) == 0: - # no replicas, scale up immediately - return target_replicas - if ( - last_scaled_at is not None - and (now - last_scaled_at).total_seconds() < self.scale_up_delay - ): - # too early to scale up, wait for the delay - return 0 - return target_replicas - len(active_replicas) - elif target_replicas < len(active_replicas): - if ( - last_scaled_at is not None - and (now - last_scaled_at).total_seconds() < self.scale_down_delay - ): - # too early to scale down, wait for the delay - return 0 - return target_replicas - len(active_replicas) - return 0 - - -def get_service_autoscaler(conf: ServiceConfiguration) -> Optional[BaseAutoscaler]: - if conf.scaling is None: - return None - if conf.scaling.metric == "rps": - return RPSAutoscaler( - # replicas count validated by configuration model - min_replicas=conf.replicas.min, - max_replicas=conf.replicas.max, - target=conf.scaling.target, - 
scale_up_delay=conf.scaling.scale_up_delay, - scale_down_delay=conf.scaling.scale_down_delay, - ) diff --git a/src/dstack/_internal/server/services/gateways/client.py b/src/dstack/_internal/server/services/gateways/client.py index b73b879dc5..4abd98811c 100644 --- a/src/dstack/_internal/server/services/gateways/client.py +++ b/src/dstack/_internal/server/services/gateways/client.py @@ -1,24 +1,19 @@ import asyncio import uuid -from typing import Dict, Optional +from typing import Optional import httpx -from pydantic import BaseModel, parse_obj_as +from pydantic import parse_obj_as +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.errors import GatewayError -from dstack._internal.core.models.runs import JobSubmission, Run +from dstack._internal.core.models.configurations import RateLimit +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.routers import AnyServiceRouterConfig +from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port +from dstack._internal.proxy.gateway.schemas.stats import ServiceStats from dstack._internal.server import settings -GATEWAY_MANAGEMENT_PORT = 8000 - - -class Stat(BaseModel): - requests: int - request_time: float - - -StatsCollectResponse = Dict[str, Dict[int, Stat]] - class GatewayClient: def __init__(self, uds: Optional[str] = None, port: Optional[int] = None): @@ -42,25 +37,33 @@ def __init__(self, uds: Optional[str] = None, port: Optional[int] = None): async def register_service( self, project: str, - run_id: uuid.UUID, + run_name: str, domain: str, service_https: bool, gateway_https: bool, auth: bool, + client_max_body_size: int, options: dict, + rate_limits: list[RateLimit], ssh_private_key: str, + has_router_replica: bool = False, + router: Optional[AnyServiceRouterConfig] = None, ): if "openai" in options: entrypoint = f"gateway.{domain.split('.', maxsplit=1)[1]}" await 
self.register_openai_entrypoint(project, entrypoint, gateway_https) payload = { - "run_id": run_id.hex, + "run_name": run_name, "domain": domain, "https": service_https, "auth": auth, + "client_max_body_size": client_max_body_size, "options": options, + "rate_limits": [limit.dict() for limit in rate_limits], "ssh_private_key": ssh_private_key, + "has_router_replica": has_router_replica, + "router": router.dict() if router is not None else None, } resp = await self._client.post( self._url(f"/api/registry/{project}/services/register"), json=payload @@ -70,47 +73,65 @@ async def register_service( resp.raise_for_status() self.is_server_ready = True - async def unregister_service(self, project: str, run_id: uuid.UUID): + async def unregister_service(self, project: str, run_name: str): resp = await self._client.post( - self._url(f"/api/registry/{project}/services/{run_id.hex}/unregister") + self._url(f"/api/registry/{project}/services/{run_name}/unregister") ) if resp.status_code == 400: raise gateway_error(resp.json()) resp.raise_for_status() self.is_server_ready = True - async def register_replica(self, run: Run, job_submission: JobSubmission): + async def register_replica( + self, + run: Run, + job_spec: JobSpec, + job_submission: JobSubmission, + instance_project_ssh_private_key: Optional[str], + ssh_head_proxy: Optional[SSHConnectionParams], + ssh_head_proxy_private_key: Optional[str], + ): + assert run.run_spec.configuration.type == "service" payload = { "job_id": job_submission.id.hex, - "app_port": run.run_spec.configuration.port.container_port, + "app_port": get_service_port(job_spec, run.run_spec.configuration), + "ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None, + "ssh_head_proxy_private_key": ssh_head_proxy_private_key, } jpd = job_submission.job_provisioning_data + assert jpd is not None + assert jpd.hostname is not None + assert jpd.ssh_port is not None + payload["internal_ip"] = jpd.internal_ip if not jpd.dockerized: 
payload.update( { "ssh_port": jpd.ssh_port, "ssh_host": f"{jpd.username}@{jpd.hostname}", + "ssh_proxy": jpd.ssh_proxy.dict() if jpd.ssh_proxy is not None else None, } ) - if jpd.ssh_proxy is not None: - payload.update( - { - "ssh_jump_port": jpd.ssh_proxy.port, - "ssh_jump_host": f"{jpd.ssh_proxy.username}@{jpd.ssh_proxy.hostname}", - } - ) else: + ssh_port = DSTACK_RUNNER_SSH_PORT + jrd = job_submission.job_runtime_data + if jrd is not None and jrd.ports is not None: + ssh_port = jrd.ports.get(ssh_port, ssh_port) payload.update( { - "ssh_port": 10022, + "ssh_port": ssh_port, "ssh_host": "root@localhost", - "ssh_jump_port": jpd.ssh_port, - "ssh_jump_host": f"{jpd.username}@{jpd.hostname}", + "ssh_proxy": SSHConnectionParams( + hostname=jpd.hostname, + username=jpd.username, + port=jpd.ssh_port, + ).dict(), + "ssh_proxy_private_key": instance_project_ssh_private_key, } ) - resp = await self._client.post( - self._url(f"/api/registry/{run.project_name}/services/{run.id.hex}/replicas/register"), + self._url( + f"/api/registry/{run.project_name}/services/{run.run_spec.run_name}/replicas/register" + ), json=payload, ) if resp.status_code == 400: @@ -118,10 +139,10 @@ async def register_replica(self, run: Run, job_submission: JobSubmission): resp.raise_for_status() self.is_server_ready = True - async def unregister_replica(self, project: str, run_id: uuid.UUID, job_id: uuid.UUID): + async def unregister_replica(self, project: str, run_name: str, job_id: uuid.UUID): resp = await self._client.post( self._url( - f"/api/registry/{project}/services/{run_id.hex}/replicas/{job_id.hex}/unregister" + f"/api/registry/{project}/services/{run_name}/replicas/{job_id.hex}/unregister" ) ) if resp.status_code == 400: @@ -133,7 +154,6 @@ async def register_openai_entrypoint(self, project: str, domain: str, https: boo resp = await self._client.post( self._url(f"/api/registry/{project}/entrypoints/register"), json={ - "module": "openai", "domain": domain, "https": https, }, @@ -165,20 
+185,25 @@ async def info(self) -> dict: self.is_server_ready = True return resp.json() - async def collect_stats(self) -> StatsCollectResponse: + async def collect_stats(self) -> list[ServiceStats]: resp = await self._client.get(self._url("/api/stats/collect")) if resp.status_code == 400: raise gateway_error(resp.json()) resp.raise_for_status() self.is_server_ready = True - return parse_obj_as(StatsCollectResponse, resp.json()) + resp_data = resp.json() + if isinstance(resp_data, dict): + # Avoid errors if gateway is updated to new format and current server replica isn't. + # TODO: remove after a few releases + return [] + return parse_obj_as(list[ServiceStats], resp_data) def _url(self, path: str) -> str: return f"{self.base_url}/{path.lstrip('/')}" def gateway_error(data: dict) -> GatewayError: - return GatewayError(msg=f"{data['error']}: {data['message']}") + return GatewayError(msg=data["detail"]) class AsyncClientWrapper(httpx.AsyncClient): diff --git a/src/dstack/_internal/server/services/gateways/connection.py b/src/dstack/_internal/server/services/gateways/connection.py index 1355ea929f..dada5bea64 100644 --- a/src/dstack/_internal/server/services/gateways/connection.py +++ b/src/dstack/_internal/server/services/gateways/connection.py @@ -1,22 +1,29 @@ import contextlib -import os +import shutil import uuid -from typing import AsyncIterator, Dict, Optional +from typing import AsyncIterator, Optional import aiorwlock -from dstack._internal.server.services.gateways.client import ( - GATEWAY_MANAGEMENT_PORT, - GatewayClient, - Stat, +from dstack._internal.core.services.ssh.tunnel import ( + SSH_DEFAULT_OPTIONS, + IPSocket, + SocketPair, + SSHTunnel, + UnixSocket, ) -from dstack._internal.server.services.ssh import AsyncSSHTunnel +from dstack._internal.proxy.gateway.const import ( + PROXY_PORT_ON_GATEWAY, + SERVER_CONNECTIONS_DIR_ON_GATEWAY, +) +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats +from 
dstack._internal.server.services.gateways.client import GatewayClient +from dstack._internal.server.settings import SERVER_DIR_PATH from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import FileContent, make_tmp_symlink_to_dir logger = get_logger(__name__) - - -SERVER_PORT_ON_GATEWAY = 8001 +CONNECTIONS_DIR = SERVER_DIR_PATH / "gateway-connections" class GatewayConnection: @@ -30,44 +37,94 @@ class GatewayConnection: def __init__(self, ip_address: str, id_rsa: str, server_port: int): self._lock = aiorwlock.RWLock() - self.stats: Dict[str, Dict[int, Stat]] = {} + self.stats: dict[tuple[str, str], PerWindowStats] = {} self.ip_address = ip_address - args = ["-L", "{temp_dir}/gateway:localhost:%d" % GATEWAY_MANAGEMENT_PORT] - args += ["-R", f"localhost:{SERVER_PORT_ON_GATEWAY}:localhost:{server_port}"] - self.tunnel = AsyncSSHTunnel( - f"ubuntu@{ip_address}", - id_rsa, - { - "StrictHostKeyChecking": "no", - "UserKnownHostsFile": "/dev/null", - "ExitOnForwardFailure": "yes", - "StreamLocalBindUnlink": "yes", - "ConnectTimeout": 1, - "ServerAliveInterval": 60, + self.server_port = server_port + # a persistent connection_dir is needed to discover and close leftover connections + # in case of server restarts w/o graceful shutdown + self.connection_dir = CONNECTIONS_DIR / ip_address + # connection_dir can have a long path that won't be accepted by the ssh command, + # so we create a short temporary symlink + self.temp_dir, self.connection_symlink_dir = make_tmp_symlink_to_dir( + self.connection_dir, "connection" + ) + self.gateway_socket_path = self.connection_symlink_dir / "gateway.sock" + self.tunnel = SSHTunnel( + destination=f"ubuntu@{ip_address}", + identity=FileContent(id_rsa), + control_sock_path=self.connection_symlink_dir / "control.sock", + options={ + **SSH_DEFAULT_OPTIONS, + "ConnectTimeout": "1", + "ServerAliveInterval": "60", }, - args, + forwarded_sockets=[ + SocketPair( + local=UnixSocket(path=self.gateway_socket_path), + 
remote=IPSocket(host="localhost", port=PROXY_PORT_ON_GATEWAY), + ), + ], + # reverse_forwarded_sockets are added later in .open() ) - self._client = GatewayClient(uds=os.path.join(self.tunnel.temp_dir, "gateway")) + self.tunnel_id = uuid.uuid4() + self._client = GatewayClient(uds=str(self.gateway_socket_path)) - async def check_or_restart(self): + async def check_or_restart(self) -> bool: async with self._lock.writer_lock: - if not await self.tunnel.check(): + if not await self.tunnel.acheck(): logger.info("Connection to gateway %s is down, restarting", self.ip_address) - await self.tunnel.start() - return + await self._open_tunnel() + return True + return False + + async def open(self, close_existing_tunnel: bool = False) -> None: + async with self._lock.writer_lock: + if close_existing_tunnel: + # Close remaining tunnel if previous server process died w/o graceful shutdown + if await self.tunnel.acheck(): + await self.tunnel.aclose() + await self._open_tunnel() + + async def _open_tunnel(self) -> None: + self.connection_dir.mkdir(parents=True, exist_ok=True) + remote_socket_path = f"{SERVER_CONNECTIONS_DIR_ON_GATEWAY}/{self.tunnel_id}.sock" + + # open w/o reverse forwarding and make sure reverse forwarding will be possible + self.tunnel.reverse_forwarded_sockets = [] + await self.tunnel.aopen() + await self.tunnel.aexec(f"mkdir -p {SERVER_CONNECTIONS_DIR_ON_GATEWAY}") + await self.tunnel.aexec(f"rm -f {remote_socket_path}") + + # add reverse forwarding + self.tunnel.reverse_forwarded_sockets = [ + SocketPair( + local=IPSocket(host="localhost", port=self.server_port), + remote=UnixSocket(path=remote_socket_path), + ), + ] + await self.tunnel.aopen() + + async def close(self) -> None: + async with self._lock.writer_lock: + await self.tunnel.aclose() + shutil.rmtree(self.connection_dir, ignore_errors=True) async def try_collect_stats(self) -> None: if not self._client.is_server_ready: return async with self._lock.writer_lock: - self.stats = await 
self._client.collect_stats() - for service_id, stats in self.stats.items(): - logger.debug("%s stats: %s", service_id, stats) + stats = {} + for service in await self._client.collect_stats(): + logger.debug( + "%s/%s stats: %s", service.project_name, service.run_name, service.stats + ) + stats[(service.project_name, service.run_name)] = service.stats + self.stats = stats - async def get_stats(self, service_id: uuid.UUID) -> Optional[Dict[int, Stat]]: + async def get_stats(self, project_name: str, run_name: str) -> Optional[PerWindowStats]: async with self._lock.reader_lock: - return self.stats.get(service_id.hex) + return self.stats.get((project_name, run_name)) @contextlib.asynccontextmanager async def client(self) -> AsyncIterator[GatewayClient]: diff --git a/src/dstack/_internal/server/services/gateways/pool.py b/src/dstack/_internal/server/services/gateways/pool.py index 7080799edf..eaed7c2e95 100644 --- a/src/dstack/_internal/server/services/gateways/pool.py +++ b/src/dstack/_internal/server/services/gateways/pool.py @@ -1,5 +1,5 @@ import asyncio -from typing import Dict, List, Optional +from typing import Dict, List from dstack._internal.server.services.gateways.connection import GatewayConnection from dstack._internal.server.settings import SERVER_PORT @@ -13,15 +13,22 @@ def __init__(self) -> None: self._connections: Dict[str, GatewayConnection] = {} self._lock = asyncio.Lock() - async def add(self, hostname: str, id_rsa: str) -> GatewayConnection: + async def get_or_add( + self, + hostname: str, + id_rsa: str, + close_existing_tunnel: bool = False, + ) -> GatewayConnection: async with self._lock: - if hostname in self._connections: - logger.warning(f"Gateway connection for {hostname} already exists") - return self._connections[hostname] + connection = self._connections.get(hostname) + if connection is not None: + return connection self._connections[hostname] = GatewayConnection(hostname, id_rsa, SERVER_PORT) - start_task = 
self._connections[hostname].tunnel.start() + open_task = self._connections[hostname].open( + close_existing_tunnel=close_existing_tunnel, + ) try: - await start_task + await open_task return self._connections[hostname] except Exception: async with self._lock: @@ -31,23 +38,19 @@ async def add(self, hostname: str, id_rsa: str) -> GatewayConnection: async def remove(self, hostname: str) -> bool: async with self._lock: if hostname not in self._connections: - logger.warning(f"Gateway connection for {hostname} does not exist") return False - stop_task = self._connections.pop(hostname).tunnel.stop() - await stop_task + close_task = self._connections.pop(hostname).close() + await close_task return True async def remove_all(self) -> None: async with self._lock: await asyncio.gather( - *(conn.tunnel.stop() for conn in self._connections.values()), + *(conn.close() for conn in self._connections.values()), return_exceptions=True, ) self._connections = {} - async def get(self, hostname: str) -> Optional[GatewayConnection]: - return self._connections.get(hostname) - async def all(self) -> List[GatewayConnection]: return list(self._connections.values()) diff --git a/src/dstack/_internal/server/services/gpus.py b/src/dstack/_internal/server/services/gpus.py new file mode 100644 index 0000000000..1497c95369 --- /dev/null +++ b/src/dstack/_internal/server/services/gpus.py @@ -0,0 +1,408 @@ +from typing import Dict, List, Literal, Optional, Tuple + +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.gpus import BackendGpu, BackendGpus, GpuGroup +from dstack._internal.core.models.instances import InstanceOfferWithAvailability +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.resources import Range +from 
dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.gpus import ListGpusResponse +from dstack._internal.server.services.jobs import get_jobs_from_run_spec +from dstack._internal.server.services.offers import get_offers_by_requirements +from dstack._internal.server.services.runs.plan import ( + get_backend_offers_in_run_candidate_fleets, +) +from dstack._internal.utils.common import get_or_error + + +async def list_gpus_grouped( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, + group_by: Optional[List[Literal["backend", "region", "count"]]] = None, +) -> ListGpusResponse: + """Retrieves available GPU specifications based on a run spec, with optional grouping.""" + offers = await _get_gpu_offers(session=session, project=project, run_spec=run_spec) + backend_gpus = _process_offers_into_backend_gpus(offers) + group_by_set = set(group_by) if group_by else set() + if "region" in group_by_set and "backend" not in group_by_set: + raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'") + + # Determine grouping strategy based on combination + has_backend = "backend" in group_by_set + has_region = "region" in group_by_set + has_count = "count" in group_by_set + if has_backend and has_region and has_count: + gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus) + elif has_backend and has_count: + gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus) + elif has_backend and has_region: + gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) + elif has_backend: + gpus = _get_gpus_grouped_by_backend(backend_gpus) + elif has_count: + gpus = _get_gpus_grouped_by_count(backend_gpus) + else: + gpus = _get_gpus_with_no_grouping(backend_gpus) + + return ListGpusResponse(gpus=gpus) + + +async def _get_gpu_offers( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, +) -> 
List[Tuple[Backend, InstanceOfferWithAvailability]]: + """Fetches all available instance offers that match the run spec's GPU requirements.""" + profile = run_spec.merged_profile + if profile.fleets is not None: + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + if len(jobs) == 0: + return [] + return await get_backend_offers_in_run_candidate_fleets( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + max_offers_per_fleet=None, + ) + requirements = Requirements( + resources=run_spec.configuration.resources, + max_price=profile.max_price, + spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO), + reservation=profile.reservation, + ) + return await get_offers_by_requirements( + project=project, + profile=profile, + requirements=requirements, + exclude_not_available=False, + multinode=False, + volumes=None, + privileged=False, + instance_mounts=False, + ) + + +def _process_offers_into_backend_gpus( + offers: List[Tuple[Backend, InstanceOfferWithAvailability]], +) -> List[BackendGpus]: + """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info.""" + backend_data: Dict[BackendType, Dict] = {} + + for _, offer in offers: + backend_type = offer.backend + if backend_type not in backend_data: + backend_data[backend_type] = {"gpus": {}, "regions": set()} + + backend_data[backend_type]["regions"].add(offer.region) + + if not offer.instance.resources.gpus: + continue + + gpu_types_in_offer = {} + for gpu in offer.instance.resources.gpus: + gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor) + if gpu_type_key not in gpu_types_in_offer: + gpu_types_in_offer[gpu_type_key] = 0 + gpu_types_in_offer[gpu_type_key] += 1 + + for ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + ), gpu_count_in_offer in gpu_types_in_offer.items(): + instance_config_key = ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + gpu_count_in_offer, + offer.instance.resources.spot, + offer.region, + ) 
+ + if instance_config_key not in backend_data[backend_type]["gpus"]: + backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu( + name=gpu_name, + memory_mib=gpu_memory_mib, + vendor=gpu_vendor, + availability=offer.availability, + spot=offer.instance.resources.spot, + count=gpu_count_in_offer, + price=offer.price, + region=offer.region, + ) + + backend_gpus_list = [] + for backend_type, data in backend_data.items(): + gpus_list = sorted( + list(data["gpus"].values()), + key=lambda g: ( + not g.availability.is_available(), + g.vendor.value, + g.name, + g.memory_mib, + ), + ) + backend_gpus_list.append( + BackendGpus( + backend_type=backend_type, + gpus=gpus_list, + regions=sorted(list(data["regions"])), + ) + ) + return backend_gpus_list + + +def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: BackendType): + """Updates an existing GpuGroup with new data from another GPU offer.""" + spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand" + + if gpu.availability not in row.availability: + row.availability.append(gpu.availability) + if spot_type not in row.spot: + row.spot.append(spot_type) + if row.backends and backend_type not in row.backends: + row.backends.append(backend_type) + + # FIXME: Consider using non-optional range + assert row.count.min is not None + assert row.count.max is not None + assert row.price.min is not None + assert row.price.max is not None + + row.count.min = min(row.count.min, gpu.count) + row.count.max = max(row.count.max, gpu.count) + per_gpu_price = gpu.price / gpu.count + row.price.min = min(row.price.min, per_gpu_price) + row.price.max = max(row.price.max, per_gpu_price) + + +def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs into a flat list, without any grouping.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor) + if key not in 
gpu_rows: + per_gpu_price = gpu.price / gpu.count + price_range = Range[float](min=per_gpu_price, max=per_gpu_price) + + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=price_range, + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + result = sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.name, + g.memory_mib, + ), + ) + + return result + + +def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + regions=backend.regions.copy(), + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + get_or_error(g.backend).value, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by both backend and region.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, 
gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=gpu.region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + get_or_error(g.backend).value, + g.region, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + 
key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + regions=backend.regions.copy(), + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + get_or_error(g.backend).value, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_region_and_count( + backend_gpus: List[BackendGpus], +) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend, region, and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = ( + gpu.name, + gpu.memory_mib, + gpu.vendor, + backend.backend_type, + gpu.region, + gpu.count, + ) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=gpu.region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + get_or_error(g.backend).value, + g.region, + g.count.min, + g.name, + g.memory_mib, + ), + ) diff --git a/src/dstack/_internal/server/services/ides/__init__.py 
b/src/dstack/_internal/server/services/ides/__init__.py new file mode 100644 index 0000000000..f22987aafb --- /dev/null +++ b/src/dstack/_internal/server/services/ides/__init__.py @@ -0,0 +1,23 @@ +from typing import Literal, Optional + +from dstack._internal.server.services.ides.base import IDE +from dstack._internal.server.services.ides.cursor import CursorDesktop +from dstack._internal.server.services.ides.vscode import VSCodeDesktop +from dstack._internal.server.services.ides.windsurf import WindsurfDesktop +from dstack._internal.server.services.ides.zed import ZedDesktop + +_IDELiteral = Literal["vscode", "cursor", "windsurf", "zed"] + +_ide_literal_to_ide_class_map: dict[_IDELiteral, type[IDE]] = { + "vscode": VSCodeDesktop, + "cursor": CursorDesktop, + "windsurf": WindsurfDesktop, + "zed": ZedDesktop, +} + + +def get_ide(ide_literal: _IDELiteral) -> Optional[IDE]: + ide_class = _ide_literal_to_ide_class_map.get(ide_literal) + if ide_class is None: + return None + return ide_class() diff --git a/src/dstack/_internal/server/services/ides/base.py b/src/dstack/_internal/server/services/ides/base.py new file mode 100644 index 0000000000..f97aad6d91 --- /dev/null +++ b/src/dstack/_internal/server/services/ides/base.py @@ -0,0 +1,25 @@ +from abc import ABC, abstractmethod +from typing import ClassVar, Optional + + +class IDE(ABC): + name: ClassVar[str] + url_scheme: ClassVar[str] + + @abstractmethod + def get_install_commands( + self, version: Optional[str] = None, extensions: Optional[list[str]] = None + ) -> list[str]: + pass + + def get_url(self, authority: str, working_dir: str) -> str: + return f"{self.url_scheme}://vscode-remote/ssh-remote+{authority}{working_dir}" + + def get_print_readme_commands(self, authority: str) -> list[str]: + url = self.get_url(authority, working_dir="$DSTACK_WORKING_DIR") + return [ + f"echo 'To open in {self.name}, use link below:'", + "echo", + f'echo " {url}"', + "echo", + ] diff --git 
a/src/dstack/_internal/server/services/ides/cursor.py b/src/dstack/_internal/server/services/ides/cursor.py new file mode 100644 index 0000000000..95512f355e --- /dev/null +++ b/src/dstack/_internal/server/services/ides/cursor.py @@ -0,0 +1,31 @@ +from typing import Optional + +from dstack._internal.server.services.ides.base import IDE + + +class CursorDesktop(IDE): + name = "Cursor" + url_scheme = "cursor" + + def get_install_commands( + self, version: Optional[str] = None, extensions: Optional[list[str]] = None + ) -> list[str]: + commands = [] + if version is not None: + url = f"https://cursor.blob.core.windows.net/remote-releases/{version}/vscode-reh-linux-$arch.tar.gz" + archive = "vscode-reh-linux-$arch.tar.gz" + target = f'~/.cursor-server/cli/servers/"Stable-{version}"/server' + commands.extend( + [ + 'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi', + "mkdir -p /tmp", + f'wget -q --show-progress "{url}" -O "/tmp/{archive}"', + f"mkdir -vp {target}", + f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"', + f'rm "/tmp/{archive}"', + ] + ) + if extensions: + _extensions = " ".join(f'--install-extension "{name}"' for name in extensions) + commands.append(f'PATH="$PATH":{target}/bin cursor-server {_extensions}') + return commands diff --git a/src/dstack/_internal/server/services/ides/vscode.py b/src/dstack/_internal/server/services/ides/vscode.py new file mode 100644 index 0000000000..3ab0b8ab95 --- /dev/null +++ b/src/dstack/_internal/server/services/ides/vscode.py @@ -0,0 +1,33 @@ +from typing import Optional + +from dstack._internal.server.services.ides.base import IDE + + +class VSCodeDesktop(IDE): + name = "VS Code" + url_scheme = "vscode" + + def get_install_commands( + self, version: Optional[str] = None, extensions: Optional[list[str]] = None + ) -> list[str]: + commands = [] + if version is not None: + url = ( + 
f"https://update.code.visualstudio.com/commit:{version}/server-linux-$arch/stable" + ) + archive = "vscode-server-linux-$arch.tar.gz" + target = f'~/.vscode-server/bin/"{version}"' + commands.extend( + [ + 'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi', + "mkdir -p /tmp", + f'wget -q --show-progress "{url}" -O "/tmp/{archive}"', + f"mkdir -vp {target}", + f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"', + f'rm "/tmp/{archive}"', + ] + ) + if extensions: + _extensions = " ".join(f'--install-extension "{name}"' for name in extensions) + commands.append(f'PATH="$PATH":{target}/bin code-server {_extensions}') + return commands diff --git a/src/dstack/_internal/server/services/ides/windsurf.py b/src/dstack/_internal/server/services/ides/windsurf.py new file mode 100644 index 0000000000..3b5042bb6d --- /dev/null +++ b/src/dstack/_internal/server/services/ides/windsurf.py @@ -0,0 +1,32 @@ +from typing import Optional + +from dstack._internal.server.services.ides.base import IDE + + +class WindsurfDesktop(IDE): + name = "Windsurf" + url_scheme = "windsurf" + + def get_install_commands( + self, version: Optional[str] = None, extensions: Optional[list[str]] = None + ) -> list[str]: + commands = [] + if version is not None: + version, commit = version.split("@") + url = f"https://windsurf-stable.codeiumdata.com/linux-reh-$arch/stable/{commit}/windsurf-reh-linux-$arch-{version}.tar.gz" + archive = "windsurf-reh-linux-$arch.tar.gz" + target = f'~/.windsurf-server/bin/"{commit}"' + commands.extend( + [ + 'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi', + "mkdir -p /tmp", + f'wget -q --show-progress "{url}" -O "/tmp/{archive}"', + f"mkdir -vp {target}", + f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"', + f'rm "/tmp/{archive}"', + ] + ) + if extensions: + _extensions = " ".join(f'--install-extension "{name}"' for 
name in extensions) + commands.append(f'PATH="$PATH":{target}/bin windsurf-server {_extensions}') + return commands diff --git a/src/dstack/_internal/server/services/ides/zed.py b/src/dstack/_internal/server/services/ides/zed.py new file mode 100644 index 0000000000..851a6f83de --- /dev/null +++ b/src/dstack/_internal/server/services/ides/zed.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dstack._internal.server.services.ides.base import IDE + + +class ZedDesktop(IDE): + name = "Zed" + url_scheme = "zed" + + def get_install_commands( + self, version: Optional[str] = None, extensions: Optional[list[str]] = None + ) -> list[str]: + # We don't need to pre-install any extensions for Zed so we let it + # auto-install the remote server into ~/.zed_server on the first SSH connect, + # downloading the binary that matches the connecting Zed client version. + return [] + + def get_url(self, authority: str, working_dir: str) -> str: + return f"zed://ssh/{authority}{working_dir}" diff --git a/src/dstack/_internal/server/services/imports.py b/src/dstack/_internal/server/services/imports.py new file mode 100644 index 0000000000..6ee2e93201 --- /dev/null +++ b/src/dstack/_internal/server/services/imports.py @@ -0,0 +1,105 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, selectinload + +from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError +from dstack._internal.core.models.imports import ( + Import, + ImportExport, + ImportExportedFleet, + ImportExportedGateway, +) +from dstack._internal.server.models import ( + ExportedFleetModel, + ExportedGatewayModel, + ExportModel, + FleetModel, + GatewayModel, + ImportModel, + ProjectModel, +) +from dstack._internal.server.services.exports import get_export_model_by_name_for_update +from dstack._internal.server.services.projects import get_project_model_by_name + + +async def list_imports(session: AsyncSession, project: 
ProjectModel) -> list[Import]: + res = await session.execute( + select(ImportModel) + .where(ImportModel.project_id == project.id) + .options( + joinedload(ImportModel.export) + .load_only(ExportModel.id, ExportModel.name) + .options( + joinedload(ExportModel.project).load_only(ProjectModel.name), + selectinload( + ExportModel.exported_fleets.and_( + ExportedFleetModel.fleet.has(FleetModel.deleted == False) + ) + ) + .joinedload(ExportedFleetModel.fleet) + .load_only(FleetModel.id, FleetModel.name), + selectinload(ExportModel.exported_gateways) + .joinedload(ExportedGatewayModel.gateway) + .load_only(GatewayModel.id, GatewayModel.name), + ) + ) + .order_by(ImportModel.created_at.desc()) + ) + imports = res.scalars().all() + return [import_model_to_import(imp) for imp in imports] + + +async def delete_import( + session: AsyncSession, + project: ProjectModel, + export_name: str, + export_project_name: str, +) -> None: + # Always the same error, so as not to expose the existence of exports + # that are not imported in this project. 
+ not_found_error = ResourceNotExistsError( + f"Import '{export_project_name}/{export_name}' not found in project {project.name!r}" + ) + exporter_project = await get_project_model_by_name(session, export_project_name) + if exporter_project is None: + raise not_found_error + async with get_export_model_by_name_for_update( + session, exporter_project, export_name + ) as export: + if export is None: + raise not_found_error + if project.name.lower() not in {imp.project.name.lower() for imp in export.imports}: + raise not_found_error + if export.is_global: + raise ServerClientError( + f"'{export_project_name}/{export_name}' is a global export, cannot stop importing" + ) + export.imports = [ + imp for imp in export.imports if imp.project.name.lower() != project.name.lower() + ] + await session.commit() + + +def import_model_to_import(import_model: ImportModel) -> Import: + return Import( + id=import_model.id, + export=ImportExport( + id=import_model.export.id, + name=import_model.export.name, + project_name=import_model.export.project.name, + exported_fleets=[ + ImportExportedFleet( + id=ef.fleet.id, + name=ef.fleet.name, + ) + for ef in import_model.export.exported_fleets + ], + exported_gateways=[ + ImportExportedGateway( + id=eg.gateway.id, + name=eg.gateway.name, + ) + for eg in import_model.export.exported_gateways + ], + ), + ) diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py new file mode 100644 index 0000000000..913d3c9f44 --- /dev/null +++ b/src/dstack/_internal/server/services/instances.py @@ -0,0 +1,1036 @@ +import operator +import uuid +from collections.abc import Container, Iterable +from datetime import datetime +from typing import Dict, List, Literal, Optional, Sequence, Union + +import gpuhunt +from sqlalchemy import and_, exists, false, or_, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import contains_eager, joinedload, load_only + +from 
dstack._internal.core.backends.base.offers import ( + offer_to_catalog_item, + requirements_to_query_filter, +) +from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT +from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.health import HealthCheck, HealthEvent, HealthStatus +from dstack._internal.core.models.instances import ( + Instance, + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, + InstanceStatus, + InstanceTerminationReason, + InstanceType, + RemoteConnectionInfo, + Resources, + SSHConnectionParams, + SSHKey, +) +from dstack._internal.core.models.profiles import ( + DEFAULT_FLEET_TERMINATION_IDLE_TIME, + FleetInstanceSelector, + InstanceHostnameSelector, + InstanceNameSelector, + InstanceSelector, + Profile, + TerminationPolicy, +) +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.core.models.volumes import Volume +from dstack._internal.core.services.profiles import get_termination +from dstack._internal.server import settings as server_settings +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceHealthCheckModel, + InstanceModel, + ProjectModel, + UserModel, +) +from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse +from dstack._internal.server.schemas.runner import InstanceHealthResponse, TaskStatus +from dstack._internal.server.services import events +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import generate_shared_offer +from dstack._internal.server.services.projects import list_user_project_models +from dstack._internal.server.services.runner.client import 
ShimClient +from dstack._internal.utils import common as common_utils +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +def switch_instance_status( + session: AsyncSession, + instance_model: InstanceModel, + new_status: InstanceStatus, + actor: events.AnyActor = events.SystemActor(), +): + """ + Switch instance status. + + **Usage notes**: + + - When switching to `TERMINATING` or `TERMINATED`, + `instance_model.termination_reason` must be set + + - When `instance_model.termination_reason` is set to `ERROR`, + the error must be further explained in `instance_model.termination_reason_message` + """ + + old_status = instance_model.status + if old_status == new_status: + return + instance_model.status = new_status + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=old_status, + new_status=new_status, + termination_reason=instance_model.termination_reason, + termination_reason_message=instance_model.termination_reason_message, + actor=actor, + ) + + +def emit_instance_status_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_status: InstanceStatus, + new_status: InstanceStatus, + termination_reason: Optional[InstanceTerminationReason], + termination_reason_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_instance_status_change_message( + old_status=old_status, + new_status=new_status, + termination_reason=termination_reason, + termination_reason_message=termination_reason_message, + ) + events.emit(session, msg, actor=actor, targets=[events.Target.from_model(instance_model)]) + + +def get_instance_status_change_message( + old_status: InstanceStatus, + new_status: InstanceStatus, + termination_reason: Optional[InstanceTerminationReason], + termination_reason_message: Optional[str], +) -> str: + msg = f"Instance status changed {old_status.upper()} -> 
{new_status.upper()}" + if ( + new_status == InstanceStatus.TERMINATING + or new_status == InstanceStatus.TERMINATED + and old_status != InstanceStatus.TERMINATING + ): + if termination_reason is None: + raise ValueError( + f"termination_reason must be set when switching to {new_status.upper()} status" + ) + if ( + termination_reason == InstanceTerminationReason.ERROR + and not termination_reason_message + ): + raise ValueError( + "termination_reason_message must be set when termination_reason is ERROR" + ) + msg += f". Termination reason: {termination_reason.upper()}" + if termination_reason_message: + msg += f" ({termination_reason_message})" + return msg + + +def format_instance_blocks_for_event(instance_model: InstanceModel) -> str: + return f"{instance_model.busy_blocks}/{instance_model.total_blocks} busy" + + +async def get_instance_health_checks( + session: AsyncSession, + project: ProjectModel, + fleet_name: str, + instance_num: int, + after: Optional[datetime] = None, + before: Optional[datetime] = None, + limit: Optional[int] = None, +) -> list[HealthCheck]: + """ + Returns instance health checks ordered from the latest to the earliest. 
+ + Expected usage: + * limit=100 — get the latest 100 checks + * after= — get checks for the last hour + * before=, limit=100 ­— paginate back in history + """ + res = await session.execute( + select(InstanceModel) + .join(FleetModel) + .where( + ~InstanceModel.deleted, + InstanceModel.project_id == project.id, + InstanceModel.instance_num == instance_num, + FleetModel.name == fleet_name, + ) + .options(load_only(InstanceModel.id)) + ) + instance = res.scalar_one_or_none() + if instance is None: + raise ResourceNotExistsError() + + stmt = ( + select(InstanceHealthCheckModel) + .where(InstanceHealthCheckModel.instance_id == instance.id) + .order_by(InstanceHealthCheckModel.collected_at.desc()) + ) + if after is not None: + stmt = stmt.where(InstanceHealthCheckModel.collected_at > after) + if before is not None: + stmt = stmt.where(InstanceHealthCheckModel.collected_at < before) + if limit is not None: + stmt = stmt.limit(limit) + health_checks: list[HealthCheck] = [] + res = await session.execute(stmt) + for health_check_model in res.scalars(): + health_check = instance_health_check_model_to_health_check(health_check_model) + health_checks.append(health_check) + return health_checks + + +async def get_instance( + session: AsyncSession, + project: ProjectModel, + instance_id: uuid.UUID, +) -> Optional[Instance]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == instance_id, + InstanceModel.project_id == project.id, + ) + .options( + joinedload(InstanceModel.fleet).load_only(FleetModel.name), + joinedload(InstanceModel.project).load_only(ProjectModel.name), + ) + ) + instance_model = res.scalar_one_or_none() + if instance_model is None: + return None + return instance_model_to_instance(instance_model) + + +def instance_model_to_instance(instance_model: InstanceModel) -> Instance: + instance = Instance( + id=instance_model.id, + project_name=instance_model.project.name, + name=instance_model.name, + 
fleet_id=instance_model.fleet_id, + fleet_name=instance_model.fleet.name if instance_model.fleet else None, + instance_num=instance_model.instance_num, + status=instance_model.status, + unreachable=instance_model.unreachable, + health_status=instance_model.health, + termination_reason=( + instance_model.termination_reason.value if instance_model.termination_reason else None + ), + termination_reason_message=instance_model.termination_reason_message, + created=instance_model.created_at, + finished_at=instance_model.finished_at, + total_blocks=instance_model.total_blocks, + busy_blocks=instance_model.busy_blocks, + ) + + offer = get_instance_offer(instance_model) + if offer is not None: + instance.backend = offer.backend + instance.region = offer.region + instance.price = offer.price + + jpd = get_instance_provisioning_data(instance_model) + if jpd is not None: + instance.instance_type = jpd.instance_type + instance.hostname = jpd.hostname + instance.availability_zone = jpd.availability_zone + + return instance + + +def instance_health_check_model_to_health_check(model: InstanceHealthCheckModel) -> HealthCheck: + collected_at = model.collected_at + status = HealthStatus.HEALTHY + events: list[HealthEvent] = [] + instance_health_response = get_instance_health_response(model) + if (dcgm := instance_health_response.dcgm) is not None: + dcgm_health_check = dcgm_health_response_to_health_check(dcgm, collected_at) + status = dcgm_health_check.status + events.extend(dcgm_health_check.events) + events.sort(key=operator.attrgetter("timestamp"), reverse=True) + return HealthCheck( + collected_at=collected_at, + status=status, + events=events, + ) + + +def dcgm_health_response_to_health_check( + response: DCGMHealthResponse, collected_at: datetime +) -> HealthCheck: + events: list[HealthEvent] = [] + for incident in response.incidents: + events.append( + HealthEvent( + timestamp=collected_at, + status=incident.health.to_health_status(), + message=incident.error_message, + ) + ) 
+ return HealthCheck( + collected_at=collected_at, + status=response.overall_health.to_health_status(), + events=events, + ) + + +def get_instance_health_response( + instance_health_check_model: InstanceHealthCheckModel, +) -> InstanceHealthResponse: + return InstanceHealthResponse.__response__.parse_raw(instance_health_check_model.response) + + +def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]: + if instance_model.job_provisioning_data is None: + return None + return JobProvisioningData.__response__.parse_raw(instance_model.job_provisioning_data) + + +def get_instance_offer(instance_model: InstanceModel) -> Optional[InstanceOfferWithAvailability]: + if instance_model.offer is None: + return None + return InstanceOfferWithAvailability.__response__.parse_raw(instance_model.offer) + + +def get_instance_configuration(instance_model: InstanceModel) -> InstanceConfiguration: + return InstanceConfiguration.__response__.parse_raw(instance_model.instance_configuration) + + +def get_instance_profile(instance_model: InstanceModel) -> Profile: + return Profile.__response__.parse_raw(instance_model.profile) + + +def get_instance_requirements(instance_model: InstanceModel) -> Requirements: + return Requirements.__response__.parse_raw(instance_model.requirements) + + +def is_ssh_instance(instance_model: InstanceModel) -> bool: + return instance_model.remote_connection_info is not None + + +def is_placeholder_instance(instance_model: InstanceModel) -> bool: + """A PENDING instance with `provisioning_job_id` set is a placeholder + reserved by `JobSubmittedPipeline` during assignment and awaiting cloud + provisioning. It reserves an `instance_num` and a `nodes.max` slot but + has no backend, offer, or provisioning data until it is promoted. + `InstancePipeline` ignores placeholders; only `JobSubmittedPipeline` and + `JobTerminatingPipeline` act on them. 
+ """ + return ( + instance_model.status == InstanceStatus.PENDING + and instance_model.provisioning_job_id is not None + ) + + +def filter_non_placeholder_instances(instance_models: list[InstanceModel]) -> list[InstanceModel]: + return [i for i in instance_models if not is_placeholder_instance(i)] + + +def get_instance_remote_connection_info( + instance_model: InstanceModel, +) -> Optional[RemoteConnectionInfo]: + if instance_model.remote_connection_info is None: + return None + return RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info) + + +def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, Optional[str]]: + """ + Returns a pair of SSH private keys: host key and optional proxy jump key. + """ + host_private_key = instance_model.project.ssh_private_key + rci = get_instance_remote_connection_info(instance_model) + if rci is None: + # Cloud instance + return host_private_key, None + # SSH instance + if rci.ssh_proxy is None: + return host_private_key, None + if rci.ssh_proxy_keys is None: + # Inconsistent RemoteConnectionInfo structure - proxy without keys + raise ValueError("Missing instance SSH proxy private keys") + proxy_private_keys = [key.private for key in rci.ssh_proxy_keys if key.private is not None] + if not proxy_private_keys: + raise ValueError("No instance SSH proxy private key found") + return host_private_key, proxy_private_keys[0] + + +async def select_instances_by_selectors( + session: AsyncSession, + project: ProjectModel, + selectors: Sequence[InstanceSelector], + *, + fleets: Optional[Sequence[Union[EntityReference, str]]] = None, + detaching_instance_ids: Optional[Sequence[uuid.UUID]] = None, + fleet_id: Optional[uuid.UUID] = None, + instance_ids: Optional[Sequence[uuid.UUID]] = None, + lock_instances: bool = False, +) -> list[InstanceModel]: + if instance_ids is not None and len(instance_ids) == 0: + return [] + is_instance_imported_subquery = exists().where( + ImportModel.project_id == 
project.id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ) + filters = [ + or_( + InstanceModel.project_id == project.id, + is_instance_imported_subquery, + ), + FleetModel.deleted == False, + InstanceModel.deleted == False, + ] + if detaching_instance_ids is not None: + filters.append(InstanceModel.id.not_in(detaching_instance_ids)) + if fleet_id is not None: + filters.append(InstanceModel.fleet_id == fleet_id) + if instance_ids is not None: + filters.append(InstanceModel.id.in_(instance_ids)) + if fleets is not None: + filters.append( + or_( + *[ + _get_fleet_reference_condition(project, EntityReference.parse(fleet)) + for fleet in fleets + ] + ) + ) + selector_conditions = _get_instance_selector_conditions(project, selectors) + if selector_conditions: + filters.append(or_(*selector_conditions)) + + stmt = ( + select(InstanceModel) + .join(InstanceModel.fleet) + .join(FleetModel.project) + .where(*filters) + .options( + contains_eager(InstanceModel.fleet) + .load_only(FleetModel.id, FleetModel.name, FleetModel.project_id, FleetModel.spec) + .contains_eager(FleetModel.project) + .load_only(ProjectModel.name) + ) + ) + if lock_instances: + stmt = stmt.where(InstanceModel.lock_expires_at.is_(None)) + stmt = stmt.order_by(InstanceModel.id).with_for_update( + skip_locked=True, key_share=True, of=InstanceModel + ) + res = await session.execute(stmt) + instances = list(res.unique().scalars().all()) + return [ + instance + for instance in instances + if instance_matches_selectors(instance, selectors, project=project) + ] + + +def instance_matches_selectors( + instance: InstanceModel, + selectors: Sequence[InstanceSelector], + *, + project: ProjectModel, +) -> bool: + return any( + instance_matches_selector(instance, selector, project=project) for selector in selectors + ) + + +def instance_matches_selector( + instance: InstanceModel, + selector: InstanceSelector, + *, + project: ProjectModel, +) -> 
bool: + if isinstance(selector, InstanceNameSelector): + return instance.name == selector.name + if isinstance(selector, InstanceHostnameSelector): + return instance_matches_hostname_selector(instance, selector) + if isinstance(selector, FleetInstanceSelector): + return _instance_matches_fleet_instance_selector(instance, selector, project=project) + return False + + +def instance_matches_hostname_selector( + instance: InstanceModel, selector: InstanceHostnameSelector +) -> bool: + candidates = set() + jpd = get_instance_provisioning_data(instance) + if jpd is not None: + if jpd.hostname is not None: + candidates.add(jpd.hostname.lower()) + if jpd.internal_ip is not None: + candidates.add(jpd.internal_ip.lower()) + rci = get_instance_remote_connection_info(instance) + if rci is not None: + candidates.add(rci.host.lower()) + return selector.hostname.lower() in candidates + + +def _instance_matches_fleet_instance_selector( + instance: InstanceModel, + selector: FleetInstanceSelector, + *, + project: ProjectModel, +) -> bool: + fleet = instance.fleet + if fleet is None: + return False + if fleet.name != selector.fleet.name: + return False + if instance.instance_num != selector.instance: + return False + if selector.fleet.project is None: + return fleet.project_id == project.id + return fleet.project.name == selector.fleet.project + + +def _get_instance_selector_conditions( + project: ProjectModel, + selectors: Sequence[InstanceSelector], +) -> list: + conditions = [] + for selector in selectors: + if isinstance(selector, InstanceNameSelector): + conditions.append(InstanceModel.name == selector.name) + elif isinstance(selector, InstanceHostnameSelector): + conditions.append(_get_hostname_selector_condition(selector)) + elif isinstance(selector, FleetInstanceSelector): + conditions.append( + and_( + _get_fleet_reference_condition(project, selector.fleet), + InstanceModel.instance_num == selector.instance, + ) + ) + return conditions + + +def 
_get_fleet_reference_condition(project: ProjectModel, ref: EntityReference): + if ref.project is None: + return and_( + FleetModel.name == ref.name, + FleetModel.project_id == project.id, + ) + return and_( + FleetModel.name == ref.name, + ProjectModel.name == ref.project, + ) + + +def _get_hostname_selector_condition(selector: InstanceHostnameSelector): + # This is only a DB prefilter. `instance_matches_selector` parses these JSON columns + # and performs the exact hostname/internal IP comparison in memory. + return or_( + InstanceModel.job_provisioning_data.icontains(selector.hostname, autoescape=True), + InstanceModel.remote_connection_info.icontains(selector.hostname, autoescape=True), + ) + + +def instance_matches_constraints( + instance: InstanceModel, + *, + backend_types: Optional[List[BackendType]] = None, + regions: Optional[List[str]] = None, + instance_types: Optional[List[str]] = None, + zones: Optional[List[str]] = None, + requirements: Optional[Requirements] = None, +) -> bool: + """Check if an instance matches the given provisioning constraints.""" + jpd = get_instance_provisioning_data(instance) + if jpd is not None: + if backend_types is not None and jpd.get_base_backend() not in backend_types: + return False + if regions is not None and jpd.region.lower() not in [r.lower() for r in regions]: + return False + if instance_types is not None and jpd.instance_type.name.lower() not in [ + i.lower() for i in instance_types + ]: + return False + if ( + jpd.availability_zone is not None + and zones is not None + and jpd.availability_zone not in zones + ): + return False + + if requirements is not None: + if instance.offer is None: + return False + offer = InstanceOffer.__response__.parse_raw(instance.offer) + catalog_item = offer_to_catalog_item(offer) + if not gpuhunt.matches(catalog_item, q=requirements_to_query_filter(requirements)): + return False + + return True + + +def filter_instances( + instances: List[InstanceModel], + profile: Profile, + *, + 
requirements: Optional[Requirements] = None, + status: Optional[InstanceStatus] = None, + multinode: bool = False, + master_job_provisioning_data: Optional[JobProvisioningData] = None, + volumes: Optional[List[List[Volume]]] = None, + shared: bool = False, +) -> List[InstanceModel]: + backend_types: Optional[list[BackendType]] = profile.backends + regions: Optional[list[str]] = profile.regions + zones: Optional[list[str]] = profile.availability_zones + # (BackendType, region.lower() | "", availability_zone.lower() | None) + volumes_locations: Optional[set[tuple[BackendType, str, Optional[str]]]] = None + + if volumes: + volumes_locations = set() + for volume in volumes[0]: + volume_backend = volume.get_backend() + volume_region = volume.get_region().lower() + # If the volume has an AZ, it's added twice -- with and without an AZ. + # When the instance location is checked against the available volumes locations (see + # below) the instance with an AZ matches only the volume with the same AZ, while + # the instance without an AZ matches any volume with the same region regardless of AZs. + # This reflects the logic used before this stricter volumes_locations check was added. + volumes_locations.add((volume_backend, volume_region, None)) + if (volume_zone := volume.get_availability_zone()) is not None: + volumes_locations.add((volume_backend, volume_region, volume_zone.lower())) + + if multinode: + if backend_types is None: + backend_types = BACKENDS_WITH_MULTINODE_SUPPORT + backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT] + + # For multi-node, restrict backend and region. + # The default behavior is to provision all nodes in the same backend and region. 
+ if master_job_provisioning_data is not None: + if backend_types is None: + backend_types = [master_job_provisioning_data.get_base_backend()] + backend_types = [ + b for b in backend_types if b == master_job_provisioning_data.get_base_backend() + ] + if regions is None: + regions = [master_job_provisioning_data.region] + regions = [r for r in regions if r == master_job_provisioning_data.region] + + instance_types = profile.instance_types + + filtered_instances: List[InstanceModel] = [] + for instance in instances: + if instance.unreachable: + continue + if instance.health.is_failure(): + continue + if status is not None and instance.status != status: + continue + if instance.total_blocks is None: + # Still provisioning, we don't know yet if it shared or not + continue + if (instance.total_blocks > 1) != shared: + continue + if not instance_matches_constraints( + instance, + backend_types=backend_types, + regions=regions, + instance_types=instance_types, + zones=zones, + requirements=requirements, + ): + continue + if volumes_locations is not None: + jpd = get_instance_provisioning_data(instance) + # instance_matches_constraints() also skips filtering if JPD is not set + if jpd is not None: + instance_backend = jpd.get_base_backend() + instance_region = jpd.region.lower() + instance_zone = jpd.availability_zone + if instance_zone is not None: + instance_zone = instance_zone.lower() + if (instance_backend, instance_region, instance_zone) not in volumes_locations: + continue + filtered_instances.append(instance) + return filtered_instances + + +def get_shared_instances_with_offers( + instances: List[InstanceModel], + profile: Profile, + requirements: Requirements, + *, + idle_only: bool = False, + multinode: bool = False, + volumes: Optional[List[List[Volume]]] = None, +) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]: + instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = [] + query_filter = 
requirements_to_query_filter(requirements) + filtered_instances = filter_instances( + instances=instances, + profile=profile, + multinode=multinode, + volumes=volumes, + shared=True, + ) + for instance in filtered_instances: + if idle_only and instance.status not in [InstanceStatus.IDLE, InstanceStatus.BUSY]: + continue + if multinode and instance.busy_blocks > 0: + continue + offer = get_instance_offer(instance) + if offer is None: + continue + total_blocks = common_utils.get_or_error(instance.total_blocks) + idle_blocks = total_blocks - instance.busy_blocks + min_blocks = total_blocks if multinode else 1 + for blocks in range(min_blocks, total_blocks + 1): + shared_offer = generate_shared_offer(offer, blocks, total_blocks) + catalog_item = offer_to_catalog_item(shared_offer) + if gpuhunt.matches(catalog_item, query_filter): + if blocks <= idle_blocks: + shared_offer.availability = InstanceAvailability.IDLE + else: + shared_offer.availability = InstanceAvailability.BUSY + if shared_offer.availability == InstanceAvailability.IDLE or not idle_only: + instances_with_offers.append((instance, shared_offer)) + break + return instances_with_offers + + +async def get_pool_instances( + session: AsyncSession, + project: ProjectModel, +) -> List[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + or_( + InstanceModel.project_id == project.id, + exists().where( + ImportModel.project_id == project.id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ), + ), + InstanceModel.deleted == False, + ) + .options(joinedload(InstanceModel.fleet)) + ) + instance_models = list(res.unique().scalars().all()) + return instance_models + + +async def list_projects_instance_models( + session: AsyncSession, + projects: List[ProjectModel], + fleet_ids: Optional[Iterable[uuid.UUID]], + only_active: bool, + include_imported: bool, + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], 
+ limit: int, + ascending: bool, +) -> List[InstanceModel]: + project_ids = [p.id for p in projects] + is_instance_imported_subquery = exists().where( + ImportModel.project_id.in_(project_ids), + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ) + filters: List = [ + or_( + InstanceModel.project_id.in_(project_ids), + is_instance_imported_subquery if include_imported else false(), + ) + ] + if fleet_ids is not None: + filters.append(InstanceModel.fleet_id.in_(fleet_ids)) + if only_active: + filters.extend( + [ + InstanceModel.deleted == False, + InstanceModel.status.in_([InstanceStatus.IDLE, InstanceStatus.BUSY]), + ] + ) + if prev_created_at is not None: + if ascending: + if prev_id is None: + filters.append(InstanceModel.created_at > prev_created_at) + else: + filters.append( + or_( + InstanceModel.created_at > prev_created_at, + and_( + InstanceModel.created_at == prev_created_at, + InstanceModel.id < prev_id, + ), + ) + ) + else: + if prev_id is None: + filters.append(InstanceModel.created_at < prev_created_at) + else: + filters.append( + or_( + InstanceModel.created_at < prev_created_at, + and_( + InstanceModel.created_at == prev_created_at, + InstanceModel.id > prev_id, + ), + ) + ) + order_by = (InstanceModel.created_at.desc(), InstanceModel.id) + if ascending: + order_by = (InstanceModel.created_at.asc(), InstanceModel.id.desc()) + + res = await session.execute( + select(InstanceModel) + .where(*filters) + .order_by(*order_by) + .limit(limit) + .options( + joinedload(InstanceModel.fleet), + joinedload(InstanceModel.project).load_only(ProjectModel.name), + ) + ) + instance_models = list(res.unique().scalars().all()) + return instance_models + + +async def list_user_instances( + session: AsyncSession, + user: UserModel, + project_names: Optional[Container[str]], + fleet_ids: Optional[Iterable[uuid.UUID]], + only_active: bool, + include_imported: bool, + prev_created_at: Optional[datetime], + 
prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> List[Instance]: + projects = await list_user_project_models( + session=session, + user=user, + only_names=True, + ) + if project_names is not None: + projects = [p for p in projects if p.name in project_names] + if len(projects) == 0: + return [] + instance_models = await list_projects_instance_models( + session=session, + projects=projects, + fleet_ids=fleet_ids, + only_active=only_active, + include_imported=include_imported, + prev_created_at=prev_created_at, + prev_id=prev_id, + limit=limit, + ascending=ascending, + ) + instances = [] + for instance in instance_models: + instances.append(instance_model_to_instance(instance)) + return instances + + +async def list_active_remote_instances( + session: AsyncSession, +) -> List[InstanceModel]: + filters: List = [InstanceModel.deleted == False, InstanceModel.backend == BackendType.REMOTE] + + res = await session.execute( + select(InstanceModel).where(*filters).order_by(InstanceModel.created_at.asc()) + ) + instance_models = list(res.unique().scalars().all()) + return instance_models + + +def create_instance_model( + session: AsyncSession, + project: ProjectModel, + username: str, + profile: Profile, + requirements: Requirements, + instance_name: str, + instance_num: int, + reservation: Optional[str], + blocks: Union[Literal["auto"], int], + tags: Optional[Dict[str, str]], + instance_id: Optional[uuid.UUID] = None, +) -> InstanceModel: + termination_policy, termination_idle_time = get_termination( + profile, DEFAULT_FLEET_TERMINATION_IDLE_TIME + ) + if instance_id is None: + instance_id = uuid.uuid4() + project_ssh_key = SSHKey( + public=project.ssh_public_key.strip(), + private=project.ssh_private_key.strip(), + ) + instance_config = InstanceConfiguration( + project_name=project.name, + instance_name=instance_name, + user=username, + ssh_keys=[project_ssh_key], + instance_id=str(instance_id), + reservation=reservation, + tags=tags, + ) + now = 
common_utils.get_current_datetime() + instance = InstanceModel( + id=instance_id, + name=instance_name, + instance_num=instance_num, + project=project, + created_at=now, + last_processed_at=now, + status=InstanceStatus.PENDING, + unreachable=False, + profile=profile.json(), + requirements=requirements.json(), + instance_configuration=instance_config.json(), + termination_policy=termination_policy, + termination_idle_time=termination_idle_time, + total_blocks=None if blocks == "auto" else blocks, + busy_blocks=0, + ) + session.add(instance) + return instance + + +async def create_ssh_instance_model( + project: ProjectModel, + instance_name: str, + instance_num: int, + internal_ip: Optional[str], + instance_network: Optional[str], + region: Optional[str], + host: str, + port: int, + ssh_user: str, + ssh_keys: List[SSHKey], + ssh_proxy: Optional[SSHConnectionParams], + ssh_proxy_keys: Optional[list[SSHKey]], + env: Env, + blocks: Union[Literal["auto"], int], +) -> InstanceModel: + # TODO: doc - will overwrite after remote connected + instance_resource = Resources(cpus=2, memory_mib=8, gpus=[], spot=False) + instance_type = InstanceType(name="ssh", resources=instance_resource) + + host_region = region if region is not None else "remote" + + remote = JobProvisioningData( + backend=BackendType.REMOTE, + instance_type=instance_type, + instance_id=instance_name, + hostname=host, + region=host_region, + internal_ip=internal_ip, + instance_network=instance_network, + price=0, + username=ssh_user, + ssh_port=port, + dockerized=True, + backend_data="", + ssh_proxy=None, + ) + offer = InstanceOfferWithAvailability( + backend=BackendType.REMOTE, + instance=instance_type, + region=host_region, + price=0.0, + availability=InstanceAvailability.AVAILABLE, + ) + remote_connection_info = RemoteConnectionInfo( + host=host, + port=port, + ssh_user=ssh_user, + ssh_keys=ssh_keys, + ssh_proxy=ssh_proxy, + ssh_proxy_keys=ssh_proxy_keys, + env=env, + ) + im = InstanceModel( + 
id=uuid.uuid4(), + name=instance_name, + instance_num=instance_num, + project=project, + backend=BackendType.REMOTE, + created_at=common_utils.get_current_datetime(), + started_at=common_utils.get_current_datetime(), + status=InstanceStatus.PENDING, + unreachable=False, + job_provisioning_data=remote.json(), + remote_connection_info=remote_connection_info.json(), + offer=offer.json(), + region=offer.region, + price=offer.price, + termination_policy=TerminationPolicy.DONT_DESTROY, + termination_idle_time=0, + total_blocks=None if blocks == "auto" else blocks, + busy_blocks=0, + ) + return im + + +def remove_dangling_tasks_from_instance(shim_client: ShimClient, instance: InstanceModel) -> None: + if not shim_client.is_api_v2_supported(): + return + assigned_to_instance_job_ids = {str(j.id) for j in instance.jobs} + task_list_response = shim_client.list_tasks() + tasks: list[tuple[str, Optional[TaskStatus]]] + if task_list_response.tasks is not None: + tasks = [(t.id, t.status) for t in task_list_response.tasks] + elif task_list_response.ids is not None: + # compatibility with pre-0.19.26 shim + tasks = [(t_id, None) for t_id in task_list_response.ids] + else: + raise ValueError("Unexpected task list response, neither `tasks` nor `ids` is set") + for task_id, task_status in tasks: + if task_id in assigned_to_instance_job_ids: + continue + should_terminate = task_status != TaskStatus.TERMINATED + should_remove = not server_settings.SERVER_KEEP_SHIM_TASKS + if not (should_terminate or should_remove): + continue + logger.warning( + "%s: dangling task found, id=%s, status=%s. 
Terminating and/or removing", + fmt(instance), + task_id, + task_status or "", + ) + if should_terminate: + shim_client.terminate_task( + task_id=task_id, + reason=None, + message=None, + timeout=0, + ) + if should_remove: + shim_client.remove_task(task_id=task_id) diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index c04be00244..e0c99221b3 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -1,144 +1,326 @@ -import asyncio -import datetime import itertools import json -from datetime import timezone +from collections.abc import Mapping +from datetime import timedelta from typing import Dict, Iterable, List, Optional, Tuple +from uuid import UUID -import sqlalchemy as sa -import sqlalchemy.orm as sa_orm +import requests +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import load_only -import dstack._internal.server.services.gateways as gateways -from dstack._internal.core.errors import BackendError, ComputeResourceNotFoundError, SSHError +from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT +from dstack._internal.core.errors import ( + ResourceNotExistsError, + ServerClientError, + SSHError, +) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.configurations import RunConfigurationType -from dstack._internal.core.models.instances import RemoteConnectionInfo from dstack._internal.core.models.runs import ( - InstanceStatus, + ImagePullProgress, Job, + JobConnectionInfo, JobProvisioningData, + JobRuntimeData, JobSpec, JobStatus, JobSubmission, + JobTerminationReason, + RegistryAuth, RunSpec, ) -from dstack._internal.core.services.ssh import tunnel as ssh_tunnel -from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel -from dstack._internal.server.services.backends import 
get_project_backend_by_type -from dstack._internal.server.services.jobs.configurators.base import JobConfigurator +from dstack._internal.core.models.volumes import Volume, VolumeMountPoint, VolumeStatus +from dstack._internal.server import settings +from dstack._internal.server.models import ( + InstanceModel, + JobModel, + ProjectModel, + RunModel, + VolumeModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services import volumes as volumes_services +from dstack._internal.server.services.ides import get_ide +from dstack._internal.server.services.instances import ( + get_instance_ssh_private_keys, +) +from dstack._internal.server.services.jobs.configurators.base import ( + JobConfigurator, + interpolate_job_volumes, +) from dstack._internal.server.services.jobs.configurators.dev import DevEnvironmentJobConfigurator from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.probes import probe_model_to_probe from dstack._internal.server.services.runner import client -from dstack._internal.server.services.runner.ssh import get_runner_ports, runner_ssh_tunnel -from dstack._internal.server.services.volumes import volume_model_to_volume -from dstack._internal.server.utils.common import run_async, wait_to_lock -from dstack._internal.utils.common import get_current_datetime +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from dstack._internal.server.services.sshproxy import ( + build_proxied_job_ssh_command, + build_proxied_job_ssh_url_authority, + build_proxied_job_upstream_id, +) +from dstack._internal.utils import common +from dstack._internal.utils.common import run_async +from dstack._internal.utils.interpolator import VariablesInterpolator from dstack._internal.utils.logging 
import get_logger +from dstack._internal.utils.ssh import build_ssh_command, build_ssh_url_authority logger = get_logger(__name__) -# TODO Make locks per project -SUBMITTED_PROCESSING_JOBS_LOCK = asyncio.Lock() -SUBMITTED_PROCESSING_JOBS_IDS = set() -RUNNING_PROCESSING_JOBS_LOCK = asyncio.Lock() -RUNNING_PROCESSING_JOBS_IDS = set() +def switch_job_status( + session: AsyncSession, + job_model: JobModel, + new_status: JobStatus, + actor: events.AnyActor = events.SystemActor(), +): + """ + Switch job status. + + **NOTE**: When switching to `TERMINATING`, set `termination_reason` and preferably + `termination_reason_message` before calling this function. + """ + old_status = job_model.status + if old_status == new_status: + return + + job_model.status = new_status + emit_job_status_change_event( + session=session, + job_model=job_model, + old_status=old_status, + new_status=new_status, + termination_reason=job_model.termination_reason, + termination_reason_message=job_model.termination_reason_message, + actor=actor, + ) -PROCESSING_POOL_LOCK = asyncio.Lock() -PROCESSING_POOL_IDS = set() -TERMINATING_PROCESSING_JOBS_LOCK = asyncio.Lock() -TERMINATING_PROCESSING_JOBS_IDS = set() +def get_job_status_change_message( + old_status: JobStatus, + new_status: JobStatus, + termination_reason: Optional[JobTerminationReason], + termination_reason_message: Optional[str], +) -> str: + msg = f"Job status changed {old_status.upper()} -> {new_status.upper()}" + if new_status == JobStatus.TERMINATING: + if termination_reason is None: + raise ValueError("termination_reason must be set when switching to TERMINATING status") + msg += f". 
Termination reason: {termination_reason.upper()}" + if termination_reason_message: + msg += f" ({termination_reason_message})" + return msg + + +def emit_job_status_change_event( + session: AsyncSession, + job_model: JobModel, + old_status: JobStatus, + new_status: JobStatus, + termination_reason: Optional[JobTerminationReason], + termination_reason_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + events.emit( + session, + get_job_status_change_message( + old_status=old_status, + new_status=new_status, + termination_reason=termination_reason, + termination_reason_message=termination_reason_message, + ), + actor=actor, + targets=[events.Target.from_model(job_model)], + ) -async def get_jobs_from_run_spec(run_spec: RunSpec, replica_num: int) -> List[Job]: +async def get_jobs_from_run_spec( + run_spec: RunSpec, + secrets: Dict[str, str], + replica_num: int, + replica_group_name: Optional[str] = None, +) -> List[Job]: return [ Job(job_spec=s, job_submissions=[]) - for s in await get_job_specs_from_run_spec(run_spec, replica_num) + for s in await get_job_specs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=replica_num, + replica_group_name=replica_group_name, + ) ] -async def get_job_specs_from_run_spec(run_spec: RunSpec, replica_num: int) -> List[JobSpec]: - job_configurator = _get_job_configurator(run_spec) +async def get_job_specs_from_run_spec( + run_spec: RunSpec, + secrets: Dict[str, str], + replica_num: int, + replica_group_name: Optional[str] = None, +) -> List[JobSpec]: + job_configurator = _get_job_configurator( + run_spec=run_spec, secrets=secrets, replica_group_name=replica_group_name + ) job_specs = await job_configurator.get_job_specs(replica_num=replica_num) return job_specs -def job_model_to_job_submission(job_model: JobModel) -> JobSubmission: - job_provisioning_data = None - if job_model.job_provisioning_data is not None: - job_provisioning_data = 
JobProvisioningData.__response__.parse_raw( - job_model.job_provisioning_data +def interpolate_job_spec_secrets(job_spec: JobSpec, secrets: Mapping[str, str]) -> None: + interpolate = VariablesInterpolator({"secrets": secrets}).interpolate_or_error + job_spec.env = {k: interpolate(v) for k, v in job_spec.env.items()} + if job_spec.registry_auth is not None: + job_spec.registry_auth = RegistryAuth( + username=interpolate(job_spec.registry_auth.username), + password=interpolate(job_spec.registry_auth.password), ) + + +def find_job(jobs: List[Job], replica_num: int, job_num: int) -> Job: + for job in jobs: + if job.job_spec.replica_num == replica_num and job.job_spec.job_num == job_num: + return job + raise ResourceNotExistsError( + f"Job with replica_num={replica_num} and job_num={job_num} not found" + ) + + +def find_jobs( + jobs: List[Job], + replica_num: Optional[int] = None, + job_num: Optional[int] = None, +) -> list[Job]: + res = jobs + if replica_num is not None: + res = [j for j in res if j.job_spec.replica_num == replica_num] + if job_num is not None: + res = [j for j in res if j.job_spec.job_num == job_num] + return res + + +async def get_run_job_model( + session: AsyncSession, + project: ProjectModel, + run_name: str, + run_id: Optional[UUID], + replica_num: int, + job_num: int, +) -> Optional[JobModel]: + filters = [ + RunModel.project_id == project.id, + RunModel.run_name == run_name, + JobModel.replica_num == replica_num, + JobModel.job_num == job_num, + ] + if run_id is not None: + filters.append(RunModel.id == run_id) + else: + # Assuming run_name is unique for non-deleted runs + filters.append(RunModel.deleted == False) + res = await session.execute( + select(JobModel) + .join(JobModel.run) + .where(*filters) + .order_by(JobModel.submission_num.desc()) + .limit(1) + ) + return res.scalar_one_or_none() + + +def job_model_to_job_submission( + job_model: JobModel, include_probes: bool = False +) -> JobSubmission: + job_provisioning_data = 
get_job_provisioning_data(job_model) + if job_provisioning_data is not None: # TODO remove after transitioning to computed fields job_provisioning_data.instance_type.resources.description = ( job_provisioning_data.instance_type.resources.pretty_format() ) + # TODO do we really still need this magic? See https://github.com/dstackai/dstack/pull/1682 + # i.e., replacing `jpd.backend` with `jpd.get_base_backend()` should give the same result if ( job_provisioning_data.backend == BackendType.DSTACK and job_provisioning_data.backend_data is not None ): backend_data = json.loads(job_provisioning_data.backend_data) job_provisioning_data.backend = backend_data["base_backend"] - last_processed_at = job_model.last_processed_at.replace(tzinfo=timezone.utc) + last_processed_at = job_model.last_processed_at finished_at = None if job_model.status.is_finished(): finished_at = last_processed_at + status_message = _get_job_status_message(job_model) + error = _get_job_error(job_model) + probes = [] + if include_probes: + probes = [probe_model_to_probe(pm) for pm in job_model.probes] return JobSubmission( id=job_model.id, submission_num=job_model.submission_num, - submitted_at=job_model.submitted_at.replace(tzinfo=timezone.utc), + deployment_num=job_model.deployment_num, + submitted_at=job_model.submitted_at, last_processed_at=last_processed_at, finished_at=finished_at, + inactivity_secs=job_model.inactivity_secs, status=job_model.status, - termination_reason=job_model.termination_reason, + status_message=status_message, + termination_reason=job_model.termination_reason.value + if job_model.termination_reason + else None, termination_reason_message=job_model.termination_reason_message, + exit_status=job_model.exit_status, job_provisioning_data=job_provisioning_data, + job_runtime_data=get_job_runtime_data(job_model), + error=error, + probes=probes, + image_pull_progress=_get_image_pull_progress(job_model), ) -def find_job(jobs: List[Job], replica_num: int, 
job_num: int) -> Job: - for job in jobs: - if job.job_spec.replica_num == replica_num and job.job_spec.job_num == job_num: - return job - raise ComputeResourceNotFoundError( - f"Job with replica_num={replica_num} and job_num={job_num} not found" - ) +def get_job_provisioning_data(job_model: JobModel) -> Optional[JobProvisioningData]: + if job_model.job_provisioning_data is None: + return None + return JobProvisioningData.__response__.parse_raw(job_model.job_provisioning_data) -async def terminate_job_provisioning_data_instance( - project: ProjectModel, job_provisioning_data: JobProvisioningData -): - backend = await get_project_backend_by_type( - project=project, - backend_type=job_provisioning_data.backend, - ) - if backend is None: - logger.error( - "Failed to terminate the instance. " - f"Backend {job_provisioning_data.backend} is not configured in project {project.name}." - ) - return - logger.debug("Terminating runner instance %s", job_provisioning_data.hostname) - await run_async( - backend.compute().terminate_instance, - job_provisioning_data.instance_id, - job_provisioning_data.region, - job_provisioning_data.backend_data, - ) +def get_job_runtime_data(job_model: JobModel) -> Optional[JobRuntimeData]: + if job_model.job_runtime_data is None: + return None + return JobRuntimeData.__response__.parse_raw(job_model.job_runtime_data) + + +def _get_image_pull_progress(job_model: JobModel) -> Optional[ImagePullProgress]: + if job_model.image_pull_progress is None: + return None + return ImagePullProgress.__response__.parse_raw(job_model.image_pull_progress) + + +def get_job_spec(job_model: JobModel) -> JobSpec: + return JobSpec.__response__.parse_raw(job_model.job_spec_data) def delay_job_instance_termination(job_model: JobModel): - job_model.remove_at = get_current_datetime() + datetime.timedelta(seconds=15) + job_model.remove_at = common.get_current_datetime() + timedelta(seconds=15) + + +def is_multinode_job(job: Job) -> bool: + return 
job.job_spec.jobs_per_replica > 1 + +def is_master_job(job: Job) -> bool: + return job.job_spec.job_num == 0 -def _get_job_configurator(run_spec: RunSpec) -> JobConfigurator: + +def _get_job_configurator( + run_spec: RunSpec, secrets: Dict[str, str], replica_group_name: Optional[str] = None +) -> JobConfigurator: configuration_type = RunConfigurationType(run_spec.configuration.type) configurator_class = _configuration_type_to_configurator_class_map[configuration_type] - return configurator_class(run_spec) + return configurator_class( + run_spec=run_spec, secrets=secrets, replica_group_name=replica_group_name + ) _job_configurator_classes = [ @@ -150,163 +332,32 @@ def _get_job_configurator(run_spec: RunSpec) -> JobConfigurator: _configuration_type_to_configurator_class_map = {c.TYPE: c for c in _job_configurator_classes} -async def stop_runner(session: AsyncSession, job_model: JobModel): - project = await session.get(ProjectModel, job_model.project_id) - ssh_private_key = project.ssh_private_key - - res = await session.execute( - sa.select(InstanceModel).where( - InstanceModel.project_id == job_model.project_id, InstanceModel.job_id == job_model.id - ) - ) - instance: Optional[InstanceModel] = res.scalar() - - if instance and instance.remote_connection_info is not None: - remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw( - instance.remote_connection_info - ) - ssh_private_key = remote_conn_info.ssh_keys[0].private - try: - await run_async(_stop_runner, job_model, ssh_private_key) - delay_job_instance_termination(job_model) - except SSHError: - logger.debug("%s: failed to stop runner", fmt(job_model)) - - -def _stop_runner( - job_model: JobModel, - server_ssh_private_key: str, -): - jpd = JobProvisioningData.__response__.parse_raw(job_model.job_provisioning_data) - logger.debug("%s: stopping runner %s", fmt(job_model), jpd.hostname) - ports = get_runner_ports() - with ssh_tunnel.RunnerTunnel( - hostname=jpd.hostname, - 
ssh_port=jpd.ssh_port, - user=jpd.username, - ports=ports, - id_rsa=server_ssh_private_key, - ssh_proxy=jpd.ssh_proxy, - ): - runner_client = client.RunnerClient(port=ports[client.REMOTE_RUNNER_PORT]) - runner_client.stop() - - -async def process_terminating_job(session: AsyncSession, job_model: JobModel): +async def stop_runner(job_model: JobModel, instance_model: InstanceModel): """ - Used by both process_terminating_jobs and process_terminating_run. - Caller must acquire the lock on the job. + Stops the runner using a preloaded instance model. + `instance_model.project` must be loaded because SSH key resolution uses the project keys. """ - if ( - job_model.remove_at is not None - and job_model.remove_at.replace(tzinfo=datetime.timezone.utc) > get_current_datetime() - ): - # it's too early to terminate the instance - return - - res = await session.execute( - sa.select(InstanceModel) - .where( - InstanceModel.project_id == job_model.project_id, - InstanceModel.job_id == job_model.id, - ) - .options( - sa_orm.joinedload(InstanceModel.project), - sa_orm.joinedload(InstanceModel.volumes), - ) - ) - instance: Optional[InstanceModel] = res.scalar() - - if instance is not None: - await wait_to_lock(PROCESSING_POOL_LOCK, PROCESSING_POOL_IDS, instance.id) + ssh_private_keys = get_instance_ssh_private_keys(instance_model) + jpd = get_job_provisioning_data(job_model) + if jpd is not None: + jrd = get_job_runtime_data(job_model) try: - await session.refresh(instance) - # there is an associated instance to empty - jpd = None - if job_model.job_provisioning_data is not None: - jpd = JobProvisioningData.__response__.parse_raw(job_model.job_provisioning_data) - logger.debug("%s: stopping container", fmt(job_model)) - ssh_private_key = instance.project.ssh_private_key - if instance and instance.remote_connection_info is not None: - remote_conn_info: RemoteConnectionInfo = ( - RemoteConnectionInfo.__response__.parse_raw( - instance.remote_connection_info - ) - ) - ssh_private_key 
= remote_conn_info.ssh_keys[0].private - await stop_container(job_model, jpd, ssh_private_key) - if len(instance.volumes) > 0: - logger.info("Detaching volumes: %s", [v.name for v in instance.volumes]) - await detach_volumes_from_instance( - project=instance.project, - instance=instance, - jpd=jpd, - ) - - if instance.status == InstanceStatus.BUSY: - instance.status = InstanceStatus.IDLE - elif instance.status != InstanceStatus.TERMINATED: - # instance was PROVISIONING (specially for the job) - # schedule for termination - instance.status = InstanceStatus.TERMINATING - - if jpd is None or not jpd.dockerized: - # do not reuse vastai/k8s instances - instance.status = InstanceStatus.TERMINATING - - instance.job_id = None - instance.last_job_processed_at = get_current_datetime() - logger.info( - "%s: instance '%s' has been released, new status is %s", - fmt(job_model), - instance.name, - instance.status.name, - ) - await gateways.unregister_replica( - session, job_model - ) # TODO(egor-s) ensure always runs + await run_async(_stop_runner, ssh_private_keys, jpd, jrd, job_model) + except SSHError: + logger.debug("%s: failed to stop runner", fmt(job_model)) - finally: - PROCESSING_POOL_IDS.remove(instance.id) - if job_model.termination_reason is not None: - job_model.status = job_model.termination_reason.to_status() - else: - job_model.status = JobStatus.FAILED - logger.warning("%s: job termination reason is not set", fmt(job_model)) - logger.info( - "%s: job status is %s, reason: %s", - fmt(job_model), - job_model.status.name, - job_model.termination_reason.name, - ) - - -async def stop_container( - job_model: JobModel, job_provisioning_data: JobProvisioningData, ssh_private_key: str +@runner_ssh_tunnel +def _stop_runner( + addresses: Mapping[int, client.LocalAddress], + job_model: JobModel, ): - if job_provisioning_data.dockerized: - # send a request to the shim to terminate the docker container - # SSHError and RequestException are caught in the `runner_ssh_tunner` 
decorator - await run_async( - _shim_submit_stop, - ssh_private_key, - job_provisioning_data, - job_model, - ) - - -@runner_ssh_tunnel(ports=[client.REMOTE_SHIM_PORT]) -def _shim_submit_stop(job_model: JobModel, ports: Dict[int, int]): - shim_client = client.ShimClient(port=ports[client.REMOTE_SHIM_PORT]) - - resp = shim_client.healthcheck() - if resp is None: - logger.debug("%s: can't stop container, shim is not available yet", fmt(job_model)) - return False # shim is not available yet - - # we force container deletion because the runner had time to gracefully stop the job - shim_client.stop(force=True) + logger.debug("%s: stopping runner", fmt(job_model)) + runner_client = client.RunnerClient.from_address(addresses[DSTACK_RUNNER_HTTP_PORT]) + try: + runner_client.stop() + except requests.RequestException: + logger.exception("%s: failed to stop runner gracefully", fmt(job_model)) def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, List[JobModel]]]: @@ -330,42 +381,239 @@ def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, Li yield replica_num, replica_jobs -async def detach_volumes_from_instance( +async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]: + res = await session.execute( + select(JobModel) + .where( + JobModel.status == JobStatus.TERMINATING, + JobModel.used_instance_id.is_not(None), + JobModel.volumes_detached_at.is_not(None), + ) + .options(load_only(JobModel.used_instance_id)) + ) + job_models = res.scalars().all() + return [jm.used_instance_id for jm in job_models if jm.used_instance_id] + + +async def get_job_configured_volumes( + session: AsyncSession, project: ProjectModel, - instance: InstanceModel, - jpd: JobProvisioningData, -): - backend = await get_project_backend_by_type( + run_spec: RunSpec, + job_num: int, + job_spec: Optional[JobSpec] = None, +) -> List[List[Volume]]: + """ + Returns a list of job volumes grouped by mount points. 
+ """ + volume_models = await get_job_configured_volume_models( + session=session, project=project, - backend_type=jpd.backend, + run_spec=run_spec, + job_num=job_num, + job_spec=job_spec, ) - if backend is None: - logger.error("Failed to detach volumes from %s. Backend not available.", instance.name) - return + return [ + [volumes_services.volume_model_to_volume(v) for v in mount_point_volume_models] + for mount_point_volume_models in volume_models + ] - detached_volumes = [] - for volume_model in instance.volumes: - volume = volume_model_to_volume(volume_model) - try: - await run_async( - backend.compute().detach_volume, - volume=volume, - instance_id=jpd.instance_id, - ) - detached_volumes.append(volume_model) - except BackendError as e: - logger.error( - "Failed to detach volume %s from %s: %s", - volume_model.name, - instance.name, - repr(e), + +async def get_job_configured_volume_models( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, + job_num: int, + job_spec: Optional[JobSpec] = None, +) -> List[List[VolumeModel]]: + """ + Returns a list of job volume models grouped by mount points. 
+ """ + job_volumes = None + if job_spec is not None: + job_volumes = job_spec.volumes + if job_volumes is None: + # job_spec not provided or a legacy job_spec without volumes + job_volumes = interpolate_job_volumes(run_spec.configuration.volumes, job_num) + volume_models = [] + for mount_point in job_volumes: + if not isinstance(mount_point, VolumeMountPoint): + continue + if isinstance(mount_point.name, str): + names = [mount_point.name] + else: + names = mount_point.name + mount_point_volume_models = [] + for name in names: + volume_model = await volumes_services.get_project_volume_model_by_name( + session=session, + project=project, + name=name, ) - except Exception: - logger.exception( - "Got exception when detaching volume %s from instance %s", - volume_model.name, - instance.name, + if volume_model is None: + raise ResourceNotExistsError(f"Volume {mount_point.name} not found") + if volume_model.to_be_deleted: + raise ServerClientError( + f"Volume {mount_point.name} is marked for deletion and cannot be attached" + ) + mount_point_volume_models.append(volume_model) + volume_models.append(mount_point_volume_models) + return volume_models + + +def check_can_attach_job_volumes(volumes: List[List[Volume]]): + """ + Performs basic checks if volumes can be attached. + This is useful to show error ASAP (when user submits the run). + If the attachment is to fail anyway, the error will be handled when processing submitted jobs. 
+ """ + if len(volumes) == 0: + return + expected_locations = {(v.get_backend(), v.get_region().lower()) for v in volumes[0]} + for mount_point_volumes in volumes: + locations = {(v.get_backend(), v.get_region().lower()) for v in mount_point_volumes} + if locations != expected_locations: + raise ServerClientError( + "Volumes from different locations specified for different mount points" ) + for volume in mount_point_volumes: + if volume.status != VolumeStatus.ACTIVE: + raise ServerClientError(f"Cannot mount volumes that are not active: {volume.name}") + volumes_names = [v.name for vs in volumes for v in vs] + if len(volumes_names) != len(set(volumes_names)): + raise ServerClientError("Cannot attach the same volume at different mount points") - detached_volumes_ids = {v.id for v in detached_volumes} - instance.volumes = [v for v in instance.volumes if v.id not in detached_volumes_ids] + +async def get_job_attached_volumes( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, + job_num: int, + job_provisioning_data: JobProvisioningData, +) -> List[Volume]: + """ + Returns volumes attached to the job. 
+ """ + job_configured_volumes = await get_job_configured_volumes( + session=session, + project=project, + run_spec=run_spec, + job_num=job_num, + ) + job_volumes = [] + for mount_point_volumes in job_configured_volumes: + job_volumes.append( + _get_job_mount_point_attached_volume(mount_point_volumes, job_provisioning_data) + ) + return job_volumes + + +def remove_job_spec_sensitive_info(spec: JobSpec): + spec.ssh_key = None + + +def get_job_connection_info(job_model: JobModel, run_spec: RunSpec) -> JobConnectionInfo: + # Run.attach() Python API method, used internally by CLI, uses the following as the Hostname + # in the SSH config: + # * for the (job=0 replica=0) job - run name, e.g., `my-task` + # * for other jobs - job name, e.g., `my-task-0-1` + attached_hostname = run_spec.run_name + if job_model.job_num != 0 or job_model.replica_num != 0: + attached_hostname = job_model.job_name + assert attached_hostname is not None + + # ide_* fields are for dev-environment only + ide_name: Optional[str] = None + # IDE URLs are not set until the job status is switched to RUNNING, + # as JobRuntimeData.working_dir, which is required to build URLs, is returned + # by dstack-runner's `/api/run` method + attached_ide_url: Optional[str] = None + proxied_ide_url: Optional[str] = None + if ( + run_spec.configuration.type == RunConfigurationType.DEV_ENVIRONMENT.value + and run_spec.configuration.ide is not None + ): + ide = get_ide(run_spec.configuration.ide) + if ide is not None: + ide_name = ide.name + jrd = get_job_runtime_data(job_model) + if jrd is not None and jrd.working_dir is not None: + attached_url_authority = build_ssh_url_authority(hostname=attached_hostname) + attached_ide_url = ide.get_url(attached_url_authority, jrd.working_dir) + proxied_url_authority = build_proxied_job_ssh_url_authority(job_model) + if proxied_url_authority is not None: + proxied_ide_url = ide.get_url(proxied_url_authority, jrd.working_dir) + + sshproxy_hostname: Optional[str] = None + 
sshproxy_port: Optional[int] = None + sshproxy_upstream_id: Optional[str] = None + if settings.SSHPROXY_ENABLED: + sshproxy_hostname = settings.SSHPROXY_HOSTNAME + sshproxy_port = settings.SSHPROXY_PORT + sshproxy_upstream_id = build_proxied_job_upstream_id(job_model) + + return JobConnectionInfo( + ide_name=ide_name, + attached_ide_url=attached_ide_url, + proxied_ide_url=proxied_ide_url, + attached_ssh_command=build_ssh_command(hostname=attached_hostname), + proxied_ssh_command=build_proxied_job_ssh_command(job_model), + sshproxy_hostname=sshproxy_hostname, + sshproxy_port=sshproxy_port, + sshproxy_upstream_id=sshproxy_upstream_id, + ) + + +def _get_job_mount_point_attached_volume( + volumes: List[Volume], + job_provisioning_data: JobProvisioningData, +) -> Volume: + """ + Returns the volume attached to the job among the list of possible mount point volumes. + """ + for volume in volumes: + if ( + volume.get_backend() != job_provisioning_data.get_base_backend() + or volume.get_region().lower() != job_provisioning_data.region.lower() + ): + continue + if ( + (volume_availability_zone := volume.get_availability_zone()) is not None + and job_provisioning_data.availability_zone is not None + and volume_availability_zone.lower() != job_provisioning_data.availability_zone.lower() + ): + continue + return volume + raise ServerClientError("Failed to find an eligible volume for the mount point") + + +def _get_job_status_message(job_model: JobModel) -> str: + if job_model.status == JobStatus.DONE: + return "exited (0)" + elif job_model.status == JobStatus.FAILED: + if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR: + return f"exited ({job_model.exit_status})" + elif ( + job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + ): + if ( + job_model.termination_reason_message + and "No matching fleet found" in job_model.termination_reason_message + ): + return "no fleets" + return "no offers" + elif 
job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY: + return "interrupted" + else: + return "error" + elif job_model.status == JobStatus.TERMINATED: + if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER: + return "stopped" + elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER: + return "aborted" + return job_model.status.value + + +def _get_job_error(job_model: JobModel) -> Optional[str]: + if job_model.termination_reason is None: + return None + return job_model.termination_reason.to_error() diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index e7ac35bd94..761745247d 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -1,30 +1,69 @@ import shlex import sys +import threading from abc import ABC, abstractmethod +from pathlib import PurePosixPath from typing import Dict, List, Optional +import orjson from cachetools import TTLCache, cached -import dstack.version as version +from dstack._internal import settings from dstack._internal.core.errors import DockerRegistryError, ServerClientError +from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( + DEFAULT_PROBE_INTERVAL, + DEFAULT_PROBE_METHOD, + DEFAULT_PROBE_READY_AFTER, + DEFAULT_PROBE_TIMEOUT, + DEFAULT_PROBE_UNTIL_READY, + DEFAULT_PROBE_URL, + DEFAULT_REPLICA_GROUP_NAME, + LEGACY_REPO_DIR, + OPENAI_MODEL_PROBE_TIMEOUT, + HTTPHeaderSpec, PortMapping, + ProbeConfig, PythonVersion, - RegistryAuth, + RepoExistsAction, RunConfigurationType, + ServiceConfiguration, +) +from dstack._internal.core.models.profiles import ( + DEFAULT_STOP_DURATION, + SpotPolicy, + UtilizationPolicy, ) -from dstack._internal.core.models.profiles import SpotPolicy from dstack._internal.core.models.runs import ( 
AppSpec, JobSpec, + JobSSHKey, + ProbeSpec, Requirements, Retry, RunSpec, ) +from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.core.models.unix import UnixUser +from dstack._internal.core.models.volumes import MountPoint, VolumeMountPoint from dstack._internal.core.services.profiles import get_retry from dstack._internal.core.services.ssh.ports import filter_reserved_ports -from dstack._internal.server.services.docker import ImageConfig, get_image_config -from dstack._internal.server.utils.common import run_async +from dstack._internal.server.services.docker import ( + ImageConfig, + apply_server_docker_defaults, + get_image_config, +) +from dstack._internal.utils import crypto +from dstack._internal.utils.common import run_async +from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import is_absolute_posix_path + +logger = get_logger(__name__) + + +DSTACK_DIR = "/dstack" +DSTACK_PROFILE_PATH = f"{DSTACK_DIR}/profile" def get_default_python_verison() -> str: @@ -39,15 +78,34 @@ def get_default_python_verison() -> str: ) -def get_default_image(python_version: str) -> str: - return f"dstackai/base:py{python_version}-{version.base_image}-cuda-12.1" +def get_default_image(nvcc: bool = False) -> str: + """ + Note: May be overridden by dstack (e.g., EFA-enabled version for AWS EFA-capable instances). + See `dstack._internal.server.services.backends.provisioning.resolve_provisioning_image` + for details. + + Args: + nvcc: If True, returns 'devel' variant, otherwise 'base'. 
+ """ + return f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" class JobConfigurator(ABC): TYPE: RunConfigurationType - def __init__(self, run_spec: RunSpec): + _image_config: Optional[ImageConfig] = None + # JobSSHKey should be shared for all jobs in a replica for inter-node communication. + _job_ssh_key: Optional[JobSSHKey] = None + + def __init__( + self, + run_spec: RunSpec, + secrets: Optional[Dict[str, str]] = None, + replica_group_name: Optional[str] = None, + ): self.run_spec = run_spec + self.secrets = secrets or {} + self.replica_group_name = replica_group_name async def get_job_specs(self, replica_num: int) -> List[JobSpec]: job_spec = await self._get_job_spec(replica_num=replica_num, job_num=0, jobs_per_replica=1) @@ -57,6 +115,10 @@ async def get_job_specs(self, replica_num: int) -> List[JobSpec]: def _shell_commands(self) -> List[str]: pass + @abstractmethod + def _default_single_branch(self) -> bool: + pass + @abstractmethod def _default_max_duration(self) -> Optional[int]: pass @@ -65,10 +127,35 @@ def _default_max_duration(self) -> Optional[int]: def _spot_policy(self) -> SpotPolicy: pass + def _reservation(self) -> Optional[str]: + return self.run_spec.merged_profile.reservation + @abstractmethod def _ports(self) -> List[PortMapping]: pass + async def _get_image_config(self) -> ImageConfig: + if self._image_config is not None: + return self._image_config + interpolate = VariablesInterpolator({"secrets": self.secrets}).interpolate_or_error + registry_auth = self.run_spec.configuration.registry_auth + if registry_auth is not None: + try: + registry_auth = RegistryAuth( + username=interpolate(registry_auth.username), + password=interpolate(registry_auth.password), + ) + except InterpolatorError as e: + raise ServerClientError(e.args[0]) + image_name, registry_auth = apply_server_docker_defaults(self._image_name(), registry_auth) + 
image_config = await run_async( + _get_image_config, + image_name, + registry_auth, + ) + self._image_config = image_config + return image_config + async def _get_job_spec( self, replica_num: int, @@ -80,35 +167,56 @@ async def _get_job_spec( job_num=job_num, job_name=f"{self.run_spec.run_name}-{job_num}-{replica_num}", jobs_per_replica=jobs_per_replica, + replica_group=self.replica_group_name or DEFAULT_REPLICA_GROUP_NAME, app_specs=self._app_specs(), commands=await self._commands(), env=self._env(), home_dir=self._home_dir(), image_name=self._image_name(), + user=await self._user(), + privileged=self._privileged(), + single_branch=self._single_branch(), max_duration=self._max_duration(), + stop_duration=self._stop_duration(), + utilization_policy=self._utilization_policy(), registry_auth=self._registry_auth(), - requirements=self._requirements(), + requirements=self._requirements(jobs_per_replica), retry=self._retry(), working_dir=self._working_dir(), + volumes=self._volumes(job_num), + ssh_key=self._ssh_key(jobs_per_replica), + repo_data=self.run_spec.repo_data, + repo_code_hash=self.run_spec.repo_code_hash, + repo_dir=self._repo_dir(), + repo_exists_action=self._repo_exists_action(), + file_archives=self.run_spec.file_archives, + service_port=self._service_port(), + probes=self._probes(), ) return job_spec + def _shell(self) -> str: + shell = self.run_spec.configuration.shell + if shell is not None: + path = PurePosixPath(shell) + if path.is_absolute(): + return shell + return str("/bin" / path) + if self.run_spec.configuration.image is None: # dstackai/base + return "/bin/bash" + return "/bin/sh" + async def _commands(self) -> List[str]: if self.run_spec.configuration.entrypoint is not None: # docker-like format + assert self.run_spec.configuration.type != "dev-environment" entrypoint = shlex.split(self.run_spec.configuration.entrypoint) commands = self.run_spec.configuration.commands - elif self.run_spec.configuration.image is None: # dstackai/base - 
entrypoint = ["/bin/bash", "-i", "-c"] - commands = [_join_shell_commands(self._shell_commands())] - elif self._shell_commands(): # custom docker image with shell commands - entrypoint = ["/bin/sh", "-i", "-c"] - commands = [_join_shell_commands(self._shell_commands())] + elif shell_commands := self._shell_commands(): + entrypoint = [self._shell(), "-i", "-c"] + dstack_image_commands = self._dstack_image_commands() + commands = [_join_shell_commands(dstack_image_commands + shell_commands)] else: # custom docker image without commands - image_config = await run_async( - _get_image_config, - self.run_spec.configuration.image, - self.run_spec.configuration.registry_auth, - ) + image_config = await self._get_image_config() entrypoint = image_config.entrypoint or [] commands = image_config.cmd or [] @@ -121,6 +229,38 @@ async def _commands(self) -> List[str]: return result + def _dstack_image_commands(self) -> List[str]: + if self.run_spec.configuration.docker is True: + return ["start-dockerd"] + if ( + self.run_spec.configuration.image is not None + or self.run_spec.configuration.entrypoint is not None + ): + return [] + return [ + f"eval $(echo 'export DSTACK_VENV_DIR={DSTACK_DIR}/venv' | sudo tee -a {DSTACK_PROFILE_PATH})", + # Make sure /dstack/venv is owned by the current user. + # XXX: Generally, /dstack and all its descendants should be owned by root, as it is + # intended to be a place for files shared by all users, but since a non-root user + # should be able to install packages via pip and we want to avoid cluttering the user's + # home dir if possible, we make the venv dir owned by the current user rather than + # creating it inside the user's home or (even worse) making /dstack/venv + # world-writable. 
+ "sudo rm -rf $DSTACK_VENV_DIR", + "sudo mkdir $DSTACK_VENV_DIR", + "sudo chown $(id -u):$(id -g) $DSTACK_VENV_DIR", + # `uv` may emit: + # > warning: `VIRTUAL_ENV=/dstack/venv` does not match the project environment path + # > `.venv` and will be ignored; use `--active` to target the active environment + # > instead + # Safe to ignore, reusing dstack's venv for `uv` is discouraged (it should only be + # used for legacy `pip`-based configurations). `--no-active` suppresses the warning. + # Alternatively, the user can call `deactivate` once before using `uv`. + # If the user really wants to reuse dstack's venv, they must specify `--active`. + f"uv venv -q --prompt dstack -p {self._python()} --seed $DSTACK_VENV_DIR", + f"eval $(echo '. $DSTACK_VENV_DIR/bin/activate' | sudo tee -a {DSTACK_PROFILE_PATH})", + ] + def _app_specs(self) -> List[AppSpec]: specs = [] for i, pm in enumerate(filter_reserved_ports(self._ports())): @@ -134,15 +274,36 @@ def _app_specs(self) -> List[AppSpec]: return specs def _env(self) -> Dict[str, str]: - return self.run_spec.configuration.env + return self.run_spec.configuration.env.as_dict() def _home_dir(self) -> Optional[str]: return self.run_spec.configuration.home_dir def _image_name(self) -> str: - if self.run_spec.configuration.image is not None: + if self.run_spec.configuration.docker is True: + return settings.DSTACK_DIND_IMAGE + elif self.run_spec.configuration.image is not None: return self.run_spec.configuration.image - return get_default_image(self._python()) + return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc)) + + async def _user(self) -> Optional[UnixUser]: + user = self.run_spec.configuration.user + if user is None and self.run_spec.configuration.image is not None: + image_config = await self._get_image_config() + user = image_config.user + if user is None: + return None + return UnixUser.parse(user) + + def _privileged(self) -> bool: + if self.run_spec.configuration.docker is True: + return True + return 
self.run_spec.configuration.privileged + + def _single_branch(self) -> bool: + if self.run_spec.configuration.single_branch is None: + return self._default_single_branch() + return self.run_spec.configuration.single_branch def _max_duration(self) -> Optional[int]: if self.run_spec.merged_profile.max_duration is None: @@ -151,36 +312,186 @@ def _max_duration(self) -> Optional[int]: return None return self.run_spec.merged_profile.max_duration + def _stop_duration(self) -> Optional[int]: + if self.run_spec.merged_profile.stop_duration is None: + return DEFAULT_STOP_DURATION + if self.run_spec.merged_profile.stop_duration == "off": + return None + return self.run_spec.merged_profile.stop_duration + + def _utilization_policy(self) -> Optional[UtilizationPolicy]: + return self.run_spec.merged_profile.utilization_policy + def _registry_auth(self) -> Optional[RegistryAuth]: return self.run_spec.configuration.registry_auth - def _requirements(self) -> Requirements: + def _requirements(self, jobs_per_replica: int) -> Requirements: + resources = self.run_spec.configuration.resources + if self.run_spec.configuration.type == "service": + for group in self.run_spec.configuration.replica_groups: + if group.name == self.replica_group_name: + resources = group.resources + break spot_policy = self._spot_policy() return Requirements( - resources=self.run_spec.configuration.resources, + resources=resources, max_price=self.run_spec.merged_profile.max_price, spot=None if spot_policy == SpotPolicy.AUTO else (spot_policy == SpotPolicy.SPOT), + reservation=self._reservation(), + multinode=jobs_per_replica > 1, + backend_options=self.run_spec.merged_profile.backend_options, ) def _retry(self) -> Optional[Retry]: return get_retry(self.run_spec.merged_profile) + def _repo_dir(self) -> str: + """ + Returns absolute or relative path + """ + if repos := self.run_spec.configuration.repos: + return repos[0].path + # `repo_dir` may be set while `repos` is empty if the RunSpec was submitted before 
0.20.0 + repo_dir = self.run_spec.repo_dir + # We need this fallback indefinitely, as there may be RunSpecs submitted before + # `repos[].path` was added, and JobSpec is regenerated from RunSpec on each retry + # and in-place update. + if repo_dir is None: + return LEGACY_REPO_DIR + return repo_dir + + def _repo_exists_action(self) -> Optional[RepoExistsAction]: + if not (repos := self.run_spec.configuration.repos): + # One of: + # - The configuration without repo submitted by any client. + # - The configuration _with_ repo submitted by pre-0.20.0 client (the `repos` option + # is always excluded by pre-0.20.0 clients for compatibility with pre-0.20.0 servers) + # In either case, we return None, and runner falls back to "skip" action if needed + # (the second case, the only action hardcoded in pre-0.20.0 runners) + return None + return repos[0].if_exists + def _working_dir(self) -> Optional[str]: """ - None means default working directory + Returns absolute path or None + + None means the default working directory taken from the image """ - return self.run_spec.working_dir + working_dir = self.run_spec.configuration.working_dir + if working_dir is None or is_absolute_posix_path(working_dir): + return working_dir + # Support for pre-0.20.0 configurations + return str(PurePosixPath(LEGACY_REPO_DIR) / working_dir) def _python(self) -> str: if self.run_spec.configuration.python is not None: return self.run_spec.configuration.python.value return get_default_python_verison() + def _volumes(self, job_num: int) -> List[MountPoint]: + return interpolate_job_volumes(self.run_spec.configuration.volumes, job_num) -def _join_shell_commands(commands: List[str], env: Optional[Dict[str, str]] = None) -> str: - if env is None: - env = {} - commands = [f"export {k}={v}" for k, v in env.items()] + commands + def _ssh_key(self, jobs_per_replica: int) -> Optional[JobSSHKey]: + if jobs_per_replica < 2: + return None + if self._job_ssh_key is None: + private, public = 
crypto.generate_rsa_key_pair_bytes(comment="dstack_job") + self._job_ssh_key = JobSSHKey( + private=private.decode(), + public=public.decode(), + ) + return self._job_ssh_key + + def _service_port(self) -> Optional[int]: + if isinstance(self.run_spec.configuration, ServiceConfiguration): + return self.run_spec.configuration.port.container_port + return None + + def _probes(self) -> list[ProbeSpec]: + if isinstance(self.run_spec.configuration, ServiceConfiguration): + probes = self.run_spec.configuration.probes + if probes is not None: + return list(map(_probe_config_to_spec, probes)) + # Generate default probe if model is set + model = self.run_spec.configuration.model + if isinstance(model, OpenAIChatModel): + return [_openai_model_probe_spec(model.name, model.prefix)] + return [] + + +def interpolate_job_volumes( + run_volumes: List[MountPoint], + job_num: int, +) -> List[MountPoint]: + if len(run_volumes) == 0: + return [] + interpolator = VariablesInterpolator( + namespaces={ + "dstack": { + "job_num": str(job_num), + "node_rank": str(job_num), # an alias for job_num + } + } + ) + job_volumes = [] + for mount_point in run_volumes: + if not isinstance(mount_point, VolumeMountPoint): + job_volumes.append(mount_point.copy()) + continue + if isinstance(mount_point.name, str): + names = [mount_point.name] + else: + names = mount_point.name + try: + interpolated_names = [interpolator.interpolate_or_error(n) for n in names] + except InterpolatorError as e: + raise ServerClientError(e.args[0]) + job_volumes.append( + VolumeMountPoint( + name=interpolated_names, + path=mount_point.path, + ) + ) + return job_volumes + + +def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec: + return ProbeSpec( + type=c.type, + url=c.url if c.url is not None else DEFAULT_PROBE_URL, + timeout=c.timeout if c.timeout is not None else DEFAULT_PROBE_TIMEOUT, + interval=c.interval if c.interval is not None else DEFAULT_PROBE_INTERVAL, + ready_after=c.ready_after if c.ready_after is not None 
else DEFAULT_PROBE_READY_AFTER, + method=c.method if c.method is not None else DEFAULT_PROBE_METHOD, + headers=c.headers, + body=c.body, + until_ready=c.until_ready if c.until_ready is not None else DEFAULT_PROBE_UNTIL_READY, + ) + + +def _openai_model_probe_spec(model_name: str, prefix: str) -> ProbeSpec: + body = orjson.dumps( + { + "model": model_name, + "messages": [{"role": "user", "content": "hi"}], + "max_tokens": 1, + } + ).decode("utf-8") + return ProbeSpec( + type="http", + method="post", + url=prefix.rstrip("/") + "/chat/completions", + headers=[ + HTTPHeaderSpec(name="Content-Type", value="application/json"), + ], + body=body, + timeout=OPENAI_MODEL_PROBE_TIMEOUT, + interval=DEFAULT_PROBE_INTERVAL, + ready_after=DEFAULT_PROBE_READY_AFTER, + ) + + +def _join_shell_commands(commands: List[str]) -> str: for i, cmd in enumerate(commands): cmd = cmd.strip() if cmd.endswith("&"): # escape background command @@ -189,7 +500,10 @@ def _join_shell_commands(commands: List[str], env: Optional[Dict[str, str]] = No return " && ".join(commands) -@cached(TTLCache(maxsize=2048, ttl=80)) +@cached( + cache=TTLCache(maxsize=2048, ttl=80), + lock=threading.Lock(), +) def _get_image_config(image: str, registry_auth: Optional[RegistryAuth]) -> ImageConfig: try: return get_image_config(image, registry_auth).config diff --git a/src/dstack/_internal/server/services/jobs/configurators/dev.py b/src/dstack/_internal/server/services/jobs/configurators/dev.py index 13d75a0f60..e4ee0a2d56 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/dev.py +++ b/src/dstack/_internal/server/services/jobs/configurators/dev.py @@ -1,52 +1,70 @@ -from typing import List, Optional +from typing import Dict, List, Optional +from dstack._internal.core.errors import ServerClientError from dstack._internal.core.models.configurations import PortMapping, RunConfigurationType from dstack._internal.core.models.profiles import SpotPolicy from dstack._internal.core.models.runs import RunSpec 
+from dstack._internal.server.services.ides import get_ide from dstack._internal.server.services.jobs.configurators.base import JobConfigurator -from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop - -DEFAULT_MAX_DURATION_SECONDS = 6 * 3600 INSTALL_IPYKERNEL = ( - "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || " - 'echo "no pip, ipykernel was not installed"' + "(echo 'uv pip install ipykernel...' && uv pip install -q --no-cache-dir ipykernel 2> /dev/null) || " + "(echo 'pip install ipykernel...' && pip install -q --no-cache-dir ipykernel 2> /dev/null) || " + "echo 'no uv or pip found, ipykernel was not installed'" ) class DevEnvironmentJobConfigurator(JobConfigurator): TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT - def __init__(self, run_spec: RunSpec): - self.ide = VSCodeDesktop( - run_name=run_spec.run_name, - version=run_spec.configuration.version, - extensions=["ms-python.python", "ms-toolsai.jupyter"], - ) - super().__init__(run_spec) + ide_extensions = ["ms-python.python", "ms-toolsai.jupyter"] + + def __init__( + self, run_spec: RunSpec, secrets: Dict[str, str], replica_group_name: Optional[str] = None + ): + assert run_spec.configuration.type == "dev-environment" + + if run_spec.configuration.ide is None: + self.ide = None + else: + ide = get_ide(run_spec.configuration.ide) + if ide is None: + raise ServerClientError(f"Unsupported IDE: {run_spec.configuration.ide}") + self.ide = ide + super().__init__(run_spec=run_spec, secrets=secrets, replica_group_name=replica_group_name) def _shell_commands(self) -> List[str]: - # preserve environment variables for SSH clients - commands = ["env >> ~/.ssh/environment"] - commands += self.ide.get_install_commands() + assert self.run_spec.configuration.type == "dev-environment" + + commands = [] + if self.ide is not None: + commands += self.ide.get_install_commands( + version=self.run_spec.configuration.version, 
extensions=self.ide_extensions + ) commands.append(INSTALL_IPYKERNEL) commands += self.run_spec.configuration.setup - commands.append("echo ''") - commands += self.ide.get_print_readme_commands() + commands.append("echo") + commands += self.run_spec.configuration.init + if self.ide is not None: + assert self.run_spec.run_name is not None + commands += self.ide.get_print_readme_commands(self.run_spec.run_name) commands += [ f"echo 'To connect via SSH, use: `ssh {self.run_spec.run_name}`'", - "echo ''", + "echo", "echo -n 'To exit, press Ctrl+C.'", ] - commands += self.run_spec.configuration.init commands += ["tail -f /dev/null"] # idle return commands + def _default_single_branch(self) -> bool: + return False + def _default_max_duration(self) -> Optional[int]: - return DEFAULT_MAX_DURATION_SECONDS + return None def _spot_policy(self) -> SpotPolicy: return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND def _ports(self) -> List[PortMapping]: + assert self.run_spec.configuration.type == "dev-environment" return self.run_spec.configuration.ports diff --git a/src/dstack/_internal/server/services/jobs/configurators/extensions/base.py b/src/dstack/_internal/server/services/jobs/configurators/extensions/base.py deleted file mode 100644 index 73f30036f7..0000000000 --- a/src/dstack/_internal/server/services/jobs/configurators/extensions/base.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Callable, List - -CommandsExtension = Callable[[], List[str]] - - -def get_required_commands(executables: List[str]) -> CommandsExtension: - def wrapper() -> List[str]: - commands = [] - for exe in executables: - commands.append( - f'((command -v {exe} > /dev/null) || (echo "{exe} is required" && exit 1))' - ) - return commands - - return wrapper diff --git a/src/dstack/_internal/server/services/jobs/configurators/extensions/vscode.py b/src/dstack/_internal/server/services/jobs/configurators/extensions/vscode.py deleted file mode 100644 index f38c4fcec6..0000000000 
--- a/src/dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List - - -class VSCodeDesktop: - def __init__( - self, - run_name: str, - version: str, - extensions: List[str], - ): - self.run_name = run_name - self.version = version - self.extensions = extensions - - def get_install_commands(self) -> List[str]: - commands = [] - if self.version is not None: - url = f"https://update.code.visualstudio.com/commit:{self.version}/server-linux-$arch/stable" - archive = "vscode-server-linux-$arch.tar.gz" - target = f'~/.vscode-server/bin/"{self.version}"' - commands.extend( - [ - 'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi', - "mkdir -p /tmp", - f'wget -q --show-progress "{url}" -O "/tmp/{archive}"', - f"mkdir -vp {target}", - f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"', - f'rm "/tmp/{archive}"', - ] - ) - if self.extensions: - extensions = " ".join(f'--install-extension "{name}"' for name in self.extensions) - commands.append(f'PATH="$PATH":{target}/bin code-server {extensions}') - return commands - - def get_print_readme_commands(self) -> List[str]: - return [ - "echo To open in VS Code Desktop, use link below:", - "echo ''", - f"echo ' vscode://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR - "echo ''", - ] diff --git a/src/dstack/_internal/server/services/jobs/configurators/service.py b/src/dstack/_internal/server/services/jobs/configurators/service.py index 19e77f89b4..45bc4c8f72 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/service.py +++ b/src/dstack/_internal/server/services/jobs/configurators/service.py @@ -1,24 +1,128 @@ from typing import List, Optional -from dstack._internal.core.models.configurations import PortMapping, RunConfigurationType +from dstack._internal import settings +from dstack._internal.core.models.configurations import ( + PortMapping, 
+ ReplicaGroup, + RunConfigurationType, +) from dstack._internal.core.models.profiles import SpotPolicy -from dstack._internal.server.services.jobs.configurators.base import JobConfigurator +from dstack._internal.core.models.unix import UnixUser +from dstack._internal.server.services.jobs.configurators.base import ( + JobConfigurator, + get_default_image, +) class ServiceJobConfigurator(JobConfigurator): TYPE: RunConfigurationType = RunConfigurationType.SERVICE + def _current_replica_group(self) -> Optional[ReplicaGroup]: + assert self.run_spec.configuration.type == "service" + for group in self.run_spec.configuration.replica_groups: + if group.name == self.replica_group_name: + return group + return None + def _shell_commands(self) -> List[str]: + assert self.run_spec.configuration.type == "service" + group = self._current_replica_group() + if group is not None: + return group.commands return self.run_spec.configuration.commands + def _image_name(self) -> str: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return settings.DSTACK_DIND_IMAGE + if group.image is not None: + return group.image + if group.nvcc is True: + return get_default_image(nvcc=True) + return super()._image_name() + + def _privileged(self) -> bool: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return True + if group.privileged is not None: + return group.privileged + return super()._privileged() + + def _dstack_image_commands(self) -> List[str]: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return ["start-dockerd"] + if group.image is not None: + return [] + return super()._dstack_image_commands() + + def _shell(self) -> str: + # Shell resolution order: + # 1. If `shell:` is set explicitly, the base honors it. + # 2. If this group sets `docker: true`, use /bin/bash — the + # DIND image ships bash, matching the service-level path. + # 3. 
If this group sets its own `image`, force /bin/sh. The + # base returns /bin/bash when service-level `image` is None, + # but a group-level custom image (e.g. alpine) may not ship + # bash. + # 4. Otherwise defer to the base (bash for dstackai/base, sh + # for a service-level custom image). + if self.run_spec.configuration.shell is None: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return "/bin/bash" + if group.image is not None: + return "/bin/sh" + return super()._shell() + + async def _user(self) -> Optional[UnixUser]: + # Base `_user()` only queries the image for a default user when + # `configuration.image` is set at the service level. When the + # group supplies its own `image`, perform the lookup here so the + # container runs as that image's default user. + # + # We intentionally do NOT look up the DIND image when the group + # sets `docker: true`. That matches service-level behavior: when + # `configuration.docker is True`, `configuration.image` is None, + # so the base skips the lookup. DIND is always privileged and + # effectively root anyway. 
+ if self.run_spec.configuration.user is None: + group = self._current_replica_group() + if group is not None and group.image is not None: + image_config = await self._get_image_config() + if image_config.user is None: + return None + return UnixUser.parse(image_config.user) + return await super()._user() + + def _python(self) -> str: + group = self._current_replica_group() + if group is not None and group.python is not None: + return group.python.value + return super()._python() + + def _default_single_branch(self) -> bool: + return True + def _default_max_duration(self) -> Optional[int]: return None def _spot_policy(self) -> SpotPolicy: - return self.run_spec.merged_profile.spot_policy or SpotPolicy.AUTO + group = self._current_replica_group() + if group is not None and group.spot_policy is not None: + return group.spot_policy + return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND + + def _reservation(self) -> Optional[str]: + group = self._current_replica_group() + if group is not None and group.reservation is not None: + return group.reservation + return super()._reservation() def _ports(self) -> List[PortMapping]: return [] - - def _working_dir(self) -> Optional[str]: - return None if not self._shell_commands() else super()._working_dir() diff --git a/src/dstack/_internal/server/services/jobs/configurators/task.py b/src/dstack/_internal/server/services/jobs/configurators/task.py index 3908b654f5..51c136dfe7 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/task.py +++ b/src/dstack/_internal/server/services/jobs/configurators/task.py @@ -5,13 +5,12 @@ from dstack._internal.core.models.runs import JobSpec from dstack._internal.server.services.jobs.configurators.base import JobConfigurator -DEFAULT_MAX_DURATION_SECONDS = 72 * 3600 - class TaskJobConfigurator(JobConfigurator): TYPE: RunConfigurationType = RunConfigurationType.TASK async def get_job_specs(self, replica_num: int) -> List[JobSpec]: + assert 
self.run_spec.configuration.type == "task" job_specs = [] for job_num in range(self.run_spec.configuration.nodes): job_spec = await self._get_job_spec( @@ -23,16 +22,18 @@ async def get_job_specs(self, replica_num: int) -> List[JobSpec]: return job_specs def _shell_commands(self) -> List[str]: + assert self.run_spec.configuration.type == "task" return self.run_spec.configuration.commands + def _default_single_branch(self) -> bool: + return True + def _default_max_duration(self) -> Optional[int]: - return DEFAULT_MAX_DURATION_SECONDS + return None def _spot_policy(self) -> SpotPolicy: - return self.run_spec.merged_profile.spot_policy or SpotPolicy.AUTO + return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND def _ports(self) -> List[PortMapping]: + assert self.run_spec.configuration.type == "task" return self.run_spec.configuration.ports - - def _working_dir(self) -> Optional[str]: - return None if not self._shell_commands() else super()._working_dir() diff --git a/src/dstack/_internal/server/services/jobs/job_replica_grpc_client.py b/src/dstack/_internal/server/services/jobs/job_replica_grpc_client.py new file mode 100644 index 0000000000..bc6f6cffe9 --- /dev/null +++ b/src/dstack/_internal/server/services/jobs/job_replica_grpc_client.py @@ -0,0 +1,57 @@ +"""SSH-tunneled gRPC channel target to a job's service port (UDS).""" + +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager +from datetime import timedelta +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any + +import grpc + +from dstack._internal.core.services.ssh.tunnel import ( + SSH_DEFAULT_OPTIONS, + IPSocket, + SocketPair, + UnixSocket, +) +from dstack._internal.server.models import JobModel +from dstack._internal.server.services.jobs import get_job_spec +from dstack._internal.server.services.ssh import container_ssh_tunnel +from dstack._internal.utils.common import get_or_error + +SSH_CONNECT_TIMEOUT = 
timedelta(seconds=10) +# Match router_worker_sync HTTP server_info cap (_MAX_SERVER_INFO_RESPONSE_BYTES). +_MAX_GRPC_MESSAGE_BYTES = 256 * 1024 +_GRPC_CHANNEL_OPTIONS = ( + ("grpc.max_receive_message_length", _MAX_GRPC_MESSAGE_BYTES), + ("grpc.max_send_message_length", _MAX_GRPC_MESSAGE_BYTES), +) + + +@asynccontextmanager +async def get_service_replica_grpc_client(job: JobModel) -> AsyncGenerator[Any, None]: + options = { + **SSH_DEFAULT_OPTIONS, + "ConnectTimeout": str(int(SSH_CONNECT_TIMEOUT.total_seconds())), + } + job_spec = get_job_spec(job) + with TemporaryDirectory() as temp_dir: + # Keep the same socket file name as the HTTP helper for consistency. + app_socket_path = (Path(temp_dir) / "replica.sock").absolute() + async with container_ssh_tunnel( + job=job, + forwarded_sockets=[ + SocketPair( + remote=IPSocket("localhost", get_or_error(job_spec.service_port)), + local=UnixSocket(app_socket_path), + ), + ], + options=options, + ): + target = f"unix://{app_socket_path}" + channel = grpc.aio.insecure_channel(target, options=_GRPC_CHANNEL_OPTIONS) + try: + yield channel + finally: + await channel.close() diff --git a/src/dstack/_internal/server/services/jobs/job_replica_http_client.py b/src/dstack/_internal/server/services/jobs/job_replica_http_client.py new file mode 100644 index 0000000000..1497fe5e08 --- /dev/null +++ b/src/dstack/_internal/server/services/jobs/job_replica_http_client.py @@ -0,0 +1,49 @@ +"""SSH-tunneled async HTTP client to a job's service port (same path as probes).""" + +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager +from datetime import timedelta +from pathlib import Path +from tempfile import TemporaryDirectory + +from httpx import AsyncClient, AsyncHTTPTransport + +from dstack._internal.core.services.ssh.tunnel import ( + SSH_DEFAULT_OPTIONS, + IPSocket, + SocketPair, + UnixSocket, +) +from dstack._internal.server.models import JobModel +from dstack._internal.server.services.jobs import 
get_job_spec +from dstack._internal.server.services.ssh import container_ssh_tunnel +from dstack._internal.utils.common import get_or_error + +SSH_CONNECT_TIMEOUT = timedelta(seconds=10) + + +@asynccontextmanager +async def get_service_replica_client( + job: JobModel, +) -> AsyncGenerator[AsyncClient, None]: + options = { + **SSH_DEFAULT_OPTIONS, + "ConnectTimeout": str(int(SSH_CONNECT_TIMEOUT.total_seconds())), + } + job_spec = get_job_spec(job) + with TemporaryDirectory() as temp_dir: + app_socket_path = (Path(temp_dir) / "replica.sock").absolute() + async with container_ssh_tunnel( + job=job, + forwarded_sockets=[ + SocketPair( + remote=IPSocket("localhost", get_or_error(job_spec.service_port)), + local=UnixSocket(app_socket_path), + ), + ], + options=options, + ): + async with AsyncClient( + transport=AsyncHTTPTransport(uds=str(app_socket_path)) + ) as client: + yield client diff --git a/src/dstack/_internal/server/services/locking.py b/src/dstack/_internal/server/services/locking.py new file mode 100644 index 0000000000..2a2b833c02 --- /dev/null +++ b/src/dstack/_internal/server/services/locking.py @@ -0,0 +1,205 @@ +import asyncio +import collections.abc +import hashlib +from abc import abstractmethod +from asyncio import Lock +from contextlib import asynccontextmanager +from typing import AsyncGenerator, Iterable, Iterator, Protocol, TypeVar, Union + +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession + +KeyT = TypeVar("KeyT") + + +class LocksetLock(Protocol): + async def acquire(self) -> bool: ... + def release(self) -> None: ... + async def __aenter__(self): ... + async def __aexit__(self, exc_type, exc, tb): ... + + +T = TypeVar("T") + + +class Lockset(Protocol[T]): + def __contains__(self, item: T, /) -> bool: ... + def __iter__(self) -> Iterator[T]: ... + def __len__(self) -> int: ... + def add(self, item: T, /) -> None: ... + def discard(self, item: T, /) -> None: ... 
+ def update(self, other: Iterable[T], /) -> None: ... + def difference_update(self, other: Iterable[T], /) -> None: ... + + +class ResourceLocker: + @abstractmethod + def get_lockset(self, namespace: str) -> tuple[LocksetLock, Lockset]: + """ + Returns a lockset containing locked resources for in-memory locking. + Also returns a lock that guards the lockset. + """ + pass + + @abstractmethod + @asynccontextmanager + async def lock_ctx(self, namespace: str, keys: list[KeyT]): + """ + Acquires locks for all keys in namespace. + The keys must be sorted to prevent deadlock. + """ + yield + + +class InMemoryResourceLocker(ResourceLocker): + def __init__(self): + self.namespace_to_locks_map: dict[str, tuple[Lock, set]] = {} + + def get_lockset(self, namespace: str) -> tuple[Lock, set]: + return self.namespace_to_locks_map.setdefault(namespace, (Lock(), set())) + + @asynccontextmanager + async def lock_ctx(self, namespace: str, keys: list[KeyT]): + lock, lockset = self.get_lockset(namespace) + try: + await _wait_to_lock_many(lock, lockset, keys) + yield + finally: + lockset.difference_update(keys) + + +class DummyAsyncLock: + async def __aenter__(self): + pass + + async def __aexit__(self, exc_type, exc, tb): + pass + + async def acquire(self): + return True + + def release(self): + pass + + +class DummySet(collections.abc.MutableSet): + def __contains__(self, item): + return False + + def __iter__(self): + return iter(()) + + def __len__(self): + return 0 + + def add(self, value): + pass + + def discard(self, value): + pass + + def update(self, other): + pass + + def difference_update(self, other): + pass + + +class DummyResourceLocker(ResourceLocker): + def __init__(self): + self.lock = DummyAsyncLock() + self.lockset = DummySet() + + def get_lockset(self, namespace: str) -> tuple[DummyAsyncLock, DummySet]: + return self.lock, self.lockset + + @asynccontextmanager + async def lock_ctx(self, namespace: str, keys: list[KeyT]): + yield + + +def string_to_lock_id(s: str) -> 
int: + return int(hashlib.sha256(s.encode()).hexdigest(), 16) % (2**63) + + +@asynccontextmanager +async def advisory_lock_ctx( + bind: Union[AsyncConnection, AsyncSession], dialect_name: str, resource: str +): + """ + Acquire a Postgres advisory lock on `resource`. No-op for SQLite. + + **NOTE**: The lock must be released by the same database connection that acquired it. + Attempts to release in a different connection will fail. + + To prevent unreleased locks: + + 1. When possible, prefer using `pg_advisory_xact_lock` instead of this context manager. + `pg_advisory_xact_lock` is automatically released at the end of transaction. + + 1. Prefer using `AsyncConnection` as `bind`. + + 1. If using `AsyncSession` as `bind`, **do not** commit before exiting from the context manager. + Committing will prompt `AsyncSession` to start a new transaction for releasing the lock, + which may be assigned to a different database connection, which will fail to release. + """ + + if dialect_name == "postgresql": + await bind.execute(select(func.pg_advisory_lock(string_to_lock_id(resource)))) + try: + yield + finally: + if dialect_name == "postgresql": + await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource)))) + + +@asynccontextmanager +async def try_advisory_lock_ctx( + bind: Union[AsyncConnection, AsyncSession], dialect_name: str, resource: str +) -> AsyncGenerator[bool, None]: + locked = True + if dialect_name == "postgresql": + res = await bind.execute(select(func.pg_try_advisory_lock(string_to_lock_id(resource)))) + locked = res.scalar_one() + try: + yield locked + finally: + if dialect_name == "postgresql" and locked: + await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource)))) + + +_in_memory_locker = InMemoryResourceLocker() +_dummy_locker = DummyResourceLocker() + + +def get_locker(dialect_name: str) -> ResourceLocker: + if dialect_name == "sqlite": + return _in_memory_locker + # We could use an in-memory locker on Postgres + # 
but it can lead to unnecessary lock contention, + # so we use a dummy locker that does not take any locks. + return _dummy_locker + + +async def _wait_to_lock_many( + lock: asyncio.Lock, locked: set[KeyT], keys: list[KeyT], *, delay: float = 0.1 +): + """ + Retry locking until all the keys are locked. + Lock is released during the sleep. + The keys must be sorted to prevent deadlock. + """ + left_to_lock = keys.copy() + while True: + async with lock: + locked_now_num = 0 + for key in left_to_lock: + if key in locked: + # Someone already acquired the lock, wait + break + locked.add(key) + locked_now_num += 1 + left_to_lock = left_to_lock[locked_now_num:] + if not left_to_lock: + return + await asyncio.sleep(delay) diff --git a/src/dstack/_internal/server/services/logging.py b/src/dstack/_internal/server/services/logging.py index 1f2d106a54..bf1b72f099 100644 --- a/src/dstack/_internal/server/services/logging.py +++ b/src/dstack/_internal/server/services/logging.py @@ -1,12 +1,29 @@ +import uuid from typing import Union -from dstack._internal.server.models import JobModel, RunModel +from dstack._internal.server.models import ( + GatewayModel, + InstanceModel, + JobModel, + ProbeModel, + RunModel, +) -def fmt(model: Union[RunModel, JobModel]) -> str: +def fmt(model: Union[RunModel, JobModel, InstanceModel, GatewayModel, ProbeModel]) -> str: """Consistent string representation of a model for logging.""" if isinstance(model, RunModel): - return f"run({model.id.hex[:6]}){model.run_name}" + return fmt_entity("run", model.id, model.run_name) if isinstance(model, JobModel): - return f"job({model.id.hex[:6]}){model.job_name}" + return fmt_entity("job", model.id, model.job_name) + if isinstance(model, InstanceModel): + return fmt_entity("instance", model.id, model.name) + if isinstance(model, GatewayModel): + return fmt_entity("gateway", model.id, model.name) + if isinstance(model, ProbeModel): + return fmt_entity("probe", model.id, model.name) return str(model) + + +def 
fmt_entity(entity_type: str, entity_id: uuid.UUID, entity_name: str) -> str: + return f"{entity_type}({entity_id.hex[:6]}){entity_name}" diff --git a/src/dstack/_internal/server/services/logs.py b/src/dstack/_internal/server/services/logs.py deleted file mode 100644 index 98aee337fc..0000000000 --- a/src/dstack/_internal/server/services/logs.py +++ /dev/null @@ -1,117 +0,0 @@ -import base64 -from datetime import datetime, timezone -from pathlib import Path -from typing import List -from uuid import UUID - -from dstack._internal.core.models.logs import JobSubmissionLogs, LogEvent, LogEventSource -from dstack._internal.server import settings -from dstack._internal.server.models import ProjectModel -from dstack._internal.server.schemas.logs import PollLogsRequest -from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent - - -def write_logs( - project: ProjectModel, - run_name: str, - job_submission_id: UUID, - runner_logs: List[RunnerLogEvent], - job_logs: List[RunnerLogEvent], -): - if len(runner_logs) > 0: - runner_log_file_path = _get_runner_log_file_path( - project_name=project.name, - run_name=run_name, - job_submission_id=job_submission_id, - ) - _write_logs( - log_file_path=runner_log_file_path, - log_events=runner_logs, - ) - if len(job_logs) > 0: - job_log_file_path = _get_job_log_file_path( - project_name=project.name, - run_name=run_name, - job_submission_id=job_submission_id, - ) - _write_logs( - log_file_path=job_log_file_path, - log_events=job_logs, - ) - - -def _write_logs( - log_file_path: Path, - log_events: List[RunnerLogEvent], -): - log_events_parsed = [_runner_log_event_to_log_event(log) for log in log_events] - log_file_path.parent.mkdir(exist_ok=True, parents=True) - with open(log_file_path, "a") as f: - f.writelines(log.json() + "\n" for log in log_events_parsed) - - -def poll_logs( - project: ProjectModel, - request: PollLogsRequest, -) -> JobSubmissionLogs: - # TODO Respect request.limit to support pagination - if 
request.diagnose: - log_file_path = _get_runner_log_file_path( - project_name=project.name, - run_name=request.run_name, - job_submission_id=request.job_submission_id, - ) - else: - log_file_path = _get_job_log_file_path( - project_name=project.name, - run_name=request.run_name, - job_submission_id=request.job_submission_id, - ) - logs = [] - try: - with open(log_file_path) as f: - for line in f: - log_event = LogEvent.__response__.parse_raw(line) - if request.start_time and log_event.timestamp <= request.start_time: - continue - if request.end_time is None or log_event.timestamp < request.end_time: - logs.append(log_event) - else: - break - except IOError: - pass - if request.descending: - logs = list(reversed(logs)) - return JobSubmissionLogs(logs=logs) - - -def _runner_log_event_to_log_event(runner_log_event: RunnerLogEvent) -> LogEvent: - return LogEvent( - timestamp=datetime.fromtimestamp(runner_log_event.timestamp / 1e9, tz=timezone.utc), - log_source=LogEventSource.STDOUT, - message=base64.b64encode(runner_log_event.message).decode(), - ) - - -def _get_job_log_file_path(project_name: str, run_name: str, job_submission_id: UUID) -> Path: - return ( - settings.SERVER_DIR_PATH - / "projects" - / project_name - / "logs" - / run_name - / str(job_submission_id) - / "job.log" - ) - - -def _get_runner_log_file_path(project_name: str, run_name: str, job_submission_id: UUID) -> Path: - return ( - settings.SERVER_DIR_PATH - / "projects" - / project_name - / "logs" - / run_name - / str(job_submission_id) - / "runner.log" - ) diff --git a/src/dstack/_internal/server/services/logs/__init__.py b/src/dstack/_internal/server/services/logs/__init__.py new file mode 100644 index 0000000000..bc601688bc --- /dev/null +++ b/src/dstack/_internal/server/services/logs/__init__.py @@ -0,0 +1,122 @@ +import atexit +from typing import List, Optional +from uuid import UUID + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.logs import 
def _create_configured_log_storage() -> Optional[LogStorage]:
    """
    Creates the external log storage selected by the server settings.

    The settings are checked in this order: CloudWatch log group, GCP logging
    project, Fluent-bit host. Only the first one that is set is tried.
    Returns `None` if none is set, if the required dependency is not
    installed, or if the storage fails to initialize (the failure is logged).
    """
    if settings.SERVER_CLOUDWATCH_LOG_GROUP:
        if not aws_logs.BOTO_AVAILABLE:
            logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
            return None
        try:
            storage: LogStorage = aws_logs.CloudWatchLogStorage(
                group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
                region=settings.SERVER_CLOUDWATCH_LOG_REGION,
            )
        except LogStorageError as e:
            logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
            return None
        except Exception:
            logger.exception("Got exception when initializing CloudWatch Logs storage")
            return None
        logger.debug("Using CloudWatch Logs storage")
        return storage

    if settings.SERVER_GCP_LOGGING_PROJECT:
        if not gcp_logs.GCP_LOGGING_AVAILABLE:
            logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
            return None
        try:
            storage = gcp_logs.GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
        except LogStorageError as e:
            logger.error("Failed to initialize GCP Logs storage: %s", e)
            return None
        except Exception:
            logger.exception("Got exception when initializing GCP Logs storage")
            return None
        logger.debug("Using GCP Logs storage")
        return storage

    if settings.SERVER_FLUENTBIT_HOST:
        if not fluentbit_logs.FLUENTBIT_AVAILABLE:
            logger.error("Cannot use Fluent-bit Logs storage: fluent-logger is not installed")
            return None
        try:
            storage = fluentbit_logs.FluentBitLogStorage(
                host=settings.SERVER_FLUENTBIT_HOST,
                port=settings.SERVER_FLUENTBIT_PORT,
                protocol=settings.SERVER_FLUENTBIT_PROTOCOL,
                tag_prefix=settings.SERVER_FLUENTBIT_TAG_PREFIX,
                es_host=settings.SERVER_ELASTICSEARCH_HOST,
                es_index=settings.SERVER_ELASTICSEARCH_INDEX,
                es_api_key=settings.SERVER_ELASTICSEARCH_API_KEY,
            )
        except LogStorageError as e:
            logger.error("Failed to initialize Fluent-bit Logs storage: %s", e)
            return None
        except Exception:
            logger.exception("Got exception when initializing Fluent-bit Logs storage")
            return None
        if settings.SERVER_ELASTICSEARCH_HOST:
            logger.debug("Using Fluent-bit Logs storage with Elasticsearch/OpenSearch")
        else:
            logger.debug("Using Fluent-bit Logs storage in ship-only mode")
        return storage

    return None


def get_log_storage() -> LogStorage:
    """
    Returns the process-wide log storage, creating it on the first call.

    Falls back to `FileLogStorage` when no external storage is configured or
    the configured one cannot be used. The `close` method of the created
    storage is registered with `atexit`.
    """
    global _log_storage
    if _log_storage is None:
        storage = _create_configured_log_storage()
        if storage is None:
            storage = FileLogStorage()
            logger.debug("Using file-based storage")
        _log_storage = storage
        atexit.register(storage.close)
    return _log_storage
+ # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI. + # We live with that. + # TODO: Drop base64 encoding in 0.20. + for log_event in job_submission_logs.logs: + log_event.message = b64encode_raw_message(log_event.message.encode()) + return job_submission_logs diff --git a/src/dstack/_internal/server/services/logs/aws.py b/src/dstack/_internal/server/services/logs/aws.py new file mode 100644 index 0000000000..4e56f0865d --- /dev/null +++ b/src/dstack/_internal/server/services/logs/aws.py @@ -0,0 +1,373 @@ +import itertools +import operator +import urllib +import urllib.parse +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from typing import Iterator, List, Optional, Set, Tuple, TypedDict +from uuid import UUID + +from dstack._internal.core.models.logs import ( + JobSubmissionLogs, + LogEvent, + LogEventSource, + LogProducer, +) +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import ( + LogStorage, + LogStorageError, + datetime_to_unix_time_ms, + unix_time_ms_to_datetime, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +BOTO_AVAILABLE = True +try: + import boto3 + import botocore.exceptions +except ImportError: + BOTO_AVAILABLE = False +else: + + class _CloudWatchLogEvent(TypedDict): + timestamp: int # unix time in milliseconds + message: str + + class CloudWatchLogStorage(LogStorage): + # "The maximum number of log events in a batch is 10,000". + EVENT_MAX_COUNT_IN_BATCH = 10000 + # "The maximum batch size is 1,048,576 bytes" — exactly 1 MiB. "This size is calculated + # as the sum of all event messages in UTF-8, plus 26 bytes for each log event". 
+ BATCH_MAX_SIZE = 1048576 + # "Each log event can be no larger than 256 KB" — KB means KiB; includes MESSAGE_OVERHEAD_SIZE. + MESSAGE_MAX_SIZE = 262144 + # Message size in bytes = len(message.encode("utf-8")) + MESSAGE_OVERHEAD_SIZE. + MESSAGE_OVERHEAD_SIZE = 26 + # "A batch of log events in a single request cannot span more than 24 hours". + BATCH_MAX_SPAN = int(timedelta(hours=24).total_seconds()) * 1000 + # Decrease allowed deltas by possible clock drift between dstack and CloudWatch. + CLOCK_DRIFT = int(timedelta(minutes=10).total_seconds()) * 1000 + # "None of the log events in the batch can be more than 14 days in the past." + PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT + # "None of the log events in the batch can be more than 2 hours in the future." + FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT + # Maximum number of retries when polling for log events to skip empty pages. + MAX_RETRIES = 10 + + def __init__(self, *, group: str, region: Optional[str] = None) -> None: + with self._wrap_boto_errors(): + session = boto3.Session(region_name=region) + self._client = session.client("logs") + self._check_group_exists(group) + self._group = group + self._region = self._client.meta.region_name + # Stores names of already created streams. + # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long + # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g., + # a simple OrderedDict-based implementation should be OK. 
def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
    """
    Reads one page of log events of a job submission from CloudWatch.

    The runner stream is read when `request.diagnose` is set, the job stream
    otherwise. If the stream does not exist but the log group does, an empty
    page is returned. boto errors are converted to `LogStorageError` by
    `_wrap_boto_errors`; a missing log group is reported as `LogStorageError`
    by `_check_group_exists`.
    """
    if request.diagnose:
        producer = LogProducer.RUNNER
    else:
        producer = LogProducer.JOB
    stream = self._get_stream_name(
        project.name, request.run_name, request.job_submission_id, producer
    )

    cw_events: List[_CloudWatchLogEvent] = []
    next_token: Optional[str] = None
    with self._wrap_boto_errors():
        try:
            cw_events, next_token = self._get_log_events_with_retry(stream, request)
        except botocore.exceptions.ClientError as e:
            if not self._is_resource_not_found_exception(e):
                raise
            # "Not found" may refer to the group or to the stream.
            # A missing group makes this call raise LogStorageError,
            # which is propagated to the caller as is.
            self._check_group_exists(self._group)
            # The group exists, so it is the stream that is missing:
            # respond with an empty page.
            logger.debug("Stream %s not found, returning dummy response", stream)

    logs = []
    for cw_event in cw_events:
        logs.append(
            LogEvent(
                timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
                log_source=LogEventSource.STDOUT,
                message=cw_event["message"],
            )
        )
    return JobSubmissionLogs(
        logs=logs,
        external_url=self._get_stream_external_url(stream),
        next_token=next_token,
    )
+ end_time=request.end_time, + descending=request.descending, + next_token=next_token, + limit=request.limit, + diagnose=request.diagnose, + ) + + if not request.descending: + logger.debug( + "Stream %s: exhausted %d retries without finding logs, returning empty response", + stream, + self.MAX_RETRIES, + ) + # Only return the next token after exhausting retries if going descending— + # AWS CloudWatch guarantees more logs in that case. In ascending mode, + # next token is always returned, even if no logs remain. + # So descending works reliably; ascending has limits if gaps are too large. + # In the future, UI/CLI should handle retries, and we can return next token for ascending too. + return [], next_token if request.descending else None + + def _get_log_events( + self, stream: str, request: PollLogsRequest + ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]: + start_from_head = not request.descending + parameters = { + "logGroupName": self._group, + "logStreamName": stream, + "limit": request.limit, + "startFromHead": start_from_head, + } + + if request.start_time: + parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + + if request.end_time: + parameters["endTime"] = datetime_to_unix_time_ms(request.end_time) + elif start_from_head: + # When startFromHead=true and no endTime is provided, set endTime to "now" + # to prevent infinite pagination as new logs arrive faster than we can read them + parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc)) + + if request.next_token: + parameters["nextToken"] = request.next_token + + response = self._client.get_log_events(**parameters) + + events = response.get("events", []) + next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken" + next_token = response.get(next_token_key) + + # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs. 
+ if request.descending: + events = list(reversed(events)) + + return events, next_token + + def _get_stream_external_url(self, stream: str) -> str: + quoted_group = urllib.parse.quote(self._group, safe="") + quoted_stream = urllib.parse.quote(stream, safe="") + return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}" + + def write_logs( + self, + project: ProjectModel, + run_name: str, + job_submission_id: UUID, + runner_logs: List[RunnerLogEvent], + job_logs: List[RunnerLogEvent], + ): + if len(runner_logs) > 0: + runner_stream = self._get_stream_name( + project.name, run_name, job_submission_id, LogProducer.RUNNER + ) + self._write_logs( + stream=runner_stream, + log_events=runner_logs, + ) + if len(job_logs) > 0: + jog_stream = self._get_stream_name( + project.name, run_name, job_submission_id, LogProducer.JOB + ) + self._write_logs( + stream=jog_stream, + log_events=job_logs, + ) + + def _write_logs(self, stream: str, log_events: List[RunnerLogEvent]) -> None: + with self._wrap_boto_errors(): + self._ensure_stream_exists(stream) + try: + self._put_log_events(stream, log_events) + return + except botocore.exceptions.ClientError as e: + if not self._is_resource_not_found_exception(e): + raise + logger.debug("Stream %s not found, recreating", stream) + # The stream is probably deleted due to retention policy, our cache is stale. + self._ensure_stream_exists(stream, force=True) + self._put_log_events(stream, log_events) + + def _put_log_events(self, stream: str, log_events: List[RunnerLogEvent]) -> None: + # Python docs: "The built-in sorted() function is guaranteed to be stable." 
+ sorted_log_events = sorted(log_events, key=operator.attrgetter("timestamp")) + if tuple(map(id, log_events)) != tuple(map(id, sorted_log_events)): + logger.error( + "Stream %s: events are not in chronological order, something wrong with runner", + stream, + ) + for batch in self._get_batch_iter(stream, sorted_log_events): + self._client.put_log_events( + logGroupName=self._group, + logStreamName=stream, + logEvents=batch, + ) + + def _get_batch_iter( + self, stream: str, log_events: List[RunnerLogEvent] + ) -> Iterator[List[_CloudWatchLogEvent]]: + shared_event_iter = iter(log_events) + event_iter = shared_event_iter + while True: + batch, excessive_event = self._get_next_batch(stream, event_iter) + if not batch: + return + yield batch + if excessive_event is not None: + event_iter = itertools.chain([excessive_event], shared_event_iter) + else: + event_iter = shared_event_iter + + def _get_next_batch( + self, stream: str, event_iter: Iterator[RunnerLogEvent] + ) -> Tuple[List[_CloudWatchLogEvent], Optional[RunnerLogEvent]]: + now_timestamp = int(datetime.now(timezone.utc).timestamp() * 1000) + batch: List[_CloudWatchLogEvent] = [] + total_size = 0 + event_count = 0 + first_timestamp: Optional[int] = None + skipped_past_events = 0 + skipped_future_events = 0 + # event that doesn't fit in the current batch + excessive_event: Optional[RunnerLogEvent] = None + for event in event_iter: + # Normally there should not be empty messages. 
+ if not event.message: + continue + timestamp = event.timestamp + if first_timestamp is None: + first_timestamp = timestamp + elif timestamp - first_timestamp > self.BATCH_MAX_SPAN: + excessive_event = event + break + if now_timestamp - timestamp > self.PAST_EVENT_MAX_DELTA: + skipped_past_events += 1 + continue + if timestamp - now_timestamp > self.FUTURE_EVENT_MAX_DELTA: + skipped_future_events += 1 + continue + cw_event = self._runner_log_event_to_cloudwatch_event(event) + message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE + if message_size > self.MESSAGE_MAX_SIZE: + # we should never hit this limit, as we use `io.Copy` to copy from pty to logs, + # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go, + # `execJob` -> `io.Copy(logger, ptmx)` + logger.error( + "Stream %s: skipping event %d, message exceeds max size: %d > %d", + stream, + timestamp, + message_size, + self.MESSAGE_MAX_SIZE, + ) + continue + if total_size + message_size > self.BATCH_MAX_SIZE: + excessive_event = event + break + batch.append(cw_event) + total_size += message_size + event_count += 1 + if event_count >= self.EVENT_MAX_COUNT_IN_BATCH: + break + if skipped_past_events > 0: + logger.error("Stream %s: skipping %d past event(s)", stream, skipped_past_events) + if skipped_future_events > 0: + logger.error( + "Stream %s: skipping %d future event(s)", stream, skipped_future_events + ) + return batch, excessive_event + + def _runner_log_event_to_cloudwatch_event( + self, runner_log_event: RunnerLogEvent + ) -> _CloudWatchLogEvent: + return { + "timestamp": runner_log_event.timestamp, + "message": runner_log_event.message.decode(errors="replace"), + } + + @contextmanager + def _wrap_boto_errors(self) -> Iterator[None]: + try: + yield + except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e: + raise LogStorageError(f"CloudWatch Logs error: {type(e).__name__}: {e}") from e + + def _is_resource_not_found_exception(self, exc: 
"botocore.exceptions.ClientError") -> bool: + try: + return exc.response["Error"]["Code"] == "ResourceNotFoundException" + except KeyError: + return False + + def _check_group_exists(self, name: str) -> None: + try: + self._client.describe_log_streams(logGroupName=name, limit=1) + except botocore.exceptions.ClientError as e: + if self._is_resource_not_found_exception(e): + raise LogStorageError(f"LogGroup '{name}' does not exist") + raise + + def _ensure_stream_exists(self, name: str, *, force: bool = False) -> None: + if not force and name in self._streams: + return + response = self._client.describe_log_streams( + logGroupName=self._group, logStreamNamePrefix=name + ) + for stream in response["logStreams"]: + if stream["logStreamName"] == name: + self._streams.add(name) + return + self._client.create_log_stream(logGroupName=self._group, logStreamName=name) + self._streams.add(name) + + def _get_stream_name( + self, + project_name: str, + run_name: str, + job_submission_id: UUID, + producer: LogProducer, + ) -> str: + return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}" diff --git a/src/dstack/_internal/server/services/logs/base.py b/src/dstack/_internal/server/services/logs/base.py new file mode 100644 index 0000000000..54a6254b09 --- /dev/null +++ b/src/dstack/_internal/server/services/logs/base.py @@ -0,0 +1,47 @@ +import base64 +from abc import ABC, abstractmethod +from datetime import datetime, timezone +from typing import List +from uuid import UUID + +from dstack._internal.core.errors import DstackError +from dstack._internal.core.models.logs import JobSubmissionLogs +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent + + +class LogStorageError(DstackError): + pass + + +class LogStorage(ABC): + @abstractmethod + def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> 
def datetime_to_unix_time_ms(dt: datetime) -> int:
    """
    Converts `dt` to Unix time in whole milliseconds.

    The conversion uses integer arithmetic only. Multiplying the float
    returned by `dt.timestamp()` by 1000 and truncating may give a value that
    is 1 ms too small (e.g. 1970-01-01T00:00:01.001Z -> 1000), so that
    a timestamp converted with `unix_time_ms_to_datetime` does not convert
    back to itself.

    A naive `dt` is interpreted as local time, the same way `dt.timestamp()`
    does it. Sub-millisecond precision is dropped by rounding down.
    """
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    # `astimezone` assumes the system local timezone for a naive datetime.
    delta = dt.astimezone(timezone.utc) - epoch
    # timedelta is normalized: only `days` may be negative, so the sum below
    # is the floor of the exact number of milliseconds.
    return delta.days * 86_400_000 + delta.seconds * 1000 + delta.microseconds // 1000
def _poll_logs_ascending(
    self, log_file_path: Path, request: PollLogsRequest
) -> JobSubmissionLogs:
    """
    Reads a page of log events from the log file in the file order.

    `request.next_token`, when set, is the number of lines to skip from the
    beginning of the file. Lines that cannot be parsed are skipped. Events
    with a timestamp at or before `request.start_time` are skipped; reading
    stops at the first event with a timestamp at or after `request.end_time`
    or when `request.limit` events are collected. `next_token` of the result
    is set only if the limit is reached and the file has more lines.
    A missing file gives an empty result.
    """
    lines_to_skip = 0
    if request.next_token:
        lines_to_skip = self._parse_next_token(request.next_token)

    collected = []
    following_token = None
    # Number of lines consumed so far, including skipped and malformed ones.
    lines_consumed = 0

    try:
        with open(log_file_path) as log_file:
            while lines_consumed < lines_to_skip:
                if log_file.readline() == "":
                    # The file has fewer lines than the token asks to skip.
                    return JobSubmissionLogs(logs=collected, next_token=following_token)
                lines_consumed += 1

            # `readline` returns "" only at EOF.
            for raw_line in iter(log_file.readline, ""):
                lines_consumed += 1
                try:
                    event = LogEvent.__response__.parse_raw(raw_line)
                except Exception:
                    # Skip malformed lines
                    continue

                if request.start_time and event.timestamp <= request.start_time:
                    continue
                if request.end_time is not None and event.timestamp >= request.end_time:
                    break

                collected.append(event)
                if len(collected) >= request.limit:
                    # Peek one more line to learn whether another page exists.
                    if log_file.readline() != "":
                        following_token = str(lines_consumed)
                    break
    except FileNotFoundError:
        pass

    return JobSubmissionLogs(logs=collected, next_token=following_token)
and log_event.timestamp > request.end_time: + continue + if request.start_time and log_event.timestamp <= request.start_time: + break + + candidate_logs.append((log_event, line_start_offset)) + + if len(candidate_logs) > request.limit: + break + except FileNotFoundError: + return JobSubmissionLogs(logs=[], next_token=None) + + logs = [log for log, _ in candidate_logs[: request.limit]] + next_token = None + if len(candidate_logs) > request.limit: + # We fetched one more than the limit, so there are more pages. + # The next token should point to the start of the last log we are returning. + _, last_log_offset = candidate_logs[request.limit - 1] + next_token = str(last_log_offset) + + return JobSubmissionLogs(logs=logs, next_token=next_token) + + @staticmethod + def _read_lines_reversed( + filepath: Path, start_offset: Optional[int] = None, chunk_size: int = 8192 + ) -> Generator[Tuple[bytes, int], None, None]: + """ + A generator that yields lines from a file in reverse order, along with the byte + offset of the start of each line. This is memory-efficient for large files. + """ + with open(filepath, "rb") as f: + f.seek(0, os.SEEK_END) + file_size = f.tell() + cursor = file_size + + # If a start_offset is provided, optimize by starting the read + # from a more specific location instead of the end of the file. + if start_offset is not None and start_offset < file_size: + # To get the full content of the line that straddles the offset, + # we need to find its end (the next newline character). + f.seek(start_offset) + chunk = f.read(chunk_size) + newline_pos = chunk.find(b"\n") + if newline_pos != -1: + # Found the end of the line. The cursor for reverse reading + # should start from this point to include the full line. + cursor = start_offset + newline_pos + 1 + else: + # No newline found, which means the rest of the file is one line. + # The default cursor pointing to file_size is correct. 
+ pass + + buffer = b"" + + while cursor > 0: + seek_pos = max(0, cursor - chunk_size) + amount_to_read = cursor - seek_pos + f.seek(seek_pos) + chunk = f.read(amount_to_read) + cursor = seek_pos + + buffer = chunk + buffer + + while b"\n" in buffer: + newline_pos = buffer.rfind(b"\n") + line = buffer[newline_pos + 1 :] + line_start_offset = cursor + newline_pos + 1 + + # Skip lines that start at or after the start_offset + if start_offset is None or line_start_offset < start_offset: + yield line, line_start_offset + + buffer = buffer[:newline_pos] + + # The remaining buffer is the first line of the file. + # Only yield it if we're not using start_offset or if it starts before start_offset + if buffer and (start_offset is None or 0 < start_offset): + yield buffer, 0 + + def write_logs( + self, + project: ProjectModel, + run_name: str, + job_submission_id: UUID, + runner_logs: List[RunnerLogEvent], + job_logs: List[RunnerLogEvent], + ): + if len(runner_logs) > 0: + runner_log_file_path = self._get_log_file_path( + project.name, run_name, job_submission_id, LogProducer.RUNNER + ) + self._write_logs( + log_file_path=runner_log_file_path, + log_events=runner_logs, + ) + if len(job_logs) > 0: + job_log_file_path = self._get_log_file_path( + project.name, run_name, job_submission_id, LogProducer.JOB + ) + self._write_logs( + log_file_path=job_log_file_path, + log_events=job_logs, + ) + + def _write_logs(self, log_file_path: Path, log_events: List[RunnerLogEvent]) -> None: + log_events_parsed = [self._runner_log_event_to_log_event(event) for event in log_events] + log_file_path.parent.mkdir(exist_ok=True, parents=True) + with open(log_file_path, "a") as f: + f.writelines(log.json() + "\n" for log in log_events_parsed) + + def _get_log_file_path( + self, + project_name: str, + run_name: str, + job_submission_id: UUID, + producer: LogProducer, + ) -> Path: + return ( + self.root + / "projects" + / project_name + / "logs" + / run_name + / str(job_submission_id) + / 
f"{producer.value}.log" + ) + + def _runner_log_event_to_log_event(self, runner_log_event: RunnerLogEvent) -> LogEvent: + return LogEvent( + timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp), + log_source=LogEventSource.STDOUT, + message=runner_log_event.message.decode(errors="replace"), + ) + + def _parse_next_token(self, next_token: str) -> int: + if next_token is None: + return None + try: + value = int(next_token) + if value < 0: + raise ValueError("Offset must be non-negative") + return value + except (ValueError, TypeError): + raise ServerClientError( + f"Invalid next_token: {next_token}. Must be a non-negative integer." + ) diff --git a/src/dstack/_internal/server/services/logs/fluentbit.py b/src/dstack/_internal/server/services/logs/fluentbit.py new file mode 100644 index 0000000000..bb97e21f09 --- /dev/null +++ b/src/dstack/_internal/server/services/logs/fluentbit.py @@ -0,0 +1,329 @@ +from datetime import datetime +from typing import List, Optional, Protocol +from uuid import UUID + +import httpx + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.logs import ( + JobSubmissionLogs, + LogEvent, + LogEventSource, + LogProducer, +) +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import ( + LogStorage, + LogStorageError, + unix_time_ms_to_datetime, +) +from dstack._internal.utils.common import batched +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +ELASTICSEARCH_AVAILABLE = True +try: + from elasticsearch import Elasticsearch + from elasticsearch.exceptions import ApiError, TransportError +except ImportError: + ELASTICSEARCH_AVAILABLE = False +else: + ElasticsearchError: tuple = (ApiError, TransportError) # type: ignore[misc] + + class ElasticsearchReader: + 
"""Reads logs from Elasticsearch or OpenSearch.""" + + def __init__( + self, + host: str, + index: str, + api_key: Optional[str] = None, + ) -> None: + if api_key: + self._client = Elasticsearch(hosts=[host], api_key=api_key) + else: + self._client = Elasticsearch(hosts=[host]) + self._index = index + # Verify connection + try: + self._client.info() + except ElasticsearchError as e: + raise LogStorageError(f"Failed to connect to Elasticsearch/OpenSearch: {e}") from e + + def read( + self, + stream_name: str, + request: PollLogsRequest, + ) -> JobSubmissionLogs: + sort_order = "desc" if request.descending else "asc" + + query: dict = { + "bool": { + "must": [ + {"term": {"stream.keyword": stream_name}}, + ] + } + } + + if request.start_time: + query["bool"].setdefault("filter", []).append( + {"range": {"@timestamp": {"gt": request.start_time.isoformat()}}} + ) + if request.end_time: + query["bool"].setdefault("filter", []).append( + {"range": {"@timestamp": {"lt": request.end_time.isoformat()}}} + ) + + search_params: dict = { + "index": self._index, + "query": query, + "sort": [ + {"@timestamp": {"order": sort_order}}, + {"_id": {"order": sort_order}}, + ], + "size": request.limit, + } + + if request.next_token: + parts = request.next_token.split(":", 1) + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ServerClientError( + f"Invalid next_token: {request.next_token}. " + "Must be in format 'timestamp:document_id'." 
+ ) + search_params["search_after"] = [parts[0], parts[1]] + + try: + response = self._client.search(**search_params) + except ElasticsearchError as e: + raise LogStorageError(f"Elasticsearch/OpenSearch error: {e}") from e + + hits = response.get("hits", {}).get("hits", []) + logs = [] + last_sort_values = None + + for hit in hits: + source = hit.get("_source", {}) + timestamp_str = source.get("@timestamp") + message = source.get("message", "") + + if timestamp_str: + try: + timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")) + except ValueError: + continue + else: + continue + + logs.append( + LogEvent( + timestamp=timestamp, + log_source=LogEventSource.STDOUT, + message=message, + ) + ) + + sort_values = hit.get("sort") + if sort_values and len(sort_values) >= 2: + last_sort_values = sort_values + + next_token = None + if len(logs) == request.limit and last_sort_values is not None: + next_token = f"{last_sort_values[0]}:{last_sort_values[1]}" + + return JobSubmissionLogs( + logs=logs, + next_token=next_token, + ) + + def close(self) -> None: + self._client.close() + + +FLUENTBIT_AVAILABLE = True +try: + from fluent import sender as fluent_sender +except ImportError: + FLUENTBIT_AVAILABLE = False +else: + + class FluentBitWriter(Protocol): + def write(self, tag: str, records: List[dict]) -> None: ... + def close(self) -> None: ... + + class LogReader(Protocol): + def read(self, stream_name: str, request: PollLogsRequest) -> JobSubmissionLogs: ... + def close(self) -> None: ... 
+ + class HTTPFluentBitWriter: + """Writes logs to Fluent-bit via HTTP POST.""" + + def __init__(self, host: str, port: int, tag_prefix: str) -> None: + self._endpoint = f"http://{host}:{port}" + self._client = httpx.Client(timeout=30.0) + self._tag_prefix = tag_prefix + + def write(self, tag: str, records: List[dict]) -> None: + prefixed_tag = f"{self._tag_prefix}.{tag}" if self._tag_prefix else tag + for record in records: + try: + response = self._client.post( + f"{self._endpoint}/{prefixed_tag}", + json=record, + headers={"Content-Type": "application/json"}, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise LogStorageError( + f"Fluent-bit HTTP error: status {e.response.status_code}" + ) from e + except httpx.HTTPError as e: + raise LogStorageError(f"Fluent-bit HTTP error: {e}") from e + + def close(self) -> None: + self._client.close() + + class ForwardFluentBitWriter: + """Writes logs to Fluent-bit using Forward protocol.""" + + def __init__(self, host: str, port: int, tag_prefix: str) -> None: + self._sender = fluent_sender.FluentSender(tag_prefix, host=host, port=port) + self._tag_prefix = tag_prefix + + def write(self, tag: str, records: List[dict]) -> None: + for record in records: + if not self._sender.emit(tag, record): + error = self._sender.last_error + self._sender.clear_last_error() + raise LogStorageError(f"Fluent-bit Forward error: {error}") + + def close(self) -> None: + self._sender.close() + + class NullLogReader: + """ + Null reader for ship-only mode (no Elasticsearch/OpenSearch configured). + + Returns empty logs. Useful when logs are shipped to an external system + that is accessed directly rather than through dstack. 
+ """ + + def read(self, stream_name: str, request: PollLogsRequest) -> JobSubmissionLogs: + return JobSubmissionLogs(logs=[], next_token=None) + + def close(self) -> None: + pass + + class FluentBitLogStorage(LogStorage): + """ + Log storage using Fluent-bit for writing and optionally Elasticsearch/OpenSearch for reading. + + Supports two modes: + - Full mode: Writes to Fluent-bit and reads from Elasticsearch/OpenSearch + - Ship-only mode: Writes to Fluent-bit only (no reading, returns empty logs) + """ + + MAX_BATCH_SIZE = 100 + + def __init__( + self, + host: str, + port: int, + protocol: str, + tag_prefix: str, + es_host: Optional[str] = None, + es_index: str = "dstack-logs", + es_api_key: Optional[str] = None, + ) -> None: + self._tag_prefix = tag_prefix + + if protocol == "http": + self._writer: FluentBitWriter = HTTPFluentBitWriter( + host=host, port=port, tag_prefix=tag_prefix + ) + elif protocol == "forward": + self._writer = ForwardFluentBitWriter(host=host, port=port, tag_prefix=tag_prefix) + else: + raise LogStorageError(f"Unsupported Fluent-bit protocol: {protocol}") + + self._reader: LogReader + if es_host: + if not ELASTICSEARCH_AVAILABLE: + raise LogStorageError( + "Elasticsearch/OpenSearch host configured but elasticsearch package " + "is not installed. 
Install with: pip install elasticsearch" + ) + self._reader = ElasticsearchReader( + host=es_host, + index=es_index, + api_key=es_api_key, + ) + logger.info( + "Fluent-bit log storage initialized with Elasticsearch/OpenSearch reader" + ) + else: + self._reader = NullLogReader() + logger.info( + "Fluent-bit log storage initialized in ship-only mode " + "(no Elasticsearch/OpenSearch configured for reading)" + ) + + def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs: + producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB + stream_name = self._get_stream_name( + project_name=project.name, + run_name=request.run_name, + job_submission_id=request.job_submission_id, + producer=producer, + ) + return self._reader.read(stream_name=stream_name, request=request) + + def write_logs( + self, + project: ProjectModel, + run_name: str, + job_submission_id: UUID, + runner_logs: List[RunnerLogEvent], + job_logs: List[RunnerLogEvent], + ) -> None: + producers_with_logs = [(LogProducer.RUNNER, runner_logs), (LogProducer.JOB, job_logs)] + for producer, producer_logs in producers_with_logs: + if not producer_logs: + continue + stream_name = self._get_stream_name( + project_name=project.name, + run_name=run_name, + job_submission_id=job_submission_id, + producer=producer, + ) + self._write_logs_to_stream(stream_name=stream_name, logs=producer_logs) + + def _write_logs_to_stream(self, stream_name: str, logs: List[RunnerLogEvent]) -> None: + for batch in batched(logs, self.MAX_BATCH_SIZE): + records = [] + for log in batch: + message = log.message.decode(errors="replace") + timestamp = unix_time_ms_to_datetime(log.timestamp) + records.append( + { + "message": message, + "@timestamp": timestamp.isoformat(), + "stream": stream_name, + } + ) + self._writer.write(tag=stream_name, records=records) + + def close(self) -> None: + try: + self._writer.close() + finally: + self._reader.close() + + def _get_stream_name( + self, project_name: 
str, run_name: str, job_submission_id: UUID, producer: LogProducer + ) -> str: + return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}" diff --git a/src/dstack/_internal/server/services/logs/gcp.py b/src/dstack/_internal/server/services/logs/gcp.py new file mode 100644 index 0000000000..c1b1a75cf1 --- /dev/null +++ b/src/dstack/_internal/server/services/logs/gcp.py @@ -0,0 +1,181 @@ +import urllib.parse +from typing import List +from uuid import UUID + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.logs import ( + JobSubmissionLogs, + LogEvent, + LogEventSource, + LogProducer, +) +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import ( + LogStorage, + LogStorageError, + unix_time_ms_to_datetime, +) +from dstack._internal.utils.common import batched +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +GCP_LOGGING_AVAILABLE = True +try: + import google.api_core.exceptions + import google.auth.exceptions + from google.cloud import logging_v2 + from google.cloud.logging_v2.types import ListLogEntriesRequest +except ImportError: + GCP_LOGGING_AVAILABLE = False +else: + + class GCPLogStorage(LogStorage): + # Max expected message size from runner is 32KB. + # Max expected LogEntry size is 32KB + metadata < 50KB < 256KB limit. + # With MAX_BATCH_SIZE = 100, max write request size < 5MB < 10 MB limit. + # See: https://cloud.google.com/logging/quotas. + MAX_RUNNER_MESSAGE_SIZE = 32 * 1024 + MAX_BATCH_SIZE = 100 + + # Use the same log name for all run logs so that it's easy to manage all dstack-related logs. + LOG_NAME = "dstack-run-logs" + # Logs from different jobs belong to different "streams". 
+ # GCP Logging has no built-in concepts of streams, so we implement them with labels. + # It should be fast to filter by labels since labels are indexed by default + # (https://cloud.google.com/logging/docs/analyze/custom-index). + + def __init__(self, project_id: str): + self.project_id = project_id + try: + self.client = logging_v2.Client(project=project_id) + self.logger = self.client.logger(name=self.LOG_NAME) + self.logger.list_entries(max_results=1) + # Python client doesn't seem to support dry_run, + # so emit an empty log to check permissions. + self.logger.log_empty() + except google.auth.exceptions.DefaultCredentialsError: + raise LogStorageError("Default credentials not found") + except google.api_core.exceptions.NotFound: + raise LogStorageError(f"Project {project_id} not found") + except google.api_core.exceptions.PermissionDenied: + raise LogStorageError("Insufficient permissions") + + def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs: + # TODO: GCP may return logs in random order when events have the same timestamp. 
+ producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB + stream_name = self._get_stream_name( + project_name=project.name, + run_name=request.run_name, + job_submission_id=request.job_submission_id, + producer=producer, + ) + log_filters = [f'labels.stream = "{stream_name}"'] + if request.start_time: + log_filters.append(f'timestamp > "{request.start_time.isoformat()}"') + if request.end_time: + log_filters.append(f'timestamp < "{request.end_time.isoformat()}"') + log_filter = " AND ".join(log_filters) + + order_by = logging_v2.DESCENDING if request.descending else logging_v2.ASCENDING + try: + # Use low-level API to get access to next_page_token + request_obj = ListLogEntriesRequest( + resource_names=[f"projects/{self.client.project}"], + filter=log_filter, + order_by=order_by, + page_size=request.limit, + page_token=request.next_token, + ) + response = self.client._logging_api._gapic_api.list_log_entries( # type: ignore[attr-defined] + request=request_obj + ) + + logs = [ + LogEvent( + timestamp=entry.timestamp, + message=entry.json_payload.get("message"), + log_source=LogEventSource.STDOUT, + ) + for entry in response.entries + ] + next_token = response.next_page_token or None + except google.api_core.exceptions.ResourceExhausted as e: + logger.warning("GCP Logging exception: %s", repr(e)) + # GCP Logging has severely low quota of 60 reads/min for entries.list + raise ServerClientError( + "GCP Logging read request limit exceeded." + " It's recommended to increase default entries.list request quota from 60 per minute." 
+ ) + return JobSubmissionLogs( + logs=logs, + external_url=self._get_stream_extrnal_url(stream_name), + next_token=next_token if len(logs) > 0 else None, + ) + + def write_logs( + self, + project: ProjectModel, + run_name: str, + job_submission_id: UUID, + runner_logs: List[RunnerLogEvent], + job_logs: List[RunnerLogEvent], + ): + producers_with_logs = [(LogProducer.RUNNER, runner_logs), (LogProducer.JOB, job_logs)] + for producer, producer_logs in producers_with_logs: + stream_name = self._get_stream_name( + project_name=project.name, + run_name=run_name, + job_submission_id=job_submission_id, + producer=producer, + ) + self._write_logs_to_stream( + stream_name=stream_name, + logs=producer_logs, + ) + + def close(self): + self.client.close() + + def _write_logs_to_stream(self, stream_name: str, logs: List[RunnerLogEvent]): + with self.logger.batch() as batcher: + for batch in batched(logs, self.MAX_BATCH_SIZE): + for log in batch: + message = log.message.decode(errors="replace") + timestamp = unix_time_ms_to_datetime(log.timestamp) + if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE: + logger.error( + "Stream %s: skipping event at %s, message exceeds max size: %d > %d", + stream_name, + timestamp.isoformat(), + len(log.message), + self.MAX_RUNNER_MESSAGE_SIZE, + ) + continue + batcher.log_struct( + { + "message": message, + }, + labels={ + "stream": stream_name, + }, + timestamp=timestamp, + ) + batcher.commit() + + def _get_stream_name( + self, project_name: str, run_name: str, job_submission_id: UUID, producer: LogProducer + ) -> str: + return f"{project_name}-{run_name}-{job_submission_id}-{producer.value}" + + def _get_stream_extrnal_url(self, stream_name: str) -> str: + log_name_resource_name = self._get_log_name_resource_name() + query = f'logName="{log_name_resource_name}" AND labels.stream="{stream_name}"' + quoted_query = urllib.parse.quote(query, safe="") + return 
f"https://console.cloud.google.com/logs/query;query={quoted_query}?project={self.project_id}" + + def _get_log_name_resource_name(self) -> str: + return f"projects/{self.project_id}/logs/{self.LOG_NAME}" diff --git a/src/dstack/_internal/server/services/metrics.py b/src/dstack/_internal/server/services/metrics.py new file mode 100644 index 0000000000..d8f3c96830 --- /dev/null +++ b/src/dstack/_internal/server/services/metrics.py @@ -0,0 +1,172 @@ +import json +from collections import defaultdict +from collections.abc import Sequence +from datetime import datetime, timezone +from typing import Optional + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.instances import Resources +from dstack._internal.core.models.metrics import JobMetrics, Metric +from dstack._internal.server.models import JobMetricsPoint, JobModel +from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def get_job_metrics( + session: AsyncSession, + job_model: JobModel, + after: Optional[datetime] = None, + before: Optional[datetime] = None, + limit: Optional[int] = None, +) -> JobMetrics: + """ + Returns metrics ordered from the latest to the earliest. 
+ + Expected usage: + * limit=100 — get the latest 100 points + * after= — get points for the last one hour + * before=, limit=100 ­— paginate back in history + """ + stmt = ( + select(JobMetricsPoint) + .where(JobMetricsPoint.job_id == job_model.id) + .order_by(JobMetricsPoint.timestamp_micro.desc()) + ) + if after is not None: + # we need +1 point for cpu_usage_percent, thus >= + stmt = stmt.where(JobMetricsPoint.timestamp_micro >= _datetime_to_unix_time_micro(after)) + if before is not None: + stmt = stmt.where(JobMetricsPoint.timestamp_micro < _datetime_to_unix_time_micro(before)) + if limit is not None: + # +1 for cpu_usage_percent + stmt = stmt.limit(limit + 1) + res = await session.execute(stmt) + points = res.scalars().all() + # we need at least 2 points to calculate cpu_usage_percent + if len(points) < 2: + return JobMetrics(metrics=[]) + return _calculate_job_metrics(job_model, points) + + +def _calculate_job_metrics(job_model: JobModel, points: Sequence[JobMetricsPoint]) -> JobMetrics: + timestamps: list[datetime] = [] + cpu_usage_points: list[int] = [] + memory_usage_points: list[int] = [] + memory_working_set_points: list[int] = [] + gpus_memory_usage_points: defaultdict[int, list[int]] = defaultdict(list) + gpus_util_points: defaultdict[int, list[int]] = defaultdict(list) + + cpus_detected_num: Optional[int] = None + memory_total: Optional[int] = None + gpu_memory_total: Optional[int] = None + resources: Optional[Resources] = None + jrd = get_job_runtime_data(job_model) + if jrd is not None and jrd.offer is not None: + resources = jrd.offer.instance.resources + else: + jpd = get_job_provisioning_data(job_model) + if jpd is not None: + resources = jpd.instance_type.resources + if resources is not None: + cpus_detected_num = resources.cpus + memory_total = resources.memory_mib * 1024 * 1024 + if len(resources.gpus) > 0: + gpu_memory_total = resources.gpus[0].memory_mib * 1024 * 1024 + + gpus_detected_num: Optional[int] = None + 
gpus_detected_num_mismatch: bool = False + for point, prev_point in zip(points, points[1:]): + timestamps.append(_unix_time_micro_to_datetime(point.timestamp_micro)) + cpu_usage_points.append(_get_cpu_usage(point, prev_point)) + memory_usage_points.append(point.memory_usage_bytes) + memory_working_set_points.append(point.memory_working_set_bytes) + gpus_memory_usage = json.loads(point.gpus_memory_usage_bytes) + gpus_util = json.loads(point.gpus_util_percent) + if gpus_detected_num is None: + gpus_detected_num = len(gpus_memory_usage) + if len(gpus_memory_usage) != gpus_detected_num or len(gpus_util) != gpus_detected_num: + gpus_detected_num_mismatch = True + if not gpus_detected_num_mismatch: + for i in range(gpus_detected_num): + gpus_memory_usage_points[i].append(gpus_memory_usage[i]) + gpus_util_points[i].append(gpus_util[i]) + + metrics: list[Metric] = [ + Metric( + name="cpu_usage_percent", + timestamps=timestamps, + values=cpu_usage_points, + ), + Metric( + name="memory_usage_bytes", + timestamps=timestamps, + values=memory_usage_points, + ), + Metric( + name="memory_working_set_bytes", + timestamps=timestamps, + values=memory_working_set_points, + ), + ] + if cpus_detected_num is not None: + metrics.append(_make_constant_metric("cpus_detected_num", timestamps, cpus_detected_num)) + if memory_total is not None: + metrics.append(_make_constant_metric("memory_total_bytes", timestamps, memory_total)) + if gpus_detected_num_mismatch: + # If number of GPUs changed in the time window, skip GPU metrics altogether, otherwise + # results can be unpredictable (e.g, one GPU takes place of another, as they are + # identified by an array index only). 
+ logger.warning("gpus_detected_num mismatch, skipping GPU metrics") + else: + metrics.append( + _make_constant_metric("gpus_detected_num", timestamps, get_or_error(gpus_detected_num)) + ) + if gpu_memory_total is not None: + metrics.append( + _make_constant_metric("gpu_memory_total_bytes", timestamps, gpu_memory_total) + ) + for index, gpu_memory_usage_points in gpus_memory_usage_points.items(): + metrics.append( + Metric( + name=f"gpu_memory_usage_bytes_gpu{index}", + timestamps=timestamps, + values=gpu_memory_usage_points, + ) + ) + for index, gpu_util_points in gpus_util_points.items(): + metrics.append( + Metric( + name=f"gpu_util_percent_gpu{index}", + timestamps=timestamps, + values=gpu_util_points, + ) + ) + return JobMetrics(metrics=metrics) + + +def _make_constant_metric(name: str, timestamps: list[datetime], value: float) -> Metric: + return Metric( + name=name, + timestamps=timestamps, + values=[value] * len(timestamps), + ) + + +def _get_cpu_usage(last_point: JobMetricsPoint, prev_point: JobMetricsPoint) -> int: + window = last_point.timestamp_micro - prev_point.timestamp_micro + if window == 0: + return 0 + return round((last_point.cpu_usage_micro - prev_point.cpu_usage_micro) / window * 100) + + +def _unix_time_micro_to_datetime(unix_time_ms: int) -> datetime: + return datetime.fromtimestamp(unix_time_ms / 1_000_000, tz=timezone.utc) + + +def _datetime_to_unix_time_micro(dt: datetime) -> int: + return int(dt.timestamp() * 1_000_000) diff --git a/src/dstack/_internal/server/services/offers.py b/src/dstack/_internal/server/services/offers.py new file mode 100644 index 0000000000..6fd739f13e --- /dev/null +++ b/src/dstack/_internal/server/services/offers.py @@ -0,0 +1,268 @@ +import heapq +import itertools +from collections.abc import Container, Iterable, Iterator +from typing import List, Literal, Optional, Tuple, TypeVar, Union + +import gpuhunt + +from dstack._internal.core.backends.base.backend import Backend +from 
dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport +from dstack._internal.core.backends.features import ( + BACKENDS_WITH_INSTANCE_VOLUMES_SUPPORT, + BACKENDS_WITH_MULTINODE_SUPPORT, + BACKENDS_WITH_PRIVILEGED_SUPPORT, + BACKENDS_WITH_RESERVATION_SUPPORT, +) +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceOfferWithAvailability, + InstanceType, + Resources, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.core.models.volumes import Volume +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.services import backends as backends_services + + +async def get_offers_by_requirements( + project: ProjectModel, + profile: Profile, + requirements: Requirements, + exclude_not_available=False, + multinode: bool = False, + master_job_provisioning_data: Optional[JobProvisioningData] = None, + volumes: Optional[List[List[Volume]]] = None, + privileged: bool = False, + instance_mounts: bool = False, + placement_group: Optional[PlacementGroup] = None, + blocks: Union[int, Literal["auto"]] = 1, + max_offers: Optional[int] = None, +) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: + backends: List[Backend] = await backends_services.get_project_backends(project=project) + + backend_types: Optional[list[BackendType]] = profile.backends + regions: Optional[list[str]] = profile.regions + availability_zones: Optional[list[str]] = profile.availability_zones + instance_types: Optional[list[str]] = profile.instance_types + # (BackendType, region.lower() | "") + volumes_locations: Optional[set[tuple[BackendType, str]]] = None + + if volumes: + volumes_locations = {(v.get_backend(), v.get_region().lower()) for v in volumes[0]} + + if multinode: + if 
backend_types is None: + backend_types = BACKENDS_WITH_MULTINODE_SUPPORT + backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT] + + if privileged: + if backend_types is None: + backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT + backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT] + + if instance_mounts: + if backend_types is None: + backend_types = BACKENDS_WITH_INSTANCE_VOLUMES_SUPPORT + backend_types = [b for b in backend_types if b in BACKENDS_WITH_INSTANCE_VOLUMES_SUPPORT] + + if requirements.reservation is not None: + if backend_types is None: + backend_types = BACKENDS_WITH_RESERVATION_SUPPORT + backend_types = [b for b in backend_types if b in BACKENDS_WITH_RESERVATION_SUPPORT] + + # For multi-node, restrict backend and region. + # The default behavior is to provision all nodes in the same backend and region. + if master_job_provisioning_data is not None: + if backend_types is None: + backend_types = [master_job_provisioning_data.get_base_backend()] + if regions is None: + regions = [master_job_provisioning_data.region] + backend_types = [ + b for b in backend_types if b == master_job_provisioning_data.get_base_backend() + ] + regions = [r for r in regions if r == master_job_provisioning_data.region] + + if backend_types is not None: + backends = [b for b in backends if b.TYPE in backend_types or b.TYPE == BackendType.DSTACK] + + offers = await backends_services.get_backend_offers( + backends=backends, + requirements=requirements, + exclude_not_available=exclude_not_available, + ) + + offers = _filter_offers( + offers=offers, + # Double filtering by backends if backend returns offers for other backend. 
+ backend_types=backend_types, + regions=regions, + availability_zones=availability_zones, + instance_types=instance_types, + placement_group=placement_group, + volumes_locations=volumes_locations, + ) + + if blocks != 1: + offers = _get_shareable_offers(offers, blocks) + + if max_offers is not None: + offers = itertools.islice(offers, max_offers) + + # Put NOT_AVAILABLE and NO_QUOTA offers at the end. + # We have to do this after taking max_offers to avoid processing all offers + # if all/most offers are unavailable. + return sorted(offers, key=lambda i: not i[1].availability.is_available()) + + +T = TypeVar("T") + + +def merge_offer_iterables( + *iterables: Iterable[tuple[T, InstanceOfferWithAvailability]], +) -> Iterable[tuple[T, InstanceOfferWithAvailability]]: + """ + Merge offers from different sources (e.g., different backends, different fleets). + + Some backends produce offers that are not sorted by price (e.g., `vastai` sorts by pod score). + That backend-specific order is preserved. + """ + return heapq.merge(*iterables, key=lambda i: i[1].price) + + +def is_divisible_into_blocks( + cpu_count: int, gpu_count: int, blocks: Union[int, Literal["auto"]] +) -> tuple[bool, int]: + """ + Returns `True` and number of blocks the instance can be split into or `False` and `0` if + is not divisible. + Requested number of blocks can be `auto`, which means as many as possible. 
+ """ + if blocks == "auto": + if gpu_count == 0: + blocks = cpu_count + else: + blocks = min(cpu_count, gpu_count) + if blocks < 1 or cpu_count % blocks or gpu_count % blocks: + return False, 0 + return True, blocks + + +def generate_shared_offer( + offer: InstanceOfferWithAvailability, blocks: int, total_blocks: int +) -> InstanceOfferWithAvailability: + full_resources = offer.instance.resources + resources = Resources( + cpus=full_resources.cpus // total_blocks * blocks, + memory_mib=full_resources.memory_mib // total_blocks * blocks, + gpus=full_resources.gpus[: len(full_resources.gpus) // total_blocks * blocks], + spot=full_resources.spot, + disk=full_resources.disk, + description=full_resources.description, + ) + return InstanceOfferWithAvailability( + backend=offer.backend, + instance=InstanceType( + name=offer.instance.name, + resources=resources, + ), + region=offer.region, + price=offer.price, + backend_data=offer.backend_data, + availability=offer.availability, + blocks=blocks, + total_blocks=total_blocks, + ) + + +def get_instance_offer_with_restricted_az( + instance_offer: InstanceOfferWithAvailability, + master_job_provisioning_data: Optional[JobProvisioningData], +) -> InstanceOfferWithAvailability: + instance_offer = instance_offer.copy() + if ( + master_job_provisioning_data is not None + and master_job_provisioning_data.availability_zone is not None + ): + if instance_offer.availability_zones is None: + instance_offer.availability_zones = [master_job_provisioning_data.availability_zone] + instance_offer.availability_zones = [ + z + for z in instance_offer.availability_zones + if z == master_job_provisioning_data.availability_zone + ] + return instance_offer + + +def _filter_offers( + offers: Iterable[Tuple[Backend, InstanceOfferWithAvailability]], + backend_types: Optional[List[BackendType]] = None, + regions: Optional[List[str]] = None, + availability_zones: Optional[List[str]] = None, + instance_types: Optional[List[str]] = None, + 
placement_group: Optional[PlacementGroup] = None, + volumes_locations: Optional[Container[tuple[BackendType, str]]] = None, +) -> Iterator[Tuple[Backend, InstanceOfferWithAvailability]]: + """ + Yields filtered offers. May return modified offers to match the filters. + """ + if regions is not None: + regions = [r.lower() for r in regions] + if instance_types is not None: + instance_types = [i.lower() for i in instance_types] + + for b, offer in offers: + if backend_types is not None and offer.backend not in backend_types: + continue + if regions is not None and offer.region.lower() not in regions: + continue + if instance_types is not None and offer.instance.name.lower() not in instance_types: + continue + if placement_group is not None: + compute = b.compute() + if not isinstance( + compute, ComputeWithPlacementGroupSupport + ) or not compute.is_suitable_placement_group(placement_group, offer): + continue + if availability_zones is not None: + if offer.availability_zones is None: + continue + new_offer = offer.copy() + new_offer.availability_zones = [ + z for z in offer.availability_zones if z in availability_zones + ] + if not new_offer.availability_zones: + continue + offer = new_offer + # Offer is futher filtered against volumes AZs in Compute implementation, see + # ComputeWithCreateInstanceSupport._restrict_instance_offer_az_to_volumes_az() + if ( + volumes_locations is not None + and (offer.backend, offer.region.lower()) not in volumes_locations + ): + continue + yield (b, offer) + + +def _get_shareable_offers( + offers: Iterable[Tuple[Backend, InstanceOfferWithAvailability]], + blocks: Union[int, Literal["auto"]], +) -> Iterator[Tuple[Backend, InstanceOfferWithAvailability]]: + """ + Yields offers that can be shared with `total_blocks` set. 
+ """ + for backend, offer in offers: + resources = offer.instance.resources + cpu_count = resources.cpus + gpu_count = len(resources.gpus) + if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE: + # TPUs cannot be shared + gpu_count = 1 + divisible, total_blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks) + if not divisible: + continue + new_offer = offer.copy() + new_offer.total_blocks = total_blocks + yield (backend, new_offer) diff --git a/src/dstack/_internal/server/services/permissions.py b/src/dstack/_internal/server/services/permissions.py new file mode 100644 index 0000000000..b3758e725c --- /dev/null +++ b/src/dstack/_internal/server/services/permissions.py @@ -0,0 +1,43 @@ +from pydantic import Field +from typing_extensions import Annotated + +from dstack._internal.core.models.common import CoreModel + + +class DefaultPermissions(CoreModel): + allow_non_admins_create_projects: Annotated[ + bool, + Field( + description=( + "This flag controls whether regular users (non-global admins)" + " can create and manage their own projects" + ) + ), + ] = True + allow_non_admins_manage_ssh_fleets: Annotated[ + bool, + Field( + description=( + "This flag controls whether regular project members (i.e. 
Users)" + " can add and delete SSH fleets" + ) + ), + ] = True + allow_managers_manage_secrets: Annotated[ + bool, + Field( + description=("This flag controls whether project managers can manage project secrets") + ), + ] = False + + +_default_permissions = DefaultPermissions() + + +def set_default_permissions(default_permissions: DefaultPermissions): + global _default_permissions + _default_permissions = default_permissions + + +def get_default_permissions() -> DefaultPermissions: + return _default_permissions diff --git a/src/dstack/_internal/server/services/pipelines.py b/src/dstack/_internal/server/services/pipelines.py new file mode 100644 index 0000000000..cbe2a28742 --- /dev/null +++ b/src/dstack/_internal/server/services/pipelines.py @@ -0,0 +1,30 @@ +from typing import Protocol + +from fastapi import Request + + +class PipelineHinterProtocol(Protocol): + def hint_fetch(self, model_name: str) -> None: + """ + Pass `Model.__name__` to hint replica's pipelines to fetch the model's items ASAP. + """ + pass + + +class _NoopPipelineHinter: + def hint_fetch(self, model_name: str) -> None: + pass + + +_noop_pipeline_hinter = _NoopPipelineHinter() + + +def get_pipeline_hinter(request: Request) -> PipelineHinterProtocol: + """ + Returns pipeline hinter that allows hinting replica's pipelines that there are new items for processing. + This can reduce processing latency if the processing happens rarely. 
+ """ + pipeline_manager = getattr(request.app.state, "pipeline_manager", None) + if pipeline_manager is None: + return _noop_pipeline_hinter + return pipeline_manager.hinter diff --git a/src/dstack/_internal/server/services/placement.py b/src/dstack/_internal/server/services/placement.py new file mode 100644 index 0000000000..3292b70293 --- /dev/null +++ b/src/dstack/_internal/server/services/placement.py @@ -0,0 +1,218 @@ +import uuid +from collections.abc import Iterable +from typing import Optional +from uuid import UUID + +from sqlalchemy import and_, select, update +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.backends.base.compute import ( + ComputeWithPlacementGroupSupport, + generate_unique_placement_group_name, +) +from dstack._internal.core.errors import BackendError, PlacementGroupNotSupportedError +from dstack._internal.core.models.instances import InstanceOffer +from dstack._internal.core.models.placement import ( + PlacementGroup, + PlacementGroupConfiguration, + PlacementGroupProvisioningData, + PlacementStrategy, +) +from dstack._internal.server.models import FleetModel, PlacementGroupModel +from dstack._internal.server.services.instances import is_placeholder_instance +from dstack._internal.utils.common import run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +def placement_group_model_to_placement_group( + placement_group_model: PlacementGroupModel, +) -> PlacementGroup: + configuration = get_placement_group_configuration(placement_group_model) + provisioning_data = get_placement_group_provisioning_data(placement_group_model) + return PlacementGroup( + name=placement_group_model.name, + project_name=placement_group_model.project.name, + configuration=configuration, + provisioning_data=provisioning_data, + ) + + +def placement_group_model_to_placement_group_optional( + placement_group_model: Optional[PlacementGroupModel], +) -> Optional[PlacementGroup]: + if 
placement_group_model is None: + return None + return placement_group_model_to_placement_group(placement_group_model) + + +def get_placement_group_configuration( + placement_group_model: PlacementGroupModel, +) -> PlacementGroupConfiguration: + return PlacementGroupConfiguration.__response__.parse_raw(placement_group_model.configuration) + + +def get_placement_group_provisioning_data( + placement_group_model: PlacementGroupModel, +) -> Optional[PlacementGroupProvisioningData]: + if placement_group_model.provisioning_data is None: + return None + return PlacementGroupProvisioningData.__response__.parse_raw( + placement_group_model.provisioning_data + ) + + +async def get_fleet_placement_group_models( + session: AsyncSession, + fleet_id: Optional[UUID], +) -> list[PlacementGroupModel]: + if fleet_id is None: + return [] + res = await session.execute( + select(PlacementGroupModel).where( + and_( + PlacementGroupModel.fleet_id == fleet_id, + PlacementGroupModel.deleted == False, + PlacementGroupModel.fleet_deleted == False, + ) + ) + ) + return list(res.scalars().all()) + + +async def schedule_fleet_placement_groups_deletion( + session: AsyncSession, fleet_id: UUID, except_placement_group_ids: Iterable[UUID] = () +): + await session.execute( + update(PlacementGroupModel) + .where( + and_( + PlacementGroupModel.fleet_id == fleet_id, + PlacementGroupModel.id.not_in(except_placement_group_ids), + ) + ) + .values(fleet_deleted=True) # TODO: rename `fleet_deleted` -> `to_be_deleted` + ) + + +def get_placement_group_model_for_job( + placement_group_models: list[PlacementGroupModel], + fleet_model: Optional[FleetModel], +) -> Optional[PlacementGroupModel]: + """ + Returns any fleet placement group for jobs that provision + in non-empty fleets and `None` for empty fleets. + This is so that only the first job creates placement groups. 
+ Placeholder reservations are excluded: a placeholder-only fleet is treated + as empty here so offer selection is not pinned to a stale PG's region. + """ + placement_group_model = None + active_instances = [] + if fleet_model is not None: + active_instances = [ + i for i in fleet_model.instances if not i.deleted and not is_placeholder_instance(i) + ] + if len(active_instances) > 0 and len(placement_group_models) > 0: + placement_group_model = placement_group_models[0] + return placement_group_model + + +async def find_or_create_suitable_placement_group( + fleet_model: FleetModel, + placement_groups: list[PlacementGroupModel], + instance_offer: InstanceOffer, + compute: ComputeWithPlacementGroupSupport, +) -> Optional[PlacementGroupModel]: + placement_group_model = find_suitable_placement_group( + placement_groups=placement_groups, + instance_offer=instance_offer, + compute=compute, + ) + if placement_group_model is None: + placement_group_model = await create_placement_group( + fleet_model=fleet_model, + master_instance_offer=instance_offer, + compute=compute, + ) + return placement_group_model + + +def find_suitable_placement_group( + placement_groups: list[PlacementGroupModel], + instance_offer: InstanceOffer, + compute: ComputeWithPlacementGroupSupport, +) -> Optional[PlacementGroupModel]: + for pg in placement_groups: + if compute.is_suitable_placement_group( + placement_group_model_to_placement_group(pg), instance_offer + ): + return pg + return None + + +async def create_placement_group( + fleet_model: FleetModel, + master_instance_offer: InstanceOffer, + compute: ComputeWithPlacementGroupSupport, +) -> Optional[PlacementGroupModel]: + placement_group_model = PlacementGroupModel( + id=uuid.uuid4(), + # TODO: generate the name in Compute.create_placement_group to allow + # backend-specific name length limits + name=generate_unique_placement_group_name( + project_name=fleet_model.project.name, + fleet_name=fleet_model.name, + ), + project=fleet_model.project, 
+ fleet=fleet_model, + configuration=PlacementGroupConfiguration( + backend=master_instance_offer.backend, + region=master_instance_offer.region, + placement_strategy=PlacementStrategy.CLUSTER, + ).json(), + ) + placement_group = placement_group_model_to_placement_group(placement_group_model) + logger.debug( + "Creating placement group %s in %s/%s", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + ) + try: + pgpd = await run_async( + compute.create_placement_group, + placement_group_model_to_placement_group(placement_group_model), + master_instance_offer, + ) + except PlacementGroupNotSupportedError: + logger.debug( + "Skipping offer %s because placement group not supported", + master_instance_offer.instance.name, + ) + return None + except BackendError as e: + logger.warning( + "Failed to create placement group %s in %s/%s: %r", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + e, + ) + return None + except Exception: + logger.exception( + "Got exception when creating placement group %s in %s/%s", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + ) + return None + logger.info( + "Created placement group %s in %s/%s", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + ) + placement_group_model.provisioning_data = pgpd.json() + return placement_group_model diff --git a/src/dstack/_internal/server/services/plugins.py b/src/dstack/_internal/server/services/plugins.py new file mode 100644 index 0000000000..d40b84b36d --- /dev/null +++ b/src/dstack/_internal/server/services/plugins.py @@ -0,0 +1,108 @@ +import itertools +from importlib import import_module +from importlib.metadata import entry_points +from typing import Dict + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.utils.common 
import run_async +from dstack._internal.utils.logging import get_logger +from dstack.plugins import ApplyPolicy, ApplySpec, Plugin + +logger = get_logger(__name__) + + +_PLUGINS: list[Plugin] = [] + +_BUILTIN_PLUGINS: Dict[str, str] = {"rest_plugin": "dstack.plugins.builtin.rest_plugin:RESTPlugin"} + + +class PluginEntrypoint: + def __init__(self, name: str, import_path: str, is_builtin: bool = False): + self.name = name + self.import_path = import_path + self.is_builtin = is_builtin + + def load(self): + module_path, _, class_name = self.import_path.partition(":") + try: + module = import_module(module_path) + plugin_class = getattr(module, class_name, None) + if plugin_class is None: + logger.warning( + ("Failed to load plugin %s: plugin class %s not found in module %s."), + self.name, + class_name, + module_path, + ) + return None + if not issubclass(plugin_class, Plugin): + logger.warning( + ("Failed to load plugin %s: plugin class %s is not a subclass of Plugin."), + self.name, + class_name, + ) + return None + return plugin_class() + except ImportError: + logger.warning( + ( + "Failed to load plugin %s when importing %s." + " Ensure the module is on the import path." + ), + self.name, + self.import_path, + ) + return None + + +def load_plugins(enabled_plugins: list[str]): + _PLUGINS.clear() + entrypoints: dict[str, PluginEntrypoint] = {} + plugins_to_load = enabled_plugins.copy() + for entrypoint in entry_points(group="dstack.plugins"): + if entrypoint.name not in enabled_plugins: + logger.info( + ("Found not enabled plugin %s. Plugin will not be loaded."), + entrypoint.name, + ) + continue + else: + entrypoints[entrypoint.name] = PluginEntrypoint( + entrypoint.name, entrypoint.value, is_builtin=False + ) + + for name, import_path in _BUILTIN_PLUGINS.items(): + if name not in enabled_plugins: + logger.debug( + ("Found not enabled builtin plugin %s. 
Plugin will not be loaded."), + name, + ) + else: + entrypoints[name] = PluginEntrypoint(name, import_path, is_builtin=True) + + for plugin_name, plugin_entrypoint in entrypoints.items(): + plugin_instance = plugin_entrypoint.load() + if plugin_instance is not None: + _PLUGINS.append(plugin_instance) + plugins_to_load.remove(plugin_name) + logger.info("Loaded plugin %s", plugin_name) + + if plugins_to_load: + logger.warning("Enabled plugins not found: %s", plugins_to_load) + + +async def apply_plugin_policies(user: str, project: str, spec: ApplySpec) -> ApplySpec: + policies = _get_apply_policies() + for policy in policies: + try: + spec = await run_async(policy.on_apply, user=user, project=project, spec=spec) + except ValueError as e: + msg = None + if len(e.args) > 0: + msg = e.args[0] + raise ServerClientError(msg) + return spec + + +def _get_apply_policies() -> list[ApplyPolicy]: + return list(itertools.chain(*[p.get_apply_policies() for p in _PLUGINS])) diff --git a/src/dstack/_internal/server/services/pools.py b/src/dstack/_internal/server/services/pools.py deleted file mode 100644 index 20f6ac4bfe..0000000000 --- a/src/dstack/_internal/server/services/pools.py +++ /dev/null @@ -1,551 +0,0 @@ -import asyncio -import ipaddress -import uuid -from datetime import datetime, timezone -from typing import Dict, List, Optional - -import gpuhunt -from sqlalchemy import and_, or_, select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT -from dstack._internal.core.backends.base.offers import ( - offer_to_catalog_item, - requirements_to_query_filter, -) -from dstack._internal.core.errors import ( - ResourceExistsError, - ResourceNotExistsError, - ServerClientError, -) -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceOffer, - 
InstanceOfferWithAvailability, - InstanceType, - RemoteConnectionInfo, - Resources, - SSHKey, -) -from dstack._internal.core.models.pools import Instance, Pool, PoolInstances -from dstack._internal.core.models.profiles import DEFAULT_POOL_NAME, Profile, TerminationPolicy -from dstack._internal.core.models.runs import InstanceStatus, JobProvisioningData, Requirements -from dstack._internal.core.models.users import GlobalRole -from dstack._internal.core.models.volumes import Volume -from dstack._internal.server import settings -from dstack._internal.server.models import InstanceModel, PoolModel, ProjectModel, UserModel -from dstack._internal.server.services.jobs import PROCESSING_POOL_LOCK -from dstack._internal.server.services.projects import list_project_models, list_user_project_models -from dstack._internal.utils import common as common_utils -from dstack._internal.utils import random_names -from dstack._internal.utils.common import get_current_datetime -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -async def list_project_pools(session: AsyncSession, project: ProjectModel) -> List[Pool]: - pools = await list_project_pool_models(session=session, project=project) - if len(pools) == 0: - pool = await get_or_create_pool_by_name(session, project, DEFAULT_POOL_NAME) - pools.append(pool) - return [pool_model_to_pool(p) for p in pools] - - -async def get_pool( - session: AsyncSession, project: ProjectModel, pool_name: str, select_deleted: bool = False -) -> Optional[PoolModel]: - filters = [ - PoolModel.name == pool_name, - PoolModel.project_id == project.id, - ] - if not select_deleted: - filters.append(PoolModel.deleted == False) - res = await session.scalars(select(PoolModel).where(*filters)) - return res.one_or_none() - - -async def get_or_create_pool_by_name( - session: AsyncSession, project: ProjectModel, pool_name: Optional[str] -) -> PoolModel: - if pool_name is None: - if project.default_pool_id is not None: - return 
await get_default_pool_or_error(session, project) - default_pool = await get_pool(session, project, DEFAULT_POOL_NAME) - if default_pool is not None: - await set_default_pool(session, project, DEFAULT_POOL_NAME) - return default_pool - return await create_pool(session, project, DEFAULT_POOL_NAME) - pool = await get_pool(session, project, pool_name) - if pool is not None: - return pool - return await create_pool(session, project, pool_name) - - -async def get_default_pool_or_error(session: AsyncSession, project: ProjectModel) -> PoolModel: - res = await session.execute(select(PoolModel).where(PoolModel.id == project.default_pool_id)) - return res.scalar_one() - - -async def create_pool(session: AsyncSession, project: ProjectModel, name: str) -> PoolModel: - pool = await get_pool(session, project, name) - if pool is not None: - raise ResourceExistsError() - pool = PoolModel( - name=name, - project_id=project.id, - ) - session.add(pool) - await session.commit() - await session.refresh(pool) - if project.default_pool_id is None: - await set_default_pool(session, project, pool.name) - return pool - - -async def list_project_pool_models( - session: AsyncSession, project: ProjectModel, select_deleted: bool = False -) -> List[PoolModel]: - filters = [PoolModel.project_id == project.id] - if not select_deleted: - filters.append(PoolModel.deleted == select_deleted) - pools = await session.execute( - select(PoolModel).where(*filters).options(joinedload(PoolModel.instances)) - ) - return list(pools.scalars().unique().all()) - - -async def set_default_pool(session: AsyncSession, project: ProjectModel, pool_name: str): - pool = await get_pool(session, project, pool_name) - if pool is None: - raise ResourceNotExistsError("Pool not found") - project.default_pool = pool - await session.commit() - - -async def delete_pool(session: AsyncSession, project: ProjectModel, pool_name: str) -> None: - # TODO force delete - pool = await get_pool(session, project, pool_name) - if pool is 
None: - raise ResourceNotExistsError("Pool not found") - - pool_instances = get_pool_instances(pool) - for instance in pool_instances: - if instance.status != InstanceStatus.TERMINATED: - raise ServerClientError("Cannot delete pool with running instances") - - pool.deleted = True - pool.deleted_at = get_current_datetime() - if project.default_pool_id == pool.id: - project.default_pool_id = None - await session.commit() - - -def pool_model_to_pool(pool_model: PoolModel) -> Pool: - total = 0 - available = 0 - for instance in pool_model.instances: - if not instance.deleted: - total += 1 - if instance.status.is_available(): - available += 1 - return Pool( - name=pool_model.name, - default=pool_model.project.default_pool_id == pool_model.id, - created_at=pool_model.created_at.replace(tzinfo=timezone.utc), - total_instances=total, - available_instances=available, - ) - - -async def remove_instance( - session: AsyncSession, - project: ProjectModel, - pool_name: str, - instance_name: str, - force: bool, -): - pool = await get_pool(session, project, pool_name) - if pool is None: - raise ResourceNotExistsError("Pool not found") - async with PROCESSING_POOL_LOCK: - terminated = False - for instance in pool.instances: - if instance.name == instance_name: - if force or instance.job_id is None: - instance.status = InstanceStatus.TERMINATING - terminated = True - await session.commit() - if not terminated: - raise ResourceNotExistsError("Could not find instance to terminate") - - -async def show_pool_instances( - session: AsyncSession, project: ProjectModel, pool_name: Optional[str] -) -> PoolInstances: - if pool_name is not None: - pool = await get_pool(session, project, pool_name) - if pool is None: - raise ResourceNotExistsError("Pool not found") - else: - pool = await get_or_create_pool_by_name(session, project, pool_name) - pool_instances = get_pool_instances(pool) - instances = list(map(instance_model_to_instance, pool_instances)) - return PoolInstances( - name=pool.name, - 
instances=instances, - ) - - -def get_pool_instances(pool: PoolModel) -> List[InstanceModel]: - return [instance for instance in pool.instances if not instance.deleted] - - -def instance_model_to_instance(instance_model: InstanceModel) -> Instance: - instance = Instance( - id=instance_model.id, - project_name=instance_model.project.name, - name=instance_model.name, - status=instance_model.status, - unreachable=instance_model.unreachable, - created=instance_model.created_at.replace(tzinfo=timezone.utc), - ) - - offer = get_instance_offer(instance_model) - if offer is not None: - instance.backend = offer.backend - instance.region = offer.region - instance.price = offer.price - - jpd = get_instance_provisioning_data(instance_model) - if jpd is not None: - instance.instance_type = jpd.instance_type - instance.hostname = jpd.hostname - - if instance_model.job is not None: - instance.job_name = instance_model.job.job_name - instance.job_status = instance_model.job.status - - if instance_model.pool is not None: - instance.pool_name = instance_model.pool.name - - return instance - - -def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]: - if instance_model.job_provisioning_data is None: - return None - return JobProvisioningData.__response__.parse_raw(instance_model.job_provisioning_data) - - -def get_instance_offer(instance_model: InstanceModel) -> Optional[InstanceOfferWithAvailability]: - if instance_model.offer is None: - return None - return InstanceOfferWithAvailability.__response__.parse_raw(instance_model.offer) - - -_GENERATE_POOL_NAME_LOCK: Dict[str, asyncio.Lock] = {} - - -async def generate_instance_name( - session: AsyncSession, - project: ProjectModel, - pool_name: str, -) -> str: - lock = _GENERATE_POOL_NAME_LOCK.setdefault(project.name, asyncio.Lock()) - async with lock: - pool_instances = [] - pool = await get_pool(session, project, pool_name) - if pool is not None: - pool_instances = get_pool_instances(pool) - 
names = {g.name for g in pool_instances} - while True: - name = f"{random_names.generate_name()}" - if name not in names: - return name - - -async def add_remote( - session: AsyncSession, - project: ProjectModel, - pool_name: Optional[str], - instance_name: Optional[str], - instance_network: Optional[str], - region: Optional[str], - host: str, - port: int, - ssh_user: str, - ssh_keys: List[SSHKey], -) -> Instance: - if instance_network is not None: - try: - interface = ipaddress.IPv4Interface(instance_network) - instance_network = str(interface.network) - except ipaddress.AddressValueError: - raise ServerClientError("Failed to parse network value") - - # Check instance in all instances - pools = await list_project_pool_models(session, project) - for pool in pools: - for instance in pool.instances: - if instance.deleted: - continue - if instance.remote_connection_info is not None: - rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info) - if rci.host == host and rci.port == port and rci.ssh_user == ssh_user: - return instance_model_to_instance(instance) - - pool_model = await get_or_create_pool_by_name(session, project, pool_name) - pool_model_name = pool_model.name - if instance_name is None: - instance_name = await generate_instance_name(session, project, pool_model_name) - - # TODO: doc - will overwrite after remote connected - instance_resource = Resources(cpus=2, memory_mib=8, gpus=[], spot=False) - instance_type = InstanceType(name="ssh", resources=instance_resource) - - host_region = region if region is not None else "remote" - - remote = JobProvisioningData( - backend=BackendType.REMOTE, - instance_type=instance_type, - instance_id=instance_name, - hostname=host, - region=host_region, - internal_ip=None, - instance_network=instance_network, - price=0, - username=ssh_user, - ssh_port=port, - dockerized=True, - backend_data="", - ssh_proxy=None, - ) - offer = InstanceOfferWithAvailability( - backend=BackendType.REMOTE, - 
instance=instance_type, - region=host_region, - price=0.0, - availability=InstanceAvailability.AVAILABLE, - ) - - ssh_connection_info = RemoteConnectionInfo( - host=host, port=port, ssh_user=ssh_user, ssh_keys=ssh_keys - ).json() - - im = InstanceModel( - id=uuid.uuid4(), - name=instance_name, - project=project, - pool=pool_model, - backend=BackendType.REMOTE, - created_at=common_utils.get_current_datetime(), - started_at=common_utils.get_current_datetime(), - status=InstanceStatus.PENDING, - unreachable=False, - job_provisioning_data=remote.json(), - remote_connection_info=ssh_connection_info, - offer=offer.json(), - region=offer.region, - price=offer.price, - termination_policy=TerminationPolicy.DONT_DESTROY, - termination_idle_time=0, - ) - session.add(im) - await session.commit() - - instance = instance_model_to_instance(im) - return instance - - -def filter_pool_instances( - pool_instances: List[InstanceModel], - profile: Profile, - requirements: Requirements, - *, - status: Optional[InstanceStatus] = None, - multinode: bool = False, - master_job_provisioning_data: Optional[JobProvisioningData] = None, - volumes: Optional[List[Volume]] = None, -) -> List[InstanceModel]: - instances: List[InstanceModel] = [] - candidates: List[InstanceModel] = [] - - backend_types = profile.backends - regions = profile.regions - zone = None - - if volumes: - volume = volumes[0] - backend_types = [volume.configuration.backend] - regions = [volume.configuration.region] - if volume.provisioning_data is not None: - zone = volume.provisioning_data.availability_zone - - if multinode: - if not backend_types: - backend_types = BACKENDS_WITH_MULTINODE_SUPPORT - backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT] - - # For multi-node, restrict backend and region. - # The default behavior is to provision all nodes in the same backend and region. 
- if master_job_provisioning_data is not None: - if not backend_types: - backend_types = [master_job_provisioning_data.backend] - backend_types = [b for b in backend_types if b == master_job_provisioning_data.backend] - if not regions: - regions = [master_job_provisioning_data.region] - regions = [b for b in backend_types if b == master_job_provisioning_data.region] - - for instance in pool_instances: - if instance.unreachable: - continue - if profile.instance_name is not None and instance.name != profile.instance_name: - continue - if status is not None and instance.status != status: - continue - - # TODO: remove on prod - if settings.LOCAL_BACKEND_ENABLED and instance.backend == BackendType.LOCAL: - instances.append(instance) - continue - - if backend_types is not None and instance.backend not in backend_types: - continue - - if regions is not None and instance.region not in regions: - continue - - jpd = get_instance_provisioning_data(instance) - if ( - jpd is not None - and jpd.availability_zone is not None - and zone is not None - and jpd.availability_zone != zone - ): - continue - - candidates.append(instance) - - query_filter = requirements_to_query_filter(requirements) - for instance in candidates: - if instance.offer is None: - continue - offer = InstanceOffer.__response__.parse_raw(instance.offer) - catalog_item = offer_to_catalog_item(offer) - if gpuhunt.matches(catalog_item, query_filter): - instances.append(instance) - return instances - - -async def list_pools_instance_models( - session: AsyncSession, - projects: List[ProjectModel], - pool: Optional[PoolModel], - only_active: bool, - prev_created_at: Optional[datetime], - prev_id: Optional[uuid.UUID], - limit: int, - ascending: bool, -) -> List[InstanceModel]: - filters: List = [ - InstanceModel.project_id.in_(p.id for p in projects), - ] - if pool is not None: - filters.append(InstanceModel.pool_id == pool.id) - if only_active: - filters.extend( - [ - InstanceModel.deleted == False, - 
InstanceModel.status.in_([InstanceStatus.IDLE, InstanceStatus.BUSY]), - ] - ) - if prev_created_at is not None: - if ascending: - if prev_id is None: - filters.append(InstanceModel.created_at > prev_created_at) - else: - filters.append( - or_( - InstanceModel.created_at > prev_created_at, - and_( - InstanceModel.created_at == prev_created_at, - InstanceModel.id < prev_id, - ), - ) - ) - else: - if prev_id is None: - filters.append(InstanceModel.created_at < prev_created_at) - else: - filters.append( - or_( - InstanceModel.created_at < prev_created_at, - and_( - InstanceModel.created_at == prev_created_at, - InstanceModel.id > prev_id, - ), - ) - ) - order_by = (InstanceModel.created_at.desc(), InstanceModel.id) - if ascending: - order_by = (InstanceModel.created_at.asc(), InstanceModel.id.desc()) - - res = await session.execute( - select(InstanceModel) - .where(*filters) - .order_by(*order_by) - .limit(limit) - .options(joinedload(InstanceModel.pool)) - ) - instance_models = list(res.scalars().all()) - return instance_models - - -async def list_user_pool_instances( - session: AsyncSession, - user: UserModel, - project_name: Optional[str], - pool_name: Optional[str], - only_active: bool, - prev_created_at: Optional[datetime], - prev_id: Optional[uuid.UUID], - limit: int, - ascending: bool, -) -> List[Instance]: - if user.global_role == GlobalRole.ADMIN: - projects = await list_project_models(session=session) - else: - projects = await list_user_project_models(session=session, user=user) - if not projects: - return [] - - pool = None - if project_name is not None: - projects = [proj for proj in projects if proj.name == project_name] - if len(projects) == 0: - return [] - if pool_name is not None: - pool = await get_pool( - session=session, - project=projects[0], - pool_name=pool_name, - select_deleted=(not only_active), - ) - - instance_models = await list_pools_instance_models( - session=session, - projects=projects, - pool=pool, - only_active=only_active, - 
prev_created_at=prev_created_at, - prev_id=prev_id, - limit=limit, - ascending=ascending, - ) - instances = [] - for instance in instance_models: - instances.append(instance_model_to_instance(instance)) - return instances diff --git a/src/dstack/_internal/server/services/probes.py b/src/dstack/_internal/server/services/probes.py new file mode 100644 index 0000000000..719c6a94d3 --- /dev/null +++ b/src/dstack/_internal/server/services/probes.py @@ -0,0 +1,10 @@ +from dstack._internal.core.models.runs import Probe, ProbeSpec +from dstack._internal.server.models import ProbeModel + + +def probe_model_to_probe(probe_model: ProbeModel) -> Probe: + return Probe(success_streak=probe_model.success_streak) + + +def is_probe_ready(probe: ProbeModel, spec: ProbeSpec) -> bool: + return probe.success_streak >= spec.ready_after diff --git a/src/dstack/_internal/server/services/projects.py b/src/dstack/_internal/server/services/projects.py index b2d6df4344..5308d6045e 100644 --- a/src/dstack/_internal/server/services/projects.py +++ b/src/dstack/_internal/server/services/projects.py @@ -1,26 +1,56 @@ +import re +import secrets import uuid +from datetime import datetime from typing import Awaitable, Callable, List, Optional, Tuple -from sqlalchemy import delete, select, update +from sqlalchemy import and_, delete, literal_column, or_, select, update from sqlalchemy import func as safunc from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import QueryableAttribute, joinedload, load_only +from dstack._internal.core.backends.configurators import get_configurator +from dstack._internal.core.backends.dstack.models import ( + DstackBackendConfig, + DstackBaseBackendConfig, +) +from dstack._internal.core.backends.models import BackendInfo from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError -from dstack._internal.core.models.backends import BackendInfo -from dstack._internal.core.models.backends.dstack import ( - 
DstackBaseBackendConfigInfo, - DstackConfigInfo, +from dstack._internal.core.models.projects import ( + Member, + MemberPermissions, + Project, + ProjectHookConfig, + ProjectsInfoList, + ProjectsInfoListOrProjectsList, ) -from dstack._internal.core.models.common import is_core_model_instance -from dstack._internal.core.models.projects import Member, Project +from dstack._internal.core.models.runs import RunStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.models import MemberModel, ProjectModel, UserModel +from dstack._internal.server.const import GLOBAL_EXPORTS_LOCK_NAMESPACE +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite +from dstack._internal.server.models import ( + ExportModel, + FleetModel, + ImportModel, + MemberModel, + ProjectModel, + RunModel, + UserModel, + VolumeModel, +) from dstack._internal.server.schemas.projects import MemberSetting -from dstack._internal.server.services import users -from dstack._internal.server.services.backends import get_configurator +from dstack._internal.server.services import events, users +from dstack._internal.server.services import templates as templates_service +from dstack._internal.server.services.backends import ( + get_backend_config_without_creds_from_backend_model, +) +from dstack._internal.server.services.locking import ( + get_locker, + string_to_lock_id, +) +from dstack._internal.server.services.permissions import get_default_permissions from dstack._internal.server.settings import DEFAULT_PROJECT_NAME -from dstack._internal.server.utils.common import run_async -from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.crypto import generate_rsa_key_pair_bytes from dstack._internal.utils.logging import get_logger @@ -28,7 +58,8 @@ async def get_or_create_default_project( - session: AsyncSession, user: UserModel + 
session: AsyncSession, + user: UserModel, ) -> Tuple[Project, bool]: default_project = await get_project_by_name( session=session, @@ -37,25 +68,94 @@ async def get_or_create_default_project( if default_project is not None: return default_project, False default_project = await create_project( - session=session, user=user, project_name=DEFAULT_PROJECT_NAME + session=session, + user=user, + project_name=DEFAULT_PROJECT_NAME, ) return default_project, True -async def list_user_projects( +async def list_user_accessible_projects( session: AsyncSession, user: UserModel, -) -> List[Project]: - if user.global_role == GlobalRole.ADMIN: - projects = await list_project_models(session=session) - else: - projects = await list_user_project_models(session=session, user=user) - return [project_model_to_project(p) for p in projects] - - -async def list_projects(session: AsyncSession) -> List[Project]: - projects = await list_project_models(session=session) - return [project_model_to_project(p) for p in projects] + include_not_joined: bool, + return_total_count: bool, + name_pattern: Optional[str], + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> ProjectsInfoListOrProjectsList: + """ + Returns all projects accessible to the user: + - All projects for global admins + - Projects where user is a member (public or private) + - if `include_not_joined`: Public projects where user is NOT a member + """ + filters = [ProjectModel.deleted == False] + if name_pattern: + name_pattern = name_pattern.replace("_", "/_") + filters.append(ProjectModel.name.ilike(f"%{name_pattern}%", escape="/")) + stmt = select(ProjectModel).where(*filters) + if user.global_role != GlobalRole.ADMIN: + stmt = stmt.outerjoin( + MemberModel, + onclause=and_( + MemberModel.project_id == ProjectModel.id, + MemberModel.user_id == user.id, + ), + ) + if include_not_joined: + stmt = stmt.where( + or_( + ProjectModel.is_public == True, + 
MemberModel.user_id.is_not(None), + ) + ) + else: + stmt = stmt.where(MemberModel.user_id.is_not(None)) + pagination_filters = [] + if prev_created_at is not None: + if ascending: + if prev_id is None: + pagination_filters.append(ProjectModel.created_at > prev_created_at) + else: + pagination_filters.append( + or_( + ProjectModel.created_at > prev_created_at, + and_( + ProjectModel.created_at == prev_created_at, ProjectModel.id < prev_id + ), + ) + ) + else: + if prev_id is None: + pagination_filters.append(ProjectModel.created_at < prev_created_at) + else: + pagination_filters.append( + or_( + ProjectModel.created_at < prev_created_at, + and_( + ProjectModel.created_at == prev_created_at, ProjectModel.id > prev_id + ), + ) + ) + order_by = (ProjectModel.created_at.desc(), ProjectModel.id) + if ascending: + order_by = (ProjectModel.created_at.asc(), ProjectModel.id.desc()) + total_count = None + if return_total_count: + res = await session.execute(stmt.with_only_columns(safunc.count(literal_column("1")))) + total_count = res.scalar_one() + res = await session.execute(stmt.where(*pagination_filters).order_by(*order_by).limit(limit)) + project_models = res.unique().scalars().all() + projects = [ + project_model_to_project(p, include_backends=False, include_members=False) + for p in project_models + ] + if total_count is None: + return projects + return ProjectsInfoList(total_count=total_count, projects=projects) async def get_project_by_name( @@ -68,7 +168,17 @@ async def get_project_by_name( return project_model_to_project(project_model) -async def create_project(session: AsyncSession, user: UserModel, project_name: str) -> Project: +async def create_project( + session: AsyncSession, + user: UserModel, + project_name: str, + is_public: bool = False, + templates_repo: Optional[str] = None, + config: Optional[ProjectHookConfig] = None, +) -> Project: + user_permissions = users.get_user_permissions(user) + if not user_permissions.can_create_projects: + raise 
ForbiddenError("User cannot create projects") project = await get_project_model_by_name( session=session, project_name=project_name, ignore_case=True ) @@ -79,93 +189,274 @@ async def create_project(session: AsyncSession, user: UserModel, project_name: s session=session, owner=user, project_name=project_name, + is_public=is_public, + templates_repo=templates_repo, ) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN, + member_num=0, ) project_model = await get_project_model_by_name_or_error( session=session, project_name=project_name ) for hook in _CREATE_PROJECT_HOOKS: - await hook(session, project_model) - await session.refresh(project_model) # a hook may change project + await hook(session, project_model, config) + # a hook may change project + session.expire(project_model) + project_model = await get_project_model_by_name_or_error( + session=session, project_name=project_name + ) return project_model_to_project(project_model) +async def update_project( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + is_public: Optional[bool] = None, + templates_repo: Optional[str] = None, + reset_templates_repo: bool = False, +): + updated_fields = [] + if is_public is not None and is_public != project.is_public: + project.is_public = is_public + updated_fields.append(f"is_public={is_public}") + + update_templates_repo, new_templates_repo = await _resolve_new_templates_repo( + project=project, + templates_repo=templates_repo, + reset_templates_repo=reset_templates_repo, + ) + if update_templates_repo: + templates_service.invalidate_templates_cache( + project.id, project.templates_repo, new_templates_repo + ) + project.templates_repo = new_templates_repo + updated_fields.append(f"templates_repo={new_templates_repo}") + events.emit( + session, + f"Project updated. 
Updated fields: {', '.join(updated_fields) or ''}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + await session.commit() + + async def delete_projects( session: AsyncSession, user: UserModel, projects_names: List[str], ): if user.global_role != GlobalRole.ADMIN: - user_projects = await list_user_project_models(session=session, user=user) + user_projects = await list_member_project_models( + session=session, user=user, include_members=True + ) user_project_names = [p.name for p in user_projects] for project_name in projects_names: if project_name not in user_project_names: raise ForbiddenError() - for project in user_projects: + projects_to_delete = [p for p in user_projects if p.name in projects_names] + for project in projects_to_delete: if not _is_project_admin(user=user, project=project): raise ForbiddenError() - if all(name in projects_names for name in user_project_names): - raise ServerClientError("Cannot delete the only project") - timestamp = str(int(get_current_datetime().timestamp())) - new_project_name = "_deleted_" + timestamp + ProjectModel.name - await session.execute( - update(ProjectModel) - .where(ProjectModel.name.in_(projects_names)) - .values( - deleted=True, - name=new_project_name, + + res = await session.execute( + select(ProjectModel) + .where( + ProjectModel.name.in_(projects_names), + ProjectModel.deleted == False, ) + .options(load_only(ProjectModel.id, ProjectModel.name)) ) - await session.commit() + projects = res.scalars().all() + if len(projects) != len(projects_names): + raise ServerClientError("Failed to delete non-existent projects") + for p in projects: + # FIXME: The checks are not under lock, + # so there can be dangling active resources due to race conditions. 
+ await _check_project_has_active_resources(session=session, project_id=p.id) -async def add_project_member( - session: AsyncSession, - project: ProjectModel, - user: UserModel, - project_role: ProjectRole, - commit: bool = True, -) -> MemberModel: - member = MemberModel( - user_id=user.id, - project_id=project.id, - project_role=project_role, - ) - session.add(member) - if commit: - await session.commit() - return member + project_ids = {p.id for p in projects} + timestamp = str(int(get_current_datetime().timestamp())) + updates = [] + for p in projects: + updates.append( + { + "id": p.id, + "name": f"_deleted_{timestamp}_{secrets.token_hex(8)}", + "original_name": p.name, + "deleted": True, + } + ) + events.emit( + session, + "Project deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(p)], + ) + await session.execute(update(ProjectModel), updates) + await session.execute(delete(ExportModel).where(ExportModel.project_id.in_(project_ids))) + await session.execute(delete(ImportModel).where(ImportModel.project_id.in_(project_ids))) + await session.commit() async def set_project_members( session: AsyncSession, + user: UserModel, project: ProjectModel, members: List[MemberSetting], ): + usernames = {m.username for m in members} + if len(usernames) != len(members): + raise ServerClientError("Cannot add same user multiple times") + + project = await get_project_model_by_name_or_error( + session=session, + project_name=project.name, + ) + project_role = get_user_project_role(user=user, project=project) + if user.global_role != GlobalRole.ADMIN and project_role == ProjectRole.MANAGER: + new_admins_members = { + (m.username, m.project_role) for m in members if m.project_role == ProjectRole.ADMIN + } + current_admins_members = { + (m.user.name, m.project_role) + for m in project.members + if m.project_role == ProjectRole.ADMIN + } + if new_admins_members != current_admins_members: + raise ForbiddenError("Access denied: changing project 
admins") + + # FIXME: potentially long write transaction + # clear_project_members() issues DELETE without commit await clear_project_members(session=session, project=project) - usernames = [m.username for m in members] - res = await session.execute(select(UserModel).where(UserModel.name.in_(usernames))) + names = [m.username for m in members] + res = await session.execute( + select(UserModel).where( + (UserModel.name.in_(names)) | (UserModel.email.in_(names)), + UserModel.deleted == False, + ) + ) users = res.scalars().all() username_to_user = {user.name: user for user in users} - for member in members: - user = username_to_user.get(member.username) - if user is None: + email_to_user = {user.email: user for user in users if user.email} + for i, member in enumerate(members): + user_to_add = username_to_user.get(member.username) or email_to_user.get(member.username) + if user_to_add is None: continue await add_project_member( session=session, project=project, - user=user, + user=user_to_add, project_role=member.project_role, + member_num=i, commit=False, ) await session.commit() +async def add_project_members( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + members: List[MemberSetting], +): + """Add multiple members to a project.""" + usernames = {m.username for m in members} + if len(usernames) != len(members): + raise ServerClientError("Cannot add same user multiple times") + + project = await get_project_model_by_name_or_error( + session=session, + project_name=project.name, + ) + requesting_user_role = get_user_project_role(user=user, project=project) + + is_self_join_to_public = ( + len(members) == 1 + and project.is_public + and (members[0].username == user.name or members[0].username == user.email) + and requesting_user_role is None + ) + + if not is_self_join_to_public: + if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [ + ProjectRole.ADMIN, + ProjectRole.MANAGER, + ]: + raise ForbiddenError("Access denied: 
insufficient permissions to add members") + + if user.global_role != GlobalRole.ADMIN and requesting_user_role == ProjectRole.MANAGER: + for member in members: + if member.project_role == ProjectRole.ADMIN: + raise ForbiddenError( + "Access denied: only global admins can add project admins" + ) + else: + if members[0].project_role != ProjectRole.USER: + raise ForbiddenError("Access denied: can only join public projects as user role") + + res = await session.execute( + select(UserModel).where( + (UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)), + UserModel.deleted == False, + ) + ) + users_found = res.scalars().all() + + username_to_user = {user.name: user for user in users_found} + email_to_user = {user.email: user for user in users_found if user.email} + + member_by_user_id = {m.user_id: m for m in project.members} + + for member_setting in members: + user_to_add = username_to_user.get(member_setting.username) or email_to_user.get( + member_setting.username + ) + if user_to_add is None: + raise ServerClientError(f"User not found: {member_setting.username}") + + if user_to_add.id in member_by_user_id: + existing_member = member_by_user_id[user_to_add.id] + if existing_member.project_role != member_setting.project_role: + existing_member.project_role = member_setting.project_role + else: + await add_project_member( + session=session, + project=project, + user=user_to_add, + project_role=member_setting.project_role, + member_num=None, + commit=False, + ) + + await session.commit() + + +async def add_project_member( + session: AsyncSession, + project: ProjectModel, + user: UserModel, + project_role: ProjectRole, + member_num: Optional[int] = None, + commit: bool = True, +) -> MemberModel: + member = MemberModel( + user_id=user.id, + project_id=project.id, + project_role=project_role, + member_num=member_num, + ) + session.add(member) + if commit: + await session.commit() + return member + + async def clear_project_members( session: AsyncSession, 
project: ProjectModel, @@ -176,13 +467,65 @@ async def clear_project_members( async def list_user_project_models( session: AsyncSession, user: UserModel, + only_names: bool = False, + include_members: bool = False, ) -> List[ProjectModel]: + load_only_attrs = [] + if only_names: + load_only_attrs += [ProjectModel.id, ProjectModel.name] + if user.global_role == GlobalRole.ADMIN: + return await list_project_models( + session=session, load_only_attrs=load_only_attrs, include_members=include_members + ) + return await list_member_project_models( + session=session, + user=user, + load_only_attrs=load_only_attrs, + include_members=include_members, + ) + + +async def list_member_project_models( + session: AsyncSession, + user: UserModel, + include_members: bool = False, + load_only_attrs: Optional[List[QueryableAttribute]] = None, +) -> List[ProjectModel]: + """ + List project models for a user where they are a member. + """ + options = [] + if include_members: + options.append(joinedload(ProjectModel.members)) + if load_only_attrs: + options.append(load_only(*load_only_attrs)) res = await session.execute( - select(ProjectModel).where( + select(ProjectModel) + .where( MemberModel.project_id == ProjectModel.id, MemberModel.user_id == user.id, ProjectModel.deleted == False, ) + .options(*options) + ) + return list(res.scalars().unique().all()) + + +async def list_public_non_member_project_models( + session: AsyncSession, + user: UserModel, +) -> List[ProjectModel]: + """ + List public project models where user is NOT a member. 
+ """ + res = await session.execute( + select(ProjectModel).where( + ProjectModel.deleted == False, + ProjectModel.is_public == True, + ProjectModel.id.notin_( + select(MemberModel.project_id).where(MemberModel.user_id == user.id) + ), + ) ) return list(res.scalars().all()) @@ -202,11 +545,21 @@ async def list_user_owned_project_models( async def list_project_models( session: AsyncSession, + load_only_attrs: Optional[List[QueryableAttribute]] = None, + include_members: bool = False, ) -> List[ProjectModel]: + options = [] + if include_members: + options.append(joinedload(ProjectModel.members)) + if load_only_attrs: + options.append(load_only(*load_only_attrs)) res = await session.execute( - select(ProjectModel).where(ProjectModel.deleted == False), + select(ProjectModel).where(ProjectModel.deleted == False).options(*options) ) - return list(res.scalars().all()) + return list(res.scalars().unique().all()) + + +# TODO: Do not load ProjectModel.backends and ProjectModel.members by default when getting project async def get_project_model_by_name( @@ -217,8 +570,13 @@ async def get_project_model_by_name( filters.append(safunc.lower(ProjectModel.name) == safunc.lower(project_name)) else: filters.append(ProjectModel.name == project_name) - res = await session.execute(select(ProjectModel).where(*filters)) - return res.scalar() + res = await session.execute( + select(ProjectModel) + .where(*filters) + .options(joinedload(ProjectModel.backends)) + .options(joinedload(ProjectModel.members)) + ) + return res.unique().scalar() async def get_project_model_by_name_or_error( @@ -226,12 +584,15 @@ async def get_project_model_by_name_or_error( project_name: str, ) -> ProjectModel: res = await session.execute( - select(ProjectModel).where( + select(ProjectModel) + .where( ProjectModel.name == project_name, ProjectModel.deleted == False, ) + .options(joinedload(ProjectModel.backends)) + .options(joinedload(ProjectModel.members)) ) - return res.scalar_one() + return 
res.unique().scalar_one() async def get_project_model_by_id_or_error( @@ -239,17 +600,26 @@ async def get_project_model_by_id_or_error( project_id: uuid.UUID, ) -> ProjectModel: res = await session.execute( - select(ProjectModel).where( + select(ProjectModel) + .where( ProjectModel.id == project_id, ProjectModel.deleted == False, ) + .options(joinedload(ProjectModel.backends)) + .options(joinedload(ProjectModel.members)) ) - return res.scalar_one() + return res.unique().scalar_one() async def create_project_model( - session: AsyncSession, owner: UserModel, project_name: str + session: AsyncSession, + owner: UserModel, + project_name: str, + is_public: bool = False, + templates_repo: Optional[str] = None, ) -> ProjectModel: + validate_project_name(project_name) + templates_repo = await _normalize_templates_repo_url(templates_repo) private_bytes, public_bytes = await run_async( generate_rsa_key_pair_bytes, f"{project_name}@dstack" ) @@ -259,55 +629,179 @@ async def create_project_model( name=project_name, ssh_private_key=private_bytes.decode(), ssh_public_key=public_bytes.decode(), + is_public=is_public, + templates_repo=templates_repo, ) - session.add(project) - await session.commit() + + if is_db_sqlite(): + # Start new transaction to see committed changes after lock + await session.commit() + elif is_db_postgres(): + await session.execute( + select(safunc.pg_advisory_xact_lock(string_to_lock_id(GLOBAL_EXPORTS_LOCK_NAMESPACE))) + ) + global_exports_lock, _ = get_locker(get_db().dialect_name).get_lockset( + GLOBAL_EXPORTS_LOCK_NAMESPACE + ) + + async with global_exports_lock: + res = await session.execute(select(ExportModel.id).where(ExportModel.is_global == True)) + for export_id in res.scalars().all(): + session.add(ImportModel(project=project, export_id=export_id)) + session.add(project) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(owner), + targets=[events.Target.from_model(project)], + ) + await session.commit() return 
project -def project_model_to_project(project_model: ProjectModel) -> Project: +def get_user_project_role(user: UserModel, project: ProjectModel) -> Optional[ProjectRole]: + for member in project.members: + if member.user_id == user.id: + return member.project_role + return None + + +def get_member(user: UserModel, project: ProjectModel) -> Optional[MemberModel]: + for member in project.members: + if member.user_id == user.id: + return member + return None + + +def project_model_to_project( + project_model: ProjectModel, + include_backends: bool = True, + include_members: bool = True, +) -> Project: members = [] - for m in project_model.members: - members.append( - Member( - user=users.user_model_to_user(m.user), - project_role=m.project_role, + if include_members: + for m in project_model.members: + members.append( + Member( + user=users.user_model_to_user(m.user), + project_role=m.project_role, + permissions=get_member_permissions(m), + ) ) - ) backends = [] - for b in project_model.backends: - configurator = get_configurator(b.type) - if configurator is None: - logger.warning("Configurator for backend %s not found", b.type) - continue - config_info = configurator.get_config_info(model=b, include_creds=False) - if is_core_model_instance(config_info, DstackConfigInfo): - for backend_type in config_info.base_backends: + if include_backends: + for b in project_model.backends: + configurator = get_configurator(b.type) + if configurator is None: + logger.warning("Configurator for backend %s not found", b.type) + continue + if not b.auth.decrypted: + logger.warning( + "Failed to decrypt creds for %s backend. 
Backend will be ignored.", + b.type.value, + ) + continue + backend_config = get_backend_config_without_creds_from_backend_model(configurator, b) + if isinstance(backend_config, DstackBackendConfig): + for backend_type in backend_config.base_backends: + backends.append( + BackendInfo( + name=backend_type, + config=DstackBaseBackendConfig(type=backend_type), + ) + ) + else: backends.append( BackendInfo( - name=backend_type, config=DstackBaseBackendConfigInfo(type=backend_type) + name=b.type, + config=backend_config, ) ) - else: - backends.append( - BackendInfo( - name=b.type, - config=config_info, - ) - ) return Project( project_id=project_model.id, project_name=project_model.name, owner=users.user_model_to_user(project_model.owner), + created_at=project_model.created_at, backends=backends, members=members, + is_public=project_model.is_public, + **( + {"templates_repo": project_model.templates_repo} + if project_model.templates_repo is not None + else {} + ), + ) + + +def get_member_permissions(member_model: MemberModel) -> MemberPermissions: + default_permissions = get_default_permissions() + user_model = member_model.user + can_manage_ssh_fleets = True + if not default_permissions.allow_non_admins_manage_ssh_fleets: + if ( + user_model.global_role != GlobalRole.ADMIN + and member_model.project_role != ProjectRole.ADMIN + ): + can_manage_ssh_fleets = False + can_manage_secrets = ( + user_model.global_role == GlobalRole.ADMIN + or member_model.project_role == ProjectRole.ADMIN + or ( + member_model.project_role == ProjectRole.MANAGER + and default_permissions.allow_managers_manage_secrets + ) + ) + return MemberPermissions( + can_manage_ssh_fleets=can_manage_ssh_fleets, + can_manage_secrets=can_manage_secrets, ) +def validate_project_name(project_name: str): + if not is_valid_project_name(project_name): + raise ServerClientError("Project name should match regex '^[a-zA-Z0-9-_]{1,50}$'") + + +def is_valid_project_name(project_name: str) -> bool: + return 
re.match("^[a-zA-Z0-9-_]{1,50}$", project_name) is not None + + +async def _normalize_templates_repo_url(templates_repo: Optional[str]) -> Optional[str]: + if templates_repo is None: + return None + templates_repo = templates_repo.strip() + if templates_repo == "": + return None + try: + await run_async(templates_service.validate_templates_repo_access, templates_repo) + except ValueError as e: + raise ServerClientError(str(e)) + return templates_repo + + +async def _resolve_new_templates_repo( + project: ProjectModel, + templates_repo: Optional[str], + reset_templates_repo: bool, +) -> Tuple[bool, Optional[str]]: + if reset_templates_repo: + return project.templates_repo is not None, None + if templates_repo is None: + return False, None + normalized_templates_repo = await _normalize_templates_repo_url(templates_repo) + if normalized_templates_repo is None: + return False, None + if normalized_templates_repo == project.templates_repo: + return False, None + return True, normalized_templates_repo + + _CREATE_PROJECT_HOOKS = [] -def register_create_project_hook(func: Callable[[AsyncSession, ProjectModel], Awaitable[None]]): +def register_create_project_hook( + func: Callable[[AsyncSession, ProjectModel, Optional[ProjectHookConfig]], Awaitable[None]], +): _CREATE_PROJECT_HOOKS.append(func) @@ -323,8 +817,122 @@ def _is_project_admin( user: UserModel, project: ProjectModel, ) -> bool: + if user.id == project.owner_id: + return True + for m in project.members: if user.id == m.user_id: if m.project_role == ProjectRole.ADMIN: return True return False + + +async def _check_project_has_active_resources(session: AsyncSession, project_id: uuid.UUID): + res = await session.execute( + select(RunModel.run_name).where( + RunModel.project_id == project_id, + RunModel.status.not_in(RunStatus.finished_statuses()), + ) + ) + run_names = list(res.scalars().all()) + if len(run_names) > 0: + raise ServerClientError(f"Failed to delete project with active runs: {run_names}") + res = await 
session.execute( + select(FleetModel.name).where( + FleetModel.project_id == project_id, + FleetModel.deleted.is_(False), + ) + ) + fleet_names = list(res.scalars().all()) + if len(fleet_names) > 0: + raise ServerClientError(f"Failed to delete project with active fleets: {fleet_names}") + res = await session.execute( + select(VolumeModel.name).where( + VolumeModel.project_id == project_id, + VolumeModel.deleted.is_(False), + ) + ) + volume_names = list(res.scalars().all()) + if len(volume_names) > 0: + raise ServerClientError(f"Failed to delete project with active volumes: {volume_names}") + + +async def remove_project_members( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + usernames: List[str], +): + """Remove multiple members from a project.""" + project = await get_project_model_by_name_or_error( + session=session, + project_name=project.name, + ) + requesting_user_role = get_user_project_role(user=user, project=project) + + is_self_leave = ( + len(usernames) == 1 + and (usernames[0] == user.name or usernames[0] == user.email) + and requesting_user_role is not None + ) + + if not is_self_leave: + if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [ + ProjectRole.ADMIN, + ProjectRole.MANAGER, + ]: + raise ForbiddenError("Access denied: insufficient permissions to remove members") + + res = await session.execute( + select(UserModel).where( + (UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)), + UserModel.deleted == False, + ) + ) + users_found = res.scalars().all() + + username_to_user = {user.name: user for user in users_found} + email_to_user = {user.email: user for user in users_found if user.email} + + member_by_user_id = {m.user_id: m for m in project.members} + + members_to_remove = [] + admin_removals = 0 + + for username in usernames: + user_to_remove = username_to_user.get(username) or email_to_user.get(username) + if user_to_remove is None: + raise ServerClientError(f"User not found: 
{username}") + + if user_to_remove.id not in member_by_user_id: + raise ServerClientError(f"User is not a member of this project: {username}") + + member_to_remove = member_by_user_id[user_to_remove.id] + + if member_to_remove.project_role == ProjectRole.ADMIN: + if is_self_leave: + total_admins = sum( + 1 for member in project.members if member.project_role == ProjectRole.ADMIN + ) + if total_admins <= 1: + raise ServerClientError("Cannot leave project: you are the last admin") + else: + if user.global_role != GlobalRole.ADMIN: + raise ForbiddenError( + f"Access denied: only global admins can remove project admins (user: {username})" + ) + admin_removals += 1 + + members_to_remove.append(member_to_remove) + + if not is_self_leave: + total_admins = sum( + 1 for member in project.members if member.project_role == ProjectRole.ADMIN + ) + if admin_removals >= total_admins: + raise ServerClientError("Cannot remove all project admins") + + for member in members_to_remove: + await session.delete(member) + + await session.commit() diff --git a/src/dstack/_internal/server/services/prometheus/__init__.py b/src/dstack/_internal/server/services/prometheus/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/prometheus/client_metrics.py b/src/dstack/_internal/server/services/prometheus/client_metrics.py new file mode 100644 index 0000000000..d2c5da9db6 --- /dev/null +++ b/src/dstack/_internal/server/services/prometheus/client_metrics.py @@ -0,0 +1,55 @@ +from prometheus_client import Counter, Histogram + + +class RunMetrics: + """Wrapper class for run-related Prometheus metrics.""" + + def __init__(self): + # submit_to_provision_duration reflects real provisioning time + # but does not reflect how quickly provisioning processing works + # since it includes scheduling time, retrying, etc. 
+ self._submit_to_provision_duration = Histogram( + "dstack_submit_to_provision_duration_seconds", + "Time from when a run has been submitted and first job provisioning", + # Buckets optimized for percentile calculation + buckets=[ + 15, + 30, + 45, + 60, + 90, + 120, + 180, + 240, + 300, + 360, + 420, + 480, + 540, + 600, + 900, + 1200, + 1800, + float("inf"), + ], + labelnames=["project_name", "run_type"], + ) + + self._pending_runs_total = Counter( + "dstack_pending_runs_total", + "Number of pending runs", + labelnames=["project_name", "run_type"], + ) + + def log_submit_to_provision_duration( + self, duration_seconds: float, project_name: str, run_type: str + ): + self._submit_to_provision_duration.labels( + project_name=project_name, run_type=run_type + ).observe(duration_seconds) + + def increment_pending_runs(self, project_name: str, run_type: str): + self._pending_runs_total.labels(project_name=project_name, run_type=run_type).inc() + + +run_metrics = RunMetrics() diff --git a/src/dstack/_internal/server/services/prometheus/custom_metrics.py b/src/dstack/_internal/server/services/prometheus/custom_metrics.py new file mode 100644 index 0000000000..6880f5c3ed --- /dev/null +++ b/src/dstack/_internal/server/services/prometheus/custom_metrics.py @@ -0,0 +1,328 @@ +import itertools +import json +from collections import defaultdict +from collections.abc import Generator, Iterable +from typing import ClassVar +from uuid import UUID + +from prometheus_client import Metric +from prometheus_client.parser import text_string_to_metric_families +from prometheus_client.samples import Sample +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import aliased, joinedload + +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import JobStatus, RunStatus +from dstack._internal.server.models import ( + InstanceModel, + JobMetricsPoint, + JobModel, + 
JobPrometheusMetrics, + ProjectModel, + RunModel, + UserModel, +) +from dstack._internal.server.services.instances import get_instance_offer +from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data +from dstack._internal.server.services.runs import get_run_spec +from dstack._internal.utils.common import get_current_datetime + + +async def get_metrics(session: AsyncSession) -> str: + metrics_iter = itertools.chain( + await get_instance_metrics(session), + await get_run_metrics(session), + await get_job_metrics(session), + ) + return "\n".join(_render_metrics(metrics_iter)) + "\n" + + +async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]: + res = await session.execute( + select(InstanceModel) + .join(ProjectModel) + .where( + InstanceModel.deleted == False, + InstanceModel.status.in_( + [ + InstanceStatus.PROVISIONING, + InstanceStatus.IDLE, + InstanceStatus.BUSY, + InstanceStatus.TERMINATING, + ] + ), + ) + .order_by(ProjectModel.name, InstanceModel.name) + .options( + joinedload(InstanceModel.project), + joinedload(InstanceModel.fleet), + ) + ) + instances = res.unique().scalars().all() + metrics = _InstanceMetrics() + now = get_current_datetime() + for instance in instances: + fleet = instance.fleet + offer = get_instance_offer(instance) + gpu = "" + gpu_count = 0 + if offer is not None and len(offer.instance.resources.gpus) > 0: + gpu = offer.instance.resources.gpus[0].name + gpu_count = len(offer.instance.resources.gpus) + labels: dict[str, str] = { + "dstack_project_name": instance.project.name, + "dstack_fleet_name": fleet.name if fleet is not None else "", + "dstack_fleet_id": str(fleet.id) if fleet is not None else "", + "dstack_instance_name": str(instance.name), + "dstack_instance_id": str(instance.id), + "dstack_instance_type": offer.instance.name if offer is not None else "", + "dstack_backend": instance.backend.value if instance.backend is not None else "", + "dstack_gpu": gpu, + } + 
duration = (now - instance.created_at).total_seconds() + metrics.add_sample(_INSTANCE_DURATION, labels, duration) + metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0) + metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count) + return metrics.values() + + +async def get_run_metrics(session: AsyncSession) -> Iterable[Metric]: + res = await session.execute( + select(ProjectModel.name, UserModel.name, RunModel.status, func.count(RunModel.id)) + .join_from(RunModel, ProjectModel) + .join_from(RunModel, UserModel, RunModel.user_id == UserModel.id) + .group_by(ProjectModel.name, UserModel.name, RunModel.status) + .order_by(ProjectModel.name, UserModel.name, RunModel.status) + ) + projects: dict[str, dict[str, dict[RunStatus, int]]] = defaultdict( + lambda: defaultdict(lambda: defaultdict(int)) + ) + for project_name, user_name, status, count in res.all(): + projects[project_name][user_name][status] = count + metrics = _RunMetrics() + for project_name, users in projects.items(): + for user_name, statuses in users.items(): + labels: dict[str, str] = { + "dstack_project_name": project_name, + "dstack_user_name": user_name, + } + metrics.add_sample(_RUN_COUNT_TOTAL, labels, sum(statuses.values())) + metrics.add_sample(_RUN_COUNT_TERMINATED, labels, statuses[RunStatus.TERMINATED]) + metrics.add_sample(_RUN_COUNT_FAILED, labels, statuses[RunStatus.FAILED]) + metrics.add_sample(_RUN_COUNT_DONE, labels, statuses[RunStatus.DONE]) + return metrics.values() + + +async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]: + res = await session.execute( + select(JobModel) + .join(ProjectModel) + .where( + JobModel.status.in_( + [ + JobStatus.PROVISIONING, + JobStatus.PULLING, + JobStatus.RUNNING, + JobStatus.TERMINATING, + ] + ) + ) + .order_by(ProjectModel.name, JobModel.job_name) + .options( + joinedload(JobModel.project), + joinedload(JobModel.run).joinedload(RunModel.user), + ) + ) + jobs = res.scalars().all() + job_ids = {job.id for job in jobs} + 
job_metrics_points = await _get_job_metrics_points(session, job_ids) + job_prometheus_metrics = await _get_job_prometheus_metrics(session, job_ids) + + metrics = _JobMetrics() + now = get_current_datetime() + for job in jobs: + jpd = get_job_provisioning_data(job) + if jpd is None: + continue + jrd = get_job_runtime_data(job) + resources = jpd.instance_type.resources + price = jpd.price + if jrd is not None and jrd.offer is not None: + resources = jrd.offer.instance.resources + price = jrd.offer.price + gpus = resources.gpus + cpus = resources.cpus + run_spec = get_run_spec(job.run) + labels = { + "dstack_project_name": job.project.name, + "dstack_user_name": job.run.user.name, + "dstack_run_name": job.run_name, + "dstack_run_id": str(job.run_id), + "dstack_job_name": job.job_name, + "dstack_job_id": str(job.id), + "dstack_job_num": str(job.job_num), + "dstack_replica_num": str(job.replica_num), + "dstack_run_type": run_spec.configuration.type, + "dstack_backend": jpd.get_base_backend().value, + "dstack_gpu": gpus[0].name if gpus else "", + } + duration = (now - job.submitted_at).total_seconds() + metrics.add_sample(_JOB_DURATION, labels, duration) + metrics.add_sample(_JOB_PRICE, labels, price) + metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus)) + metrics.add_sample(_JOB_CPU_COUNT, labels, cpus) + metrics.add_sample(_JOB_MEMORY_TOTAL, labels, resources.memory_mib * 1024 * 1024) + jmp = job_metrics_points.get(job.id) + if jmp is not None: + metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000) + metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes) + metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes) + if gpus: + gpu_memory_total = gpus[0].memory_mib * 1024 * 1024 + for gpu_num, (gpu_util, gpu_memory_usage) in enumerate( + zip( + json.loads(jmp.gpus_util_percent), + json.loads(jmp.gpus_memory_usage_bytes), + ) + ): + gpu_labels = labels.copy() + gpu_labels["dstack_gpu_num"] = str(gpu_num) + 
metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100) + metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total) + metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage) + jpm = job_prometheus_metrics.get(job.id) + if jpm is not None: + for metric in text_string_to_metric_families(jpm.text): + metrics.add_metric(metric, labels) + return metrics.values() + + +_COUNTER = "counter" +_GAUGE = "gauge" + +_INSTANCE_DURATION = "dstack_instance_duration_seconds_total" +_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour" +_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count" +_RUN_COUNT_TOTAL = "dstack_run_count_total" +_RUN_COUNT_TERMINATED = "dstack_run_count_terminated_total" +_RUN_COUNT_FAILED = "dstack_run_count_failed_total" +_RUN_COUNT_DONE = "dstack_run_count_done_total" +_JOB_DURATION = "dstack_job_duration_seconds_total" +_JOB_PRICE = "dstack_job_price_dollars_per_hour" +_JOB_GPU_COUNT = "dstack_job_gpu_count" +_JOB_CPU_COUNT = "dstack_job_cpu_count" +_JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total" +_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes" +_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes" +_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes" +_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio" +_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes" +_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes" + + +class _Metrics(dict[str, Metric]): + metrics: ClassVar[list[tuple[str, str, str]]] + + def __init__(self): + super().__init__() + for name, typ, documentation in self.metrics: + self[name] = Metric(name=name, documentation=documentation, typ=typ) + + def add_sample(self, name: str, labels: dict[str, str], value: float) -> None: + # NOTE: Keeps reference to labels. + self[name].add_sample(name=name, labels=labels, value=value) + + def add_metric(self, metric: Metric, labels: dict[str, str]) -> None: + # NOTE: Modifies and keeps reference to metric. 
+ name = metric.name + samples = metric.samples + stored_metric = self.get(name) + if stored_metric is None: + stored_metric = metric + stored_metric.samples = [] + self[name] = stored_metric + for sample in samples: + sample.labels.update(labels) + # text_string_to_metric_families "fixes" counter names appending _total, + # we rebuild Sample to revert this + stored_metric.samples.append(Sample(name, *sample[1:])) + + +class _InstanceMetrics(_Metrics): + metrics = [ + (_INSTANCE_DURATION, _COUNTER, "Total seconds the instance is running"), + (_INSTANCE_PRICE, _GAUGE, "Instance price, USD/hour"), + (_INSTANCE_GPU_COUNT, _GAUGE, "Instance GPU count"), + ] + + +class _RunMetrics(_Metrics): + metrics = [ + (_RUN_COUNT_TOTAL, _COUNTER, "Total runs count"), + (_RUN_COUNT_TERMINATED, _COUNTER, "Terminated runs count"), + (_RUN_COUNT_FAILED, _COUNTER, "Failed runs count"), + (_RUN_COUNT_DONE, _COUNTER, "Done runs count"), + ] + + +class _JobMetrics(_Metrics): + metrics = [ + (_JOB_DURATION, _COUNTER, "Total seconds the job is running"), + (_JOB_PRICE, _GAUGE, "Job instance price, USD/hour"), + (_JOB_GPU_COUNT, _GAUGE, "Job GPU count"), + (_JOB_CPU_COUNT, _GAUGE, "Job CPU count"), + (_JOB_CPU_TIME, _COUNTER, "Total CPU time consumed by the job, seconds"), + (_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"), + (_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"), + (_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"), + (_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"), + (_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"), + (_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"), + ] + + +async def _get_job_metrics_points( + session: AsyncSession, job_ids: Iterable[UUID] +) -> dict[UUID, JobMetricsPoint]: + subquery = select( + JobMetricsPoint, + func.row_number() + .over( + partition_by=JobMetricsPoint.job_id, + 
order_by=JobMetricsPoint.timestamp_micro.desc(), + ) + .label("row_number"), + ).subquery() + res = await session.execute( + select(aliased(JobMetricsPoint, subquery)).where( + subquery.c.row_number == 1, + subquery.c.job_id.in_(job_ids), + ) + ) + return {p.job_id: p for p in res.scalars().all()} + + +async def _get_job_prometheus_metrics( + session: AsyncSession, job_ids: Iterable[UUID] +) -> dict[UUID, JobPrometheusMetrics]: + res = await session.execute( + select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id.in_(job_ids)) + ) + return {p.job_id: p for p in res.scalars().all()} + + +def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]: + for metric in metrics: + if not metric.samples: + continue + yield f"# HELP {metric.name} {metric.documentation}" + yield f"# TYPE {metric.name} {metric.type}" + for sample in metric.samples: + parts: list[str] = [f"{sample.name}{{"] + parts.extend(",".join(f'{name}="{value}"' for name, value in sample.labels.items())) + parts.append(f"}} {float(sample.value)}") + # text_string_to_metric_families converts milliseconds to float seconds + if isinstance(sample.timestamp, float): + parts.append(f" {int(sample.timestamp * 1000)}") + yield "".join(parts) diff --git a/src/dstack/_internal/server/services/proxy/__init__.py b/src/dstack/_internal/server/services/proxy/__init__.py new file mode 100644 index 0000000000..4b503cbfa0 --- /dev/null +++ b/src/dstack/_internal/server/services/proxy/__init__.py @@ -0,0 +1,3 @@ +""" +Dependencies for dstack-proxy that allow it to run as part of dstack-server. 
+""" diff --git a/src/dstack/_internal/server/services/proxy/auth.py b/src/dstack/_internal/server/services/proxy/auth.py new file mode 100644 index 0000000000..fecc685095 --- /dev/null +++ b/src/dstack/_internal/server/services/proxy/auth.py @@ -0,0 +1,12 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.server.security.permissions import is_project_member + + +class ServerProxyAuthProvider(BaseProxyAuthProvider): + def __init__(self, session: AsyncSession) -> None: + self.session = session + + async def is_project_member(self, project_name: str, token: str) -> bool: + return await is_project_member(self.session, project_name, token) diff --git a/src/dstack/_internal/server/services/proxy/deps.py b/src/dstack/_internal/server/services/proxy/deps.py new file mode 100644 index 0000000000..558b143ab8 --- /dev/null +++ b/src/dstack/_internal/server/services/proxy/deps.py @@ -0,0 +1,18 @@ +from typing import AsyncGenerator + +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.deps import ProxyDependencyInjector +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.services.proxy.auth import ServerProxyAuthProvider +from dstack._internal.server.services.proxy.repo import ServerProxyRepo + + +class ServerProxyDependencyInjector(ProxyDependencyInjector): + async def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]: + async with get_session_ctx() as session: + yield ServerProxyRepo(session) + + async def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]: + async with get_session_ctx() as session: + yield ServerProxyAuthProvider(session) diff --git a/src/dstack/_internal/server/services/proxy/repo.py b/src/dstack/_internal/server/services/proxy/repo.py new file mode 100644 index 0000000000..e2a8d3c117 --- /dev/null +++ 
b/src/dstack/_internal/server/services/proxy/repo.py @@ -0,0 +1,205 @@ +from typing import List, Optional + +import pydantic +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import contains_eager, joinedload + +import dstack._internal.server.services.jobs as jobs_services +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.runs import ( + JobProvisioningData, + JobStatus, + RunStatus, + ServiceSpec, + get_service_port, +) +from dstack._internal.core.models.services import AnyModel +from dstack._internal.proxy.lib.models import ( + AnyModelFormat, + ChatModel, + OpenAIChatModelFormat, + Project, + Replica, + Service, + TGIChatModelFormat, +) +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel, RunModel +from dstack._internal.server.services.instances import get_instance_remote_connection_info +from dstack._internal.server.services.jobs import get_job_spec +from dstack._internal.server.services.runs import get_run_spec +from dstack._internal.server.settings import DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE +from dstack._internal.utils.common import get_or_error + + +class ServerProxyRepo(BaseProxyRepo): + """ + A repo implementation used by dstack-proxy running within dstack-server. + Retrieves data from dstack-server's database. 
+ """ + + def __init__(self, session: AsyncSession) -> None: + self.session = session + + async def get_service(self, project_name: str, run_name: str) -> Optional[Service]: + res = await self.session.execute( + select(JobModel) + .join(JobModel.project) + .join(JobModel.run) + .where( + ProjectModel.name == project_name, + RunModel.gateway_id.is_(None), + JobModel.run_name == run_name, + JobModel.status == JobStatus.RUNNING, + JobModel.registered == True, + JobModel.job_num == 0, + ) + .options( + contains_eager(JobModel.run), + contains_eager(JobModel.project), + joinedload(JobModel.instance).joinedload(InstanceModel.project), + ) + ) + jobs = res.unique().scalars().all() + if not len(jobs): + return None + run = jobs[0].run + run_spec = get_run_spec(run) + if not isinstance(run_spec.configuration, ServiceConfiguration): + return None + router_group = next( + (g for g in run_spec.configuration.replica_groups if g.router is not None), + None, + ) + has_router_replica = router_group is not None + router = run_spec.configuration.router + replicas = [] + for job in jobs: + jpd: JobProvisioningData = JobProvisioningData.__response__.parse_raw( + job.job_provisioning_data + ) + assert jpd.hostname is not None + assert jpd.ssh_port is not None + instance = get_or_error(job.instance) + if not jpd.dockerized: + ssh_destination = f"{jpd.username}@{jpd.hostname}" + ssh_port = jpd.ssh_port + ssh_proxy = jpd.ssh_proxy + ssh_proxy_private_key = None + else: + ssh_destination = "root@localhost" + ssh_port = DSTACK_RUNNER_SSH_PORT + job_submission = jobs_services.job_model_to_job_submission(job) + jrd = job_submission.job_runtime_data + if jrd is not None and jrd.ports is not None: + ssh_port = jrd.ports.get(ssh_port, ssh_port) + ssh_proxy = SSHConnectionParams( + hostname=jpd.hostname, + username=jpd.username, + port=jpd.ssh_port, + ) + ssh_proxy_private_key = None + if job.project_id != instance.project_id: + ssh_proxy_private_key = instance.project.ssh_private_key + 
ssh_head_proxy: Optional[SSHConnectionParams] = None + ssh_head_proxy_private_key: Optional[str] = None + rci = get_instance_remote_connection_info(instance) + if rci is not None and rci.ssh_proxy is not None: + ssh_head_proxy = rci.ssh_proxy + ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private + job_spec = get_job_spec(job) + if router_group is not None and job_spec.replica_group != router_group.name: + # Strict router-only: when a router is configured, the proxy should only be aware + # of router replicas. + continue + replica = Replica( + id=job.id.hex, + app_port=get_service_port(job_spec, run_spec.configuration), + ssh_destination=ssh_destination, + ssh_port=ssh_port, + ssh_proxy=ssh_proxy, + ssh_proxy_private_key=ssh_proxy_private_key, + ssh_head_proxy=ssh_head_proxy, + ssh_head_proxy_private_key=ssh_head_proxy_private_key, + internal_ip=jpd.internal_ip, + ) + replicas.append(replica) + return Service( + project_name=project_name, + run_name=run.run_name, + domain=None, + https=None, + auth=run_spec.configuration.auth, + client_max_body_size=DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE, + strip_prefix=run_spec.configuration.strip_prefix, + replicas=tuple(replicas), + has_router_replica=has_router_replica, + router=router, + ) + + async def list_models(self, project_name: str) -> List[ChatModel]: + res = await self.session.execute( + select(RunModel) + .join(RunModel.project) + .where( + ProjectModel.name == project_name, + RunModel.gateway_id.is_(None), + RunModel.service_spec.is_not(None), + RunModel.status == RunStatus.RUNNING, + ) + ) + models = [] + for run in res.scalars().all(): + service_spec: ServiceSpec = ServiceSpec.__response__.parse_raw(run.service_spec) + model_spec = service_spec.model + model_options_obj = service_spec.options.get("openai", {}).get("model") + if model_spec is None or model_options_obj is None: + continue + model_options = pydantic.parse_obj_as(AnyModel, model_options_obj) # type: ignore[arg-type] + model = 
ChatModel( + project_name=project_name, + name=model_spec.name, + created_at=run.submitted_at, + run_name=run.run_name, + format_spec=_model_options_to_format_spec(model_options), + ) + models.append(model) + return models + + async def get_model(self, project_name: str, name: str) -> Optional[ChatModel]: + models = await self.list_models(project_name) + models = [m for m in models if m.name == name] + if not models: + return None + # If there are many models with the same name, choose the most recent + return max(models, key=lambda m: m.created_at) + + async def get_project(self, name: str) -> Optional[Project]: + res = await self.session.execute(select(ProjectModel).where(ProjectModel.name == name)) + project = res.scalar_one_or_none() + if project is None: + return None + return Project( + name=project.name, + ssh_private_key=project.ssh_private_key, + ) + + +def _model_options_to_format_spec(model: AnyModel) -> AnyModelFormat: + if model.type == "chat": + if model.format == "openai": + return OpenAIChatModelFormat(prefix=model.prefix) + elif model.format == "tgi": + assert model.chat_template is not None + assert model.eos_token is not None + return TGIChatModelFormat( + chat_template=model.chat_template, + eos_token=model.eos_token, + ) + else: + raise RuntimeError(f"Unexpected model format {model.format}") + else: + raise RuntimeError(f"Unexpected model type {model.type}") diff --git a/src/dstack/_internal/server/services/proxy/routers/__init__.py b/src/dstack/_internal/server/services/proxy/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/proxy/routers/service_proxy.py b/src/dstack/_internal/server/services/proxy/routers/service_proxy.py new file mode 100644 index 0000000000..95582f2bb7 --- /dev/null +++ b/src/dstack/_internal/server/services/proxy/routers/service_proxy.py @@ -0,0 +1,49 @@ +from fastapi import APIRouter, Depends, Request, status +from fastapi.datastructures import URL 
+from fastapi.responses import RedirectResponse, Response +from typing_extensions import Annotated + +from dstack._internal.proxy.lib.deps import ( + ProxyAuth, + ProxyAuthContext, + get_proxy_repo, + get_service_connection_pool, +) +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.services.service_connection import ServiceConnectionPool +from dstack._internal.server.services.proxy.services import service_proxy + +router = APIRouter() + + +@router.get("/{project_name}/{run_name}", summary="Redirect to service root") +@router.post("/{project_name}/{run_name}", summary="Redirect to service root") +@router.put("/{project_name}/{run_name}", summary="Redirect to service root") +@router.delete("/{project_name}/{run_name}", summary="Redirect to service root") +@router.patch("/{project_name}/{run_name}", summary="Redirect to service root") +@router.head("/{project_name}/{run_name}", summary="Redirect to service root") +async def redirect_to_service_root(request: Request, project_name: str, run_name: str) -> Response: + url = URL(str(request.url)) + url = url.replace(path=url.path + "/") + return RedirectResponse(url, status.HTTP_308_PERMANENT_REDIRECT) + + +@router.get("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.post("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.put("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.delete("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.patch("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.head("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +@router.options("/{project_name}/{run_name}/{path:path}", summary="Proxy service request") +async def service_reverse_proxy( + project_name: str, + run_name: str, + path: str, + request: Request, + auth: Annotated[ProxyAuthContext, 
Depends(ProxyAuth(auto_enforce=False))], + repo: Annotated[BaseProxyRepo, Depends(get_proxy_repo)], + service_conn_pool: Annotated[ServiceConnectionPool, Depends(get_service_connection_pool)], +) -> Response: + return await service_proxy.proxy( + project_name, run_name, path, request, auth, repo, service_conn_pool + ) diff --git a/src/dstack/_internal/server/services/proxy/services/__init__.py b/src/dstack/_internal/server/services/proxy/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/proxy/services/service_proxy.py b/src/dstack/_internal/server/services/proxy/services/service_proxy.py new file mode 100644 index 0000000000..ba74107b6b --- /dev/null +++ b/src/dstack/_internal/server/services/proxy/services/service_proxy.py @@ -0,0 +1,163 @@ +from typing import AsyncGenerator, AsyncIterator, Optional + +import fastapi +import httpx +from fastapi import status +from starlette.requests import ClientDisconnect + +from dstack._internal.core.models.routers import RouterType +from dstack._internal.proxy.lib.const import ROUTER_WHITELISTED_PATHS +from dstack._internal.proxy.lib.deps import ProxyAuthContext +from dstack._internal.proxy.lib.errors import ProxyError +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.services.service_connection import ( + ServiceConnectionPool, + get_service_replica_client, +) +from dstack._internal.utils.common import concat_url_path +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) +UVICORN_AUTOMATIC_HEADERS = ("Server", "Date") + + +async def proxy( + project_name: str, + run_name: str, + path: str, + request: fastapi.Request, + auth: ProxyAuthContext, + repo: BaseProxyRepo, + service_conn_pool: ServiceConnectionPool, +) -> fastapi.responses.Response: + if "Upgrade" in request.headers: + raise ProxyError("Upgrading connections is not supported", status.HTTP_400_BAD_REQUEST) + + service = 
await repo.get_service(project_name, run_name) + if service is None or not service.replicas: + raise ProxyError(f"Service {project_name}/{run_name} not found", status.HTTP_404_NOT_FOUND) + if service.auth: + await auth.enforce() + + if not service.strip_prefix: + path = concat_url_path(request.scope.get("root_path", "/"), request.url.path) + + if ( + service.router is not None and service.router.type == RouterType.SGLANG + ) or service.has_router_replica: + path_for_match = path if path.startswith("/") else f"/{path}" + if not _is_whitelisted_path(path_for_match, ROUTER_WHITELISTED_PATHS): + raise ProxyError("Path is not allowed for this service", status.HTTP_403_FORBIDDEN) + + client = await get_service_replica_client(service, repo, service_conn_pool) + + try: + upstream_request = await build_upstream_request(request, path, client) + except ClientDisconnect: + logger.debug( + "Downstream client disconnected before response was sent for %s %s", + request.method, + request.url, + ) + raise ProxyError("Client disconnected") + + try: + upstream_response = await client.send(upstream_request, stream=True) + except httpx.RequestError as e: + logger.debug( + "Error requesting %s %s: %r", upstream_request.method, upstream_request.url, e + ) + if isinstance(e, httpx.TimeoutException): + raise ProxyError("Timed out requesting upstream", status.HTTP_504_GATEWAY_TIMEOUT) + raise ProxyError("Error requesting upstream", status.HTTP_502_BAD_GATEWAY) + + return fastapi.responses.StreamingResponse( + stream_response(upstream_response), + status_code=upstream_response.status_code, + headers=clean_response_headers(upstream_response.headers), + ) + + +def _is_whitelisted_path(path: str, whitelisted_paths: tuple[str, ...]) -> bool: + for allowed in whitelisted_paths: + if allowed.endswith("/"): + if path.startswith(allowed): + return True + elif path == allowed: + return True + return False + + +def clean_response_headers(headers: httpx.Headers) -> httpx.Headers: + headers = 
httpx.Headers(headers) # copy + for header in UVICORN_AUTOMATIC_HEADERS: + if header in headers: + del headers[header] + return headers + + +async def stream_response(response: httpx.Response) -> AsyncGenerator[bytes, None]: + try: + async for chunk in response.aiter_raw(): + yield chunk + except httpx.RequestError as e: + logger.debug( + "Error streaming response %s %s: %r", response.request.method, response.request.url, e + ) + + try: + await response.aclose() + except httpx.RequestError as e: + logger.debug( + "Error closing response %s %s: %r", + response.request.method, + response.request.url, + e, + ) + + +async def build_upstream_request( + downstream_request: fastapi.Request, path: str, client: httpx.AsyncClient +) -> httpx.Request: + url = httpx.URL(path=path, query=downstream_request.url.query.encode("utf-8")) + request_stream = await FastAPIToHttpxRequestStreamAdaptor( + downstream_request.stream(), downstream_request.url + ).get_stream() + client.cookies.clear() # the client is shared by all users, don't leak cookies + + # TODO(#2237): add common proxy headers + return client.build_request( + downstream_request.method, url, headers=downstream_request.headers, content=request_stream + ) + + +class FastAPIToHttpxRequestStreamAdaptor: + """ + If a FastAPI request has no body, its stream consists of empty byte sequences (b""). + This adaptor detects such streams and replaces them with None, otherwise httpx will + considers them actual request bodies, which can lead to unexpected behavior. 
+ """ + + def __init__(self, stream: AsyncIterator[bytes], url: fastapi.datastructures.URL) -> None: + self._stream = stream + self._url = url + + async def get_stream(self) -> Optional[AsyncGenerator[bytes, None]]: + try: + first_chunk = await self._stream.__anext__() + except StopAsyncIteration: + return None + except ClientDisconnect: + logger.debug("Downstream client disconnected when requesting %s", self._url) + return None + if first_chunk == b"": + return None + return self._adaptor(first_chunk) + + async def _adaptor(self, first_chunk: bytes) -> AsyncGenerator[bytes, None]: + yield first_chunk + try: + async for chunk in self._stream: + yield chunk + except ClientDisconnect: + logger.debug("Downstream client disconnected when requesting %s", self._url) diff --git a/src/dstack/_internal/server/services/public_keys.py b/src/dstack/_internal/server/services/public_keys.py new file mode 100644 index 0000000000..52c642fa38 --- /dev/null +++ b/src/dstack/_internal/server/services/public_keys.py @@ -0,0 +1,258 @@ +import asyncio +import base64 +import hashlib +import subprocess +import uuid +from collections.abc import Iterable +from typing import Any, ClassVar, Optional + +import paramiko.pkey +import sqlalchemy.exc +from sqlalchemy import delete, select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import DstackError, ResourceExistsError, ServerClientError +from dstack._internal.core.models.keys import PublicKeyInfo +from dstack._internal.server.models import UserModel, UserPublicKeyModel +from dstack._internal.server.services import events +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.ssh import find_ssh_util + +logger = get_logger(__name__) + +supported_key_types = [ + "ssh-rsa", + "ecdsa-sha2-nistp256", + "ecdsa-sha2-nistp384", + "ecdsa-sha2-nistp521", + "ssh-ed25519", + "sk-ecdsa-sha2-nistp256@openssh.com", + "sk-ssh-ed25519@openssh.com", +] + + +class PublicKeyError(DstackError): 
+ # The message displayed to the user, should not contain internal/sensitive info + # Any debug info should be passed to the constructor as positional arguments + # and accessed via debug_message() + msg: ClassVar = "Public key error" + + def __init__(self, *args: Any, **kwargs: str) -> None: + super().__init__(*args) + self._kwargs = kwargs + + def __str__(self) -> str: + return self.msg.format(**self._kwargs) + + def debug_message(self) -> str: + return super().__str__() + + +class InvalidPublicKeyError(PublicKeyError): + msg = "Invalid public key, must be in OpenSSH public key format" + + +class UnsupportedPublicKeyError(PublicKeyError): + msg = "Unsupported key type: {type}" + + +async def list_user_public_keys(session: AsyncSession, user: UserModel) -> list[PublicKeyInfo]: + res = await session.execute( + select(UserPublicKeyModel) + .where(UserPublicKeyModel.user_id == user.id) + .order_by(UserPublicKeyModel.created_at.desc()) + ) + user_public_keys = res.scalars().all() + return [user_public_key_model_to_public_key_info(k) for k in user_public_keys] + + +async def add_user_public_key( + session: AsyncSession, user: UserModel, key: str, name: Optional[str] = None +) -> PublicKeyInfo: + try: + type_, blob, comment = parse_openssh_public_key(key) + await validate_openssh_public_key(key) + except PublicKeyError as e: + logger.debug("User public key validation error: %s: %s", e, e.debug_message()) + raise ServerClientError(str(e)) + except (TimeoutError, OSError) as e: + logger.warning("Failed to validate user public key: %s", e) + raise ServerClientError("Failed to validate the key. 
Try later") + + if not name: + name = comment or hashlib.md5(blob).hexdigest() + fingerprint = get_openssh_public_key_fingerprint(blob) + + user_public_key = UserPublicKeyModel( + user=user, + name=name, + type=type_, + fingerprint=fingerprint, + key=key, + ) + try: + async with session.begin_nested(): + session.add(user_public_key) + except sqlalchemy.exc.IntegrityError: + raise ResourceExistsError() + events.emit( + session, + f"Public key added. Fingerprint: {fingerprint}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + await session.commit() + + return user_public_key_model_to_public_key_info(user_public_key) + + +async def delete_user_public_keys( + session: AsyncSession, user: UserModel, ids: Iterable[uuid.UUID] +) -> None: + res = await session.execute( + delete(UserPublicKeyModel) + .where( + UserPublicKeyModel.user_id == user.id, + UserPublicKeyModel.id.in_(ids), + ) + .returning(UserPublicKeyModel.fingerprint) + ) + for fingerprint in res.scalars().all(): + events.emit( + session, + f"Public key deleted. Fingerprint: {fingerprint}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + await session.commit() + + +def parse_openssh_public_key(key: str) -> tuple[str, bytes, Optional[str]]: + """ + Parses OpenSSH public key in disk format. + + Args: + key: public key file contents. + + Returns: + key type, blob in wire format, and optional comment. + + Raises: + InvalidPublicKeyError: if the key disk format is not valid or the declared disk format + key type does not match the actual key type in the blob. + Note, the key blob is not checked, further validation is required. + UnsupportedPublicKeyError: if the key type is not supported. 
+ """ + # OpenSSH disk (ASCII-armored) format for public keys: + # [ ] + # See: section 4.1 "Public key format" + # https://cvsweb.openbsd.org/checkout/src/usr.bin/ssh/PROTOCOL + # e.g., + # * without comment: + # ssh-ed25519 AAAAC3NzaC1lZ[...truncated...] + # * with default comment added by ssh-keygen: + # ssh-rsa AAAAB3NzaC1yc2EAAAADAQ[...truncated...] username@hostname + # * with user-provided comment: + # sk-ssh-ed25519@openssh.com AAAAGnN[...truncated...] my FIDO2 key + + # OpenSSH wire format for public keys: + # string certificate or public key format identifier + # byte[n] key/certificate data + # See: https://datatracker.ietf.org/doc/html/rfc4253#section-6.6 + # Where string type is encoded as follows: + # > They are stored as a uint32 containing its length (number of bytes that follow) + # > and zero (= empty string) or more bytes that are the value of the string. + # > Terminating null characters are not used. + # See: https://datatracker.ietf.org/doc/html/rfc4251#section-5 + # e.g., + # 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39 |....ssh-ed25519| + + # PublicBlob.from_string() ensures that: + # * there are at least two fields in the disk format: and + # * key type in the disk format (PublicBlob.key_type) matches key type in the wire format + try: + pb = paramiko.pkey.PublicBlob.from_string(key) + except ValueError as e: + raise InvalidPublicKeyError(str(e)) from e + if pb.key_type not in supported_key_types: + raise UnsupportedPublicKeyError(type=pb.key_type) + return pb.key_type, pb.key_blob, pb.comment or None + + +def get_openssh_public_key_fingerprint(key_blob: bytes) -> str: + """ + Returns OpenSSH public key fingerprint in the format used by OpenSSH. + + See `paramiko.pkey.PKey.fingerprint` for the implementation. + + Args: + key_blob: public key blob in OpenSSH wire format. + + Returns: + A fingerprint as an ASCII string, the same format OpenSSH uses. 
+ """ + sha256_digest_armored = base64.b64encode(hashlib.sha256(key_blob).digest()).decode() + return f"SHA256:{sha256_digest_armored.rstrip('=')}" + + +async def validate_openssh_public_key(key: str) -> None: + """ + Validates OpenSSH public key in disk format using `ssh-keygen`. + + Args: + key: public key file contents. + + Raises: + InvalidPublicKeyError: the key is not valid - `ssh-keygen` returned non-zero exit status. + TimeoutError: validation timeout expired. + OSerror: failed to execute `ssh-keygen` subprocess. + """ + proc = None + try: + proc = await asyncio.create_subprocess_exec( + _get_ssh_keygen_executable(), + "-l", + "-f", + "-", + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + output, _ = await asyncio.wait_for(proc.communicate(input=key.encode()), timeout=3) + except asyncio.TimeoutError: + if proc is not None: + proc.kill() + raise TimeoutError("Validation timeout expired") + except OSError: + if proc is not None: + proc.kill() + raise + if proc.returncode != 0: + raise InvalidPublicKeyError(output) + + +def user_public_key_model_to_public_key_info( + user_public_key_model: UserPublicKeyModel, +) -> PublicKeyInfo: + return PublicKeyInfo( + id=user_public_key_model.id, + added_at=user_public_key_model.created_at, + name=user_public_key_model.name, + type=user_public_key_model.type, + fingerprint=user_public_key_model.fingerprint, + ) + + +_ssh_keygen_executable: Optional[str] = None + + +def _get_ssh_keygen_executable() -> str: + global _ssh_keygen_executable + if _ssh_keygen_executable is not None: + return _ssh_keygen_executable + ssh_keygen_path = find_ssh_util("ssh-keygen") + if ssh_keygen_path is None: + _ssh_keygen_executable = "ssh-keygen" + else: + _ssh_keygen_executable = str(ssh_keygen_path) + return _ssh_keygen_executable diff --git a/src/dstack/_internal/server/services/repos.py b/src/dstack/_internal/server/services/repos.py index c741c4d401..fd5bf77f38 100644 --- 
a/src/dstack/_internal/server/services/repos.py +++ b/src/dstack/_internal/server/services/repos.py @@ -6,17 +6,32 @@ from sqlalchemy import delete, select, update from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.errors import RepoDoesNotExistError, ServerClientError +from dstack._internal.core.errors import ( + RepoDoesNotExistError, + ResourceExistsError, + ResourceNotExistsError, + ServerClientError, +) from dstack._internal.core.models.repos import ( - AnyRepoHead, AnyRepoInfo, RepoHead, RepoHeadWithCreds, ) +from dstack._internal.core.models.repos.base import RepoType from dstack._internal.core.models.repos.remote import RemoteRepoCreds -from dstack._internal.server.models import CodeModel, ProjectModel, RepoModel +from dstack._internal.server.models import ( + CodeModel, + DecryptedString, + ProjectModel, + RepoCredsModel, + RepoModel, + UserModel, +) from dstack._internal.server.services.storage import get_default_storage -from dstack._internal.server.utils.common import run_async +from dstack._internal.utils.common import run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) async def list_repos( @@ -31,6 +46,7 @@ async def list_repos( async def get_repo( session: AsyncSession, project: ProjectModel, + user: UserModel, repo_id: str, include_creds: bool, ) -> Optional[RepoHeadWithCreds]: @@ -41,15 +57,52 @@ async def get_repo( ) if repo is None: return None - return repo_model_to_repo_head(repo, include_creds=include_creds) + if not include_creds or repo.type != RepoType.REMOTE: + return RepoHeadWithCreds.parse_obj(repo_model_to_repo_head(repo)) + repo_creds = await get_repo_creds( + session=session, + repo=repo, + user=user, + ) + return repo_model_to_repo_head_with_creds(repo, repo_creds) async def init_repo( session: AsyncSession, project: ProjectModel, + user: UserModel, repo_id: str, repo_info: AnyRepoInfo, repo_creds: Optional[RemoteRepoCreds], +) -> RepoModel: + repo = await 
create_or_update_repo( + session=session, + project=project, + repo_id=repo_id, + repo_info=repo_info, + ) + if repo.type == RepoType.REMOTE: + if repo_creds is not None: + await create_or_update_repo_creds( + session=session, + repo=repo, + user=user, + creds=repo_creds, + ) + else: + await delete_repo_creds( + session=session, + repo=repo, + user=user, + ) + return repo + + +async def create_or_update_repo( + session: AsyncSession, + project: ProjectModel, + repo_id: str, + repo_info: AnyRepoInfo, ) -> RepoModel: try: return await create_repo( @@ -57,17 +110,13 @@ async def init_repo( project=project, repo_id=repo_id, repo_info=repo_info, - repo_creds=repo_creds, ) - except sqlalchemy.exc.IntegrityError: - await session.rollback() - await session.refresh(project) + except ResourceExistsError: return await update_repo( session=session, project=project, repo_id=repo_id, repo_info=repo_info, - repo_creds=repo_creds, ) @@ -76,16 +125,18 @@ async def create_repo( project: ProjectModel, repo_id: str, repo_info: AnyRepoInfo, - repo_creds: Optional[RemoteRepoCreds], ) -> RepoModel: repo = RepoModel( project_id=project.id, name=repo_id, - type=repo_info.repo_type, + type=RepoType(repo_info.repo_type), info=repo_info.json(), - creds=repo_creds.json() if repo_creds else None, ) - session.add(repo) + try: + async with session.begin_nested(): + session.add(repo) + except sqlalchemy.exc.IntegrityError: + raise ResourceExistsError() await session.commit() return repo @@ -95,15 +146,7 @@ async def update_repo( project: ProjectModel, repo_id: str, repo_info: AnyRepoInfo, - repo_creds: Optional[RemoteRepoCreds], ) -> RepoModel: - repo = RepoModel( - project_id=project.id, - name=repo_id, - type=repo_info.repo_type, - info=repo_info.json(), - creds=repo_creds.json() if repo_creds else None, - ) await session.execute( update(RepoModel) .where( @@ -111,11 +154,13 @@ async def update_repo( RepoModel.name == repo_id, ) .values( - info=repo.info, - creds=repo.creds, + 
info=repo_info.json(), ) ) await session.commit() + repo = await get_repo_model(session=session, project=project, repo_id=repo_id) + if repo is None: + raise ResourceNotExistsError() return repo @@ -128,6 +173,101 @@ async def delete_repos( delete(RepoModel).where(RepoModel.project_id == project.id, RepoModel.name.in_(repos_ids)) ) await session.commit() + logger.info("Deleted repos %s in project %s", repos_ids, project.name) + + +async def get_repo_creds( + session: AsyncSession, + repo: RepoModel, + user: UserModel, +) -> Optional[RepoCredsModel]: + res = await session.execute( + select(RepoCredsModel).where( + RepoCredsModel.repo_id == repo.id, + RepoCredsModel.user_id == user.id, + ) + ) + return res.scalar() + + +async def create_or_update_repo_creds( + session: AsyncSession, + repo: RepoModel, + user: UserModel, + creds: RemoteRepoCreds, +) -> RepoCredsModel: + try: + return await create_repo_creds( + session=session, + repo=repo, + user=user, + creds=creds, + ) + except ResourceExistsError: + return await update_repo_creds( + session=session, + repo=repo, + user=user, + creds=creds, + ) + + +async def create_repo_creds( + session: AsyncSession, + repo: RepoModel, + user: UserModel, + creds: RemoteRepoCreds, +) -> RepoCredsModel: + repo_creds = RepoCredsModel( + repo_id=repo.id, + user_id=user.id, + creds=DecryptedString(plaintext=creds.json()), + ) + try: + async with session.begin_nested(): + session.add(repo_creds) + except sqlalchemy.exc.IntegrityError: + raise ResourceExistsError() + await session.commit() + return repo_creds + + +async def update_repo_creds( + session: AsyncSession, + repo: RepoModel, + user: UserModel, + creds: RemoteRepoCreds, +) -> RepoCredsModel: + await session.execute( + update(RepoCredsModel) + .where( + RepoCredsModel.repo_id == repo.id, + RepoCredsModel.user_id == user.id, + ) + .values( + creds=DecryptedString(plaintext=creds.json()), + ) + ) + await session.commit() + repo_creds = await get_repo_creds(session=session, 
repo=repo, user=user) + if repo_creds is None: + raise ResourceNotExistsError() + return repo_creds + + +async def delete_repo_creds( + session: AsyncSession, + repo: RepoModel, + user: UserModel, +): + await session.execute( + delete(RepoCredsModel).where( + RepoCredsModel.repo_id == repo.id, + RepoCredsModel.user_id == user.id, + ) + ) + await session.commit() + logger.info("Deleted repo creds for repo %s user %s", repo.name, user.name) async def upload_code( @@ -164,7 +304,13 @@ async def upload_code( blob=None, ) await run_async(storage.upload_code, project.name, repo.name, code.blob_hash, blob) - session.add(code) + try: + async with session.begin_nested(): + session.add(code) + except sqlalchemy.exc.IntegrityError as e: + # Concurrent API call just uploaded the same code blob (TOC/TOU race condition), + # safe to ignore + logger.debug("Conflict, rolling back: %s", e) await session.commit() @@ -196,21 +342,27 @@ async def get_code_model( return res.scalar() -def repo_model_to_repo_head( - repo_model: RepoModel, - include_creds: bool = False, -) -> AnyRepoHead: - if include_creds: - return RepoHeadWithCreds.parse_obj( - { - "repo_id": repo_model.name, - "repo_info": json.loads(repo_model.info), - "repo_creds": json.loads(repo_model.creds) if repo_model.creds else None, - } - ) - return RepoHead.parse_obj( +def repo_model_to_repo_head(repo_model: RepoModel) -> RepoHead: + return RepoHead.__response__.parse_obj( + { + "repo_id": repo_model.name, + "repo_info": json.loads(repo_model.info), + } + ) + + +def repo_model_to_repo_head_with_creds( + repo_model: RepoModel, repo_creds_model: Optional[RepoCredsModel] +) -> RepoHeadWithCreds: + repo_creds_raw: Optional[str] + if repo_creds_model is None: + repo_creds_raw = repo_model.creds + else: + repo_creds_raw = repo_creds_model.creds.plaintext + return RepoHeadWithCreds.__response__.parse_obj( { "repo_id": repo_model.name, "repo_info": json.loads(repo_model.info), + "repo_creds": json.loads(repo_creds_raw) if 
repo_creds_raw else None, } ) diff --git a/src/dstack/_internal/server/services/requirements/__init__.py b/src/dstack/_internal/server/services/requirements/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/requirements/combine.py b/src/dstack/_internal/server/services/requirements/combine.py new file mode 100644 index 0000000000..53cec3457e --- /dev/null +++ b/src/dstack/_internal/server/services/requirements/combine.py @@ -0,0 +1,286 @@ +from typing import Callable, List, Optional, Protocol, TypeVar + +from pydantic import BaseModel +from typing_extensions import Self + +from dstack._internal.core.backends.profile_options import AnyBackendProfileOptions +from dstack._internal.core.models.profiles import Profile, SpotPolicy +from dstack._internal.core.models.resources import ( + CPUSpec, + DiskSpec, + GPUSpec, + Memory, + Range, + ResourcesSpec, +) +from dstack._internal.core.models.runs import Requirements +from dstack._internal.utils.combine import ( + CombineError, + combine_optional, + get_single_value_optional, +) +from dstack._internal.utils.typing import SupportsRichComparison + + +def combine_fleet_and_run_profiles( + fleet_profile: Profile, run_profile: Profile +) -> Optional[Profile]: + """ + Combines fleet and run profile parameters that affect offer selection or provisioning. 
+ """ + try: + return Profile( + backends=_intersect_lists_optional(fleet_profile.backends, run_profile.backends), + regions=_intersect_lists_optional(fleet_profile.regions, run_profile.regions), + availability_zones=_intersect_lists_optional( + fleet_profile.availability_zones, run_profile.availability_zones + ), + instance_types=_intersect_lists_optional( + fleet_profile.instance_types, run_profile.instance_types + ), + reservation=get_single_value_optional( + fleet_profile.reservation, run_profile.reservation + ), + spot_policy=_combine_spot_policy_optional( + fleet_profile.spot_policy, run_profile.spot_policy + ), + max_price=_get_min_optional(fleet_profile.max_price, run_profile.max_price), + idle_duration=_combine_idle_duration_optional( + fleet_profile.idle_duration, run_profile.idle_duration + ), + tags=_combine_tags_optional(fleet_profile.tags, run_profile.tags), + backend_options=_combine_backend_options_optional( + fleet_profile.backend_options, run_profile.backend_options + ), + ) + except CombineError: + return None + + +def combine_fleet_and_run_requirements( + fleet_requirements: Requirements, run_requirements: Requirements +) -> Optional[Requirements]: + try: + return Requirements( + resources=_combine_resources(fleet_requirements.resources, run_requirements.resources), + max_price=_get_min_optional(fleet_requirements.max_price, run_requirements.max_price), + spot=_combine_spot_optional(fleet_requirements.spot, run_requirements.spot), + reservation=get_single_value_optional( + fleet_requirements.reservation, run_requirements.reservation + ), + multinode=fleet_requirements.multinode or run_requirements.multinode, + backend_options=_combine_backend_options_optional( + fleet_requirements.backend_options, run_requirements.backend_options + ), + ) + except CombineError: + return None + + +def _combine_backend_options( + value1: List[AnyBackendProfileOptions], + value2: List[AnyBackendProfileOptions], +) -> List[AnyBackendProfileOptions]: + by_type: 
dict[str, AnyBackendProfileOptions] = {opt.type: opt for opt in value1} + for opt in value2: + if opt.type in by_type: + by_type[opt.type] = by_type[opt.type].combine(opt) + else: + by_type[opt.type] = opt.copy(deep=True) + return list(by_type.values()) + + +def _combine_backend_options_optional( + value1: Optional[List[AnyBackendProfileOptions]], + value2: Optional[List[AnyBackendProfileOptions]], +) -> Optional[List[AnyBackendProfileOptions]]: + return _combine_copy_model_list_optional(value1, value2, _combine_backend_options) + + +_T = TypeVar("_T") +_ModelT = TypeVar("_ModelT", bound=BaseModel) +_CompT = TypeVar("_CompT", bound=SupportsRichComparison) + + +class _SupportsCopy(Protocol): + def copy(self) -> Self: ... + + +_CopyT = TypeVar("_CopyT", bound=_SupportsCopy) + + +def _intersect_lists_optional( + list1: Optional[list[_T]], list2: Optional[list[_T]] +) -> Optional[list[_T]]: + if list1 is None: + if list2 is None: + return None + return list2.copy() + if list2 is None: + return list1.copy() + return [x for x in list1 if x in list2] + + +def _get_min(value1: _CompT, value2: _CompT) -> _CompT: + return min(value1, value2) + + +def _get_min_optional(value1: Optional[_CompT], value2: Optional[_CompT]) -> Optional[_CompT]: + return combine_optional(value1, value2, _get_min) + + +def _combine_spot_policy(value1: SpotPolicy, value2: SpotPolicy) -> SpotPolicy: + if value1 == SpotPolicy.AUTO: + return value2 + if value2 == SpotPolicy.AUTO: + return value1 + if value1 == value2: + return value1 + raise CombineError(f"spot_policy values {value1} and {value2} cannot be combined") + + +def _combine_spot_policy_optional( + value1: Optional[SpotPolicy], value2: Optional[SpotPolicy] +) -> Optional[SpotPolicy]: + return combine_optional(value1, value2, _combine_spot_policy) + + +def _combine_idle_duration(value1: int, value2: int) -> int: + if value1 < 0: + if value2 < 0: + return min(value1, value2) + return value2 + if value2 < 0: + return value1 + return min(value1, 
value2) + + +def _combine_idle_duration_optional(value1: Optional[int], value2: Optional[int]) -> Optional[int]: + return combine_optional(value1, value2, _combine_idle_duration) + + +def _combine_tags_optional( + value1: Optional[dict[str, str]], value2: Optional[dict[str, str]] +) -> Optional[dict[str, str]]: + return _combine_copy_optional(value1, value2, _combine_tags) + + +def _combine_tags(value1: dict[str, str], value2: dict[str, str]) -> dict[str, str]: + return value1 | value2 + + +def _combine_resources(value1: ResourcesSpec, value2: ResourcesSpec) -> ResourcesSpec: + return ResourcesSpec( + cpu=_combine_cpu(value1.cpu, value2.cpu), # type: ignore[attr-defined] + memory=_combine_memory(value1.memory, value2.memory), + shm_size=_combine_shm_size_optional(value1.shm_size, value2.shm_size), + gpu=_combine_gpu_optional(value1.gpu, value2.gpu), + disk=_combine_disk_optional(value1.disk, value2.disk), + ) + + +def _combine_cpu(value1: CPUSpec, value2: CPUSpec) -> CPUSpec: + return CPUSpec( + arch=get_single_value_optional(value1.arch, value2.arch), + count=_combine_range(value1.count, value2.count), + ) + + +def _combine_memory(value1: Range[Memory], value2: Range[Memory]) -> Range[Memory]: + return _combine_range(value1, value2) + + +def _combine_shm_size_optional( + value1: Optional[Memory], value2: Optional[Memory] +) -> Optional[Memory]: + return _get_min_optional(value1, value2) + + +def _combine_gpu(value1: GPUSpec, value2: GPUSpec) -> GPUSpec: + return GPUSpec( + vendor=get_single_value_optional(value1.vendor, value2.vendor), + name=_intersect_lists_optional(value1.name, value2.name), + count=_combine_range(value1.count, value2.count), + memory=_combine_range_optional(value1.memory, value2.memory), + total_memory=_combine_range_optional(value1.total_memory, value2.total_memory), + compute_capability=_get_min_optional(value1.compute_capability, value2.compute_capability), + ) + + +def _combine_gpu_optional( + value1: Optional[GPUSpec], value2: 
Optional[GPUSpec] +) -> Optional[GPUSpec]: + return _combine_models_optional(value1, value2, _combine_gpu) + + +def _combine_disk(value1: DiskSpec, value2: DiskSpec) -> DiskSpec: + return DiskSpec(size=_combine_range(value1.size, value2.size)) + + +def _combine_disk_optional( + value1: Optional[DiskSpec], value2: Optional[DiskSpec] +) -> Optional[DiskSpec]: + return _combine_models_optional(value1, value2, _combine_disk) + + +def _combine_spot(value1: bool, value2: bool) -> bool: + if value1 != value2: + raise CombineError(f"spot values {value1} and {value2} cannot be combined") + return value1 + + +def _combine_spot_optional(value1: Optional[bool], value2: Optional[bool]) -> Optional[bool]: + return combine_optional(value1, value2, _combine_spot) + + +def _combine_range(value1: Range, value2: Range) -> Range: + res = value1.intersect(value2) + if res is None: + raise CombineError(f"Ranges {value1} and {value2} cannot be combined") + return res + + +def _combine_range_optional(value1: Optional[Range], value2: Optional[Range]) -> Optional[Range]: + return _combine_models_optional(value1, value2, _combine_range) + + +def _combine_models_optional( + value1: Optional[_ModelT], + value2: Optional[_ModelT], + combiner: Callable[[_ModelT, _ModelT], _ModelT], +) -> Optional[_ModelT]: + if value1 is None: + if value2 is not None: + return value2.copy(deep=True) + return None + if value2 is None: + return value1.copy(deep=True) + return combiner(value1, value2) + + +def _combine_copy_optional( + value1: Optional[_CopyT], + value2: Optional[_CopyT], + combiner: Callable[[_CopyT, _CopyT], _CopyT], +) -> Optional[_CopyT]: + if value1 is None: + if value2 is not None: + return value2.copy() + return None + if value2 is None: + return value1.copy() + return combiner(value1, value2) + + +def _combine_copy_model_list_optional( + value1: Optional[List[_ModelT]], + value2: Optional[List[_ModelT]], + combiner: Callable[[List[_ModelT], List[_ModelT]], List[_ModelT]], +) -> 
Optional[List[_ModelT]]: + if value1 is None: + if value2 is not None: + return [item.copy(deep=True) for item in value2] + return None + if value2 is None: + return [item.copy(deep=True) for item in value1] + return combiner(value1, value2) diff --git a/src/dstack/_internal/server/services/resources.py b/src/dstack/_internal/server/services/resources.py new file mode 100644 index 0000000000..8b38f92f4e --- /dev/null +++ b/src/dstack/_internal/server/services/resources.py @@ -0,0 +1,52 @@ +from typing import Optional + +import gpuhunt +from pydantic import parse_obj_as + +from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec + + +def set_resources_defaults(resources: ResourcesSpec) -> None: + # TODO: Remove in 0.20. Use resources.cpu directly + cpu = parse_obj_as(CPUSpec, resources.cpu) + if cpu.arch is None: + gpu = resources.gpu + if ( + gpu is not None + and gpu.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA] + and gpu.name + and any(map(gpuhunt.is_nvidia_superchip, gpu.name)) + ): + cpu.arch = gpuhunt.CPUArchitecture.ARM + else: + cpu.arch = gpuhunt.CPUArchitecture.X86 + resources.cpu = cpu + + +def set_gpu_vendor_default( + resources: ResourcesSpec, + image: Optional[str], + docker: Optional[bool], +) -> None: + """Default GPU vendor to Nvidia when using the default CUDA image, + since it's only compatible with Nvidia GPUs. Only called for runs + (not fleets) since fleets don't have image context. + + The client infers the same default for display and validation + (see validate_gpu_vendor_and_image) but does not write it to the spec + for 0.19.x server compatibility. This server-side function is what + actually sets the vendor before offer matching. + + TODO: All resource defaults and validation (gpu vendor, cpu arch, memory, + disk, etc.) 
should be set here on the server, not split between client + and model-level defaults.""" + gpu = resources.gpu + if ( + gpu is not None + and gpu.vendor is None + and gpu.name is None + and gpu.count.max != 0 + and image is None + and docker is not True + ): + gpu.vendor = gpuhunt.AcceleratorVendor.NVIDIA diff --git a/src/dstack/_internal/server/services/runner/client.py b/src/dstack/_internal/server/services/runner/client.py index bdb27ee39e..7ccc2b1af7 100644 --- a/src/dstack/_internal/server/services/runner/client.py +++ b/src/dstack/_internal/server/services/runner/client.py @@ -1,71 +1,155 @@ -from dataclasses import dataclass -from typing import BinaryIO, Dict, List, Optional, Union +import urllib.parse +import uuid +from collections.abc import Generator +from http import HTTPStatus +from pathlib import Path +from typing import BinaryIO, Dict, List, Literal, Optional, TypeVar, Union, overload +import packaging.version import requests import requests.exceptions +import requests_unixsocket +from typing_extensions import Self +from dstack._internal.core.errors import DstackError +from dstack._internal.core.models.common import CoreModel, NetworkMode +from dstack._internal.core.models.envs import Env from dstack._internal.core.models.repos.remote import RemoteRepoCreds from dstack._internal.core.models.resources import Memory -from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec -from dstack._internal.core.models.volumes import Volume, VolumeMountPoint +from dstack._internal.core.models.runs import ClusterInfo, Job, Run +from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint +from dstack._internal.server import settings as server_settings +from dstack._internal.server.schemas.instances import InstanceCheck from dstack._internal.server.schemas.runner import ( + ComponentInfo, + ComponentInstallRequest, + ComponentListResponse, + ComponentName, + GPUDevice, HealthcheckResponse, - PullBody, + 
InstanceHealthResponse, + JobInfoResponse, + LegacyPullResponse, + LegacyStopBody, + LegacySubmitBody, + MetricsResponse, PullResponse, ShimVolumeInfo, - StopBody, + ShutdownRequest, SubmitBody, - TaskConfigBody, + TaskInfoResponse, + TaskListResponse, + TaskStatus, + TaskSubmitRequest, + TaskTerminateRequest, ) +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import PathLike -REMOTE_SHIM_PORT = 10998 -REMOTE_RUNNER_PORT = 10999 -REQUEST_TIMEOUT = 15 +REQUEST_TIMEOUT = 9 +UPLOAD_CODE_REQUEST_TIMEOUT = 60 +logger = get_logger(__name__) -@dataclass -class HealthStatus: - healthy: bool - reason: str - - def __str__(self) -> str: - return self.reason +LocalAddress = Union[int, Path] +"""A local TCP port or a Unix domain socket path the client connects to.""" class RunnerClient: + # `/api/upload_code` call is not required if there is no code + _OPTIONAL_CODE_UPLOAD_MIN_VERSION = (0, 20, 17) + + _version_string: str + _version_tuple: Optional["_Version"] + _negotiated: bool = False + def __init__( self, - port: int, + port: Optional[int] = None, hostname: str = "localhost", + uds: Optional[PathLike] = None, ): - self.secure = False - self.hostname = hostname - self.port = port + self._session, self._base_url = _make_session_and_base_url(port, hostname, uds) + + @classmethod + def from_address(cls, address: LocalAddress) -> Self: + """ + Builds a client from a TCP port (`int`) or a Unix domain socket path (`Path`). 
+ """ + if isinstance(address, int): + return cls(port=address) + return cls(uds=address) + + def get_version_string(self) -> str: + if not self._negotiated: + self._negotiate() + return self._version_string + + def get_version_tuple(self) -> Optional["_Version"]: + if not self._negotiated: + self._negotiate() + return self._version_tuple + + def is_code_upload_optional(self) -> bool: + version_tuple = self.get_version_tuple() + return version_tuple is None or version_tuple >= self._OPTIONAL_CODE_UPLOAD_MIN_VERSION def healthcheck(self) -> Optional[HealthcheckResponse]: try: - resp = requests.get(self._url("/api/healthcheck"), timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return HealthcheckResponse.__response__.parse_obj(resp.json()) + healthcheck_response = self._healthcheck() except requests.exceptions.RequestException: return None + if not self._negotiated: + self._negotiate(healthcheck_response) + return healthcheck_response + + def get_metrics(self) -> Optional[MetricsResponse]: + resp = self._session.get(self._url("/api/metrics"), timeout=REQUEST_TIMEOUT) + if resp.status_code == 404: + return None + resp.raise_for_status() + return MetricsResponse.__response__.parse_obj(resp.json()) def submit_job( self, - run_spec: RunSpec, - job_spec: JobSpec, + run: Run, + job: Job, cluster_info: ClusterInfo, secrets: Dict[str, str], repo_credentials: Optional[RemoteRepoCreds], + instance_env: Optional[Union[Env, Dict[str, str]]] = None, + router_env: Optional[Dict[str, str]] = None, ): + # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific + # and Dynamo-router environment variables to the runner without runner + # API modification. Both layers are merged into a deep-copied job_spec + # so the shared spec object held by the caller is not mutated. 
+ job_spec = job.job_spec + if instance_env is not None or router_env is not None: + merged_env: Dict[str, str] = {} + if instance_env is not None: + if isinstance(instance_env, Env): + merged_env.update(instance_env.as_dict()) + else: + merged_env.update(instance_env) + merged_env.update(job_spec.env) + if router_env is not None: + merged_env.update(router_env) + job_spec = job_spec.copy(deep=True) + job_spec.env = merged_env + quota = server_settings.SERVER_LOG_QUOTA_PER_JOB_HOUR body = SubmitBody( - run_spec=run_spec, + run=run, job_spec=job_spec, + job_submission=job.job_submissions[-1], cluster_info=cluster_info, secrets=secrets, repo_credentials=repo_credentials, + log_quota_hour=quota if quota > 0 else None, + run_spec=run.run_spec, ) - resp = requests.post( + resp = self._session.post( # use .json() to encode enums self._url("/api/submit"), data=body.json(), @@ -74,110 +158,589 @@ def submit_job( ) resp.raise_for_status() + def upload_archive(self, id: uuid.UUID, file: Union[BinaryIO, bytes]): + resp = self._session.post( + self._url("/api/upload_archive"), + files={"archive": (str(id), file)}, + timeout=UPLOAD_CODE_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + def upload_code(self, file: Union[BinaryIO, bytes]): - resp = requests.post(self._url("/api/upload_code"), data=file, timeout=REQUEST_TIMEOUT) + resp = self._session.post( + self._url("/api/upload_code"), data=file, timeout=UPLOAD_CODE_REQUEST_TIMEOUT + ) resp.raise_for_status() - def run_job(self): - resp = requests.post(self._url("/api/run"), timeout=REQUEST_TIMEOUT) + def run_job(self) -> Optional[JobInfoResponse]: + resp = self._session.post(self._url("/api/run"), timeout=REQUEST_TIMEOUT) resp.raise_for_status() + if not _is_json_response(resp): + # Old runner or runner failed to get job info + return None + return JobInfoResponse.__response__.parse_obj(resp.json()) def pull(self, timestamp: int) -> PullResponse: - resp = requests.get( + resp = self._session.get( self._url("/api/pull"), 
params={"timestamp": timestamp}, timeout=REQUEST_TIMEOUT ) resp.raise_for_status() return PullResponse.__response__.parse_obj(resp.json()) def stop(self): - resp = requests.post(self._url("/api/stop"), timeout=REQUEST_TIMEOUT) + resp = self._session.post(self._url("/api/stop"), timeout=REQUEST_TIMEOUT) resp.raise_for_status() def _url(self, path: str) -> str: - return f"{'https' if self.secure else 'http'}://{self.hostname}:{self.port}/{path.lstrip('/')}" + return f"{self._base_url}/{path.lstrip('/')}" + + def _healthcheck(self) -> HealthcheckResponse: + resp = self._session.get(self._url("/api/healthcheck"), timeout=REQUEST_TIMEOUT) + resp.raise_for_status() + return HealthcheckResponse.__response__.parse_obj(resp.json()) + + def _negotiate(self, healthcheck_response: Optional[HealthcheckResponse] = None) -> None: + if healthcheck_response is None: + healthcheck_response = self._healthcheck() + version_string = healthcheck_response.version + version_tuple = _parse_version(version_string) + self._version_string = version_string + self._version_tuple = version_tuple + self._negotiated = True + + +class ShimError(DstackError): + pass + + +class ShimHTTPError(ShimError): + """ + An HTTP error wrapper for `requests.exceptions.HTTPError`. 
Should be used as follows: + + try: + + except requests.exceptions.HTTPError as e: + raise ShimHTTPError() from e + """ + + def __str__(self) -> str: + return self.message + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.status_code})" + + @property + def status_code(self) -> int: + cause = self._cause + if cause is not None and cause.response is not None: + return cause.response.status_code + return 0 + + @property + def message(self) -> str: + cause = self._cause + if cause is None: + return "unknown_error" + return str(cause) + + @property + def _cause(self) -> Optional[requests.exceptions.HTTPError]: + cause = self.__cause__ + if isinstance(cause, requests.exceptions.HTTPError): + return cause + return None + + +class ShimAPIVersionError(ShimError): + pass + + +class ComponentList: + _items: dict[ComponentName, ComponentInfo] + + def __init__(self) -> None: + self._items = {} + + def __iter__(self) -> Generator[ComponentInfo, None, None]: + for component_info in self._items.values(): + yield component_info + + @classmethod + def from_response(cls, response: ComponentListResponse) -> Self: + components = cls() + for component_info in response.components: + try: + components.add(component_info) + except ValueError as e: + logger.warning("Error processing ComponentInfo: %s", e) + return components + + @property + def runner(self) -> Optional[ComponentInfo]: + return self.get(ComponentName.RUNNER) + + @property + def shim(self) -> Optional[ComponentInfo]: + return self.get(ComponentName.SHIM) + + def get(self, name: ComponentName) -> Optional[ComponentInfo]: + return self._items.get(name) + + def add(self, component_info: ComponentInfo) -> None: + try: + name = ComponentName(component_info.name) + except ValueError as e: + raise ValueError(f"Unknown component: {component_info.name}") from e + if name in self._items: + raise ValueError(f"Duplicate component: {component_info.name}") + self._items[name] = component_info class ShimClient: + # 
API v2 (a.k.a. Future API) — `/api/tasks/[:id[/{terminate,remove}]]` + # API v1 (a.k.a. Legacy API) — `/api/{submit,pull,stop}` + _API_V2_MIN_SHIM_VERSION = (0, 18, 34) + + # `/api/instance/health` + _INSTANCE_HEALTH_MIN_SHIM_VERSION = (0, 19, 22) + + # `/api/components` + _COMPONENTS_MIN_SHIM_VERSION = (0, 20, 0) + + # `/api/shutdown` + _SHUTDOWN_MIN_SHIM_VERSION = (0, 20, 1) + + _shim_version_string: str + _shim_version_tuple: Optional["_Version"] + _api_version: int + _negotiated: bool = False + def __init__( self, - port: int, + port: Optional[int] = None, hostname: str = "localhost", + uds: Optional[PathLike] = None, ): - self.secure = False - self.hostname = hostname - self.port = port + self._session, self._base_url = _make_session_and_base_url(port, hostname, uds) + + @classmethod + def from_address(cls, address: LocalAddress) -> Self: + """ + Builds a client from a TCP port (`int`) or a Unix domain socket path (`Path`). + """ + if isinstance(address, int): + return cls(port=address) + return cls(uds=address) + + # Methods shared by all API versions + + def get_version_string(self) -> str: + if not self._negotiated: + self._negotiate() + return self._shim_version_string + + def get_version_tuple(self) -> Optional["_Version"]: + if not self._negotiated: + self._negotiate() + return self._shim_version_tuple + + def is_api_v2_supported(self) -> bool: + if not self._negotiated: + self._negotiate() + return self._api_version == 2 + + def is_instance_health_supported(self) -> bool: + if not self._negotiated: + self._negotiate() + return ( + self._shim_version_tuple is None + or self._shim_version_tuple >= self._INSTANCE_HEALTH_MIN_SHIM_VERSION + ) + + def are_components_supported(self) -> bool: + if not self._negotiated: + self._negotiate() + return ( + self._shim_version_tuple is None + or self._shim_version_tuple >= self._COMPONENTS_MIN_SHIM_VERSION + ) + + def is_shutdown_supported(self) -> bool: + if not self._negotiated: + self._negotiate() + return ( + 
self._shim_version_tuple is None + or self._shim_version_tuple >= self._SHUTDOWN_MIN_SHIM_VERSION + ) - def healthcheck(self, unmask_exeptions: bool = False) -> Optional[HealthcheckResponse]: + @overload + def healthcheck(self) -> Optional[HealthcheckResponse]: ... + + @overload + def healthcheck(self, unmask_exceptions: Literal[True]) -> HealthcheckResponse: ... + + def healthcheck(self, unmask_exceptions: bool = False) -> Optional[HealthcheckResponse]: try: - resp = requests.get(self._url("/api/healthcheck"), timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return HealthcheckResponse.__response__.parse_obj(resp.json()) + resp = self._request("GET", "/api/healthcheck", raise_for_status=True) except requests.exceptions.RequestException: - if unmask_exeptions: + if unmask_exceptions: raise return None + if not self._negotiated: + self._negotiate(resp) + return self._response(HealthcheckResponse, resp) + + # API v2 methods + + def get_instance_health(self) -> Optional[InstanceHealthResponse]: + if not self.is_instance_health_supported(): + logger.debug("instance health is not supported: %s", self._shim_version_string) + return None + resp = self._request("GET", "/api/instance/health") + if resp.status_code == HTTPStatus.NOT_FOUND: + logger.warning("instance health: %s", resp.text) + return None + self._raise_for_status(resp) + return self._response(InstanceHealthResponse, resp) + + def shutdown(self, *, force: bool) -> bool: + if not self.is_shutdown_supported(): + logger.debug("shim shutdown is not supported: %s", self._shim_version_string) + return False + body = ShutdownRequest(force=force) + resp = self._request("POST", "/api/shutdown", body) + # TODO: Remove this check after 0.20.1 release, use _request(..., raise_for_status=True) + if resp.status_code == HTTPStatus.NOT_FOUND and self._shim_version_tuple is None: + # Old dev build of shim + logger.debug("shim shutdown is not supported: %s", self._shim_version_string) + return False + 
self._raise_for_status(resp) + return True + + def is_safe_to_restart(self) -> bool: + if not self.is_api_v2_supported(): + # old shim, `/api/shutdown` is not supported anyway + return False + task_list = self.list_tasks() + if (tasks := task_list.tasks) is None: + # old shim, `/api/shutdown` is not supported anyway + return False + restart_safe_task_statuses = self._get_restart_safe_task_statuses() + return all(t.status in restart_safe_task_statuses for t in tasks) + + def get_components(self) -> Optional[ComponentList]: + if not self.are_components_supported(): + logger.debug("components are not supported: %s", self._shim_version_string) + return None + resp = self._request("GET", "/api/components", raise_for_status=True) + return ComponentList.from_response(self._response(ComponentListResponse, resp)) + + def install_runner(self, url: str) -> None: + body = ComponentInstallRequest( + name=ComponentName.RUNNER, + url=url, + ) + self._request("POST", "/api/components/install", body, raise_for_status=True) + + def install_shim(self, url: str) -> None: + body = ComponentInstallRequest( + name=ComponentName.SHIM, + url=url, + ) + self._request("POST", "/api/components/install", body, raise_for_status=True) + + def list_tasks(self) -> TaskListResponse: + if not self.is_api_v2_supported(): + raise ShimAPIVersionError() + resp = self._request("GET", "/api/tasks", raise_for_status=True) + return self._response(TaskListResponse, resp) + + def get_task(self, task_id: "_TaskID") -> TaskInfoResponse: + if not self.is_api_v2_supported(): + raise ShimAPIVersionError() + resp = self._request("GET", f"/api/tasks/{task_id}", raise_for_status=True) + return self._response(TaskInfoResponse, resp) + + def submit_task( + self, + task_id: "_TaskID", + name: str, + registry_username: str, + registry_password: str, + image_name: str, + container_user: str, + privileged: bool, + gpu: Optional[int], + cpu: Optional[float], + memory: Optional[Memory], + shm_size: Optional[Memory], + 
network_mode: NetworkMode, + volumes: list[Volume], + volume_mounts: list[VolumeMountPoint], + instance_mounts: list[InstanceMountPoint], + gpu_devices: list[GPUDevice], + host_ssh_user: str, + host_ssh_keys: list[str], + container_ssh_keys: list[str], + instance_id: str, + ) -> None: + if not self.is_api_v2_supported(): + raise ShimAPIVersionError() + body = TaskSubmitRequest( + id=str(task_id), + name=name, + registry_username=registry_username, + registry_password=registry_password, + image_name=image_name, + container_user=container_user, + privileged=privileged, + gpu=gpu if gpu is not None else -1, # None = -1 = "all available" (0 means "0 GPU") + cpu=cpu if cpu is not None else 0, # None = 0 = "all available" + memory=_memory_to_bytes(memory), # None = 0 = "all available" + shm_size=_memory_to_bytes(shm_size), # None = 0 = "use default value" + network_mode=network_mode, + volumes=[_volume_to_shim_volume_info(v, instance_id) for v in volumes], + volume_mounts=volume_mounts, + instance_mounts=instance_mounts, + gpu_devices=gpu_devices, + host_ssh_user=host_ssh_user, + host_ssh_keys=host_ssh_keys, + container_ssh_keys=container_ssh_keys, + ) + self._request("POST", "/api/tasks", body, raise_for_status=True) + + def terminate_task( + self, + task_id: "_TaskID", + reason: Optional[str] = None, + message: Optional[str] = None, + *, + timeout: int = 10, + ) -> None: + if not self.is_api_v2_supported(): + raise ShimAPIVersionError() + body = TaskTerminateRequest( + termination_reason=reason or "", + termination_message=message or "", + timeout=timeout, + ) + self._request("POST", f"/api/tasks/{task_id}/terminate", body, raise_for_status=True) + + def remove_task(self, task_id: "_TaskID") -> None: + if not self.is_api_v2_supported(): + raise ShimAPIVersionError() + self._request("POST", f"/api/tasks/{task_id}/remove", raise_for_status=True) + + # API v1 methods def submit( self, username: str, password: str, image_name: str, + privileged: bool, container_name: str, 
+ container_user: str, shm_size: Optional[Memory], public_keys: List[str], ssh_user: str, ssh_key: str, mounts: List[VolumeMountPoint], volumes: List[Volume], - ): - _shm_size = int(shm_size * 1024 * 1024 * 1014) if shm_size else 0 - volume_infos = [_volume_to_shim_volume_info(v) for v in volumes] - post_body = TaskConfigBody( + instance_mounts: List[InstanceMountPoint], + instance_id: str, + ) -> bool: + """ + Returns `True` if submitted and `False` if the shim already has a job (`409 Conflict`). + Other error statuses raise an exception. + """ + body = LegacySubmitBody( username=username, password=password, image_name=image_name, + privileged=privileged, container_name=container_name, - shm_size=_shm_size, + container_user=container_user, + shm_size=int(shm_size * 1024**3) if shm_size else 0, public_keys=public_keys, ssh_user=ssh_user, ssh_key=ssh_key, mounts=mounts, - volumes=volume_infos, - ).dict() - resp = requests.post( - self._url("/api/submit"), - json=post_body, - timeout=REQUEST_TIMEOUT, + volumes=[_volume_to_shim_volume_info(v, instance_id) for v in volumes], + instance_mounts=instance_mounts, ) - resp.raise_for_status() + resp = self._request("POST", "/api/submit", body) + if resp.status_code == HTTPStatus.CONFLICT: + return False + self._raise_for_status(resp) + return True - def stop(self, force: bool = False): - body = StopBody(force=force) - resp = requests.post(self._url("/api/stop"), json=body.dict(), timeout=REQUEST_TIMEOUT) - resp.raise_for_status() + def stop(self, force: bool = False) -> None: + body = LegacyStopBody(force=force) + self._request("POST", "/api/stop", body, raise_for_status=True) - def pull(self) -> PullBody: - resp = requests.get(self._url("/api/pull"), timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return PullBody.__response__.parse_obj(resp.json()) + def pull(self) -> LegacyPullResponse: + resp = self._request("GET", "/api/pull", raise_for_status=True) + return self._response(LegacyPullResponse, resp) - def _url(self, 
path: str) -> str: - return f"{'https' if self.secure else 'http'}://{self.hostname}:{self.port}/{path.lstrip('/')}" + # Metrics + + def get_task_metrics(self, task_id: "_TaskID") -> Optional[str]: + resp = self._request("GET", f"/metrics/tasks/{task_id}") + if resp.status_code == HTTPStatus.NOT_FOUND: + # Metrics exporter is not installed or old shim version + return None + if resp.status_code == HTTPStatus.BAD_GATEWAY: + # Metrics exporter is not available or returned an error + logger.info("failed to collect metrics for task %s: %s", task_id, resp.text) + return None + self._raise_for_status(resp) + return resp.text + + # Private methods used for public methods implementations + + def _request( + self, + method: str, + path: str, + body: Optional[CoreModel] = None, + *, + raise_for_status: bool = False, + ) -> requests.Response: + url = f"{self._base_url}/{path.lstrip('/')}" + if body is not None: + json = body.dict() + else: + json = None + resp = self._session.request(method, url, json=json, timeout=REQUEST_TIMEOUT) + if raise_for_status: + self._raise_for_status(resp) + return resp + + _M = TypeVar("_M", bound=CoreModel) + + def _response(self, model_cls: type[_M], response: requests.Response) -> _M: + return model_cls.__response__.parse_obj(response.json()) + + def _raise_for_status(self, response: requests.Response) -> None: + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + raise ShimHTTPError() from e + def _negotiate(self, healthcheck_response: Optional[requests.Response] = None) -> None: + if healthcheck_response is None: + healthcheck_response = self._request("GET", "/api/healthcheck", raise_for_status=True) + version_string = self._response(HealthcheckResponse, healthcheck_response).version + version_tuple = _parse_version(version_string) + if version_tuple is None or version_tuple >= self._API_V2_MIN_SHIM_VERSION: + api_version = 2 + else: + api_version = 1 + self._shim_version_string = version_string + 
self._shim_version_tuple = version_tuple + self._api_version = api_version + self._negotiated = True -def health_response_to_health_status(data: HealthcheckResponse) -> HealthStatus: - if data.service == "dstack-shim": - return HealthStatus(healthy=True, reason="Service is OK") + def _get_restart_safe_task_statuses(self) -> list[TaskStatus]: + # TODO: Rework shim's DockerRunner.Run() so that it does not wait for container termination + # (this at least requires replacing .waitContainer() with periodic polling of container + # statuses and moving some cleanup defer calls to .Terminate() and/or .Remove()) and add + # TaskStatus.RUNNING to the list of restart-safe task statuses for supported shim versions. + return [TaskStatus.TERMINATED] + + +def _make_session_and_base_url( + port: Optional[int], hostname: str, uds: Optional[PathLike] +) -> tuple[requests.Session, str]: + """ + Builds a session and base URL for HTTP over TCP (`port`) or over + a Unix domain socket (`uds`). Exactly one of the two must be specified. 
+ """ + if (port is None) == (uds is None): + raise ValueError("Either port or uds must be specified, not both") + session = requests.Session() + if uds is not None: + base_url = f"http+unix://{urllib.parse.quote(str(uds), safe='')}" + session.mount("http+unix://", requests_unixsocket.UnixAdapter()) else: - return HealthStatus( - healthy=False, - reason=f"Service name is {data.service}, service version: {data.version}", + base_url = f"http://{hostname}:{port}" + return session, base_url + + +def healthcheck_response_to_instance_check( + response: HealthcheckResponse, + instance_health_response: Optional[InstanceHealthResponse] = None, +) -> InstanceCheck: + if response.service == "dstack-shim": + message: Optional[str] = None + if ( + instance_health_response is not None + and instance_health_response.dcgm is not None + and instance_health_response.dcgm.incidents + ): + message = instance_health_response.dcgm.incidents[0].error_message + return InstanceCheck( + reachable=True, health_response=instance_health_response, message=message ) + return InstanceCheck( + reachable=False, + message=f"unexpected service: {response.service} version: {response.version}", + health_response=instance_health_response, + ) -def _volume_to_shim_volume_info(volume: Volume) -> ShimVolumeInfo: +def _volume_to_shim_volume_info(volume: Volume, instance_id: str) -> ShimVolumeInfo: + device_name = None + attachment_data = volume.get_attachment_data_for_instance(instance_id) + if attachment_data is not None: + device_name = attachment_data.device_name return ShimVolumeInfo( + backend=volume.configuration.backend.value, name=volume.name, - volume_id=volume.volume_id, + volume_id=get_or_error(volume.volume_id), init_fs=not volume.external, + device_name=device_name, ) + + +def _memory_to_bytes(memory: Optional[Memory]) -> int: + if memory is None: + return 0 + return int(memory * 1024**3) + + +def _is_json_response(response: requests.Response) -> bool: + content_type = 
response.headers.get("content-type") + if not content_type: + return False + return content_type.split(";", maxsplit=1)[0].strip() == "application/json" + + +_TaskID = Union[uuid.UUID, str] + +_Version = tuple[int, int, int] + + +def _parse_version(version_string: str) -> Optional[_Version]: + """ + Returns a (major, minor, micro) tuple if the version if final. + Returns `None`, which means "latest", if: + * the version is prerelease or dev build -- assuming that in most cases it's a build based on + the latest final release + * the version consists of only major part or not valid at all, e.g., staging builds have + GitHub run number (e.g., 1234) instead of the version -- assuming that it's a "bleeding edge", + not yet released version + """ + try: + version = packaging.version.parse(version_string) + except packaging.version.InvalidVersion: + return None + if version.is_prerelease or version.is_devrelease: + return None + release = version.release + if len(release) <= 1: + return None + if len(release) == 2: + return (*release, 0) + return release[:3] diff --git a/src/dstack/_internal/server/services/runner/pool.py b/src/dstack/_internal/server/services/runner/pool.py new file mode 100644 index 0000000000..e3a012967d --- /dev/null +++ b/src/dstack/_internal/server/services/runner/pool.py @@ -0,0 +1,367 @@ +import os +import shutil +import threading +import time +from dataclasses import dataclass +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Collection, Optional, Union +from weakref import WeakValueDictionary + +from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import SSHError +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.runs import JobProvisioningData, JobRuntimeData +from dstack._internal.core.services.ssh.tunnel import ( + SSH_DEFAULT_OPTIONS, + IPSocket, + SocketPair, + SSHTunnel, + 
UnixSocket, +) +from dstack._internal.server.settings import ( + SERVER_DIR_PATH, + SERVER_SSH_CONNECT_TIMEOUT, +) +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.path import FileContent, make_tmp_symlink_to_dir + +logger = get_logger(__name__) + +PrivateKeyOrPair = Union[str, tuple[str, Optional[str]]] +"""A host private key or pair of (host private key, optional proxy jump private key)""" + +CONNECTIONS_DIR = SERVER_DIR_PATH / "instance-connections" + +MIN_ALIVE_CHECK_INTERVAL = 30 +"""How often (at most) `InstanceConnection.is_alive()` runs `ssh -O check`, in seconds.""" + + +@dataclass(frozen=True) +class InstanceConnectionKey: + hostname: str + port: int + ports_to_forward: tuple[int, ...] + + @staticmethod + def from_jpd( + jpd: JobProvisioningData, jrd: Optional[JobRuntimeData] = None + ) -> "InstanceConnectionKey": + assert jpd.hostname is not None and jpd.ssh_port is not None + container_to_host_port_map = InstanceConnection.get_container_to_host_port_map(jpd, jrd) + return InstanceConnectionKey( + hostname=jpd.hostname, + port=jpd.ssh_port, + ports_to_forward=tuple(container_to_host_port_map.values()), + ) + + +# InstanceConnectionPool has sync interface because runner/shim clients and all the callers are sync. +# TODO: Consider moving all of them to async for consistency with other pools/clients. +class InstanceConnectionPool: + """ + A pool of SSH connections to instances' host sshd (VM-based) + or runner sshd (container-based) for forwarding shim and runner ports. + + NOTE: The pool is not currently intended for arbitrary ports forwarding, only for shim and runner ports. + E.g. it cannot be used to forward services ports for probes or router-worker communication. + This simplified model allows forwarding the same ports for the given host:port and reusing the connection across all calls. + TODO: Generalize to support arbitrary ports forwarding incl. job's ports. 
+ + Incompatible with multiple server processes sharing the same server dir: + connection dirs and control sockets are assumed to be owned by a single process. + """ + + def __init__(self): + self._connections: dict[InstanceConnectionKey, InstanceConnection] = {} + # Use `WeakValueDictionary` for automatic GC of unused locks and avoid manual refcounting. + # A lock is expected to exist only while a thread holds a strong reference to it. + self._access_locks: WeakValueDictionary[InstanceConnectionKey, threading.Lock] = ( + WeakValueDictionary() + ) + self._access_locks_lock = threading.Lock() + self._closed = False + + def get_or_open( + self, + ssh_private_key: PrivateKeyOrPair, + jpd: JobProvisioningData, + jrd: Optional[JobRuntimeData], + ) -> Optional["InstanceConnection"]: + """ + Starts a new SSH connection or returns an existing one. + Existing connections are checked for health periodically + so that subsequent calls to `get_or_open()` eventually return a healthy connection. + """ + key = InstanceConnectionKey.from_jpd(jpd, jrd) + lock = self._get_access_lock(key) + with lock: + if self._closed: + return None + conn = self._connections.get(key) + if conn is not None: + if conn.is_alive(): + return conn + # The master process is gone — evict and reopen. 
+ logger.debug("Instance connection %s is dead, reopening", key) + self._connections.pop(key) + try: + conn.close() + except Exception: + logger.exception("Failed to close instance connection %s", key) + try: + conn = InstanceConnection(ssh_private_key, jpd, jrd) + conn.open() + except SSHError: + # error logged in tunnel + return None + self._connections[key] = conn + return conn + + def drop(self, key: InstanceConnectionKey) -> None: + lock = self._get_access_lock(key) + with lock: + try: + conn = self._connections.pop(key) + except KeyError: + return + try: + conn.close() + except Exception: + logger.exception("Failed to close instance connection %s", key) + + def drop_by_jpd(self, jpd: JobProvisioningData, jrd: Optional[JobRuntimeData] = None): + if jpd.hostname is None or jpd.ssh_port is None: + return + key = InstanceConnectionKey.from_jpd(jpd, jrd) + self.drop(key) + + def startup_cleanup(self) -> None: + """ + Removes connection dirs left by a previous server process (e.g. after SIGKILL). + Must be called on server startup before the pool is used. + Leftover live masters are reaped by `ControlPersist`. + """ + shutil.rmtree(CONNECTIONS_DIR, ignore_errors=True) + + def close_all(self) -> None: + """ + Closes all connections and prevents new ones from being opened. + Safe to call concurrently with in-flight `get_or_open()` calls. + `get_or_open()` will return `None` after `close_all()`. + """ + with self._access_locks_lock: + self._closed = True + # self._connections holds cached connections, and + # self._access_locks may hold mid-open connections not yet cached. 
+ keys = set(self._connections) | set(self._access_locks.keys()) + logger.debug("Closing %d instance connection(s)", len(keys)) + for key in keys: + self.drop(key) + + def _get_access_lock(self, key: InstanceConnectionKey) -> threading.Lock: + with self._access_locks_lock: + lock = self._access_locks.get(key) + if lock is not None: + return lock + lock = threading.Lock() + self._access_locks[key] = lock + return lock + + +instance_connection_pool = InstanceConnectionPool() + + +class InstanceConnection: + """ + An SSH connection to instance's host sshd (VM-based) + or runner sshd (container-based) for forwarding shim and runner ports. + + The same control socket is used for all connections to the same hostname:port, + unless jrd overrides the runner port mapped on host (blocks case). + In case of blocks, each job establishes a separate connection with a different runner port forwarded. + TODO: Re-use the same SSH connection for all blocks via `-O forward`/`-O cancel`. + """ + + def __init__( + self, + ssh_private_key: PrivateKeyOrPair, + jpd: JobProvisioningData, + jrd: Optional[JobRuntimeData], + ephemeral: bool = False, + ) -> None: + """ + Args: + ephemeral: Creates a unique tmp dir for the UDS. Use when connection re-use is not needed. 
+ """ + self._key = InstanceConnectionKey.from_jpd(jpd, jrd) + self._ephemeral = ephemeral + self._last_verified_at: float = 0.0 + self._temp_dir, self._effective_conn_dir, self._real_conn_dir = ( + InstanceConnection._resolve_conn_dir(self._key, ephemeral) + ) + self._control_socket_path = self._effective_conn_dir / "control.sock" + self._real_control_socket_path = self._real_conn_dir / "control.sock" + self._container_to_host_port_map = InstanceConnection.get_container_to_host_port_map( + jpd, jrd + ) + self._host_port_to_uds_map = InstanceConnection._get_host_port_to_uds_map( + conn_dir=self._effective_conn_dir, + ports_to_forward=self._key.ports_to_forward, + ) + self._tunnel = SSHTunnel( + destination=f"{jpd.username}@{jpd.hostname}", + port=jpd.ssh_port, + identity=InstanceConnection._get_identity(ssh_private_key), + control_sock_path=self._control_socket_path, + forwarded_sockets=self._get_forwarded_sockets(self._host_port_to_uds_map), + ssh_proxies=InstanceConnection._get_proxies(ssh_private_key, jpd), + options={ + **SSH_DEFAULT_OPTIONS, + "ConnectTimeout": str(SERVER_SSH_CONNECT_TIMEOUT), + # Auto-close half-opened connections (the instance not responding). + "ServerAliveInterval": "10", + "ServerAliveCountMax": "3", + # Set ControlPersist to auto-close orphaned background ssh process + # in case dstack server shutdown is not graceful. + "ControlPersist": "2m", + }, + batch_mode=True, + ) + + def open(self) -> None: + # A control socket left by a killed master or by a master that exited after + # its tmp symlink was deleted prevents ssh from becoming a mux master + # ("ControlSocket ... already exists, disabling multiplexing"). + # Remove it unless it's served by a live master (then open() attaches to it). 
+ if self._real_control_socket_path.exists() and not self._tunnel.check(): + self._real_control_socket_path.unlink(missing_ok=True) + self._tunnel.open() + self._last_verified_at = time.monotonic() + + def is_alive(self) -> bool: + """ + Verifies that the connection's SSH master process is alive: + + 1. The control socket exists (a stat). Catches cleanly exited masters (incl. ControlPersist). + 2. `ssh -O check`. Catches killed masters that left a stale socket file behind. + Rate-limited to once per `MIN_ALIVE_CHECK_INTERVAL`. + + Does not detect half-open TCP (ServerAliveInterval converts it into a clean exit) + or mid-request deaths (handled by the callers' drop-on-error pattern). + """ + if not self._control_socket_path.exists(): + return False + now = time.monotonic() + if now - self._last_verified_at < MIN_ALIVE_CHECK_INTERVAL: + return True + if not self._tunnel.check(): + return False + # Keep the symlink fresh so that age-based /tmp cleanup is less likely to remove it. + try: + os.utime(self._effective_conn_dir, follow_symlinks=False) + except OSError: + pass + self._last_verified_at = now + return True + + def forwarded_paths(self) -> dict[int, Path]: + """Returns a mapping from container port to the local UDS path.""" + return { + container_port: self._host_port_to_uds_map[host_port] + for container_port, host_port in self._container_to_host_port_map.items() + } + + def close(self) -> None: + self._tunnel.close() + # Remove a stale control.sock left by a killed master, forwarded UDS files + # (ssh does not unlink them on exit), and the dir itself, so that + # CONNECTIONS_DIR does not accumulate dirs of gone instances. + # A master that survives close() because it is unreachable via a deleted + # symlink is reaped by ControlPersist. 
+ shutil.rmtree(self._real_conn_dir, ignore_errors=True) + + @property + def key(self) -> InstanceConnectionKey: + return self._key + + @staticmethod + def get_container_to_host_port_map( + jpd: JobProvisioningData, + jrd: Optional[JobRuntimeData], + ) -> dict[int, int]: + runner_host_port = DSTACK_RUNNER_HTTP_PORT + if jrd is not None and jrd.ports is not None: + runner_host_port = jrd.ports.get(DSTACK_RUNNER_HTTP_PORT, runner_host_port) + port_map = {DSTACK_RUNNER_HTTP_PORT: runner_host_port} + if jpd.dockerized: + port_map[DSTACK_SHIM_HTTP_PORT] = DSTACK_SHIM_HTTP_PORT + return port_map + + @staticmethod + def _resolve_conn_dir( + key: InstanceConnectionKey, ephemeral: bool + ) -> tuple[TemporaryDirectory, Path, Path]: + """ + Returns (temp dir to retain, dir to be used by ssh, real conn dir). + """ + if ephemeral: + temp_dir = TemporaryDirectory() + path = Path(temp_dir.name) + return temp_dir, path, path + + conn_dir = ( + CONNECTIONS_DIR + / f"{key.hostname}:{key.port},{','.join(map(str, key.ports_to_forward))}" + ) + conn_dir.mkdir(parents=True, exist_ok=True) + # Connection_dir can have a long path that won't be accepted by the ssh command, + # so we create a short temporary symlink. + # The symlink may be removed by age-based /tmp cleanup while the connection is still alive. + # The connection dir will be removed and the connection is re-opened. 
+ temp_dir, conn_symlink_dir = make_tmp_symlink_to_dir( + dirpath=conn_dir, + symlink_dirname="connection", + ) + return temp_dir, conn_symlink_dir, conn_dir + + @staticmethod + def _get_host_port_to_uds_map( + conn_dir: Path, + ports_to_forward: Collection[int], + ) -> dict[int, Path]: + return {port: conn_dir / f"{port}.sock" for port in ports_to_forward} + + @staticmethod + def _get_forwarded_sockets(host_port_to_uds_map: dict[int, Path]) -> list[SocketPair]: + return [ + SocketPair( + local=UnixSocket(path=path), + remote=IPSocket(host="localhost", port=port), + ) + for port, path in host_port_to_uds_map.items() + ] + + @staticmethod + def _get_identity(ssh_private_key: PrivateKeyOrPair) -> FileContent: + if isinstance(ssh_private_key, tuple): + ssh_private_key, _ = ssh_private_key + return FileContent(ssh_private_key) + + @staticmethod + def _get_proxies( + ssh_private_key: PrivateKeyOrPair, jpd: JobProvisioningData + ) -> list[tuple[SSHConnectionParams, FileContent]]: + if jpd.ssh_proxy is None: + return [] + + if isinstance(ssh_private_key, str): + ssh_proxy_private_key = ssh_private_key + else: + ssh_proxy_private_key = ssh_private_key[1] + if ssh_proxy_private_key is None: + # In case proxy key is None, fallback to main key (k8s case). 
+ ssh_proxy_private_key = ssh_private_key[0] + + proxy_identity = FileContent(ssh_proxy_private_key) + return [(jpd.ssh_proxy, proxy_identity)] diff --git a/src/dstack/_internal/server/services/runner/ssh.py b/src/dstack/_internal/server/services/runner/ssh.py index fe6e55a08f..a7009df468 100644 --- a/src/dstack/_internal/server/services/runner/ssh.py +++ b/src/dstack/_internal/server/services/runner/ssh.py @@ -1,95 +1,104 @@ import functools -import inspect -import socket -import time -from typing import Callable, Dict, List, Optional +from collections.abc import Mapping +from typing import Callable, Literal, Optional, TypeVar, Union import requests from typing_extensions import Concatenate, ParamSpec -from dstack._internal.core.errors import SSHError -from dstack._internal.core.models.runs import JobProvisioningData -from dstack._internal.core.services.ssh.tunnel import RunnerTunnel -from dstack._internal.server.services.runner import client -from dstack._internal.server.settings import LOCAL_BACKEND_ENABLED -from dstack._internal.utils.logging import get_logger +from dstack._internal.core.errors import DstackError, SSHError +from dstack._internal.core.models.runs import JobProvisioningData, JobRuntimeData +from dstack._internal.server import settings +from dstack._internal.server.services.runner.client import LocalAddress +from dstack._internal.server.services.runner.pool import ( + InstanceConnection, + PrivateKeyOrPair, + instance_connection_pool, +) -logger = get_logger(__name__) P = ParamSpec("P") +R = TypeVar("R") def runner_ssh_tunnel( - ports: List[int], retries: int = 3, retry_interval: float = 1 -) -> Callable[[Callable[P, bool]], Callable[Concatenate[str, JobProvisioningData, P], bool]]: - def decorator( - func: Callable[P, bool], - ) -> Callable[Concatenate[str, JobProvisioningData, P], bool]: - @functools.wraps(func) - def wrapper( - ssh_private_key: str, - job_provisioning_data: JobProvisioningData, - *args: P.args, - **kwargs: P.kwargs, - ) -> 
bool: - """ - Returns: - is successful - """ + func: Callable[Concatenate[Mapping[int, LocalAddress], P], R], +) -> Callable[ + Concatenate[PrivateKeyOrPair, JobProvisioningData, Optional[JobRuntimeData], P], + Union[Literal[False], R], +]: + """ + A decorator that opens an SSH tunnel to the runner instance for port forwarding. - if LOCAL_BACKEND_ENABLED: - # without SSH - port_map = {p: p for p in ports} - return func(*args, ports=port_map, **kwargs) + Forwarded ports: + * VM-based backends: forward the shim and runner ports. + * Container-based backends: forward only the runner port. + * `jrd.ports` may remap the runner port (blocks case). - func_kwargs_names = [ - p.name - for p in inspect.signature(func).parameters.values() - if p.kind == p.KEYWORD_ONLY - ] - ssh_kwargs = {} - if "ssh_private_key" in func_kwargs_names: - ssh_kwargs["ssh_private_key"] = ssh_private_key - if "job_provisioning_data" in func_kwargs_names: - ssh_kwargs["job_provisioning_data"] = job_provisioning_data + Always forwards the same ports for the given instance/job so that connection is reused across all calls. + In case of blocks, each job uses a separate connection as the runner host port differs. - for attempt in range(retries): - last = attempt == retries - 1 - try: - with RunnerTunnel( - hostname=job_provisioning_data.hostname, - ssh_port=job_provisioning_data.ssh_port, - user=job_provisioning_data.username, - ports=get_runner_ports(ports=ports), - id_rsa=ssh_private_key, - ssh_proxy=job_provisioning_data.ssh_proxy, - ) as tun: - return func(*args, ports=tun.ports, **ssh_kwargs, **kwargs) - except SSHError: - pass # error is logged in the tunnel - except requests.RequestException as e: - if last: - logger.debug( - "Cannot connect to %s's API: %s", job_provisioning_data.hostname, e - ) - if not last: - time.sleep(retry_interval) - return False + There are no retries: a transient transport failure fails the call, + and the callers must retry. 
In high-latency setups, tune `DSTACK_SERVER_SSH_CONNECT_TIMEOUT`. + """ - return wrapper + @functools.wraps(func) + def wrapper( + ssh_private_key: PrivateKeyOrPair, + job_provisioning_data: JobProvisioningData, + job_runtime_data: Optional[JobRuntimeData], + *args: P.args, + **kwargs: P.kwargs, + ) -> Union[Literal[False], R]: + """ + Returns: + is successful + """ + if job_provisioning_data.hostname is None or job_provisioning_data.ssh_port is None: + # The callers may try to establish tunnels even if hostname/ssh_port is missing + # and rely on `False` being returned in this case. + return False - return decorator + if not settings.SERVER_SSH_POOL_ENABLED or not job_provisioning_data.dockerized: + # Connections from dstack-server to runner's sshd are expected to be short + # as the `inactivity_duration` feature distinguishes user and server connections based on duration. + # Do not re-use SSH connections for container-based backends. + # TODO: Drop `inactivity_duration` dependence on connection duration and re-use connections. + try: + conn = InstanceConnection( + ssh_private_key=ssh_private_key, + jpd=job_provisioning_data, + jrd=job_runtime_data, + ephemeral=True, + ) + conn.open() + except SSHError: + return False + try: + return func(conn.forwarded_paths(), *args, **kwargs) + except (DstackError, requests.RequestException): + return False + finally: + conn.close() + # First try a cached connection and, if it's dead, a new connection. + # Connections already cover against + # a) cleanly-exited master (ControlPersist reap); and + # b) stale control socket file left by killed master. + # (Because we cannot rely solely on connection errors from `func` – it may swallow the errors.) + # but we still want a fast retry in case master dies mid-request. 
+ for _ in range(2): + conn = instance_connection_pool.get_or_open( + ssh_private_key=ssh_private_key, + jpd=job_provisioning_data, + jrd=job_runtime_data, + ) + if conn is None: + return False # couldn't establish at all + try: + return func(conn.forwarded_paths(), *args, **kwargs) + except (SSHError, requests.ConnectionError): + instance_connection_pool.drop(conn.key) # dead ssh connection, re-open + except (DstackError, requests.RequestException): + return False # reached runner, app-level fail; don't re-open ssh connection + return False -def get_runner_ports(ports: Optional[List[int]] = None) -> Dict[int, int]: - ports = ports or [client.REMOTE_RUNNER_PORT] - sockets = [] - try: - for port in ports: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(("localhost", 0)) # Bind to a free port provided by the host - sockets.append((port, s)) - return {port: s.getsockname()[1] for port, s in sockets} - finally: - for _, s in sockets: - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - s.close() + return wrapper diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py deleted file mode 100644 index fd266ca935..0000000000 --- a/src/dstack/_internal/server/services/runs.py +++ /dev/null @@ -1,1029 +0,0 @@ -import asyncio -import itertools -import math -import uuid -from datetime import datetime, timezone -from typing import List, Optional, Set, Tuple - -import pydantic -from sqlalchemy import and_, or_, select, update -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -import dstack._internal.server.services.gateways as gateways -import dstack._internal.utils.common as common_utils -from dstack._internal.core.backends import ( - BACKENDS_WITH_CREATE_INSTANCE_SUPPORT, - BACKENDS_WITH_MULTINODE_SUPPORT, -) -from dstack._internal.core.backends.base import Backend -from dstack._internal.core.errors import ( - RepoDoesNotExistError, - ResourceNotExistsError, - 
ServerClientError, -) -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - DockerConfig, - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, - SSHKey, -) -from dstack._internal.core.models.pools import Instance -from dstack._internal.core.models.profiles import ( - DEFAULT_POOL_TERMINATION_IDLE_TIME, - CreationPolicy, - Profile, - SpotPolicy, - TerminationPolicy, -) -from dstack._internal.core.models.runs import ( - InstanceStatus, - Job, - JobPlan, - JobProvisioningData, - JobSpec, - JobStatus, - JobSubmission, - JobTerminationReason, - Requirements, - Run, - RunPlan, - RunSpec, - RunStatus, - RunTerminationReason, - ServiceSpec, - get_policy_map, -) -from dstack._internal.core.models.users import GlobalRole -from dstack._internal.core.models.volumes import Volume, VolumeStatus -from dstack._internal.core.services import validate_dstack_resource_name -from dstack._internal.server.models import ( - InstanceModel, - JobModel, - PoolModel, - ProjectModel, - RepoModel, - RunModel, - UserModel, - VolumeModel, -) -from dstack._internal.server.services import backends as backends_services -from dstack._internal.server.services import pools as pools_services -from dstack._internal.server.services import repos as repos_services -from dstack._internal.server.services import volumes as volumes_services -from dstack._internal.server.services.docker import is_valid_docker_volume_target, parse_image_name -from dstack._internal.server.services.jobs import ( - RUNNING_PROCESSING_JOBS_IDS, - RUNNING_PROCESSING_JOBS_LOCK, - SUBMITTED_PROCESSING_JOBS_IDS, - SUBMITTED_PROCESSING_JOBS_LOCK, - TERMINATING_PROCESSING_JOBS_IDS, - TERMINATING_PROCESSING_JOBS_LOCK, - get_jobs_from_run_spec, - group_jobs_by_replica_latest, - job_model_to_job_submission, - process_terminating_job, - stop_runner, -) -from dstack._internal.server.services.jobs.configurators.base import ( - get_default_image, - 
get_default_python_verison, -) -from dstack._internal.server.services.logging import fmt -from dstack._internal.server.services.pools import ( - filter_pool_instances, - generate_instance_name, - get_instance_offer, - get_or_create_pool_by_name, - get_pool_instances, - instance_model_to_instance, -) -from dstack._internal.server.services.projects import list_project_models, list_user_project_models -from dstack._internal.server.services.users import get_user_model_by_name -from dstack._internal.server.utils.common import wait_to_lock, wait_unlock -from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.random_names import generate_name - -logger = get_logger(__name__) - -# Run processing task must acquire the lock and add the run id to the set. -# Run processing has higher priority than job processing. -# It means that job processing tasks should not take the job if `job.run_id` is in the set. -# But run processing tasks should wait until job processing tasks release PROCESSING_JOBS locks. 
-PROCESSING_RUNS_LOCK = asyncio.Lock() -PROCESSING_RUNS_IDS: Set[uuid.UUID] = set() - -JOB_TERMINATION_REASONS_TO_RETRY = { - JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, -} - - -async def list_user_runs( - session: AsyncSession, - user: UserModel, - project_name: Optional[str], - repo_id: Optional[str], - username: Optional[str], - only_active: bool, - prev_submitted_at: Optional[datetime], - prev_run_id: Optional[uuid.UUID], - limit: int, - ascending: bool, -) -> List[Run]: - if project_name is None and repo_id is not None: - return [] - if user.global_role == GlobalRole.ADMIN: - projects = await list_project_models(session=session) - else: - projects = await list_user_project_models(session=session, user=user) - runs_user = None - if username is not None: - runs_user = await get_user_model_by_name(session=session, username=username) - if runs_user is None: - raise ResourceNotExistsError("User not found") - repo = None - if project_name is not None: - projects = [p for p in projects if p.name == project_name] - if len(projects) == 0: - return [] - if repo_id is not None: - repo = await repos_services.get_repo_model( - session=session, - project=projects[0], - repo_id=repo_id, - ) - if repo is None: - raise RepoDoesNotExistError.with_id(repo_id) - run_models = await list_projects_run_models( - session=session, - projects=projects, - repo=repo, - runs_user=runs_user, - only_active=only_active, - prev_submitted_at=prev_submitted_at, - prev_run_id=prev_run_id, - limit=limit, - ascending=ascending, - ) - runs = [] - for r in run_models: - try: - runs.append(run_model_to_run(r, return_in_api=True)) - except pydantic.ValidationError: - pass - if len(run_models) > len(runs): - logger.debug("Can't load %s runs", len(run_models) - len(runs)) - return runs - - -async def list_projects_run_models( - session: AsyncSession, - projects: List[ProjectModel], - repo: Optional[RepoModel], - runs_user: 
Optional[UserModel], - only_active: bool, - prev_submitted_at: Optional[datetime], - prev_run_id: Optional[uuid.UUID], - limit: int, - ascending: bool, -) -> List[RunModel]: - filters = [RunModel.deleted == False, RunModel.project_id.in_(p.id for p in projects)] - if repo is not None: - filters.append(RunModel.repo_id == repo.id) - if runs_user is not None: - filters.append(RunModel.user_id == runs_user.id) - if only_active: - filters.append(RunModel.status.not_in(RunStatus.finished_statuses())) - if prev_submitted_at is not None: - if ascending: - if prev_run_id is None: - filters.append(RunModel.submitted_at > prev_submitted_at) - else: - filters.append( - or_( - RunModel.submitted_at > prev_submitted_at, - and_( - RunModel.submitted_at == prev_submitted_at, RunModel.id < prev_run_id - ), - ) - ) - else: - if prev_run_id is None: - filters.append(RunModel.submitted_at < prev_submitted_at) - else: - filters.append( - or_( - RunModel.submitted_at < prev_submitted_at, - and_( - RunModel.submitted_at == prev_submitted_at, RunModel.id > prev_run_id - ), - ) - ) - order_by = (RunModel.submitted_at.desc(), RunModel.id) - if ascending: - order_by = (RunModel.submitted_at.asc(), RunModel.id.desc()) - - res = await session.execute( - select(RunModel) - .where(*filters) - .order_by(*order_by) - .limit(limit) - .options(joinedload(RunModel.user)) - ) - run_models = list(res.scalars().all()) - return run_models - - -async def get_run( - session: AsyncSession, - project: ProjectModel, - run_name: str, -) -> Optional[Run]: - res = await session.execute( - select(RunModel) - .where( - RunModel.project_id == project.id, - RunModel.run_name == run_name, - RunModel.deleted == False, - ) - .options(joinedload(RunModel.user)) - ) - run_model = res.scalar() - if run_model is None: - return None - return run_model_to_run(run_model, return_in_api=True) - - -async def get_run_plan( - session: AsyncSession, - project: ProjectModel, - user: UserModel, - run_spec: RunSpec, -) -> RunPlan: - 
_validate_run_spec(run_spec) - - profile = run_spec.merged_profile - creation_policy = profile.creation_policy - - # TODO(egor-s): do we need to generate all replicas here? - jobs = await get_jobs_from_run_spec(run_spec, replica_num=0) - - volumes = await get_run_volumes( - session=session, - project=project, - run_spec=run_spec, - ) - - pool = await get_or_create_pool_by_name( - session=session, project=project, pool_name=profile.pool_name - ) - pool_offers = _get_pool_offers( - pool=pool, - run_spec=run_spec, - job=jobs[0], - volumes=volumes, - ) - run_name = run_spec.run_name # preserve run_name - run_spec.run_name = "dry-run" # will regenerate jobs on submission - - # Get offers once for all jobs - offers = [] - if creation_policy == CreationPolicy.REUSE_OR_CREATE: - offers = await get_offers_by_requirements( - project=project, - profile=profile, - requirements=jobs[0].job_spec.requirements, - exclude_not_available=False, - multinode=jobs[0].job_spec.jobs_per_replica > 1, - volumes=volumes, - ) - - job_plans = [] - for job in jobs: - job_offers: List[InstanceOfferWithAvailability] = [] - job_offers.extend(pool_offers) - job_offers.extend(offer for _, offer in offers) - - # TODO(egor-s): merge job_offers and pool_offers based on (availability, use/create, price) - job_plan = JobPlan( - job_spec=job.job_spec, - offers=job_offers[:50], - total_offers=len(job_offers), - max_price=max((offer.price for offer in job_offers), default=None), - ) - job_plans.append(job_plan) - - run_spec.profile.pool_name = pool.name # write pool name back for the client - run_spec.run_name = run_name # restore run_name - run_plan = RunPlan( - project_name=project.name, - user=user.name, - run_spec=run_spec, - job_plans=job_plans, - ) - return run_plan - - -async def get_offers_by_requirements( - project: ProjectModel, - profile: Profile, - requirements: Requirements, - exclude_not_available=False, - multinode: bool = False, - master_job_provisioning_data: Optional[JobProvisioningData] = 
None, - volumes: Optional[List[Volume]] = None, -) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: - backends: List[Backend] = await backends_services.get_project_backends(project=project) - - # For backward-compatibility to show offers if users set `backends: [dstack]` - if ( - profile.backends is not None - and len(profile.backends) == 1 - and BackendType.DSTACK in profile.backends - ): - profile.backends = None - - backend_types = profile.backends - regions = profile.regions - - if volumes: - volume = volumes[0] - backend_types = [volume.configuration.backend] - regions = [volume.configuration.region] - - if multinode: - if not backend_types: - backend_types = BACKENDS_WITH_MULTINODE_SUPPORT - backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT] - - # For multi-node, restrict backend and region. - # The default behavior is to provision all nodes in the same backend and region. - if master_job_provisioning_data is not None: - if not backend_types: - backend_types = [master_job_provisioning_data.backend] - if not regions: - regions = [master_job_provisioning_data.region] - backend_types = [b for b in backend_types if b == master_job_provisioning_data.backend] - regions = [b for b in backend_types if b == master_job_provisioning_data.region] - - if backend_types is not None: - backends = [b for b in backends if b.TYPE in backend_types or b.TYPE == BackendType.DSTACK] - - offers = await backends_services.get_instance_offers( - backends=backends, - requirements=requirements, - exclude_not_available=exclude_not_available, - ) - - # Filter offers again for backends since a backend - # can return offers of different backend types (e.g. BackendType.DSTACK). - # The first filter should remain as an optimization. 
- if backend_types is not None: - offers = [(b, o) for b, o in offers if o.backend in backend_types] - - if regions is not None: - offers = [(b, o) for b, o in offers if o.region in regions] - - if profile.instance_types is not None: - offers = [(b, o) for b, o in offers if o.instance.name in profile.instance_types] - - return offers - - -async def submit_run( - session: AsyncSession, - user: UserModel, - project: ProjectModel, - run_spec: RunSpec, -) -> Run: - _validate_run_spec(run_spec) - - repo = await repos_services.get_repo_model( - session=session, - project=project, - repo_id=run_spec.repo_id, - ) - if repo is None: - raise RepoDoesNotExistError.with_id(run_spec.repo_id) - - if run_spec.run_name is None: - run_spec.run_name = await _generate_run_name( - session=session, - project=project, - ) - else: - await delete_runs(session=session, project=project, runs_names=[run_spec.run_name]) - - await validate_run( - session=session, - user=user, - project=project, - run_spec=run_spec, - ) - - submitted_at = common_utils.get_current_datetime() - run_model = RunModel( - id=uuid.uuid4(), - project_id=project.id, - project=project, - repo_id=repo.id, - user_id=user.id, - run_name=run_spec.run_name, - submitted_at=submitted_at, - status=RunStatus.SUBMITTED, - run_spec=run_spec.json(), - last_processed_at=submitted_at, - ) - session.add(run_model) - - replicas = 1 - if run_spec.configuration.type == "service": - replicas = run_spec.configuration.replicas.min - await gateways.register_service(session, run_model) - - for replica_num in range(replicas): - jobs = await get_jobs_from_run_spec(run_spec, replica_num=replica_num) - for job in jobs: - job_model = create_job_model_for_new_submission( - run_model=run_model, - job=job, - status=JobStatus.SUBMITTED, - ) - session.add(job_model) - await session.commit() - await session.refresh(run_model) - - run = run_model_to_run(run_model, return_in_api=True) - return run - - -def create_job_model_for_new_submission( - run_model: 
RunModel, - job: Job, - status: JobStatus, -) -> JobModel: - now = common_utils.get_current_datetime() - return JobModel( - id=uuid.uuid4(), - project_id=run_model.project_id, - run_id=run_model.id, - run_name=run_model.run_name, - job_num=job.job_spec.job_num, - job_name=f"{job.job_spec.job_name}", - replica_num=job.job_spec.replica_num, - submission_num=len(job.job_submissions), - submitted_at=now, - last_processed_at=now, - status=status, - termination_reason=None, - job_spec_data=job.job_spec.json(), - job_provisioning_data=None, - ) - - -async def stop_runs( - session: AsyncSession, - project: ProjectModel, - runs_names: List[str], - abort: bool, -): - """ - If abort is False, jobs receive a signal to stop and run status will be changed as a reaction to jobs status change. - If abort is True, run is marked as TERMINATED and process_runs will stop the jobs. - """ - res = await session.execute( - select(RunModel).where( - RunModel.project_id == project.id, - RunModel.run_name.in_(runs_names), - RunModel.status.not_in(RunStatus.finished_statuses()), - ) - ) - runs = res.scalars().all() - # TODO(egor-s): consider raising an exception if no runs found - # FIXME: not safe to share session between tasks – sqlalchemy can error - await asyncio.gather(*(stop_run(session, run, abort) for run in runs)) - - -async def stop_run(session: AsyncSession, run: RunModel, abort: bool): - await wait_to_lock(PROCESSING_RUNS_LOCK, PROCESSING_RUNS_IDS, run.id) - - try: - await session.refresh(run) - if run.status.is_finished(): - return - - run.status = RunStatus.TERMINATING - if abort: - run.termination_reason = RunTerminationReason.ABORTED_BY_USER - else: - run.termination_reason = RunTerminationReason.STOPPED_BY_USER - await session.commit() # run will be refreshed later - # process the run out of turn - logger.debug("%s: terminating because %s", fmt(run), run.termination_reason.name) - await process_terminating_run(session, run) - - run.last_processed_at = 
common_utils.get_current_datetime() - await session.commit() - finally: - PROCESSING_RUNS_IDS.remove(run.id) - - -async def delete_runs( - session: AsyncSession, - project: ProjectModel, - runs_names: List[str], -): - res = await session.execute( - select(RunModel).where( - RunModel.project_id == project.id, RunModel.run_name.in_(runs_names) - ) - ) - run_models = res.scalars().all() - active_runs = [r for r in run_models if not r.status.is_finished()] - if len(active_runs) > 0: - raise ServerClientError( - msg=f"Cannot delete active runs: {[r.run_name for r in active_runs]}" - ) - await session.execute( - update(RunModel) - .where( - RunModel.project_id == project.id, - RunModel.run_name.in_(runs_names), - ) - .values(deleted=True) - ) - await session.commit() - - -async def get_create_instance_offers( - project: ProjectModel, - profile: Profile, - requirements: Requirements, - exclude_not_available=False, -) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: - offers = await get_offers_by_requirements( - project=project, - profile=profile, - requirements=requirements, - exclude_not_available=exclude_not_available, - ) - offers = [ - (backend, offer) - for backend, offer in offers - if backend.TYPE in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT - ] - return offers - - -async def create_instance( - session: AsyncSession, - project: ProjectModel, - user: UserModel, - profile: Profile, - requirements: Requirements, -) -> Instance: - offers = await get_create_instance_offers( - project=project, - profile=profile, - requirements=requirements, - exclude_not_available=True, - ) - - # Raise error if no backends suppport create_instance - backend_types = set((backend.TYPE for backend, _ in offers)) - if all( - (backend_type not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT) - for backend_type in backend_types - ): - backends = ", ".join(sorted(backend_types)) - raise ServerClientError( - f"Backends {backends} do not support create_instance. Try to select other backends." 
- ) - - if not offers: - raise ServerClientError( - "Failed to find offers to create the instance." - ) # TODO(sergeyme): ComputeError? - - pool = await pools_services.get_or_create_pool_by_name(session, project, profile.pool_name) - instance_name = await generate_instance_name( - session=session, project=project, pool_name=pool.name - ) - - termination_policy = profile.termination_policy or TerminationPolicy.DESTROY_AFTER_IDLE - termination_idle_time = profile.termination_idle_time - if termination_idle_time is None: - termination_idle_time = DEFAULT_POOL_TERMINATION_IDLE_TIME - - instance = InstanceModel( - id=uuid.uuid4(), - name=instance_name, - project=project, - pool=pool, - created_at=common_utils.get_current_datetime(), - status=InstanceStatus.PENDING, - unreachable=False, - profile=profile.json(), - requirements=requirements.json(), - instance_configuration=None, - termination_policy=termination_policy, - termination_idle_time=termination_idle_time, - ) - logger.info( - "Added a new instance %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.PENDING.value, - }, - ) - session.add(instance) - await session.commit() - - project_ssh_key = SSHKey( - public=project.ssh_public_key.strip(), - private=project.ssh_private_key.strip(), - ) - dstack_default_image = parse_image_name(get_default_image(get_default_python_verison())) - instance_config = InstanceConfiguration( - project_name=project.name, - instance_name=instance_name, - instance_id=str(instance.id), - ssh_keys=[project_ssh_key], - job_docker_config=DockerConfig( - image=dstack_default_image, - registry_auth=None, - ), - user=user.name, - ) - instance.instance_configuration = instance_config.json() - await session.commit() - - return instance_model_to_instance(instance) - - -def run_model_to_run( - run_model: RunModel, include_job_submissions: bool = True, return_in_api: bool = False -) -> Run: - jobs: List[Job] = [] - run_jobs = sorted(run_model.jobs, 
key=lambda j: (j.replica_num, j.job_num, j.submission_num)) - for replica_num, replica_submissions in itertools.groupby( - run_jobs, key=lambda j: j.replica_num - ): - for job_num, job_submissions in itertools.groupby( - replica_submissions, key=lambda j: j.job_num - ): - job_spec = None - submissions = [] - for job_model in job_submissions: - if job_spec is None: - job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data) - if include_job_submissions: - job_submission = job_model_to_job_submission(job_model) - if return_in_api: - # Set default non-None values for 0.18 backward-compatibility - # Remove in 0.19 - if job_submission.job_provisioning_data is not None: - if job_submission.job_provisioning_data.hostname is None: - job_submission.job_provisioning_data.hostname = "" - if job_submission.job_provisioning_data.ssh_port is None: - job_submission.job_provisioning_data.ssh_port = 22 - submissions.append(job_submission) - if job_spec is not None: - jobs.append(Job(job_spec=job_spec, job_submissions=submissions)) - - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - - latest_job_submission = None - if include_job_submissions: - # TODO(egor-s): does it make sense with replicas and multi-node? 
- if jobs: - latest_job_submission = jobs[0].job_submissions[-1] - - service_spec = None - if run_model.service_spec is not None: - service_spec = ServiceSpec.__response__.parse_raw(run_model.service_spec) - - run = Run( - id=run_model.id, - project_name=run_model.project.name, - user=run_model.user.name, - submitted_at=run_model.submitted_at.replace(tzinfo=timezone.utc), - last_processed_at=run_model.last_processed_at.replace(tzinfo=timezone.utc), - status=run_model.status, - termination_reason=run_model.termination_reason, - run_spec=run_spec, - jobs=jobs, - latest_job_submission=latest_job_submission, - service=service_spec, - ) - run.cost = _get_run_cost(run) - return run - - -_PROJECTS_TO_RUN_NAMES_LOCK = {} - - -def _get_pool_offers( - pool: PoolModel, - run_spec: RunSpec, - job: Job, - volumes: List[Volume], -) -> List[InstanceOfferWithAvailability]: - profile = run_spec.merged_profile - requirements = Requirements( - resources=run_spec.configuration.resources, - max_price=profile.max_price, - spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO), - ) - pool_filtered_instances = filter_pool_instances( - pool_instances=get_pool_instances(pool), - profile=profile, - requirements=requirements, - multinode=job.job_spec.jobs_per_replica > 1, - volumes=volumes, - ) - pool_offers: List[InstanceOfferWithAvailability] = [] - for instance in pool_filtered_instances: - offer = get_instance_offer(instance) - if offer is None: - continue - offer.availability = InstanceAvailability.BUSY - if instance.status == InstanceStatus.IDLE: - offer.availability = InstanceAvailability.IDLE - if instance.unreachable: - offer.availability = InstanceAvailability.NOT_AVAILABLE - pool_offers.append(offer) - return pool_offers - - -async def _generate_run_name( - session: AsyncSession, - project: ProjectModel, -) -> str: - lock = _PROJECTS_TO_RUN_NAMES_LOCK.setdefault(project.name, asyncio.Lock()) - run_name_base = generate_name() - idx = 1 - async with lock: - while ( - await 
get_run( - session=session, - project=project, - run_name=f"{run_name_base}-{idx}", - ) - is not None - ): - idx += 1 - return f"{run_name_base}-{idx}" - - -async def validate_run( - session: AsyncSession, - user: UserModel, - project: ProjectModel, - run_spec: RunSpec, -): - volumes = await get_run_volumes( - session=session, - project=project, - run_spec=run_spec, - ) - check_can_attach_run_volumes( - run_spec=run_spec, - volumes=volumes, - ) - - -async def get_run_volumes( - session: AsyncSession, - project: ProjectModel, - run_spec: RunSpec, -) -> List[Volume]: - volume_models = await get_run_volume_models( - session=session, - project=project, - run_spec=run_spec, - ) - return [volumes_services.volume_model_to_volume(v) for v in volume_models] - - -async def get_run_volume_models( - session: AsyncSession, - project: ProjectModel, - run_spec: RunSpec, -) -> List[VolumeModel]: - if len(run_spec.configuration.volumes) == 0: - return [] - volume_models = [] - for mount_point in run_spec.configuration.volumes: - volume_model = await volumes_services.get_project_volume_model_by_name( - session=session, - project=project, - name=mount_point.name, - ) - if volume_model is None: - raise ResourceNotExistsError(f"Volume {mount_point.name} not found") - volume_models.append(volume_model) - return volume_models - - -def check_can_attach_run_volumes( - run_spec: RunSpec, - volumes: List[Volume], -): - if len(volumes) == 0: - return - # Perform basic checks if volumes can be attached. - # This is useful to show error ASAP (when user submits the run). - # If the attachment is to fail anyway, the error will be handled when proccessing submitted jobs. 
- backend = volumes[0].configuration.backend - region = volumes[0].configuration.region - for volume in volumes: - if backend != volume.configuration.backend: - raise ServerClientError("Cannot mount volumes from different backends") - if region != volume.configuration.region: - raise ServerClientError("Cannot mount volumes from different regions") - if volume.status != VolumeStatus.ACTIVE: - raise ServerClientError("Cannot mount volumes that are not active") - - -def _get_run_cost(run: Run) -> float: - run_cost = math.fsum( - _get_job_submission_cost(submission) - for job in run.jobs - for submission in job.job_submissions - ) - return round(run_cost, 4) - - -def _get_job_submission_cost(job_submission: JobSubmission) -> float: - if job_submission.job_provisioning_data is None: - return 0 - duration_hours = job_submission.duration.total_seconds() / 3600 - return job_submission.job_provisioning_data.price * duration_hours - - -def _validate_run_spec(run_spec: RunSpec): - if run_spec.run_name is not None: - validate_dstack_resource_name(run_spec.run_name) - for mount_point in run_spec.configuration.volumes: - if not is_valid_docker_volume_target(mount_point.path): - raise ServerClientError(f"Invalid volume mount path: {mount_point.path}") - if mount_point.path.startswith("/workflow"): - raise ServerClientError("Mounting volumes inside /workflow is not supported") - - -async def process_terminating_run(session: AsyncSession, run: RunModel): - """ - Used by both `process_runs` and `stop_run` to process a run that is TERMINATING. - Caller must acquire the lock on run. 
- """ - - assert run.termination_reason is not None - job_termination_reason = run.termination_reason.to_job_termination_reason() - - jobs_ids_set = {job.id for job in run.jobs} - await wait_unlock(RUNNING_PROCESSING_JOBS_LOCK, RUNNING_PROCESSING_JOBS_IDS, jobs_ids_set) - await wait_unlock(SUBMITTED_PROCESSING_JOBS_LOCK, SUBMITTED_PROCESSING_JOBS_IDS, jobs_ids_set) - await wait_unlock( - TERMINATING_PROCESSING_JOBS_LOCK, TERMINATING_PROCESSING_JOBS_IDS, jobs_ids_set - ) - await session.refresh(run) - - unfinished_jobs_count = 0 - job: JobModel - for job in run.jobs: - if job.status.is_finished(): - continue - unfinished_jobs_count += 1 - if job.status == JobStatus.TERMINATING: - # `process_terminating_jobs` will abort frozen jobs - continue - - if job.status == JobStatus.RUNNING and job_termination_reason not in { - JobTerminationReason.ABORTED_BY_USER, - JobTerminationReason.DONE_BY_RUNNER, - }: - # send a signal to stop the job gracefully - await stop_runner(session, job) - job.status = JobStatus.TERMINATING - job.termination_reason = job_termination_reason - await process_terminating_job(session, job) - if job.status.is_finished(): - unfinished_jobs_count -= 1 - job.last_processed_at = common_utils.get_current_datetime() - - if unfinished_jobs_count == 0: - if run.gateway_id is not None: - try: - await gateways.unregister_service(session, run) - except Exception as e: - logger.warning("%s: failed to unregister service: %s", fmt(run), repr(e)) - run.status = run.termination_reason.to_status() - logger.info( - "%s: run status has changed TERMINATING -> %s, reason: %s", - fmt(run), - run.status.name, - run.termination_reason.name, - ) - - -async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replicas_diff: int): - if replicas_diff == 0: - # nothing to do - return - - logger.info( - "%s: scaling %s %s replica(s)", - fmt(run_model), - "UP" if replicas_diff > 0 else "DOWN", - abs(replicas_diff), - ) - - # lists of (importance, replica_num, jobs) - 
active_replicas = [] - inactive_replicas = [] - - for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs): - statuses = set(job.status for job in replica_jobs) - if {JobStatus.TERMINATING, *JobStatus.finished_statuses()} & statuses: - # if there are any terminating or finished jobs, the replica is inactive - inactive_replicas.append((0, replica_num, replica_jobs)) - elif JobStatus.SUBMITTED in statuses: - # if there are any submitted jobs, the replica is active and has the importance of 0 - active_replicas.append((0, replica_num, replica_jobs)) - elif {JobStatus.PROVISIONING, JobStatus.PULLING} & statuses: - # if there are any provisioning or pulling jobs, the replica is active and has the importance of 1 - active_replicas.append((1, replica_num, replica_jobs)) - else: - # all jobs are running, the replica is active and has the importance of 2 - active_replicas.append((2, replica_num, replica_jobs)) - - # sort by importance (desc) and replica_num (asc) - active_replicas.sort(key=lambda r: (-r[0], r[1])) - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - - if replicas_diff < 0: - if len(active_replicas) + replicas_diff < run_spec.configuration.replicas.min: - raise ServerClientError("Can't scale down below the minimum number of replicas") - - for _, _, replica_jobs in reversed(active_replicas[-abs(replicas_diff) :]): - # scale down the less important replicas first - for job in replica_jobs: - if job.status.is_finished() or job.status == JobStatus.TERMINATING: - continue - job.status = JobStatus.TERMINATING - job.termination_reason = JobTerminationReason.SCALED_DOWN - # background task will process the job later - else: - if len(active_replicas) + replicas_diff > run_spec.configuration.replicas.max: - raise ServerClientError("Can't scale up above the maximum number of replicas") - scheduled_replicas = 0 - - # rerun inactive replicas - for _, _, replica_jobs in inactive_replicas: - if scheduled_replicas == replicas_diff: - break - 
await retry_run_replica_jobs(session, run_model, replica_jobs, only_failed=False) - scheduled_replicas += 1 - - # create new replicas - for replica_num in range( - len(active_replicas) + scheduled_replicas, len(active_replicas) + replicas_diff - ): - jobs = await get_jobs_from_run_spec(run_spec, replica_num=replica_num) - for job in jobs: - job_model = create_job_model_for_new_submission( - run_model=run_model, - job=job, - status=JobStatus.SUBMITTED, - ) - session.add(job_model) - - -async def retry_run_replica_jobs( - session: AsyncSession, run_model: RunModel, latest_jobs: List[JobModel], *, only_failed: bool -): - for job_model in latest_jobs: - if not (job_model.status.is_finished() or job_model.status == JobStatus.TERMINATING): - if only_failed: - # No need to resubmit, skip - continue - # The job is not finished, but we have to retry all jobs. Terminate it - job_model.status = JobStatus.TERMINATING - job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER - - new_job_model = create_job_model_for_new_submission( - run_model=run_model, - job=Job( - job_spec=JobSpec.__response__.parse_raw(job_model.job_spec_data), - job_submissions=[], - ), - status=JobStatus.SUBMITTED, - ) - # dirty hack to avoid passing all job submissions - new_job_model.submission_num = job_model.submission_num + 1 - session.add(new_job_model) diff --git a/src/dstack/_internal/server/services/runs/__init__.py b/src/dstack/_internal/server/services/runs/__init__.py new file mode 100644 index 0000000000..c32c3971e1 --- /dev/null +++ b/src/dstack/_internal/server/services/runs/__init__.py @@ -0,0 +1,1066 @@ +import itertools +import math +import uuid +from collections.abc import Iterable +from datetime import datetime, timezone +from typing import List, Optional + +import pydantic +from apscheduler.triggers.cron import CronTrigger +from sqlalchemy import and_, func, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, 
selectinload + +import dstack._internal.utils.common as common_utils +from dstack._internal.core.errors import ( + RepoDoesNotExistError, + ResourceNotExistsError, + ServerClientError, +) +from dstack._internal.core.models.common import ApplyAction +from dstack._internal.core.models.profiles import ( + RetryEvent, +) +from dstack._internal.core.models.runs import ( + ApplyRunPlanInput, + Job, + JobConnectionInfo, + JobStatus, + JobSubmission, + JobTerminationReason, + ProbeSpec, + Run, + RunFleet, + RunPlan, + RunSpec, + RunStatus, + RunTerminationReason, + ServiceSpec, +) +from dstack._internal.core.services.diff import format_diff_fields_for_event +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite +from dstack._internal.server.models import ( + FleetModel, + JobModel, + ProbeModel, + ProjectModel, + RepoModel, + RunModel, + UserModel, +) +from dstack._internal.server.services import events, services +from dstack._internal.server.services import repos as repos_services +from dstack._internal.server.services.jobs import ( + check_can_attach_job_volumes, + get_job_configured_volumes, + get_job_connection_info, + get_job_spec, + get_jobs_from_run_spec, + job_model_to_job_submission, + remove_job_spec_sensitive_info, +) +from dstack._internal.server.services.locking import get_locker, string_to_lock_id +from dstack._internal.server.services.pipelines import PipelineHinterProtocol +from dstack._internal.server.services.plugins import apply_plugin_policies +from dstack._internal.server.services.probes import is_probe_ready +from dstack._internal.server.services.projects import list_user_project_models +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) +from dstack._internal.server.services.runs.plan import get_job_plans +from dstack._internal.server.services.runs.service_router_worker_sync import ( + ensure_service_router_worker_sync_row, +) +from 
dstack._internal.server.services.runs.spec import ( + can_update_run_spec, + check_can_update_run_spec, + validate_run_spec_and_set_defaults, +) +from dstack._internal.server.services.secrets import get_project_secrets_mapping +from dstack._internal.server.services.users import get_user_model_by_name +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.random_names import generate_name + +logger = get_logger(__name__) + + +JOB_TERMINATION_REASONS_TO_RETRY = { + JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, +} + + +def switch_run_status( + session: AsyncSession, + run_model: RunModel, + new_status: RunStatus, + actor: events.AnyActor = events.SystemActor(), +): + """ + Switch run status. + """ + old_status = run_model.status + if old_status == new_status: + return + + run_model.status = new_status + emit_run_status_change_event( + session=session, + run_model=run_model, + old_status=old_status, + new_status=new_status, + actor=actor, + ) + + +def emit_run_status_change_event( + session: AsyncSession, + run_model: RunModel, + old_status: RunStatus, + new_status: RunStatus, + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + events.emit( + session, + get_run_status_change_message( + old_status=old_status, + new_status=new_status, + termination_reason=run_model.termination_reason, + ), + actor=actor, + targets=[events.Target.from_model(run_model)], + ) + + +def get_run_status_change_message( + old_status: RunStatus, + new_status: RunStatus, + termination_reason: Optional[RunTerminationReason], +) -> str: + msg = f"Run status changed {old_status.upper()} -> {new_status.upper()}" + if new_status == RunStatus.TERMINATING: + if termination_reason is None: + raise ValueError("termination_reason must be set when switching to TERMINATING status") + msg += f". 
Termination reason: {termination_reason.upper()}" + return msg + + +def get_run_spec(run_model: RunModel) -> RunSpec: + return RunSpec.__response__.parse_raw(run_model.run_spec) + + +async def list_user_runs( + session: AsyncSession, + user: UserModel, + project_name: Optional[str], + repo_id: Optional[str], + username: Optional[str], + only_active: bool, + include_jobs: bool, + job_submissions_limit: Optional[int], + prev_submitted_at: Optional[datetime], + prev_run_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> List[Run]: + if project_name is None and repo_id is not None: + return [] + projects = await list_user_project_models( + session=session, + user=user, + only_names=True, + ) + runs_user = None + if username is not None: + runs_user = await get_user_model_by_name(session=session, username=username) + if runs_user is None: + raise ResourceNotExistsError("User not found") + repo = None + if project_name is not None: + projects = [p for p in projects if p.name == project_name] + if len(projects) == 0: + return [] + if repo_id is not None: + repo = await repos_services.get_repo_model( + session=session, + project=projects[0], + repo_id=repo_id, + ) + if repo is None: + raise RepoDoesNotExistError.with_id(repo_id) + run_models = await list_projects_run_models( + session=session, + projects=projects, + repo=repo, + runs_user=runs_user, + only_active=only_active, + prev_submitted_at=prev_submitted_at, + prev_run_id=prev_run_id, + limit=limit, + ascending=ascending, + ) + runs = [] + for r in run_models: + try: + runs.append( + run_model_to_run( + r, + return_in_api=True, + include_jobs=include_jobs, + job_submissions_limit=job_submissions_limit, + ) + ) + except pydantic.ValidationError: + pass + if len(run_models) > len(runs): + logger.debug("Can't load %s runs", len(run_models) - len(runs)) + return runs + + +async def list_projects_run_models( + session: AsyncSession, + projects: List[ProjectModel], + repo: Optional[RepoModel], + runs_user: 
Optional[UserModel], + only_active: bool, + prev_submitted_at: Optional[datetime], + prev_run_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> List[RunModel]: + filters = [] + filters.append(RunModel.project_id.in_(p.id for p in projects)) + if repo is not None: + filters.append(RunModel.repo_id == repo.id) + if runs_user is not None: + filters.append(RunModel.user_id == runs_user.id) + if only_active: + filters.append(RunModel.status.not_in(RunStatus.finished_statuses())) + if prev_submitted_at is not None: + if ascending: + if prev_run_id is None: + filters.append(RunModel.submitted_at > prev_submitted_at) + else: + filters.append( + or_( + RunModel.submitted_at > prev_submitted_at, + and_( + RunModel.submitted_at == prev_submitted_at, RunModel.id < prev_run_id + ), + ) + ) + else: + if prev_run_id is None: + filters.append(RunModel.submitted_at < prev_submitted_at) + else: + filters.append( + or_( + RunModel.submitted_at < prev_submitted_at, + and_( + RunModel.submitted_at == prev_submitted_at, RunModel.id > prev_run_id + ), + ) + ) + order_by = (RunModel.submitted_at.desc(), RunModel.id) + if ascending: + order_by = (RunModel.submitted_at.asc(), RunModel.id.desc()) + + res = await session.execute( + select(RunModel) + .where(*filters) + .options(joinedload(RunModel.user).load_only(UserModel.name)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + .options(selectinload(RunModel.jobs).joinedload(JobModel.probes)) + .order_by(*order_by) + .limit(limit) + ) + run_models = list(res.scalars().all()) + return run_models + + +async def get_run( + session: AsyncSession, + project: ProjectModel, + run_name: Optional[str] = None, + run_id: Optional[uuid.UUID] = None, +) -> Optional[Run]: + if run_id is not None: + return await get_run_by_id( + session=session, + project=project, + run_id=run_id, + ) + elif run_name is not None: + return await get_run_by_name( + session=session, + project=project, + run_name=run_name, + ) + 
raise ServerClientError("run_name or id must be specified") + + +async def get_run_model_by_name( + session: AsyncSession, + project: ProjectModel, + run_name: str, +) -> Optional[RunModel]: + res = await session.execute( + select(RunModel) + .where( + RunModel.project_id == project.id, + RunModel.run_name == run_name, + RunModel.deleted == False, + ) + .options(joinedload(RunModel.user)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + .options(selectinload(RunModel.jobs).joinedload(JobModel.probes)) + ) + return res.scalar() + + +async def get_run_by_name( + session: AsyncSession, + project: ProjectModel, + run_name: str, +) -> Optional[Run]: + run_model = await get_run_model_by_name(session=session, project=project, run_name=run_name) + if run_model is None: + return None + return run_model_to_run(run_model, return_in_api=True, include_job_connection_info=True) + + +async def get_run_by_id( + session: AsyncSession, + project: ProjectModel, + run_id: uuid.UUID, +) -> Optional[Run]: + res = await session.execute( + select(RunModel) + .where( + RunModel.project_id == project.id, + RunModel.id == run_id, + ) + .options(joinedload(RunModel.user)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + .options(selectinload(RunModel.jobs).joinedload(JobModel.probes)) + ) + run_model = res.scalar() + if run_model is None: + return None + return run_model_to_run(run_model, return_in_api=True, include_job_connection_info=True) + + +async def get_plan( + session: AsyncSession, + project: ProjectModel, + user: UserModel, + run_spec: RunSpec, + max_offers: Optional[int], + legacy_repo_dir: bool = False, +) -> RunPlan: + # Spec must be copied by parsing to calculate merged_profile + effective_run_spec = RunSpec.parse_obj(run_spec.dict()) + effective_run_spec = await apply_plugin_policies( + user=user.name, + project=project.name, + spec=effective_run_spec, + ) + effective_run_spec = 
RunSpec.parse_obj(effective_run_spec.dict()) + validate_run_spec_and_set_defaults( + user=user, + run_spec=effective_run_spec, + legacy_repo_dir=legacy_repo_dir, + ) + profile = effective_run_spec.merged_profile + + current_resource = None + action = ApplyAction.CREATE + if effective_run_spec.run_name is not None: + current_resource = await get_run_by_name( + session=session, + project=project, + run_name=effective_run_spec.run_name, + ) + if current_resource is not None: + # For backward compatibility (current_resource may has been submitted before + # some fields, e.g., CPUSpec.arch, gpu.vendor were added) + _set_run_resources_defaults(current_resource.run_spec) + if not current_resource.status.is_finished() and can_update_run_spec( + current_resource.run_spec, effective_run_spec + ): + action = ApplyAction.UPDATE + + job_plans = await get_job_plans( + session=session, + project=project, + profile=profile, + run_spec=effective_run_spec, + max_offers=max_offers, + ) + run_plan = RunPlan( + project_name=project.name, + user=user.name, + run_spec=run_spec, + effective_run_spec=effective_run_spec, + job_plans=job_plans, + current_resource=current_resource, + action=action, + ) + return run_plan + + +async def apply_plan( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + plan: ApplyRunPlanInput, + force: bool, + pipeline_hinter: Optional[PipelineHinterProtocol] = None, + legacy_repo_dir: bool = False, +) -> Run: + run_spec = plan.run_spec + run_spec = await apply_plugin_policies( + user=user.name, + project=project.name, + spec=run_spec, + ) + # Spec must be copied by parsing to calculate merged_profile + run_spec = RunSpec.parse_obj(run_spec.dict()) + validate_run_spec_and_set_defaults( + user=user, run_spec=run_spec, legacy_repo_dir=legacy_repo_dir + ) + if run_spec.run_name is None: + return await submit_run( + session=session, + user=user, + project=project, + run_spec=run_spec, + pipeline_hinter=pipeline_hinter, + ) + current_resource_model = 
await get_run_model_by_name( + session=session, + project=project, + run_name=run_spec.run_name, + ) + if current_resource_model is None or current_resource_model.status.is_finished(): + return await submit_run( + session=session, + user=user, + project=project, + run_spec=run_spec, + pipeline_hinter=pipeline_hinter, + ) + current_resource = run_model_to_run(current_resource_model, return_in_api=True) + + # For backward compatibility (current_resource may has been submitted before + # some fields, e.g., CPUSpec.arch, gpu.vendor were added) + _set_run_resources_defaults(current_resource.run_spec) + try: + spec_diff = check_can_update_run_spec(current_resource.run_spec, run_spec) + except ServerClientError: + # The except is only needed to raise an appropriate error if run is active + if not current_resource.status.is_finished(): + raise ServerClientError("Cannot override active run. Stop the run first.") + raise + if not force: + if plan.current_resource is not None: + _set_run_resources_defaults(plan.current_resource.run_spec) + if ( + plan.current_resource is None + or plan.current_resource.id != current_resource.id + or plan.current_resource.run_spec != current_resource.run_spec + ): + raise ServerClientError( + "Failed to apply plan. Resource has been changed. Try again or use force apply." + ) + new_deployment_num = current_resource.deployment_num + 1 + # FIXME: potentially long write transaction + # Avoid getting run_model after update + await session.execute( + update(RunModel) + .where(RunModel.id == current_resource.id) + .values( + run_spec=run_spec.json(), + priority=run_spec.configuration.priority, + deployment_num=new_deployment_num, + ) + ) + await ensure_service_router_worker_sync_row(session, current_resource_model, run_spec) + events.emit( + session, + ( + f"Run updated. Deployment: {new_deployment_num}." 
+ f" Changed fields: {format_diff_fields_for_event(spec_diff)}" + ), + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(current_resource_model)], + ) + run = await get_run_by_name( + session=session, + project=project, + run_name=run_spec.run_name, + ) + return common_utils.get_or_error(run) + + +async def submit_run( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + run_spec: RunSpec, + pipeline_hinter: Optional[PipelineHinterProtocol] = None, +) -> Run: + validate_run_spec_and_set_defaults(user, run_spec) + repo = await _get_run_repo_or_error( + session=session, + project=project, + run_spec=run_spec, + ) + secrets = await get_project_secrets_mapping( + session=session, + project=project, + ) + + lock_namespace = f"run_names_{project.name}" + if is_db_sqlite(): + # Start new transaction to see committed changes after lock + await session.commit() + elif is_db_postgres(): + await session.execute( + select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace))) + ) + lock, _ = get_locker(get_db().dialect_name).get_lockset(lock_namespace) + async with lock: + # FIXME: delete_runs commits, so Postgres lock is released too early. 
+ if run_spec.run_name is None: + run_spec.run_name = await _generate_run_name( + session=session, + project=project, + ) + else: + await delete_runs( + session=session, user=user, project=project, runs_names=[run_spec.run_name] + ) + + await _validate_run( + session=session, + user=user, + project=project, + run_spec=run_spec, + ) + + submitted_at = common_utils.get_current_datetime() + initial_status = RunStatus.SUBMITTED + initial_replicas = 1 + if run_spec.merged_profile.schedule is not None: + initial_status = RunStatus.PENDING + initial_replicas = 0 + + run_model = RunModel( + id=uuid.uuid4(), + project_id=project.id, + project=project, + repo_id=repo.id, + user_id=user.id, + run_name=run_spec.run_name, + submitted_at=submitted_at, + status=initial_status, + run_spec=run_spec.json(), + last_processed_at=submitted_at, + priority=run_spec.configuration.priority, + deployment_num=0, + desired_replica_count=1, # a relevant value will be set in RunPipeline + next_triggered_at=_get_next_triggered_at(run_spec), + ) + session.add(run_model) + events.emit( + session, + f"Run submitted. 
Status: {run_model.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_model)], + ) + + if run_spec.configuration.type == "service": + # FIXME: Register services asynchronously in the background + await services.register_service(session, run_model, run_spec) + service_config = run_spec.configuration + + global_replica_num = 0 # Global counter across all groups for unique replica_num + + for replica_group in service_config.replica_groups: + if run_spec.merged_profile.schedule is not None: + group_initial_replicas = 0 + else: + group_initial_replicas = replica_group.count.min or 0 + + # Each replica in this group gets the same group-specific configuration + for group_replica_num in range(group_initial_replicas): + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=global_replica_num, + replica_group_name=replica_group.name, + ) + + for job in jobs: + job_model = create_job_model_for_new_submission( + run_model=run_model, + job=job, + status=JobStatus.SUBMITTED, + ) + session.add(job_model) + events.emit( + session, + f"Job created on run submission. Status: {job_model.status.upper()}", + actor=events.SystemActor(), + targets=[ + events.Target.from_model(job_model), + ], + ) + global_replica_num += 1 + await ensure_service_router_worker_sync_row(session, run_model, run_spec) + else: + for replica_num in range(initial_replicas): + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=replica_num, + ) + for job in jobs: + job_model = create_job_model_for_new_submission( + run_model=run_model, + job=job, + status=JobStatus.SUBMITTED, + ) + session.add(job_model) + events.emit( + session, + f"Job created on run submission. Status: {job_model.status.upper()}", + # Set `SystemActor` for consistency with all other places where jobs can be + # created (retry, scaling, rolling deployments, etc). 
Think of the run as being + # created by the user, while the job is created by the system to satisfy the + # run spec. + actor=events.SystemActor(), + targets=[ + events.Target.from_model(job_model), + ], + ) + await session.commit() + if pipeline_hinter is not None: + pipeline_hinter.hint_fetch(JobModel.__name__) + pipeline_hinter.hint_fetch(RunModel.__name__) + await session.refresh(run_model) + + run = await get_run_by_id(session, project, run_model.id) + return common_utils.get_or_error(run) + + +def create_job_model_for_new_submission( + run_model: RunModel, + job: Job, + status: JobStatus, +) -> JobModel: + """ + Create a new job. + + **NOTE**: don't forget to emit an event when writing the new job to the database. + """ + now = common_utils.get_current_datetime() + return JobModel( + id=uuid.uuid4(), + project_id=run_model.project_id, + run_id=run_model.id, + run_name=run_model.run_name, + job_num=job.job_spec.job_num, + job_name=f"{job.job_spec.job_name}", + replica_num=job.job_spec.replica_num, + deployment_num=run_model.deployment_num, + submission_num=len(job.job_submissions), + submitted_at=now, + last_processed_at=now, + status=status, + termination_reason=None, + job_spec_data=job.job_spec.json(), + job_provisioning_data=None, + probes=[], + waiting_master_job=job.job_spec.job_num != 0, + ) + + +async def stop_runs( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + runs_names: List[str], + abort: bool, + pipeline_hinter: Optional[PipelineHinterProtocol] = None, +): + res = await session.execute( + select(RunModel).where( + RunModel.project_id == project.id, + RunModel.run_name.in_(runs_names), + RunModel.status.not_in(RunStatus.finished_statuses()), + ) + ) + run_models = res.scalars().all() + run_ids = sorted([r.id for r in run_models]) + await session.commit() + async with get_locker(get_db().dialect_name).lock_ctx(RunModel.__tablename__, run_ids): + res = await session.execute( + select(RunModel) + 
.where(RunModel.id.in_(run_ids)) + .order_by(RunModel.id) # take locks in order + .with_for_update(key_share=True) + .execution_options(populate_existing=True) + ) + run_models = res.scalars().all() + for run_model in run_models: + if run_model.status.is_finished(): + continue + if abort: + run_model.termination_reason = RunTerminationReason.ABORTED_BY_USER + else: + run_model.termination_reason = RunTerminationReason.STOPPED_BY_USER + switch_run_status( + session, run_model, RunStatus.TERMINATING, actor=events.UserActor.from_user(user) + ) + run_model.skip_min_processing_interval = True + # The run will be terminated by RunPipeline. + await session.commit() + if pipeline_hinter is not None: + pipeline_hinter.hint_fetch(RunModel.__name__) + + +async def delete_runs( + session: AsyncSession, + user: UserModel, + project: ProjectModel, + runs_names: List[str], +): + res = await session.execute( + select(RunModel).where( + RunModel.project_id == project.id, + RunModel.run_name.in_(runs_names), + ) + ) + run_models = res.scalars().all() + run_ids = sorted([r.id for r in run_models]) + await session.commit() + async with get_locker(get_db().dialect_name).lock_ctx(RunModel.__tablename__, run_ids): + res = await session.execute( + select(RunModel) + .where(RunModel.id.in_(run_ids)) + .order_by(RunModel.id) # take locks in order + .with_for_update(key_share=True) + ) + run_models = res.scalars().all() + active_runs = [r for r in run_models if not r.status.is_finished()] + if len(active_runs) > 0: + raise ServerClientError( + msg=f"Cannot delete active runs: {[r.run_name for r in active_runs]}" + ) + for run_model in run_models: + if not run_model.deleted: + run_model.deleted = True + events.emit( + session, + "Run deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_model)], + ) + await session.commit() + + +def run_model_to_run( + run_model: RunModel, + include_jobs: bool = True, + job_submissions_limit: Optional[int] = None, + 
return_in_api: bool = False, + include_sensitive: bool = False, + include_job_connection_info: bool = False, +) -> Run: + run_spec = get_run_spec(run_model) + + jobs: List[Job] = [] + if include_jobs: + jobs = _get_run_jobs_with_submissions( + run_model=run_model, + run_spec=run_spec, + job_submissions_limit=job_submissions_limit, + return_in_api=return_in_api, + include_sensitive=include_sensitive, + include_job_connection_info=include_job_connection_info, + ) + + latest_job_submission = None + if len(jobs) > 0 and len(jobs[0].job_submissions) > 0: + # TODO(egor-s): does it make sense with replicas and multi-node? + latest_job_submission = jobs[0].job_submissions[-1] + + service_spec = None + if run_model.service_spec is not None: + service_spec = ServiceSpec.__response__.parse_raw(run_model.service_spec) + + status_message = _get_run_status_message(run_model) + error = _get_run_error(run_model) + fleet = _get_run_fleet(run_model) + next_triggered_at = None + if not run_model.status.is_finished(): + next_triggered_at = _get_next_triggered_at(run_spec) + run = Run( + id=run_model.id, + project_name=run_model.project.name, + user=run_model.user.name, + fleet=fleet, + submitted_at=run_model.submitted_at, + last_processed_at=run_model.last_processed_at, + status=run_model.status, + status_message=status_message, + termination_reason=run_model.termination_reason.value + if run_model.termination_reason + else None, + run_spec=run_spec, + jobs=jobs, + latest_job_submission=latest_job_submission, + service=service_spec, + deployment_num=run_model.deployment_num, + error=error, + deleted=run_model.deleted, + next_triggered_at=next_triggered_at, + ) + run.cost = _get_run_cost(run) + return run + + +def _set_run_resources_defaults(run_spec: RunSpec) -> None: + """Apply resource defaults to a run spec, including GPU vendor inference.""" + set_resources_defaults(run_spec.configuration.resources) + set_gpu_vendor_default( + run_spec.configuration.resources, + 
image=run_spec.configuration.image, + docker=getattr(run_spec.configuration, "docker", None), + ) + + +def _get_run_jobs_with_submissions( + run_model: RunModel, + run_spec: RunSpec, + job_submissions_limit: Optional[int], + return_in_api: bool = False, + include_sensitive: bool = False, + include_job_connection_info: bool = False, +) -> List[Job]: + jobs: List[Job] = [] + run_jobs = sorted(run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num)) + for replica_num, replica_submissions in itertools.groupby( + run_jobs, key=lambda j: j.replica_num + ): + for job_num, job_models in itertools.groupby(replica_submissions, key=lambda j: j.job_num): + submissions = [] + job_model = None + if job_submissions_limit is not None: + if job_submissions_limit == 0: + # Take latest job submission to return its job_spec + job_models = list(job_models)[-1:] + else: + job_models = list(job_models)[-job_submissions_limit:] + for job_model in job_models: + if job_submissions_limit != 0: + job_submission = job_model_to_job_submission( + job_model, include_probes=return_in_api + ) + if return_in_api: + # Set default non-None values for 0.18 backward-compatibility + # Remove in 0.19 + if job_submission.job_provisioning_data is not None: + if job_submission.job_provisioning_data.hostname is None: + job_submission.job_provisioning_data.hostname = "" + if job_submission.job_provisioning_data.ssh_port is None: + job_submission.job_provisioning_data.ssh_port = 22 + submissions.append(job_submission) + if job_model is not None: + # Use the spec from the latest submission. 
Submissions can have different specs + job_spec = get_job_spec(job_model) + if not include_sensitive: + remove_job_spec_sensitive_info(job_spec) + job_connection_info: Optional[JobConnectionInfo] = None + if include_job_connection_info and job_model.status == JobStatus.RUNNING: + job_connection_info = get_job_connection_info(job_model, run_spec) + jobs.append( + Job( + job_spec=job_spec, + job_submissions=submissions, + job_connection_info=job_connection_info, + ) + ) + return jobs + + +def _get_run_status_message(run_model: RunModel) -> str: + if len(run_model.jobs) == 0: + return run_model.status.value + + sorted_job_models = sorted( + run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num) + ) + job_models_grouped_by_job = list( + list(jm) + for _, jm in itertools.groupby(sorted_job_models, key=lambda j: (j.replica_num, j.job_num)) + ) + + if all(job_models[-1].status == JobStatus.PULLING for job_models in job_models_grouped_by_job): + # Show `pulling`` if last job submission of all jobs is pulling + return "pulling" + + if run_model.status in [RunStatus.SUBMITTED, RunStatus.PENDING]: + # Show `retrying` if any job caused the run to retry + for job_models in job_models_grouped_by_job: + last_job_spec = get_job_spec(job_models[-1]) + retry_on_events = last_job_spec.retry.on_events if last_job_spec.retry else [] + last_job_termination_reason = _get_last_job_termination_reason(job_models) + if ( + last_job_termination_reason + == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + and RetryEvent.NO_CAPACITY in retry_on_events + ): + # TODO: Show `retrying` for other retry events + return "retrying" + + return run_model.status.value + + +def _get_last_job_termination_reason(job_models: List[JobModel]) -> Optional[JobTerminationReason]: + for job_model in reversed(job_models): + if job_model.termination_reason is not None: + return job_model.termination_reason + return None + + +def _get_run_error(run_model: RunModel) -> Optional[str]: + if 
async def _generate_run_name(
    session: AsyncSession,
    project: ProjectModel,
) -> str:
    """
    Generates a run name that no non-deleted run of the project currently uses.

    Candidates have the form `<generated base>-<N>` with N = 1, 2, ...; the first
    candidate for which the query finds no matching run is returned.
    """
    prefix = generate_name()
    suffix = 0
    while True:
        suffix += 1
        candidate_name = f"{prefix}-{suffix}"
        query = select(RunModel).where(
            RunModel.project_id == project.id,
            RunModel.run_name == candidate_name,
            RunModel.deleted == False,
        )
        existing_run = (await session.execute(query)).scalar()
        if existing_run is None:
            return candidate_name
+ nodes = 1 + if run_spec.configuration.type == "task": + nodes = run_spec.configuration.nodes + for job_num in range(nodes): + volumes = await get_job_configured_volumes( + session=session, project=project, run_spec=run_spec, job_num=job_num + ) + check_can_attach_job_volumes(volumes=volumes) + + +async def _get_run_repo_or_error( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, +) -> RepoModel: + # Must be set by _validate_run_spec_and_set_defaults() + repo_id = common_utils.get_or_error(run_spec.repo_id) + repo_data = common_utils.get_or_error(run_spec.repo_data) + if repo_data.repo_type == "virtual": + repo = await repos_services.create_or_update_repo( + session=session, + project=project, + repo_id=repo_id, + repo_info=repo_data, + ) + repo = await repos_services.get_repo_model( + session=session, + project=project, + repo_id=repo_id, + ) + if repo is None: + raise RepoDoesNotExistError.with_id(repo_id) + return repo + + +def _get_run_cost(run: Run) -> float: + run_cost = math.fsum( + _get_job_submission_cost(submission) + for job in run.jobs + for submission in job.job_submissions + ) + return round(run_cost, 4) + + +def _get_job_submission_cost(job_submission: JobSubmission) -> float: + if job_submission.job_provisioning_data is None: + return 0 + duration_hours = job_submission.duration.total_seconds() / 3600 + return job_submission.job_provisioning_data.price * duration_hours + + +def is_job_ready(probes: Iterable[ProbeModel], probe_specs: Iterable[ProbeSpec]) -> bool: + return all(is_probe_ready(probe, probe_spec) for probe, probe_spec in zip(probes, probe_specs)) + + +def _get_next_triggered_at(run_spec: RunSpec) -> Optional[datetime]: + if run_spec.merged_profile.schedule is None: + return None + now = common_utils.get_current_datetime() + fire_times = [] + for cron in run_spec.merged_profile.schedule.crons: + cron_trigger = CronTrigger.from_crontab(cron, timezone=timezone.utc) + fire_times.append( + 
cron_trigger.get_next_fire_time( + previous_fire_time=None, + now=now, + ) + ) + return min(fire_times) diff --git a/src/dstack/_internal/server/services/runs/plan.py b/src/dstack/_internal/server/services/runs/plan.py new file mode 100644 index 0000000000..8d272bdb2c --- /dev/null +++ b/src/dstack/_internal/server/services/runs/plan.py @@ -0,0 +1,1052 @@ +import math +import uuid +from collections.abc import Hashable, Mapping +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + +from sqlalchemy import and_, exists, not_, or_, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import contains_eager, noload + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.fleets import FleetSpec, InstanceGroupPlacement +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceStatus, +) +from dstack._internal.core.models.profiles import CreationPolicy, Profile +from dstack._internal.core.models.runs import ( + Job, + JobPlan, + JobProvisioningData, + Requirements, + RunSpec, +) +from dstack._internal.core.models.volumes import Volume +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceModel, + ProjectModel, + RunModel, +) +from dstack._internal.server.services.fleets import ( + check_can_create_new_cloud_instance_in_fleet, + get_fleet_master_instance_provisioning_data, + get_fleet_requirements, + get_fleet_spec, +) +from dstack._internal.server.services.instances import ( + filter_instances, + get_instance_offer, + get_pool_instances, + get_shared_instances_with_offers, + is_placeholder_instance, + select_instances_by_selectors, +) +from dstack._internal.server.services.jobs import ( + get_instances_ids_with_detaching_volumes, + get_job_configured_volumes, + get_jobs_from_run_spec, + 
async def get_job_plans(
    session: AsyncSession,
    project: ProjectModel,
    profile: Profile,
    run_spec: RunSpec,
    max_offers: Optional[int],
) -> list[JobPlan]:
    """
    Returns job plans for the given run spec.

    Normal run planning (`dstack apply`) selects the best fleet candidate for each planned job
    and builds offers from that path. `dstack offer` without `--group-by` uses the same
    `/runs/get_plan` API, but its synthetic run spec is detected by
    `_should_select_best_fleet_candidate()`. In that case, planning skips
    best-fleet-candidate selection and collects offers directly: global offers when no fleets
    are specified, or offers from the selected fleets when `--fleet` is used.

    Services are planned per replica group. Other run types are planned once and then expanded
    into per-job `JobPlan` results.

    If `run_spec.run_name` is not set, a dummy name is assigned for the duration of planning
    so that job names can be generated. The original value is restored before the function
    exits, including when planning raises.
    """
    run_name = run_spec.run_name
    if run_spec.run_name is None:
        # Set/unset dummy run name to generate job names for run plan.
        run_spec.run_name = "dry-run"

    try:
        secrets = await get_project_secrets_mapping(session=session, project=project)

        job_plans = []

        volumes = await get_job_configured_volumes(
            session=session,
            project=project,
            run_spec=run_spec,
            job_num=0,
        )

        if (
            _should_select_best_fleet_candidate(run_spec)
            and run_spec.merged_profile.instances is None
        ):
            candidate_fleet_models = await _select_candidate_fleet_models(
                session=session,
                project=project,
                run_model=None,
                run_spec=run_spec,
            )
        else:
            candidate_fleet_models = None

        if run_spec.configuration.type == "service":
            replica_group_names = [g.name for g in run_spec.configuration.replica_groups]
        else:
            replica_group_names = [None]

        for replica_group_name in replica_group_names:
            jobs = await get_jobs_from_run_spec(
                run_spec=run_spec,
                secrets=secrets,
                replica_num=0,
                replica_group_name=replica_group_name,
            )
            if candidate_fleet_models is None:
                if profile.instances is not None:
                    instance_offers = await get_targeted_instance_offers(
                        session=session,
                        project=project,
                        run_spec=run_spec,
                        job=jobs[0],
                        volumes=volumes,
                    )
                    backend_offers = []
                elif profile.fleets is None:
                    instance_offers, backend_offers = await _get_non_fleet_offers(
                        session=session,
                        project=project,
                        profile=profile,
                        run_spec=run_spec,
                        job=jobs[0],
                        volumes=volumes,
                    )
                else:
                    instance_offers, backend_offers = await _get_offers_in_run_candidate_fleets(
                        session=session,
                        project=project,
                        run_spec=run_spec,
                        job=jobs[0],
                        volumes=volumes,
                    )
            else:
                # The selected fleet itself is not needed for the plan, only its offers.
                _, instance_offers, backend_offers = await find_optimal_fleet_with_offers(
                    project=project,
                    fleet_models=candidate_fleet_models,
                    run_model=None,
                    run_spec=run_spec,
                    job=jobs[0],
                    master_job_provisioning_data=None,
                    volumes=volumes,
                    exclude_not_available=False,
                )

            for job in jobs:
                job_plan = _get_job_plan(
                    instance_offers=instance_offers,
                    backend_offers=backend_offers,
                    profile=profile,
                    job=job,
                    max_offers=max_offers,
                )
                job_plans.append(job_plan)
    finally:
        # Restore the caller's run name on every exit path so that a failure
        # during planning does not leave the dummy name on the run spec.
        run_spec.run_name = run_name
    return job_plans
async def find_optimal_fleet_with_offers(
    project: ProjectModel,
    fleet_models: list[FleetModel],
    run_model: Optional[RunModel],
    run_spec: RunSpec,
    job: Job,
    master_job_provisioning_data: Optional[JobProvisioningData],
    volumes: Optional[list[list[Volume]]],
    exclude_not_available: bool,
    skip_backend_offers_on_pool_capacity: bool = False,
) -> tuple[
    Optional[FleetModel],
    list[tuple[InstanceModel, InstanceOfferWithAvailability]],
    list[tuple[Backend, InstanceOfferWithAvailability]],
]:
    """
    Finds the optimal fleet for the run among the given fleets and returns
    the fleet model, pool offers with instances, and backend offers.
    Returns empty backend offers if run_model.fleet is set since
    backend offers from this function are needed only for run plan.
    Only available offers are considered for selecting fleets but may return
    either available or all offers depending on `exclude_not_available`.

    Returns `(None, [], [])` if no fleet passes the candidate checks.
    If `skip_backend_offers_on_pool_capacity` is set and at least one candidate
    has pool capacity, backend offers are not requested and an empty list is returned.
    """
    if run_model is not None and run_model.fleet is not None:
        # Using the fleet that was already chosen by the master job
        instance_offers = get_instance_offers_in_fleet(
            fleet_model=run_model.fleet,
            run_spec=run_spec,
            job=job,
            master_job_provisioning_data=master_job_provisioning_data,
            volumes=volumes,
            exclude_not_available=exclude_not_available,
        )
        return run_model.fleet, instance_offers, []

    nodes_required_num = get_nodes_required_num(run_spec)
    # The current strategy is first to consider fleets that can accommodate
    # the run without additional provisioning and choose the one with the cheapest pool offer.
    # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
    # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
    # TODO: Consider trying all backend offers and then choosing a fleet.

    # First step: consider instance offers.
    candidates: list[_FleetCandidate] = []
    for fleet_model in fleet_models:
        fleet_spec = get_fleet_spec(fleet_model)
        if (
            is_multinode_job(job)
            and fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER
        ):
            # Limit multinode runs to cluster fleets to guarantee best connectivity.
            continue

        if not _run_can_fit_into_fleet(run_spec, fleet_model, fleet_spec):
            logger.debug(
                "Skipping fleet %s from consideration: run cannot fit into fleet",
                fleet_model.name,
            )
            continue

        all_instance_offers = get_instance_offers_in_fleet(
            fleet_model=fleet_model,
            run_spec=run_spec,
            job=job,
            # No need to pass master_job_provisioning_data for master job
            # as all pool offers are suitable.
            master_job_provisioning_data=None,
            volumes=volumes,
            exclude_not_available=False,
        )
        available_instance_offers = _exclude_non_available_instance_offers(all_instance_offers)
        candidates.append(
            _FleetCandidate(
                fleet_model=fleet_model,
                fleet_spec=fleet_spec,
                instance_offers=(
                    available_instance_offers if exclude_not_available else all_instance_offers
                ),
                min_instance_offer_price=_get_min_instance_or_backend_offer_price(
                    available_instance_offers
                ),
                # Require at least one available instance so that fleets without matching
                # instances are not treated as having capacity when nodes_required_num is 0
                # (e.g. a service scaling from zero replicas).
                has_pool_capacity=(
                    len(available_instance_offers) > 0
                    and nodes_required_num <= len(available_instance_offers)
                ),
            )
        )

    # If any candidate fleet has pool capacity, the optimal fleet will be one of
    # those, so backend offers from any fleet won't affect selection — skip them entirely when allowed.
    skip_backend_offers = skip_backend_offers_on_pool_capacity and any(
        candidate.has_pool_capacity for candidate in candidates
    )

    # Second step: gather backend offers unless skipped.
    candidates_with_backend_offers: list[_FleetCandidateWithBackendOffers] = []
    for candidate in candidates:
        backend_offers: list[tuple[Backend, InstanceOfferWithAvailability]]
        if skip_backend_offers:
            backend_offers = []
        else:
            backend_offers = await _get_backend_offers_in_fleet(
                project=project,
                fleet_model=candidate.fleet_model,
                fleet_spec=candidate.fleet_spec,
                run_spec=run_spec,
                job=job,
                volumes=volumes,
                max_offers=_PER_FLEET_MAX_OFFERS,
            )
        available_backend_offers = _exclude_non_available_backend_offers(backend_offers)
        candidates_with_backend_offers.append(
            _FleetCandidateWithBackendOffers(
                candidate=candidate,
                backend_offers=backend_offers,
                # Pool-capacity fleets first; then cheapest pool offer; then cheapest backend.
                sort_key=(
                    not candidate.has_pool_capacity,
                    candidate.min_instance_offer_price,
                    _get_min_instance_or_backend_offer_price(available_backend_offers),
                ),
            )
        )

    if not candidates_with_backend_offers:
        return None, [], []

    optimal = min(candidates_with_backend_offers, key=lambda c: c.sort_key)
    optimal_fleet_model = optimal.candidate.fleet_model
    instance_offers = optimal.candidate.instance_offers
    if skip_backend_offers:
        backend_offers = []
    else:
        # Refetch backend offers without limit to return all offers for the optimal fleet.
        backend_offers = await _get_backend_offers_in_fleet(
            project=project,
            fleet_model=optimal_fleet_model,
            # Reuse the spec parsed in the first step instead of parsing it again.
            fleet_spec=optimal.candidate.fleet_spec,
            run_spec=run_spec,
            job=job,
            volumes=volumes,
            max_offers=None,
        )
        if exclude_not_available:
            backend_offers = _exclude_non_available_backend_offers(backend_offers)
    return optimal_fleet_model, instance_offers, backend_offers
def get_instance_offers_from_instances(
    instances: list[InstanceModel],
    run_spec: RunSpec,
    job: Job,
    master_job_provisioning_data: Optional[JobProvisioningData] = None,
    volumes: Optional[list[list[Volume]]] = None,
    exclude_not_available: bool = False,
) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
    """
    Builds instance offers for the job from the given instances.

    Offers of non-shared instances (selected via `filter_instances(..., shared=False)`)
    are combined with the offers returned by `get_shared_instances_with_offers()`
    and sorted by the instance price, with a falsy price (e.g. `None`) counted as 0.
    If `exclude_not_available` is set, the sorted offers are additionally passed
    through `_exclude_non_available_instance_offers()`.
    """

    def _instance_price(instance_with_offer):
        # Sort by the price stored on the instance model (the first tuple element).
        return instance_with_offer[0].price or 0

    run_profile = run_spec.merged_profile
    is_multinode = is_multinode_job(job)
    job_requirements = job.job_spec.requirements

    dedicated_instances = filter_instances(
        instances=instances,
        profile=run_profile,
        requirements=job_requirements,
        multinode=is_multinode,
        master_job_provisioning_data=master_job_provisioning_data,
        volumes=volumes,
        shared=False,
    )
    offers = _get_offers_from_instances(dedicated_instances)
    offers.extend(
        get_shared_instances_with_offers(
            instances=instances,
            profile=run_profile,
            requirements=job_requirements,
            multinode=is_multinode,
            volumes=volumes,
        )
    )
    offers.sort(key=_instance_price)
    if not exclude_not_available:
        return offers
    return _exclude_non_available_instance_offers(offers)
select_instances_by_selectors( + session=session, + project=project, + selectors=selectors, + fleets=run_spec.merged_profile.fleets, + detaching_instance_ids=detaching_instance_ids, + fleet_id=fleet_id, + instance_ids=instance_ids, + lock_instances=lock_instances, + ) + return select_targeted_instance_offers( + instances=instances, + run_spec=run_spec, + job=job, + master_job_provisioning_data=master_job_provisioning_data, + volumes=volumes, + exclude_not_available=exclude_not_available, + ) + + +def select_targeted_instance_offers( + instances: list[InstanceModel], + run_spec: RunSpec, + job: Job, + master_job_provisioning_data: Optional[JobProvisioningData] = None, + volumes: Optional[list[list[Volume]]] = None, + exclude_not_available: bool = False, +) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]: + candidates: list[_TargetedInstanceOffersCandidate] = [] + for fleet_instances in _group_instances_by_fleet(instances).values(): + fleet = common_utils.get_or_error(fleet_instances[0].fleet) + fleet_spec = get_fleet_spec(fleet) + if ( + is_multinode_job(job) + and fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER + ): + continue + all_offers = get_instance_offers_from_instances( + instances=fleet_instances, + run_spec=run_spec, + job=job, + master_job_provisioning_data=master_job_provisioning_data, + volumes=volumes, + exclude_not_available=False, + ) + if len(all_offers) < _get_required_instance_offers(run_spec, job): + continue + available_offers = _exclude_non_available_instance_offers(all_offers) + if exclude_not_available: + all_offers = available_offers + if all_offers: + has_capacity = len(available_offers) >= _get_required_instance_offers(run_spec, job) + candidates.append( + _TargetedInstanceOffersCandidate( + lacks_capacity=not has_capacity, + available_price=_get_min_instance_or_backend_offer_price(available_offers), + selected_price=_get_min_instance_or_backend_offer_price(all_offers), + offers=all_offers, + ) + ) + if 
not candidates: + return [] + return min(candidates, key=lambda candidate: candidate.sort_key()).offers + + +@dataclass(frozen=True) +class _TargetedInstanceOffersCandidate: + lacks_capacity: bool + available_price: float + selected_price: float + offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] + + def sort_key(self) -> tuple[bool, float, float]: + return self.lacks_capacity, self.available_price, self.selected_price + + +def _group_instances_by_fleet( + instances: list[InstanceModel], +) -> dict[uuid.UUID, list[InstanceModel]]: + instances_by_fleet: dict[uuid.UUID, list[InstanceModel]] = {} + for instance in instances: + if instance.fleet_id is None: + continue + instances_by_fleet.setdefault(instance.fleet_id, []).append(instance) + return instances_by_fleet + + +def _get_required_instance_offers(run_spec: RunSpec, job: Job) -> int: + if is_multinode_job(job) and is_master_job(job): + return get_nodes_required_num(run_spec) + return 1 + + +def _run_can_fit_into_fleet( + run_spec: RunSpec, fleet_model: FleetModel, fleet_spec: FleetSpec +) -> bool: + """ + Returns `False` if the run cannot fit into fleet for sure. + This is helpful heuristic to avoid even considering fleets too small for a run. + A run may not fit even if this function returns `True`. + This will lead to some jobs failing due to exceeding `nodes.max` + or more than `nodes.max` instances being provisioned + and eventually removed by the fleet consolidation logic. + """ + # No check for cloud fleets with blocks > 1 since we don't know + # how many jobs such fleets can accommodate. 
+ nodes_required_num = get_nodes_required_num(run_spec) + if ( + fleet_spec.configuration.nodes is not None + and fleet_spec.configuration.blocks == 1 + and fleet_spec.configuration.nodes.max is not None + ): + occupied_instances = _get_occupied_instances(fleet_model.instances) + fleet_available_capacity = fleet_spec.configuration.nodes.max - len(occupied_instances) + if fleet_available_capacity < nodes_required_num: + return False + elif fleet_spec.configuration.ssh_config is not None: + # Currently assume that each idle block can run a job. + # TODO: Take resources / eligible offers into account. + total_idle_blocks = 0 + for instance in fleet_model.instances: + total_blocks = instance.total_blocks or 1 + total_idle_blocks += total_blocks - instance.busy_blocks + if total_idle_blocks < nodes_required_num: + return False + return True + + +def _get_occupied_instances(instance_models: list[InstanceModel]) -> list[InstanceModel]: + # A placeholder has busy_blocks == 0 but reserves a `nodes.max` slot + # (unlike an IDLE instance, which can be reused by this run), so count + # it here the same as a busy instance. + return [i for i in instance_models if i.busy_blocks > 0 or is_placeholder_instance(i)] + + +async def _get_backend_offers_in_fleet( + project: ProjectModel, + fleet_model: FleetModel, + run_spec: RunSpec, + job: Job, + volumes: Optional[list[list[Volume]]], + fleet_spec: Optional[FleetSpec] = None, + max_offers: Optional[int] = None, +) -> list[tuple[Backend, InstanceOfferWithAvailability]]: + if fleet_spec is None: + fleet_spec = get_fleet_spec(fleet_model) + try: + check_can_create_new_cloud_instance_in_fleet(fleet_model, fleet_spec) + profile, requirements = get_run_profile_and_requirements_in_fleet( + job=job, + run_spec=run_spec, + fleet_spec=fleet_spec, + ) + except ValueError: + backend_offers = [] + else: + # Master job offers must be in the same cluster as existing instances. 
async def _get_pool_offers(
    session: AsyncSession,
    project: ProjectModel,
    run_spec: RunSpec,
    job: Job,
    volumes: list[list[Volume]],
) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
    """
    Returns offers from the project's pool instances for the job, sorted by offer price.

    Instances reported by `get_instances_ids_with_detaching_volumes()` are excluded.
    The result combines the offers from `get_shared_instances_with_offers()` with
    the offers of non-shared instances selected via `filter_instances(..., shared=False)`.
    """
    detaching_ids = await get_instances_ids_with_detaching_volumes(session)
    usable_instances = [
        instance_model
        for instance_model in await get_pool_instances(session, project)
        if instance_model.id not in detaching_ids
    ]
    is_multinode = is_multinode_job(job)

    pool_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = list(
        get_shared_instances_with_offers(
            instances=usable_instances,
            profile=run_spec.merged_profile,
            requirements=job.job_spec.requirements,
            volumes=volumes,
            multinode=is_multinode,
        )
    )

    dedicated_instances = filter_instances(
        instances=usable_instances,
        profile=run_spec.merged_profile,
        requirements=job.job_spec.requirements,
        multinode=is_multinode,
        volumes=volumes,
        shared=False,
    )
    pool_offers.extend(_get_offers_from_instances(dedicated_instances))
    # Sort by the price of the offer (the second tuple element).
    pool_offers.sort(key=lambda instance_with_offer: instance_with_offer[1].price)
    return pool_offers
async def get_backend_offers_in_run_candidate_fleets(
    session: AsyncSession,
    project: ProjectModel,
    run_spec: RunSpec,
    job: Job,
    volumes: Optional[list[list[Volume]]],
    max_offers_per_fleet: Optional[int] = None,
) -> list[tuple[Backend, InstanceOfferWithAvailability]]:
    """
    Collects backend offers from all candidate fleets selected for the run.

    Used by `dstack offer --fleet ...` and `dstack offer --group-by ... --fleet ...`.
    The candidate fleets are resolved from `run_spec`; backend offers are requested
    for each fleet (limited by `max_offers_per_fleet`) and merged with
    `merge_offer_iterables()`. An offer whose identity was already seen in an
    earlier fleet is dropped. Returns an empty list if the run profile targets
    specific instances.
    """
    if run_spec.merged_profile.instances is not None:
        return []

    fleet_models = await _select_candidate_fleet_models(
        session=session,
        project=project,
        run_model=None,
        run_spec=run_spec,
    )
    known_identities = set()
    merged_offers: list[tuple[Backend, InstanceOfferWithAvailability]] = []
    for fleet_model in fleet_models:
        fleet_backend_offers = await _get_backend_offers_in_fleet(
            project=project,
            fleet_model=fleet_model,
            run_spec=run_spec,
            job=job,
            volumes=volumes,
            max_offers=max_offers_per_fleet,
        )
        new_offers = []
        for backend, offer in fleet_backend_offers:
            identity = _get_backend_offer_identity(offer)
            if identity in known_identities:
                continue
            new_offers.append((backend, offer))
            known_identities.add(identity)
        merged_offers = list(merge_offer_iterables(merged_offers, new_offers))
    return merged_offers
+ """ + if run_spec.merged_profile.instances is not None: + instance_offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=job, + volumes=volumes, + ) + return instance_offers, [] + + candidate_fleet_models = await _select_candidate_fleet_models( + session=session, + project=project, + run_model=None, + run_spec=run_spec, + ) + instance_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = [] + for candidate_fleet_model in candidate_fleet_models: + instance_offers.extend( + get_instance_offers_in_fleet( + fleet_model=candidate_fleet_model, + run_spec=run_spec, + job=job, + volumes=volumes, + exclude_not_available=False, + ) + ) + instance_offers.sort(key=lambda offer: offer[1].price or 0) + # TODO: Intentionally pass `max_offers_per_fleet=None` here. `dstack offer --fleet ...` + # is expected to return the exact `total_offers`, so capping backend offers per selected + # fleet would make that total approximate. We already deduplicate identical backend offers + # while merging selected fleets via `_get_backend_offer_identity()`. Revisit adding a cap + # only if this path causes real performance or memory problems. + backend_offers = await get_backend_offers_in_run_candidate_fleets( + session=session, + project=project, + run_spec=run_spec, + job=job, + volumes=volumes, + max_offers_per_fleet=None, + ) + return instance_offers, backend_offers + + +def _get_backend_offer_identity(offer: InstanceOfferWithAvailability) -> Hashable: + """ + Returns a hashable identity for a backend offer using the full offer payload. + + Needed to deduplicate identical backend offers when merging offers from multiple fleets for + `dstack offer --fleet ...`. 
+ """ + return _freeze_offer_identity_value(offer.dict()) + + +def _freeze_offer_identity_value(value: object) -> Hashable: + """Converts nested offer payload values into a deterministic hashable form.""" + if isinstance(value, Mapping): + return tuple( + sorted( + ( + ( + _freeze_offer_identity_value(key), + _freeze_offer_identity_value(nested_value), + ) + for key, nested_value in value.items() + ), + key=repr, + ) + ) + if isinstance(value, Enum): + return value.value + if isinstance(value, (list, tuple)): + return tuple(_freeze_offer_identity_value(item) for item in value) + if isinstance(value, (set, frozenset)): + return tuple(sorted((_freeze_offer_identity_value(item) for item in value), key=repr)) + if not isinstance(value, Hashable): + raise TypeError(f"Unsupported backend offer identity value: {type(value)!r}") + return value + + +def _get_job_plan( + instance_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]], + backend_offers: list[tuple[Backend, InstanceOfferWithAvailability]], + profile: Profile, + job: Job, + max_offers: Optional[int], +) -> JobPlan: + job_offers: list[InstanceOfferWithAvailability] = [] + job_offers.extend(offer for _, offer in instance_offers) + if profile.creation_policy == CreationPolicy.REUSE_OR_CREATE and profile.instances is None: + job_offers.extend(offer for _, offer in backend_offers) + job_offers.sort(key=lambda offer: not offer.availability.is_available()) + remove_job_spec_sensitive_info(job.job_spec) + return JobPlan( + job_spec=job.job_spec, + offers=job_offers[: (max_offers or _DEFAULT_MAX_OFFERS)], + total_offers=len(job_offers), + max_price=max((offer.price for offer in job_offers), default=None), + ) + + +def _should_select_best_fleet_candidate(run_spec: RunSpec) -> bool: + """ + Returns ``True`` for normal run planning and ``False`` for `dstack offer` without + `--group-by`. + + Both `dstack apply` and `dstack offer` without `--group-by` call `/runs/get_plan`. 
The + current way to recognize `dstack offer` without `--group-by` is the synthetic task spec + that the CLI sends with `type == "task"` and `commands == [":"]`. + TODO: Replace this command-shape hack with an explicit request/API signal for + `dstack offer` without `--group-by`. + + When this function returns ``False``, the planner skips best-fleet-candidate selection + and goes directly to the special `dstack offer` collection path: + global offers when no fleets are specified, or offers from the selected fleets when + `--fleet` is used. + + A real task with `commands == [":"]` would also match this special `dstack offer` path. + """ + return not (run_spec.configuration.type == "task" and run_spec.configuration.commands == [":"]) + + +def _get_offers_from_instances( + instances: list[InstanceModel], +) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]: + instances_with_offers = [] + for instance in instances: + offer = common_utils.get_or_error(get_instance_offer(instance)) + offer.availability = InstanceAvailability.BUSY + if instance.status == InstanceStatus.IDLE: + offer.availability = InstanceAvailability.IDLE + instances_with_offers.append((instance, offer)) + return instances_with_offers + + +def _get_min_instance_or_backend_offer_price( + offers: Union[ + list[tuple[InstanceModel, InstanceOfferWithAvailability]], + list[tuple[Backend, InstanceOfferWithAvailability]], + ], +) -> float: + min_offer_price = math.inf + if len(offers) > 0: + min_offer_price = offers[0][1].price + return min_offer_price + + +def _exclude_non_available_instance_offers( + instance_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]], +) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]: + return [ + (instance, offer) + for instance, offer in instance_offers + if offer.availability.is_available() + ] + + +def _exclude_non_available_backend_offers( + backend_offers: list[tuple[Backend, InstanceOfferWithAvailability]], +) -> list[tuple[Backend, 
InstanceOfferWithAvailability]]: + return [ + (backend, offer) for backend, offer in backend_offers if offer.availability.is_available() + ] diff --git a/src/dstack/_internal/server/services/runs/replicas.py b/src/dstack/_internal/server/services/runs/replicas.py new file mode 100644 index 0000000000..9320881522 --- /dev/null +++ b/src/dstack/_internal/server/services/runs/replicas.py @@ -0,0 +1,231 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Dict, List, Optional, Tuple, Union + +from dstack._internal.core.models.configurations import ReplicaGroup, ServiceConfiguration +from dstack._internal.core.models.routers import RouterType +from dstack._internal.core.models.runs import JobStatus, JobTerminationReason, RunSpec +from dstack._internal.server.models import JobModel, RunModel +from dstack._internal.server.services.jobs import ( + get_job_provisioning_data, + get_job_spec, + group_jobs_by_replica_latest, +) + + +@dataclass +class GroupRolloutState: + active_replicas: List[Tuple[int, bool, int, List[JobModel]]] + inactive_replicas: List[Tuple[int, bool, int, List[JobModel]]] + has_out_of_date_replicas: bool + non_terminated_replica_count: int + unregistered_out_of_date_replica_count: int + registered_non_terminating_replica_count: int + + +class RouterEnvStatus(str, Enum): + """Outcomes returned from get_router_env_for_job() when no env dict is + appropriate. Each value carries a distinct caller-side action. + + Using an enum (rather than empty-dict sentinels) means callers can rely + on either `is` or `==` to compare — both yield correct, unambiguous + results — and stray dicts from elsewhere can never accidentally match. + + NOT_PROVISIONED — router job exists but its internal_ip is not yet + known. Transient; caller should defer this worker + and retry on the next pipeline tick (subject to + ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS in + jobs_running.py). 
+ FAILED — router job has reached a terminal state + (TERMINATING/TERMINATED/FAILED/ABORTED/DONE). + Permanent; caller should stop deferring and + terminate this worker — waiting longer cannot + recover because the router will not come back with + a fresh internal_ip. + """ + + NOT_PROVISIONED = "not_provisioned" + FAILED = "failed" + + +def build_replica_lists( + run_model: RunModel, + group_filter: Optional[str] = None, +) -> Tuple[ + List[Tuple[int, bool, int, List[JobModel]]], List[Tuple[int, bool, int, List[JobModel]]] +]: + # lists of (importance, is_out_of_date, replica_num, jobs) + active_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] + inactive_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] + + for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs): + # Filter by group if specified + if group_filter is not None: + if not job_belongs_to_group(replica_jobs[0], group_filter): + continue + + statuses = set(job.status for job in replica_jobs) + deployment_num = replica_jobs[0].deployment_num # same for all jobs + is_out_of_date = deployment_num < run_model.deployment_num + + if {JobStatus.TERMINATING, *JobStatus.finished_statuses()} & statuses: + # if there are any terminating or finished jobs, the replica is inactive + inactive_replicas.append((0, is_out_of_date, replica_num, replica_jobs)) + elif JobStatus.SUBMITTED in statuses: + # if there are any submitted jobs, the replica is active and has the importance of 0 + active_replicas.append((0, is_out_of_date, replica_num, replica_jobs)) + elif {JobStatus.PROVISIONING, JobStatus.PULLING} & statuses: + # if there are any provisioning or pulling jobs, the replica is active and has the importance of 1 + active_replicas.append((1, is_out_of_date, replica_num, replica_jobs)) + elif not is_replica_registered(replica_jobs): + # all jobs are running, but not receiving traffic, the replica is active and has the importance of 2 + active_replicas.append((2, is_out_of_date, 
replica_num, replica_jobs)) + else: + # all jobs are running and ready, the replica is active and has the importance of 3 + active_replicas.append((3, is_out_of_date, replica_num, replica_jobs)) + + # Sort by is_out_of_date (up-to-date first), importance (desc), and replica_num (asc) + active_replicas.sort(key=lambda r: (r[1], -r[0], r[2])) + + return active_replicas, inactive_replicas + + +def get_group_rollout_state(run_model: RunModel, group: ReplicaGroup) -> GroupRolloutState: + assert group.name is not None, "Group name is always set" + active_replicas, inactive_replicas = build_replica_lists( + run_model=run_model, + group_filter=group.name, + ) + + non_terminated_replica_nums = set() + unregistered_out_of_date_replica_count = 0 + registered_non_terminating_replica_count = 0 + + for _, jobs in group_jobs_by_replica_latest(run_model.jobs): + if not job_belongs_to_group(jobs[0], group.name): + continue + + if any(not j.status.is_finished() for j in jobs): + non_terminated_replica_nums.add(jobs[0].replica_num) + + if ( + any(j.deployment_num < run_model.deployment_num for j in jobs) + and any( + j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses() + for j in jobs + ) + and not is_replica_registered(jobs) + ): + unregistered_out_of_date_replica_count += 1 + + if is_replica_registered(jobs) and all(j.status != JobStatus.TERMINATING for j in jobs): + registered_non_terminating_replica_count += 1 + + return GroupRolloutState( + active_replicas=active_replicas, + inactive_replicas=inactive_replicas, + has_out_of_date_replicas=has_out_of_date_replicas(run_model, group_filter=group.name), + non_terminated_replica_count=len(non_terminated_replica_nums), + unregistered_out_of_date_replica_count=unregistered_out_of_date_replica_count, + registered_non_terminating_replica_count=registered_non_terminating_replica_count, + ) + + +def job_belongs_to_group(job: JobModel, group_name: str) -> bool: + job_spec = get_job_spec(job) + return job_spec.replica_group 
== group_name + + +def has_out_of_date_replicas(run: RunModel, group_filter: Optional[str] = None) -> bool: + for job in run.jobs: + # Filter jobs by group if specified + if group_filter is not None: + if not job_belongs_to_group(job, group_filter): + continue + if job.deployment_num < run.deployment_num and not ( + job.status.is_finished() or job.termination_reason == JobTerminationReason.SCALED_DOWN + ): + return True + return False + + +def is_replica_registered(jobs: list[JobModel]) -> bool: + # Only job_num=0 is supposed to receive service requests + return jobs[0].registered + + +def get_router_replica_group(run_spec: RunSpec) -> Optional[ReplicaGroup]: + """Return the (single) replica group with a `router:` field, or None. + + `validate_at_most_one_router_replica_group` guarantees at most one such + group exists, so we can safely return on the first match. + """ + cfg = run_spec.configuration + if not isinstance(cfg, ServiceConfiguration): + return None + for g in cfg.replica_groups: + if g.router is not None: + return g + return None + + +def find_router_job(run_model: RunModel, router_group_name: str) -> Optional[JobModel]: + for j in run_model.jobs: + if job_belongs_to_group(j, router_group_name): + return j + return None + + +def get_router_env_for_job( + run_model: RunModel, run_spec: RunSpec, job_model: JobModel +) -> Optional[Union[Dict[str, str], RouterEnvStatus]]: + """Compute env vars exposing the router replica's address to a worker job. + + Returns one of four values, each communicating a distinct outcome: + + None -> not applicable. Either the + run has no router replica + group, or this job IS the + router replica. Caller does + nothing. + RouterEnvStatus.NOT_PROVISIONED -> router job exists but has no + internal_ip yet. Caller defers. + RouterEnvStatus.FAILED -> router job has reached a + terminal state and can never + expose an internal_ip. Caller + terminates this worker; + waiting cannot recover. 
+ {"DSTACK_ROUTER_INTERNAL_IP": ...} -> ready-to-merge env dict + containing the router + replica's internal IP. + """ + router_group = get_router_replica_group(run_spec) + if router_group is None or router_group.name is None: + return None + # DSTACK_ROUTER_INTERNAL_IP is Dynamo-specific. SGLang workers + # are registered via the worker-sync pipeline (ServiceRouterWorkerSyncModel) + if router_group.router is None or router_group.router.type != RouterType.DYNAMO: + return None + if job_belongs_to_group(job_model, router_group.name): + # Router replica itself doesn't need to be told its own IP. + return None + + router_job = find_router_job(run_model, router_group.name) + if router_job is None: + # The router's latest submission is in a terminal state and was + # filtered out by _fetch_run_model's not-terminated predicate. + return RouterEnvStatus.FAILED + + # If the router has reached a terminal state, the worker cannot recover + # by waiting — the router will not come back with a fresh internal_ip + # under the same job. Surface this as FAILED so the caller can stop + # the wait loop and terminate the worker with a clear reason. + if router_job.status == JobStatus.TERMINATING or router_job.status.is_finished(): + return RouterEnvStatus.FAILED + + # Router is alive but may not yet have been assigned a machine. 
+ jpd = get_job_provisioning_data(router_job) + if jpd is None or not jpd.internal_ip: + return RouterEnvStatus.NOT_PROVISIONED + + return {"DSTACK_ROUTER_INTERNAL_IP": jpd.internal_ip} diff --git a/src/dstack/_internal/server/services/runs/router_worker_sync.py b/src/dstack/_internal/server/services/runs/router_worker_sync.py new file mode 100644 index 0000000000..910dc8d576 --- /dev/null +++ b/src/dstack/_internal/server/services/runs/router_worker_sync.py @@ -0,0 +1,640 @@ +"""Reconcile SGLang router /workers with dstack's registered worker replicas (async, SSH-tunneled).""" + +import json +from typing import Any, List, Literal, Optional, TypedDict +from urllib.parse import urlsplit, urlunsplit + +import grpc +from google.protobuf.json_format import MessageToDict +from httpx import ( + AsyncClient, + ConnectError, + ConnectTimeout, + ReadTimeout, + RemoteProtocolError, + Response, +) +from smg_grpc_proto import ( + sglang_scheduler_pb2, + sglang_scheduler_pb2_grpc, + vllm_engine_pb2, + vllm_engine_pb2_grpc, +) +from typing_extensions import NotRequired + +from dstack._internal.core.errors import SSHError +from dstack._internal.core.models.configurations import ReplicaGroup, ServiceConfiguration +from dstack._internal.core.models.runs import JobStatus, RunSpec, get_service_port +from dstack._internal.server.models import JobModel, RunModel +from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_spec +from dstack._internal.server.services.jobs.job_replica_grpc_client import ( + get_service_replica_grpc_client, +) +from dstack._internal.server.services.jobs.job_replica_http_client import ( + get_service_replica_client, +) +from dstack._internal.server.services.logging import fmt +from dstack._internal.utils.logging import get_logger + +from .replicas import job_belongs_to_group +from .service_router_worker_sync import run_spec_has_sglang_router_replica_group + +logger = get_logger(__name__) + +_ROUTER_HTTP = 
"http://dstack" +_ROUTER_HTTP_TIMEOUT = 10.0 +_MAX_SERVER_INFO_RESPONSE_BYTES = 256 * 1024 +_MAX_WORKERS_RESPONSE_BYTES = 2 * 1024 * 1024 +_MAX_WORKERS_COMMAND_ACK_BYTES = 64 * 1024 +_MAX_WORKERS_LIST_ITEMS = 8192 +_GRPC_DISCOVERY_TIMEOUT = 30.0 + + +class _ResponseTooLargeError(Exception): + pass + + +async def _stream_response_body_bytes(resp: Response, max_bytes: int) -> bytes: + buf = bytearray() + async for chunk in resp.aiter_bytes(): + buf.extend(chunk) + if len(buf) > max_bytes: + raise _ResponseTooLargeError() + return bytes(buf) + + +async def _request_json_limited( + client: AsyncClient, + method: str, + url: str, + *, + max_response_bytes: int, + ok_statuses: set[int], + json_body: Optional[dict] = None, + timeout: float = _ROUTER_HTTP_TIMEOUT, +) -> Any: + kwargs: dict[str, Any] = {"timeout": timeout} + if json_body is not None: + kwargs["json"] = json_body + endpoint = f"{method} {url}" + async with client.stream(method, url, **kwargs) as resp: + if resp.status_code not in ok_statuses: + logger.warning( + "router_http unexpected status endpoint=%s status_code=%s expected=%s", + endpoint, + resp.status_code, + sorted(ok_statuses), + ) + return None + cl = resp.headers.get("content-length") + if cl is not None: + try: + if int(cl) > max_response_bytes: + raise _ResponseTooLargeError() + except ValueError: + pass + raw = await _stream_response_body_bytes(resp, max_response_bytes) + try: + return json.loads(raw) + except json.JSONDecodeError: + logger.warning("router_http JSON parse failed endpoint=%s", endpoint) + return None + + +class _TargetWorker(TypedDict): + url: str + worker_type: str + bootstrap_port: NotRequired[Optional[int]] + connection_mode: NotRequired[str] + runtime_type: NotRequired[str] + kv_connector: NotRequired[str] + kv_role: NotRequired[str] + + +class _WorkerPayloadResult(TypedDict): + status: Literal["ready", "not_ready"] + worker: Optional[_TargetWorker] + + +_ConnectionMode = Literal["grpc", "http"] 
+_RuntimeType = Literal["sglang", "vllm"] +_GRPC_RUNTIME_TYPES: tuple[_RuntimeType, ...] = ("sglang", "vllm") + + +def run_model_has_sglang_router_replica_group(run_model: RunModel) -> bool: + run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) + return run_spec_has_sglang_router_replica_group(run_spec) + + +def _get_router_job(run_model: RunModel, router_group: ReplicaGroup) -> Optional[JobModel]: + group_name = router_group.name + assert group_name is not None, "Replica group name is set by validation" + router_jobs = [ + j + for j in run_model.jobs + if job_belongs_to_group(j, group_name) and j.status == JobStatus.RUNNING + ] + if not router_jobs: + return None + # Router replica group is currently validated to have count=1, so we assume a single active + # router job here. When we support multiple router replicas for HA, this should be updated + # to handle syncing across all active router jobs. + return router_jobs[0] + + +def _normalize_worker_url(url: str) -> str: + url = url.strip() + parts = urlsplit(url) + path = (parts.path or "").rstrip("/") + return urlunsplit((parts.scheme, parts.netloc, path, parts.query, parts.fragment)) + + +def _get_connection_mode_from_workers( + current_workers: List[dict], +) -> Optional[_ConnectionMode]: + # PD services register multiple workers (e.g. prefill and decode). We expect + # every listed worker to use the same connection_mode (all grpc or all http), + # not a mix of protocols on one router. + modes: set[str] = set() + for worker in current_workers: + mode = worker.get("connection_mode") + if isinstance(mode, str) and mode in ("http", "grpc"): + modes.add(mode) + if modes == {"grpc"}: + return "grpc" + if modes == {"http"}: + return "http" + return None + + +def _get_runtime_type_from_workers( + current_workers: List[dict], +) -> Optional[_RuntimeType]: + # We expect every listed gRPC worker to share the same runtime_type + # (all sglang or all vllm), not a mix of runtimes on one router. 
+ runtimes: set[str] = set() + for worker in current_workers: + # For HTTP workers,there is no “pick vLLM vs SGLang gRPC stub” step, + # so runtime_type is irrelevant for HTTP workers. + if worker.get("connection_mode") != "grpc": + continue + runtime_type = worker.get("runtime_type") + if isinstance(runtime_type, str) and runtime_type in _GRPC_RUNTIME_TYPES: + runtimes.add(runtime_type) + if runtimes == {"sglang"}: + return "sglang" + if runtimes == {"vllm"}: + return "vllm" + return None + + +def _is_expected_router_workers_fetch_error(error: Exception) -> bool: + """SMG router may not accept HTTP yet during startup.""" + if isinstance( + error, + ( + RemoteProtocolError, + ConnectError, + ConnectTimeout, + ReadTimeout, + TimeoutError, + ), + ): + return True + if isinstance(error, OSError) and error.errno in {61, 111}: + return True + return False + + +def _log_router_workers_fetch_failure(error: Exception) -> None: + if _is_expected_router_workers_fetch_error(error): + logger.debug("Router /workers not ready yet: %r", error) + return + logger.exception("Error getting router /workers") + + +async def _get_router_workers(client: AsyncClient) -> List[dict]: + try: + data = await _request_json_limited( + client, + "GET", + f"{_ROUTER_HTTP}/workers", + max_response_bytes=_MAX_WORKERS_RESPONSE_BYTES, + ok_statuses={200}, + ) + if not isinstance(data, dict): + return [] + workers = data.get("workers", []) + if not isinstance(workers, list): + return [] + if len(workers) > _MAX_WORKERS_LIST_ITEMS: + logger.warning( + "Router /workers list exceeds %s items, truncating", + _MAX_WORKERS_LIST_ITEMS, + ) + workers = workers[:_MAX_WORKERS_LIST_ITEMS] + return [w for w in workers if isinstance(w, dict)] + except _ResponseTooLargeError: + logger.warning("Router /workers response exceeded size limit") + except Exception as e: + _log_router_workers_fetch_failure(e) + return [] + + +async def _add_worker_to_router( + client: AsyncClient, + url: str, + worker_type: str = 
"regular", + bootstrap_port: Optional[int] = None, + *, + connection_mode: Optional[str] = None, + runtime_type: Optional[str] = None, + kv_connector: Optional[str] = None, + kv_role: Optional[str] = None, +) -> bool: + try: + payload: dict = {"url": url, "worker_type": worker_type} + if bootstrap_port is not None: + payload["bootstrap_port"] = bootstrap_port + if connection_mode is not None: + payload["connection_mode"] = connection_mode + if runtime_type is not None: + payload["runtime_type"] = runtime_type + if kv_connector is not None: + payload["kv_connector"] = kv_connector + if kv_role is not None: + payload["kv_role"] = kv_role + body = await _request_json_limited( + client, + "POST", + f"{_ROUTER_HTTP}/workers", + max_response_bytes=_MAX_WORKERS_COMMAND_ACK_BYTES, + ok_statuses={202}, + json_body=payload, + ) + return isinstance(body, dict) and body.get("status") == "accepted" + except _ResponseTooLargeError: + logger.warning("Router add-worker response exceeded size limit for %s", url) + return False + except Exception: + logger.exception("Error adding worker %s", url) + return False + + +async def _remove_worker_from_router_by_id( + client: AsyncClient, worker_id: str, *, worker_url: str +) -> bool: + try: + body = await _request_json_limited( + client, + "DELETE", + f"{_ROUTER_HTTP}/workers/{worker_id}", + max_response_bytes=_MAX_WORKERS_COMMAND_ACK_BYTES, + ok_statuses={202}, + ) + return isinstance(body, dict) and body.get("status") == "accepted" + except _ResponseTooLargeError: + logger.warning("Router remove-worker response exceeded size limit for %s", worker_url) + return False + except Exception: + logger.exception("Error removing worker %s", worker_url) + return False + + +async def _update_workers_in_router_replica( + client: AsyncClient, + target_workers: List[_TargetWorker], + *, + current_workers: List[dict], +) -> None: + current_urls: set[str] = set() + current_ids_by_norm_url: dict[str, str] = {} + for w in current_workers: + u = 
w.get("url") + if not isinstance(u, str) or not u: + continue + norm_u = _normalize_worker_url(u) + current_urls.add(norm_u) + wid = w.get("id") + if isinstance(wid, str) and wid: + current_ids_by_norm_url[norm_u] = wid + target_by_norm = {_normalize_worker_url(t["url"]): t for t in target_workers} + target_urls = set(target_by_norm.keys()) + to_add = sorted(target_urls - current_urls) + to_remove = sorted(current_urls - target_urls) + for norm_url in to_add: + tw = target_by_norm[norm_url] + ok = await _add_worker_to_router( + client, + tw["url"], + tw["worker_type"], + tw.get("bootstrap_port"), + connection_mode=tw.get("connection_mode"), + runtime_type=tw.get("runtime_type"), + kv_connector=tw.get("kv_connector"), + kv_role=tw.get("kv_role"), + ) + if not ok: + logger.warning("Failed to add worker %s, continuing with others", tw["url"]) + for url in to_remove: + wid = current_ids_by_norm_url.get(url) + if not wid: + logger.error("No worker id found for url %s", url) + ok = False + else: + ok = await _remove_worker_from_router_by_id(client, wid, worker_url=url) + if not ok: + logger.warning("Failed to remove worker %s, continuing with others", url) + + +def _vllm_kv_role_to_worker_type(kv_role: str) -> str: + if kv_role == "kv_producer": + return "prefill" + if kv_role == "kv_consumer": + return "decode" + return "regular" + + +def _is_expected_grpc_discovery_error(error: Exception) -> bool: + """Expected while a gRPC worker is still starting or the wrong stub is probed.""" + if isinstance(error, grpc.aio.AioRpcError): + return error.code() in ( + grpc.StatusCode.UNAVAILABLE, + grpc.StatusCode.DEADLINE_EXCEEDED, + grpc.StatusCode.UNIMPLEMENTED, + ) + return False + + +async def _get_http_worker(job_model: JobModel, *, worker_url: str) -> _WorkerPayloadResult: + try: + async with get_service_replica_client(job_model) as client: + data = await _request_json_limited( + client, + "GET", + f"{_ROUTER_HTTP}/server_info", + 
max_response_bytes=_MAX_SERVER_INFO_RESPONSE_BYTES, + ok_statuses={200}, + ) + if isinstance(data, dict): + if data.get("status") != "ready": + return {"status": "not_ready", "worker": None} + mode = data.get("disaggregation_mode", "") + if mode == "prefill": + bootstrap_port = data.get("disaggregation_bootstrap_port") + worker: _TargetWorker = { + "url": worker_url, + "worker_type": "prefill", + "connection_mode": "http", + "runtime_type": "sglang", + } + if bootstrap_port is not None: + worker["bootstrap_port"] = bootstrap_port + return {"status": "ready", "worker": worker} + if mode == "decode": + return { + "status": "ready", + "worker": { + "url": worker_url, + "worker_type": "decode", + "connection_mode": "http", + "runtime_type": "sglang", + }, + } + return { + "status": "ready", + "worker": { + "url": worker_url, + "worker_type": "regular", + "connection_mode": "http", + "runtime_type": "sglang", + }, + } + except _ResponseTooLargeError: + logger.warning("server_info response too large for worker %s", worker_url) + except RemoteProtocolError as e: + logger.debug("HTTP server_info not available for worker %s: %r", worker_url, e) + except Exception as e: + logger.exception("Could not fetch server_info for worker %s: %r", worker_url, e) + return {"status": "not_ready", "worker": None} + + +async def _get_grpc_server_info( + channel: grpc.aio.Channel, + runtime_type: _RuntimeType, +) -> Any: + if runtime_type == "sglang": + stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel) + request = sglang_scheduler_pb2.GetServerInfoRequest() + else: + stub = vllm_engine_pb2_grpc.VllmEngineStub(channel) + request = vllm_engine_pb2.GetServerInfoRequest() + return await stub.GetServerInfo(request, timeout=_GRPC_DISCOVERY_TIMEOUT) + + +async def _discover_grpc_server_info( + channel: grpc.aio.Channel, +) -> tuple[Optional[_RuntimeType], Optional[Any]]: + # Bootstrap only: router workers list has no runtime_type yet. 
+ for runtime_type in _GRPC_RUNTIME_TYPES: + try: + response = await _get_grpc_server_info(channel, runtime_type) + except Exception as e: + if _is_expected_grpc_discovery_error(e): + continue + raise + return runtime_type, response + return None, None + + +def _grpc_server_info_to_worker( + worker_url: str, + runtime_type: _RuntimeType, + response: Any, +) -> _TargetWorker: + if runtime_type == "vllm": + kv_role = response.kv_role or "" + kv_connector = response.kv_connector or "" + worker: _TargetWorker = { + "url": worker_url, + "connection_mode": "grpc", + "runtime_type": runtime_type, + "worker_type": _vllm_kv_role_to_worker_type(kv_role), + } + if kv_connector: + worker["kv_connector"] = kv_connector + if kv_role: + worker["kv_role"] = kv_role + return worker + + server_args = ( + MessageToDict(response.server_args, preserving_proto_field_name=True) + if response.server_args is not None + else {} + ) + mode = server_args.get("disaggregation_mode") + worker_type = mode if mode in ("prefill", "decode") else "regular" + worker = { + "url": worker_url, + "connection_mode": "grpc", + "runtime_type": runtime_type, + "worker_type": worker_type, + } + if worker_type == "prefill": + bootstrap_port = server_args.get("disaggregation_bootstrap_port") + if bootstrap_port is not None: + worker["bootstrap_port"] = int(bootstrap_port) + return worker + + +async def _get_grpc_worker( + job_model: JobModel, + *, + worker_url: str, + runtime_type: Optional[_RuntimeType] = None, +) -> _WorkerPayloadResult: + try: + async with get_service_replica_grpc_client(job_model) as channel: + if runtime_type is not None: + try: + response = await _get_grpc_server_info(channel, runtime_type) + except Exception as e: + if _is_expected_grpc_discovery_error(e): + logger.debug("gRPC worker %s not ready (GetServerInfo)", worker_url) + return {"status": "not_ready", "worker": None} + raise + else: + runtime_type, response = await _discover_grpc_server_info(channel) + if runtime_type is None or 
response is None: + logger.debug("gRPC worker %s not ready (GetServerInfo)", worker_url) + return {"status": "not_ready", "worker": None} + except Exception as e: + logger.exception( + "Could not fetch gRPC GetServerInfo for worker %s: %r", + worker_url, + e, + ) + return {"status": "not_ready", "worker": None} + + worker = _grpc_server_info_to_worker(worker_url, runtime_type, response) + return {"status": "ready", "worker": worker} + + +async def _get_worker( + job_model: JobModel, + *, + http_worker_url: str, + grpc_worker_url: str, + connection_mode: Optional[_ConnectionMode] = None, + runtime_type: Optional[_RuntimeType] = None, +) -> _WorkerPayloadResult: + if connection_mode == "grpc": + return await _get_grpc_worker( + job_model, worker_url=grpc_worker_url, runtime_type=runtime_type + ) + if connection_mode == "http": + return await _get_http_worker(job_model, worker_url=http_worker_url) + # Router workers list is empty and no connection_mode discovered. + try: + result = await _get_http_worker(job_model, worker_url=http_worker_url) + except RemoteProtocolError as e: + logger.debug( + "HTTP server_info probe failed for %s (trying gRPC): %r", + http_worker_url, + e, + ) + result: _WorkerPayloadResult = {"status": "not_ready", "worker": None} + if result["status"] == "ready": + return result + return await _get_grpc_worker(job_model, worker_url=grpc_worker_url, runtime_type=runtime_type) + + +async def _build_target_workers( + run_model: RunModel, + run_spec: RunSpec, + replica_groups: list[ReplicaGroup], + *, + connection_mode: Optional[_ConnectionMode] = None, + runtime_type: Optional[_RuntimeType] = None, +) -> List[_TargetWorker]: + workers: List[_TargetWorker] = [] + config = run_spec.configuration + if not isinstance(config, ServiceConfiguration): + return workers + + for group in replica_groups: + if group.router is not None: + continue + assert group.name is not None, "Replica group name is set by validation" + group_name = group.name + for job in 
run_model.jobs: + if not job_belongs_to_group(job, group_name): + continue + if job.status != JobStatus.RUNNING: + continue + jpd = get_job_provisioning_data(job) + if jpd is None: + continue + ip = jpd.internal_ip or jpd.hostname + if not ip: + continue + job_spec = get_job_spec(job) + port = get_service_port(job_spec, config) + http_worker_url = f"http://{ip}:{port}" + grpc_worker_url = f"grpc://{ip}:{port}" + result = await _get_worker( + job, + http_worker_url=http_worker_url, + grpc_worker_url=grpc_worker_url, + connection_mode=connection_mode, + runtime_type=runtime_type, + ) + if result["status"] == "ready" and result["worker"]: + workers.append(result["worker"]) + elif result["status"] == "not_ready": + logger.debug( + "Worker not ready http=%s grpc=%s", + http_worker_url, + grpc_worker_url, + ) + return workers + + +async def sync_router_workers_for_run_model(run_model: RunModel) -> None: + run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) + config = run_spec.configuration + if not isinstance(config, ServiceConfiguration): + return + replica_groups = config.replica_groups + router_group = next((g for g in replica_groups if g.router is not None), None) + if router_group is None: + return + + router_job = _get_router_job(run_model, router_group) + if router_job is None: + return + try: + async with get_service_replica_client(router_job) as client: + current_workers = await _get_router_workers(client) + # connection_mode can be grpc or http, runtime_type can be sglang or vllm. + connection_mode = _get_connection_mode_from_workers(current_workers) + runtime_type = _get_runtime_type_from_workers(current_workers) + # Empty current_workers on first sync is expected. First syncprobes both connection_mode and + # runtime_type. Subsequent syncs don't need to probe again because connection_mode and runtime_type + # is already set in current_workers. 
+ target_workers = await _build_target_workers( + run_model, + run_spec, + replica_groups, + connection_mode=connection_mode, + runtime_type=runtime_type, + ) + await _update_workers_in_router_replica( + client, target_workers, current_workers=current_workers + ) + except SSHError as e: + logger.warning( + "%s: failed to sync workers with router: %r", + fmt(router_job), + e, + ) diff --git a/src/dstack/_internal/server/services/runs/service_router_worker_sync.py b/src/dstack/_internal/server/services/runs/service_router_worker_sync.py new file mode 100644 index 0000000000..b251e76f92 --- /dev/null +++ b/src/dstack/_internal/server/services/runs/service_router_worker_sync.py @@ -0,0 +1,79 @@ +"""Service-router replica pipeline: detect router groups and ensure sync table rows.""" + +import uuid +from datetime import datetime +from typing import Optional, TypedDict + +from sqlalchemy import select, update +from sqlalchemy.ext.asyncio import AsyncSession + +import dstack._internal.utils.common as common_utils +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.routers import RouterType +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.models import RunModel, ServiceRouterWorkerSyncModel + + +class _SyncRowUpdateMap(TypedDict, total=False): + deleted: bool + last_processed_at: datetime + lock_expires_at: Optional[datetime] + lock_token: Optional[uuid.UUID] + lock_owner: Optional[str] + + +def _reactivate_sync_row_update_map(*, now: datetime) -> _SyncRowUpdateMap: + return { + "deleted": False, + "last_processed_at": now, + "lock_expires_at": None, + "lock_token": None, + "lock_owner": None, + } + + +def run_spec_has_sglang_router_replica_group(run_spec: RunSpec) -> bool: + if run_spec.configuration.type != "service": + return False + cfg = run_spec.configuration + if not isinstance(cfg, ServiceConfiguration): + return False + return any( + g.router is not None and 
async def ensure_service_router_worker_sync_row(
    session: AsyncSession,
    run_model: RunModel,
    run_spec: RunSpec,
) -> None:
    """Make sure the run has an active ``ServiceRouterWorkerSyncModel`` row.

    Does nothing unless ``run_spec`` has an SGLang router replica group.
    Otherwise a missing row is added to the session, a soft-deleted row is
    reactivated, and an active row is left untouched. The session is not
    committed here.

    Args:
        session: Session used for the lookup, the update and the insert.
        run_model: Run whose ``id`` keys the sync row.
        run_spec: Spec inspected for an SGLang router replica group.
    """
    if not run_spec_has_sglang_router_replica_group(run_spec):
        return
    lookup = select(ServiceRouterWorkerSyncModel).where(
        ServiceRouterWorkerSyncModel.run_id == run_model.id
    )
    existing_row = (await session.execute(lookup)).scalar_one_or_none()
    timestamp = common_utils.get_current_datetime()
    if existing_row is None:
        session.add(
            ServiceRouterWorkerSyncModel(
                id=uuid.uuid4(),
                run_id=run_model.id,
                deleted=False,
                created_at=timestamp,
                last_processed_at=timestamp,
            )
        )
        return
    if not existing_row.deleted:
        return
    # If the router replica group is reintroduced in service configuration (via re-apply),
    # reactivate the existing sync row so the background pipeline resumes syncing router workers.
    reactivate_stmt = (
        update(ServiceRouterWorkerSyncModel)
        .where(ServiceRouterWorkerSyncModel.id == existing_row.id)
        .values(**_reactivate_sync_row_update_map(now=timestamp))
    )
    await session.execute(reactivate_stmt)
validate_dstack_resource_name +from dstack._internal.core.services.diff import ModelDiff, diff_models +from dstack._internal.server import settings +from dstack._internal.server.models import UserModel +from dstack._internal.server.services.docker import is_valid_docker_volume_target +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +_UPDATABLE_SPEC_FIELDS = ["configuration_path", "configuration"] +_TYPE_SPECIFIC_UPDATABLE_SPEC_FIELDS = { + "service": [ + # rolling deployment + "repo_data", + "repo_code_hash", + "file_archives", + "working_dir", + ], +} +_CONF_UPDATABLE_FIELDS = ["priority"] +_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = { + "dev-environment": ["inactivity_duration"], + "service": [ + # in-place + "replicas", + "scaling", + # rolling deployment + # NOTE: keep this list in sync with the "Rolling deployment" section in services.md + "port", + "probes", + "resources", + "volumes", + "docker", + "files", + "image", + "user", + "privileged", + "entrypoint", + "working_dir", + "python", + "nvcc", + "single_branch", + "env", + "shell", + "commands", + ], +} + + +def validate_run_spec_and_set_defaults( + user: UserModel, run_spec: RunSpec, legacy_repo_dir: bool = False +): + # This function may set defaults for null run_spec values, + # although most defaults are resolved when building job_spec + # so that we can keep both the original user-supplied value (null in run_spec) + # and the default in job_spec. + # If a property is stored in job_spec - resolve the default there. + # Server defaults are preferable over client defaults so that + # the defaults depend on the server version, not the client version. 
+ if run_spec.run_name is not None: + validate_dstack_resource_name(run_spec.run_name) + _validate_retry_duration(run_spec) + for mount_point in run_spec.configuration.volumes: + if not is_valid_docker_volume_target(mount_point.path): + raise ServerClientError(f"Invalid volume mount path: {mount_point.path}") + if run_spec.repo_id is None and run_spec.repo_data is not None: + raise ServerClientError("repo_data must not be set if repo_id is not set") + if run_spec.repo_id is not None and run_spec.repo_data is None: + raise ServerClientError("repo_id must not be set if repo_data is not set") + # Some run_spec parameters have to be set here and not in the model defaults since + # the client may not pass them or pass null, but they must be always present, e.g. for runner. + if run_spec.repo_id is None: + run_spec.repo_id = DEFAULT_VIRTUAL_REPO_ID + if run_spec.repo_data is None: + run_spec.repo_data = VirtualRunRepoData() + if ( + run_spec.merged_profile.utilization_policy is not None + and run_spec.merged_profile.utilization_policy.time_window + > settings.SERVER_METRICS_RUNNING_TTL_SECONDS + ): + raise ServerClientError( + f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s" + ) + if isinstance(run_spec.configuration, ServiceConfiguration): + if run_spec.merged_profile.schedule and all( + group.count.min == 0 for group in run_spec.configuration.replica_groups + ): + raise ServerClientError( + "Scheduled services with autoscaling to zero are not supported" + ) + if len(run_spec.configuration.probes or []) > settings.MAX_PROBES_PER_JOB: + raise ServerClientError( + f"Cannot configure more than {settings.MAX_PROBES_PER_JOB} probes" + ) + if any( + p.timeout is not None and p.timeout > settings.MAX_PROBE_TIMEOUT + for p in (run_spec.configuration.probes or []) + ): + raise ServerClientError( + f"Probe timeout cannot be longer than {settings.MAX_PROBE_TIMEOUT}s" + ) + if run_spec.configuration.priority is None: + 
def _validate_retry_duration(run_spec: RunSpec) -> None:
    """Reject a negative ``retry.duration`` in the merged profile.

    Only a ``ProfileRetry`` value with a non-null duration is checked; any
    other ``retry`` value passes.

    Raises:
        ServerClientError: If the duration is below zero.
    """
    retry_policy = run_spec.merged_profile.retry
    if not isinstance(retry_policy, ProfileRetry):
        return
    if retry_policy.duration is None:
        return
    if retry_policy.duration < 0:
        raise ServerClientError("retry.duration cannot be negative")
+ """ + current_cfg = current_run_spec.configuration + new_cfg = new_run_spec.configuration + if not isinstance(current_cfg, ServiceConfiguration) or not isinstance( + new_cfg, ServiceConfiguration + ): + return + + current_router_group = next( + (g for g in current_cfg.replica_groups if g.router is not None), None + ) + new_router_group = next((g for g in new_cfg.replica_groups if g.router is not None), None) + current_router_type = ( + current_router_group.router.type + if current_router_group is not None and current_router_group.router is not None + else None + ) + new_router_type = ( + new_router_group.router.type + if new_router_group is not None and new_router_group.router is not None + else None + ) + if ( + current_router_type is not None + and new_router_type is not None + and current_router_type != new_router_type + ): + raise ServerClientError( + "Cannot change router.type in place. Stop the run with `dstack stop` and re-apply." + ) + if RouterType.DYNAMO not in (current_router_type, new_router_type): + return + if current_router_group != new_router_group: + raise ServerClientError( + "Cannot update a Dynamo router replica group in place. " + "Stop the run with `dstack stop` and re-apply." + ) + _router_affecting_top_level_fields = tuple( + f + for f in _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get("service", []) + if f not in ("replicas", "scaling") + ) + for field in _router_affecting_top_level_fields: + if getattr(current_cfg, field, None) != getattr(new_cfg, field, None): + raise ServerClientError( + f"Cannot change top-level `{field}` in place when the " + f"service has a Dynamo router (would re-provision the " + f"router and invalidate workers' cached " + f"DSTACK_ROUTER_INTERNAL_IP). Stop the run with " + f"`dstack stop` and re-apply." 
+ ) + for field in _TYPE_SPECIFIC_UPDATABLE_SPEC_FIELDS.get("service", []): + if getattr(current_run_spec, field, None) != getattr(new_run_spec, field, None): + raise ServerClientError( + f"Cannot change top-level `{field}` in place when the " + f"service has a Dynamo router (would re-provision the " + f"router and invalidate workers' cached " + f"DSTACK_ROUTER_INTERNAL_IP). Stop the run with " + f"`dstack stop` and re-apply." + ) + + +def check_can_update_run_spec(current_run_spec: RunSpec, new_run_spec: RunSpec) -> ModelDiff: + """ + Check if in-place update is possible. + + Returns the diff if it is possible. + Raises ServerClientError otherwise. + """ + spec_diff = diff_models(current_run_spec, new_run_spec) + changed_spec_fields = list(spec_diff.keys()) + updatable_spec_fields = _UPDATABLE_SPEC_FIELDS + _TYPE_SPECIFIC_UPDATABLE_SPEC_FIELDS.get( + new_run_spec.configuration.type, [] + ) + for key in changed_spec_fields: + if key not in updatable_spec_fields: + raise ServerClientError( + f"Failed to update fields {changed_spec_fields}." + f" Can only update {updatable_spec_fields}." + ) + _check_dynamo_in_place_update_compatibility(current_run_spec, new_run_spec) + # We don't allow update if the order of archives has been changed, as even if the archives + # are the same (the same id => hash => content and the same container path), the order of + # unpacking matters when one path is a subpath of another. 
def get_nodes_required_num(run_spec: RunSpec) -> int:
    """Return how many nodes the run needs.

    Tasks need ``configuration.nodes``. Services need the sum of every replica
    group's ``count.min``, with a falsy minimum counted as zero. Any other
    configuration type needs one node.
    """
    configuration = run_spec.configuration
    if configuration.type == "task":
        return configuration.nodes
    if configuration.type == "service":
        total_min_replicas = 0
        for replica_group in configuration.replica_groups:
            total_min_replicas += replica_group.count.min or 0
        return total_min_replicas
    return 1
+ """ + if current.type != new.type: + raise ServerClientError( + f"Configuration type changed from {current.type} to {new.type}, cannot update" + ) + + if isinstance(current, ServiceConfiguration) and isinstance(new, ServiceConfiguration): + current_router_group = next( + (g for g in current.replica_groups if g.router is not None), None + ) + new_router_group = next((g for g in new.replica_groups if g.router is not None), None) + current_router_group_name = ( + current_router_group.name if current_router_group is not None else None + ) + new_router_group_name = new_router_group.name if new_router_group is not None else None + if current_router_group_name != new_router_group_name: + raise ServerClientError( + "Cannot update router replica groups in-place (adding/removing `router` or changing " + "which replica group is the router is not supported). Stop the run and apply again." + ) + updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get( + new.type, [] + ) + if ignore_files: + # We ignore files diff if the file archives are the same. It allows the user to move + # local files/dirs as long as their name(*), content, and the container path stay the same. + # (*) We could also ignore local name changes if the names didn't change in the tarballs. + # Currently, the client preserves the original file/dir name it the tarball, but it could + # use some generic names like "file"/"directory" instead. + updatable_fields.append("files") + if ( + isinstance(current, ServiceConfiguration) + and isinstance(new, ServiceConfiguration) + and current.https in (None, SERVICE_HTTPS_DEFAULT) + and new.https in (None, SERVICE_HTTPS_DEFAULT) + ): + # Allow switching between `https: ` and unset `https`. Has no effect. + updatable_fields.append("https") + diff = diff_models(current, new) + changed_fields = list(diff.keys()) + for key in changed_fields: + if key not in updatable_fields: + raise ServerClientError( + f"Failed to update fields {changed_fields}. 
async def get_project_secrets_mapping(
    session: AsyncSession,
    project: ProjectModel,
) -> Dict[str, str]:
    """Return the project's secrets as a ``name -> plaintext value`` mapping.

    No permission check is performed here.

    Args:
        session: Session used to load the secret models.
        project: Project whose secrets are read.
    """
    plaintext_by_name: Dict[str, str] = {}
    for secret_model in await list_project_secret_models(session=session, project=project):
        plaintext_by_name[secret_model.name] = secret_model.value.get_plaintext_or_error()
    return plaintext_by_name
async def delete_secrets(
    session: AsyncSession,
    project: ProjectModel,
    names: List[str],
    user: UserModel,
):
    """Delete the named secrets from the project, all or nothing.

    Args:
        session: Session used for locking, deletion and the final commit.
        project: Project that owns the secrets.
        names: Names of the secrets to delete.
        user: Acting user; must be allowed to manage the project's secrets.

    Raises:
        ForbiddenError: If ``user`` may not manage secrets in ``project``.
        ResourceNotExistsError: If any of ``names`` does not exist. Nothing is
            deleted in that case.
    """
    _check_can_manage_secrets(user=user, project=project)
    async with get_project_secret_models_by_name_for_update(
        session=session, project=project, names=names
    ) as secret_models:
        existing_names = {s.name for s in secret_models}
        missing_names = set(names) - existing_names
        if missing_names:
            # Sort so the message is deterministic; set iteration order is not.
            raise ResourceNotExistsError(
                f"Secrets not found: {', '.join(sorted(missing_names))}"
            )
        for secret_model in secret_models:
            await session.delete(secret_model)
            events.emit(
                session,
                "Secret deleted",
                actor=events.UserActor.from_user(user),
                targets=[events.Target.from_model(secret_model)],
            )
        # Commit inside the context manager so the locks outlive the commit.
        await session.commit()
@asynccontextmanager
async def get_project_secret_models_by_name_for_update(
    session: AsyncSession, project: ProjectModel, names: list[str]
) -> AsyncGenerator[list[SecretModel], None]:
    """
    Fetch secrets from the database and lock them for update.

    Yields the locked models ordered by id, or an empty list (taking no lock)
    when none of `names` exists in the project.

    **NOTE**: commit changes to the database before exiting from this context manager,
    so that in-memory locks are only released after commit.
    """
    name_filters = (
        SecretModel.project_id == project.id,
        SecretModel.name.in_(names),
    )
    id_rows = await session.execute(select(SecretModel.id).where(*name_filters))
    ids_to_lock = id_rows.scalars().all()
    if not ids_to_lock:
        yield []
        return
    locker = get_locker(get_db().dialect_name)
    async with locker.lock_ctx(SecretModel.__tablename__, sorted(ids_to_lock)):
        # Refetch after lock
        locked_query = (
            select(SecretModel)
            .where(SecretModel.id.in_(ids_to_lock), *name_filters)
            .with_for_update(key_share=True)
            .order_by(SecretModel.id)  # take locks in order
        )
        locked_rows = await session.execute(locked_query)
        yield list(locked_rows.scalars().all())
def _validate_secret_name(name: str):
    """Check that ``name`` matches ``_SECRET_NAME_REGEX`` in full.

    Raises:
        ServerClientError: If the name does not match.
    """
    # fullmatch, not match: with match(), the pattern's trailing `$` also matches
    # just before a final newline, so a name like "token\n" would be accepted.
    if re.fullmatch(_SECRET_NAME_REGEX, name) is None:
        raise ServerClientError(f"Secret name should match regex '{_SECRET_NAME_REGEX}'")
+""" + +from functools import partial +from typing import Optional + +import httpx +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ( + GatewayError, + ResourceNotExistsError, + ServerClientError, + SSHError, +) +from dstack._internal.core.models.configurations import ( + SERVICE_HTTPS_DEFAULT, + EntityReference, + ServiceConfiguration, +) +from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus +from dstack._internal.core.models.routers import ( + AnyServiceRouterConfig, + RouterType, + SGLangServiceRouterConfig, +) +from dstack._internal.core.models.runs import RunSpec, ServiceModelSpec, ServiceSpec +from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.proxy.gateway.const import SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE +from dstack._internal.server import settings +from dstack._internal.server.models import GatewayModel, RunModel +from dstack._internal.server.services import events +from dstack._internal.server.services.gateways import ( + get_gateway_compute_models, + get_gateway_configuration, + get_or_add_gateway_connections, + get_project_default_gateway_model, + get_project_gateway_model_by_reference, +) +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.services.options import get_service_options +from dstack._internal.utils.common import interpolate_gateway_domain +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def register_service(session: AsyncSession, run_model: RunModel, run_spec: RunSpec): + assert isinstance(run_spec.configuration, ServiceConfiguration) + + if isinstance(run_spec.configuration.gateway, EntityReference) or isinstance( + run_spec.configuration.gateway, str + ): + gateway_reference = EntityReference.parse(run_spec.configuration.gateway) + gateway = await get_project_gateway_model_by_reference( + session=session, + 
project=run_model.project, + ref=gateway_reference, + load_gateway_compute=True, + load_backend_type=True, + ) + if gateway is None: + raise ResourceNotExistsError( + f"Gateway {gateway_reference.format()} does not exist" + f" in project {run_model.project.name}" + ) + if gateway.to_be_deleted: + raise ResourceNotExistsError( + f"Gateway {gateway_reference.format()} was marked for deletion" + ) + elif run_spec.configuration.gateway == False: + gateway = None + else: + gateway = await get_project_default_gateway_model( + session=session, + project=run_model.project, + load_gateway_compute=True, + load_backend_type=True, + ) + if gateway is None and run_spec.configuration.gateway == True: + raise ResourceNotExistsError( + "The service requires a gateway, but there is no default gateway in the project" + ) + + if gateway is not None: + service_spec = await _register_service_in_gateway(session, run_model, run_spec, gateway) + run_model.gateway = gateway + elif not settings.FORBID_SERVICES_WITHOUT_GATEWAY: + service_spec = _register_service_in_server(run_model, run_spec) + else: + raise ResourceNotExistsError( + "This dstack-server installation forbids services without a gateway." + " Please configure a gateway." 
+ ) + run_model.service_spec = service_spec.json() + + +async def _register_service_in_gateway( + session: AsyncSession, run_model: RunModel, run_spec: RunSpec, gateway: GatewayModel +) -> ServiceSpec: + assert run_spec.configuration.type == "service" + + if not get_gateway_compute_models(gateway): + raise ServerClientError("Gateway has no instance associated with it") + + if gateway.status != GatewayStatus.RUNNING: + raise ServerClientError("Gateway status is not running") + + if gateway.forbid_new_services: + raise ServerClientError("Gateway does not accept new services") + + gateway_configuration = get_gateway_configuration(gateway) + + has_replica_group_router = any( + g.router is not None for g in run_spec.configuration.replica_groups + ) + if has_replica_group_router and _gateway_has_sglang_router(gateway_configuration): + raise ServerClientError( + "A replica-group `router:` cannot be used with a gateway that has router configuration." + ) + + # Check: service specifies SGLang router but gateway does not have it + service_router = run_spec.configuration.router + service_wants_sglang = service_router is not None and isinstance( + service_router, SGLangServiceRouterConfig + ) + if service_wants_sglang and not _gateway_has_sglang_router(gateway_configuration): + raise ServerClientError( + "Service requires gateway with SGLang router but gateway " + f"'{gateway.name}' does not have the SGLang router configured." + ) + + configure_service_https = _should_configure_service_https_on_gateway( + run_spec, gateway_configuration + ) + show_service_https = _should_show_service_https(run_spec, gateway_configuration) + service_protocol = "https" if show_service_https else "http" + + if ( + not show_service_https + and gateway_configuration.certificate is not None + and gateway_configuration.certificate.type == "acm" + ): + # SSL termination is done globally at load balancer so cannot runs only some services via http. 
+ raise ServerClientError( + "Cannot run HTTP service on gateway with ACM certificates configured" + ) + + if show_service_https and gateway_configuration.certificate is None: + raise ServerClientError( + "Cannot run HTTPS service on gateway with no SSL certificates configured" + ) + + router = _build_service_router_config(gateway_configuration, run_spec.configuration) + + gateway_https = _get_gateway_https(gateway_configuration) + gateway_protocol = "https" if gateway_https else "http" + + wildcard_domain = gateway.wildcard_domain.lstrip("*.") if gateway.wildcard_domain else None + if wildcard_domain is None: + raise ServerClientError("Domain is required for gateway") + wildcard_domain = interpolate_gateway_domain( + domain=wildcard_domain, + run_project_name=run_model.project.name, + exception_type=GatewayError, + ) + service_url = f"{service_protocol}://{run_model.run_name}.{wildcard_domain}" + if isinstance(run_spec.configuration.model, OpenAIChatModel): + model_url = service_url + run_spec.configuration.model.prefix + else: + model_url = f"{gateway_protocol}://gateway.{wildcard_domain}" + service_spec = _get_service_spec( + configuration=run_spec.configuration, + service_url=service_url, + model_url=model_url, + ) + + domain = service_spec.get_domain() + assert domain is not None + + _, connections = await get_or_add_gateway_connections(session, gateway.id) + for conn in connections: + try: + logger.debug("%s: registering service as %s", fmt(run_model), service_spec.url) + async with conn.client() as client: + do_register = partial( + client.register_service, + project=run_model.project.name, + run_name=run_model.run_name, + domain=domain, + service_https=configure_service_https, + gateway_https=gateway_https, + auth=run_spec.configuration.auth, + client_max_body_size=settings.DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE, + options=service_spec.options, + rate_limits=run_spec.configuration.rate_limits, + ssh_private_key=run_model.project.ssh_private_key, + 
has_router_replica=has_replica_group_router, + router=router, + ) + try: + await do_register() + except GatewayError as e: + if e.msg == SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE.format( + ref=f"{run_model.project.name}/{run_model.run_name}" + ): + # Happens if there was a communication issue with the gateway when last (un)registering + logger.warning( + "Service %s/%s is dangling on gateway replica %s, unregistering and re-registering", + run_model.project.name, + run_model.run_name, + conn.ip_address, + ) + await client.unregister_service( + project=run_model.project.name, + run_name=run_model.run_name, + ) + await do_register() + else: + raise + except SSHError: + raise ServerClientError("Gateway tunnel is not working") + except httpx.RequestError as e: + logger.debug("Gateway request failed", exc_info=True) + raise GatewayError(f"Gateway is not working: {e!r}") + + events.emit( + session, + "Service registered in gateway", + actor=events.SystemActor(), + targets=[ + events.Target.from_model(run_model), + events.Target.from_model(gateway), + ], + ) + return service_spec + + +def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> ServiceSpec: + assert run_spec.configuration.type == "service" + if ( + run_spec.configuration.router is not None + and run_spec.configuration.router.type == RouterType.SGLANG + ): + raise ServerClientError( + "Service with SGLang router configuration requires a gateway. " + "Please configure a gateway with the SGLang router enabled." + ) + if run_spec.configuration.https not in ( + None, + "auto", + True, # Default set by pre-0.20.12 clients. TODO(0.21.0?): forbid True too. + ): + raise ServerClientError( + f"Setting `https: {run_spec.configuration.https}` is not allowed without a gateway." 
+ " Please configure a gateway or remove the `https` property from the service configuration" + ) + # Check if any group has autoscaling (min != max) + has_autoscaling = any( + group.count.min != group.count.max for group in run_spec.configuration.replica_groups + ) + if has_autoscaling: + raise ServerClientError( + "Auto-scaling is not supported when running services without a gateway." + " Please configure a gateway or set `replicas` to a fixed value in the service configuration" + ) + if run_spec.configuration.rate_limits: + raise ServerClientError( + "Rate limits are not supported when running services without a gateway." + " Please configure a gateway or remove `rate_limits` from the service configuration" + ) + service_url = f"/proxy/services/{run_model.project.name}/{run_model.run_name}/" + if isinstance(run_spec.configuration.model, OpenAIChatModel): + model_url = service_url.rstrip("/") + run_spec.configuration.model.prefix + else: + model_url = f"/proxy/models/{run_model.project.name}/" + return _get_service_spec( + configuration=run_spec.configuration, + service_url=service_url, + model_url=model_url, + ) + + +def _gateway_has_sglang_router(config: GatewayConfiguration) -> bool: + return config.router is not None and config.router.type == RouterType.SGLANG.value + + +def _build_service_router_config( + gateway_configuration: GatewayConfiguration, + service_configuration: ServiceConfiguration, +) -> Optional[AnyServiceRouterConfig]: + """ + Build router config from gateway (type, policy) + service (pd_disaggregation, policy override). + Service's policy overrides gateway's if present. Keeps backward compat: SGLang enabled + automatically when gateway has it configured. 
+ """ + if not _gateway_has_sglang_router(gateway_configuration): + return None + + gateway_router = gateway_configuration.router + assert gateway_router is not None # ensured by _gateway_has_sglang_router + router_type = gateway_router.type + policy = gateway_router.policy + + service_router = service_configuration.router + if service_router is not None and isinstance(service_router, SGLangServiceRouterConfig): + policy = service_router.policy + pd_disaggregation = service_router.pd_disaggregation + else: + pd_disaggregation = False + + return SGLangServiceRouterConfig( + type=router_type, + policy=policy, + pd_disaggregation=pd_disaggregation, + ) + + +def _get_service_spec( + configuration: ServiceConfiguration, service_url: str, model_url: str +) -> ServiceSpec: + service_spec = ServiceSpec(url=service_url) + if configuration.model is not None: + service_spec.model = ServiceModelSpec( + name=configuration.model.name, + base_url=model_url, + type=configuration.model.type, + ) + service_spec.options = get_service_options(configuration) + return service_spec + + +def _should_configure_service_https_on_gateway( + run_spec: RunSpec, configuration: GatewayConfiguration +) -> bool: + """ + Returns `True` if the gateway needs to serve the service with HTTPS. + May be `False` for HTTPS services, e.g. SSL termination is done on a load balancer. + """ + assert run_spec.configuration.type == "service" + https = run_spec.configuration.https + if https is None: + https = SERVICE_HTTPS_DEFAULT + if https == "auto": + if configuration.certificate is None: + return False + if configuration.certificate.type == "acm": + return False + return True + if not https: + return False + if configuration.certificate is not None and configuration.certificate.type == "acm": + return False + return True + + +def _should_show_service_https(run_spec: RunSpec, configuration: GatewayConfiguration) -> bool: + """ + Returns `True` if the service needs to be accessed via https://. 
+ """ + assert run_spec.configuration.type == "service" + https = run_spec.configuration.https + if https is None: + https = SERVICE_HTTPS_DEFAULT + if https == "auto": + if configuration.certificate is None: + return False + return True + return https + + +def _get_gateway_https(configuration: GatewayConfiguration) -> bool: + if configuration.certificate is not None and configuration.certificate.type == "acm": + return False + if configuration.certificate is not None and configuration.certificate.type == "lets-encrypt": + return True + return False diff --git a/src/dstack/_internal/server/services/services/autoscalers.py b/src/dstack/_internal/server/services/services/autoscalers.py new file mode 100644 index 0000000000..1cc49a14c8 --- /dev/null +++ b/src/dstack/_internal/server/services/services/autoscalers.py @@ -0,0 +1,129 @@ +import datetime +import math +from abc import ABC, abstractmethod +from typing import Optional + +import dstack._internal.utils.common as common_utils +from dstack._internal.core.models.configurations import DEFAULT_SCALING_WINDOW, ScalingSpec +from dstack._internal.core.models.resources import Range +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats + + +class BaseServiceScaler(ABC): + @abstractmethod + def get_desired_count( + self, + current_desired_count: int, + stats: Optional[PerWindowStats], + last_scaled_at: Optional[datetime.datetime], + ) -> int: + """ + Args: + stats: service usage stats + current_desired_count: currently used desired count + last_scaled_at: last time service was scaled, None if it was never scaled yet + + Returns: + desired_count: desired count of replicas + """ + pass + + +class ManualScaler(BaseServiceScaler): + """ + Scales replicas to keep it between `min_replicas` and `max_replicas` + in case `min_replicas` or `max_replicas` change. 
+ """ + + def __init__( + self, + min_replicas: int, + max_replicas: int, + ): + self.min_replicas = min_replicas + self.max_replicas = max_replicas + + def get_desired_count( + self, + current_desired_count: int, + stats: Optional[PerWindowStats], + last_scaled_at: Optional[datetime.datetime], + ) -> int: + # clip the desired count to the min and max values + return min(max(current_desired_count, self.min_replicas), self.max_replicas) + + +class RPSAutoscaler(BaseServiceScaler): + def __init__( + self, + min_replicas: int, + max_replicas: int, + target: float, + window: int, + scale_up_delay: int, + scale_down_delay: int, + ): + self.min_replicas = min_replicas + self.max_replicas = max_replicas + self.target = target + self.window = window + self.scale_up_delay = scale_up_delay + self.scale_down_delay = scale_down_delay + + def get_desired_count( + self, + current_desired_count: int, + stats: Optional[PerWindowStats], + last_scaled_at: Optional[datetime.datetime], + ) -> int: + if not stats: + return current_desired_count + + now = common_utils.get_current_datetime() + + rps = stats[self.window].requests / self.window + new_desired_count = math.ceil(rps / self.target) + # clip the desired count to the min and max values + new_desired_count = min(max(new_desired_count, self.min_replicas), self.max_replicas) + + if new_desired_count > current_desired_count: + if current_desired_count == 0: + # no replicas, scale up immediately + return new_desired_count + if ( + last_scaled_at is not None + and (now - last_scaled_at).total_seconds() < self.scale_up_delay + ): + # too early to scale up, wait for the delay + return current_desired_count + return new_desired_count + elif new_desired_count < current_desired_count: + if ( + last_scaled_at is not None + and (now - last_scaled_at).total_seconds() < self.scale_down_delay + ): + # too early to scale down, wait for the delay + return current_desired_count + return new_desired_count + return new_desired_count + + +def 
get_service_scaler(count: Range[int], scaling: Optional[ScalingSpec]) -> BaseServiceScaler: + assert count.min is not None + assert count.max is not None + if scaling is None: + return ManualScaler( + min_replicas=count.min, + max_replicas=count.max, + ) + if scaling.metric == "rps": + return RPSAutoscaler( + # replicas count validated by configuration model + min_replicas=count.min, + max_replicas=count.max, + target=scaling.target, + window=scaling.window if scaling.window is not None else DEFAULT_SCALING_WINDOW, + scale_up_delay=scaling.scale_up_delay, + scale_down_delay=scaling.scale_down_delay, + ) + raise ValueError(f"No scaler found for scaling parameters {scaling}") diff --git a/src/dstack/_internal/server/services/gateways/options.py b/src/dstack/_internal/server/services/services/options.py similarity index 86% rename from src/dstack/_internal/server/services/gateways/options.py rename to src/dstack/_internal/server/services/services/options.py index 0969f38a4b..3e26be39ba 100644 --- a/src/dstack/_internal/server/services/gateways/options.py +++ b/src/dstack/_internal/server/services/services/options.py @@ -4,13 +4,13 @@ from dstack._internal.core.errors import ServerClientError from dstack._internal.core.models.configurations import ServiceConfiguration -from dstack._internal.core.models.gateways import AnyModel +from dstack._internal.core.models.services import AnyModel def complete_service_model(model_info: AnyModel, env: Dict[str, str]): if model_info.type == "chat" and model_info.format == "tgi": if model_info.chat_template is None or model_info.eos_token is None: - hf_token = env.get("HUGGING_FACE_HUB_TOKEN", None) + hf_token = env.get("HF_TOKEN", env.get("HUGGING_FACE_HUB_TOKEN")) tokenizer_config = get_tokenizer_config(model_info.name, hf_token=hf_token) if model_info.chat_template is None: model_info.chat_template = tokenizer_config[ @@ -35,9 +35,9 @@ def get_tokenizer_config(model_id: str, hf_token: Optional[str] = None) -> dict: if 
resp.status_code == 403: raise ServerClientError("Private HF models are not supported") if resp.status_code == 401: - message = "Failed to access gated model. Specify HUGGING_FACE_HUB_TOKEN env." + message = "Failed to access gated model. Specify HF_TOKEN env." if hf_token is not None: - message = "Failed to access gated model. Invalid HUGGING_FACE_HUB_TOKEN env." + message = "Failed to access gated model. Invalid HF_TOKEN env." raise ServerClientError(message) resp.raise_for_status() except requests.RequestException as e: @@ -48,6 +48,6 @@ def get_tokenizer_config(model_id: str, hf_token: Optional[str] = None) -> dict: def get_service_options(conf: ServiceConfiguration) -> dict: options = {} if conf.model is not None: - complete_service_model(conf.model, env=conf.env) + complete_service_model(conf.model, env=conf.env.as_dict()) options["openai"] = {"model": conf.model.dict()} return options diff --git a/src/dstack/_internal/server/services/ssh.py b/src/dstack/_internal/server/services/ssh.py new file mode 100644 index 0000000000..9d07263885 --- /dev/null +++ b/src/dstack/_internal/server/services/ssh.py @@ -0,0 +1,98 @@ +from collections.abc import Iterable + +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.services.ssh.tunnel import SSH_DEFAULT_OPTIONS, SocketPair, SSHTunnel +from dstack._internal.server.models import JobModel +from dstack._internal.server.services.instances import get_instance_remote_connection_info +from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.path import FileContent + + +def get_container_ssh_credentials(job: JobModel) -> list[tuple[SSHConnectionParams, FileContent]]: + """ + Returns the information needed to connect to the SSH server inside the job container. 
+ + The user of the target host (container) is set to: + * VM-based backends and SSH instances: "root" + * container-based backends: `JobProvisioningData.username`, which is, as of 2026-03-10, + is always "root" on all supported backends (Runpod, Vast.ai, Kubernetes) + + Args: + job: `JobModel` with `project`, `instance` and `instance.project` fields loaded. + + Returns: + A list of hosts credentials as (host's `SSHConnectionParams`, private key's `FileContent`) + pairs ordered from the first proxy jump (if any) to the target host (container). + """ + hosts: list[tuple[SSHConnectionParams, FileContent]] = [] + + instance = get_or_error(job.instance) + + rci = get_instance_remote_connection_info(instance) + if rci is not None and (head_proxy := rci.ssh_proxy) is not None: + head_key = FileContent(get_or_error(get_or_error(rci.ssh_proxy_keys)[0].private)) + hosts.append((head_proxy, head_key)) + + jpd = get_job_provisioning_data(job) + assert jpd is not None + assert jpd.hostname is not None + assert jpd.ssh_port is not None + + job_project_key = FileContent(job.project.ssh_private_key) + + if jpd.dockerized: + instance_proxy = SSHConnectionParams( + hostname=jpd.hostname, + username=jpd.username, + port=jpd.ssh_port, + ) + instance_project_key = FileContent(instance.project.ssh_private_key) + hosts.append((instance_proxy, instance_project_key)) + ssh_port = DSTACK_RUNNER_SSH_PORT + jrd = get_job_runtime_data(job) + if jrd is not None and jrd.ports is not None: + ssh_port = jrd.ports.get(ssh_port, ssh_port) + target_host = SSHConnectionParams( + hostname="localhost", + username="root", + port=ssh_port, + ) + hosts.append((target_host, job_project_key)) + else: + if jpd.ssh_proxy is not None: + # As of 2026-03-13, the only container-based backend with SSH proxy is Kubernetes, + # which is implemented as follows: the jump pod (JobProvisioningData.ssh_proxy) + # is created once per project via Compute.run_job() with a public key submitted as + # a method argument, that 
is, with the public key of the project of the first (within + # that project) job submitted to the cluster. + hosts.append((jpd.ssh_proxy, job_project_key)) + target_host = SSHConnectionParams( + hostname=jpd.hostname, + username=jpd.username, + port=jpd.ssh_port, + ) + hosts.append((target_host, job_project_key)) + + return hosts + + +def container_ssh_tunnel( + job: JobModel, + forwarded_sockets: Iterable[SocketPair] = (), + options: dict[str, str] = SSH_DEFAULT_OPTIONS, +) -> SSHTunnel: + """ + Build SSHTunnel for connecting to the container running the specified job. + """ + hosts = get_container_ssh_credentials(job) + target, identity = hosts[-1] + return SSHTunnel( + destination=f"{target.username}@{target.hostname}", + port=target.port, + ssh_proxies=hosts[:-1], + identity=identity, + forwarded_sockets=forwarded_sockets, + options=options, + ) diff --git a/src/dstack/_internal/server/services/ssh/__init__.py b/src/dstack/_internal/server/services/ssh/__init__.py deleted file mode 100644 index 9043f1d334..0000000000 --- a/src/dstack/_internal/server/services/ssh/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -import asyncio -import os -import subprocess -import tempfile -from typing import Any, Dict, List - -from dstack._internal.core.errors import SSHError -from dstack._internal.core.services.ssh import get_ssh_error -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -class AsyncSSHTunnel: - def __init__(self, user_host: str, id_rsa: str, options: Dict[str, Any], args: List[str]): - # TODO(egor-s): reuse existing SSH control sock (in case of server restart) - self.user_host = user_host - - self._temp_dir = tempfile.TemporaryDirectory() - with open( - self.id_rsa, opener=lambda path, flags: os.open(path, flags, 0o600), mode="w" - ) as f: - f.write(id_rsa) - - self._start_cmd = ["ssh", "-F", "none", "-i", self.id_rsa, "-f", "-N"] - self._start_cmd += ["-M", "-S", self.control_sock_path] - for key, value in options.items(): 
- self._start_cmd += ["-o", f"{key}={value}"] - self._start_cmd += args - self._start_cmd += [user_host] - self._start_cmd = self._interpolate(self._start_cmd) - - self._stop_cmd = ["ssh", "-S", self.control_sock_path, "-O", "exit", user_host] - self._stop_cmd = self._interpolate(self._stop_cmd) - - self._check_cmd = ["ssh", "-S", self.control_sock_path, "-O", "check", user_host] - self._check_cmd = self._interpolate(self._check_cmd) - - self._exec_cmd = ["ssh", "-S", self.control_sock_path, user_host] - - def _interpolate(self, cmd: List[str]) -> List[str]: - data = { - "temp_dir": self.temp_dir, - "id_rsa": self.id_rsa, - "control_sock_path": self.control_sock_path, - } - return [arg.format(**data) for arg in cmd] - - @property - def temp_dir(self) -> str: - return self._temp_dir.name - - @property - def id_rsa(self) -> str: - return os.path.join(self.temp_dir, "id_rsa") - - @property - def control_sock_path(self) -> str: - return os.path.join(self.temp_dir, "control") - - async def start(self): - proc = await asyncio.create_subprocess_exec( - *self._start_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE - ) - _, stderr = await proc.communicate() - if proc.returncode != 0: - # TODO(egor-s): make robust, retry - raise get_ssh_error(stderr) - logger.debug("SSH tunnel `%s` is up", self.user_host) - - async def stop(self): - proc = await asyncio.create_subprocess_exec( - *self._stop_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL - ) - await proc.wait() - - async def check(self) -> bool: - proc = await asyncio.create_subprocess_exec( - *self._check_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL - ) - await proc.wait() - ok = proc.returncode == 0 - # logger.debug("SSH tunnel %s check: %s", self.user_host, "OK" if ok else "FAIL") - return ok - - async def exec(self, command: str) -> str: - proc = await asyncio.create_subprocess_exec( - *self._exec_cmd, command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await 
proc.communicate() - if proc.returncode != 0: - # TODO(egor-s): make robust, retry - raise SSHError(stderr.decode()) - return stdout.decode() diff --git a/src/dstack/_internal/server/services/ssh_fleets/__init__.py b/src/dstack/_internal/server/services/ssh_fleets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/services/ssh_fleets/provisioning.py b/src/dstack/_internal/server/services/ssh_fleets/provisioning.py new file mode 100644 index 0000000000..553b82f852 --- /dev/null +++ b/src/dstack/_internal/server/services/ssh_fleets/provisioning.py @@ -0,0 +1,373 @@ +import io +import json +import time +from contextlib import contextmanager, nullcontext +from textwrap import dedent +from typing import Any, Dict, Generator, List, Optional + +import paramiko +from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib + +from dstack._internal.core.backends.base.compute import ( + DSTACK_SHIM_RESTART_INTERVAL_SECONDS, + GoArchType, + normalize_arch, +) +from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import SSHProvisioningError +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + InstanceType, + Resources, + SSHConnectionParams, +) +from dstack._internal.utils.gpu import ( + convert_amd_gpu_name, + convert_intel_accelerator_name, + convert_nvidia_gpu_name, +) +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +SSH_CONNECT_TIMEOUT = 10 + +DSTACK_SHIM_ENV_FILE = "shim.env" + +HOST_INFO_FILE = "host_info.json" + + +def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType: + cmd = "uname -m" + try: + _, stdout, stderr = client.exec_command(cmd, timeout=20) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"detect_cpu_arch: {e}") from e + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + if err: + raise SSHProvisioningError(f"detect_cpu_arch: {cmd} failed, 
stdout: {out}, stderr: {err}") + try: + return normalize_arch(out) + except ValueError as e: + raise SSHProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e + + +def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None: + try: + sftp = client.open_sftp() + channel = sftp.get_channel() + if channel is not None: + channel.settimeout(10) + sftp.putfo(io.BytesIO(body.encode()), path) + sftp.close() + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"sft_upload failed: {e}") from e + + +def upload_envs(client: paramiko.SSHClient, working_dir: str, envs: Dict[str, str]) -> None: + envs["DSTACK_SERVICE_MODE"] = "1" # make host_info.json on start + dot_env = "\n".join(f'{key}="{value.strip()}"' for key, value in envs.items()) + tmp_file_path = f"/tmp/{DSTACK_SHIM_ENV_FILE}" + sftp_upload(client, tmp_file_path, dot_env) + try: + dest = f"{working_dir}/{DSTACK_SHIM_ENV_FILE}" + cmd = ( + f"sudo mkdir -p {working_dir} && sudo mv {tmp_file_path} {dest}" + f" && {{ sudo chcon system_u:object_r:etc_t:s0 {dest} 2>/dev/null || true; }}" + ) + _, stdout, stderr = client.exec_command(cmd, timeout=20) + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + if out or err: + raise SSHProvisioningError( + f"The command 'upload_envs' didn't work. stdout: {out}, stderr: {err}" + ) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"upload_envs failed: {e}") from e + + +def run_pre_start_commands( + client: paramiko.SSHClient, shim_pre_start_commands: List[str], authorized_keys: List[str] +) -> None: + try: + authorized_keys_content = "\n".join(authorized_keys).strip() + _, stdout, stderr = client.exec_command( + f"echo '\n{authorized_keys_content}' >> ~/.ssh/authorized_keys", timeout=5 + ) + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + if out or err: + raise SSHProvisioningError( + f"The command 'authorized_keys' didn't work. 
stdout: {out}, stderr: {err}" + ) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"upload authorized_keys failed: {e}") from e + + script = " && ".join(shim_pre_start_commands) + try: + _, stdout, stderr = client.exec_command(f"sudo sh -c '{script}'", timeout=120) + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + if out or err: + raise SSHProvisioningError( + f"The command 'run_pre_start_commands' didn't work. stdout: {out}, stderr: {err}" + ) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"run_pre-start_commands failed: {e}") from e + + +def run_shim_as_systemd_service( + client: paramiko.SSHClient, binary_path: str, working_dir: str, dev: bool +) -> None: + # Stop restart attempts after ≈ 1 hour + start_limit_interval_seconds = 3600 + start_limit_burst = int( + start_limit_interval_seconds / DSTACK_SHIM_RESTART_INTERVAL_SECONDS * 0.9 + ) + shim_service = dedent(f"""\ + [Unit] + Description=dstack-shim + After=network-online.target + StartLimitIntervalSec={start_limit_interval_seconds} + StartLimitBurst={start_limit_burst} + + [Service] + Type=simple + User=root + Restart=always + RestartSec={DSTACK_SHIM_RESTART_INTERVAL_SECONDS} + WorkingDirectory={working_dir} + EnvironmentFile={working_dir}/{DSTACK_SHIM_ENV_FILE} + ExecStart={binary_path} + + [Install] + WantedBy=multi-user.target + """) + + sftp_upload(client, "/tmp/dstack-shim.service", shim_service) + + try: + cmd = """\ + sudo mv /tmp/dstack-shim.service /etc/systemd/system/dstack-shim.service && \ + { sudo chcon system_u:object_r:systemd_unit_file_t:s0 /etc/systemd/system/dstack-shim.service 2>/dev/null || true; } && \ + sudo systemctl daemon-reload && \ + sudo systemctl --quiet enable dstack-shim && \ + sudo systemctl restart dstack-shim + """ + _, stdout, stderr = client.exec_command(cmd, timeout=100) + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + if out or err: + raise 
SSHProvisioningError( + f"The command 'run_shim_as_systemd_service' didn't work. stdout: {out}, stderr: {err}" + ) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"run_shim_as_systemd failed: {e}") from e + + +def check_dstack_shim_service(client: paramiko.SSHClient): + try: + _, stdout, _ = client.exec_command("sudo systemctl status dstack-shim.service", timeout=10) + status = stdout.read() + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"Checking dstack-shim.service status failed: {e}") from e + + for raw_line in status.splitlines(): + line = raw_line.decode() + if line.strip().startswith("Active: failed"): + raise SSHProvisioningError(f"The dstack-shim service doesn't start: {line.strip()}") + + +def remove_host_info_if_exists(client: paramiko.SSHClient, working_dir: str) -> None: + file_path = f"{working_dir}/{HOST_INFO_FILE}" + try: + _, _, stderr = client.exec_command( + f"sudo test -e {file_path} && sudo rm {file_path}", timeout=10 + ) + err = stderr.read().decode().strip() + if err: + logger.debug(f"{HOST_INFO_FILE} hasn't been removed: %s", err) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"remove_host_info_if_exists failed: {e}") + + +def remove_dstack_runner_if_exists(client: paramiko.SSHClient, path: str) -> None: + try: + _, _, stderr = client.exec_command(f"sudo test -e {path} && sudo rm {path}", timeout=10) + err = stderr.read().decode().strip() + if err: + logger.debug(f"{path} hasn't been removed: %s", err) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"remove_dstack_runner_if_exists failed: {e}") + + +def get_host_info(client: paramiko.SSHClient, working_dir: str) -> Dict[str, Any]: + # wait host_info + retries = 60 + iter_delay = 3 + for _ in range(retries): + try: + _, stdout, stderr = client.exec_command( + f"sudo cat {working_dir}/{HOST_INFO_FILE}", timeout=10 + ) + err = stderr.read().decode().strip() + if err: + 
logger.debug("Retry after error: %s", err) + time.sleep(iter_delay) + continue + except (paramiko.SSHException, OSError) as e: + logger.debug(f"Cannot run `cat {HOST_INFO_FILE}` in the remote instance: %s", e) + else: + try: + host_info_json = stdout.read() + host_info = json.loads(host_info_json) + return host_info + except ValueError: # JSON parse error + check_dstack_shim_service(client) + raise SSHProvisioningError("Cannot parse host_info") + time.sleep(iter_delay) + else: + check_dstack_shim_service(client) + raise SSHProvisioningError("Cannot get host_info") + + +def get_shim_healthcheck(client: paramiko.SSHClient) -> str: + retries = 20 + iter_delay = 3 + for _ in range(retries): + healthcheck = _get_shim_healthcheck(client) + if healthcheck is not None: + return healthcheck + logger.debug("healthcheck is empty. retry") + time.sleep(iter_delay) + raise SSHProvisioningError("Cannot get HealthcheckResponse") + + +def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]: + try: + _, stdout, stderr = client.exec_command( + f"curl -s http://localhost:{DSTACK_SHIM_HTTP_PORT}/api/healthcheck", timeout=15 + ) + out = stdout.read().strip().decode() + err = stderr.read().strip().decode() + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"get_shim_healthcheck failed: {e}") from e + if err: + raise SSHProvisioningError( + f"get_shim_healthcheck didn't work. 
stdout: {out}, stderr: {err}" + ) + if not out: + return None + return out + + +def host_info_to_instance_type(host_info: Dict[str, Any], arch: GoArchType) -> InstanceType: + gpu_count = host_info.get("gpu_count", 0) + if gpu_count > 0: + gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia")) + gpu_name = host_info["gpu_name"] + if gpu_vendor == AcceleratorVendor.NVIDIA: + gpu_name = convert_nvidia_gpu_name(gpu_name) + elif gpu_vendor == AcceleratorVendor.AMD: + gpu_name = convert_amd_gpu_name(gpu_name) + elif gpu_vendor == AcceleratorVendor.INTEL: + gpu_name = convert_intel_accelerator_name(gpu_name) + gpu_memory_mib = host_info["gpu_memory"] + if isinstance(gpu_memory_mib, str): + # older shim versions report gpu_memory as a string + gpu_memory_mib = float(gpu_memory_mib.lower().replace("mib", "").strip()) + else: + # newer shim versions report gpu_memory as an integer + gpu_memory_mib = float(gpu_memory_mib) + gpu_memory_mib = correct_gpu_memory_gib(gpu_name, gpu_memory_mib) * 1024 + gpus = [Gpu(vendor=gpu_vendor, name=gpu_name, memory_mib=gpu_memory_mib)] * gpu_count + else: + gpus = [] + instance_type = InstanceType( + name="instance", + resources=Resources( + cpu_arch=arch.to_cpu_architecture(), + cpus=host_info["cpus"], + memory_mib=host_info["memory"] / 1024 / 1024, + spot=False, + gpus=gpus, + disk=Disk(size_mib=host_info["disk_size"] / 1024 / 1024), + ), + ) + return instance_type + + +@contextmanager +def get_paramiko_connection( + ssh_user: str, + host: str, + port: int, + pkeys: List[paramiko.PKey], + proxy: Optional[SSHConnectionParams] = None, + proxy_pkeys: Optional[list[paramiko.PKey]] = None, +) -> Generator[paramiko.SSHClient, None, None]: + if proxy is not None: + if proxy_pkeys is None: + raise SSHProvisioningError("Missing proxy private keys") + proxy_ctx = get_paramiko_connection( + proxy.username, proxy.hostname, proxy.port, proxy_pkeys + ) + else: + proxy_ctx = nullcontext() + conn_url = f"{ssh_user}@{host}:{port}" + 
with proxy_ctx as proxy_client, paramiko.SSHClient() as client: + proxy_channel: Optional[paramiko.Channel] = None + if proxy_client is not None: + transport = proxy_client.get_transport() + assert transport is not None + try: + proxy_channel = transport.open_channel("direct-tcpip", (host, port), ("", 0)) + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"Proxy channel failed: {e}") from e + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + for pkey in pkeys: + logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint) + connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel) + if connected: + yield client + return + logger.debug( + f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}' + ) + keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys) + raise SSHProvisioningError( + f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful" + ) + + +def _paramiko_connect( + client: paramiko.SSHClient, + user: str, + host: str, + port: int, + pkey: paramiko.PKey, + channel: Optional[paramiko.Channel] = None, +) -> bool: + """ + Returns `True` if connected, `False` if auth failed, and raises `SSHProvisioningError` + on other errors. 
+ """ + try: + client.connect( + username=user, + hostname=host, + port=port, + pkey=pkey, + look_for_keys=False, + allow_agent=False, + timeout=SSH_CONNECT_TIMEOUT, + sock=channel, + ) + return True + except paramiko.AuthenticationException: + return False + except (paramiko.SSHException, OSError) as e: + raise SSHProvisioningError(f"Connect failed: {e}") from e diff --git a/src/dstack/_internal/server/services/sshproxy/__init__.py b/src/dstack/_internal/server/services/sshproxy/__init__.py new file mode 100644 index 0000000000..aa4621a2c6 --- /dev/null +++ b/src/dstack/_internal/server/services/sshproxy/__init__.py @@ -0,0 +1,32 @@ +from typing import Optional + +from dstack._internal.server import settings +from dstack._internal.server.models import JobModel +from dstack._internal.utils.ssh import build_ssh_command, build_ssh_url_authority + + +def build_proxied_job_ssh_url_authority(job: JobModel) -> Optional[str]: + if not settings.SSHPROXY_ENABLED: + return None + assert settings.SSHPROXY_HOSTNAME is not None + return build_ssh_url_authority( + username=build_proxied_job_upstream_id(job), + hostname=settings.SSHPROXY_HOSTNAME, + port=settings.SSHPROXY_PORT, + ) + + +def build_proxied_job_ssh_command(job: JobModel) -> Optional[list[str]]: + if not settings.SSHPROXY_ENABLED: + return None + assert settings.SSHPROXY_HOSTNAME is not None + return build_ssh_command( + username=build_proxied_job_upstream_id(job), + hostname=settings.SSHPROXY_HOSTNAME, + port=settings.SSHPROXY_PORT, + ) + + +def build_proxied_job_upstream_id(job: JobModel) -> str: + # Job's UUID in lowercase, without dashes + return job.id.hex diff --git a/src/dstack/_internal/server/services/sshproxy/handlers.py b/src/dstack/_internal/server/services/sshproxy/handlers.py new file mode 100644 index 0000000000..713b3c8028 --- /dev/null +++ b/src/dstack/_internal/server/services/sshproxy/handlers.py @@ -0,0 +1,93 @@ +from typing import Optional +from uuid import UUID + +from sqlalchemy import select 
async def get_upstream_response(
    session: AsyncSession,
    upstream_id: str,
) -> Optional[GetUpstreamResponse]:
    """
    Resolve an sshproxy upstream id to the SSH hop chain and the keys allowed to use it.

    Returns `None` when `upstream_id` is not a valid UUID or when no job with that id
    is currently in the `RUNNING` status.
    """
    # The format of upstream_id is intentionally not limited to UUID in the API schema to allow
    # further extensions. Currently, it's just a JobModel.id
    try:
        job_id = UUID(upstream_id)
    except ValueError:
        return None

    # Eagerly load only the columns listed below for the job's project, instance
    # (and the instance's project), run, and the run's user.
    res = await session.execute(
        select(JobModel)
        .where(
            JobModel.id == job_id,
            JobModel.status == JobStatus.RUNNING,
        )
        .options(
            joinedload(JobModel.project, innerjoin=True).load_only(ProjectModel.ssh_private_key),
            (
                joinedload(JobModel.instance, innerjoin=True)
                .load_only(InstanceModel.remote_connection_info)
                .joinedload(InstanceModel.project, innerjoin=True)
                .load_only(ProjectModel.ssh_private_key)
            ),
            (
                joinedload(JobModel.run, innerjoin=True)
                .load_only(RunModel.run_spec, RunModel.user_id)
                .joinedload(RunModel.user, innerjoin=True)
                .load_only(UserModel.ssh_public_key)
            ),
        )
    )
    job = res.scalar_one_or_none()
    if job is None:
        return None

    # One UpstreamHost per (ssh_params, private_key) pair, in the order yielded.
    hosts: list[UpstreamHost] = []
    for ssh_params, private_key in get_container_ssh_credentials(job):
        hosts.append(
            UpstreamHost(
                host=ssh_params.hostname,
                port=ssh_params.port,
                user=ssh_params.username,
                private_key=private_key.content,
            )
        )

    # The username from job runtime data takes precedence over the one from the job spec.
    username: Optional[str] = None
    if (jrd := get_job_runtime_data(job)) is not None:
        username = jrd.username
    if username is None and (job_spec_user := get_job_spec(job).user) is not None:
        username = job_spec_user.username
    if username is not None:
        # NOTE(review): presumably the last host is the job container; this raises IndexError
        # if get_container_ssh_credentials() yields nothing -- confirm it never does.
        hosts[-1].user = username

    res = await session.execute(
        select(UserPublicKeyModel)
        .where(UserPublicKeyModel.user_id == job.run.user_id)
        .options(load_only(UserPublicKeyModel.key))
    )
    # A set removes duplicates between uploaded keys, the run spec key, and the user's own key.
    authorized_keys = {k.key for k in res.scalars().all()}
    if (run_spec_key := get_run_spec(job.run).ssh_key_pub) is not None:
        authorized_keys.add(run_spec_key)
    if (user_key := job.run.user.ssh_public_key) is not None:
        authorized_keys.add(user_key)

    return GetUpstreamResponse(
        hosts=hosts,
        authorized_keys=list(authorized_keys),
    )
def init_default_storage():
    """
    Create the process-wide default storage from server settings.

    Exactly one of `settings.SERVER_S3_BUCKET` / `settings.SERVER_GCS_BUCKET` must be
    set to a non-empty value.

    Raises:
        ValueError: if neither or both buckets are configured, or if the client
            library for the selected backend is not installed.
    """
    global _default_storage
    s3_bucket = settings.SERVER_S3_BUCKET
    gcs_bucket = settings.SERVER_GCS_BUCKET
    # Use truthiness in every check: an empty-string bucket used to pass the
    # `is None` guard and then match neither branch, leaving storage unset.
    if not s3_bucket and not gcs_bucket:
        raise ValueError(
            "Either settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET must be set"
        )
    if s3_bucket and gcs_bucket:
        raise ValueError(
            "Only one of settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET can be set"
        )

    if s3_bucket:
        if not s3.BOTO_AVAILABLE:
            raise ValueError("AWS dependencies are not installed")
        _default_storage = s3.S3Storage(
            bucket=s3_bucket,
            region=settings.SERVER_S3_BUCKET_REGION,
        )
    else:
        # The guards above leave exactly one bucket configured, so this is the GCS case.
        if not gcs.GCS_AVAILABLE:
            raise ValueError("GCS dependencies are not installed")
        _default_storage = gcs.GCSStorage(
            bucket=gcs_bucket,
        )
# GCSStorage is only defined when the google-cloud-storage client can be imported;
# callers check GCS_AVAILABLE before touching it.
GCS_AVAILABLE = True
try:
    from google.cloud import storage
    from google.cloud.exceptions import NotFound
except ImportError:
    GCS_AVAILABLE = False
else:

    class GCSStorage(BaseStorage):
        """`BaseStorage` implementation that keeps blobs in a Google Cloud Storage bucket."""

        def __init__(
            self,
            bucket: str,
        ):
            self._client = storage.Client()
            self._bucket = self._client.bucket(bucket)

        def upload_code(
            self,
            project_name: str,
            repo_id: str,
            code_hash: str,
            blob: bytes,
        ):
            """Store a repo code blob under the key built by `_get_code_key`."""
            key = self._get_code_key(project_name, repo_id, code_hash)
            self._upload(key, blob)

        def get_code(
            self,
            project_name: str,
            repo_id: str,
            code_hash: str,
        ) -> Optional[bytes]:
            """Return the stored code blob, or `None` if it does not exist."""
            key = self._get_code_key(project_name, repo_id, code_hash)
            return self._get(key)

        def upload_archive(
            self,
            user_id: str,
            archive_hash: str,
            blob: bytes,
        ):
            """Store a user file archive under the key built by `_get_archive_key`."""
            key = self._get_archive_key(user_id, archive_hash)
            self._upload(key, blob)

        def get_archive(
            self,
            user_id: str,
            archive_hash: str,
        ) -> Optional[bytes]:
            """Return the stored file archive, or `None` if it does not exist."""
            key = self._get_archive_key(user_id, archive_hash)
            return self._get(key)

        def _upload(self, key: str, blob: bytes):
            blob_obj = self._bucket.blob(key)
            blob_obj.upload_from_string(blob)

        def _get(self, key: str) -> Optional[bytes]:
            """Download the object at `key`; a missing object yields `None`."""
            blob = self._bucket.blob(key)
            try:
                # NotFound is raised by the download request, so that is the call
                # that has to sit inside the `try`.
                return blob.download_as_bytes()
            except NotFound:
                return None
async def list_templates(project: ProjectModel) -> List[UITemplate]:
    """Return templates available for the UI."""
    # The project's own templates repo wins; the server-wide repo is the fallback.
    source_url = project.templates_repo
    if not source_url:
        source_url = settings.SERVER_TEMPLATES_REPO
    if source_url:
        cache_key = _repo_key(project.id, source_url)
        return await run_async(_list_templates_sync, cache_key, source_url)
    return []
@cached(cache=_templates_cache, lock=_templates_lock)
def _list_templates_sync(repo_key: str, repo_url: str) -> List[UITemplate]:
    """
    Fetch the templates repo and parse its templates; results are memoized in `_templates_cache`.

    A git failure is logged and yields an empty list instead of propagating.
    """
    try:
        checkout_dir = _fetch_templates_repo(repo_key, repo_url)
    except git.GitCommandError as git_error:
        exit_code = getattr(git_error, "status", "unknown")
        stderr_text = getattr(git_error, "stderr", "") or ""
        stderr_lines = stderr_text.strip().splitlines()
        # Log only the last stderr line to keep the warning short.
        if stderr_lines:
            last_line = stderr_lines[-1]
        else:
            last_line = "git command failed"
        logger.warning(
            "Failed to fetch templates repo %s (exit_code=%s): %s", repo_url, exit_code, last_line
        )
        return []
    else:
        return _parse_templates(checkout_dir)
def _repo_key(project_id: uuid.UUID, repo_url: str) -> str:
    """Return a deterministic hex id (UUIDv5) for a project / repo URL pair."""
    key_source = f"{project_id}:{repo_url}"
    return uuid.uuid5(uuid.NAMESPACE_URL, key_source).hex


def validate_templates_repo_access(repo_url: str) -> None:
    """
    Check that `HEAD` of `repo_url` can be listed with `git ls-remote`.

    Raises:
        ValueError: if the git command fails; the git error is attached as `__cause__`.
    """
    try:
        git.Git().ls_remote("--exit-code", repo_url, "HEAD")
    except git.GitCommandError as e:
        # Chain explicitly so the underlying git failure is reported as the direct cause.
        raise ValueError(f"Cannot access templates repo: {repo_url}") from e


def invalidate_templates_cache(project_id: uuid.UUID, *repo_urls: Optional[str]) -> None:
    """Drop cached template listings of `project_id` for each non-empty URL in `repo_urls`."""
    unique_repo_urls = {repo_url for repo_url in repo_urls if repo_url}
    with _templates_lock:
        for repo_url in unique_repo_urls:
            # NOTE(review): assumes the cache key is the positional-argument tuple
            # (repo_key, repo_url) that _list_templates_sync is called with -- confirm
            # against the cachetools default key function.
            _templates_cache.pop((_repo_key(project_id, repo_url), repo_url), None)
UserPermissions, + UsersInfoList, + UsersInfoListOrUsersList, + UserTokenCreds, + UserWithCreds, +) +from dstack._internal.server.db import get_db +from dstack._internal.server.models import DecryptedString, MemberModel, UserModel +from dstack._internal.server.services import events +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.permissions import get_default_permissions +from dstack._internal.server.utils.routers import error_forbidden +from dstack._internal.utils import crypto +from dstack._internal.utils.common import get_current_datetime, get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) _ADMIN_USERNAME = "admin" @@ -17,7 +47,10 @@ async def get_or_create_admin_user(session: AsyncSession) -> Tuple[UserModel, bo if admin is not None: return admin, False admin = await create_user( - session=session, username=_ADMIN_USERNAME, global_role=GlobalRole.ADMIN + session=session, + username=_ADMIN_USERNAME, + global_role=GlobalRole.ADMIN, + token=os.getenv("DSTACK_SERVER_ADMIN_TOKEN"), ) return admin, True @@ -25,18 +58,90 @@ async def get_or_create_admin_user(session: AsyncSession) -> Tuple[UserModel, bo async def list_users_for_user( session: AsyncSession, user: UserModel, -) -> List[User]: + return_total_count: bool, + name_pattern: Optional[str], + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> UsersInfoListOrUsersList: if user.global_role == GlobalRole.ADMIN: - return await list_all_users(session=session) - return [user_model_to_user(user)] + return await list_all_users( + session=session, + include_deleted=False, + return_total_count=return_total_count, + name_pattern=name_pattern, + prev_created_at=prev_created_at, + prev_id=prev_id, + limit=limit, + ascending=ascending, + ) + users = [] + if not user.deleted and (name_pattern is None or name_pattern.lower() in user.name.lower()): + 
async def list_all_users(
    session: AsyncSession,
    include_deleted: bool = False,
    return_total_count: bool = False,
    name_pattern: Optional[str] = None,
    prev_created_at: Optional[datetime] = None,
    prev_id: Optional[uuid.UUID] = None,
    limit: int = 2000,
    ascending: bool = False,
) -> UsersInfoListOrUsersList:
    """
    List users with optional name filtering and keyset pagination.

    Args:
        include_deleted: also return users marked as deleted.
        return_total_count: wrap the page in `UsersInfoList` together with the number
            of users matching the filters (ignoring pagination).
        name_pattern: case-insensitive substring the user name must contain.
        prev_created_at, prev_id: position of the last item of the previous page.
        limit: maximum number of users to return.
        ascending: order by `created_at` ascending instead of descending.

    Returns:
        A plain list of users, or `UsersInfoList` when `return_total_count` is set.
    """
    filters = []
    if not include_deleted:
        filters.append(UserModel.deleted == False)
    if name_pattern:
        # Treat the pattern as a literal substring, like the non-admin path in
        # list_users_for_user does. The escape character is escaped first so the
        # escapes added for `%` and `_` are not themselves doubled.
        escaped_pattern = (
            name_pattern.replace("/", "//").replace("%", "/%").replace("_", "/_")
        )
        filters.append(UserModel.name.ilike(f"%{escaped_pattern}%", escape="/"))
    stmt = select(UserModel).where(*filters)

    # Keyset pagination: rows strictly after (prev_created_at, prev_id) in the chosen order.
    # `id` is the tie-breaker and always sorts opposite to `created_at`.
    pagination_filters = []
    if prev_created_at is not None:
        if ascending:
            if prev_id is None:
                pagination_filters.append(UserModel.created_at > prev_created_at)
            else:
                pagination_filters.append(
                    or_(
                        UserModel.created_at > prev_created_at,
                        and_(
                            UserModel.created_at == prev_created_at,
                            UserModel.id < prev_id,
                        ),
                    )
                )
        else:
            if prev_id is None:
                pagination_filters.append(UserModel.created_at < prev_created_at)
            else:
                pagination_filters.append(
                    or_(
                        UserModel.created_at < prev_created_at,
                        and_(
                            UserModel.created_at == prev_created_at,
                            UserModel.id > prev_id,
                        ),
                    )
                )
    order_by = (UserModel.created_at.desc(), UserModel.id)
    if ascending:
        order_by = (UserModel.created_at.asc(), UserModel.id.desc())

    total_count = None
    if return_total_count:
        # Counted before pagination filters are applied.
        res = await session.execute(stmt.with_only_columns(safunc.count(literal_column("1"))))
        total_count = res.scalar_one()
    res = await session.execute(stmt.where(*pagination_filters).order_by(*order_by).limit(limit))
    user_models = res.scalars().all()
    users = [user_model_to_user(u) for u in user_models]
    if total_count is None:
        return users
    return UsersInfoList(total_count=total_count, users=users)
async def refresh_ssh_key(
    session: AsyncSession,
    actor: UserModel,
    username: Optional[str] = None,
) -> Optional[UserModel]:
    """
    Generate a new SSH key pair for a user and store it on the user model.

    Args:
        actor: the user performing the operation.
        username: the user whose key is replaced; defaults to the actor.

    Returns:
        The updated user, or `None` if no such non-deleted user exists.

    Raises:
        The error built by `error_forbidden()` when a non-admin actor targets another user.
    """
    if username is None:
        username = actor.name
    if actor.global_role != GlobalRole.ADMIN and actor.name != username:
        raise error_forbidden()
    async with get_user_model_by_name_for_update(session, username) as user:
        if user is None:
            return None
        # Key generation is run through run_async rather than called inline.
        private_bytes, public_bytes = await run_async(crypto.generate_rsa_key_pair_bytes, username)
        user.ssh_private_key = private_bytes.decode()
        user.ssh_public_key = public_bytes.decode()
        events.emit(
            session,
            "User SSH key refreshed",
            actor=events.UserActor.from_user(actor),
            targets=[events.Target.from_model(user)],
        )
        # Commit before leaving the context manager, as its docstring requires.
        await session.commit()
        return user
async def delete_users(
    session: AsyncSession,
    actor: UserModel,
    usernames: List[str],
):
    """
    Soft-delete the given users and remove their project memberships.

    Each user is marked deleted and inactive, its name is saved to `original_name`,
    and `name` is replaced with a unique `_deleted_...` placeholder.

    Raises:
        ServerClientError: if the admin user is listed, or if any of the requested
            names does not match an existing non-deleted user.
    """
    if _ADMIN_USERNAME in usernames:
        raise ServerClientError(f"User {_ADMIN_USERNAME!r} cannot be deleted")

    filters = [
        UserModel.name.in_(usernames),
        UserModel.deleted == False,
    ]
    res = await session.execute(select(UserModel.id).where(*filters))
    user_ids = list(res.scalars().all())
    user_ids.sort()

    async with get_locker(get_db().dialect_name).lock_ctx(UserModel.__tablename__, user_ids):
        # Refetch after lock
        res = await session.execute(
            select(UserModel)
            .where(UserModel.id.in_(user_ids), *filters)
            .order_by(UserModel.id)  # take locks in order
            .options(load_only(UserModel.id, UserModel.name))
            .with_for_update(key_share=True)
        )
        users = list(res.scalars().all())
        # Compare against distinct names: a request that repeats a username must not be
        # reported as referring to a non-existent user.
        if len(users) != len(set(usernames)):
            raise ServerClientError("Failed to delete non-existent users")
        user_ids = [u.id for u in users]
        timestamp = str(int(get_current_datetime().timestamp()))
        for u in users:
            event_target = events.Target.from_model(u)  # build target before renaming the user
            u.deleted = True
            u.active = False
            u.original_name = u.name
            u.name = f"_deleted_{timestamp}_{secrets.token_hex(8)}"
            events.emit(
                session,
                "User deleted",
                actor=events.UserActor.from_user(actor),
                targets=[event_target],
            )
        await session.execute(delete(MemberModel).where(MemberModel.user_id.in_(user_ids)))
        # Projects are not deleted automatically if owners are deleted.
        await session.commit()
+ """ + + filters = [ + UserModel.name == username, + UserModel.deleted == False, + ] + res = await session.execute(select(UserModel.id).where(*filters)) + user_id = res.scalar_one_or_none() + if user_id is None: + yield None + else: + async with get_locker(get_db().dialect_name).lock_ctx(UserModel.__tablename__, [user_id]): + # Refetch after lock + res = await session.execute( + select(UserModel) + .where(UserModel.id.in_([user_id]), *filters) + .with_for_update(key_share=True) + ) + yield res.scalar_one_or_none() + + +async def log_in_with_token(session: AsyncSession, token: str) -> Optional[UserModel]: + token_hash = get_token_hash(token) + res = await session.execute( + select(UserModel).where( + UserModel.token_hash == token_hash, + UserModel.active == True, + UserModel.deleted == False, + ) + ) + user = res.scalar() + if user is None: + return None + if not user.token.decrypted: + logger.error( + "Failed to get user by token. Token cannot be decrypted: %s", repr(user.token.exc) + ) + return None + if user.token.get_plaintext_or_error() != token: + return None + return user def user_model_to_user(user_model: UserModel) -> User: return User( id=user_model.id, username=user_model.name, + created_at=user_model.created_at, global_role=user_model.global_role, email=user_model.email, + active=user_model.active, + permissions=get_user_permissions(user_model), + ssh_public_key=user_model.ssh_public_key, ) @@ -144,14 +417,45 @@ def user_model_to_user_with_creds(user_model: UserModel) -> UserWithCreds: return UserWithCreds( id=user_model.id, username=user_model.name, + created_at=user_model.created_at, global_role=user_model.global_role, email=user_model.email, - creds=UserTokenCreds(token=user_model.token), + active=user_model.active, + permissions=get_user_permissions(user_model), + ssh_public_key=user_model.ssh_public_key, + creds=UserTokenCreds(token=user_model.token.get_plaintext_or_error()), + ssh_private_key=user_model.ssh_private_key, + ) + + +def 
def validate_username(username: str):
    """Raise `ServerClientError` if `username` is not accepted by `is_valid_username`."""
    if not is_valid_username(username):
        raise ServerClientError("Username should match regex '^[a-zA-Z0-9-_]{1,60}$'")


def is_valid_username(username: str) -> bool:
    """Return `True` if `username` is 1 to 60 ASCII letters, digits, `-` or `_`."""
    # fullmatch instead of match with `$`: `$` also matches just before a trailing
    # newline, which would let "name\n" through.
    return re.fullmatch("[a-zA-Z0-9-_]{1,60}", username) is not None
def switch_volume_status(
    session: AsyncSession,
    volume_model: VolumeModel,
    new_status: VolumeStatus,
    actor: events.AnyActor = events.SystemActor(),
):
    """
    Set `volume_model.status` to `new_status` and emit a status change event.

    Does nothing when the volume already has `new_status`.
    """
    previous_status = volume_model.status
    if previous_status != new_status:
        volume_model.status = new_status
        emit_volume_status_change_event(
            session=session,
            volume_model=volume_model,
            old_status=previous_status,
            new_status=new_status,
            status_message=volume_model.status_message,
            actor=actor,
        )
AsyncSession, + volume_model: VolumeModel, + old_status: VolumeStatus, + new_status: VolumeStatus, + status_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_volume_status_change_message( + old_status=old_status, + new_status=new_status, + status_message=status_message, + ) + events.emit(session, msg, actor=actor, targets=[events.Target.from_model(volume_model)]) + + +def get_volume_status_change_message( + old_status: VolumeStatus, + new_status: VolumeStatus, + status_message: Optional[str], +) -> str: + msg = f"Volume status changed {old_status.upper()} -> {new_status.upper()}" + if status_message is not None: + msg += f" ({status_message})" + return msg + + +async def list_volumes( + session: AsyncSession, + user: UserModel, + project_name: Optional[str], + only_active: bool, + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> List[Volume]: + projects = await list_user_project_models( + session=session, + user=user, + only_names=True, + ) + if project_name is not None: + projects = [p for p in projects if p.name == project_name] + volume_models = await list_projects_volume_models( + session=session, + projects=projects, + only_active=only_active, + prev_created_at=prev_created_at, + prev_id=prev_id, + limit=limit, + ascending=ascending, + ) + return [volume_model_to_volume(v) for v in volume_models] + + +async def list_projects_volume_models( + session: AsyncSession, + projects: List[ProjectModel], + only_active: bool, + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> List[VolumeModel]: + filters = [] + filters.append(VolumeModel.project_id.in_(p.id for p in projects)) + if only_active: + filters.append(VolumeModel.deleted == False) + if prev_created_at is not None: + if ascending: + if prev_id is None: + filters.append(VolumeModel.created_at > prev_created_at) 
+ else: + filters.append( + or_( + VolumeModel.created_at > prev_created_at, + and_(VolumeModel.created_at == prev_created_at, VolumeModel.id < prev_id), + ) + ) + else: + if prev_id is None: + filters.append(VolumeModel.created_at < prev_created_at) + else: + filters.append( + or_( + VolumeModel.created_at < prev_created_at, + and_(VolumeModel.created_at == prev_created_at, VolumeModel.id > prev_id), + ) + ) + order_by = (VolumeModel.created_at.desc(), VolumeModel.id) + if ascending: + order_by = (VolumeModel.created_at.asc(), VolumeModel.id.desc()) + res = await session.execute( + select(VolumeModel) + .where(*filters) + .order_by(*order_by) + .limit(limit) + .options(joinedload(VolumeModel.user)) + .options( + joinedload(VolumeModel.attachments) + .joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + ) + ) + volume_models = list(res.unique().scalars().all()) + return volume_models async def list_project_volumes( @@ -56,8 +205,18 @@ async def list_project_volume_models( filters.append(VolumeModel.name.in_(names)) if not include_deleted: filters.append(VolumeModel.deleted == False) - res = await session.execute(select(VolumeModel).where(*filters)) - return list(res.scalars().all()) + res = await session.execute( + select(VolumeModel) + .where(*filters) + .options(joinedload(VolumeModel.user)) + .options(joinedload(VolumeModel.project)) + .options( + joinedload(VolumeModel.attachments) + .joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + ) + ) + return list(res.unique().scalars().all()) async def get_volume_by_name( @@ -83,42 +242,85 @@ async def get_project_volume_model_by_name( ] if not include_deleted: filters.append(VolumeModel.deleted == False) - res = await session.execute(select(VolumeModel).where(*filters)) - return res.scalar_one_or_none() + res = await session.execute( + select(VolumeModel) + .where(*filters) + .options(joinedload(VolumeModel.user)) + .options(joinedload(VolumeModel.project)) + 
.options( + joinedload(VolumeModel.attachments) + .joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + ) + ) + return res.unique().scalar_one_or_none() async def create_volume( session: AsyncSession, project: ProjectModel, - configuration: VolumeConfiguration, + user: UserModel, + configuration: AnyVolumeConfiguration, + pipeline_hinter: PipelineHinterProtocol, ) -> Volume: + spec = await apply_plugin_policies( + user=user.name, + project=project.name, + # Create pseudo spec until the volume API is updated to accept spec + spec=VolumeSpec(configuration=configuration), + ) + configuration = spec.configuration _validate_volume_configuration(configuration) - if configuration.name is not None: - volume_model = await get_project_volume_model_by_name( - session=session, - project=project, + lock_namespace = f"volume_names_{project.name}" + if is_db_sqlite(): + # Start new transaction to see committed changes after lock + await session.commit() + elif is_db_postgres(): + await session.execute( + select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace))) + ) + lock, _ = get_locker(get_db().dialect_name).get_lockset(lock_namespace) + async with lock: + if configuration.name is not None: + volume_model = await get_project_volume_model_by_name( + session=session, + project=project, + name=configuration.name, + ) + if volume_model is not None: + raise ResourceExistsError() + else: + configuration.name = await generate_volume_name(session=session, project=project) + + now = common.get_current_datetime() + volume_model = VolumeModel( + id=uuid.uuid4(), name=configuration.name, + user_id=user.id, + project=project, + status=VolumeStatus.SUBMITTED, + configuration=configuration.json(), + auto_cleanup_enabled=_get_autocleanup_enabled(configuration), + attachments=[], + created_at=now, + last_processed_at=now, ) - if volume_model is not None: - raise ResourceExistsError() - else: - configuration.name = await 
generate_volume_name(session=session, project=project) - - volume_model = VolumeModel( - id=uuid.uuid4(), - name=configuration.name, - project=project, - status=VolumeStatus.SUBMITTED, - configuration=configuration.json(), - ) - session.add(volume_model) - await session.commit() - await session.refresh(volume_model) - return volume_model_to_volume(volume_model) + session.add(volume_model) + events.emit( + session, + message=f"Volume created. Status: {volume_model.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(volume_model)], + ) + await session.commit() + pipeline_hinter.hint_fetch(VolumeModel.__name__) + return volume_model_to_volume(volume_model) -async def delete_volumes(session: AsyncSession, project: ProjectModel, names: List[str]): +async def delete_volumes( + session: AsyncSession, project: ProjectModel, names: List[str], user: UserModel +): res = await session.execute( select(VolumeModel).where( VolumeModel.project_id == project.id, @@ -128,45 +330,50 @@ async def delete_volumes(session: AsyncSession, project: ProjectModel, names: Li ) volume_models = res.scalars().all() volumes_ids = sorted([v.id for v in volume_models]) + await session.commit() logger.info("Deleting volumes: %s", [v.name for v in volume_models]) - await wait_to_lock_many(PROCESSING_VOLUMES_LOCK, PROCESSING_VOLUMES_IDS, volumes_ids) - try: - # Refetch after lock - res = await session.execute( - select(VolumeModel) - .where( - VolumeModel.project_id == project.id, - VolumeModel.name.in_(names), - VolumeModel.deleted == False, + async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids): + # Retry locking volumes to increase lock acquisition chances. + # This hack is needed until requests are queued. 
+ volume_models = [] + for i in range(10): + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.project_id == project.id, + VolumeModel.id.in_(volumes_ids), + VolumeModel.deleted == False, + VolumeModel.lock_expires_at.is_(None), + ) + .options(selectinload(VolumeModel.attachments)) + .order_by(VolumeModel.id) # take locks in order + .with_for_update(key_share=True, of=VolumeModel) + .execution_options(populate_existing=True) + ) + volume_models = res.scalars().unique().all() + if len(volume_models) == len(volumes_ids): + break + await asyncio.sleep(0.5) + if len(volume_models) != len(volumes_ids): + # TODO: Make the endpoint fully async so we don't need to lock and error. + raise ServerClientError( + "Failed to delete volumes: volumes are being processed currently. Try again later." ) - .options(joinedload(VolumeModel.instances)) - .execution_options(populate_existing=True) - ) - volume_models = res.scalars().unique().all() for volume_model in volume_models: - if len(volume_model.instances) > 0: + if len(volume_model.attachments) > 0: raise ServerClientError( f"Failed to delete volume {volume_model.name}. Volume is in use." 
) for volume_model in volume_models: - try: - await _delete_volume(session=session, project=project, volume_model=volume_model) - except Exception: - logger.exception("Error when deleting volume %s", volume_model.name) - await session.execute( - update(VolumeModel) - .where( - VolumeModel.project_id == project.id, - VolumeModel.id.in_(volumes_ids), - ) - .values( - deleted=True, - deleted_at=common.get_current_datetime(), - ) - ) + if not volume_model.to_be_deleted: + volume_model.to_be_deleted = True + events.emit( + session, + message="Volume marked for deletion", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(volume_model)], + ) await session.commit() - finally: - PROCESSING_VOLUMES_IDS.difference_update(volumes_ids) def volume_model_to_volume(volume_model: VolumeModel) -> Volume: @@ -176,23 +383,42 @@ def volume_model_to_volume(volume_model: VolumeModel) -> Volume: # Initially VolumeProvisionigData lacked backend if vpd is not None and vpd.backend is None: vpd.backend = configuration.backend - return Volume( + attachments = [] + for volume_attachment_model in volume_model.attachments: + instance = volume_attachment_model.instance + attachments.append( + VolumeAttachment( + instance=instance_model_to_volume_instance(instance), + attachment_data=get_attachment_data(volume_attachment_model), + ) + ) + deleted_at = None + if volume_model.deleted_at is not None: + deleted_at = volume_model.deleted_at + volume = Volume( name=volume_model.name, project_name=volume_model.project.name, + user=volume_model.user.name, configuration=configuration, - external=configuration.volume_id is not None, - created_at=volume_model.created_at.replace(tzinfo=timezone.utc), + external=configuration.is_external, + created_at=volume_model.created_at, + last_processed_at=volume_model.last_processed_at, status=volume_model.status, status_message=volume_model.status_message, + deleted=volume_model.deleted, + deleted_at=deleted_at, volume_id=vpd.volume_id if vpd 
is not None else None, provisioning_data=vpd, + attachments=attachments, attachment_data=vad, - volume_model_id=volume_model.id, + id=volume_model.id, ) + volume.cost = _get_volume_cost(volume) + return volume -def get_volume_configuration(volume_model: VolumeModel) -> VolumeConfiguration: - return VolumeConfiguration.__response__.parse_raw(volume_model.configuration) +def get_volume_configuration(volume_model: VolumeModel) -> AnyVolumeConfiguration: + return VolumeConfiguration.__response__.parse_raw(volume_model.configuration).__root__ def get_volume_provisioning_data(volume_model: VolumeModel) -> Optional[VolumeProvisioningData]: @@ -207,26 +433,66 @@ def get_volume_attachment_data(volume_model: VolumeModel) -> Optional[VolumeAtta return VolumeAttachmentData.__response__.parse_raw(volume_model.volume_attachment_data) +def get_attachment_data( + volume_attachment_model: VolumeAttachmentModel, +) -> Optional[VolumeAttachmentData]: + if volume_attachment_model.attachment_data is None: + return None + return VolumeAttachmentData.__response__.parse_raw(volume_attachment_model.attachment_data) + + +def instance_model_to_volume_instance(instance_model: InstanceModel) -> VolumeInstance: + instance_id = None + jpd = get_instance_provisioning_data(instance_model) + if jpd is not None: + instance_id = jpd.instance_id + return VolumeInstance( + name=instance_model.name, + fleet_name=instance_model.fleet.name if instance_model.fleet else None, + instance_num=instance_model.instance_num, + instance_id=instance_id, + ) + + async def generate_volume_name(session: AsyncSession, project: ProjectModel) -> str: - volume_models = await list_project_volume_models(session=session, project=project) - names = {v.name for v in volume_models} + res = await session.execute( + select(VolumeModel.name).where( + VolumeModel.project_id == project.id, + VolumeModel.deleted == False, + ) + ) + names = set(res.scalars().all()) while True: name = random_names.generate_name() if name not in names: 
return name -def _validate_volume_configuration(configuration: VolumeConfiguration): - if configuration.volume_id is None and configuration.size is None: - raise ServerClientError("Volume must specify either volume_id or size") +def _validate_volume_configuration(configuration: AnyVolumeConfiguration): + if configuration.external_volume_id is None and configuration.size is None: + raise ServerClientError("Volume must specify either existing identifier or size") + backends_services.check_backend_type_available(configuration.backend) if configuration.backend not in BACKENDS_WITH_VOLUMES_SUPPORT: raise ServerClientError( - f"Volumes are not supported for {configuration.backend.value} backend. " - f"Supported backends: {[b.value for b in BACKENDS_WITH_VOLUMES_SUPPORT]}." + f"Volumes are not supported for {configuration.backend.value} backend." + f" Available backends with volumes support: {[b.value for b in BACKENDS_WITH_VOLUMES_SUPPORT]}." ) if configuration.name is not None: validate_dstack_resource_name(configuration.name) + if configuration.is_external and configuration.auto_cleanup_duration is not None: + if ( + isinstance(configuration.auto_cleanup_duration, int) + and configuration.auto_cleanup_duration > 0 + ) or ( + isinstance(configuration.auto_cleanup_duration, str) + and configuration.auto_cleanup_duration not in ("off", "-1") + ): + raise ServerClientError( + "External volumes do not support auto_cleanup_duration. " + "Auto-cleanup only works for volumes created and managed by dstack." + ) + async def _delete_volume(session: AsyncSession, project: ProjectModel, volume_model: VolumeModel): volume = volume_model_to_volume(volume_model) @@ -234,9 +500,7 @@ async def _delete_volume(session: AsyncSession, project: ProjectModel, volume_mo return if volume.provisioning_data is None: - logger.error( - f"Failed to delete volume {volume_model.name}. volume.provisioning_data is None." 
- ) + # The volume wasn't provisioned so there is nothing to delete return if volume.provisioning_data.backend is None: logger.error( @@ -255,7 +519,34 @@ async def _delete_volume(session: AsyncSession, project: ProjectModel, volume_mo ) return - await run_async( - backend.compute().delete_volume, + compute = backend.compute() + assert isinstance(compute, ComputeWithVolumeSupport) + await common.run_async( + compute.delete_volume, volume=volume, ) + + +# Clouds charge volumes assuming 30-day months, e.g. https://aws.amazon.com/ebs/pricing/ +_VOLUME_PRICING_PERIOD = timedelta(days=30) + + +def _get_volume_cost(volume: Volume) -> float: + if volume.provisioning_data is None or volume.provisioning_data.price is None: + return 0.0 + finished_at = common.get_current_datetime() + if volume.deleted_at: + finished_at = volume.deleted_at + elif not volume.status.is_active(): + finished_at = volume.last_processed_at + volume_age = finished_at - volume.created_at + return ( + volume_age.total_seconds() + * volume.provisioning_data.price + / _VOLUME_PRICING_PERIOD.total_seconds() + ) + + +def _get_autocleanup_enabled(configuration: AnyVolumeConfiguration) -> bool: + auto_cleanup_duration = parse_duration(configuration.auto_cleanup_duration) + return auto_cleanup_duration is not None and auto_cleanup_duration > 0 diff --git a/src/dstack/_internal/server/settings.py b/src/dstack/_internal/server/settings.py index b410d9c4b0..27a97a6db5 100644 --- a/src/dstack/_internal/server/settings.py +++ b/src/dstack/_internal/server/settings.py @@ -1,14 +1,26 @@ +""" +Environment variables read by the dstack server. 
Documented in reference/env.md +""" + import os +from enum import Enum from pathlib import Path +from dstack._internal.server.utils.settings import parse_hostname_port +from dstack._internal.utils.env import environ +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + DSTACK_DIR_PATH = Path("~/.dstack/").expanduser() -SERVER_DIR_PATH = Path(os.getenv("DSTACK_SERVER_DIR", DSTACK_DIR_PATH / "server")) +SERVER_DIR_PATH = Path(os.getenv("DSTACK_SERVER_DIR", DSTACK_DIR_PATH / "server")).resolve() SERVER_CONFIG_FILE_PATH = SERVER_DIR_PATH / "config.yml" SERVER_DATA_DIR_PATH = SERVER_DIR_PATH / "data" SERVER_DATA_DIR_PATH.mkdir(parents=True, exist_ok=True) + DATABASE_URL = os.getenv( "DSTACK_DATABASE_URL", f"sqlite+aiosqlite:///{str(SERVER_DATA_DIR_PATH.absolute())}/sqlite.db" ) @@ -27,16 +39,85 @@ "DSTACK_ALEMBIC_MIGRATIONS_LOCATION", "dstack._internal.server:migrations" ) +# Users may want to decrease client pool size to run on small DB instances +# or increase client pool size to support more concurrent requests. 
+DB_POOL_SIZE = int(os.getenv("DSTACK_DB_POOL_SIZE", 20)) +DB_MAX_OVERFLOW = int(os.getenv("DSTACK_DB_MAX_OVERFLOW", 20)) + +SERVER_BACKGROUND_PROCESSING_DISABLED = ( + os.getenv("DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED") is not None +) +SERVER_BACKGROUND_PROCESSING_ENABLED = not SERVER_BACKGROUND_PROCESSING_DISABLED + +SERVER_EXECUTOR_MAX_WORKERS = int(os.getenv("DSTACK_SERVER_EXECUTOR_MAX_WORKERS", 128)) + +MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 25)) +MAX_PROBES_PER_JOB = int(os.getenv("DSTACK_SERVER_MAX_PROBES_PER_JOB", 10)) +MAX_PROBE_TIMEOUT = int(os.getenv("DSTACK_SERVER_MAX_PROBE_TIMEOUT", 60 * 5)) + SERVER_CONFIG_DISABLED = os.getenv("DSTACK_SERVER_CONFIG_DISABLED") is not None SERVER_CONFIG_ENABLED = not SERVER_CONFIG_DISABLED -SERVER_BUCKET = os.getenv("DSTACK_SERVER_BUCKET") -SERVER_BUCKET_REGION = os.getenv("DSTACK_SERVER_BUCKET_REGION", "eu-west-1") +# TODO: remove deprecated DSTACK_SERVER_BUCKET and DSTACK_SERVER_BUCKET_REGION env var usage +SERVER_S3_BUCKET = os.getenv("DSTACK_SERVER_S3_BUCKET", os.getenv("DSTACK_SERVER_BUCKET")) +SERVER_S3_BUCKET_REGION = os.getenv( + "DSTACK_SERVER_S3_BUCKET_REGION", os.getenv("DSTACK_SERVER_BUCKET_REGION") +) + +SERVER_GCS_BUCKET = os.getenv("DSTACK_SERVER_GCS_BUCKET") + +SERVER_CLOUDWATCH_LOG_GROUP = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_GROUP") +SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION") + +SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT") + +SERVER_FLUENTBIT_HOST = os.getenv("DSTACK_SERVER_FLUENTBIT_HOST") +SERVER_FLUENTBIT_PORT = int(os.getenv("DSTACK_SERVER_FLUENTBIT_PORT", "24224")) +SERVER_FLUENTBIT_PROTOCOL = os.getenv("DSTACK_SERVER_FLUENTBIT_PROTOCOL", "forward") +SERVER_FLUENTBIT_TAG_PREFIX = os.getenv("DSTACK_SERVER_FLUENTBIT_TAG_PREFIX", "dstack") + +SERVER_ELASTICSEARCH_HOST = os.getenv("DSTACK_SERVER_ELASTICSEARCH_HOST") +SERVER_ELASTICSEARCH_INDEX = os.getenv("DSTACK_SERVER_ELASTICSEARCH_INDEX", 
"dstack-logs") +SERVER_ELASTICSEARCH_API_KEY = os.getenv("DSTACK_SERVER_ELASTICSEARCH_API_KEY") + +SERVER_METRICS_RUNNING_TTL_SECONDS = environ.get_int( + "DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS", default=3600 +) +SERVER_METRICS_FINISHED_TTL_SECONDS = environ.get_int( + "DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", default=7 * 24 * 3600 +) +SERVER_INSTANCE_HEALTH_TTL_SECONDS = environ.get_int( + "DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS", default=7 * 24 * 3600 +) +SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS = environ.get_int( + "DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS", default=60 +) + +SERVER_EVENTS_TTL_SECONDS = int( + # default documented in reference/env.md, keep in sync + os.getenv("DSTACK_SERVER_EVENTS_TTL_SECONDS", 30 * 24 * 3600) +) + +SSHPROXY_API_TOKEN = environ.get("DSTACK_SSHPROXY_API_TOKEN") or None +SSHPROXY_HOSTNAME, SSHPROXY_PORT = environ.get_callback( + "DSTACK_SERVER_SSHPROXY_ADDRESS", parse_hostname_port, default=(None, None) +) +SSHPROXY_ENABLED = SSHPROXY_API_TOKEN is not None and SSHPROXY_HOSTNAME is not None +SSHPROXY_ENFORCED = os.getenv("DSTACK_SERVER_SSHPROXY_ENFORCED") is not None +if SSHPROXY_ENFORCED and not SSHPROXY_ENABLED: + logger.warning("sshproxy is not enabled, ignoring DSTACK_SERVER_SSHPROXY_ENFORCED") + SSHPROXY_ENFORCED = False + +SERVER_KEEP_SHIM_TASKS = os.getenv("DSTACK_SERVER_KEEP_SHIM_TASKS") is not None DEFAULT_PROJECT_NAME = "main" SENTRY_DSN = os.getenv("DSTACK_SENTRY_DSN") SENTRY_TRACES_SAMPLE_RATE = float(os.getenv("DSTACK_SENTRY_TRACES_SAMPLE_RATE", 0.1)) +SENTRY_TRACES_BACKGROUND_SAMPLE_RATE = float( + os.getenv("DSTACK_SENTRY_TRACES_BACKGROUND_SAMPLE_RATE", 0.01) +) +SENTRY_PROFILES_SAMPLE_RATE = float(os.getenv("DSTACK_SENTRY_PROFILES_SAMPLE_RATE", 0)) DEFAULT_CREDS_DISABLED = os.getenv("DSTACK_DEFAULT_CREDS_DISABLED") is not None DEFAULT_CREDS_ENABLED = not DEFAULT_CREDS_DISABLED @@ -44,16 +125,59 @@ ACME_SERVER = os.getenv("DSTACK_ACME_SERVER") ACME_EAB_KID = 
os.getenv("DSTACK_ACME_EAB_KID") ACME_EAB_HMAC_KEY = os.getenv("DSTACK_ACME_EAB_HMAC_KEY") +DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE = int( + os.getenv("DSTACK_DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE", 64 * 1024 * 1024) +) + +SERVER_DEFAULT_DOCKER_REGISTRY = os.getenv("DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY") or None +SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME = ( + os.getenv("DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME") or None +) +SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD = ( + os.getenv("DSTACK_SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD") or None +) USER_PROJECT_DEFAULT_QUOTA = int(os.getenv("DSTACK_USER_PROJECT_DEFAULT_QUOTA", 10)) +FORBID_SERVICES_WITHOUT_GATEWAY = os.getenv("DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY") is not None +SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20)) + +SERVER_TEMPLATES_REPO = os.getenv("DSTACK_SERVER_TEMPLATES_REPO") + +# Per-job log quota: maximum bytes of log output per calendar hour. 0 = unlimited. +SERVER_LOG_QUOTA_PER_JOB_HOUR = int( + os.getenv("DSTACK_SERVER_LOG_QUOTA_PER_JOB_HOUR", 50 * 1024 * 1024) # 50 MB +) + +SERVER_SSH_POOL_DISABLED = os.getenv("DSTACK_SERVER_SSH_POOL_DISABLED") is not None +SERVER_SSH_POOL_ENABLED = not SERVER_SSH_POOL_DISABLED +SERVER_SSH_CONNECT_TIMEOUT = int(os.getenv("DSTACK_SERVER_SSH_CONNECT_TIMEOUT", 3)) # Development settings SQL_ECHO_ENABLED = os.getenv("DSTACK_SQL_ECHO_ENABLED") is not None -LOCAL_BACKEND_ENABLED = os.getenv("DSTACK_LOCAL_BACKEND_ENABLED") is not None +SERVER_PROFILING_ENABLED = os.getenv("DSTACK_SERVER_PROFILING_ENABLED") is not None UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_UPDATE_DEFAULT_PROJECT") is not None DO_NOT_UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_DO_NOT_UPDATE_DEFAULT_PROJECT") is not None -SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE", None) is not None +SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE") is not None +ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None + + 
+class JobNetworkMode(Enum): + # "host" for multinode runs only, "bridge" otherwise. Opt-in new defaut + HOST_FOR_MULTINODE_ONLY = 1 + # "bridge" if the job occupies only a part of the instance, "host" otherswise. Current default + HOST_WHEN_POSSIBLE = 2 + # Always "bridge", even for multinode runs. Same as legacy DSTACK_FORCE_BRIDGE_NETWORK=true + FORCED_BRIDGE = 3 + + +DEFAULT_JOB_NETWORK_MODE = JobNetworkMode.HOST_WHEN_POSSIBLE +JOB_NETWORK_MODE = environ.get_enum( + "DSTACK_SERVER_JOB_NETWORK_MODE", + JobNetworkMode, + value_type=int, + default=DEFAULT_JOB_NETWORK_MODE, +) diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index 543001d024..2c0a66be5a 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -1,95 +1,219 @@ import json import uuid +from collections.abc import Callable +from contextlib import contextmanager from datetime import datetime, timezone -from typing import Dict, Optional +from typing import Any, Dict, List, Literal, Optional, Union from uuid import UUID +import gpuhunt +from sqlalchemy import delete, select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithCreateInstanceSupport, + ComputeWithGatewaySupport, + ComputeWithGroupProvisioningSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivateGatewaySupport, + ComputeWithPrivilegedSupport, + ComputeWithReservationSupport, + ComputeWithVolumeSupport, +) from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import NetworkMode +from dstack._internal.core.models.compute_groups import ( + ComputeGroupProvisioningData, + ComputeGroupStatus, +) from dstack._internal.core.models.configurations import ( AnyRunConfiguration, 
DevEnvironmentConfiguration, ) -from dstack._internal.core.models.gateways import GatewayStatus -from dstack._internal.core.models.instances import InstanceConfiguration, InstanceType, Resources +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.fleets import ( + FleetConfiguration, + FleetNodesSpec, + FleetSpec, + FleetStatus, + InstanceGroupPlacement, + SSHHostParams, + SSHParams, +) +from dstack._internal.core.models.gateways import GatewayComputeConfiguration, GatewayStatus +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, + InstanceStatus, + InstanceType, + RemoteConnectionInfo, + Resources, + SSHConnectionParams, + SSHKey, +) +from dstack._internal.core.models.placement import ( + PlacementGroupConfiguration, + PlacementGroupProvisioningData, + PlacementStrategy, +) from dstack._internal.core.models.profiles import ( - DEFAULT_POOL_NAME, - DEFAULT_POOL_TERMINATION_IDLE_TIME, + DEFAULT_FLEET_TERMINATION_IDLE_TIME, Profile, + TerminationPolicy, ) +from dstack._internal.core.models.repos import AnyRunRepoData from dstack._internal.core.models.repos.base import RepoType from dstack._internal.core.models.repos.local import LocalRunRepoData -from dstack._internal.core.models.resources import Memory, ResourcesSpec +from dstack._internal.core.models.resources import CPUSpec, Memory, ResourcesSpec from dstack._internal.core.models.runs import ( - InstanceStatus, JobProvisioningData, + JobRuntimeData, JobStatus, JobTerminationReason, Requirements, RunSpec, RunStatus, + RunTerminationReason, ) from dstack._internal.core.models.users import GlobalRole from dstack._internal.core.models.volumes import ( + AnyVolumeConfiguration, + KubernetesVolumeConfiguration, + Volume, + VolumeAttachment, VolumeConfiguration, VolumeProvisioningData, VolumeStatus, ) from 
dstack._internal.server.models import ( BackendModel, + CodeModel, + ComputeGroupModel, + DecryptedString, + EventModel, + ExportedFleetModel, + ExportedGatewayModel, + ExportModel, + FileArchiveModel, + FleetModel, GatewayComputeModel, GatewayModel, + ImportModel, + InstanceHealthCheckModel, InstanceModel, + JobMetricsPoint, JobModel, - PoolModel, + JobPrometheusMetrics, + PlacementGroupModel, + ProbeModel, ProjectModel, + RepoCredsModel, RepoModel, RunModel, + SecretModel, UserModel, + UserPublicKeyModel, + VolumeAttachmentModel, VolumeModel, ) from dstack._internal.server.services.jobs import get_job_specs_from_run_spec +from dstack._internal.server.services.permissions import ( + DefaultPermissions, + get_default_permissions, + set_default_permissions, +) +from dstack._internal.server.services.users import get_token_hash -def get_auth_headers(token: str) -> Dict: +def get_auth_headers(token: Union[DecryptedString, str]) -> Dict: + if isinstance(token, DecryptedString): + token = token.get_plaintext_or_error() return {"Authorization": f"Bearer {token}"} async def create_user( session: AsyncSession, name: str = "test_user", + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), global_role: GlobalRole = GlobalRole.ADMIN, token: Optional[str] = None, email: Optional[str] = None, + ssh_public_key: Optional[str] = None, + ssh_private_key: Optional[str] = None, + active: bool = True, + deleted: bool = False, ) -> UserModel: if token is None: token = str(uuid.uuid4()) user = UserModel( name=name, + created_at=created_at, global_role=global_role, - token=token, + token=DecryptedString(plaintext=token), + token_hash=get_token_hash(token), email=email, + ssh_public_key=ssh_public_key, + ssh_private_key=ssh_private_key, + active=active, + deleted=deleted, ) session.add(user) await session.commit() return user +async def create_user_public_key( + session: AsyncSession, + user: UserModel, + name: str = "test-key", + type: str = "ssh-ed25519", + 
fingerprint: str = "SHA256:testfingerprint", + key: str = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5", + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), +) -> UserPublicKeyModel: + user_public_key = UserPublicKeyModel( + user=user, + name=name, + type=type, + fingerprint=fingerprint, + key=key, + created_at=created_at, + ) + session.add(user_public_key) + await session.commit() + return user_public_key + + async def create_project( session: AsyncSession, owner: Optional[UserModel] = None, name: str = "test_project", + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), ssh_private_key: str = "", ssh_public_key: str = "", + is_public: bool = False, + templates_repo: Optional[str] = None, + deleted: bool = False, ) -> ProjectModel: if owner is None: owner = await create_user(session=session, name="test_owner") project = ProjectModel( name=name, owner_id=owner.id, + created_at=created_at, ssh_private_key=ssh_private_key, ssh_public_key=ssh_public_key, + is_public=is_public, + templates_repo=templates_repo, + deleted=deleted, ) session.add(project) await session.commit() @@ -102,6 +226,8 @@ async def create_backend( backend_type: BackendType = BackendType.AWS, config: Optional[Dict] = None, auth: Optional[Dict] = None, + source_config: Optional[Dict] = None, + source_auth: Optional[Dict] = None, ) -> BackendModel: if config is None: config = { @@ -117,7 +243,11 @@ async def create_backend( project_id=project_id, type=backend_type, config=json.dumps(config), - auth=json.dumps(auth), + auth=DecryptedString(plaintext=json.dumps(auth)), + source_config=None if source_config is None else json.dumps(source_config), + source_auth=( + None if source_auth is None else DecryptedString(plaintext=json.dumps(source_auth)) + ), ) session.add(backend) await session.commit() @@ -135,47 +265,95 @@ async def create_repo( if info is None: info = { "repo_type": "remote", - "repo_host_name": "github.com", - "repo_port": None, - "repo_user_name": 
"dstackai", "repo_name": "dstack", } - if creds is None: - creds = { - "protocol": "https", - "private_key": None, - "oauth_token": "test_token", - } repo = RepoModel( project_id=project_id, name=repo_name, type=repo_type, info=json.dumps(info), - creds=json.dumps(creds), + creds=json.dumps(creds) if creds is not None else None, ) session.add(repo) await session.commit() return repo +async def create_code( + session: AsyncSession, + repo: RepoModel, + blob_hash: str = "blob_hash", + blob: Optional[bytes] = b"blob_content", +) -> CodeModel: + code = CodeModel( + repo_id=repo.id, + blob_hash=blob_hash, + blob=blob, + ) + session.add(code) + await session.commit() + return code + + +async def create_repo_creds( + session: AsyncSession, + repo_id: UUID, + user_id: UUID, + creds: Optional[dict] = None, +) -> RepoCredsModel: + if creds is None: + creds = { + "clone_url": "https://github.com/dstackai/dstack.git", + "private_key": None, + "oauth_token": "test_token", + } + repo_creds = RepoCredsModel( + repo_id=repo_id, + user_id=user_id, + creds=DecryptedString(plaintext=json.dumps(creds)), + ) + session.add(repo_creds) + await session.commit() + return repo_creds + + +async def create_file_archive( + session: AsyncSession, + user_id: UUID, + blob_hash: str = "blob_hash", + blob: Optional[bytes] = b"blob_content", +) -> FileArchiveModel: + archive = FileArchiveModel( + user_id=user_id, + blob_hash=blob_hash, + blob=blob, + ) + session.add(archive) + await session.commit() + return archive + + def get_run_spec( - run_name: str, repo_id: str, - profile: Optional[Profile] = None, + run_name: str = "test-run", + configuration_path: str = "dstack.yaml", + profile: Union[Profile, Callable[[], Profile], None] = lambda: Profile(name="default"), configuration: Optional[AnyRunConfiguration] = None, + ssh_key_pub: Optional[str] = "user_ssh_key", + repo_data: AnyRunRepoData = LocalRunRepoData(repo_dir="/"), + repo_code_hash: Optional[str] = None, ) -> 
RunSpec: - if profile is None: - profile = Profile(name="default") + if callable(profile): + profile = profile() return RunSpec( run_name=run_name, repo_id=repo_id, - repo_data=LocalRunRepoData(repo_dir="/"), - repo_code_hash=None, - working_dir=".", - configuration_path="dstack.yaml", + repo_data=repo_data, + repo_code_hash=repo_code_hash, + configuration_path=configuration_path, configuration=configuration or DevEnvironmentConfiguration(ide="vscode"), profile=profile, - ssh_key_pub="user_ssh_key", + ssh_key_pub=ssh_key_pub, ) @@ -184,12 +362,23 @@ async def create_run( project: ProjectModel, repo: RepoModel, user: UserModel, - run_name: str = "test-run", + fleet: Optional[FleetModel] = None, + gateway: Optional[GatewayModel] = None, + run_name: Optional[str] = None, status: RunStatus = RunStatus.SUBMITTED, + termination_reason: Optional[RunTerminationReason] = None, submitted_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), run_spec: Optional[RunSpec] = None, run_id: Optional[UUID] = None, + deleted: bool = False, + priority: int = 0, + deployment_num: int = 0, + resubmission_attempt: int = 0, + next_triggered_at: Optional[datetime] = None, + last_processed_at: Optional[datetime] = None, ) -> RunModel: + if run_name is None: + run_name = "test-run" if run_spec is None: run_spec = get_run_spec( run_name=run_name, @@ -197,16 +386,28 @@ async def create_run( ) if run_id is None: run_id = uuid.uuid4() + if last_processed_at is None: + last_processed_at = submitted_at run = RunModel( id=run_id, + deleted=deleted, project_id=project.id, repo_id=repo.id, user_id=user.id, + fleet_id=fleet.id if fleet else None, submitted_at=submitted_at, run_name=run_name, status=status, + termination_reason=termination_reason, run_spec=run_spec.json(), - last_processed_at=submitted_at, + last_processed_at=last_processed_at, + jobs=[], + priority=priority, + deployment_num=deployment_num, + desired_replica_count=1, + resubmission_attempt=resubmission_attempt, + 
next_triggered_at=next_triggered_at, + gateway=gateway, ) session.add(run) await session.commit() @@ -216,25 +417,39 @@ async def create_run( async def create_job( session: AsyncSession, run: RunModel, + fleet: Optional[FleetModel] = None, submission_num: int = 0, status: JobStatus = JobStatus.SUBMITTED, submitted_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), termination_reason: Optional[JobTerminationReason] = None, job_provisioning_data: Optional[JobProvisioningData] = None, + job_runtime_data: Optional[JobRuntimeData] = None, instance: Optional[InstanceModel] = None, job_num: int = 0, replica_num: int = 0, + deployment_num: Optional[int] = None, + instance_assigned: bool = False, + disconnected_at: Optional[datetime] = None, + registered: bool = False, + waiting_master_job: Optional[bool] = None, ) -> JobModel: + if deployment_num is None: + deployment_num = run.deployment_num run_spec = RunSpec.parse_raw(run.run_spec) - job_spec = (await get_job_specs_from_run_spec(run_spec, replica_num=replica_num))[0] + job_spec = ( + await get_job_specs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=replica_num) + )[0] + job_spec.job_num = job_num job = JobModel( project_id=run.project_id, + fleet=fleet, run_id=run.id, run_name=run.run_name, job_num=job_num, - job_name=run.run_name + f"-0-{replica_num}", + job_name=run.run_name + f"-{job_num}-{replica_num}", replica_num=replica_num, + deployment_num=deployment_num, submission_num=submission_num, submitted_at=submitted_at, last_processed_at=last_processed_at, @@ -242,32 +457,178 @@ async def create_job( termination_reason=termination_reason, job_spec_data=job_spec.json(), job_provisioning_data=job_provisioning_data.json() if job_provisioning_data else None, + job_runtime_data=job_runtime_data.json() if job_runtime_data else None, instance=instance, + instance_assigned=instance_assigned, used_instance_id=instance.id if 
instance is not None else None, + disconnected_at=disconnected_at, + probes=[], + registered=registered, + waiting_master_job=waiting_master_job, ) session.add(job) await session.commit() return job -def get_job_provisioning_data() -> JobProvisioningData: - return JobProvisioningData( - backend=BackendType.AWS, - instance_type=InstanceType( +def get_job_provisioning_data( + dockerized: bool = False, + backend: BackendType = BackendType.AWS, + region: str = "us-east-1", + availability_zone: Optional[str] = None, + gpu_count: int = 0, + gpu_memory_gib: float = 16, + gpu_name: str = "T4", + cpu_count: int = 1, + memory_gib: float = 0.5, + spot: bool = False, + hostname: str = "127.0.0.4", + internal_ip: Optional[str] = "127.0.0.4", + price: float = 10.5, + instance_type: Optional[InstanceType] = None, + username: str = "ubuntu", + ssh_port: int = 22, + ssh_proxy: Optional[SSHConnectionParams] = None, +) -> JobProvisioningData: + gpus = [ + Gpu( + name=gpu_name, + memory_mib=int(gpu_memory_gib * 1024), + vendor=gpuhunt.AcceleratorVendor.NVIDIA, + ) + ] * gpu_count + if instance_type is None: + instance_type = InstanceType( name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), + resources=Resources( + cpus=cpu_count, memory_mib=int(memory_gib * 1024), spot=spot, gpus=gpus + ), + ) + return JobProvisioningData( + backend=backend, + instance_type=instance_type, instance_id="instance_id", - hostname="127.0.0.4", - internal_ip="127.0.0.4", - region="us-east-1", - price=10.5, - username="ubuntu", - ssh_port=22, - dockerized=False, + hostname=hostname, + internal_ip=internal_ip, + region=region, + availability_zone=availability_zone, + price=price, + username=username, + ssh_port=ssh_port, + dockerized=dockerized, backend_data=None, - ssh_proxy=None, + ssh_proxy=ssh_proxy, + ) + + +def get_job_runtime_data( + network_mode: str = NetworkMode.HOST, + cpu: Optional[float] = None, + gpu: Optional[int] = None, + memory: Optional[float] = None, + 
ports: Optional[dict[int, int]] = None, + offer: Optional[InstanceOfferWithAvailability] = None, + volume_names: Optional[list[str]] = None, + working_dir: Optional[str] = None, + username: Optional[str] = None, +) -> JobRuntimeData: + return JobRuntimeData( + network_mode=NetworkMode(network_mode), + cpu=cpu, + gpu=gpu, + memory=Memory(memory) if memory is not None else None, + ports=ports, + offer=offer, + volume_names=volume_names, + working_dir=working_dir, + username=username, + ) + + +def get_compute_group_provisioning_data( + compute_group_id: str = "test_compute_group", + compute_group_name: str = "test_compute_group", + backend: BackendType = BackendType.RUNPOD, + region: str = "US", + job_provisioning_datas: Optional[list[JobProvisioningData]] = None, + backend_data: Optional[str] = None, +) -> ComputeGroupProvisioningData: + if job_provisioning_datas is None: + job_provisioning_datas = [] + return ComputeGroupProvisioningData( + compute_group_id=compute_group_id, + compute_group_name=compute_group_name, + backend=backend, + region=region, + job_provisioning_datas=job_provisioning_datas, + backend_data=backend_data, + ) + + +async def create_compute_group( + session: AsyncSession, + project: ProjectModel, + fleet: FleetModel, + status: ComputeGroupStatus = ComputeGroupStatus.RUNNING, + provisioning_data: Optional[ComputeGroupProvisioningData] = None, + last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), +): + if provisioning_data is None: + provisioning_data = get_compute_group_provisioning_data() + compute_group = ComputeGroupModel( + project=project, + fleet=fleet, + status=status, + provisioning_data=provisioning_data.json(), + last_processed_at=last_processed_at, + ) + session.add(compute_group) + await session.commit() + return compute_group + + +async def create_export( + session: AsyncSession, + exporter_project: ProjectModel, + importer_projects: list[ProjectModel], + exported_fleets: list[FleetModel], + 
exported_gateways: Optional[list[GatewayModel]] = None, + name: str = "test-export", + is_global: bool = False, +) -> ExportModel: + export = ExportModel( + name=name, + project=exporter_project, + is_global=is_global, + imports=[ImportModel(project=project) for project in importer_projects], + exported_fleets=[ExportedFleetModel(fleet=fleet) for fleet in exported_fleets], + exported_gateways=[ + ExportedGatewayModel(gateway=gateway) for gateway in (exported_gateways or []) + ], + ) + session.add(export) + await session.commit() + return export + + +async def create_probe( + session: AsyncSession, + job: JobModel, + probe_num: int = 0, + due: datetime = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc), + success_streak: int = 0, +) -> ProbeModel: + probe = ProbeModel( + name=f"{job.job_name}-{probe_num}", + job=job, + probe_num=probe_num, + due=due, + success_streak=success_streak, + active=True, ) + session.add(probe) + await session.commit() + return probe async def create_gateway( @@ -277,9 +638,9 @@ async def create_gateway( name: str = "test_gateway", region: str = "us", wildcard_domain: Optional[str] = None, - gateway_compute_id: Optional[UUID] = None, status: Optional[GatewayStatus] = GatewayStatus.SUBMITTED, last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + forbid_new_services: bool = False, ) -> GatewayModel: gateway = GatewayModel( project_id=project_id, @@ -287,9 +648,9 @@ async def create_gateway( name=name, region=region, wildcard_domain=wildcard_domain, - gateway_compute_id=gateway_compute_id, status=status, last_processed_at=last_processed_at, + forbid_new_services=forbid_new_services, ) session.add(gateway) await session.commit() @@ -298,6 +659,7 @@ async def create_gateway( async def create_gateway_compute( session: AsyncSession, + gateway_id: Optional[UUID] = None, backend_id: Optional[UUID] = None, ip_address: Optional[str] = "1.1.1.1", region: str = "us", @@ -306,6 +668,7 @@ async def create_gateway_compute( 
ssh_public_key: str = "", ) -> GatewayComputeModel: gateway_compute = GatewayComputeModel( + gateway_id=gateway_id, backend_id=backend_id, ip_address=ip_address, region=region, @@ -318,27 +681,123 @@ async def create_gateway_compute( return gateway_compute -async def create_pool( +def get_gateway_compute_configuration( + project_name: str = "test-project", + instance_name: str = "test-instance", + backend: BackendType = BackendType.AWS, + region: str = "us", + public_ip: bool = True, +) -> GatewayComputeConfiguration: + return GatewayComputeConfiguration( + project_name=project_name, + instance_name=instance_name, + backend=backend, + region=region, + public_ip=public_ip, + ssh_key_pub="", + certificate=None, + ) + + +async def create_fleet( session: AsyncSession, project: ProjectModel, - pool_name: Optional[str] = None, -) -> PoolModel: - pool_name = pool_name if pool_name is not None else DEFAULT_POOL_NAME - pool = PoolModel( - name=pool_name, + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + spec: Optional[FleetSpec] = None, + fleet_id: Optional[UUID] = None, + status: FleetStatus = FleetStatus.ACTIVE, + deleted: bool = False, + name: Optional[str] = None, + last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), +) -> FleetModel: + if fleet_id is None: + fleet_id = uuid.uuid4() + if spec is None: + spec = get_fleet_spec() + if name is not None: + spec.configuration.name = name + fm = FleetModel( + id=fleet_id, project=project, - project_id=project.id, + deleted=deleted, + name=spec.configuration.name, + status=status, + created_at=created_at, + spec=spec.json(), + instances=[], + runs=[], + last_processed_at=last_processed_at, ) - session.add(pool) + session.add(fm) await session.commit() - return pool + return fm + + +def get_fleet_spec( + conf: Optional[FleetConfiguration] = None, profile: Optional[Profile] = None +) -> FleetSpec: + if conf is None: + conf = get_fleet_configuration() + if profile is None: + 
profile = Profile() + return FleetSpec( + configuration=conf, + configuration_path="fleet.dstack.yml", + profile=profile, + ) + + +def get_fleet_configuration( + name: str = "test-fleet", + nodes: FleetNodesSpec = FleetNodesSpec(min=1, target=1, max=1), + placement: Optional[InstanceGroupPlacement] = None, + backends: Optional[list[BackendType]] = None, +) -> FleetConfiguration: + return FleetConfiguration( + name=name, + nodes=nodes, + placement=placement, + backends=backends, + ) + + +def get_ssh_fleet_configuration( + name: str = "test-fleet", + user: str = "ubuntu", + ssh_key: Optional[SSHKey] = None, + hosts: Optional[list[Union[SSHHostParams, str]]] = None, + network: Optional[str] = None, + placement: Optional[InstanceGroupPlacement] = None, + blocks: Optional[Union[int, Literal["auto"]]] = None, +) -> FleetConfiguration: + if ssh_key is None: + ssh_key = SSHKey(public="", private=get_private_key_string()) + if hosts is None: + hosts = ["10.0.0.100"] + ssh_config = SSHParams( + user=user, + ssh_key=ssh_key, + hosts=hosts, + network=network, + ) + optional_properties: dict[str, Any] = {} + if blocks is not None: + optional_properties["blocks"] = blocks + return FleetConfiguration( + name=name, + ssh_config=ssh_config, + placement=placement, + **optional_properties, + ) async def create_instance( session: AsyncSession, project: ProjectModel, - pool: PoolModel, + fleet: Optional[FleetModel] = None, status: InstanceStatus = InstanceStatus.IDLE, + unreachable: bool = False, + health_status: HealthStatus = HealthStatus.HEALTHY, created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), finished_at: Optional[datetime] = None, spot: bool = False, @@ -346,132 +805,327 @@ async def create_instance( requirements: Optional[Requirements] = None, instance_configuration: Optional[InstanceConfiguration] = None, instance_id: Optional[UUID] = None, + job: Optional[JobModel] = None, + instance_num: int = 0, + backend: BackendType = BackendType.VERDA, + 
termination_policy: Optional[TerminationPolicy] = None, + termination_idle_time: int = DEFAULT_FLEET_TERMINATION_IDLE_TIME, + region: Optional[str] = None, + availability_zone: Optional[str] = None, + remote_connection_info: Optional[RemoteConnectionInfo] = None, + offer: Optional[Union[InstanceOfferWithAvailability, Literal["auto"]]] = "auto", + job_provisioning_data: Optional[Union[JobProvisioningData, Literal["auto"]]] = "auto", + total_blocks: Optional[int] = 1, + busy_blocks: int = 0, + name: str = "test_instance", + volumes: Optional[List[VolumeModel]] = None, + price: float = 1.0, + last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + provisioning_job_id: Optional[UUID] = None, ) -> InstanceModel: if instance_id is None: instance_id = uuid.uuid4() - job_provisioning_data = { - "backend": "datacrunch", - "instance_type": { - "name": "instance", - "resources": { - "cpus": 1, - "memory_mib": 512, - "gpus": [], - "spot": spot, - "disk": {"size_mib": 102400}, - "description": "", - }, - }, - "instance_id": "running_instance.id", - "ssh_proxy": None, - "hostname": "running_instance.ip", - "region": "running_instance.location", - "price": 0.1, - "username": "root", - "ssh_port": 22, - "dockerized": True, - "backend_data": None, - } - offer = { - "backend": "datacrunch", - "instance": { - "name": "instance", - "resources": { - "cpus": 2, - "memory_mib": 12000, - "gpus": [], - "spot": spot, - "disk": {"size_mib": 102400}, - "description": "", - }, - }, - "region": "en", - "price": 1, - "availability": "available", - } - + if region is None: + region = "" if backend == BackendType.KUBERNETES else "eu-west" + if job_provisioning_data == "auto": + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=backend, + region=region, + availability_zone=availability_zone, + spot=spot, + hostname="running_instance.ip", + internal_ip=None, + ) + if offer == "auto": + offer = get_instance_offer_with_availability( + 
backend=backend, region=region, spot=spot, price=price + ) if profile is None: profile = Profile(name="test_name") if requirements is None: - requirements = Requirements(resources=ResourcesSpec(cpu=1)) + requirements = Requirements(resources=ResourcesSpec(cpu=CPUSpec.parse("1"))) if instance_configuration is None: - instance_configuration = InstanceConfiguration( - project_name="test_proj", - instance_name="test_instance_name", - instance_id="test instance id", - job_docker_config=None, - ssh_keys=[], - user="test_user", - ) + instance_configuration = get_instance_configuration() + + if volumes is None: + volumes = [] + volume_attachments = [] + for volume in volumes: + volume_attachments.append(VolumeAttachmentModel(volume=volume)) im = InstanceModel( id=instance_id, - name="test_instance", - pool=pool, + name=name, + instance_num=instance_num, + fleet=fleet, project=project, status=status, - unreachable=False, + last_processed_at=last_processed_at, + unreachable=unreachable, + health=health_status, created_at=created_at, started_at=created_at, finished_at=finished_at, - job_provisioning_data=json.dumps(job_provisioning_data), - offer=json.dumps(offer), - price=1, - region="eu-west", - backend=BackendType.DATACRUNCH, - termination_idle_time=DEFAULT_POOL_TERMINATION_IDLE_TIME, + job_provisioning_data=job_provisioning_data.json() if job_provisioning_data else None, + offer=offer.json() if offer else None, + price=price, + region=region, + backend=backend, + termination_policy=termination_policy, + termination_idle_time=termination_idle_time, profile=profile.json(), requirements=requirements.json(), instance_configuration=instance_configuration.json(), + remote_connection_info=remote_connection_info.json() if remote_connection_info else None, + volume_attachments=volume_attachments, + total_blocks=total_blocks, + busy_blocks=busy_blocks, + provisioning_job_id=provisioning_job_id, ) + if job: + im.jobs.append(job) session.add(im) await session.commit() return im +def 
get_instance_configuration( + project_name: str = "test-project", + instance_name: str = "test-instance", + user: str = "dstack-user", +) -> InstanceConfiguration: + return InstanceConfiguration( + project_name=project_name, + instance_name=instance_name, + user=user, + ssh_keys=[], + ) + + +def get_instance_offer_with_availability( + backend: BackendType = BackendType.AWS, + region: str = "eu-west", + gpu_count: int = 0, + gpu_name: str = "T4", + gpu_memory_gib: float = 16, + cpu_count: int = 2, + memory_gib: float = 12, + disk_gib: float = 100.0, + spot: bool = False, + blocks: int = 1, + total_blocks: int = 1, + availability_zones: Optional[List[str]] = None, + price: float = 1.0, + instance_type: str = "instance", + availability: InstanceAvailability = InstanceAvailability.AVAILABLE, +): + gpus = [ + Gpu( + name=gpu_name, + memory_mib=int(gpu_memory_gib * 1024), + vendor=gpuhunt.AcceleratorVendor.NVIDIA, + ) + ] * gpu_count + return InstanceOfferWithAvailability( + backend=backend, + instance=InstanceType( + name=instance_type, + resources=Resources( + cpus=cpu_count, + memory_mib=int(memory_gib * 1024), + gpus=gpus, + spot=spot, + disk=Disk(size_mib=int(disk_gib * 1024)), + description="", + ), + ), + region=region, + price=price, + availability=availability, + availability_zones=availability_zones, + blocks=blocks, + total_blocks=total_blocks, + ) + + +def get_remote_connection_info( + host: str = "10.0.0.10", + port: int = 22, + ssh_user: str = "ubuntu", + ssh_keys: Optional[list[SSHKey]] = None, + ssh_proxy: Optional[SSHConnectionParams] = None, + ssh_proxy_keys: Optional[list[SSHKey]] = None, + env: Optional[Union[Env, dict]] = None, +): + if ssh_keys is None: + ssh_keys = [get_ssh_key()] + if env is None: + env = Env() + elif isinstance(env, dict): + env = Env.parse_obj(env) + return RemoteConnectionInfo( + host=host, + port=port, + ssh_user=ssh_user, + ssh_keys=ssh_keys, + ssh_proxy=ssh_proxy, + ssh_proxy_keys=ssh_proxy_keys, + env=env, + ) + + +def 
get_ssh_key() -> SSHKey: + return SSHKey( + public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk", + private=""" + -----BEGIN OPENSSH PRIVATE KEY----- + b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW + QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu + VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA + AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ + CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ= + -----END OPENSSH PRIVATE KEY----- + """, + ) + + +async def create_instance_health_check( + session: AsyncSession, + instance: InstanceModel, + collected_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + status: HealthStatus = HealthStatus.HEALTHY, + response: str = "{}", +) -> InstanceHealthCheckModel: + health_check = InstanceHealthCheckModel( + instance_id=instance.id, + collected_at=collected_at, + status=status, + response=response, + ) + session.add(health_check) + await session.commit() + return health_check + + async def create_volume( session: AsyncSession, project: ProjectModel, + user: UserModel, status: VolumeStatus = VolumeStatus.SUBMITTED, created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), - configuration: Optional[VolumeConfiguration] = None, + last_processed_at: Optional[datetime] = None, + last_job_processed_at: Optional[datetime] = None, + configuration: Optional[AnyVolumeConfiguration] = None, volume_provisioning_data: Optional[VolumeProvisioningData] = None, deleted_at: Optional[datetime] = None, + backend: BackendType = BackendType.AWS, + region: str = "eu-west-1", ) -> VolumeModel: if configuration is None: - configuration = get_volume_configuration() + configuration = get_volume_configuration(backend=backend, region=region) + if last_processed_at is None: + last_processed_at = created_at vm = VolumeModel( project=project, + user_id=user.id, name=configuration.name, status=status, 
created_at=created_at, + last_processed_at=last_processed_at, + last_job_processed_at=last_job_processed_at, configuration=configuration.json(), volume_provisioning_data=volume_provisioning_data.json() if volume_provisioning_data else None, - instances=[], + attachments=[], deleted_at=deleted_at, + deleted=True if deleted_at else False, ) session.add(vm) await session.commit() return vm +def get_volume( + id_: Optional[UUID] = None, + name: str = "test_volume", + user: str = "test_user", + project_name: str = "test_project", + configuration: Optional[AnyVolumeConfiguration] = None, + external: bool = False, + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + last_processed_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + status: VolumeStatus = VolumeStatus.ACTIVE, + status_message: Optional[str] = None, + deleted: bool = False, + deleted_at: Optional[datetime] = None, + volume_id: Optional[str] = None, + provisioning_data: Optional[VolumeProvisioningData] = None, + attachments: Optional[List[VolumeAttachment]] = None, +) -> Volume: + if id_ is None: + id_ = uuid.uuid4() + if configuration is None: + configuration = get_volume_configuration() + if attachments is None: + attachments = [] + return Volume( + id=id_, + name=name, + user=user, + project_name=project_name, + configuration=configuration, + external=external, + created_at=created_at, + last_processed_at=last_processed_at, + status=status, + status_message=status_message, + deleted=deleted, + deleted_at=deleted_at, + volume_id=volume_id, + provisioning_data=provisioning_data, + attachments=attachments, + ) + + def get_volume_configuration( name: str = "test-volume", backend: BackendType = BackendType.AWS, region: str = "eu-west-1", size: Optional[Memory] = Memory(100), volume_id: Optional[str] = None, -) -> VolumeConfiguration: - return VolumeConfiguration( + auto_cleanup_duration: Optional[Union[str, int]] = None, +) -> AnyVolumeConfiguration: + assert backend != 
BackendType.KUBERNETES, "use get_kubernetes_volume_configuration() instead" + return VolumeConfiguration.parse_obj( + dict( + name=name, + backend=backend, + region=region, + size=size, + volume_id=volume_id, + auto_cleanup_duration=auto_cleanup_duration, + ) + ).__root__ + + +def get_kubernetes_volume_configuration( + name: str = "test-volume", + size: Optional[Memory] = Memory(100), + claim_name: Optional[str] = None, + auto_cleanup_duration: Optional[Union[str, int]] = None, + storage_class_name: Optional[str] = None, +) -> KubernetesVolumeConfiguration: + return KubernetesVolumeConfiguration( name=name, - backend=backend, - region=region, + backend=BackendType.KUBERNETES, size=size, - volume_id=volume_id, + claim_name=claim_name, + storage_class_name=storage_class_name, + auto_cleanup_duration=auto_cleanup_duration, ) @@ -479,19 +1133,233 @@ def get_volume_provisioning_data( volume_id: str = "vol-1234", size_gb: int = 100, availability_zone: Optional[str] = None, + price: Optional[float] = 1.0, backend_data: Optional[str] = None, + backend: Optional[BackendType] = None, ) -> VolumeProvisioningData: return VolumeProvisioningData( + backend=backend, volume_id=volume_id, size_gb=size_gb, availability_zone=availability_zone, + price=price, backend_data=backend_data, ) +async def create_placement_group( + session: AsyncSession, + project: ProjectModel, + fleet: FleetModel, + name: str = "test-pg", + created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + configuration: Optional[PlacementGroupConfiguration] = None, + provisioning_data: Optional[PlacementGroupProvisioningData] = None, + fleet_deleted: Optional[bool] = False, + deleted: Optional[bool] = False, + deleted_at: Optional[datetime] = None, +) -> PlacementGroupModel: + if configuration is None: + configuration = get_placement_group_configuration() + if provisioning_data is None: + provisioning_data = get_placement_group_provisioning_data() + pg = PlacementGroupModel( + project=project, + 
fleet=fleet, + name=name, + created_at=created_at, + configuration=configuration.json(), + provisioning_data=provisioning_data.json(), + fleet_deleted=fleet_deleted, + deleted=deleted, + deleted_at=deleted_at, + ) + session.add(pg) + await session.commit() + return pg + + +def get_placement_group_configuration( + backend: BackendType = BackendType.AWS, + region: str = "eu-central-1", + strategy: PlacementStrategy = PlacementStrategy.CLUSTER, +) -> PlacementGroupConfiguration: + return PlacementGroupConfiguration( + backend=backend, + region=region, + placement_strategy=strategy, + ) + + +def get_placement_group_provisioning_data( + backend: BackendType = BackendType.AWS, +) -> PlacementGroupProvisioningData: + return PlacementGroupProvisioningData(backend=backend) + + +async def create_job_metrics_point( + session: AsyncSession, + job_model: JobModel, + timestamp: datetime, + cpu_usage_micro: int = 1_000_000, + memory_usage_bytes: int = 1024, + memory_working_set_bytes: int = 1024, + gpus_memory_usage_bytes: Optional[List[int]] = None, + gpus_util_percent: Optional[List[int]] = None, +) -> JobMetricsPoint: + timestamp_micro = int(timestamp.timestamp() * 1_000_000) + if gpus_memory_usage_bytes is None: + gpus_memory_usage_bytes = [] + if gpus_util_percent is None: + gpus_util_percent = [] + jmp = JobMetricsPoint( + job_id=job_model.id, + timestamp_micro=timestamp_micro, + cpu_usage_micro=cpu_usage_micro, + memory_usage_bytes=memory_usage_bytes, + memory_working_set_bytes=memory_working_set_bytes, + gpus_memory_usage_bytes=json.dumps(gpus_memory_usage_bytes), + gpus_util_percent=json.dumps(gpus_util_percent), + ) + session.add(jmp) + await session.commit() + return jmp + + +async def create_job_prometheus_metrics( + session: AsyncSession, + job: JobModel, + collected_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + text: str = "# Prometheus metrics\n", +): + metrics = JobPrometheusMetrics( + job_id=job.id, + collected_at=collected_at, + text=text, + 
) + session.add(metrics) + await session.commit() + return metrics + + +async def create_secret( + session: AsyncSession, + project: ProjectModel, + name: str, + value: str, +): + secret_model = SecretModel( + project=project, + name=name, + value=DecryptedString(plaintext=value), + ) + session.add(secret_model) + await session.commit() + return secret_model + + +async def list_events(session: AsyncSession) -> list[EventModel]: + res = await session.execute( + select(EventModel) + .order_by(EventModel.recorded_at, EventModel.id) + .options(joinedload(EventModel.targets)) + ) + return list(res.scalars().unique().all()) + + +async def clear_events(session: AsyncSession) -> None: + await session.execute(delete(EventModel)) + + +def get_private_key_string() -> str: + return """ +-----BEGIN RSA PRIVATE KEY----- +MIIJJwIBAAKCAgEApZ8j9eU/C2/XvM7tG9tjhT85IHuJ2hQ61DYYDIPb8bY8/KWJ +WIVb90CBElVtmRnO7AvGsceKJ2I6YFsr37RVLAgo6Is0osvO+co+3bGiHxNwT7sX ++MatuiLtzvGZLQW8Os/xMy+aIIgzTZ0pDmEJIIlO2msd4jZO9R6UpPa1F4z0Oj0G +0So262qXHMGBs63CFqbLeQKecUK8e0RfUD1mxr8f4zJ33JpW0rjg0uZiAjLnYOYN +C4e4bWnIS7byGrcuRDXpYIrGXrxcrG16CKr7zrFNq+h4f5e7wDUICwPz5X8ke+JZ +0DIm5ooXWO07BLPNG9fbQHIR8SQgT4X+sfYasYUT9cFugwEiWSWyrRKoc4ZRmwiL +Rz5Tb5Rgn+OFXq1yYr+CnguTr4n6Ldv9RLMBye1r8S/h1Yi5DBZOyJDCTuw0tPhL +eUjS/pBLZ5oxSnUDQ1lirSOHDPpn6N9Mxtm9IN6WElv1W2pM55sCp33NuMbsC0C3 +8iCan3Z0giKxaNyeejzHEEkgeGq8UMGDaQglfDIOkKMI6zHeGQc0201lDsCXKGeN +6xeXdubtuZg1EPKdnNeZDZB636LZ+opi/6OLPNo7ml/zU24eymKMHF21+eO2TTVk +Eh0skTs4b9R0tHRhzAvZrDC6NR4CyJFCCE+lzkkLenSD1DLiEjExoLChGtECAwEA +AQKCAgB734gs7RZ3PmKUdAxBzpgj3AKlOeED/Cd3+zGHgsPpiE0bBdCxJaWAS31+ +Mej0Hqp2P+SPqVe6VyykTuyEt8MQWNYH/74RmPAoQc09UROZvJc++wdV6XucgW1u +X6MaWnTLZCXaC9tyQ4xjm41OlOMXs7sHgCBsxgPOL94rd95ATAuK14QWw0UqVKHL +Pyv8MJS/DmeXDY9l1O1WIPBM+m+5bM+zxVaC5+jSWLbG5ssdK+eEwOu22P7mzryh +bKattp5jJBN2QrVVu/pweL1SaFhH4rLeRdSCUgF6I+/tFTrBRpQKGGTmY+xWd6g4 +uc5vmO9qyMrS675hpoyIDgdOIW0abm8Jb1rnAbKVtBx4yTfLeD+Cx5a+o24JEIH7 
+4J6yutUabWvRNz0JT9bpiEQYZBKZROt1sSdjf+8xxgXQHIuAn1F/xjfqdBvxG0UE +2UkP3+UO6DEl7ciE4+eBaBoJp1DHkWOyXgAC/RvR9aNuPvOV5RfTw/DtL5eLTuZQ +1AUnKcjE0CAryCAkNdY42gRT1m/BvUrf96zKbcQS61YgHS9jtPsoaPh2AKiUAo96 +a4M+fRMmVPxlO8TcykTL4BRVihuz2Gx+DOB7M/UVGTtk1pHVJqjDFuX4M5gzrkjt ++px34flQaBPR7um/91aEicV3t4x4OGIDhcjd49wor8fLp3AxcQKCAQEA256rxqeG +oZxlaqXALr5uRlAVf2uf2DtVP7bWwoQpT8ULm4hQfKz+yvntm7jq7wuW5RYzGpFA +einBFbbsUs8VGtMYOmiD2IR7KYYsqd+4wEIvv5LhWAtIHMu+E3zir1aXj0yEZELM +Ou+zNxhhwewxgzPg3LmfjD0bnL/yvJavlEvxKcZy7kODHCW9j4B3/6Mm4KlcFC5p +DYmtlhBGPK6FpH1PDJrrzZKZApLoAT8D6h71ZH9p/9q6CmKduy3hGGVlDYQNhEI4 +40S7r9cMsI6Rz6hT/uj5EexUc0LYbPCDMlXhOMXRNnrAHwKW1myD4Mp8o4suaTYT +c0IY4imqP+/a9QKCAQEAwQ6XR0WnnyHjgWF5z2l/TG5Io4GBOwZBHzbOb+6afBGZ +ScnlVuGhyfusiYBXEdejwZGuE+jR5Oe+6yP+mEtDfxeXPQ91KhSbu5i0zu+Mkokb +LXjhL/MlPM6TKGG4XHZ+BHSV4aqQPp2EL2jViyd9/vbb/oNhTLDP4pNMX90G/bYq +VIa1GH5lCS0xg611HwgLGSTVnggrcUsytgpMprdV0N/NWla9dOeUaIBqt/m2RbuH +Csyqe/AjwwB9CLKYnGL5gus14guHWXBEPUR/GjcyODIq/0WIlOyANAzHWy2WXzgq +w4NGo6L7IoiIbL1EED3gljPlHd5JeXU9MtjWDvIO7QKCAQB52vk+mTdHNmrDGMKg +bQLsuoSjFYk0Rf+QAZf5h7EQVKmTG7hk5OvenXvsGlcoWYrZA09Jn2xiHAbJUJyh +ecsg/h2EUvdMzH011f+0JbDx5AdwSUQFQQU7DQUi9PkmBmrDlNYkdzewP811dW7Q +VYhHXyKV9dyDyGgougwp/YXgR57A6h5c+1Kk7H/YPpTWX6UzpGS1weaCH3EUQWVn +SAJY+TpCKTdK8ds6JV7bSiaW4aSQpW2gC7GMD5mrANLTYXcHX8zMJJ5B46Ir96tP +z1syGBi66HNCMZnN9jn1gCGbbTEw+fmSO9ubmSkuQjmOIWu0poYS1HFIU1VRL4MK +RMB9AoIBAFiNhcx2Yc23cLCO8p216WM4ju8Y3xsg4kwcCpMDIi9YrzROfHjepCSO +4XRsvwN7Iy0N0ohlWamit0sKRqS6mSo5uvCSH47+xvREtmLZNGSeqS2xbbFd2S3M +H2n9cOBQpbsLcxiA8QsXm2NXtePPaJbDyuMyhjX0QFbQc87hBmzn2wDMjVK/3z5X +UYfxz3A9c0HESIvleW/NK2Se0swB+kYF8h7G/L4b31IT3V+oFfhkbSwB9w1EeFLg +7XlI2oGZUJPBqgSWfy4CNfrYaWiv+sQWFuziiySsWp4FYogrH/drPwpRM9ypTIJp +mBIwuoCssVCUWzrZFGC26yxgk8dlNn0CggEAOfjn13/pSPzjlEOMA3IrUd/5cllI +hST6gzwXr4DmxnTzyKsLGPoMoE2r/whWReZTTSzFh+CbNBMOzQdlNo5k2WBt6mg8 +ey1hVYhkH6plOHJ8W4Abx+S/6C2s+QgUeEhFzeDAkYHNdJdQuPg/HWzk08RGmruA +kXYzp3q5IQqgKM4abf8oye3n6d3bl6Vc4MHTV+1Kxm6za6Of7wMcZ9uNEqxozw2H 
+mgsoXQqZBWaHGwLv8fkPuUmRp+JPaJW8Aag/3swpyTCZ21DneYcqy6S8MG2R8NjV +VOl2sg6hJrQQHfmKH7ru4U5PTZzhHIw1RAWdagjiBONB2MeHYIFWncxKGw== +-----END RSA PRIVATE KEY----- +""" + + +@contextmanager +def default_permissions_context(default_permissions: DefaultPermissions): + prev_default_permissions = get_default_permissions() + set_default_permissions(default_permissions) + try: + yield + finally: + set_default_permissions(prev_default_permissions) + + class AsyncContextManager: async def __aenter__(self): pass async def __aexit__(self, exc_type, exc, traceback): pass + + +class ComputeMockSpec( + Compute, + ComputeWithCreateInstanceSupport, + ComputeWithGroupProvisioningSupport, + ComputeWithPrivilegedSupport, + ComputeWithInstanceVolumesSupport, + ComputeWithMultinodeSupport, + ComputeWithReservationSupport, + ComputeWithPlacementGroupSupport, + ComputeWithGatewaySupport, + ComputeWithPrivateGatewaySupport, + ComputeWithVolumeSupport, +): + """ + Can be used to create Compute mocks that pass all `isinstance()` asserts. + """ + + pass diff --git a/src/dstack/_internal/server/testing/conf.py b/src/dstack/_internal/server/testing/conf.py new file mode 100644 index 0000000000..1847b1cd38 --- /dev/null +++ b/src/dstack/_internal/server/testing/conf.py @@ -0,0 +1,74 @@ +import pytest +import pytest_asyncio +from sqlalchemy import StaticPool +from sqlalchemy.ext.asyncio import create_async_engine +from testcontainers.postgres import PostgresContainer + +from dstack._internal.server import settings +from dstack._internal.server.db import Database, override_db +from dstack._internal.server.models import BaseModel + +# Remember initialized URLs to create metadata once per session. 
+_initialized_postgres_db_urls = set() + + +@pytest.fixture(scope="session") +def postgres_container(): + with PostgresContainer("postgres:16-alpine", driver="asyncpg") as postgres: + yield postgres.get_connection_url() + + +# test_db is function-scoped since making it session-scoped did not bring much benefit. +@pytest_asyncio.fixture +async def test_db(request): + db_type = getattr(request, "param", "sqlite") + engine = None + if db_type == "sqlite": + db_url = "sqlite+aiosqlite://" + # For SQLite, allow accessing the in-memory DB from multiple threads: + # https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#using-a-memory-database-in-multiple-threads + engine = create_async_engine( + db_url, + echo=settings.SQL_ECHO_ENABLED, + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + elif db_type == "postgres": + if not request.config.getoption("--runpostgres"): + pytest.skip("Skipping Postgres tests as --runpostgres was not provided") + db_url = request.getfixturevalue("postgres_container") + else: + raise ValueError(f"Unknown db_type {db_type}") + db = Database(db_url, engine=engine) + override_db(db) + if db_type == "sqlite": + async with db.engine.begin() as conn: + await conn.run_sync(BaseModel.metadata.create_all) + # Relying on function-scoped engine for a clean DB + else: + if db_url not in _initialized_postgres_db_urls: + async with db.engine.begin() as conn: + await conn.run_sync(BaseModel.metadata.create_all) + _initialized_postgres_db_urls.add(db_url) + await _truncate_postgres_db(db) + yield db + await db.engine.dispose() + + +@pytest_asyncio.fixture +async def session(test_db): + db = test_db + async with db.get_session() as session: + yield session + + +async def _truncate_postgres_db(db: Database): + preparer = db.engine.sync_engine.dialect.identifier_preparer + table_names = ", ".join( + preparer.format_table(table) for table in BaseModel.metadata.sorted_tables + ) + if not table_names: + return + 
truncate_statement = f"TRUNCATE {table_names} RESTART IDENTITY CASCADE" + async with db.engine.begin() as conn: + await conn.exec_driver_sql(truncate_statement) diff --git a/src/dstack/_internal/server/testing/matchers.py b/src/dstack/_internal/server/testing/matchers.py new file mode 100644 index 0000000000..0cf610807e --- /dev/null +++ b/src/dstack/_internal/server/testing/matchers.py @@ -0,0 +1,31 @@ +import re +import uuid + + +class SomeUUID4Str: + """ + A matcher that compares equal to any valid UUID4 string + """ + + # Simplified UUID regex: just checks the 8-4-4-4-12 hex structure + _uuid_regex = re.compile( + r"^[0-9a-f]{8}-" + r"[0-9a-f]{4}-" + r"[0-9a-f]{4}-" + r"[0-9a-f]{4}-" + r"[0-9a-f]{12}$" + ) + + def __eq__(self, other): + if isinstance(other, str): + if not self._uuid_regex.match(other): + return False + try: + return uuid.UUID(other).version == 4 + except ValueError: + return False + + return False + + def __repr__(self): + return "SomeUUID4Str()" diff --git a/src/dstack/_internal/server/utils/common.py b/src/dstack/_internal/server/utils/common.py index 444a978fda..7d8b4290e6 100644 --- a/src/dstack/_internal/server/utils/common.py +++ b/src/dstack/_internal/server/utils/common.py @@ -1,5 +1,4 @@ import asyncio -from functools import partial from typing import ( Awaitable, Callable, @@ -7,23 +6,11 @@ List, Optional, Sequence, - Set, Tuple, TypeVar, Union, ) -from typing_extensions import ParamSpec - -P = ParamSpec("P") -R = TypeVar("R") - - -async def run_async(func: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R: - func_with_args = partial(func, *args, **kwargs) - return await asyncio.get_running_loop().run_in_executor(None, func_with_args) - - ItemT = TypeVar("ItemT") ResultT = TypeVar("ResultT") @@ -55,55 +42,6 @@ async def gather_map_async( ] -KeyT = TypeVar("KeyT") - - -async def wait_unlock( - lock: asyncio.Lock, locked: Set[KeyT], keys: Iterable[KeyT], *, delay: float = 0.1 -): - """ - Wait until all keys are unlocked (not 
presented in the `locked` set). - Lock is released during the sleep. - """ - keys_set = set(keys) - while True: - async with lock: - if not keys_set.intersection(locked): - return - await asyncio.sleep(delay) - - -async def wait_to_lock(lock: asyncio.Lock, locked: Set[KeyT], key: KeyT, *, delay: float = 0.1): - """ - Retry locking until the key is locked. - Lock is released during the sleep. - """ - while True: - async with lock: - if key not in locked: - locked.add(key) - return - await asyncio.sleep(delay) - - -async def wait_to_lock_many( - lock: asyncio.Lock, locked: Set[KeyT], keys: List[KeyT], *, delay: float = 0.1 -): - """ - Retry locking until all the keys are locked. - Lock is released during the sleep. - The keys must be sorted to prevent deadlock. - """ - left_to_lock = keys.copy() - while len(left_to_lock) > 0: - async with lock: - for key in left_to_lock: - if key not in locked: - locked.add(key) - left_to_lock.remove(key) - await asyncio.sleep(delay) - - def join_byte_stream_checked(stream: Iterable[bytes], max_size: int) -> Optional[bytes]: """ Join an iterable of `bytes` values into one `bytes` value, diff --git a/src/dstack/_internal/server/utils/logging.py b/src/dstack/_internal/server/utils/logging.py index 6c50fa0d4c..03d7d05cb4 100644 --- a/src/dstack/_internal/server/utils/logging.py +++ b/src/dstack/_internal/server/utils/logging.py @@ -2,7 +2,7 @@ import logging import sys -from pythonjsonlogger import jsonlogger +from pythonjsonlogger.json import JsonFormatter from dstack._internal.cli.utils.common import console from dstack._internal.cli.utils.rich import DstackRichHandler @@ -11,7 +11,7 @@ class AsyncioCancelledErrorFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: - if record.exc_info is None: + if not record.exc_info: return True if isinstance(record.exc_info[1], asyncio.CancelledError): return False @@ -25,21 +25,21 @@ def configure_logging(): fmt="%(levelname)s %(asctime)s.%(msecs)03d %(name)s %(message)s", 
datefmt="%Y-%m-%dT%H:%M:%S", ), - "json": jsonlogger.JsonFormatter( + "json": JsonFormatter( "%(asctime)s %(name)s %(levelname)s %(message)s", json_ensure_ascii=False, rename_fields={"name": "logger", "asctime": "timestamp", "levelname": "level"}, ), } - handlers = { + handlers: dict[str, logging.Handler] = { "rich": DstackRichHandler(console=console), "standard": logging.StreamHandler(stream=sys.stdout), "json": logging.StreamHandler(stream=sys.stdout), } if settings.LOG_FORMAT not in formatters: raise ValueError(f"Invalid settings.LOG_FORMAT: {settings.LOG_FORMAT}") - formatter = formatters.get(settings.LOG_FORMAT) - handler = handlers.get(settings.LOG_FORMAT) + formatter = formatters[settings.LOG_FORMAT] + handler = handlers[settings.LOG_FORMAT] handler.setFormatter(formatter) handler.addFilter(AsyncioCancelledErrorFilter()) root_logger = logging.getLogger(None) @@ -47,3 +47,5 @@ def configure_logging(): root_logger.setLevel(settings.ROOT_LOG_LEVEL) dstack_logger = logging.getLogger("dstack") dstack_logger.setLevel(settings.LOG_LEVEL) + # paramiko emits error logs in cases handled by dstack + logging.getLogger("paramiko").setLevel(logging.CRITICAL) diff --git a/src/dstack/_internal/server/utils/routers.py b/src/dstack/_internal/server/utils/routers.py index 185d2294ed..5aff751868 100644 --- a/src/dstack/_internal/server/utils/routers.py +++ b/src/dstack/_internal/server/utils/routers.py @@ -1,10 +1,79 @@ -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional -from fastapi import HTTPException, Request, status -from fastapi.responses import JSONResponse -from packaging import version +import orjson +import packaging.version +from fastapi import HTTPException, Request, Response, status -from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.errors import ServerClientError, ServerClientErrorCode +from dstack._internal.core.models.common import CoreModel +from dstack._internal.utils.json_utils import 
get_orjson_default_options, orjson_default +from dstack._internal.utils.version import parse_version + + +class CustomORJSONResponse(Response): + """ + Custom JSONResponse that uses orjson for serialization. + + It's recommended to return this class from routers directly instead of + returning pydantic models to avoid the FastAPI's jsonable_encoder overhead. + See https://fastapi.tiangolo.com/advanced/custom-response/#use-orjsonresponse. + + Beware that FastAPI skips model validation when responses are returned directly. + If serialization needs to be modified, override `dict()` instead of adding validators. + """ + + media_type = "application/json" + + def render(self, content: Any) -> bytes: + return orjson.dumps( + content, + option=get_orjson_default_options(), + default=orjson_default, + ) + + +class BadRequestDetailsModel(CoreModel): + code: Optional[ServerClientErrorCode] = ServerClientErrorCode.UNSPECIFIED_ERROR + msg: str + + +class BadRequestErrorModel(CoreModel): + detail: BadRequestDetailsModel + + +class AccessDeniedDetailsModel(CoreModel): + code: Optional[str] = None + msg: str = "Access denied" + + +class AccessDeniedErrorModel(CoreModel): + detail: AccessDeniedDetailsModel + + +def get_base_api_additional_responses() -> Dict: + """ + Returns additional responses for the OpenAPI docs relevant to all API endpoints. + The endpoints may override responses to make them as specific as possible. + E.g. an endpoint may specify which error codes it may return in `code`. 
+ """ + return { + 400: get_bad_request_additional_response(), + 403: get_access_denied_additional_response(), + } + + +def get_bad_request_additional_response() -> Dict: + return { + "description": "Bad request", + "model": BadRequestErrorModel, + } + + +def get_access_denied_additional_response() -> Dict: + return { + "description": "Access denied", + "model": AccessDeniedErrorModel, + } def error_detail(msg: str, code: Optional[str] = None, **kwargs) -> Dict: @@ -48,65 +117,50 @@ def get_server_client_error_details(error: ServerClientError) -> List[Dict]: return details -def request_size_exceeded(request: Request, limit: int) -> bool: +def get_request_size(request: Request) -> int: if "content-length" not in request.headers: - return True - content_length = int(request.headers["content-length"]) - if content_length > limit: - return True - return False + return 0 + return int(request.headers["content-length"]) + + +def get_client_version(request: Request) -> Optional[packaging.version.Version]: + """ + FastAPI dependency that returns the dstack client version or None if the version is latest/dev. + """ + + version = request.headers.get("x-api-version") + if version is None: + return None + try: + return parse_version(version) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=[error_detail(str(e))], + ) def check_client_server_compatibility( - client_version: Optional[str], + client_version: Optional[packaging.version.Version], server_version: Optional[str], -) -> Optional[JSONResponse]: +) -> None: """ - Returns `JSONResponse` with error if client/server versions are incompatible. - Returns `None` otherwise. + Raise HTTP exception if the client is incompatible with the server. """ if client_version is None or server_version is None: return None - parsed_server_version = version.parse(server_version) - # latest allows client to bypass compatibility check (e.g. 
frontend) - if client_version == "latest": + parsed_server_version = parse_version(server_version) + if parsed_server_version is None: return None - try: - parsed_client_version = version.parse(client_version) - except version.InvalidVersion: - return JSONResponse( + # We preserve full client backward compatibility across patch releases. + # Server is always partially backward-compatible (so no check). + if client_version > parsed_server_version and ( + client_version.major > parsed_server_version.major + or client_version.minor > parsed_server_version.minor + ): + msg = f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version})." + raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - content={ - "detail": get_server_client_error_details( - ServerClientError("Bad API version specified") - ) - }, + detail=get_server_client_error_details(ServerClientError(msg=msg)), ) - # We preserve backward-compatibility across micro releases, - # but do not preserve forward-compatibility. - if parsed_client_version < parsed_server_version and ( - parsed_client_version.major < parsed_server_version.major - or parsed_client_version.minor < parsed_server_version.minor - ): - return error_incompatible_versions(client_version, server_version, ask_cli_update=True) - elif parsed_client_version > parsed_server_version and ( - parsed_client_version.major > parsed_server_version.major - or parsed_client_version.minor > parsed_server_version.minor - or parsed_client_version.micro > parsed_server_version.micro - ): - return error_incompatible_versions(client_version, server_version, ask_cli_update=False) return None - - -def error_incompatible_versions( - client_version: Optional[str], - server_version: str, - ask_cli_update: bool, -) -> JSONResponse: - msg = f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version})." 
- if ask_cli_update: - msg += f" Update the dstack CLI: `pip install dstack=={server_version}`." - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={"detail": get_server_client_error_details(ServerClientError(msg=msg))}, - ) diff --git a/src/dstack/_internal/server/utils/sentry_utils.py b/src/dstack/_internal/server/utils/sentry_utils.py new file mode 100644 index 0000000000..7eec46e4a2 --- /dev/null +++ b/src/dstack/_internal/server/utils/sentry_utils.py @@ -0,0 +1,57 @@ +import asyncio +import functools +from typing import Optional + +import sentry_sdk +from sentry_sdk.types import Event, Hint, SamplingContext + +from dstack._internal.server import settings + +SCHEDULED_TASKS_PREFIX = "scheduled_tasks" +PIPELINE_TASKS_PREFIX = "pipeline_tasks" + + +def instrument_scheduled_task(f): + return instrument_named_task(f"{SCHEDULED_TASKS_PREFIX}.{f.__name__}")(f) + + +def instrument_pipeline_task(name: str): + return instrument_named_task(f"{PIPELINE_TASKS_PREFIX}.{name}") + + +def instrument_named_task(name: str): + def decorator(f): + @functools.wraps(f) + async def wrapper(*args, **kwargs): + with sentry_sdk.isolation_scope(): + with sentry_sdk.start_transaction(name=name): + return await f(*args, **kwargs) + + return wrapper + + return decorator + + +def sentry_traces_sampler(sampling_context: SamplingContext) -> float: + parent_sampling_decision = sampling_context["parent_sampled"] + if parent_sampling_decision is not None: + return float(parent_sampling_decision) + transaction_context = sampling_context["transaction_context"] + name = transaction_context.get("name") + if name is not None: + if _is_background_transaction(name): + return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE + return settings.SENTRY_TRACES_SAMPLE_RATE + + +class AsyncioCancelledErrorFilterEventProcessor: + # See https://docs.sentry.io/platforms/python/configuration/filtering/#filtering-error-events + def __call__(self, event: Event, hint: 
Hint) -> Optional[Event]: + exc_info = hint.get("exc_info") + if exc_info and isinstance(exc_info[1], asyncio.CancelledError): + return None + return event + + +def _is_background_transaction(name: str) -> bool: + return name.startswith(SCHEDULED_TASKS_PREFIX) or name.startswith(PIPELINE_TASKS_PREFIX) diff --git a/src/dstack/_internal/server/utils/settings.py b/src/dstack/_internal/server/utils/settings.py new file mode 100644 index 0000000000..a80d63ed9e --- /dev/null +++ b/src/dstack/_internal/server/utils/settings.py @@ -0,0 +1,19 @@ +from typing import Optional +from urllib.parse import urlsplit + + +def parse_hostname_port(address: str) -> tuple[str, Optional[int]]: + err_msg = "must be valid HOSTNAME[:PORT]" + if "//" in address: + raise ValueError(err_msg) + res = urlsplit(f"//{address}") + if any((res.path, res.query, res.fragment, res.username, res.password)): + raise ValueError(err_msg) + hostname = res.hostname + if not hostname: + raise ValueError(err_msg) + try: + port = res.port + except ValueError as e: + raise ValueError(err_msg) from e + return hostname, port diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 7b57c3f394..afb8bb5f0b 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -1,15 +1,52 @@ import os from dstack import version +from dstack._internal.utils.env import environ +from dstack._internal.utils.version import parse_version DSTACK_VERSION = os.getenv("DSTACK_VERSION", version.__version__) +if parse_version(DSTACK_VERSION) is None: + # The build backend (hatching) requires not None for versions, + # but the code currently treats None as dev version. + # TODO: update the code to treat 0.0.0 as dev version. 
+ DSTACK_VERSION = None DSTACK_RELEASE = os.getenv("DSTACK_RELEASE") is not None or version.__is_release__ +DSTACK_RUNNER_VERSION = os.getenv("DSTACK_RUNNER_VERSION") +DSTACK_RUNNER_VERSION_URL = os.getenv("DSTACK_RUNNER_VERSION_URL") +DSTACK_RUNNER_DOWNLOAD_URL = os.getenv("DSTACK_RUNNER_DOWNLOAD_URL") +DSTACK_SHIM_VERSION = os.getenv("DSTACK_SHIM_VERSION") +DSTACK_SHIM_VERSION_URL = os.getenv("DSTACK_SHIM_VERSION_URL") +DSTACK_SHIM_DOWNLOAD_URL = os.getenv("DSTACK_SHIM_DOWNLOAD_URL") DSTACK_USE_LATEST_FROM_BRANCH = os.getenv("DSTACK_USE_LATEST_FROM_BRANCH") is not None +DSTACK_DOCKER_BASE_IMAGE = os.getenv("DSTACK_DOCKER_BASE_IMAGE", "dstackai/base") +DSTACK_DOCKER_BASE_IMAGE_VERSION = os.getenv( + "DSTACK_DOCKER_BASE_IMAGE_VERSION", version.docker_base_image +) +DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION = os.getenv( + "DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION", version.docker_base_image_ubuntu_version +) +DSTACK_VM_BASE_IMAGE_VERSION = os.getenv("DSTACK_VM_BASE_IMAGE_VERSION", version.vm_base_image) +DSTACK_VM_BASE_IMAGE_PREFIX = os.getenv("DSTACK_VM_BASE_IMAGE_PREFIX", "") # e.g. stgn-123- +DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind") + +CLI_LOG_LEVEL = os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper() +CLI_FILE_LOG_LEVEL = os.getenv("DSTACK_CLI_FILE_LOG_LEVEL", "DEBUG").upper() +# Can be used to disable control characters (e.g. for testing). +CLI_RICH_FORCE_TERMINAL = environ.get_bool("DSTACK_CLI_RICH_FORCE_TERMINAL") + + class FeatureFlags: """ dstack feature flags. Feature flags are temporary and can be used when developing large features. This class may be empty if there are no such features in development. 
Feature flags are environment variables of the form DSTACK_FF_* """ + + CLI_PRINT_JOB_CONNECTION_INFO = ( + os.getenv("DSTACK_FF_CLI_PRINT_JOB_CONNECTION_INFO") is not None + ) + """If DSTACK_FF_CLI_PRINT_JOB_CONNECTION_INFO enabled, `dstack apply` command prints server-provided + IDE URL(s) and SSH command(s) before job logs (for dev-environments only). + """ diff --git a/src/dstack/_internal/utils/combine.py b/src/dstack/_internal/utils/combine.py new file mode 100644 index 0000000000..cff1e3d5fd --- /dev/null +++ b/src/dstack/_internal/utils/combine.py @@ -0,0 +1,34 @@ +from typing import Callable, Optional, TypeVar + +from dstack._internal.utils.typing import SupportsRichComparison + +_T = TypeVar("_T") +_CompT = TypeVar("_CompT", bound=SupportsRichComparison) + + +class CombineError(ValueError): + pass + + +def combine_optional( + value1: Optional[_T], value2: Optional[_T], combiner: Callable[[_T, _T], _T] +) -> Optional[_T]: + if value1 is None: + return value2 + if value2 is None: + return value1 + return combiner(value1, value2) + + +def get_max_optional(value1: Optional[_CompT], value2: Optional[_CompT]) -> Optional[_CompT]: + return combine_optional(value1, value2, max) + + +def _get_single_value(value1: _T, value2: _T) -> _T: + if value1 == value2: + return value1 + raise CombineError(f"Values {value1!r} and {value2!r} cannot be combined") + + +def get_single_value_optional(value1: Optional[_T], value2: Optional[_T]) -> Optional[_T]: + return combine_optional(value1, value2, _get_single_value) diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py index c37f0df325..2ee1f337ad 100644 --- a/src/dstack/_internal/utils/common.py +++ b/src/dstack/_internal/utils/common.py @@ -1,8 +1,54 @@ +import asyncio +import enum +import itertools import re import time +from collections.abc import Callable +from dataclasses import dataclass from datetime import datetime, timedelta, timezone +from functools import partial from pathlib 
import Path -from typing import Any, Iterable, List, Optional, TypeVar, Union +from typing import Any, Final, Iterable, List, Optional, TypeVar, Union +from urllib.parse import urlparse +from uuid import UUID + +from typing_extensions import ParamSpec + +from dstack._internal.core.models.common import Duration +from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator + + +class Unset: + pass + + +UNSET: Final = Unset() +""" +Use `UNSET` as kwargs default value to distinguish between +specified and non-specified `Optional` values. +""" + + +@dataclass +class EntityName: + name: str + + +@dataclass +class EntityID: + id: UUID + + +EntityNameOrID = Union[EntityName, EntityID] + + +P = ParamSpec("P") +R = TypeVar("R") + + +async def run_async(func: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R: + func_with_args = partial(func, *args, **kwargs) + return await asyncio.get_running_loop().run_in_executor(None, func_with_args) def get_dstack_dir() -> Path: @@ -17,14 +63,18 @@ def get_milliseconds_since_epoch() -> int: return int(round(time.time() * 1000)) -def pretty_date(time: Union[datetime, int] = False) -> str: +DateFormatter = Callable[[datetime], str] + + +def local_time(time: datetime) -> str: + """Return HH:MM in local timezone""" + return time.astimezone().strftime("%H:%M") + + +def pretty_date(time: datetime) -> str: """ - Get a datetime object or an epoch timestamp and return a - pretty string like 'an hour ago', 'Yesterday', '3 months ago', - 'just now', etc + Return a pretty string like 'an hour ago', 'Yesterday', '3 months ago', 'just now', etc """ - if isinstance(time, int): - time = datetime.fromtimestamp(time, tz=timezone.utc) now = get_current_datetime() diff = now - time if diff.days < 0: @@ -48,112 +98,133 @@ def pretty_date(time: Union[datetime, int] = False) -> str: if diff.days < 7: return str(diff.days) + " days ago" if diff.days < 31: - return str(round(diff.days / 7)) + " weeks ago" + weeks = 
round(diff.days / 7) + if weeks == 1: + return str(weeks) + " week ago" + return str(weeks) + " weeks ago" if diff.days < 365: - return str(round(diff.days / 30)) + " months ago" + months = round(diff.days / 30) + if months == 1: + return str(months) + " month ago" + return str(months) + " months ago" years = round(diff.days / 365) if years == 1: return str(years) + " year ago" return str(years) + " years ago" +def format_mib_as_gb(mib: int) -> str: + """Format a MiB value as a human-readable GB string, e.g. 512 → '0.5GB', 8192 → '8GB'.""" + return f"{round(mib / 1024, 1):g}GB" + + def pretty_resources( + *, + cpu_arch: Optional[Any] = None, cpus: Optional[Any] = None, memory: Optional[Any] = None, gpu_count: Optional[Any] = None, + gpu_vendor: Optional[Any] = None, gpu_name: Optional[Any] = None, gpu_memory: Optional[Any] = None, total_gpu_memory: Optional[Any] = None, compute_capability: Optional[Any] = None, disk_size: Optional[Any] = None, ) -> str: - """ - >>> pretty_resources(cpus=4, memory="16GB") - '4xCPU, 16GB' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1) - '4xCPU, 16GB, 1xGPU' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100') - '4xCPU, 16GB, 1xA100' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100', gpu_memory="40GB") - '4xCPU, 16GB, 1xA100 (40GB)' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB") - '4xCPU, 16GB, 1xGPU (total 80GB)' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=2, gpu_name='A100', gpu_memory="40GB", total_gpu_memory="80GB") - '4xCPU, 16GB, 2xA100 (40GB, total 80GB)' - >>> pretty_resources(gpu_count=1, compute_capability="8.0") - '1xGPU (8.0)' - """ + """Format resource requirements as a human-readable string.""" parts = [] if cpus is not None: - parts.append(f"{cpus}xCPU") + cpu_arch_lower: Optional[str] = None + if isinstance(cpu_arch, enum.Enum): + cpu_arch_lower = str(cpu_arch.value).lower() + elif isinstance(cpu_arch, str): + 
cpu_arch_lower = cpu_arch.lower() + if cpu_arch_lower == "arm": + cpu_arch_prefix = "arm:" + else: + cpu_arch_prefix = "" + parts.append(f"cpu={cpu_arch_prefix}{cpus}") if memory is not None: - parts.append(f"{memory}") + parts.append(f"mem={memory}") + if disk_size: + parts.append(f"disk={disk_size}") if gpu_count: gpu_parts = [] if gpu_memory is not None: gpu_parts.append(f"{gpu_memory}") + if gpu_count is not None: + gpu_parts.append(f"{gpu_count}") if total_gpu_memory is not None: - gpu_parts.append(f"total {total_gpu_memory}") + gpu_parts.append(f"{total_gpu_memory}") if compute_capability is not None: gpu_parts.append(f"{compute_capability}") - gpu = f"{gpu_count}x{gpu_name or 'GPU'}" - if gpu_parts: - gpu += f" ({', '.join(gpu_parts)})" - parts.append(gpu) - if disk_size: - parts.append(f"{disk_size} (disk)") - return ", ".join(parts) + if gpu_name: + parts.append("gpu=" + ":".join([f"{gpu_name}"] + gpu_parts)) + elif gpu_vendor: + vendor_str = gpu_vendor.value if isinstance(gpu_vendor, enum.Enum) else str(gpu_vendor) + parts.append("gpu=" + ":".join([vendor_str] + gpu_parts)) + else: + parts.append("gpu=" + ":".join(gpu_parts)) + return " ".join(parts) -def since(timestamp: str) -> datetime: +def parse_since(value: str) -> datetime: + """ + Returns a timestamp given an RFC 3339 string (e.g. 2023-09-24T15:30:00Z) + or a duration (e.g. 10s, 5m, 1d) between the timestamp and now. + """ try: - seconds = parse_pretty_duration(timestamp) + seconds = Duration.parse(value) return get_current_datetime() - timedelta(seconds=seconds) except ValueError: pass try: - return datetime.fromisoformat(timestamp) + res = datetime.fromisoformat(value) except ValueError: - pass - try: - return datetime.fromtimestamp(int(timestamp)) - except Exception: raise ValueError("Invalid datetime format") + else: + return check_time_offset_aware(res) + +def check_time_offset_aware(time: datetime) -> datetime: + if time.tzinfo is None: + raise ValueError("Timestamp is not offset-aware. 
Specify timezone.") + return time -def parse_pretty_duration(duration: str) -> int: - regex = re.compile(r"(?P\d+)(?Ps|m|h|d|w)$") - re_match = regex.match(duration) - if not re_match: - raise ValueError(f"Cannot parse the duration {duration}") - amount, unit = int(re_match.group("amount")), re_match.group("unit") - multiplier = { - "s": 1, - "m": 60, - "h": 3600, - "d": 24 * 3600, - "w": 7 * 24 * 3600, - }[unit] - return amount * multiplier + +DURATION_UNITS_DESC = [ + ("w", 7 * 24 * 3600), + ("d", 24 * 3600), + ("h", 3600), + ("m", 60), + ("s", 1), +] def format_pretty_duration(seconds: int) -> str: + if seconds == 0: + return "0s" if seconds < 0: raise ValueError("Seconds cannot be negative") - units = [ - ("w", 7 * 24 * 3600), - ("d", 24 * 3600), - ("h", 3600), - ("m", 60), - ("s", 1), - ] - for unit, multiplier in units: + for unit, multiplier in DURATION_UNITS_DESC: if seconds % multiplier == 0: return f"{seconds // multiplier}{unit}" return f"{seconds}s" # Fallback to seconds if no larger unit fits perfectly +def format_duration_multiunit(seconds: int) -> str: + """90 -> 1m 30s, 4545 -> 1h 15m 45s, etc""" + if seconds < 0: + raise ValueError("Seconds cannot be negative") + result = "" + for unit, multiplier in DURATION_UNITS_DESC: + if unit_value := seconds // multiplier: + result += f" {unit_value}{unit}" + seconds -= unit_value * multiplier + return result.lstrip() or "0s" + + def sizeof_fmt(num, suffix="B"): for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: if abs(num) < 1024.0: @@ -168,28 +239,22 @@ def remove_prefix(text: str, prefix: str) -> str: return text -T = TypeVar("T") - - -def split_chunks(iterable: Iterable[T], chunk_size: int) -> Iterable[List[T]]: +def has_duplicates(iterable: Iterable[Any]) -> bool: """ - Splits an iterable into chunks of at most `chunk_size` items. + Checks if there are any duplicate items in the given iterable. 
- >>> list(split_chunks([1, 2, 3, 4, 5], 2)) - [[1, 2], [3, 4], [5]] + O(n^2) implementation, but works with iterables with unhashable items. + For iterables with hashable items, prefer len(set(iterable)) != len(iterable). """ + seen = [] + for item in iterable: + if item in seen: + return True + seen.append(item) + return False - if chunk_size < 1: - raise ValueError(f"chunk_size should be a positive integer, not {chunk_size}") - chunk = [] - for item in iterable: - chunk.append(item) - if len(chunk) == chunk_size: - yield chunk - chunk = [] - if chunk: - yield chunk +T = TypeVar("T") MEMORY_UNITS = { @@ -218,3 +283,71 @@ def parse_memory(memory: str, as_untis: str = "M") -> float: value_in_bytes = value * MEMORY_UNITS[units.upper()] result = value_in_bytes / MEMORY_UNITS[as_untis.upper()] return result + + +def get_or_error(v: Optional[T]) -> T: + """ + Unpacks an optional value. Used to denote that None is not possible in the current context. + """ + if v is None: + raise ValueError("Optional value is None") + return v + + +# TODO: drop after dropping Python 3.11 +def batched(seq: Iterable[T], n: int) -> Iterable[List[T]]: + """ + Roughly equivalent to itertools.batched from Python 3.12. + + >>> list(batched([1, 2, 3, 4, 5], 2)) + [[1, 2], [3, 4], [5]] + """ + + if n < 1: + raise ValueError(f"n should be a positive integer, not {n}") + it = iter(seq) + return iter(lambda: list(itertools.islice(it, n)), []) + + +StrT = TypeVar("StrT", str, bytes) + + +def concat_url_path(a: StrT, b: StrT) -> StrT: + if not b: + return a + sep = "/" if isinstance(a, str) else b"/" + return a.removesuffix(sep) + sep + b.removeprefix(sep) + + +def make_proxy_url(server_url: str, proxy_url: str) -> str: + """ + Constructs a URL to dstack-proxy services or endpoints. 
+ `proxy_url` can be a full URL (gateway), in which case it is returned as is, + or a path (in-server proxy), in which case it is concatenated with `server_url` + """ + proxy = urlparse(proxy_url) + if proxy.scheme and proxy.netloc: + return proxy_url + server = urlparse(server_url) + proxy = proxy._replace( + scheme=server.scheme or "http", + netloc=server.netloc, + path=concat_url_path(server.path, proxy.path), + ) + return proxy.geturl() + + +def interpolate_gateway_domain( + domain: str, run_project_name: str, exception_type: Optional[type[Exception]] +) -> str: + interpolator = VariablesInterpolator({"run": {"project_name": run_project_name}}) + try: + return interpolator.interpolate_or_error(domain) + except InterpolatorError as e: + if exception_type is None: + return domain + raise exception_type(f"Cannot interpolate gateway domain name: {e.args[0]}") from e + + +def list_enum_values_for_annotation(enum_class: type[enum.Enum]) -> str: + return ", ".join(f"`{e.value}`" for e in enum_class) diff --git a/src/dstack/_internal/utils/cron.py b/src/dstack/_internal/utils/cron.py new file mode 100644 index 0000000000..acf05f3389 --- /dev/null +++ b/src/dstack/_internal/utils/cron.py @@ -0,0 +1,5 @@ +from apscheduler.triggers.cron import CronTrigger + + +def validate_cron(cron_expr: str): + CronTrigger.from_crontab(cron_expr) diff --git a/src/dstack/_internal/utils/docker.py b/src/dstack/_internal/utils/docker.py new file mode 100644 index 0000000000..780f37e86d --- /dev/null +++ b/src/dstack/_internal/utils/docker.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass +from typing import Optional + +# https://github.com/distribution/reference/blob/0965666a6ade2e06035fe352e38344be1e68951a/normalize.go#L11-L31 +DEFAULT_REGISTRY = "docker.io" +LEGACY_DEFAULT_REGISTRY = "index.docker.io" + + +@dataclass(kw_only=True) +class DockerImage: + image: str + registry: Optional[str] = None + repo: str + tag: str + digest: Optional[str] = None + + 
+def parse_image_name(image: str) -> DockerImage: + digest = None + if "@" in image.split("/")[-1]: + image, digest = image.rsplit("@", maxsplit=1) + + tag = "latest" + if ":" in image.split("/")[-1]: # avoid detecting port as a tag + image, tag = image.rsplit(":", maxsplit=1) + + registry = None + components = image.split("/") + if len(components) == 1: # default registry, official image + repo = "library/" + components[0] + elif not _is_host(components[0]): # default registry, custom image + repo = "/".join(components) + else: # custom registry + registry = components[0] + repo = "/".join(components[1:]) + + return DockerImage(image=image, registry=registry, repo=repo, tag=tag, digest=digest) + + +def is_default_registry(registry: str) -> bool: + return registry in [DEFAULT_REGISTRY, LEGACY_DEFAULT_REGISTRY] + + +def _is_host(s: str) -> bool: + return s == "localhost" or ":" in s or "." in s diff --git a/src/dstack/_internal/utils/env.py b/src/dstack/_internal/utils/env.py new file mode 100644 index 0000000000..9ce97b6c40 --- /dev/null +++ b/src/dstack/_internal/utils/env.py @@ -0,0 +1,121 @@ +import os +from collections.abc import Mapping +from enum import Enum +from typing import Callable, Optional, TypeVar, Union, overload + +_EVT = Union[str, int] +_ET = TypeVar("_ET", bound=Enum) + +_CVT = TypeVar("_CVT") + + +class Environ: + def __init__(self, environ: Mapping[str, str]): + self._environ = environ + + @overload + def get(self, name: str, *, default: None = None) -> Optional[str]: ... + + @overload + def get(self, name: str, *, default: str) -> str: ... + + def get(self, name: str, *, default: Optional[str] = None) -> Optional[str]: + return self._environ.get(name, default) + + @overload + def get_bool(self, name: str, *, default: None = None) -> Optional[bool]: ... + + @overload + def get_bool(self, name: str, *, default: bool) -> bool: ... 
+ + def get_bool(self, name: str, *, default: Optional[bool] = None) -> Optional[bool]: + try: + raw_value = self._environ[name] + except KeyError: + return default + value = raw_value.lower() + if value in ["0", "false", "off"]: + return False + if value in ["1", "true", "on"]: + return True + raise ValueError(f"Invalid bool value: {name}={raw_value}") + + @overload + def get_int(self, name: str, *, default: None = None) -> Optional[int]: ... + + @overload + def get_int(self, name: str, *, default: int) -> int: ... + + def get_int(self, name: str, *, default: Optional[int] = None) -> Optional[int]: + try: + raw_value = self._environ[name] + except KeyError: + return default + try: + return int(raw_value) + except ValueError as e: + raise ValueError(f"Invalid int value: {e}: {name}={raw_value}") from e + + @overload + def get_enum( + self, + name: str, + enum_cls: type[_ET], + *, + value_type: Optional[type[_EVT]] = None, + default: None = None, + ) -> Optional[_ET]: ... + + @overload + def get_enum( + self, + name: str, + enum_cls: type[_ET], + *, + value_type: Optional[type[_EVT]] = None, + default: _ET, + ) -> _ET: ... + + def get_enum( + self, + name: str, + enum_cls: type[_ET], + *, + value_type: Optional[type[_EVT]] = None, + default: Optional[_ET] = None, + ) -> Optional[_ET]: + try: + raw_value = self._environ[name] + except KeyError: + return default + try: + if value_type is not None: + raw_value = value_type(raw_value) + return enum_cls(raw_value) + except (ValueError, TypeError) as e: + raise ValueError(f"Invalid {enum_cls.__name__} value: {e}: {name}={raw_value}") from e + + @overload + def get_callback( + self, name: str, callback: Callable[[str], _CVT], *, default: None = None + ) -> Optional[_CVT]: ... + + @overload + def get_callback( + self, name: str, callback: Callable[[str], _CVT], *, default: _CVT + ) -> _CVT: ... 
+ + def get_callback( + self, name: str, callback: Callable[[str], _CVT], *, default: Optional[_CVT] = None + ) -> Optional[_CVT]: + try: + raw_value = self._environ[name] + except KeyError: + return default + try: + return callback(raw_value) + except ValueError as e: + raise ValueError(f"Invalid value: {e}: {name}={raw_value}") from e + + +environ = Environ(os.environ) diff --git a/src/dstack/_internal/utils/event_loop.py b/src/dstack/_internal/utils/event_loop.py new file mode 100644 index 0000000000..29de015c01 --- /dev/null +++ b/src/dstack/_internal/utils/event_loop.py @@ -0,0 +1,30 @@ +import asyncio +import threading +from collections.abc import Awaitable +from typing import TypeVar + +T = TypeVar("T") + + +class DaemonEventLoop: + """ + A wrapper around asyncio.EventLoop that runs the loop in a daemon thread. + The thread is started with the first `await_` call. + """ + + def __init__(self) -> None: + self._loop = asyncio.new_event_loop() + self._start_lock = threading.Lock() + self._started = False + + def await_(self, awaitable: Awaitable[T]) -> T: + with self._start_lock: + if not self._started: + threading.Thread(target=self._loop.run_forever, daemon=True).start() + self._started = True + future = asyncio.run_coroutine_threadsafe(_coroutine(awaitable), self._loop) + return future.result() + + +async def _coroutine(awaitable: Awaitable[T]) -> T: + return await awaitable diff --git a/src/dstack/_internal/utils/files.py b/src/dstack/_internal/utils/files.py new file mode 100644 index 0000000000..71a0f8ef70 --- /dev/null +++ b/src/dstack/_internal/utils/files.py @@ -0,0 +1,69 @@ +import tarfile +from pathlib import Path +from typing import BinaryIO + +import ignore +import ignore.overrides + +from dstack._internal.utils.hash import get_sha256 +from dstack._internal.utils.path import PathLike, normalize_path + + +def create_file_archive(root: PathLike, fp: BinaryIO) -> str: + """ + Packs the directory or file to a tar archive and writes it to the file-like 
object. + + Archives can be used to transfer file(s) (e.g., over the network) preserving + file properties such as permissions, timestamps, etc. + + NOTE: `.gitignore` and `.dstackignore` are respected. + + Args: + root: The absolute path to the directory or file. + fp: The binary file-like object. + + Returns: + The SHA-256 hash of the archive as a hex string. + + Raises: + ValueError: If the path is not absolute. + OSError: Underlying errors from the tarfile module + """ + root = Path(root) + if not root.is_absolute(): + raise ValueError(f"path must be absolute: {root}") + walk = ( + ignore.WalkBuilder(root) + .overrides(ignore.overrides.OverrideBuilder(root).add("!/.git/").build()) + .hidden(False) # do not ignore files that start with a dot + .require_git(False) # respect git ignore rules even if not a git repo + .add_custom_ignore_filename(".dstackignore") + .build() + ) + # sort paths to ensure archive reproducibility + paths = sorted(entry.path() for entry in walk) + with tarfile.TarFile(mode="w", fileobj=fp) as t: + for path in paths: + arcname = str(path.relative_to(root.parent)) + info = t.gettarinfo(path, arcname) + if info.issym(): + # Symlinks are handled as follows: each symlink in the chain is checked, and + # * if the target is inside the root: keep relative links as is, replace absolute + # links with relative ones; + # * if the target is outside the root: replace the link with the actual file. 
+ target = Path(info.linkname) + if not target.is_absolute(): + target = path.parent / target + target = normalize_path(target) + try: + target.relative_to(root) + except ValueError: + # Adding as a file + t.add(path.resolve(), arcname, recursive=False) + else: + # Adding as a relative symlink + info.linkname = str(target.relative_to(path.parent, walk_up=True)) + t.addfile(info) + else: + t.add(path, arcname, recursive=False) + return get_sha256(fp) diff --git a/src/dstack/_internal/utils/gpu.py b/src/dstack/_internal/utils/gpu.py index 2455c545fe..d0fc94e50c 100644 --- a/src/dstack/_internal/utils/gpu.py +++ b/src/dstack/_internal/utils/gpu.py @@ -1,7 +1,7 @@ import re -def convert_gpu_name(name: str) -> str: +def convert_nvidia_gpu_name(name: str) -> str: """Convert gpu_name from nvidia-smi to short version""" # https://github.com/NVIDIA/open-gpu-kernel-modules/ name = name.replace("NVIDIA ", "") @@ -20,8 +20,41 @@ def convert_gpu_name(name: str) -> str: return name.replace(" ", "") name = name.replace(" Ti", "Ti") + name = re.sub(r"(?i) ?SUPER", "SUPER", name) + name = name.replace(" NVL", "NVL") + name = name.replace(" Ada Generation", "Ada") name = name.replace("RTX ", "RTX") - m = re.search(r"([A|H|L|P|T|V]\d+[Ti]?)", name) + m = re.search(r"([AHLPTV]\d+\w*)", name) if m is not None: return m.group(0) return name.replace(" ", "") + + +def convert_amd_gpu_name(name: str) -> str: + """Convert asic.market_name from amd-smi to short version""" + if match := _AMD_INSTINCT_MARKET_NAME_REGEX.search(name): + name = match.group("name") + # https://github.com/ROCm/amdsmi/blob/52b3947/src/amd_smi/amd_smi_utils.cc#L558-L593 + if name == "MI300X-O": + return "MI300X" + return name + + +def convert_intel_accelerator_name(name: str) -> str: + """Convert name from hl-smi to market name""" + for model_name, market_name in _INTEL_GAUDI_MODELS.items(): + if name.startswith(model_name): + return market_name + return name + + 
+_AMD_INSTINCT_MARKET_NAME_REGEX = re.compile( + r"^(?:AMD )?(?:Instinct )?(?PMI\d{1,3}[A-Z]?(?:-\w+)?)(?:\s|$)", flags=re.ASCII | re.I +) + +_INTEL_GAUDI_MODELS = { + "HL-205": "Gaudi", + "HL-225": "Gaudi2", + "HL-325": "Gaudi3", # OAM + "HL-338": "Gaudi3", # PCIe +} diff --git a/src/dstack/_internal/utils/ignore.py b/src/dstack/_internal/utils/ignore.py deleted file mode 100644 index cfe83e2e77..0000000000 --- a/src/dstack/_internal/utils/ignore.py +++ /dev/null @@ -1,90 +0,0 @@ -import fnmatch -from itertools import zip_longest -from pathlib import Path -from typing import Dict, List, Optional - -from dstack._internal.utils.path import PathLike - - -class GitIgnore: - def __init__( - self, root_dir: PathLike, ignore_files: List[str] = None, globs: List[str] = None - ): - self.root_dir = Path(root_dir) - self.ignore_files = ( - ignore_files - if ignore_files is not None - else [".gitignore", ".git/info/exclude", ".dstackignore"] - ) - self.ignore_globs: Dict[str, List[str]] = {".": globs or []} - self.load_recursive() - - def load_ignore_file(self, path: str, ignore_file: Path): - if path not in self.ignore_globs: - self.ignore_globs[path] = [] - with ignore_file.open("r") as f: - for line in f: - line = self.rstrip(line.rstrip("\n")).rstrip("/") - line = line.replace("\\ ", " ") - if line.startswith("#") or not line: - continue - self.ignore_globs[path].append(line) - - def load_recursive(self, path: Optional[Path] = None): - path = path or self.root_dir - for ignore_file in self.ignore_files: - ignore_file = path / ignore_file - if ignore_file.exists(): - self.load_ignore_file(str(path.relative_to(self.root_dir)), ignore_file) - - for subdir in path.iterdir(): - if not subdir.is_dir() or self.ignore(subdir.relative_to(self.root_dir)): - continue - self.load_recursive(subdir) - - @staticmethod - def rstrip(value: str) -> str: - end = len(value) - 1 - while end >= 0: - if not value[end].isspace(): - break - if end > 0 and value[end - 1] == "\\": - break # escaped 
space - end -= 1 - else: - return "" - return value[: end + 1] - - @staticmethod - def fnmatch(name: str, pattern: str, sep="/") -> bool: - if pattern.startswith(sep): - name = sep + name - for n, p in zip_longest( - reversed(name.split(sep)), reversed(pattern.split(sep)), fillvalue=None - ): - if p == "**": - raise NotImplementedError() - if p is None: - return True - if n is None or not fnmatch.fnmatch(n, p): - return False - return True - - def ignore(self, path: PathLike, sep="/") -> bool: - if not path: - return False - path = Path(path) - if path.is_absolute(): - path = path.relative_to(self.root_dir) - - tokens = ("." + sep + str(path)).split(sep) - for i in range(1, len(tokens)): - parent = sep.join(tokens[:-i]) - globs = self.ignore_globs.get(parent) - if not globs: - continue - name = sep.join(tokens[-i:]) - for glob in globs: - if self.fnmatch(name, glob, sep=sep): - return True - return False diff --git a/src/dstack/_internal/utils/interpolator.py b/src/dstack/_internal/utils/interpolator.py index 02d56243c7..9a4e44659b 100644 --- a/src/dstack/_internal/utils/interpolator.py +++ b/src/dstack/_internal/utils/interpolator.py @@ -1,5 +1,6 @@ import string -from typing import Dict, Iterable, List, Optional, Tuple, Union +from collections.abc import Mapping +from typing import Iterable, List, Literal, Optional, Tuple, Union, overload class Pattern: @@ -12,13 +13,30 @@ class Name: char = first_char | set(string.digits + ".") +class InterpolatorError(ValueError): + """ + Raised when interpolation fails. + + May be shown to the users, should not contain sensitive information, + such as variable values. 
+ """ + + pass + + class VariablesInterpolator: def __init__( - self, namespaces: Dict[str, Dict[str, str]], *, skip: Optional[Iterable[str]] = None + self, namespaces: Mapping[str, Mapping[str, str]], *, skip: Optional[Iterable[str]] = None ): self.skip = set(skip) if skip is not None else set() self.variables = {f"{ns}.{k}": v for ns in namespaces for k, v in namespaces[ns].items()} + @overload + def interpolate(self, s: str, return_missing: Literal[False] = False) -> str: ... + + @overload + def interpolate(self, s: str, return_missing: Literal[True]) -> Tuple[str, List[str]]: ... + def interpolate( self, s: str, return_missing: bool = False ) -> Union[str, Tuple[str, List[str]]]: @@ -42,11 +60,11 @@ def interpolate( tokens.append(s[start:opening]) closing = s.find(Pattern.closing, opening) if closing == -1: - raise ValueError(f"No pattern closing: {s[opening:]}") + raise InterpolatorError(f"No pattern closing: {s[opening:]}") name = s[opening + len(Pattern.opening) : closing].strip() if not self.validate_name(name): - raise ValueError(f"Illegal reference name: {name}") + raise InterpolatorError(f"Illegal reference name: {name}") if name.split(".")[0] in self.skip: tokens.append(s[opening : closing + len(Pattern.closing)]) elif name in self.variables: @@ -57,6 +75,12 @@ def interpolate( s = "".join(tokens) return (s, missing) if return_missing else s + def interpolate_or_error(self, s: str) -> str: + res, missing = self.interpolate(s, return_missing=True) + if len(missing) == 0: + return res + raise InterpolatorError(f"Failed to interpolate due to missing vars: {missing}") + @staticmethod def validate_name(s: str) -> bool: if s.count(".") != 1 or not (0 < s.index(".") < len(s) - 1): diff --git a/src/dstack/_internal/utils/json_schema.py b/src/dstack/_internal/utils/json_schema.py new file mode 100644 index 0000000000..19bcd0bc62 --- /dev/null +++ b/src/dstack/_internal/utils/json_schema.py @@ -0,0 +1,11 @@ +def add_extra_schema_types(schema_property: dict, 
extra_types: list[dict]): + if "allOf" in schema_property: + refs = [schema_property.pop("allOf")[0]] + elif "anyOf" in schema_property: + refs = schema_property.pop("anyOf") + elif "type" in schema_property: + refs = [{"type": schema_property.pop("type")}] + else: + refs = [{"$ref": schema_property.pop("$ref")}] + refs.extend(extra_types) + schema_property["anyOf"] = refs diff --git a/src/dstack/_internal/utils/json_utils.py b/src/dstack/_internal/utils/json_utils.py new file mode 100644 index 0000000000..9017e94c31 --- /dev/null +++ b/src/dstack/_internal/utils/json_utils.py @@ -0,0 +1,54 @@ +from typing import Any + +import orjson +from pydantic import BaseModel + +FREEZEGUN = True +try: + from freezegun.api import FakeDatetime +except ImportError: + FREEZEGUN = False + + +ASYNCPG = True +try: + import asyncpg.pgproto.pgproto +except ImportError: + ASYNCPG = False + + +def pydantic_orjson_dumps(v: Any, *, default: Any) -> str: + return orjson.dumps( + v, + option=get_orjson_default_options(), + default=orjson_default, + ).decode() + + +def pydantic_orjson_dumps_with_indent(v: Any, *, default: Any) -> str: + return orjson.dumps( + v, + option=get_orjson_default_options() | orjson.OPT_INDENT_2, + default=orjson_default, + ).decode() + + +def orjson_default(obj): + if isinstance(obj, float): + # orjson does not convert float subclasses be default + return float(obj) + if isinstance(obj, BaseModel): + # Allows calling orjson.dumps() on pydantic models + # (e.g. 
to return from the API) + return obj.dict() + if ASYNCPG: + if isinstance(obj, asyncpg.pgproto.pgproto.UUID): + return str(obj) + if FREEZEGUN: + if isinstance(obj, FakeDatetime): + return obj.isoformat() + raise TypeError + + +def get_orjson_default_options() -> int: + return orjson.OPT_NON_STR_KEYS diff --git a/src/dstack/_internal/utils/nested_list.py b/src/dstack/_internal/utils/nested_list.py new file mode 100644 index 0000000000..599298ed49 --- /dev/null +++ b/src/dstack/_internal/utils/nested_list.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class NestedListItem: + label: str + children: list["NestedListItem"] = field(default_factory=list) + + def render(self, indent: int = 0, visited: Optional[set[int]] = None) -> str: + if visited is None: + visited = set() + + item_id = id(self) + if item_id in visited: + raise ValueError(f"Cycle detected at item: {self.label}") + + visited.add(item_id) + prefix = " " * indent + "- " + output = f"{prefix}{self.label}\n" + for child in self.children: + # `visited.copy()` so that we only detect cycles within each path, + # rather than duplicate items in unrelated paths + output += child.render(indent + 1, visited.copy()) + return output + + +@dataclass +class NestedList: + """ + A nested list that can be rendered in Markdown-like format: + + - Item 1 + - Item 2 + - Item 2.1 + - Item 2.2 + - Item 2.2.1 + - Item 3 + """ + + children: list[NestedListItem] = field(default_factory=list) + + def render(self) -> str: + output = "" + for child in self.children: + output += child.render() + return output diff --git a/src/dstack/_internal/utils/network.py b/src/dstack/_internal/utils/network.py index 5174380250..355753a617 100644 --- a/src/dstack/_internal/utils/network.py +++ b/src/dstack/_internal/utils/network.py @@ -1,5 +1,5 @@ import ipaddress -from typing import Optional, Sequence +from typing import List, Optional, Sequence def get_ip_from_network(network: 
Optional[str], addresses: Sequence[str]) -> Optional[str]: @@ -32,3 +32,19 @@ def get_ip_from_network(network: Optional[str], addresses: Sequence[str]) -> Opt # return any ipv4 internal_ip = str(ip_addresses[0]) if ip_addresses else None return internal_ip + + +def is_ip_among_addresses(ip_address: str, addresses: Sequence[str]) -> bool: + ip_addresses = get_ips_from_addresses(addresses) + return ip_address in ip_addresses + + +def get_ips_from_addresses(addresses: Sequence[str]) -> List[str]: + ip_addresses = [] + for address in addresses: + try: + interface = ipaddress.IPv4Interface(address) + ip_addresses.append(interface.ip) + except ipaddress.AddressValueError: + continue + return [str(ip) for ip in ip_addresses] diff --git a/src/dstack/_internal/utils/path.py b/src/dstack/_internal/utils/path.py index bc20db3d2f..07b8fdd664 100644 --- a/src/dstack/_internal/utils/path.py +++ b/src/dstack/_internal/utils/path.py @@ -1,10 +1,25 @@ import os -from pathlib import Path, PurePath +from dataclasses import dataclass +from pathlib import Path, PurePath, PurePosixPath +from tempfile import TemporaryDirectory from typing import Union PathLike = Union[str, os.PathLike] +@dataclass +class FilePath: + path: PathLike + + +@dataclass +class FileContent: + content: str + + +FilePathOrContent = Union[FilePath, FileContent] + + def path_in_dir(path: PathLike, directory: PathLike) -> bool: try: Path(path).resolve().relative_to(Path(directory).resolve()) @@ -13,16 +28,40 @@ def path_in_dir(path: PathLike, directory: PathLike) -> bool: return False -def resolve_relative_path(path: str) -> PurePath: +def normalize_path(path: PathLike) -> PurePath: path = PurePath(path) - if path.is_absolute(): - raise ValueError("Path should be relative") stack = [] for part in path.parts: if part == "..": if not stack: - raise ValueError("Path is outside of the repo") + raise ValueError("Path is outside of the top directory") stack.pop() else: stack.append(part) return PurePath(*stack) + + +def 
resolve_relative_path(path: PathLike) -> PurePath: + path = PurePath(path) + if path.is_absolute(): + raise ValueError("Path should be relative") + try: + return normalize_path(path) + except ValueError: + raise ValueError("Path is outside of the repo") + + +def is_absolute_posix_path(path: PathLike) -> bool: + # Passing Windows path leads to undefined behavior + if str(path).startswith("~"): + return True + return PurePosixPath(path).is_absolute() + + +def make_tmp_symlink_to_dir( + dirpath: PathLike, symlink_dirname: str +) -> tuple[TemporaryDirectory, Path]: + temp_dir = TemporaryDirectory() + symlink_dir = Path(temp_dir.name) / symlink_dirname + symlink_dir.symlink_to(dirpath, target_is_directory=True) + return temp_dir, symlink_dir diff --git a/src/dstack/_internal/utils/ssh.py b/src/dstack/_internal/utils/ssh.py index 92f3e285e1..e1828bc33b 100644 --- a/src/dstack/_internal/utils/ssh.py +++ b/src/dstack/_internal/utils/ssh.py @@ -1,25 +1,30 @@ import io import os import re +import shutil import subprocess import sys import tempfile from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Union import paramiko from filelock import FileLock from paramiko.config import SSHConfig from paramiko.pkey import PKey, PublicBlob +from paramiko.ssh_exception import SSHException +from dstack._internal.compat import IS_WINDOWS from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.path import PathLike +from dstack._internal.utils.path import FilePath, PathLike logger = get_logger(__name__) default_ssh_config_path = "~/.ssh/config" +SUPPORTED_KEY_TYPES = (paramiko.RSAKey, paramiko.ECDSAKey, paramiko.Ed25519Key) + def get_public_key_fingerprint(text: str) -> str: pb = PublicBlob.from_string(text) @@ -36,18 +41,95 @@ def get_host_config(hostname: str, ssh_config_path: PathLike = default_ssh_confi def make_ssh_command_for_git(identity_file: PathLike) -> str: - return f"ssh -F none -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null -o IdentitiesOnly=yes -F /dev/null -o IdentityFile={identity_file}" + # No need to use :func:`find_ssh_client()` even on Windows even if `ssh` not in + # Windows `PATH` -- MSYS2 git (from Git for Windows) always has access to it, + # see https://www.msys2.org/docs/environments/ ("MSYS environment [...] is always active") + return ( + f'ssh -F none -i "{normalize_path(identity_file)}"' + " -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o IdentitiesOnly=yes" + ) + + +def make_git_env( + *, + disable_prompt: bool = True, + disable_askpass: bool = False, + disable_config: bool = False, + identity_file: Optional[PathLike] = None, +) -> dict[str, str]: + env: dict[str, str] = {} + if disable_prompt: + # Fail with error instead of prompting on the terminal (e.g., when asking for + # HTTP authentication) + env["GIT_TERMINAL_PROMPT"] = "0" + if disable_askpass: + env["GIT_ASKPASS"] = "" + env["SSH_ASKPASS"] = "" + if disable_config: + # Disable system-wide config (usually /etc/gitconfig) + env["GIT_CONFIG_SYSTEM"] = os.devnull + # Disable user (aka "global") config ($XDG_CONFIG_HOME/git/config or ~/.git/config) + env["GIT_CONFIG_GLOBAL"] = os.devnull + # Disable repo (aka "local") config (./.git/config) + env["GIT_DIR"] = os.devnull + if identity_file is not None: + env["GIT_SSH_COMMAND"] = make_ssh_command_for_git(identity_file) + return env def try_ssh_key_passphrase(identity_file: PathLike, passphrase: str = "") -> bool: + ssh_keygen = find_ssh_util("ssh-keygen") + if ssh_keygen is None: + logger.warning("ssh-keygen not found") + return False r = subprocess.run( - ["ssh-keygen", "-y", "-P", passphrase, "-f", identity_file], + [ssh_keygen, "-y", "-P", passphrase, "-f", identity_file], stdout=subprocess.DEVNULL, stderr=sys.stdout.buffer, ) return r.returncode == 0 +def normalize_path(path: PathLike, *, collapse_user: bool = False) -> str: + """ + Converts a path to the most compatible format. 
+ On Windows, replaces backslashes with slashes. + Additionally, if `collapse_user` is `True`, tries to replace the user home part of the path + with `~`. + + :param path: Path object or string + :param collapse_user: try to replace user home prefix with `~`. `False` by default. + :return: Normalized path as string + """ + if collapse_user: + # The following "reverse" expanduser operation not only makes paths shorter and "nicer", + # but also fixes one specific issue with OpenSSH bundled with Git for Windows (MSYS2), + # see :func:`include_ssh_config` for details. + try: + path = Path(path).relative_to(Path.home()) + path = f"~/{path}" + except ValueError: + pass + if IS_WINDOWS: + # Git for Windows ssh (based on MSYS2, but there may be subtle differences between + # vanilla MSYS2 ssh and Git for Windows ssh) supports: + # * C:\\Users\\User + # * C:/Users/User + # * /c/Users/User + # does not support: + # * C:\Users\User (as pathllib.WindowsPath is rendered) + # OpenSSH_for_Windows supports: + # * C:\Users\User + # * C:\\Users\\User + # * C:/Users/User + # does not support: + # * /c/User/User + # We use C:/Users/User format as the safest (supported by both ssh builds; + # no backslash-escaping pitfalls) + return str(path).replace("\\", "/") + return str(path) + + def include_ssh_config(path: PathLike, ssh_config_path: PathLike = default_ssh_config_path): """ Adds Include entry on top of the default ssh config file @@ -56,25 +138,30 @@ def include_ssh_config(path: PathLike, ssh_config_path: PathLike = default_ssh_c """ ssh_config_path = os.path.expanduser(ssh_config_path) Path(ssh_config_path).parent.mkdir(0o600, parents=True, exist_ok=True) + # MSYS2 OpenSSH accepts only /c/Users/User/... format in the Include directive (although + # it accepts C:/Users/User/... in other directives). We try to work around this issue + # converting the path to ~/.dstack/... format. 
+ path = normalize_path(path, collapse_user=True) include = f"Include {path}\n" content = "" - with FileLock(str(ssh_config_path) + ".lock"): - if os.path.exists(ssh_config_path): - with open(ssh_config_path, "r") as f: - content = f.read() - if include not in content: - try: + try: + with FileLock(str(ssh_config_path) + ".lock"): + if os.path.exists(ssh_config_path): + with open(ssh_config_path, "r") as f: + content = f.read() + if include not in content: with open(ssh_config_path, "w") as f: f.write(include + content) - except PermissionError: - logger.warning( - f"Couldn't update `{ssh_config_path}` due to a permissions problem.\n\n" - f"The `vscode://vscode-remote/ssh-remote+/workflow` link and " - f"the `ssh ` command won't work.\n\n" - f"To fix this, make sure `{ssh_config_path}` is writable, or add " - f"`Include {path}` to the top of `{ssh_config_path}` manually.", - extra={"markup": True}, - ) + except PermissionError: + logger.warning( + f"Couldn't update `{ssh_config_path}` due to a permissions problem.\n" + f"The `vscode://vscode-remote/ssh-remote+/` and " + f"`cursor://vscode-remote/ssh-remote+/` links and " + f"the `ssh ` command won't work.\n" + f"To fix this, make sure `{ssh_config_path}` is writable, or add " + f"`Include {path}` to the top of `{ssh_config_path}` manually.", + extra={"markup": True}, + ) def get_ssh_config(path: PathLike, host: str) -> Optional[Dict[str, str]]: @@ -100,7 +187,7 @@ def get_ssh_config(path: PathLike, host: str) -> Optional[Dict[str, str]]: return None -def update_ssh_config(path: PathLike, host: str, options: Dict[str, str]): +def update_ssh_config(path: PathLike, host: str, options: Dict[str, Union[str, int, FilePath]]): Path(path).parent.mkdir(parents=True, exist_ok=True) with FileLock(str(path) + ".lock"): copy_mode = True @@ -118,40 +205,48 @@ def update_ssh_config(path: PathLike, host: str, options: Dict[str, str]): if options: f.write(f"Host {host}\n") for k, v in options.items(): + if isinstance(v, FilePath): 
+ v = normalize_path(v.path, collapse_user=True) f.write(f" {k} {v}\n") f.flush() -def convert_pkcs8_to_pem(private_string: str) -> str: +def convert_ssh_key_to_pem(private_string: str) -> str: if not private_string.startswith("-----BEGIN PRIVATE KEY-----"): return private_string with tempfile.NamedTemporaryFile(mode="w+") as key_file: key_file.write(private_string) key_file.flush() - cmd = ["ssh-keygen", "-p", "-m", "PEM", "-f", key_file.name, "-y", "-q", "-N", ""] - try: - subprocess.run( - cmd, - check=True, - capture_output=True, - text=True, - ) - except FileNotFoundError: + if ssh_keygen := find_ssh_util("ssh-keygen"): + cmd = [ssh_keygen, "-p", "-m", "PEM", "-f", key_file.name, "-y", "-q", "-N", ""] + try: + subprocess.run( + cmd, + check=True, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + logger.error("Fail to convert ssh key: stdout=%s, stderr=%s", e.stdout, e.stderr) + else: logger.error("Use a PEM key or install ssh-keygen to convert it automatically") - except subprocess.CalledProcessError as e: - logger.error("Fail to convert ssh key: stdout=%s, stderr=%s", e.stdout, e.stderr) - key_file.seek(0) private_string = key_file.read() return private_string -def rsa_pkey_from_str(private_string: str) -> PKey: - key_file = io.StringIO(private_string.strip()) - pkey = paramiko.RSAKey.from_private_key(key_file) - key_file.close() - return pkey +def pkey_from_str(private_string: str) -> PKey: + for key_type in SUPPORTED_KEY_TYPES: + try: + key_file = io.StringIO(private_string.strip()) + pkey = key_type.from_private_key(key_file) + key_file.close() + return pkey + except (SSHException, ValueError): + pass + + raise ValueError("Unsupported key type") def generate_public_key(private_key: PKey) -> str: @@ -178,3 +273,135 @@ def check_required_ssh_version() -> bool: return False return False + + +def find_ssh_client() -> Optional[Path]: + """ + Finds and returns an absolute path of `ssh` executable or `None` if not found. 
+ + If the `DSTACK_SSH_CLIENT` environment variable is set, return its value, otherwise: + * on POSIX, look for `ssh` executable in `PATH` and return it (if any). + * on Windows, first look for OpenSSH bundled with Git for Windows checking + a known directory structure, then check `PATH`, and finally check a well-known location of + OpenSSH for Windows. + """ + path_str = os.getenv("DSTACK_SSH_CLIENT") + if path_str: + path = Path(path_str) + if path.exists(): + return path.resolve() + logger.warning("DSTACK_SSH_CLIENT=%s does not exist", path_str) + return None + if not IS_WINDOWS: + path_str = shutil.which("ssh") + if path_str: + return Path(path_str) + return None + # First, we check for ssh bundled with Git for Windows (MSYS2/MinGW-w64-built OpenSSH Portable) + # as a preferred client. It supports ForkAfterAuthentication; ControlMaster is only partially + # supported, we don't use it. + git_path_str = shutil.which("git") + if git_path_str: + # C:\Program Files\Git\cmd\git.exe -> C:\Program Files\Git\usr\bin\ssh.exe + path = Path(git_path_str).parent.parent / "usr" / "bin" / "ssh.exe" + if path.exists(): + return path + # Then we check for any ssh client in PATH. It can be anything, but most likely it will be + # OpenSSH for Windows (see below). Nonetheless, it's worth trying since it's also may be + # MSYS2/Cygwin OpenSSH Portable. + path_str = shutil.which("ssh") + if path_str: + return Path(path_str) + # Finally we check for OpenSSH for Windows (Microsoft's fork of OpenSSH Portable). + # It does not support some features, namely ControlMaster and ForkAfterAuthentication. + windir_str = os.getenv("WINDIR") + if windir_str: + path = Path(windir_str) / "System32" / "OpenSSH" / "ssh.exe" + if path.exists(): + return path + return None + + +_ssh_util_dir: Optional[Path] = None + + +def find_ssh_util(name: str) -> Optional[Path]: + """ + Returns an absolute path of a given `ssh*` utility or `None` if not found. 
+ + :param name: a utility binary name without `.exe` suffix, e.g., `ssh-keygen`, `ssh-copy-id`. + :return: a Path object. + """ + global _ssh_util_dir + if _ssh_util_dir is None: + ssh_client_path = find_ssh_client() + if ssh_client_path is None: + return None + _ssh_util_dir = ssh_client_path.parent + if IS_WINDOWS: + name = f"{name}.exe" + path = _ssh_util_dir / name + if path.exists(): + return path + return None + + +def build_ssh_command( + *, + username: Optional[str] = None, + hostname: str, + port: Optional[int] = None, + ssh_executable: Optional[str] = None, +) -> list[str]: + """ + Builds an SSH client command line to connect. + + The resulting command is: + + ssh [username@]hostname [-p port] + + The port argument -p is only included if the port is not the default SSH port (22). + + :param username: an optional user login name. + :param hostname: a hostname, required. + :param port: an optional SSH port, defaults to 22. + :param ssh_executable: an optional file name or path of the SSH client, defaults to `ssh`. + :return: a list of command line arguments including the executable. + """ + if ssh_executable is None: + ssh_executable = "ssh" + command: list[str] = [ssh_executable] + if username is not None: + command.append(f"{username}@{hostname}") + else: + command.append(hostname) + if port is not None and port != 22: + command.extend(("-p", str(port))) + return command + + +def build_ssh_url_authority( + *, username: Optional[str] = None, hostname: str, port: Optional[int] = None +) -> str: + """ + Builds an authority URL component for use with ssh:// and ssh-based URLs (e.g., vscode://). + + The authority component consists of subcomponents: + + authority = [userinfo "@"] host [":" port] + + The port subcomponent is only included if the port is not the default SSH port (22). + + :param username: an optional user login name, used as the userinfo if provided. + :param hostname: a hostname, required. + :param port: an optional SSH port, defaults to 22. 
+ :return: the authority URL component as a string. + """ + if ":" in hostname and not hostname.startswith("["): + hostname = f"[{hostname}]" + authority = hostname + if username is not None: + authority = f"{username}@{authority}" + if port is not None and port != 22: + authority = f"{authority}:{port}" + return authority diff --git a/src/dstack/_internal/utils/tags.py b/src/dstack/_internal/utils/tags.py new file mode 100644 index 0000000000..c9bcac5099 --- /dev/null +++ b/src/dstack/_internal/utils/tags.py @@ -0,0 +1,42 @@ +import re +from typing import Dict, Optional + +# dstack resource tags allow alphanumeric tags with some special symbols in values. +# Should be valid across most backends (e.g. AWS). +# Does not guarantee that they are valid across all backends (e.g. GCP). +# So backends need to filter bad tags out. +TAG_KEY_PATTERN = re.compile(r"^[_\-a-zA-Z0-9]{1,60}$") +TAG_VALUE_PATTERN = re.compile(r"^[a-zA-Z0-9 .:/=_\-+@]{0,256}$") + + +def tags_validator(tags: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: + if tags is None: + return + validate_tags(tags) + return tags + + +def validate_tags(tags: Dict[str, str]): + for k, v in tags.items(): + _validate_tag(k, v) + + +def _validate_tag(key: str, value: str): + if not is_valid_tag_key(key): + raise ValueError( + f"Invalid tag key {key}. The key must match regex '{TAG_KEY_PATTERN.pattern}'" + ) + if not is_valid_tag_value(value): + raise ValueError( + f"Invalid tag value {value}. 
The value must match regex '{TAG_VALUE_PATTERN.pattern}'" + ) + + +def is_valid_tag_key(name: str) -> bool: + match = re.match(TAG_KEY_PATTERN, name) + return match is not None + + +def is_valid_tag_value(value: str) -> bool: + match = re.match(TAG_VALUE_PATTERN, value) + return match is not None diff --git a/src/dstack/_internal/utils/typing.py b/src/dstack/_internal/utils/typing.py new file mode 100644 index 0000000000..024464a0c3 --- /dev/null +++ b/src/dstack/_internal/utils/typing.py @@ -0,0 +1,14 @@ +from typing import Any, Protocol, TypeVar, Union + +_T_contra = TypeVar("_T_contra", contravariant=True) + + +class SupportsDunderLT(Protocol[_T_contra]): + def __lt__(self, other: _T_contra, /) -> bool: ... + + +class SupportsDunderGT(Protocol[_T_contra]): + def __gt__(self, other: _T_contra, /) -> bool: ... + + +SupportsRichComparison = Union[SupportsDunderLT[Any], SupportsDunderGT[Any]] diff --git a/src/dstack/_internal/utils/version.py b/src/dstack/_internal/utils/version.py new file mode 100644 index 0000000000..bd8703d0e9 --- /dev/null +++ b/src/dstack/_internal/utils/version.py @@ -0,0 +1,22 @@ +from typing import Optional + +import packaging.version + + +def parse_version(version_string: str) -> Optional[packaging.version.Version]: + """ + Returns a `packaging.version.Version` instance or `None` if the version is dev/latest. 
+ + Values parsed as the dev/latest version: + * the "latest" literal + * any "0.0.0" release, e.g., "0.0.0", "0.0.0a1", "0.0.0.dev0" + """ + if version_string == "latest": + return None + try: + version = packaging.version.parse(version_string) + except packaging.version.InvalidVersion as e: + raise ValueError(f"Invalid version: {version_string}") from e + if version.release == (0, 0, 0): + return None + return version diff --git a/src/dstack/api/__init__.py b/src/dstack/api/__init__.py index 4e5b76c047..0e6c6ebe3c 100644 --- a/src/dstack/api/__init__.py +++ b/src/dstack/api/__init__.py @@ -1,20 +1,25 @@ # ruff: noqa: F401 +import warnings + from dstack._internal.core.errors import ClientError from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.configurations import RegistryAuth +from dstack._internal.core.models.common import RegistryAuth +from dstack._internal.core.models.configurations import ( + DevEnvironmentConfiguration as _DevEnvironmentConfiguration, +) from dstack._internal.core.models.configurations import ScalingSpec as Scaling from dstack._internal.core.models.configurations import ( ServiceConfiguration as _ServiceConfiguration, ) from dstack._internal.core.models.configurations import TaskConfiguration as _TaskConfiguration -from dstack._internal.core.models.gateways import OpenAIChatModel, TGIChatModel -from dstack._internal.core.models.repos.local import LocalRepo from dstack._internal.core.models.repos.remote import RemoteRepo from dstack._internal.core.models.repos.virtual import VirtualRepo from dstack._internal.core.models.resources import ComputeCapability, Memory, Range +from dstack._internal.core.models.resources import CPUSpec as CPU from dstack._internal.core.models.resources import DiskSpec as Disk from dstack._internal.core.models.resources import GPUSpec as GPU from dstack._internal.core.models.resources import ResourcesSpec as Resources +from dstack._internal.core.models.services 
import OpenAIChatModel, TGIChatModel from dstack._internal.core.services.ssh.ports import PortUsedError from dstack.api._public import BackendCollection, Client, RepoCollection, RunCollection from dstack.api._public.backends import Backend @@ -22,3 +27,20 @@ Service = _ServiceConfiguration Task = _TaskConfiguration +DevEnvironment = _DevEnvironmentConfiguration + + +def __getattr__(name): + if name == "LocalRepo": + from dstack._internal.core.models.repos.local import LocalRepo + + warnings.warn( + ( + "Local repositories are not supported since 0.20.0. Use `files` to mount" + " an arbitrary directory: https://dstack.ai/docs/concepts/tasks/#files" + ), + DeprecationWarning, + ) + return LocalRepo + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/dstack/api/_public/__init__.py b/src/dstack/api/_public/__init__.py index ca067db43d..1d9ab353d2 100644 --- a/src/dstack/api/_public/__init__.py +++ b/src/dstack/api/_public/__init__.py @@ -2,12 +2,10 @@ import dstack._internal.core.services.api_client as api_client_service from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.services.configs import ConfigManager from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.path import PathLike +from dstack._internal.utils.path import PathLike as PathLike from dstack.api._public.backends import BackendCollection -from dstack.api._public.pools import PoolCollection -from dstack.api._public.repos import RepoCollection, get_ssh_keypair +from dstack.api._public.repos import RepoCollection from dstack.api._public.runs import RunCollection from dstack.api.server import APIClient @@ -16,12 +14,14 @@ class Client: """ - High-level API client for interacting with dstack server + High-level API client for interacting with the `dstack` server Attributes: + project: The project name. runs: Operations with runs. repos: Operations with repositories. 
backends: Operations with backends. + client: Low-level API client that supports all API endpoints. """ def __init__( @@ -34,37 +34,35 @@ def __init__( # Args: # api_client: low-level server API client # project_name: project name used for runs - # ssh_identity_file: SSH keypair to access instances + # ssh_identity_file: deprecated and will be removed in 0.19.40 # """ self._client = api_client self._project = project_name self._repos = RepoCollection(api_client, project_name) self._backends = BackendCollection(api_client, project_name) self._runs = RunCollection(api_client, project_name, self) - self._pool = PoolCollection(api_client, project_name) - if ssh_identity_file: - self.ssh_identity_file = str(ssh_identity_file) - else: - self.ssh_identity_file = get_ssh_keypair(None, ConfigManager().dstack_key_path) + if ssh_identity_file is not None: + logger.warning( + "[code]ssh_identity_file[/code] in [code]Client[/code] is deprecated and ignored; will be removed" + " since 0.19.40" + ) @staticmethod def from_config( project_name: Optional[str] = None, server_url: Optional[str] = None, user_token: Optional[str] = None, - ssh_identity_file: Optional[PathLike] = None, ) -> "Client": """ Creates a Client using the default configuration from `~/.dstack/config.yml` if it exists. Args: - project_name: The name of the project, required if `server_url` and `user_token` are specified - server_url: The dstack server URL (e.g. `http://localhost:3000/` or `https://sky.dstack.ai`) - user_token: The dstack user token - ssh_identity_file: The private SSH key path for SSH tunneling + project_name: The name of the project. required if `server_url` and `user_token` are specified. + server_url: The dstack server URL (e.g. `http://localhost:3000/` or `https://sky.dstack.ai`). + user_token: The dstack user token. Returns: - A client instance + A client instance. 
""" if server_url is not None and user_token is not None: if project_name is None: @@ -75,29 +73,24 @@ def from_config( return Client( api_client=api_client, project_name=project_name, - ssh_identity_file=ssh_identity_file, ) @property - def repos(self) -> RepoCollection: - return self._repos - - @property - def backends(self) -> BackendCollection: - return self._backends + def project(self) -> str: + return self._project @property def runs(self) -> RunCollection: return self._runs @property - def client(self) -> APIClient: - return self._client + def repos(self) -> RepoCollection: + return self._repos @property - def project(self) -> str: - return self._project + def backends(self) -> BackendCollection: + return self._backends @property - def pool(self) -> PoolCollection: - return self._pool + def client(self) -> APIClient: + return self._client diff --git a/src/dstack/api/_public/backends.py b/src/dstack/api/_public/backends.py index 84e446f88d..452fb9c458 100644 --- a/src/dstack/api/_public/backends.py +++ b/src/dstack/api/_public/backends.py @@ -1,6 +1,6 @@ from typing import List -from dstack._internal.core.models.backends import BackendInfo +from dstack._internal.core.backends.models import BackendInfo from dstack.api.server import APIClient diff --git a/src/dstack/api/_public/pools.py b/src/dstack/api/_public/pools.py deleted file mode 100644 index a496c2f4be..0000000000 --- a/src/dstack/api/_public/pools.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import List - -from dstack._internal.core.models.pools import Pool -from dstack.api.server import APIClient - - -class PoolInstance: - def __init__(self, api_client: APIClient, pool: Pool): - self._api_client = api_client - self._pool = pool - - @property - def name(self) -> str: - return self._pool.name - - def __str__(self) -> str: - return f"" - - def __repr__(self) -> str: - return f"" - - -class PoolCollection: - """ - Operations with pools - """ - - def __init__(self, api_client: APIClient, project: str): 
- self._api_client = api_client - self._project = project - - def list(self) -> List[PoolInstance]: - """ - List available pool in the project - - Returns: - pools - """ - list_raw_pool = self._api_client.pool.list(project_name=self._project) - list_pool = [PoolInstance(self._api_client, instance) for instance in list_raw_pool] - return list_pool diff --git a/src/dstack/api/_public/repos.py b/src/dstack/api/_public/repos.py index 12f2372242..07dfa1e46d 100644 --- a/src/dstack/api/_public/repos.py +++ b/src/dstack/api/_public/repos.py @@ -1,19 +1,20 @@ -from pathlib import Path -from typing import Optional, Union - -import giturlparse -from git import InvalidGitRepositoryError - -from dstack._internal.core.errors import ConfigurationError, ResourceNotExistsError -from dstack._internal.core.models.repos import LocalRepo, RemoteRepo -from dstack._internal.core.models.repos.base import Repo, RepoType -from dstack._internal.core.services.configs import ConfigManager -from dstack._internal.core.services.repos import ( - InvalidRepoCredentialsError, - get_local_repo_credentials, - load_repo, +from typing import Literal, Optional, Union, overload + +from dstack._internal.core.errors import ( + ConfigurationError, + RepoInvalidCredentialsError, + RepoInvalidGitRepositoryError, + ResourceNotExistsError, +) +from dstack._internal.core.models.repos import ( + LocalRepo, + RemoteRepo, + RemoteRepoCreds, + Repo, + RepoHead, + RepoHeadWithCreds, ) -from dstack._internal.utils.crypto import generate_rsa_key_pair +from dstack._internal.core.services.repos import get_repo_creds_and_default_branch from dstack._internal.utils.logging import get_logger from dstack._internal.utils.path import PathLike from dstack.api.server import APIClient @@ -35,6 +36,7 @@ def init( repo: Repo, git_identity_file: Optional[PathLike] = None, oauth_token: Optional[str] = None, + creds: Optional[RemoteRepoCreds] = None, ): """ Initializes the repo and configures its credentials in the project. 
@@ -43,7 +45,7 @@ def init( Example: ```python - repo=RemoteRepo.from_url( + repo = RemoteRepo.from_url( repo_url="https://github.com/dstackai/dstack-examples", repo_branch="main", ) @@ -56,7 +58,7 @@ def init( Once the repo is initialized, you can pass the repo object to the run: ```python - run = client.runs.submit( + run = client.runs.apply_configuration( configuration=..., repo=repo, ) @@ -66,39 +68,25 @@ def init( repo: The repo to initialize. git_identity_file: The private SSH key path for accessing the remote repo. oauth_token: The GitHub OAuth token to access the remote repo. + creds: Optional prepared repo credentials. If specified, both `git_identity_file` + and `oauth_token` are ignored. """ - creds = None - if isinstance(repo, RemoteRepo): + if isinstance(repo, LocalRepo): + raise ConfigurationError( + "Local repositories are not supported since 0.20.0. Use `files` to mount" + " an arbitrary directory: https://dstack.ai/docs/concepts/tasks/#files" + ) + if creds is None and isinstance(repo, RemoteRepo): try: - creds = get_local_repo_credentials( - repo_data=repo.run_repo_data, + creds, _ = get_repo_creds_and_default_branch( + repo_url=repo.repo_url, identity_file=git_identity_file, oauth_token=oauth_token, - original_hostname=giturlparse.parse(repo.repo_url).resource, ) - except InvalidRepoCredentialsError as e: - raise ConfigurationError(*e.args) + except RepoInvalidCredentialsError: + raise ConfigurationError("No valid default Git credentials found") self._api_client.repos.init(self._project, repo.repo_id, repo.get_repo_info(), creds) - def is_initialized( - self, - repo: Repo, - ) -> bool: - # """ - # Checks if the remote repo is initialized in the project - # - # Args: - # repo: repo to check - # - # Returns: - # repo is initialized - # """ - try: - self._api_client.repos.get(self._project, repo.repo_id, include_creds=False) - return True - except ResourceNotExistsError: - return False - def 
load( self, repo_dir: PathLike, @@ -106,70 +94,108 @@ def load( init: bool = False, git_identity_file: Optional[PathLike] = None, oauth_token: Optional[str] = None, - ) -> Union[RemoteRepo, LocalRepo]: + ) -> RemoteRepo: # """ # Loads the repo from the local directory using global config - # + # Args: - # repo_dir: repo root directory - # local: do not try to load `RemoteRepo` first - # init: initialize the repo if it's not initialized - # git_identity_file: path to an SSH private key to access the remote repo - # oauth_token: GitHub OAuth token to access the remote repo - # + # repo_dir: Repo root directory. + # local: Do not try to load `RemoteRepo` first. + # init: Initialize the repo if it's not initialized. + # git_identity_file: Path to an SSH private key to access the remote repo. + # oauth_token: GitHub OAuth token to access the remote repo. + # Raises: - # ConfigurationError: if the repo is not initialized and `init` is `False` - # + # ConfigurationError: If the repo is not initialized and `init` is `False`. + # Returns: - # repo: initialized repo + # repo: Initialized repo. # """ - config = ConfigManager() + logger.warning( + "The load() method is deprecated, use RemoteRepo directly:" + " https://dstack.ai/docs/reference/api/python/#dstack.api.RemoteRepo" + ) + if local: + raise ConfigurationError( + "Local repositories are not supported since 0.20.0. Use `files` to mount" + " an arbitrary directory: https://dstack.ai/docs/concepts/tasks/#files" + ) if not init: - logger.debug("Loading repo config") - repo_config = config.get_repo_config(repo_dir) - if repo_config is None: - raise ConfigurationError( - "The repo is not initialized. Run `dstack init` to initialize the repo." - ) - repo = load_repo(repo_config) - try: - self._api_client.repos.get(self._project, repo.repo_id, include_creds=False) - except ResourceNotExistsError: - raise ConfigurationError( - "The repo is not initialized. 
Run `dstack init` to initialize the repo." - ) - else: - logger.debug("Initializing repo") - repo = LocalRepo(repo_dir=repo_dir) # default - if not local: - try: - repo = RemoteRepo(local_repo_dir=repo_dir) - except InvalidGitRepositoryError: - pass # use default - self.init(repo, git_identity_file, oauth_token) - config.save_repo_config( - repo.repo_dir, - repo.repo_id, - RepoType(repo.run_repo_data.repo_type), - get_ssh_keypair(None, config.dstack_key_path), + raise ConfigurationError( + "Repo config has been removed in 0.20.0," + " this method can now only be used with init=True" + ) + logger.debug("Initializing repo") + try: + repo = RemoteRepo.from_dir(repo_dir) + except RepoInvalidGitRepositoryError: + raise ConfigurationError( + f"Git repo not found: {repo_dir}. Use `files` to mount an arbitrary" + " directory: https://dstack.ai/docs/concepts/tasks/#files" ) + self.init(repo, git_identity_file, oauth_token) return repo + def is_initialized( + self, + repo: Repo, + by_user: bool = False, + ) -> bool: + """ + Checks if the repo is initialized in the project -def get_ssh_keypair(key_path: Optional[PathLike], dstack_key_path: Path) -> str: - """Returns a path to the private key""" - if key_path is not None: - key_path = Path(key_path).expanduser().resolve() - pub_key = ( - key_path - if key_path.suffix == ".pub" - else key_path.with_suffix(key_path.suffix + ".pub") - ) - private_key = pub_key.with_suffix("") - if pub_key.exists() and private_key.exists(): - return str(private_key) - raise ConfigurationError(f"Make sure valid keypair exists: {private_key}(.pub)") - - if not dstack_key_path.exists(): - generate_rsa_key_pair(private_key_path=dstack_key_path) - return str(dstack_key_path) + Args: + repo: The repo to check. + by_user: Require the remote repo to be initialized by the user, that is, to have + the user's credentials. Ignored for other repo types. + + Returns: + Whether the repo is initialized or not. 
+ """ + if isinstance(repo, RemoteRepo) and by_user: + return self._is_initialized_by_user(repo) + try: + self._api_client.repos.get(self._project, repo.repo_id) + return True + except ResourceNotExistsError: + return False + + def _is_initialized_by_user(self, repo: RemoteRepo) -> bool: + try: + repo_head = self._api_client.repos.get_with_creds(self._project, repo.repo_id) + except ResourceNotExistsError: + return False + # This works because: + # - RepoCollection.init() always submits RemoteRepoCreds for remote repos, even if + # the repo is public + # - Server returns creds only if there is RepoCredsModel for the user (or legacy + # shared creds in RepoModel) + # TODO: add an API method with the same logic returning a bool value? + return repo_head.repo_creds is not None + + @overload + def get(self, repo_id: str, *, with_creds: Literal[False] = False) -> Optional[RepoHead]: ... + + @overload + def get(self, repo_id: str, *, with_creds: Literal[True]) -> Optional[RepoHeadWithCreds]: ... + + def get( + self, repo_id: str, *, with_creds: bool = False + ) -> Optional[Union[RepoHead, RepoHeadWithCreds]]: + """ + Returns the repo by `repo_id` + + Args: + repo_id: The repo ID. + with_creds: include repo credentials in the response. + + Returns: + The repo or `None` if the repo is not found. 
+ """ + method = self._api_client.repos.get + if with_creds: + method = self._api_client.repos.get_with_creds + try: + return method(self._project, repo_id) + except ResourceNotExistsError: + return None diff --git a/src/dstack/api/_public/resources.py b/src/dstack/api/_public/resources.py deleted file mode 100644 index d32ec4bd4b..0000000000 --- a/src/dstack/api/_public/resources.py +++ /dev/null @@ -1,105 +0,0 @@ -from typing import List, Optional, Union - -from dstack._internal.core.models.resources import ( - DEFAULT_CPU_COUNT, - DEFAULT_GPU_COUNT, - DEFAULT_MEMORY_SIZE, - ComputeCapabilityLike, - DiskLike, - DiskSpec, - DiskSpecSchema, - GPULike, - GPUSpec, - GPUSpecSchema, - IntRangeLike, - MemoryLike, - MemoryRangeLike, - ResourcesSpec, - ResourcesSpecSchema, -) - - -# TODO(andrey): This method looks like a workaround and possibly must be reworked (replaced with something else). -# Currently it's only used by the `dstack pool add` command. -def Resources( - *, - cpu: IntRangeLike = DEFAULT_CPU_COUNT, - memory: MemoryRangeLike = DEFAULT_MEMORY_SIZE, - gpu: Optional[GPULike] = None, - shm_size: Optional[MemoryLike] = None, - disk: Optional[DiskLike] = None, -) -> ResourcesSpec: - """ - Creates required resources specification. - - Args: - cpu (Optional[Range[int]]): The number of CPUs - memory (Optional[Range[Memory]]): The size of RAM memory (e.g., `"16GB"`) - gpu (Optional[GPUSpec]): The GPU spec - shm_size (Optional[Range[Memory]]): The of shared memory (e.g., `"8GB"`). If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure this. 
- disk (Optional[DiskSpec]): The disk spec - - Returns: - resources specification - """ - return ResourcesSpec.parse_obj( - ResourcesSpecSchema( - cpu=cpu, - memory=memory, - gpu=gpu, - shm_size=shm_size, - disk=disk, - ) - ) - - -def GPU( - *, - name: Optional[Union[List[str], str]] = None, - count: IntRangeLike = DEFAULT_GPU_COUNT, - memory: Optional[MemoryRangeLike] = None, - total_memory: Optional[MemoryRangeLike] = None, - compute_capability: Optional[ComputeCapabilityLike] = None, -) -> GPUSpec: - """ - Creates GPU specification. - - Args: - name (Optional[List[str]]): The name of the GPU (e.g., `"A100"` or `"H100"`) - count (Optional[Range[int]]): The number of GPUs - memory (Optional[Range[Memory]]): The size of a single GPU memory (e.g., `"16GB"`) - total_memory (Optional[Range[Memory]]): The total size of all GPUs memory (e.g., `"32GB"`) - compute_capability (Optional[float]): The minimum compute capability of the GPU (e.g., `7.5`) - - Returns: - GPU specification - """ - return GPUSpec.parse_obj( - GPUSpecSchema( - name=name, - count=count, - memory=memory, - total_memory=total_memory, - compute_capability=compute_capability, - ) - ) - - -def Disk( - *, - size: MemoryRangeLike, -) -> DiskSpec: - """ - Creates disk specification. 
- - Args: - size (Range[Memory]): The size of the disk (e.g., `"100GB"`) - - Returns: - disk specification - """ - return DiskSpec.parse_obj( - DiskSpecSchema( - size=size, - ) - ) diff --git a/src/dstack/api/_public/runs.py b/src/dstack/api/_public/runs.py index 2341b89366..1b3def5424 100644 --- a/src/dstack/api/_public/runs.py +++ b/src/dstack/api/_public/runs.py @@ -4,44 +4,52 @@ import threading import time from abc import ABC +from collections.abc import Iterator +from contextlib import contextmanager from copy import copy from datetime import datetime from pathlib import Path -from typing import Dict, Iterable, List, Optional, Union -from urllib.parse import urlparse +from typing import BinaryIO, Dict, Iterable, List, Optional, Union +from urllib.parse import urlencode, urlparse from websocket import WebSocketApp import dstack.api as api -from dstack._internal.core.errors import ConfigurationError, ResourceNotExistsError +from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_RUNNER_SSH_PORT +from dstack._internal.core.deprecated import Deprecated +from dstack._internal.core.errors import ClientError, ConfigurationError, ResourceNotExistsError from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.configurations import AnyRunConfiguration -from dstack._internal.core.models.pools import Instance +from dstack._internal.core.models.configurations import ( + AnyRunConfiguration, + PortMapping, + ServiceConfiguration, +) +from dstack._internal.core.models.files import FileArchiveMapping from dstack._internal.core.models.profiles import ( - DEFAULT_RUN_TERMINATION_IDLE_TIME, - CreationPolicy, Profile, - ProfileRetryPolicy, - SpotPolicy, - TerminationPolicy, ) from dstack._internal.core.models.repos.base import Repo -from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.repos.virtual import VirtualRepo from dstack._internal.core.models.runs import ( + Job, 
JobSpec, - PoolInstanceOffers, - Requirements, + JobStatus, RunPlan, RunSpec, RunStatus, + get_service_port, ) from dstack._internal.core.models.runs import Run as RunModel +from dstack._internal.core.services.configs import ConfigManager from dstack._internal.core.services.logs import URLReplacer -from dstack._internal.core.services.ssh.attach import SSHAttach +from dstack._internal.core.services.ssh.attach import BaseSSHAttach, SSHAttach, SSHProxyAttach +from dstack._internal.core.services.ssh.key_manager import UserSSHKeyManager from dstack._internal.core.services.ssh.ports import PortsLock from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.utils.common import get_or_error, make_proxy_url +from dstack._internal.utils.files import create_file_archive from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.path import PathLike, path_in_dir +from dstack._internal.utils.path import PathLike from dstack.api.server import APIClient logger = get_logger(__name__) @@ -61,16 +69,20 @@ def __init__( self, api_client: APIClient, project: str, - ssh_identity_file: Optional[PathLike], run: RunModel, ports_lock: Optional[PortsLock] = None, + ssh_identity_file: Optional[PathLike] = None, ): self._api_client = api_client self._project = project - self._ssh_identity_file = ssh_identity_file self._run = run self._ports_lock: Optional[PortsLock] = ports_lock - self._ssh_attach: Optional[SSHAttach] = None + self._ssh_attach: Optional[BaseSSHAttach] = None + if ssh_identity_file is not None: + logger.warning( + "[code]ssh_identity_file[/code] in [code]Run[/code] is deprecated and ignored; will be removed" + " since 0.19.40" + ) @property def name(self) -> str: @@ -99,47 +111,77 @@ def hostname(self) -> str: def service_url(self) -> str: if self._run.run_spec.configuration.type != "service": raise ValueError("The run is not a service") - return self._run.service.url + return make_proxy_url( + 
server_url=self._api_client.base_url, + proxy_url=self._run.service.url, + ) - def _attached_logs( - self, - ) -> Iterable[bytes]: + @property + def service_model(self) -> Optional["ServiceModel"]: + if self._run.run_spec.configuration.type != "service": + raise ValueError("The run is not a service") + if self._run.service.model is None: + return None + return ServiceModel( + name=self._run.service.model.name, + url=make_proxy_url( + server_url=self._api_client.base_url, + proxy_url=self._run.service.model.base_url, + ), + ) + + def _attached_logs(self, start_time: Optional[datetime] = None) -> Iterable[bytes]: q = queue.Queue() _done = object() def ws_thread(): try: logger.debug("Starting WebSocket logs for %s", self.name) - ws.run_forever() + ws.run_forever(ping_interval=60) finally: logger.debug("WebSocket logs are done for %s", self.name) q.put(_done) + url = f"ws://localhost:{self.ports[DSTACK_RUNNER_HTTP_PORT]}/logs_ws" + query_params = {} + if start_time is not None: + query_params["start_time"] = start_time.isoformat() + if query_params: + url = f"{url}?{urlencode(query_params)}" ws = WebSocketApp( - f"ws://localhost:{self.ports[10999]}/logs_ws", + url=url, on_open=lambda _: logger.debug("WebSocket logs are connected to %s", self.name), - on_close=lambda _, __, ___: logger.debug("WebSocket logs are disconnected"), + on_close=lambda _, status_code, msg: logger.debug( + "WebSocket logs are disconnected. 
status_code: %s; message: %s", + status_code, + msg, + ), on_message=lambda _, message: q.put(message), ) threading.Thread(target=ws_thread).start() - ports = self.ports hostname = "127.0.0.1" secure = False + ports = self.ports + path_prefix = "" if self._run.service is not None: - url = urlparse(self._run.service.url) + url = urlparse(self.service_url) + hostname = url.hostname + secure = url.scheme == "https" + service_port = url.port + if service_port is None: + service_port = 443 if secure else 80 ports = { **ports, - # we support only default https port - self._run.run_spec.configuration.port.container_port: url.port or 443, + get_or_error(get_or_error(self._ssh_attach).service_port): service_port, } - hostname = url.hostname - secure = url.scheme == "https" + path_prefix = url.path replace_urls = URLReplacer( ports=ports, app_specs=self._run.jobs[0].job_spec.app_specs, hostname=hostname, secure=secure, + path_prefix=path_prefix, ip_address=self.hostname, ) @@ -157,60 +199,59 @@ def logs( self, start_time: Optional[datetime] = None, diagnose: bool = False, - replica_num: int = 0, + replica_num: Optional[int] = None, job_num: int = 0, ) -> Iterable[bytes]: """ - Iterate through run's log messages + Iterate through run's log messages. Args: - start_time: minimal log timestamp - diagnose: return runner logs if `True` + start_time: Minimal log timestamp. + diagnose: Return runner logs if `True`. Yields: - log messages + Log messages. 
""" if diagnose is False and self._ssh_attach is not None: - yield from self._attached_logs() + yield from self._attached_logs(start_time=start_time) else: - job = None - for j in self._run.jobs: - if j.job_spec.replica_num == replica_num and j.job_spec.job_num == job_num: - job = j + job = self._find_job(replica_num=replica_num, job_num=job_num) if job is None: return [] - next_start_time = start_time + next_token = None while True: resp = self._api_client.logs.poll( project_name=self._project, body=PollLogsRequest( run_name=self.name, job_submission_id=job.job_submissions[-1].id, - start_time=next_start_time, + start_time=start_time, end_time=None, descending=False, + limit=1000, diagnose=diagnose, + next_token=next_token, ), ) - if len(resp.logs) == 0: - return [] for log in resp.logs: yield base64.b64decode(log.message) - next_start_time = resp.logs[-1].timestamp + next_token = resp.next_token + if next_token is None: + break def refresh(self): """ - Get up-to-date run info + Get up-to-date run info. """ self._run = self._api_client.runs.get(self._project, self._run.run_spec.run_name) logger.debug("Refreshed run %s: %s", self.name, self.status) def stop(self, abort: bool = False): """ - Terminate the instance and detach + Terminate the instance and detach. Args: - abort: gracefully stop the run if `False` + abort: Gracefully stop the run if `False`. """ self._api_client.runs.stop(self._project, [self.name], abort) logger.debug("%s run %s", "Aborted" if abort else "Stopped", self.name) @@ -219,21 +260,54 @@ def stop(self, abort: bool = False): def attach( self, ssh_identity_file: Optional[PathLike] = None, + bind_address: Optional[str] = None, + ports_overrides: Optional[List[PortMapping]] = None, + replica_num: Optional[int] = None, + job_num: int = 0, ) -> bool: """ Establish an SSH tunnel to the instance and update SSH config Args: - ssh_identity_file: SSH keypair to access instances + ssh_identity_file: SSH keypair to access instances. 
+ replica_num: replica_num or None to attach to any running replica. Raises: dstack.api.PortUsedError: If ports are in use or the run is attached by another process. """ - ssh_identity_file = ssh_identity_file or self._ssh_identity_file - if ssh_identity_file is None: - raise ConfigurationError("SSH identity file is required to attach to the run") + if not ssh_identity_file: + config_manager = ConfigManager() + key_manager = UserSSHKeyManager(self._api_client, config_manager.dstack_ssh_dir) + user_key = key_manager.get_user_key() + if user_key.public_key == self._run.run_spec.ssh_key_pub: + ssh_identity_file = user_key.private_key_path + else: + if config_manager.dstack_key_path.exists(): + logger.debug(f"Using legacy [code]{config_manager.dstack_key_path}[/code].") + ssh_identity_file = config_manager.dstack_key_path + else: + raise ConfigurationError( + f"User SSH key doesn't match; default SSH key ({config_manager.dstack_key_path}) doesn't exist" + ) ssh_identity_file = str(ssh_identity_file) + job = self._find_job(replica_num=replica_num, job_num=job_num) + if job is None: + replica_repr = replica_num if replica_num is not None else "" + raise ClientError(f"Failed to find replica={replica_repr} job={job_num}") + replica_num = job.job_spec.replica_num + + name = self.name + if replica_num != 0 or job_num != 0: + name = job.job_spec.job_name + + if self._ssh_attach is not None and name != self._ssh_attach.run_name: + # This is only a limitation when using the same Run instance via Python API. + # The CLI can attach to different jobs simultaneously. + raise ClientError("Cannot attach to different job with active attach. 
Detach first.") + + # TODO: Check there are no two attaches to the same run with different params + if self._ssh_attach is None: while self.status in ( RunStatus.SUBMITTED, @@ -246,46 +320,89 @@ def attach( if self.status.is_finished() and self.status != RunStatus.DONE: return False - job = self._run.jobs[0] # TODO(egor-s): pull logs from all replicas? - provisioning_data = job.job_submissions[-1].job_provisioning_data + # Reload job + job = get_or_error(self._find_job(replica_num=replica_num, job_num=job_num)) + latest_job_submission = job.job_submissions[-1] + provisioning_data = latest_job_submission.job_provisioning_data + if provisioning_data is None: + raise ClientError("Failed to attach. The run is not provisioned yet.") - control_sock_path_and_port_locks = SSHAttach.reuse_control_sock_path_and_port_locks( - run_name=self.name - ) + ports_lock = SSHAttach.reuse_ports_lock(run_name=name) - if control_sock_path_and_port_locks is None: + if ports_lock is None: if self._ports_lock is None: - self._ports_lock = _reserve_ports(job.job_spec) + self._ports_lock = _reserve_ports(job.job_spec, ports_overrides) logger.debug( "Attaching to %s (%s: %s)", - self.name, + name, provisioning_data.hostname, self._ports_lock.dict(), ) else: - self._ports_lock = control_sock_path_and_port_locks[1] + self._ports_lock = ports_lock logger.debug( "Reusing the existing tunnel to %s (%s: %s)", - self.name, + name, provisioning_data.hostname, self._ports_lock.dict(), ) - self._ssh_attach = SSHAttach( - hostname=self.hostname, - ssh_port=provisioning_data.ssh_port, - user=provisioning_data.username, - id_rsa_path=ssh_identity_file, - ports_lock=self._ports_lock, - run_name=self.name, - dockerized=provisioning_data.dockerized, - ssh_proxy=provisioning_data.ssh_proxy, - control_sock_path=control_sock_path_and_port_locks[0] - if control_sock_path_and_port_locks - else None, - local_backend=provisioning_data.backend == BackendType.LOCAL, - ) - if not control_sock_path_and_port_locks: + 
service_port = None + if isinstance(self._run.run_spec.configuration, ServiceConfiguration): + service_port = get_service_port(job.job_spec, self._run.run_spec.configuration) + + ssh_attach: BaseSSHAttach + + if (jci := job.job_connection_info) is not None and jci.sshproxy_hostname is not None: + assert jci.sshproxy_upstream_id is not None + ssh_attach = SSHProxyAttach( + hostname=jci.sshproxy_hostname, + port=jci.sshproxy_port, + upstream_id=jci.sshproxy_upstream_id, + identity_path=ssh_identity_file, + ports_lock=self._ports_lock, + run_name=name, + service_port=service_port, + bind_address=bind_address, + ) + else: + hostname = provisioning_data.hostname + assert hostname is not None + ssh_port = provisioning_data.ssh_port + assert ssh_port is not None + + runtime_data = latest_job_submission.job_runtime_data + + container_ssh_port = DSTACK_RUNNER_SSH_PORT + if runtime_data is not None and runtime_data.ports is not None: + container_ssh_port = runtime_data.ports.get( + container_ssh_port, container_ssh_port + ) + + if runtime_data is not None and runtime_data.username is not None: + container_user = runtime_data.username + elif job.job_spec.user is not None and job.job_spec.user.username is not None: + container_user = job.job_spec.user.username + else: + container_user = "root" + + ssh_attach = SSHAttach( + hostname=hostname, + ssh_port=ssh_port, + container_ssh_port=container_ssh_port, + user=provisioning_data.username, + container_user=container_user, + identity_path=ssh_identity_file, + ports_lock=self._ports_lock, + run_name=name, + dockerized=provisioning_data.dockerized, + ssh_proxy=provisioning_data.ssh_proxy, + service_port=service_port, + bind_address=bind_address, + ) + + self._ssh_attach = ssh_attach + if not ports_lock: self._ssh_attach.attach() self._ports_lock = None @@ -296,10 +413,21 @@ def detach(self): Stop the SSH tunnel to the instance and update SSH config """ if self._ssh_attach is not None: - logger.debug("Detaching from %s", self.name) + 
logger.debug("Detaching from %s", self._ssh_attach.run_name) self._ssh_attach.detach() self._ssh_attach = None + def _find_job(self, replica_num: Optional[int], job_num: int) -> Optional[Job]: + for j in self._run.jobs: + if ( + replica_num is not None + and j.job_spec.replica_num == replica_num + or replica_num is None + and j.job_submissions[-1].status == JobStatus.RUNNING + ) and j.job_spec.job_num == job_num: + return j + return None + def __str__(self) -> str: return f"" @@ -307,9 +435,26 @@ def __repr__(self) -> str: return f"" +class ServiceModel: + def __init__(self, name: str, url: str) -> None: + self._name = name + self._url = url + + @property + def name(self) -> str: + return self._name + + @property + def url(self) -> str: + return self._url + + def __repr__(self) -> str: + return f"" + + class RunCollection: """ - Operations with runs + Operations with runs. """ def __init__( @@ -322,205 +467,201 @@ def __init__( self._project = project self._client = client - def submit( + def get_run_plan( self, configuration: AnyRunConfiguration, - configuration_path: Optional[str] = None, repo: Optional[Repo] = None, - backends: Optional[List[BackendType]] = None, - regions: Optional[List[str]] = None, - instance_types: Optional[List[str]] = None, - resources: Optional[ResourcesSpec] = None, - spot_policy: Optional[SpotPolicy] = None, - retry_policy: Optional[ProfileRetryPolicy] = None, - max_duration: Optional[Union[int, str]] = None, - max_price: Optional[float] = None, - working_dir: Optional[str] = None, - run_name: Optional[str] = None, - reserve_ports: bool = True, - ) -> Run: + profile: Optional[Profile] = None, + configuration_path: Optional[str] = None, + repo_dir: Union[Deprecated, str, None] = Deprecated.PLACEHOLDER, + ssh_identity_file: Optional[PathLike] = None, + ) -> RunPlan: """ - Submit a run + Get a run plan. + Use this method to see the run plan before applying the configuration. 
Args: - configuration (Union[Task, Service]): A run configuration. - configuration_path: The path to the configuration file, relative to the root directory of the repo. - repo (Union[LocalRepo, RemoteRepo, VirtualRepo]): A repo to mount to the run. - backends: A list of allowed backend for provisioning. - regions: A list of cloud regions for provisioning. - resources: The requirements to run the configuration. Overrides the configuration's resources. - spot_policy: A spot policy for provisioning. - retry_policy (RetryPolicy): A retry policy. - max_duration: The max instance running duration in seconds. - max_price: The max instance price in dollars per hour for provisioning. - working_dir: A working directory relative to the repo root directory - run_name: A desired name of the run. Must be unique in the project. If not specified, a random name is assigned. - reserve_ports: Whether local ports should be reserved in advance. + configuration (Union[Task, Service, DevEnvironment]): The run configuration. + repo (Union[RemoteRepo, VirtualRepo, None]): + The repo to use for the run. Pass `None` if repo is not needed. + profile: The profile to use for the run. + configuration_path: The path to the configuration file. Omit if the configuration + is not loaded from a file. + ssh_identity_file: Path to the private SSH key file. The corresponding public key + (`.pub` file) is read and included in the run plan, allowing SSH access to the instances. + If the `.pub` file does not exist, it is generated automatically. + If ssh_identity_file is not specified, the user key is used. Returns: - submitted run + Run plan. 
""" if repo is None: - repo = configuration.get_repo() - if repo is None: - raise ConfigurationError("Repo is required for this type of configuration") - # TODO: Add Git credentials to RemoteRepo and if they are set, pass them here to RepoCollection.init - self._client.repos.init(repo) - - run_plan = self.get_plan( - configuration=configuration, - repo=repo, - configuration_path=configuration_path, - backends=backends, - regions=regions, - instance_types=instance_types, - resources=resources, - spot_policy=spot_policy, - retry_policy=retry_policy, - max_duration=max_duration, - max_price=max_price, - working_dir=working_dir, - run_name=run_name, - ) - return self.exec_plan(run_plan, repo, reserve_ports=reserve_ports) - - def get_offers(self, profile: Profile, requirements: Requirements) -> PoolInstanceOffers: - return self._api_client.runs.get_offers(self._project, profile, requirements) - - def create_instance(self, profile: Profile, requirements: Requirements) -> Instance: - return self._api_client.runs.create_instance(self._project, profile, requirements) - - def get_plan( - self, - configuration: AnyRunConfiguration, - repo: Repo, - configuration_path: Optional[str] = None, - backends: Optional[List[BackendType]] = None, - regions: Optional[List[str]] = None, - instance_types: Optional[List[str]] = None, - resources: Optional[ResourcesSpec] = None, - spot_policy: Optional[SpotPolicy] = None, - retry_policy: Optional[ProfileRetryPolicy] = None, - max_duration: Optional[Union[int, str]] = None, - max_price: Optional[float] = None, - working_dir: Optional[str] = None, - run_name: Optional[str] = None, - pool_name: Optional[str] = None, - instance_name: Optional[str] = None, - creation_policy: Optional[CreationPolicy] = None, - termination_policy: Optional[TerminationPolicy] = None, - termination_policy_idle: int = DEFAULT_RUN_TERMINATION_IDLE_TIME, - ) -> RunPlan: - # """ - # Get run plan. 
Same arguments as `submit` - # - # Returns: - # run plan - # """ - if working_dir is None: - working_dir = "." - elif repo.repo_dir is not None: - working_dir_path = Path(repo.repo_dir) / working_dir - if not path_in_dir(working_dir_path, repo.repo_dir): - raise ConfigurationError("Working directory is outside of the repo") - working_dir = working_dir_path.relative_to(repo.repo_dir).as_posix() - - if configuration_path is None: - configuration_path = "(python)" - - if resources is not None: - configuration = configuration.copy(deep=True) - configuration.resources = resources - - profile = Profile( - name="(python)", - backends=backends, - regions=regions, - instance_types=instance_types, - spot_policy=spot_policy, - retry_policy=retry_policy, - max_duration=max_duration, - max_price=max_price, - pool_name=pool_name, - instance_name=instance_name, - creation_policy=creation_policy, - termination_policy=termination_policy, - termination_idle_time=termination_policy_idle, - ) + repo = VirtualRepo() + repo_code_hash: Optional[str] = None + if repo.has_code_to_write(): + with _prepare_code_file(repo) as (_, repo_code_hash): + pass + + if repo_dir is not Deprecated.PLACEHOLDER: + logger.warning( + "The repo_dir argument is deprecated, ignored, and will be removed soon." + " Remove it and use the repos[].path configuration property instead." 
+ ) + if configuration.repos: + repo_dir = configuration.repos[0].path + else: + repo_dir = None + + self._validate_configuration_files(configuration, configuration_path) + file_archives: list[FileArchiveMapping] = [] + for file_mapping in configuration.files: + with tempfile.TemporaryFile("w+b") as fp: + try: + archive_hash = create_file_archive(file_mapping.local_path, fp) + except OSError as e: + raise ClientError(f"failed to archive '{file_mapping.local_path}': {e}") from e + fp.seek(0) + archive = self._api_client.files.upload_archive(hash=archive_hash, fp=fp) + file_archives.append(FileArchiveMapping(id=archive.id, path=file_mapping.path)) + + if ssh_identity_file: + ssh_key_pub = Path(ssh_identity_file).with_suffix(".pub").read_text() + else: + ssh_key_pub = None # using the server-managed user key run_spec = RunSpec( - run_name=run_name, + run_name=configuration.name, repo_id=repo.repo_id, repo_data=repo.run_repo_data, - repo_code_hash=None, # `exec_plan` will fill it - working_dir=working_dir, + repo_code_hash=repo_code_hash, + repo_dir=repo_dir, + file_archives=file_archives, configuration_path=configuration_path, configuration=configuration, profile=profile, - ssh_key_pub=Path(self._client.ssh_identity_file + ".pub").read_text().strip(), + ssh_key_pub=ssh_key_pub, ) logger.debug("Getting run plan") - return self._api_client.runs.get_plan(self._project, run_spec) + run_plan = self._api_client.runs.get_plan(self._project, run_spec) + return run_plan - def exec_plan( + def apply_plan( self, run_plan: RunPlan, - repo: Repo, + repo: Optional[Repo] = None, reserve_ports: bool = True, ) -> Run: - # """ - # Execute run plan - # - # Args: - # run_plan: result of `get_plan` call - # repo: repo to use for the run - # reserve_ports: reserve local ports before submit - # - # Returns: - # submitted run - # """ + """ + Apply the run plan. + Use this method to apply run plans returned by `get_run_plan`. + + Args: + run_plan: The result of `get_run_plan` call. 
+ repo (Union[RemoteRepo, VirtualRepo, None]): + The repo to use for the run. Should be the same repo that is passed to `get_run_plan`. + reserve_ports: Reserve local ports before applying. Use if you'll attach to the run. + + Returns: + Submitted run. + """ ports_lock = None if reserve_ports: # TODO handle multiple jobs ports_lock = _reserve_ports(run_plan.job_plans[0].job_spec) - with tempfile.TemporaryFile("w+b") as fp: - run_plan.run_spec.repo_code_hash = repo.write_code_file(fp) - fp.seek(0) - self._api_client.repos.upload_code( - self._project, repo.repo_id, run_plan.run_spec.repo_code_hash, fp - ) - logger.debug("Submitting run spec") - run = self._api_client.runs.submit(self._project, run_plan.run_spec) + if repo is None: + repo = VirtualRepo() + if repo.has_code_to_write(): + with _prepare_code_file(repo) as (fp, repo_code_hash): + self._api_client.repos.upload_code( + project_name=self._project, + repo_id=repo.repo_id, + code_hash=repo_code_hash, + fp=fp, + ) + + run = self._api_client.runs.apply_plan(self._project, run_plan) return self._model_to_submitted_run(run, ports_lock) - def list(self, all: bool = False) -> List[Run]: + def apply_configuration( + self, + configuration: AnyRunConfiguration, + repo: Optional[Repo] = None, + profile: Optional[Profile] = None, + configuration_path: Optional[str] = None, + reserve_ports: bool = True, + ssh_identity_file: Optional[PathLike] = None, + ) -> Run: """ - List runs + Apply the run configuration. + Use this method to apply configurations without getting a run plan first. Args: - all: show all runs (active and finished) if `True` + configuration (Union[Task, Service, DevEnvironment]): The run configuration. + repo (Union[RemoteRepo, VirtualRepo, None]): + The repo to use for the run. Pass `None` if repo is not needed. + profile: The profile to use for the run. + configuration_path: The path to the configuration file. Omit if the configuration is not loaded from a file. 
+ reserve_ports: Reserve local ports before applying. Use if you'll attach to the run. + ssh_identity_file: Path to the private SSH key file. The corresponding public key + (`.pub` file) is read and included in the run plan, allowing SSH access to the instances. + If the `.pub` file does not exist, it is generated automatically. + If ssh_identity_file is not specified, the user key is used. Returns: - list of runs + Submitted run. """ - runs = self._api_client.runs.list(project_name=self._project, repo_id=None) - if not all: - active = [run for run in runs if not run.status.is_finished()] - if active: - runs = active - else: - runs = runs[:1] # the most recent finished run + run_plan = self.get_run_plan( + configuration=configuration, + repo=repo, + profile=profile, + configuration_path=configuration_path, + ssh_identity_file=ssh_identity_file, + ) + run = self.apply_plan( + run_plan=run_plan, + repo=repo, + reserve_ports=reserve_ports, + ) + return run + + def list(self, all: bool = False, limit: Optional[int] = None) -> List[Run]: + """ + List runs. + + Args: + all: Show all runs (active and finished) if `True`. + limit: Limit the number of runs to return. Must be less than 100. + + Returns: + List of runs. + """ + # Return only one page of latest runs (<=100). Returning all the pages may be costly. + # TODO: Consider introducing `since` filter with a reasonable default. + only_active = not all and limit is None + runs = self._api_client.runs.list( + project_name=self._project, + repo_id=None, + only_active=only_active, + limit=limit or 100, + job_submissions_limit=1, + ) + if only_active and len(runs) == 0: + runs = self._api_client.runs.list( + project_name=self._project, + repo_id=None, + limit=1, + ) return [self._model_to_run(run) for run in runs] def get(self, run_name: str) -> Optional[Run]: """ - Get run by run name + Get run by run name. Args: - run_name: run name + run_name: Run name. 
Returns: - The run or `None` if not found + The run or `None` if not found. """ try: run = self._api_client.runs.get(self._project, run_name) @@ -532,7 +673,6 @@ def _model_to_run(self, run: RunModel) -> Run: return Run( self._api_client, self._project, - self._client.ssh_identity_file, run, ) @@ -540,15 +680,57 @@ def _model_to_submitted_run(self, run: RunModel, ports_lock: Optional[PortsLock] return Run( self._api_client, self._project, - self._client.ssh_identity_file, run, ports_lock, ) - -def _reserve_ports(job_spec: JobSpec) -> PortsLock: - ports = {10999: 0} # Runner API - for app in job_spec.app_specs: - ports[app.port] = app.map_to_port or 0 + def _validate_configuration_files( + self, configuration: AnyRunConfiguration, configuration_path: Optional[PathLike] + ) -> None: + """ + Expands, normalizes and validates local paths specified in + the `files` configuration property. + """ + base_dir: Optional[Path] = None + if configuration_path is not None: + base_dir = Path(configuration_path).expanduser().resolve().parent + for file_mapping in configuration.files: + path = Path(file_mapping.local_path).expanduser() + if not path.is_absolute(): + if base_dir is None: + raise ConfigurationError( + f"Path '{path}' is relative but `configuration_path` is not provided" + ) + else: + path = base_dir / path + if not path.exists(): + raise ConfigurationError(f"Path '{path}' specified in `files` does not exist") + file_mapping.local_path = str(path) + + +def _reserve_ports( + job_spec: JobSpec, + ports_overrides: Optional[List[PortMapping]] = None, +) -> PortsLock: + if ports_overrides is None: + ports_overrides = [] + ports = {DSTACK_RUNNER_HTTP_PORT: 0} + if job_spec.app_specs: + for app in job_spec.app_specs: + ports[app.port] = app.map_to_port or 0 + for port_override in ports_overrides: + if port_override.container_port not in ports: + raise ClientError( + f"Cannot override port {port_override.container_port} not exposed by the run" + ) + 
ports[port_override.container_port] = port_override.local_port or 0 logger.debug("Reserving ports: %s", ports) return PortsLock(ports).acquire() + + +@contextmanager +def _prepare_code_file(repo: Repo) -> Iterator[tuple[BinaryIO, str]]: + with tempfile.TemporaryFile("w+b") as fp: + repo_code_hash = repo.write_code_file(fp) + fp.seek(0) + yield fp, repo_code_hash diff --git a/src/dstack/api/huggingface/__init__.py b/src/dstack/api/huggingface/__init__.py deleted file mode 100644 index 83e5491172..0000000000 --- a/src/dstack/api/huggingface/__init__.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Dict, Optional - -from dstack.api._public.huggingface.finetuning.sft import FineTuningTask - - -class SFTFineTuningTask(FineTuningTask): - def __init__( - self, - model_name: str, - dataset_name: str, - env: Dict[str, str], - new_model_name: Optional[str] = None, - report_to: Optional[str] = None, - per_device_train_batch_size: int = 4, - per_device_eval_batch_size: int = 4, - gradient_accumulation_steps: int = 1, - learning_rate: float = 2e-4, - max_grad_norm: float = 0.3, - weight_decay: float = 0.001, - lora_alpha: int = 16, - lora_dropout: float = 0.1, - lora_r: int = 64, - max_seq_length: Optional[int] = None, - use_4bit: bool = True, - use_nested_quant: bool = True, - bnb_4bit_compute_dtype: str = "float16", - bnb_4bit_quant_type: str = "nf4", - num_train_epochs: float = 1, - fp16: bool = False, - bf16: bool = False, - packing: bool = False, - gradient_checkpointing: bool = True, - optim: str = "paged_adamw_32bit", - lr_scheduler_type: str = "constant", - max_steps: int = -1, - warmup_ratio: float = 0.03, - group_by_length: bool = True, - save_steps: int = 0, - logging_steps: int = 25, - ): - super().__init__( - model_name, - dataset_name, - new_model_name, - env, - report_to, - per_device_train_batch_size, - per_device_eval_batch_size, - gradient_accumulation_steps, - learning_rate, - max_grad_norm, - weight_decay, - lora_alpha, - lora_dropout, - lora_r, - 
max_seq_length, - use_4bit, - use_nested_quant, - bnb_4bit_compute_dtype, - bnb_4bit_quant_type, - num_train_epochs, - fp16, - bf16, - packing, - gradient_checkpointing, - optim, - lr_scheduler_type, - max_steps, - warmup_ratio, - group_by_length, - save_steps, - logging_steps, - ) diff --git a/src/dstack/api/server/__init__.py b/src/dstack/api/server/__init__.py index 125b5923cf..82b009863a 100644 --- a/src/dstack/api/server/__init__.py +++ b/src/dstack/api/server/__init__.py @@ -1,3 +1,4 @@ +import hashlib import os import pprint import time @@ -6,12 +7,24 @@ import requests from dstack import version -from dstack._internal.core.errors import ClientError, ServerClientError +from dstack._internal.core.errors import ( + ClientError, + MethodNotAllowedError, + ServerClientError, + URLNotFoundError, +) from dstack._internal.utils.logging import get_logger +from dstack.api.server._auth import AuthAPIClient from dstack.api.server._backends import BackendsAPIClient +from dstack.api.server._events import EventsAPIClient +from dstack.api.server._exports import ExportsAPIClient +from dstack.api.server._files import FilesAPIClient +from dstack.api.server._fleets import FleetsAPIClient from dstack.api.server._gateways import GatewaysAPIClient +from dstack.api.server._gpus import GpusAPIClient +from dstack.api.server._imports import ImportsAPIClient from dstack.api.server._logs import LogsAPIClient -from dstack.api.server._pools import PoolAPIClient +from dstack.api.server._metrics import MetricsAPIClient from dstack.api.server._projects import ProjectsAPIClient from dstack.api.server._repos import ReposAPIClient from dstack.api.server._runs import RunsAPIClient @@ -19,86 +32,130 @@ from dstack.api.server._users import UsersAPIClient from dstack.api.server._volumes import VolumesAPIClient -logger = get_logger(__name__) - - _MAX_RETRIES = 3 _RETRY_INTERVAL = 1 class APIClient: """ - Low-level API client for interacting with dstack server. 
Implements all API endpoints + Low-level API client for interacting with the `dstack` server. + Supports all HTTP API endpoints. Attributes: users: operations with users projects: operations with projects backends: operations with backends + fleets: operations with fleets runs: operations with runs + gpus: operations with GPUs + metrics: operations with metrics logs: operations with logs gateways: operations with gateways - pools: operations with pools + volumes: operations with volumes + exports: operations with exports + files: operations with files """ - def __init__(self, base_url: str, token: str): + def __init__(self, base_url: str, token: Optional[str] = None): """ Args: - base_url: API endpoints prefix, e.g. `http://127.0.0.1:3000/` - token: API token + base_url: The API endpoints prefix, e.g. `http://127.0.0.1:3000/`. + token: The API token. """ self._base_url = base_url.rstrip("/") - self._token = token self._s = requests.session() - self._s.headers.update({"Authorization": f"Bearer {token}"}) + self._token = None + if token is not None: + self._token = token + self._s.headers.update({"Authorization": f"Bearer {token}"}) client_api_version = os.getenv("DSTACK_CLIENT_API_VERSION", version.__version__) if client_api_version is not None: self._s.headers.update({"X-API-VERSION": client_api_version}) + self._logger = get_logger(__name__) + + @property + def base_url(self) -> str: + return self._base_url + + @property + def auth(self) -> AuthAPIClient: + return AuthAPIClient(self._request, self._logger) @property def users(self) -> UsersAPIClient: - return UsersAPIClient(self._request) + return UsersAPIClient(self._request, self._logger) @property def projects(self) -> ProjectsAPIClient: - return ProjectsAPIClient(self._request) + return ProjectsAPIClient(self._request, self._logger) @property def backends(self) -> BackendsAPIClient: - return BackendsAPIClient(self._request) + return 
BackendsAPIClient(self._request, self._logger) + + @property + def fleets(self) -> FleetsAPIClient: + return FleetsAPIClient(self._request, self._logger) @property def repos(self) -> ReposAPIClient: - return ReposAPIClient(self._request) + return ReposAPIClient(self._request, self._logger) @property def runs(self) -> RunsAPIClient: - return RunsAPIClient(self._request) + return RunsAPIClient(self._request, self._logger) + + @property + def gpus(self) -> GpusAPIClient: + return GpusAPIClient(self._request, self._logger) + + @property + def metrics(self) -> MetricsAPIClient: + return MetricsAPIClient(self._request, self._logger) @property def logs(self) -> LogsAPIClient: - return LogsAPIClient(self._request) + return LogsAPIClient(self._request, self._logger) @property def secrets(self) -> SecretsAPIClient: - return SecretsAPIClient(self._request) + return SecretsAPIClient(self._request, self._logger) @property def gateways(self) -> GatewaysAPIClient: - return GatewaysAPIClient(self._request) + return GatewaysAPIClient(self._request, self._logger) @property - def pool(self) -> PoolAPIClient: - return PoolAPIClient(self._request) + def volumes(self) -> VolumesAPIClient: + return VolumesAPIClient(self._request, self._logger) @property - def volumes(self) -> VolumesAPIClient: - return VolumesAPIClient(self._request) + def exports(self) -> ExportsAPIClient: + return ExportsAPIClient(self._request, self._logger) + + @property + def imports(self) -> ImportsAPIClient: + return ImportsAPIClient(self._request, self._logger) + + @property + def files(self) -> FilesAPIClient: + return FilesAPIClient(self._request, self._logger) + + @property + def events(self) -> EventsAPIClient: + return EventsAPIClient(self._request, self._logger) + + def get_token_hash(self) -> str: + if self._token is None: + raise ValueError("Token not set") + return hashlib.sha1(self._token.encode()).hexdigest()[:8] def _request( self, path: str, body: Optional[str] = None, raise_for_status: bool = True, 
+ method: str = "POST", **kwargs, ) -> requests.Response: path = path.lstrip("/") @@ -106,21 +163,28 @@ def _request( kwargs.setdefault("headers", {})["Content-Type"] = "application/json" kwargs["data"] = body - logger.debug("POST /%s", path) + self._logger.debug("POST /%s", path) for _ in range(_MAX_RETRIES): try: # TODO: set adequate timeout here or everywhere the method is used - resp = self._s.post(f"{self._base_url}/{path}", **kwargs) + resp = self._s.request(method, f"{self._base_url}/{path}", **kwargs) break except requests.exceptions.ConnectionError as e: - logger.debug("Could not connect to server: %s", e) + self._logger.debug("Could not connect to server: %s", e) time.sleep(_RETRY_INTERVAL) else: raise ClientError(f"Failed to connect to dstack server {self._base_url}") + if 400 <= resp.status_code < 600: + self._logger.debug( + "Error requesting %s. Status: %s. Headers: %s. Body: %s", + resp.request.url, + resp.status_code, + resp.headers, + resp.content, + ) + if raise_for_status: - if resp.status_code == 500: - raise ClientError("Unexpected dstack server error") if resp.status_code == 400: # raise ServerClientError detail: List[Dict] = resp.json()["detail"] if len(detail) == 1 and detail[0]["code"] in _server_client_errors: @@ -130,7 +194,20 @@ def _request( if resp.status_code == 422: formatted_error = pprint.pformat(resp.json()) raise ClientError(f"Server validation error: \n{formatted_error}") - resp.raise_for_status() + if resp.status_code == 403: + raise ClientError( + f"Access to {resp.request.url} is denied. Please check your access token" + ) + if resp.status_code == 404: + raise URLNotFoundError(f"Status code 404 when requesting {resp.request.url}") + if resp.status_code == 405: + raise MethodNotAllowedError(f"Status code 405 when requesting {resp.request.url}") + if 400 <= resp.status_code < 600: + raise ClientError( + f"Unexpected error: status code {resp.status_code}" + f" when requesting {resp.request.url}." 
+ " Check the server logs for backend issues, and the CLI logs at (~/.dstack/logs/cli/latest.log) local CLI output" + ) return resp diff --git a/src/dstack/api/server/_auth.py b/src/dstack/api/server/_auth.py new file mode 100644 index 0000000000..b944a292a2 --- /dev/null +++ b/src/dstack/api/server/_auth.py @@ -0,0 +1,30 @@ +from typing import Optional + +from pydantic import parse_obj_as + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.core.models.users import UserWithCreds +from dstack._internal.server.schemas.auth import ( + OAuthAuthorizeRequest, + OAuthAuthorizeResponse, + OAuthCallbackRequest, +) +from dstack.api.server._group import APIClientGroup + + +class AuthAPIClient(APIClientGroup): + def list_providers(self) -> list[OAuthProviderInfo]: + resp = self._request("/api/auth/list_providers") + return parse_obj_as(list[OAuthProviderInfo.__response__], resp.json()) + + def authorize(self, provider: str, local_port: Optional[int] = None) -> OAuthAuthorizeResponse: + body = OAuthAuthorizeRequest(local_port=local_port) + resp = self._request(f"/api/auth/{provider}/authorize", body=body.json()) + return parse_obj_as(OAuthAuthorizeResponse.__response__, resp.json()) + + def callback( + self, provider: str, code: str, state: str, base_url: Optional[str] = None + ) -> UserWithCreds: + body = OAuthCallbackRequest(code=code, state=state, base_url=base_url) + resp = self._request(f"/api/auth/{provider}/callback", body=body.json()) + return parse_obj_as(UserWithCreds.__response__, resp.json()) diff --git a/src/dstack/api/server/_backends.py b/src/dstack/api/server/_backends.py index 2d2136abb3..ba9db2bc00 100644 --- a/src/dstack/api/server/_backends.py +++ b/src/dstack/api/server/_backends.py @@ -2,10 +2,8 @@ from pydantic import parse_obj_as -from dstack._internal.core.models.backends import ( - AnyConfigInfoWithCreds, - AnyConfigInfoWithCredsPartial, - AnyConfigValues, +from dstack._internal.core.backends.models import ( + 
AnyBackendConfigWithCreds, ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.server.schemas.backends import DeleteBackendsRequest @@ -17,26 +15,24 @@ def list_backend_types(self) -> List[BackendType]: resp = self._request("/api/backends/list_types") return parse_obj_as(List[BackendType], resp.json()) - def config_values(self, config: AnyConfigInfoWithCredsPartial) -> AnyConfigValues: - resp = self._request("/api/backends/config_values", body=config.json()) - return parse_obj_as(AnyConfigValues, resp.json()) - def create( - self, project_name: str, config: AnyConfigInfoWithCreds - ) -> AnyConfigInfoWithCredsPartial: + self, project_name: str, config: AnyBackendConfigWithCreds + ) -> AnyBackendConfigWithCreds: resp = self._request(f"/api/project/{project_name}/backends/create", body=config.json()) - return parse_obj_as(AnyConfigInfoWithCredsPartial, resp.json()) + return parse_obj_as(AnyBackendConfigWithCreds, resp.json()) def update( - self, project_name: str, config: AnyConfigInfoWithCreds - ) -> AnyConfigInfoWithCredsPartial: + self, project_name: str, config: AnyBackendConfigWithCreds + ) -> AnyBackendConfigWithCreds: resp = self._request(f"/api/project/{project_name}/backends/update", body=config.json()) - return parse_obj_as(AnyConfigInfoWithCredsPartial, resp.json()) + return parse_obj_as(AnyBackendConfigWithCreds, resp.json()) def delete(self, project_name: str, backends_names: List[BackendType]): body = DeleteBackendsRequest(backends_names=backends_names) self._request(f"/api/project/{project_name}/backends/delete", body=body.json()) - def config_info(self, project_name: str, backend_name: BackendType) -> AnyConfigInfoWithCreds: + def config_info( + self, project_name: str, backend_name: BackendType + ) -> AnyBackendConfigWithCreds: resp = self._request(f"/api/project/{project_name}/backends/{backend_name}/config_info") - return parse_obj_as(AnyConfigInfoWithCreds, resp.json()) + return 
parse_obj_as(AnyBackendConfigWithCreds, resp.json()) diff --git a/src/dstack/api/server/_events.py b/src/dstack/api/server/_events.py new file mode 100644 index 0000000000..22cd8893cd --- /dev/null +++ b/src/dstack/api/server/_events.py @@ -0,0 +1,64 @@ +from datetime import datetime, timezone +from typing import Optional +from uuid import UUID + +from pydantic import parse_obj_as + +from dstack._internal.core.compatibility.events import get_list_events_excludes +from dstack._internal.core.models.events import Event, EventTargetType +from dstack._internal.server.schemas.events import LIST_EVENTS_DEFAULT_LIMIT, ListEventsRequest +from dstack.api.server._group import APIClientGroup + + +class EventsAPIClient(APIClientGroup): + def list( + self, + target_projects: Optional[list[UUID]] = None, + target_users: Optional[list[UUID]] = None, + target_fleets: Optional[list[UUID]] = None, + target_instances: Optional[list[UUID]] = None, + target_runs: Optional[list[UUID]] = None, + target_jobs: Optional[list[UUID]] = None, + within_projects: Optional[list[UUID]] = None, + within_fleets: Optional[list[UUID]] = None, + within_runs: Optional[list[UUID]] = None, + include_target_types: Optional[list[EventTargetType]] = None, + actors: Optional[list[Optional[UUID]]] = None, + prev_recorded_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: int = LIST_EVENTS_DEFAULT_LIMIT, + ascending: bool = False, + *, + # NOTE: New parameters go here. Avoid positional parameters, they can break compatibility. 
+ target_volumes: Optional[list[UUID]] = None, + target_gateways: Optional[list[UUID]] = None, + target_secrets: Optional[list[UUID]] = None, + ) -> list[Event]: + if prev_recorded_at is not None: + # Time zones other than UTC are misinterpreted by the server: + # https://github.com/dstackai/dstack/issues/3354 + prev_recorded_at = prev_recorded_at.astimezone(timezone.utc) + req = ListEventsRequest( + target_projects=target_projects, + target_users=target_users, + target_fleets=target_fleets, + target_instances=target_instances, + target_runs=target_runs, + target_jobs=target_jobs, + target_volumes=target_volumes, + target_gateways=target_gateways, + target_secrets=target_secrets, + within_projects=within_projects, + within_fleets=within_fleets, + within_runs=within_runs, + include_target_types=include_target_types, + actors=actors, + prev_recorded_at=prev_recorded_at, + prev_id=prev_id, + limit=limit, + ascending=ascending, + ) + resp = self._request( + "/api/events/list", body=req.json(exclude=get_list_events_excludes(req)) + ) + return parse_obj_as(list[Event.__response__], resp.json()) diff --git a/src/dstack/api/server/_exports.py b/src/dstack/api/server/_exports.py new file mode 100644 index 0000000000..f23016011d --- /dev/null +++ b/src/dstack/api/server/_exports.py @@ -0,0 +1,79 @@ +from typing import List + +from pydantic import parse_obj_as + +from dstack._internal.core.compatibility.exports import ( + get_create_export_excludes, + get_update_export_excludes, +) +from dstack._internal.core.models.exports import Export +from dstack._internal.server.schemas.exports import ( + CreateExportRequest, + DeleteExportRequest, + UpdateExportRequest, +) +from dstack.api.server._group import APIClientGroup + + +class ExportsAPIClient(APIClientGroup): + def list(self, project_name: str) -> List[Export]: + resp = self._request(f"/api/project/{project_name}/exports/list") + return parse_obj_as(List[Export.__response__], resp.json()) + + def 
create( + self, + project_name: str, + name: str, + *, + is_global: bool = False, + importer_projects: List[str] = [], + exported_fleets: List[str] = [], + exported_gateways: List[str] = [], + ) -> Export: + body = CreateExportRequest( + name=name, + is_global=is_global, + importer_projects=importer_projects, + exported_fleets=exported_fleets, + exported_gateways=exported_gateways, + ) + resp = self._request( + f"/api/project/{project_name}/exports/create", + body=body.json(exclude=get_create_export_excludes(body)), + ) + return parse_obj_as(Export.__response__, resp.json()) + + def update( + self, + project_name: str, + name: str, + *, + set_global: bool = False, + unset_global: bool = False, + add_importer_projects: List[str] = [], + remove_importer_projects: List[str] = [], + add_exported_fleets: List[str] = [], + remove_exported_fleets: List[str] = [], + add_exported_gateways: List[str] = [], + remove_exported_gateways: List[str] = [], + ) -> Export: + body = UpdateExportRequest( + name=name, + set_global=set_global, + unset_global=unset_global, + add_importer_projects=add_importer_projects, + remove_importer_projects=remove_importer_projects, + add_exported_fleets=add_exported_fleets, + remove_exported_fleets=remove_exported_fleets, + add_exported_gateways=add_exported_gateways, + remove_exported_gateways=remove_exported_gateways, + ) + resp = self._request( + f"/api/project/{project_name}/exports/update", + body=body.json(exclude=get_update_export_excludes(body)), + ) + return parse_obj_as(Export.__response__, resp.json()) + + def delete(self, project_name: str, name: str) -> None: + body = DeleteExportRequest(name=name) + self._request(f"/api/project/{project_name}/exports/delete", body=body.json()) diff --git a/src/dstack/api/server/_files.py b/src/dstack/api/server/_files.py new file mode 100644 index 0000000000..e7bdde91a3 --- /dev/null +++ b/src/dstack/api/server/_files.py @@ -0,0 +1,18 @@ +from typing import BinaryIO + +from pydantic import parse_obj_as 
+ +from dstack._internal.core.models.files import FileArchive +from dstack._internal.server.schemas.files import GetFileArchiveByHashRequest +from dstack.api.server._group import APIClientGroup + + +class FilesAPIClient(APIClientGroup): + def get_archive_by_hash(self, hash: str) -> FileArchive: + body = GetFileArchiveByHashRequest(hash=hash) + resp = self._request("/api/files/get_archive_by_hash", body=body.json()) + return parse_obj_as(FileArchive.__response__, resp.json()) + + def upload_archive(self, hash: str, fp: BinaryIO) -> FileArchive: + resp = self._request("/api/files/upload_archive", files={"file": (hash, fp)}) + return parse_obj_as(FileArchive.__response__, resp.json()) diff --git a/src/dstack/api/server/_fleets.py b/src/dstack/api/server/_fleets.py new file mode 100644 index 0000000000..93f27e6728 --- /dev/null +++ b/src/dstack/api/server/_fleets.py @@ -0,0 +1,94 @@ +import copy +from typing import List, Optional, Union +from uuid import UUID + +from pydantic import parse_obj_as + +from dstack._internal.core.compatibility.fleets import ( + get_apply_plan_excludes, + get_create_fleet_excludes, + get_get_plan_excludes, + patch_fleet_spec, +) +from dstack._internal.core.models.fleets import ApplyFleetPlanInput, Fleet, FleetPlan, FleetSpec +from dstack._internal.server.schemas.fleets import ( + ApplyFleetPlanRequest, + CreateFleetRequest, + DeleteFleetInstancesRequest, + DeleteFleetsRequest, + GetFleetPlanRequest, + GetFleetRequest, + ListProjectFleetsRequest, +) +from dstack.api.server._group import APIClientGroup + + +class FleetsAPIClient(APIClientGroup): + def list(self, project_name: str, *, include_imported: bool = False) -> List[Fleet]: + body = ListProjectFleetsRequest(include_imported=include_imported) + resp = self._request(f"/api/project/{project_name}/fleets/list", body=body.json()) + return parse_obj_as(List[Fleet.__response__], resp.json()) + + def get( + self, project_name: str, name: Optional[str] = None, fleet_id: Optional[UUID] = None + ) 
-> Fleet: + if name is None and fleet_id is None: + raise ValueError("Either name or fleet_id must be provided") + if name is not None and fleet_id is not None: + raise ValueError("Cannot specify both name and fleet_id") + body = GetFleetRequest(name=name, id=fleet_id) + resp = self._request( + f"/api/project/{project_name}/fleets/get", + body=body.json(), + ) + return parse_obj_as(Fleet.__response__, resp.json()) + + def get_plan( + self, + project_name: str, + spec: FleetSpec, + ) -> FleetPlan: + body = GetFleetPlanRequest(spec=spec) + body = copy.deepcopy(body) + patch_fleet_spec(body.spec) + body_json = body.json(exclude=get_get_plan_excludes(spec)) + resp = self._request(f"/api/project/{project_name}/fleets/get_plan", body=body_json) + return parse_obj_as(FleetPlan.__response__, resp.json()) + + def apply_plan( + self, + project_name: str, + plan: Union[FleetPlan, ApplyFleetPlanInput], + force: bool = False, + ) -> Fleet: + plan_input = ApplyFleetPlanInput.__response__.parse_obj(plan) + body = ApplyFleetPlanRequest(plan=plan_input, force=force) + body = copy.deepcopy(body) + patch_fleet_spec(body.plan.spec) + if body.plan.current_resource is not None: + patch_fleet_spec(body.plan.current_resource.spec) + body_json = body.json(exclude=get_apply_plan_excludes(plan_input)) + resp = self._request(f"/api/project/{project_name}/fleets/apply", body=body_json) + return parse_obj_as(Fleet.__response__, resp.json()) + + def delete(self, project_name: str, names: List[str]) -> None: + body = DeleteFleetsRequest(names=names) + self._request(f"/api/project/{project_name}/fleets/delete", body=body.json()) + + def delete_instances(self, project_name: str, name: str, instance_nums: List[int]) -> None: + body = DeleteFleetInstancesRequest(name=name, instance_nums=instance_nums) + self._request(f"/api/project/{project_name}/fleets/delete_instances", body=body.json()) + + # Deprecated + # TODO: Remove in 0.21 + def create( + self, + project_name: str, + spec: FleetSpec, + ) -> 
Fleet: + body = CreateFleetRequest(spec=spec) + body = copy.deepcopy(body) + patch_fleet_spec(body.spec) + body_json = body.json(exclude=get_create_fleet_excludes(spec)) + resp = self._request(f"/api/project/{project_name}/fleets/create", body=body_json) + return parse_obj_as(Fleet.__response__, resp.json()) diff --git a/src/dstack/api/server/_gateways.py b/src/dstack/api/server/_gateways.py index f2800c2fe6..6a325bdbde 100644 --- a/src/dstack/api/server/_gateways.py +++ b/src/dstack/api/server/_gateways.py @@ -2,12 +2,16 @@ from pydantic import parse_obj_as -from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.compatibility.gateways import ( + get_create_gateway_excludes, + get_set_default_gateway_excludes, +) from dstack._internal.core.models.gateways import Gateway, GatewayConfiguration from dstack._internal.server.schemas.gateways import ( CreateGatewayRequest, DeleteGatewaysRequest, GetGatewayRequest, + ListGatewaysRequest, SetDefaultGatewayRequest, SetWildcardDomainRequest, ) @@ -15,8 +19,11 @@ class GatewaysAPIClient(APIClientGroup): - def list(self, project_name: str) -> List[Gateway]: - resp = self._request(f"/api/project/{project_name}/gateways/list") + def list(self, project_name: str, *, include_imported: bool = False) -> List[Gateway]: + body = ListGatewaysRequest( + include_imported=include_imported, + ) + resp = self._request(f"/api/project/{project_name}/gateways/list", body=body.json()) return parse_obj_as(List[Gateway.__response__], resp.json()) def get(self, project_name: str, gateway_name: str) -> Gateway: @@ -24,32 +31,32 @@ def get(self, project_name: str, gateway_name: str) -> Gateway: resp = self._request(f"/api/project/{project_name}/gateways/get", body=body.json()) return parse_obj_as(Gateway.__response__, resp.json()) - # gateway_name, backend_type, region are left for backward-compatibility with 0.18.x - # TODO: Remove in 0.19 def create( self, project_name: str, - gateway_name: Optional[str] = 
None, - backend_type: Optional[BackendType] = None, - region: Optional[str] = None, - configuration: Optional[GatewayConfiguration] = None, + configuration: GatewayConfiguration, ) -> Gateway: - body = CreateGatewayRequest( - name=gateway_name, - backend_type=backend_type, - region=region, - configuration=configuration, + body = CreateGatewayRequest(configuration=configuration) + resp = self._request( + f"/api/project/{project_name}/gateways/create", + body=body.json(exclude=get_create_gateway_excludes(configuration)), ) - resp = self._request(f"/api/project/{project_name}/gateways/create", body=body.json()) return parse_obj_as(Gateway.__response__, resp.json()) def delete(self, project_name: str, gateways_names: List[str]) -> None: body = DeleteGatewaysRequest(names=gateways_names) self._request(f"/api/project/{project_name}/gateways/delete", body=body.json()) - def set_default(self, project_name: str, gateway_name: str) -> None: - body = SetDefaultGatewayRequest(name=gateway_name) - self._request(f"/api/project/{project_name}/gateways/set_default", body=body.json()) + def set_default( + self, project_name: str, gateway_name: str, *, gateway_project: Optional[str] = None + ) -> None: + if gateway_project == project_name: + gateway_project = None # omit for compatibility with pre-0.20.20 servers + body = SetDefaultGatewayRequest(name=gateway_name, gateway_project=gateway_project) + self._request( + f"/api/project/{project_name}/gateways/set_default", + body=body.json(exclude=get_set_default_gateway_excludes(body)), + ) def set_wildcard_domain( self, project_name: str, gateway_name: str, wildcard_domain: str diff --git a/src/dstack/api/server/_gpus.py b/src/dstack/api/server/_gpus.py new file mode 100644 index 0000000000..253410604d --- /dev/null +++ b/src/dstack/api/server/_gpus.py @@ -0,0 +1,27 @@ +from typing import List, Literal, Optional, cast + +from pydantic import parse_obj_as + +from dstack._internal.core.compatibility.gpus import get_list_gpus_excludes 
+from dstack._internal.core.models.gpus import GpuGroup +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse +from dstack.api.server._group import APIClientGroup + + +class GpusAPIClient(APIClientGroup): + def list_gpus( + self, + project_name: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None, + ) -> List[GpuGroup]: + body = ListGpusRequest( + run_spec=run_spec, + group_by=cast(Optional[List[Literal["backend", "region", "count"]]], group_by), + ) + resp = self._request( + f"/api/project/{project_name}/gpus/list", + body=body.json(exclude=get_list_gpus_excludes(body)), + ) + return parse_obj_as(ListGpusResponse, resp.json()).gpus diff --git a/src/dstack/api/server/_group.py b/src/dstack/api/server/_group.py index 41f7fe3df2..9d3ec1918a 100644 --- a/src/dstack/api/server/_group.py +++ b/src/dstack/api/server/_group.py @@ -1,3 +1,4 @@ +from logging import Logger from typing import Optional import requests @@ -6,11 +7,16 @@ class APIRequest(Protocol): def __call__( - self, path: str, body: Optional[str] = None, raise_for_status: bool = True, **kwargs - ) -> requests.Response: - pass + self, + path: str, + body: Optional[str] = None, + raise_for_status: bool = True, + method: str = "POST", + **kwargs, + ) -> requests.Response: ... 
class APIClientGroup: - def __init__(self, _request: APIRequest): + def __init__(self, _request: APIRequest, _logger: Logger): self._request = _request + self._logger = _logger diff --git a/src/dstack/api/server/_imports.py b/src/dstack/api/server/_imports.py new file mode 100644 index 0000000000..bcc1abb162 --- /dev/null +++ b/src/dstack/api/server/_imports.py @@ -0,0 +1,19 @@ +from typing import List + +from pydantic import parse_obj_as + +from dstack._internal.core.models.imports import Import +from dstack._internal.server.schemas.imports import DeleteImportRequest +from dstack.api.server._group import APIClientGroup + + +class ImportsAPIClient(APIClientGroup): + def list(self, project_name: str) -> List[Import]: + resp = self._request(f"/api/project/{project_name}/imports/list") + return parse_obj_as(List[Import.__response__], resp.json()) + + def delete(self, *, project_name: str, export_project_name: str, export_name: str) -> None: + body = DeleteImportRequest( + export_project_name=export_project_name, export_name=export_name + ) + self._request(f"/api/project/{project_name}/imports/delete", body=body.json()) diff --git a/src/dstack/api/server/_logs.py b/src/dstack/api/server/_logs.py index b82d7017d7..7cdfc246f7 100644 --- a/src/dstack/api/server/_logs.py +++ b/src/dstack/api/server/_logs.py @@ -1,5 +1,6 @@ from pydantic import parse_obj_as +from dstack._internal.core.compatibility.logs import get_poll_logs_excludes from dstack._internal.core.models.logs import JobSubmissionLogs from dstack._internal.server.schemas.logs import PollLogsRequest from dstack.api.server._group import APIClientGroup @@ -7,5 +8,8 @@ class LogsAPIClient(APIClientGroup): def poll(self, project_name: str, body: PollLogsRequest) -> JobSubmissionLogs: - resp = self._request(f"/api/project/{project_name}/logs/poll", body=body.json()) + resp = self._request( + f"/api/project/{project_name}/logs/poll", + body=body.json(exclude=get_poll_logs_excludes(body)), + ) return 
parse_obj_as(JobSubmissionLogs.__response__, resp.json()) diff --git a/src/dstack/api/server/_metrics.py b/src/dstack/api/server/_metrics.py new file mode 100644 index 0000000000..8b378c89b3 --- /dev/null +++ b/src/dstack/api/server/_metrics.py @@ -0,0 +1,23 @@ +from pydantic import parse_obj_as + +from dstack._internal.core.models.metrics import JobMetrics +from dstack.api.server._group import APIClientGroup + + +class MetricsAPIClient(APIClientGroup): + def get_job_metrics( + self, + project_name: str, + run_name: str, + replica_num: int = 0, + job_num: int = 0, + ) -> JobMetrics: + resp = self._request( + f"/api/project/{project_name}/metrics/job/{run_name}", + method="GET", + params={ + "replica_num": replica_num, + "job_num": job_num, + }, + ) + return parse_obj_as(JobMetrics.__response__, resp.json()) diff --git a/src/dstack/api/server/_pools.py b/src/dstack/api/server/_pools.py deleted file mode 100644 index 4ce7ab7dd3..0000000000 --- a/src/dstack/api/server/_pools.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import List, Optional - -from pydantic import parse_obj_as - -import dstack._internal.server.schemas.pools as schemas_pools -from dstack._internal.core.models.instances import SSHKey -from dstack._internal.core.models.pools import Instance, Pool, PoolInstances -from dstack._internal.server.schemas.runs import AddRemoteInstanceRequest -from dstack.api.server._group import APIClientGroup - - -class PoolAPIClient(APIClientGroup): - def list(self, project_name: str) -> List[Pool]: - resp = self._request(f"/api/project/{project_name}/pool/list") - return parse_obj_as(List[Pool.__response__], resp.json()) - - def delete(self, project_name: str, pool_name: str, force: bool) -> None: - body = schemas_pools.DeletePoolRequest(name=pool_name, force=force) - self._request(f"/api/project/{project_name}/pool/delete", body=body.json()) - - def create(self, project_name: str, pool_name: str) -> None: - body = schemas_pools.CreatePoolRequest(name=pool_name) - 
self._request(f"/api/project/{project_name}/pool/create", body=body.json()) - - def show(self, project_name: str, pool_name: Optional[str]) -> PoolInstances: - body = schemas_pools.ShowPoolRequest(name=pool_name) - resp = self._request(f"/api/project/{project_name}/pool/show", body=body.json()) - return parse_obj_as(PoolInstances.__response__, resp.json()) - - def remove(self, project_name: str, pool_name: str, instance_name: str, force: bool) -> None: - body = schemas_pools.RemoveInstanceRequest( - pool_name=pool_name, instance_name=instance_name, force=force - ) - self._request(f"/api/project/{project_name}/pool/remove", body=body.json()) - - def set_default(self, project_name: str, pool_name: str) -> None: - body = schemas_pools.SetDefaultPoolRequest(pool_name=pool_name) - self._request(f"/api/project/{project_name}/pool/set_default", body=body.json()) - - def add_remote( - self, - project_name: str, - pool_name: Optional[str], - instance_name: Optional[str], - instance_network: Optional[str], - region: Optional[str], - host: str, - port: int, - ssh_user: str, - ssh_keys: List[SSHKey], - ) -> Instance: - body = AddRemoteInstanceRequest( - pool_name=pool_name, - instance_name=instance_name, - instance_network=instance_network, - region=region, - host=host, - port=port, - ssh_user=ssh_user, - ssh_keys=ssh_keys, - ) - result = self._request(f"/api/project/{project_name}/pool/add_remote", body=body.json()) - return parse_obj_as(Instance.__response__, result.json()) diff --git a/src/dstack/api/server/_projects.py b/src/dstack/api/server/_projects.py index 19793a1bc0..96a1f511f7 100644 --- a/src/dstack/api/server/_projects.py +++ b/src/dstack/api/server/_projects.py @@ -1,24 +1,91 @@ -from typing import List +import json +from datetime import datetime +from typing import Any, List, Literal, Optional, Union, overload +from uuid import UUID from pydantic import parse_obj_as -from dstack._internal.core.models.projects import Project +from 
dstack._internal.core.models.projects import ( + Project, + ProjectsInfoList, + ProjectsInfoListOrProjectsList, +) +from dstack._internal.core.models.users import ProjectRole from dstack._internal.server.schemas.projects import ( + AddProjectMemberRequest, CreateProjectRequest, DeleteProjectsRequest, MemberSetting, + RemoveProjectMemberRequest, SetProjectMembersRequest, ) from dstack.api.server._group import APIClientGroup class ProjectsAPIClient(APIClientGroup): - def list(self) -> List[Project]: - resp = self._request("/api/projects/list") - return parse_obj_as(List[Project.__response__], resp.json()) + @overload + def list( + self, + include_not_joined: bool = True, + *, + return_total_count: Literal[True], + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> ProjectsInfoList: + pass + + @overload + def list( + self, + include_not_joined: bool = True, + *, + return_total_count: Union[Literal[False], None] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> List[Project]: + pass + + def list( + self, + include_not_joined: bool = True, + *, + return_total_count: Optional[bool] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> ProjectsInfoListOrProjectsList: + # Passing only non-None fields for backward compatibility with 0.20 servers. 
+ body: dict[str, Any] = { + "include_not_joined": include_not_joined, + } + if return_total_count is not None: + body["return_total_count"] = return_total_count + if name_pattern is not None: + body["name_pattern"] = name_pattern + if prev_created_at is not None: + body["prev_created_at"] = prev_created_at.isoformat() + if prev_id is not None: + body["prev_id"] = str(prev_id) + if limit is not None: + body["limit"] = limit + if ascending is not None: + body["ascending"] = ascending + resp = self._request("/api/projects/list", body=json.dumps(body)) + resp_json = resp.json() + if isinstance(resp_json, list): + return parse_obj_as(List[Project.__response__], resp_json) + return parse_obj_as(ProjectsInfoList, resp_json) - def create(self, project_name: str) -> Project: - body = CreateProjectRequest(project_name=project_name) + def create(self, project_name: str, is_public: bool = False) -> Project: + body = CreateProjectRequest(project_name=project_name, is_public=is_public) resp = self._request("/api/projects/create", body=body.json()) return parse_obj_as(Project.__response__, resp.json()) @@ -34,3 +101,24 @@ def set_members(self, project_name: str, members: List[MemberSetting]) -> Projec body = SetProjectMembersRequest(members=members) resp = self._request(f"/api/projects/{project_name}/set_members", body=body.json()) return parse_obj_as(Project.__response__, resp.json()) + + def add_member(self, project_name: str, username: str, project_role: ProjectRole) -> Project: + member_setting = MemberSetting(username=username, project_role=project_role) + body = AddProjectMemberRequest(members=[member_setting]) + resp = self._request(f"/api/projects/{project_name}/add_members", body=body.json()) + return parse_obj_as(Project.__response__, resp.json()) + + def add_members(self, project_name: str, members: List[MemberSetting]) -> Project: + body = AddProjectMemberRequest(members=members) + resp = self._request(f"/api/projects/{project_name}/add_members", body=body.json()) + 
return parse_obj_as(Project.__response__, resp.json()) + + def remove_member(self, project_name: str, username: str) -> Project: + body = RemoveProjectMemberRequest(usernames=[username]) + resp = self._request(f"/api/projects/{project_name}/remove_members", body=body.json()) + return parse_obj_as(Project.__response__, resp.json()) + + def remove_members(self, project_name: str, usernames: List[str]) -> Project: + body = RemoveProjectMemberRequest(usernames=usernames) + resp = self._request(f"/api/projects/{project_name}/remove_members", body=body.json()) + return parse_obj_as(Project.__response__, resp.json()) diff --git a/src/dstack/api/server/_repos.py b/src/dstack/api/server/_repos.py index da05e3a68b..03f9eb9eab 100644 --- a/src/dstack/api/server/_repos.py +++ b/src/dstack/api/server/_repos.py @@ -2,7 +2,12 @@ from pydantic import parse_obj_as -from dstack._internal.core.models.repos import AnyRepoInfo, RemoteRepoCreds, RepoHead +from dstack._internal.core.models.repos import ( + AnyRepoInfo, + RemoteRepoCreds, + RepoHead, + RepoHeadWithCreds, +) from dstack._internal.server.schemas.repos import ( DeleteReposRequest, GetRepoRequest, @@ -16,11 +21,23 @@ def list(self, project_name: str) -> List[RepoHead]: resp = self._request(f"/api/project/{project_name}/repos/list") return parse_obj_as(List[RepoHead.__response__], resp.json()) - def get(self, project_name: str, repo_id: str, include_creds: bool) -> RepoHead: - body = GetRepoRequest(repo_id=repo_id, include_creds=include_creds) + def get( + self, project_name: str, repo_id: str, include_creds: Optional[bool] = None + ) -> RepoHead: + if include_creds is not None: + self._logger.warning( + "`include_creds` argument is deprecated and has no effect, `get()` always returns" + " the repo without creds. 
Use `get_with_creds()` to get the repo with creds" + ) + body = GetRepoRequest(repo_id=repo_id, include_creds=False) resp = self._request(f"/api/project/{project_name}/repos/get", body=body.json()) return parse_obj_as(RepoHead.__response__, resp.json()) + def get_with_creds(self, project_name: str, repo_id: str) -> RepoHeadWithCreds: + body = GetRepoRequest(repo_id=repo_id, include_creds=True) + resp = self._request(f"/api/project/{project_name}/repos/get", body=body.json()) + return parse_obj_as(RepoHeadWithCreds.__response__, resp.json()) + def init( self, project_name: str, @@ -28,7 +45,11 @@ def init( repo_info: AnyRepoInfo, repo_creds: Optional[RemoteRepoCreds] = None, ): - body = SaveRepoCredsRequest(repo_id=repo_id, repo_info=repo_info, repo_creds=repo_creds) + body = SaveRepoCredsRequest( + repo_id=repo_id, + repo_info=repo_info, + repo_creds=repo_creds, + ) self._request(f"/api/project/{project_name}/repos/init", body=body.json()) def delete(self, project_name: str, repos_ids: List[str]): diff --git a/src/dstack/api/server/_runs.py b/src/dstack/api/server/_runs.py index e189c5164d..0543f3384a 100644 --- a/src/dstack/api/server/_runs.py +++ b/src/dstack/api/server/_runs.py @@ -1,48 +1,104 @@ -from typing import List, Optional +import copy +from datetime import datetime +from typing import List, Optional, Union +from uuid import UUID from pydantic import parse_obj_as -from dstack._internal.core.models.pools import Instance -from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.compatibility.runs import ( + get_apply_plan_excludes, + get_get_plan_excludes, + get_list_runs_excludes, + patch_run_spec, +) from dstack._internal.core.models.runs import ( - PoolInstanceOffers, - Requirements, + ApplyRunPlanInput, Run, RunPlan, RunSpec, ) from dstack._internal.server.schemas.runs import ( - CreateInstanceRequest, + ApplyRunPlanRequest, DeleteRunsRequest, - GetOffersRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, 
StopRunsRequest, - SubmitRunRequest, ) from dstack.api.server._group import APIClientGroup class RunsAPIClient(APIClientGroup): - def list(self, project_name: Optional[str], repo_id: Optional[str]) -> List[Run]: - body = ListRunsRequest(project_name=project_name, repo_id=repo_id) - resp = self._request("/api/runs/list", body=body.json()) + def list( + self, + project_name: Optional[str], + repo_id: Optional[str], + username: Optional[str] = None, + only_active: bool = False, + prev_submitted_at: Optional[datetime] = None, + prev_run_id: Optional[UUID] = None, + limit: int = 100, + ascending: bool = False, + include_jobs: bool = True, + job_submissions_limit: Optional[int] = None, + ) -> List[Run]: + body = ListRunsRequest( + project_name=project_name, + repo_id=repo_id, + username=username, + only_active=only_active, + include_jobs=include_jobs, + job_submissions_limit=job_submissions_limit, + prev_submitted_at=prev_submitted_at, + prev_run_id=prev_run_id, + limit=limit, + ascending=ascending, + ) + resp = self._request( + "/api/runs/list", body=body.json(exclude=get_list_runs_excludes(body)) + ) return parse_obj_as(List[Run.__response__], resp.json()) - def get(self, project_name: str, run_name: str) -> Run: - body = GetRunRequest(run_name=run_name) - resp = self._request(f"/api/project/{project_name}/runs/get", body=body.json()) + def get( + self, project_name: str, run_name: Optional[str] = None, run_id: Optional[UUID] = None + ) -> Run: + if run_name is None and run_id is None: + raise ValueError("Either run_name or run_id must be provided") + if run_name is not None and run_id is not None: + raise ValueError("Cannot specify both run_name and run_id") + body = GetRunRequest(run_name=run_name, id=run_id) + json_body = body.json() + resp = self._request(f"/api/project/{project_name}/runs/get", body=json_body) return parse_obj_as(Run.__response__, resp.json()) - def get_plan(self, project_name: str, run_spec: RunSpec) -> RunPlan: - body = 
GetRunPlanRequest(run_spec=run_spec) - resp = self._request(f"/api/project/{project_name}/runs/get_plan", body=body.json()) + def get_plan( + self, project_name: str, run_spec: RunSpec, max_offers: Optional[int] = None + ) -> RunPlan: + body = GetRunPlanRequest(run_spec=run_spec, max_offers=max_offers) + body = copy.deepcopy(body) + patch_run_spec(body.run_spec) + resp = self._request( + f"/api/project/{project_name}/runs/get_plan", + body=body.json(exclude=get_get_plan_excludes(body)), + ) return parse_obj_as(RunPlan.__response__, resp.json()) - def submit(self, project_name: str, run_spec: RunSpec) -> Run: - body = SubmitRunRequest(run_spec=run_spec) - resp = self._request(f"/api/project/{project_name}/runs/submit", body=body.json()) + def apply_plan( + self, + project_name: str, + plan: Union[RunPlan, ApplyRunPlanInput], + force: bool = False, + ) -> Run: + plan_input: ApplyRunPlanInput = ApplyRunPlanInput.__response__.parse_obj(plan) + body = ApplyRunPlanRequest(plan=plan_input, force=force) + body = copy.deepcopy(body) + patch_run_spec(body.plan.run_spec) + if body.plan.current_resource is not None: + patch_run_spec(body.plan.current_resource.run_spec) + resp = self._request( + f"/api/project/{project_name}/runs/apply", + body=body.json(exclude=get_apply_plan_excludes(plan_input)), + ) return parse_obj_as(Run.__response__, resp.json()) def stop(self, project_name: str, runs_names: List[str], abort: bool): @@ -52,22 +108,3 @@ def stop(self, project_name: str, runs_names: List[str], abort: bool): def delete(self, project_name: str, runs_names: List[str]): body = DeleteRunsRequest(runs_names=runs_names) self._request(f"/api/project/{project_name}/runs/delete", body=body.json()) - - # FIXME: get_offers and create_instance do not belong runs api - - def get_offers( - self, project_name: str, profile: Profile, requirements: Requirements - ) -> PoolInstanceOffers: - body = GetOffersRequest(profile=profile, requirements=requirements) - resp = 
self._request(f"/api/project/{project_name}/runs/get_offers", body=body.json()) - return parse_obj_as(PoolInstanceOffers.__response__, resp.json()) - - def create_instance( - self, - project_name: str, - profile: Profile, - requirements: Requirements, - ) -> Instance: - body = CreateInstanceRequest(profile=profile, requirements=requirements) - resp = self._request(f"/api/project/{project_name}/runs/create_instance", body=body.json()) - return parse_obj_as(Instance.__response__, resp.json()) diff --git a/src/dstack/api/server/_secrets.py b/src/dstack/api/server/_secrets.py index adba4081a9..9a2a2763f1 100644 --- a/src/dstack/api/server/_secrets.py +++ b/src/dstack/api/server/_secrets.py @@ -4,33 +4,33 @@ from dstack._internal.core.models.secrets import Secret from dstack._internal.server.schemas.secrets import ( - AddSecretRequest, + CreateOrUpdateSecretRequest, DeleteSecretsRequest, - GetSecretsRequest, - ListSecretsRequest, + GetSecretRequest, ) from dstack.api.server._group import APIClientGroup class SecretsAPIClient(APIClientGroup): - def list(self, project_name: str, repo_id: str) -> List[Secret]: - body = ListSecretsRequest(repo_id=repo_id) - resp = self._request(f"/api/project/{project_name}/secrets/list", body=body.json()) + def list(self, project_name: str) -> List[Secret]: + resp = self._request(f"/api/project/{project_name}/secrets/list") return parse_obj_as(List[Secret.__response__], resp.json()) - def get(self, project_name: str, repo_id: str, secret_name: str) -> Secret: - raise NotImplementedError() - body = GetSecretsRequest(repo_id=repo_id) + def get(self, project_name: str, name: str) -> Secret: + body = GetSecretRequest(name=name) resp = self._request(f"/api/project/{project_name}/secrets/get", body=body.json()) return parse_obj_as(Secret, resp.json()) - def add(self, project_name: str, repo_id: str, secret_name: str, secret_value: str) -> Secret: - body = AddSecretRequest( - repo_id=repo_id, secret=Secret(name=secret_name, value=secret_value) + 
def create_or_update(self, project_name: str, name: str, value: str) -> Secret: + body = CreateOrUpdateSecretRequest( + name=name, + value=value, + ) + resp = self._request( + f"/api/project/{project_name}/secrets/create_or_update", body=body.json() ) - resp = self._request(f"/api/project/{project_name}/secrets/add", body=body.json()) return parse_obj_as(Secret.__response__, resp.json()) - def delete(self, project_name: str, repo_id: str, secrets_names: List[str]): - body = DeleteSecretsRequest(repo_id=repo_id, secrets_names=secrets_names) + def delete(self, project_name: str, names: List[str]): + body = DeleteSecretsRequest(secrets_names=names) self._request(f"/api/project/{project_name}/secrets/delete", body=body.json()) diff --git a/src/dstack/api/server/_users.py b/src/dstack/api/server/_users.py index 5c2d58b0b8..885eae54a2 100644 --- a/src/dstack/api/server/_users.py +++ b/src/dstack/api/server/_users.py @@ -1,8 +1,18 @@ -from typing import List +import json +from datetime import datetime +from typing import Any, List, Optional +from uuid import UUID from pydantic import parse_obj_as +from pydantic.json import pydantic_encoder -from dstack._internal.core.models.users import GlobalRole, User, UserWithCreds +from dstack._internal.core.models.users import ( + GlobalRole, + User, + UsersInfoList, + UsersInfoListOrUsersList, + UserWithCreds, +) from dstack._internal.server.schemas.users import ( CreateUserRequest, GetUserRequest, @@ -13,13 +23,43 @@ class UsersAPIClient(APIClientGroup): - def list(self) -> List[User]: - resp = self._request("/api/users/list") - return parse_obj_as(List[User.__response__], resp.json()) + def list( + self, + return_total_count: Optional[bool] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> UsersInfoListOrUsersList: + # Passing only non-None fields for backward compatibility with 0.20 
servers. + body: dict[str, Any] = {} + if return_total_count is not None: + body["return_total_count"] = return_total_count + if name_pattern is not None: + body["name_pattern"] = name_pattern + if prev_created_at is not None: + body["prev_created_at"] = prev_created_at + if prev_id is not None: + body["prev_id"] = prev_id + if limit is not None: + body["limit"] = limit + if ascending is not None: + body["ascending"] = ascending + if body: + resp = self._request( + "/api/users/list", body=json.dumps(body, default=pydantic_encoder) + ) + else: + resp = self._request("/api/users/list") + resp_json = resp.json() + if isinstance(resp_json, list): + return parse_obj_as(List[User.__response__], resp_json) + return parse_obj_as(UsersInfoList, resp_json) - def get_my_user(self) -> User: + def get_my_user(self) -> UserWithCreds: resp = self._request("/api/users/get_my_user") - return parse_obj_as(User.__response__, resp.json()) + return parse_obj_as(UserWithCreds.__response__, resp.json()) def get_user(self, username: str) -> User: body = GetUserRequest(username=username) diff --git a/src/dstack/api/server/_volumes.py b/src/dstack/api/server/_volumes.py index 8dc31b6431..5cf56afc3d 100644 --- a/src/dstack/api/server/_volumes.py +++ b/src/dstack/api/server/_volumes.py @@ -2,7 +2,8 @@ from pydantic import parse_obj_as -from dstack._internal.core.models.volumes import Volume, VolumeConfiguration +from dstack._internal.core.compatibility.volumes import get_create_volume_excludes +from dstack._internal.core.models.volumes import AnyVolumeConfiguration, Volume from dstack._internal.server.schemas.volumes import ( CreateVolumeRequest, DeleteVolumesRequest, @@ -24,10 +25,13 @@ def get(self, project_name: str, name: str) -> Volume: def create( self, project_name: str, - configuration: VolumeConfiguration, + configuration: AnyVolumeConfiguration, ) -> Volume: body = CreateVolumeRequest(configuration=configuration) - resp = self._request(f"/api/project/{project_name}/volumes/create", 
body=body.json()) + resp = self._request( + f"/api/project/{project_name}/volumes/create", + body=body.json(exclude=get_create_volume_excludes(configuration)), + ) return parse_obj_as(Volume.__response__, resp.json()) def delete(self, project_name: str, names: List[str]) -> None: diff --git a/src/dstack/api/utils.py b/src/dstack/api/utils.py index 9471ce74b2..66bec859fb 100644 --- a/src/dstack/api/utils.py +++ b/src/dstack/api/utils.py @@ -2,6 +2,7 @@ from typing import Optional, Tuple import yaml +from pydantic import ValidationError from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.configurations import AnyRunConfiguration @@ -31,10 +32,6 @@ def load_profile(repo_dir: PathLike, profile_name: Optional[str]) -> Profile: if repo_profile is not None: return repo_profile - repo_profiles_path = Path(repo_dir) / ".dstack/profiles.yml" - if not repo_profiles_path.exists(): - repo_profiles_path = repo_profiles_path.with_suffix(".yaml") - dstack_dir = get_dstack_dir() global_profiles_path = dstack_dir / "profiles.yml" if not global_profiles_path.exists(): @@ -96,6 +93,8 @@ def _load_profile_from_path(profiles_path: Path, profile_name: Optional[str]) -> config = ProfilesConfig.parse_obj(yaml.safe_load(f)) except FileNotFoundError: return None + except ValidationError as e: + raise ConfigurationError(e) if profile_name is None: return config.default() diff --git a/src/dstack/plugins/__init__.py b/src/dstack/plugins/__init__.py new file mode 100644 index 0000000000..93970043a8 --- /dev/null +++ b/src/dstack/plugins/__init__.py @@ -0,0 +1,8 @@ +# ruff: noqa: F401 +from dstack._internal.core.models.fleets import FleetSpec +from dstack._internal.core.models.gateways import GatewaySpec +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.volumes import VolumeSpec +from dstack.plugins._base import ApplyPolicy, Plugin +from dstack.plugins._models import ApplySpec +from dstack.plugins._utils import 
get_plugin_logger diff --git a/src/dstack/plugins/_base.py b/src/dstack/plugins/_base.py new file mode 100644 index 0000000000..a30ae0c338 --- /dev/null +++ b/src/dstack/plugins/_base.py @@ -0,0 +1,72 @@ +from dstack._internal.core.models.fleets import FleetSpec +from dstack._internal.core.models.gateways import GatewaySpec +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.volumes import VolumeSpec +from dstack.plugins._models import ApplySpec + + +class ApplyPolicy: + """ + A base apply policy class to modify specs on `dstack apply`. + Subclass it and return the subclass instance in `Plugin.get_apply_policies()`. + """ + + def on_apply(self, user: str, project: str, spec: ApplySpec) -> ApplySpec: + """ + Modify `spec` before it's applied. + Raise `ValueError` for `spec` to be rejected as invalid. + + This method can be called twice: + * first when a user gets a plan + * second when a user applies a plan + + In both cases, the original spec is passed, so the method does not + need to check if it modified the spec before. + + It's safe to modify and return `spec` without copying. + """ + if isinstance(spec, RunSpec): + return self.on_run_apply(user=user, project=project, spec=spec) + if isinstance(spec, FleetSpec): + return self.on_fleet_apply(user=user, project=project, spec=spec) + if isinstance(spec, VolumeSpec): + return self.on_volume_apply(user=user, project=project, spec=spec) + if isinstance(spec, GatewaySpec): + return self.on_gateway_apply(user=user, project=project, spec=spec) + raise ValueError(f"Unknown spec type {type(spec)}") + + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + """ + Called by the default `on_apply()` implementation for runs. + """ + return spec + + def on_fleet_apply(self, user: str, project: str, spec: FleetSpec) -> FleetSpec: + """ + Called by the default `on_apply()` implementation for fleets. 
+ """ + return spec + + def on_volume_apply(self, user: str, project: str, spec: VolumeSpec) -> VolumeSpec: + """ + Called by the default `on_apply()` implementation for volumes. + """ + return spec + + def on_gateway_apply(self, user: str, project: str, spec: GatewaySpec) -> GatewaySpec: + """ + Called by the default `on_apply()` implementation for gateways. + """ + return spec + + +class Plugin: + """ + A base plugin class. + Plugins must subclass it, implement public methods, + and register the subclass as an entrypoint of the package + (https://packaging.python.org/en/latest/specifications/entry-points/). + """ + + def get_apply_policies(self) -> list[ApplyPolicy]: + return [] diff --git a/src/dstack/plugins/_models.py b/src/dstack/plugins/_models.py new file mode 100644 index 0000000000..124e0e5936 --- /dev/null +++ b/src/dstack/plugins/_models.py @@ -0,0 +1,8 @@ +from typing import TypeVar + +from dstack._internal.core.models.fleets import FleetSpec +from dstack._internal.core.models.gateways import GatewaySpec +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.volumes import VolumeSpec + +ApplySpec = TypeVar("ApplySpec", RunSpec, FleetSpec, VolumeSpec, GatewaySpec) diff --git a/src/dstack/plugins/_utils.py b/src/dstack/plugins/_utils.py new file mode 100644 index 0000000000..9de3ff2606 --- /dev/null +++ b/src/dstack/plugins/_utils.py @@ -0,0 +1,19 @@ +import logging + +from dstack._internal.utils.logging import get_logger + + +def get_plugin_logger(name: str) -> logging.Logger: + """ + Use this function to set up loggers in plugins. 
+ + Put at the top of the plugin modules: + + ``` + from dstack.plugins import get_plugin_logger + + logger = get_plugin_logger(__name__) + ``` + + """ + return get_logger(f"dstack.plugins.{name}") diff --git a/src/dstack/plugins/builtin/__init__.py b/src/dstack/plugins/builtin/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/plugins/builtin/rest_plugin/__init__.py b/src/dstack/plugins/builtin/rest_plugin/__init__.py new file mode 100644 index 0000000000..4d5e0fe14a --- /dev/null +++ b/src/dstack/plugins/builtin/rest_plugin/__init__.py @@ -0,0 +1,18 @@ +# ruff: noqa: F401 +from dstack.plugins.builtin.rest_plugin._models import ( + FleetSpecRequest, + FleetSpecResponse, + GatewaySpecRequest, + GatewaySpecResponse, + RunSpecRequest, + RunSpecResponse, + SpecApplyRequest, + SpecApplyResponse, + VolumeSpecRequest, + VolumeSpecResponse, +) +from dstack.plugins.builtin.rest_plugin._plugin import ( + PLUGIN_SERVICE_URI_ENV_VAR_NAME, + CustomApplyPolicy, + RESTPlugin, +) diff --git a/src/dstack/plugins/builtin/rest_plugin/_models.py b/src/dstack/plugins/builtin/rest_plugin/_models.py new file mode 100644 index 0000000000..ee3a042464 --- /dev/null +++ b/src/dstack/plugins/builtin/rest_plugin/_models.py @@ -0,0 +1,48 @@ +from typing import Generic, Optional, TypeVar + +from pydantic import BaseModel, Field +from typing_extensions import Annotated + +from dstack._internal.core.models.fleets import FleetSpec +from dstack._internal.core.models.gateways import GatewaySpec +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.volumes import VolumeSpec + +SpecType = TypeVar("SpecType", RunSpec, FleetSpec, VolumeSpec, GatewaySpec) + + +class SpecApplyRequest(BaseModel, Generic[SpecType]): + user: Annotated[str, Field(description="The name of the user making the apply request")] + project: Annotated[str, Field(description="The name of the project the request is for")] + spec: Annotated[SpecType, 
Field(description="The spec to be applied")] + + # Override dict() to remove __orig_class__ attribute and avoid "TypeError: Object of type _GenericAlias is not JSON serializable" + # error. This issue doesn't happen though when running the code in pytest, only when running the server. + def dict(self, *args, **kwargs): + d = super().dict(*args, **kwargs) + d.pop("__orig_class__", None) + return d + + +RunSpecRequest = SpecApplyRequest[RunSpec] +FleetSpecRequest = SpecApplyRequest[FleetSpec] +VolumeSpecRequest = SpecApplyRequest[VolumeSpec] +GatewaySpecRequest = SpecApplyRequest[GatewaySpec] + + +class SpecApplyResponse(BaseModel, Generic[SpecType]): + spec: Annotated[ + SpecType, + Field( + description="The spec to apply, original spec if error otherwise original or mutated by plugin service if approved" + ), + ] + error: Annotated[ + Optional[str], Field(description="Error message if request is rejected", min_length=1) + ] = None + + +RunSpecResponse = SpecApplyResponse[RunSpec] +FleetSpecResponse = SpecApplyResponse[FleetSpec] +VolumeSpecResponse = SpecApplyResponse[VolumeSpec] +GatewaySpecResponse = SpecApplyResponse[GatewaySpec] diff --git a/src/dstack/plugins/builtin/rest_plugin/_plugin.py b/src/dstack/plugins/builtin/rest_plugin/_plugin.py new file mode 100644 index 0000000000..210dd50e19 --- /dev/null +++ b/src/dstack/plugins/builtin/rest_plugin/_plugin.py @@ -0,0 +1,147 @@ +import json +import os +from typing import Dict, Optional, Type + +import requests +from pydantic import ValidationError + +from dstack._internal.core.compatibility.fleets import get_fleet_spec_excludes +from dstack._internal.core.compatibility.gateways import get_gateway_spec_excludes +from dstack._internal.core.compatibility.runs import get_run_spec_excludes +from dstack._internal.core.compatibility.volumes import get_volume_spec_excludes +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.fleets import FleetSpec +from 
dstack._internal.core.models.gateways import GatewaySpec +from dstack._internal.core.models.volumes import VolumeSpec +from dstack.plugins import ApplyPolicy, ApplySpec, Plugin, RunSpec, get_plugin_logger +from dstack.plugins.builtin.rest_plugin import ( + FleetSpecRequest, + FleetSpecResponse, + GatewaySpecRequest, + GatewaySpecResponse, + RunSpecRequest, + RunSpecResponse, + SpecApplyRequest, + SpecApplyResponse, + VolumeSpecRequest, + VolumeSpecResponse, +) + +logger = get_plugin_logger(__name__) + +PLUGIN_SERVICE_URI_ENV_VAR_NAME = "DSTACK_PLUGIN_SERVICE_URI" +PLUGIN_REQUEST_TIMEOUT_SEC = 8 + + +class CustomApplyPolicy(ApplyPolicy): + def __init__(self): + self._plugin_service_uri = os.getenv(PLUGIN_SERVICE_URI_ENV_VAR_NAME) + logger.info(f"Found plugin service at {self._plugin_service_uri}") + if not self._plugin_service_uri: + logger.error( + f"Cannot create policy because {PLUGIN_SERVICE_URI_ENV_VAR_NAME} is not set" + ) + raise ServerClientError(f"{PLUGIN_SERVICE_URI_ENV_VAR_NAME} is not set") + + def _check_request_rejected(self, response: SpecApplyResponse): + if response.error is not None: + logger.error(f"Plugin service rejected apply request: {response.error}") + raise ServerClientError(f"Apply request rejected: {response.error}") + + def _call_plugin_service( + self, + spec_request: SpecApplyRequest, + endpoint: str, + excludes: Optional[Dict], + ) -> ApplySpec: + response = None + try: + response = requests.post( + f"{self._plugin_service_uri}{endpoint}", + json=spec_request.dict(exclude={"spec": excludes}), + headers={"accept": "application/json", "Content-Type": "application/json"}, + timeout=PLUGIN_REQUEST_TIMEOUT_SEC, + ) + response.raise_for_status() + spec_json = json.loads(response.text) + return spec_json + except requests.exceptions.ConnectionError as e: + logger.error( + f"Could not connect to plugin service at {self._plugin_service_uri}: %s", e + ) + raise ServerClientError( + f"Could not connect to plugin service at 
{self._plugin_service_uri}" + ) + except requests.RequestException as e: + logger.error("Request to the plugin service failed: %s", e) + raise ServerClientError("Request to the plugin service failed") + + def _on_apply( + self, + request_cls: Type[SpecApplyRequest], + response_cls: Type[SpecApplyResponse], + endpoint: str, + user: str, + project: str, + spec: ApplySpec, + excludes: Optional[Dict] = None, + ) -> ApplySpec: + spec_json = None + try: + spec_request = request_cls(user=user, project=project, spec=spec) + spec_json = self._call_plugin_service(spec_request, endpoint, excludes) + response = response_cls(**spec_json) + self._check_request_rejected(response) + return response.spec + except ValidationError: + logger.error(f"Plugin service returned invalid response:\n{spec_json}") + raise ServerClientError("Plugin service returned an invalid response") + + def on_run_apply(self, user: str, project: str, spec: RunSpec) -> RunSpec: + return self._on_apply( + RunSpecRequest, + RunSpecResponse, + "/apply_policies/on_run_apply", + user, + project, + spec, + excludes=get_run_spec_excludes(spec), + ) + + def on_fleet_apply(self, user: str, project: str, spec: FleetSpec) -> FleetSpec: + return self._on_apply( + FleetSpecRequest, + FleetSpecResponse, + "/apply_policies/on_fleet_apply", + user, + project, + spec, + excludes=get_fleet_spec_excludes(spec), + ) + + def on_volume_apply(self, user: str, project: str, spec: VolumeSpec) -> VolumeSpec: + return self._on_apply( + VolumeSpecRequest, + VolumeSpecResponse, + "/apply_policies/on_volume_apply", + user, + project, + spec, + excludes=get_volume_spec_excludes(spec), + ) + + def on_gateway_apply(self, user: str, project: str, spec: GatewaySpec) -> GatewaySpec: + return self._on_apply( + GatewaySpecRequest, + GatewaySpecResponse, + "/apply_policies/on_gateway_apply", + user, + project, + spec, + excludes=get_gateway_spec_excludes(spec), + ) + + +class RESTPlugin(Plugin): + def get_apply_policies(self) -> 
list[ApplyPolicy]: + return [CustomApplyPolicy()] diff --git a/src/dstack/version.py b/src/dstack/version.py index 33441d6e4f..b912a6c2df 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -1,3 +1,10 @@ -__version__ = None +# WARNING: +# This file is overwritten when building the dstack package in CI. +# If you are making any changes, +# remember to update and test the build-artifacts.yml workflow. + +__version__ = "0.0.0" __is_release__ = False -base_image = "0.4" +docker_base_image = "0.14" +docker_base_image_ubuntu_version = "24.04" +vm_base_image = "0.14" diff --git a/src/tests/_internal/cli/commands/test_config.py b/src/tests/_internal/cli/commands/test_config.py deleted file mode 100644 index 430e51f8f5..0000000000 --- a/src/tests/_internal/cli/commands/test_config.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -from unittest.mock import patch - -import yaml -from pytest import CaptureFixture - -from dstack._internal.utils.logging import get_logger -from tests._internal.cli.common import run_dstack_cli - - -class TestConfig: - def test_configures_project(self, capsys: CaptureFixture, tmp_path: Path): - cli_config_path = tmp_path / ".dstack" / "config.yml" - logger = get_logger("dstack._internal.cli.commands.config") - with patch.object(logger, "info") as logger_info_mock: - with patch("dstack.api.server.APIClient") as APIClientMock: - api_client_mock = APIClientMock.return_value - api_client_mock.projects.get - exit_code = run_dstack_cli( - [ - "config", - "--url", - "http://127.0.0.1:31313", - "--project", - "project", - "--token", - "token", - ], - home_dir=tmp_path, - ) - APIClientMock.assert_called_once_with( - base_url="http://127.0.0.1:31313", token="token" - ) - logger_info_mock.assert_called_once_with( - f"Configuration updated at {cli_config_path}", {"show_path": False} - ) - assert exit_code == 0 - assert yaml.load(cli_config_path.read_text(), yaml.FullLoader) == { - 
"projects": [ - { - "default": True, - "name": "project", - "token": "token", - "url": "http://127.0.0.1:31313", - } - ], - "repos": [], - } diff --git a/src/tests/_internal/cli/commands/test_login.py b/src/tests/_internal/cli/commands/test_login.py new file mode 100644 index 0000000000..4e0c27ae34 --- /dev/null +++ b/src/tests/_internal/cli/commands/test_login.py @@ -0,0 +1,431 @@ +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import call, patch + +from pytest import CaptureFixture + +from tests._internal.cli.common import run_dstack_cli + + +class TestLogin: + @staticmethod + def _setup_auth_mocks(api_client_mock, login_server_mock, user_token="token"): + """Set up common authentication mocks.""" + api_client_mock.return_value.auth.list_providers.return_value = [ + SimpleNamespace(name="github", enabled=True) + ] + api_client_mock.return_value.auth.authorize.return_value = SimpleNamespace( + authorization_url="http://auth_url" + ) + user = SimpleNamespace(username="me", creds=SimpleNamespace(token=user_token)) + login_server_mock.return_value.get_logged_in_user.return_value = user + return user + + @staticmethod + def _setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path: Path, project_configs: list[SimpleNamespace] + ): + """Set up ConfigManager mock with state tracking via side effects.""" + config_manager_mock.return_value.config_filepath = tmp_path / "config.yml" + config_manager_mock.return_value.list_project_configs.return_value = project_configs + + def configure_project_side_effect(name, url, token, default): + for pc in project_configs: + if pc.name == name: + pc.url = url + pc.token = token + if default: + for p in project_configs: + p.default = False + pc.default = default or pc.default + return + + def get_project_config_side_effect(name=None): + if name is None: + for pc in project_configs: + if pc.default: + return pc + return None + for pc 
in project_configs: + if pc.name == name: + return pc + return None + + config_manager_mock.return_value.configure_project.side_effect = ( + configure_project_side_effect + ) + config_manager_mock.return_value.get_project_config.side_effect = ( + get_project_config_side_effect + ) + + def test_login_no_projects(self, capsys: CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [] + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + home_dir=tmp_path, + ) + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + "No projects configured. Create your own project via the UI or contact a project manager to add you to the project." 
+ ) + + def test_login_configures_projects(self, capsys: CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project1", 
url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + config_manager_mock.return_value.save.assert_called() + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + f"Set project1 project as default at {tmp_path / 'config.yml'}" + ) + + def test_login_configures_projects_yes_sets_first_project_default( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + self._setup_config_manager_with_state_tracking( + 
config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github", "--yes"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project1", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + f"Set project1 project as default at {tmp_path / 'config.yml'}" + ) + + def test_login_configures_projects_no_does_not_change_default( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + 
SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github", "--no"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + ] + ) + assert ( + call(name="project1", url="http://127.0.0.1:31313", token="token", default=True) + not in config_manager_mock.return_value.configure_project.mock_calls + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project2" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + ) + + def test_login_single_project_auto_default(self, capsys: CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + 
patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project1", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1 project at {tmp_path / 'config.yml'}" + f"Set project1 project as default at {tmp_path / 'config.yml'}" + ) + + def 
test_login_interactive_prompts_for_default_project( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + patch( + "dstack._internal.cli.commands.login.select_default_project" + ) as select_default_project_mock, + patch("dstack._internal.cli.commands.login.is_project_menu_supported", True), + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + select_default_project_mock.return_value = project_configs[1] + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + home_dir=tmp_path, + ) + + select_default_project_mock.assert_called_once() + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + 
name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project2" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + ) diff --git a/src/tests/_internal/cli/commands/test_metrics.py b/src/tests/_internal/cli/commands/test_metrics.py new file mode 100644 index 0000000000..5e9492a4a5 --- /dev/null +++ b/src/tests/_internal/cli/commands/test_metrics.py @@ -0,0 +1,34 @@ +import pytest + +from dstack._internal.cli.commands.metrics import _format_memory + + +@pytest.mark.parametrize( + "bytes_value,decimal_places,expected", + [ + # Test MB values with different decimal places + (512 * 1024 * 1024, 0, "512MB"), # exact MB, no decimals + (512 * 1024 * 1024, 2, "512MB"), # exact MB, with decimals + (512.5 * 1024 * 1024, 0, "512MB"), # decimal MB, no decimals + (512.5 * 1024 * 1024, 2, "512.5MB"), # decimal MB, 2 decimals + (512.5 * 1024 * 1024, 3, "512.5MB"), # decimal MB, 3 decimals + (999 * 1024 * 1024, 0, "999MB"), # just under 1GB, no decimals + (999 * 1024 * 1024, 2, "999MB"), # just under 1GB, with decimals + # Test GB values with different decimal places + (1.5 * 1024 * 1024 * 1024, 0, "2GB"), # decimal GB, no decimals + (1.5 * 1024 * 1024 * 1024, 2, "1.5GB"), # decimal GB, 2 decimals + (1.5 * 1024 * 1024 * 1024, 3, "1.5GB"), # decimal GB, 3 decimals + (2 * 1024 * 1024 * 1024, 0, "2GB"), # 
exact GB, no decimals + (2 * 1024 * 1024 * 1024, 2, "2GB"), # exact GB, with decimals + # Test edge cases + (0, 0, "0MB"), # zero bytes, no decimals + (0, 2, "0MB"), # zero bytes, with decimals + (1023 * 1024, 0, "1MB"), # just under 1MB, no decimals + (1023 * 1024, 2, "1MB"), # just under 1MB, with decimals + (1024 * 1024 * 1024 - 1, 0, "1024MB"), # just under 1GB, no decimals + (1024 * 1024 * 1024 - 1, 2, "1024MB"), # just under 1GB, with decimals + ], +) +def test_format_memory(bytes_value: int, decimal_places: int, expected: str): + result = _format_memory(bytes_value, decimal_places) + assert result == expected diff --git a/src/tests/_internal/cli/commands/test_project.py b/src/tests/_internal/cli/commands/test_project.py new file mode 100644 index 0000000000..f022b3d404 --- /dev/null +++ b/src/tests/_internal/cli/commands/test_project.py @@ -0,0 +1,41 @@ +from pathlib import Path +from unittest.mock import patch + +import yaml +from pytest import CaptureFixture + +from tests._internal.cli.common import run_dstack_cli + + +class TestProjectAdd: + def test_adds_project(self, capsys: CaptureFixture, tmp_path: Path): + cli_config_path = tmp_path / ".dstack" / "config.yml" + with patch("dstack.api.server.APIClient") as APIClientMock: + api_client_mock = APIClientMock.return_value + exit_code = run_dstack_cli( + [ + "project", + "add", + "--name", + "project", + "--url", + "http://127.0.0.1:31313", + "--token", + "token", + "-y", + ], + home_dir=tmp_path, + ) + APIClientMock.assert_called_once_with(base_url="http://127.0.0.1:31313", token="token") + api_client_mock.projects.get.assert_called_with("project") + assert exit_code == 0 + assert yaml.load(cli_config_path.read_text(), yaml.FullLoader) == { + "projects": [ + { + "default": True, + "name": "project", + "token": "token", + "url": "http://127.0.0.1:31313", + } + ], + } diff --git a/src/tests/_internal/cli/common.py 
b/src/tests/_internal/cli/common.py index 8b4a370ea6..09f4541c7e 100644 --- a/src/tests/_internal/cli/common.py +++ b/src/tests/_internal/cli/common.py @@ -7,7 +7,7 @@ def run_dstack_cli( - args: List[str], + cli_args: List[str], home_dir: Optional[Path] = None, repo_dir: Optional[Path] = None, ) -> int: @@ -18,13 +18,14 @@ def run_dstack_cli( if home_dir is not None: prev_home_dir = os.environ["HOME"] os.environ["HOME"] = str(home_dir) - with patch("sys.argv", ["dstack"] + args): + with patch("sys.argv", ["dstack"] + cli_args): try: main() except SystemExit as e: exit_code = e.code - if home_dir is not None: - os.environ["HOME"] = prev_home_dir - if repo_dir is not None: - os.chdir(cwd) + finally: + if home_dir is not None: + os.environ["HOME"] = prev_home_dir + if repo_dir is not None: + os.chdir(cwd) return exit_code diff --git a/src/tests/_internal/cli/services/configurators/test_fleet.py b/src/tests/_internal/cli/services/configurators/test_fleet.py new file mode 100644 index 0000000000..f1d0bfe222 --- /dev/null +++ b/src/tests/_internal/cli/services/configurators/test_fleet.py @@ -0,0 +1,247 @@ +import argparse +from datetime import datetime, timezone +from textwrap import dedent +from typing import List, Optional, Tuple +from unittest.mock import Mock +from uuid import uuid4 + +import pytest +from rich.console import Console + +import dstack._internal.cli.services.configurators.fleet as fleet_configurator_module +from dstack._internal.cli.services.configurators.fleet import ( + FleetConfigurator, + _render_fleet_spec_diff, +) +from dstack._internal.core.errors import ConfigurationError +from dstack._internal.core.models.common import ApplyAction +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.fleets import ( + Fleet, + FleetConfiguration, + FleetNodesSpec, + FleetPlan, + FleetSpec, + FleetStatus, + InstanceGroupPlacement, +) +from dstack._internal.core.models.profiles import Profile + + +def create_conf() -> 
FleetConfiguration: + return FleetConfiguration.parse_obj({"ssh_config": {"hosts": ["1.2.3.4"]}}) + + +def apply_args( + conf: FleetConfiguration, args: List[str] +) -> Tuple[FleetConfiguration, argparse.Namespace]: + parser = argparse.ArgumentParser() + configurator = FleetConfigurator(Mock()) + configurator.register_args(parser) + conf = conf.copy(deep=True) + configurator_args = parser.parse_args(args) + configurator.apply_args(conf, configurator_args) + return conf, configurator_args + + +def get_cloud_fleet_spec( + *, + name: str = "test-fleet", + nodes: Optional[FleetNodesSpec] = None, + placement: Optional[InstanceGroupPlacement] = None, +) -> FleetSpec: + if nodes is None: + nodes = FleetNodesSpec(min=0, target=0, max=1) + return FleetSpec( + configuration=FleetConfiguration( + name=name, + nodes=nodes, + placement=placement, + ), + configuration_path="fleet.dstack.yml", + profile=Profile(), + ) + + +def get_ssh_fleet_spec( + *, + name: str = "test-fleet", + hosts: Optional[list[str]] = None, +) -> FleetSpec: + if hosts is None: + hosts = ["10.0.0.100"] + return FleetSpec( + configuration=FleetConfiguration.parse_obj( + { + "name": name, + "ssh_config": {"hosts": hosts}, + } + ), + configuration_path="fleet.dstack.yml", + profile=Profile(), + ) + + +def create_fleet_plan( + *, + current_spec: FleetSpec, + spec: FleetSpec, + action: ApplyAction, +) -> FleetPlan: + return FleetPlan( + project_name="test-project", + user="test-user", + spec=spec, + effective_spec=spec, + current_resource=Fleet( + id=uuid4(), + name=current_spec.configuration.name or "test-fleet", + project_name="test-project", + spec=current_spec, + created_at=datetime.now(timezone.utc), + status=FleetStatus.ACTIVE, + instances=[], + ), + offers=[], + total_offers=0, + action=action, + ) + + +def get_command_args() -> argparse.Namespace: + return argparse.Namespace( + yes=False, + force=False, + detach=False, + ) + + +def patch_console_and_confirm( + monkeypatch: pytest.MonkeyPatch, +) -> 
tuple[Console, Mock]: + console = Console(record=True, force_terminal=False, color_system=None, width=120) + confirm_ask = Mock(return_value=False) + monkeypatch.setattr(fleet_configurator_module, "console", console) + monkeypatch.setattr(fleet_configurator_module, "confirm_ask", confirm_ask) + return console, confirm_ask + + +class TestFleetConfigurator: + def test_env(self): + conf = create_conf() + modified, args = apply_args(conf, ["-e", "A=1", "--env", "B=2"]) + conf.env = Env.parse_obj({"A": "1", "B": "2"}) + assert modified.dict() == conf.dict() + + def test_env_override(self): + conf = create_conf() + conf.env = Env.parse_obj({"A": "0"}) + modified, args = apply_args(conf, ["-e", "A=1", "--env", "B=2"]) + conf.env = Env.parse_obj({"A": "1", "B": "2"}) + assert modified.dict() == conf.dict() + + def test_env_value_from_environ(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("FROM_ENV", "2") + conf = create_conf() + conf.env = Env.parse_obj({"FROM_CONF": "1"}) + modified, args = apply_args(conf, ["--env", "FROM_ENV"]) + conf.env = Env.parse_obj({"FROM_CONF": "1", "FROM_ENV": "2"}) + assert modified.dict() == conf.dict() + + def test_env_value_from_environ_not_set(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("FROM_ENV", raising=False) + conf = create_conf() + with pytest.raises(ConfigurationError, match=r"FROM_ENV is not set"): + apply_args(conf, ["--env", "FROM_ENV"]) + + +class TestApplyPlanMessages: + def test_prints_in_place_update_diff(self, monkeypatch: pytest.MonkeyPatch): + console, confirm_ask = patch_console_and_confirm(monkeypatch) + current_spec = get_cloud_fleet_spec(nodes=FleetNodesSpec(min=0, target=0, max=1)) + spec = get_cloud_fleet_spec(nodes=FleetNodesSpec(min=1, target=1, max=1)) + plan = create_fleet_plan( + current_spec=current_spec, + spec=spec, + action=ApplyAction.UPDATE, + ) + + FleetConfigurator(Mock())._apply_plan(plan, get_command_args()) + + output = console.export_text() + assert "Found fleet 
test-fleet." in output + assert "Detected changes that can be updated in-place:" in output + assert "- Configuration properties:" in output + assert " - nodes" in output + confirm_ask.assert_called_once_with("Update the fleet in-place?") + + def test_prints_recreate_diff(self, monkeypatch: pytest.MonkeyPatch): + console, confirm_ask = patch_console_and_confirm(monkeypatch) + current_spec = get_cloud_fleet_spec(placement=InstanceGroupPlacement.ANY) + spec = get_cloud_fleet_spec(placement=InstanceGroupPlacement.CLUSTER) + plan = create_fleet_plan( + current_spec=current_spec, + spec=spec, + action=ApplyAction.CREATE, + ) + + FleetConfigurator(Mock())._apply_plan(plan, get_command_args()) + + output = console.export_text() + assert "Found fleet test-fleet." in output + assert "Detected changes that cannot be updated in-place:" in output + assert "- Configuration properties:" in output + assert " - placement" in output + confirm_ask.assert_called_once_with("Re-create the fleet?") + + def test_prints_no_diff_message(self, monkeypatch: pytest.MonkeyPatch): + console, confirm_ask = patch_console_and_confirm(monkeypatch) + spec = get_cloud_fleet_spec() + plan = create_fleet_plan( + current_spec=spec, + spec=spec.copy(deep=True), + action=ApplyAction.UPDATE, + ) + + FleetConfigurator(Mock())._apply_plan(plan, get_command_args()) + + output = console.export_text() + assert "Found fleet test-fleet." in output + assert "No configuration changes detected." 
in output + assert "Detected changes that" not in output + confirm_ask.assert_called_once_with("Re-create the fleet?") + + +class TestRenderFleetSpecDiff: + def test_renders_cloud_nodes_change(self): + old = get_cloud_fleet_spec(nodes=FleetNodesSpec(min=0, target=0, max=1)) + new = get_cloud_fleet_spec(nodes=FleetNodesSpec(min=1, target=1, max=1)) + + assert ( + _render_fleet_spec_diff(old, new) + == dedent( + """ + - Configuration properties: + - nodes + """ + ).lstrip() + ) + + def test_renders_ssh_hosts_change(self): + old = get_ssh_fleet_spec(hosts=["10.0.0.100"]) + new = get_ssh_fleet_spec(hosts=["10.0.0.100", "10.0.0.101"]) + + assert ( + _render_fleet_spec_diff(old, new) + == dedent( + """ + - Configuration properties: + - ssh_config + """ + ).lstrip() + ) + + def test_no_diff(self): + spec = get_cloud_fleet_spec() + + assert _render_fleet_spec_diff(spec, spec.copy(deep=True)) is None diff --git a/src/tests/_internal/cli/services/configurators/test_profile.py b/src/tests/_internal/cli/services/configurators/test_profile.py index d9a047bdf7..d3c363c0f2 100644 --- a/src/tests/_internal/cli/services/configurators/test_profile.py +++ b/src/tests/_internal/cli/services/configurators/test_profile.py @@ -5,7 +5,7 @@ apply_profile_args, register_profile_args, ) -from dstack._internal.core.models.profiles import Profile, ProfileRetryPolicy, SpotPolicy +from dstack._internal.core.models.profiles import Profile, ProfileRetry, SpotPolicy class TestProfileArgs: @@ -34,8 +34,8 @@ def test_max_duration(self): def test_backends(self): profile = Profile(name="test") - modified, _ = apply_args(profile, ["-b", "local", "--backend", "aws"]) - profile.backends = ["local", "aws"] + modified, _ = apply_args(profile, ["-b", "gcp", "--backend", "aws"]) + profile.backends = ["gcp", "aws"] assert profile.dict() == modified.dict() def test_spot_policy_spot(self): @@ -51,21 +51,21 @@ def test_spot_policy_on_demand(self): assert profile.dict() == modified.dict() def test_retry(self): - 
profile = Profile(name="test") - profile.retry_policy = ProfileRetryPolicy(retry=True) + profile = Profile(name="test", retry=None) modified, _ = apply_args(profile, ["--retry"]) + profile.retry = True assert profile.dict() == modified.dict() def test_no_retry(self): - profile = Profile(name="test", retry_policy=ProfileRetryPolicy(retry=True, duration=3600)) + profile = Profile(name="test", retry=None) modified, _ = apply_args(profile, ["--no-retry"]) - profile.retry_policy.retry = False + profile.retry = False assert profile.dict() == modified.dict() def test_retry_duration(self): profile = Profile(name="test") modified, _ = apply_args(profile, ["--retry-duration", "1h"]) - profile.retry_policy = ProfileRetryPolicy(retry=True, duration=3600) + profile.retry = ProfileRetry(on_events=None, duration="1h") assert profile.dict() == modified.dict() @@ -73,6 +73,6 @@ def apply_args(profile: Profile, args: List[str]) -> Tuple[Profile, argparse.Nam parser = argparse.ArgumentParser() register_profile_args(parser) profile = profile.copy(deep=True) # to avoid modifying the original profile - args = parser.parse_args(args) - apply_profile_args(args, profile) - return profile, args + parsed_args = parser.parse_args(args) + apply_profile_args(parsed_args, profile) + return profile, parsed_args diff --git a/src/tests/_internal/cli/services/configurators/test_run.py b/src/tests/_internal/cli/services/configurators/test_run.py index c7bbcab641..97be49405d 100644 --- a/src/tests/_internal/cli/services/configurators/test_run.py +++ b/src/tests/_internal/cli/services/configurators/test_run.py @@ -1,27 +1,56 @@ import argparse -from typing import List, Tuple +from textwrap import dedent +from typing import List, Optional, Tuple +from unittest.mock import Mock import pytest +from gpuhunt import KNOWN_TENSTORRENT_ACCELERATORS, AcceleratorVendor -from dstack._internal.cli.services.configurators.run import run_configurators_mapping +from dstack._internal.cli.services.configurators import 
get_run_configurator_class +from dstack._internal.cli.services.configurators.run import ( + BaseRunConfigurator, + render_run_spec_diff, +) from dstack._internal.core.errors import ConfigurationError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( - BaseConfiguration, + BaseRunConfiguration, + DevEnvironmentConfiguration, PortMapping, TaskConfiguration, ) +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.profiles import Profile +from dstack._internal.server.testing.common import get_run_spec + +_TENSTORRENT_ACCELERATOR_NAMES = tuple( + sorted({gpu.name for gpu in KNOWN_TENSTORRENT_ACCELERATORS}) +) -class TestRunConfigurator: +class TestApplyArgs: + def apply_args( + self, conf: BaseRunConfiguration, args: List[str] + ) -> Tuple[BaseRunConfiguration, argparse.Namespace]: + parser = argparse.ArgumentParser() + configurator_class = get_run_configurator_class(conf.type) + configurator = configurator_class(Mock()) + configurator.register_args(parser) + conf = conf.copy(deep=True) # to avoid modifying the original configuration + parsed_args = parser.parse_args(args) + configurator.apply_args(conf, parsed_args) + return conf, parsed_args + def test_env(self): conf = TaskConfiguration(commands=["whoami"]) - modified, args = apply_args(conf, ["-e", "A=1", "--env", "B=2"]) - conf.env = {"A": "1", "B": "2"} + modified, args = self.apply_args(conf, ["-e", "A=1", "--env", "B=2"]) + conf.env = Env.parse_obj({"A": "1", "B": "2"}) assert modified.dict() == conf.dict() def test_ports(self): conf = TaskConfiguration(commands=["whoami"]) - modified, args = apply_args(conf, ["-p", "80", "--port", "8080"]) + modified, args = self.apply_args(conf, ["-p", "80", "--port", "8080"]) conf.ports = [ PortMapping(local_port=80, container_port=80), PortMapping(local_port=8080, container_port=8080), @@ -31,17 +60,17 @@ 
def test_ports(self): def test_container_ports_conflict(self): conf = TaskConfiguration(commands=["whoami"]) with pytest.raises(ConfigurationError): - apply_args(conf, ["-p", "8000:80", "--port", "8001:80"]) + self.apply_args(conf, ["-p", "8000:80", "--port", "8001:80"]) def test_env_override(self): - conf = TaskConfiguration(commands=["whoami"], env={"A": "0"}) - modified, args = apply_args(conf, ["-e", "A=1", "--env", "B=2"]) - conf.env = {"A": "1", "B": "2"} + conf = TaskConfiguration(commands=["whoami"], env=Env.parse_obj({"A": "0"})) + modified, args = self.apply_args(conf, ["-e", "A=1", "--env", "B=2"]) + conf.env = Env.parse_obj({"A": "1", "B": "2"}) assert modified.dict() == conf.dict() def test_ports_override(self): conf = TaskConfiguration(commands=["whoami"], ports=["80"]) - modified, args = apply_args(conf, ["-p", "8000:80", "--port", "8001:8000"]) + modified, args = self.apply_args(conf, ["-p", "8000:80", "--port", "8001:8000"]) conf.ports = [ PortMapping(local_port=8000, container_port=80), PortMapping(local_port=8001, container_port=8000), @@ -51,22 +80,337 @@ def test_ports_override(self): def test_local_ports_conflict(self): conf = TaskConfiguration(commands=["whoami"], ports=["3000"]) with pytest.raises(ConfigurationError): - apply_args(conf, ["-p", "3000:4000"]) + self.apply_args(conf, ["-p", "3000:4000"]) def test_any_port(self): conf = TaskConfiguration(commands=["whoami"], ports=["8000"]) - modified, args = apply_args(conf, ["-p", "*:8000"]) + modified, args = self.apply_args(conf, ["-p", "*:8000"]) conf.ports = [PortMapping(local_port=None, container_port=8000)] assert modified.dict() == conf.dict() + def test_interpolates_env(self): + conf = TaskConfiguration( + image="my_image", + registry_auth=RegistryAuth( + username="${{ env.REGISTRY_USERNAME }}", + password="${{ env.REGISTRY_PASSWORD }}", + ), + env=Env.parse_obj( + { + "REGISTRY_USERNAME": "test_user", + "REGISTRY_PASSWORD": "test_password", + } + ), + ) + modified, args = 
self.apply_args(conf, []) + assert modified.registry_auth == RegistryAuth( + username="test_user", + password="test_password", + ) + + +class TestValidateGPUVendorAndImage: + def prepare_conf( + self, + *, + image: Optional[str] = None, + gpu_spec: Optional[str] = None, + docker: Optional[bool] = None, + ) -> BaseRunConfiguration: + conf_dict = { + "type": "none", + } + if image is not None: + conf_dict["image"] = image + if gpu_spec is not None: + conf_dict["resources"] = { + "gpu": gpu_spec, + } + if docker is not None: + conf_dict["docker"] = docker + return BaseRunConfiguration.parse_obj(conf_dict) + + def validate(self, conf: BaseRunConfiguration) -> None: + BaseRunConfigurator(api_client=Mock()).validate_gpu_vendor_and_image(conf) + + def test_no_gpu(self): + conf = self.prepare_conf() + self.validate(conf) + assert conf.resources.gpu is not None + # Vendor is not written to spec for compatibility with older servers. + # The server infers nvidia in set_resources_defaults(). + assert conf.resources.gpu.vendor is None + assert conf.resources.gpu.name is None + assert conf.resources.gpu.count.min == 0 + + def test_zero_gpu(self): + conf = self.prepare_conf(gpu_spec="0") + self.validate(conf) + assert conf.resources.gpu.vendor is None + + def test_gpu_no_vendor_no_image_defaults_to_nvidia(self): + """Vendor is inferred as nvidia for validation but NOT written to spec.""" + conf = self.prepare_conf(gpu_spec="1") + self.validate(conf) + assert conf.resources.gpu.vendor is None + + def test_gpu_no_vendor_with_image_no_default(self): + conf = self.prepare_conf(gpu_spec="1", image="my-custom-image") + self.validate(conf) + assert conf.resources.gpu.vendor is None + + def test_gpu_no_vendor_docker_true_no_default(self): + conf = self.prepare_conf(gpu_spec="1", docker=True) + self.validate(conf) + assert conf.resources.gpu.vendor is None + + @pytest.mark.parametrize( + ["gpu_spec", "expected_vendor"], + [ + ["nvidia", AcceleratorVendor.NVIDIA], + ["tpu", 
AcceleratorVendor.GOOGLE], + ["google", AcceleratorVendor.GOOGLE], + ], + ) + def test_non_amd_vendor_declared(self, gpu_spec, expected_vendor): + conf = self.prepare_conf(gpu_spec=gpu_spec) + self.validate(conf) + assert conf.resources.gpu.vendor == expected_vendor + + def test_amd_vendor_declared_with_image(self): + conf = self.prepare_conf(image="tgi:rocm", gpu_spec="AMD") + self.validate(conf) + assert conf.resources.gpu.vendor == AcceleratorVendor.AMD + + @pytest.mark.parametrize( + ["gpu_spec", "expected_vendor"], + [ + ["a40,l40", AcceleratorVendor.NVIDIA], # lowercase + ["V3-64", AcceleratorVendor.GOOGLE], # uppercase + ], + ) + def test_one_non_amd_vendor_inferred(self, gpu_spec, expected_vendor): + conf = self.prepare_conf(gpu_spec=gpu_spec) + self.validate(conf) + assert conf.resources.gpu.vendor == expected_vendor + + @pytest.mark.parametrize("gpu_spec", ["MI300X", "MI300x", "mi300x"]) + def test_amd_vendor_inferred_with_image(self, gpu_spec): + conf = self.prepare_conf(image="tgi:rocm", gpu_spec=gpu_spec) + self.validate(conf) + assert conf.resources.gpu.vendor == AcceleratorVendor.AMD + + @pytest.mark.parametrize("gpu_spec", ["foo", "foo,bar"]) + def test_one_unknown_vendor_inferred(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec) + self.validate(conf) + assert conf.resources.gpu.vendor is None + + @pytest.mark.parametrize( + "gpu_spec", + [ + "A1000,v4", # Nvidia and Google + "v3-64,foo", # Google and unknown + ], + ) + def test_two_non_amd_vendors_inferred(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec) + self.validate(conf) + assert conf.resources.gpu.vendor is None + + @pytest.mark.parametrize( + "gpu_spec", + [ + "A1000,mi300x", # Nvidia and AMD (lowercase) + "MI300x,v3-64", # AMD (mixedcase) and Google + "foo,MI300X", # unknown and AMD (uppercase) + ], + ) + def test_two_vendors_including_amd_inferred_with_image(self, gpu_spec): + conf = self.prepare_conf(image="tgi:rocm", gpu_spec=gpu_spec) + self.validate(conf) + 
assert conf.resources.gpu.vendor is None + + def test_amd_vendor_declared_no_image(self): + conf = self.prepare_conf(gpu_spec="AMD") + with pytest.raises( + ConfigurationError, match=r"`image` is required if `resources.gpu.vendor` is `amd`" + ): + self.validate(conf) + + @pytest.mark.parametrize("gpu_spec", ["AMD", "MI300X"]) + def test_amd_vendor_docker_true_no_image(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec, docker=True) + self.validate(conf) + assert conf.resources.gpu.vendor == AcceleratorVendor.AMD + + @pytest.mark.parametrize("gpu_spec", ["MI300X", "MI300x", "mi300x"]) + def test_amd_vendor_inferred_no_image(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec) + with pytest.raises( + ConfigurationError, match=r"`image` is required if `resources.gpu.vendor` is `amd`" + ): + self.validate(conf) + + @pytest.mark.parametrize( + "gpu_spec", + [ + "A1000,mi300x", # Nvidia and AMD (lowercase) + "MI300x,v3-64", # AMD (mixedcase) and Google + "foo,MI300X", # unknown and AMD (uppercase) + ], + ) + def test_two_vendors_including_amd_inferred_no_image(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec) + with pytest.raises( + ConfigurationError, match=r"`image` is required if `resources.gpu.vendor` is `amd`" + ): + self.validate(conf) + + @pytest.mark.parametrize("gpu_spec", _TENSTORRENT_ACCELERATOR_NAMES) + def test_tenstorrent_docker_true_no_image(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec, docker=True) + self.validate(conf) + assert conf.resources.gpu.vendor == AcceleratorVendor.TENSTORRENT + + @pytest.mark.parametrize("gpu_spec", _TENSTORRENT_ACCELERATOR_NAMES) + def test_tenstorrent_vendor_inferred_no_image(self, gpu_spec): + conf = self.prepare_conf(gpu_spec=gpu_spec) + with pytest.raises( + ConfigurationError, + match=r"`image` is required if `resources.gpu.vendor` is `tenstorrent`", + ): + self.validate(conf) + + +class TestValidateCPUArchAndImage: + def prepare_conf( + self, + *, + cpu_spec: str, + 
gpu_spec: Optional[str] = None, + image: Optional[str] = None, + ) -> BaseRunConfiguration: + conf_dict = { + "type": "none", + "resources": { + "cpu": cpu_spec, + }, + } + if image is not None: + conf_dict["image"] = image + if gpu_spec is not None: + conf_dict["resources"]["gpu"] = gpu_spec + return BaseRunConfiguration.parse_obj(conf_dict) + + def validate(self, conf: BaseRunConfiguration) -> None: + # validate_gpu_vendor_and_image sets GPU vendor if not set + BaseRunConfigurator(api_client=Mock()).validate_gpu_vendor_and_image(conf) + BaseRunConfigurator(api_client=Mock()).validate_cpu_arch_and_image(conf) + + @pytest.mark.parametrize("gpu_spec", [None, "GH200", "H100"]) + def test_explicit_arm_with_image(self, gpu_spec: Optional[str]): + conf = self.prepare_conf(cpu_spec="arm:1..", gpu_spec=gpu_spec, image="ubuntu") + self.validate(conf) + + def test_inferred_arm_with_image(self): + conf = self.prepare_conf(cpu_spec="1..", gpu_spec="GH200", image="ubuntu") + self.validate(conf) + + @pytest.mark.parametrize("cpu_spec", ["1..", "arm:1.."]) + def test_arm_no_image(self, cpu_spec: str): + conf = self.prepare_conf(cpu_spec=cpu_spec, gpu_spec="GH200") + with pytest.raises( + ConfigurationError, match=r"`image` is required if `resources.cpu.arch` is `arm`" + ): + self.validate(conf) + + @pytest.mark.parametrize("cpu_spec", ["1..", "x86:1.."]) + @pytest.mark.parametrize("image", [None, "ubuntu"]) + def test_x86(self, cpu_spec: str, image: Optional[str]): + conf = self.prepare_conf(cpu_spec=cpu_spec, gpu_spec="H100", image=image) + self.validate(conf) + + +class TestRenderRunSpecDiff: + def test_diff(self): + old = get_run_spec( + run_name="test", + repo_id="test-1", + configuration_path="1.dstack.yml", + profile=Profile( + backends=[BackendType.AWS], + regions=["us-west-1"], + name="test", + default=True, + ), + configuration=DevEnvironmentConfiguration( + name="test", + ide="vscode", + inactivity_duration=60, + ), + ) + new = get_run_spec( + run_name="test", + 
repo_id="test-2", + configuration_path="2.dstack.yml", + profile=Profile( + backends=[BackendType.AWS], + regions=["us-west-2"], + name="test", + default=True, + ), + configuration=DevEnvironmentConfiguration( + name="test", + ide="cursor", + inactivity_duration=None, + ), + ) + assert ( + render_run_spec_diff(old, new) + == dedent( + """ + - Repo ID + - Configuration path + - Configuration properties: + - ide + - inactivity_duration + - Profile properties: + - regions + """ + ).lstrip() + ) + + def test_field_type_change(self): + old = get_run_spec( + run_name="test", + repo_id="test", + profile=Profile(name="test"), + configuration=DevEnvironmentConfiguration( + name="test", + ide="vscode", + ), + ) + new = get_run_spec( + run_name="test", + repo_id="test", + profile=None, + configuration=TaskConfiguration( + name="test", + commands=["sleep infinity"], + ), + ) + assert ( + render_run_spec_diff(old, new) + == dedent( + """ + - Configuration type + - Profile + """ + ).lstrip() + ) -def apply_args( - conf: BaseConfiguration, args: List[str] -) -> Tuple[BaseConfiguration, argparse.Namespace]: - parser = argparse.ArgumentParser() - configurator = run_configurators_mapping[conf.type] - configurator.register(parser) - conf = conf.copy(deep=True) # to avoid modifying the original configuration - args, unknown = parser.parse_known_args(args) - configurator.apply(args, unknown, conf) - return conf, args + def test_no_diff(self): + old = get_run_spec(run_name="test", repo_id="test") + new = get_run_spec(run_name="test", repo_id="test") + assert render_run_spec_diff(old, new) is None diff --git a/src/tests/_internal/cli/services/test_events.py b/src/tests/_internal/cli/services/test_events.py new file mode 100644 index 0000000000..95aa4aaec5 --- /dev/null +++ b/src/tests/_internal/cli/services/test_events.py @@ -0,0 +1,349 @@ +import uuid +from dataclasses import asdict +from datetime import datetime, timedelta, timezone +from typing import Optional +from unittest.mock 
import MagicMock + +from dstack._internal.cli.services.events import EventListFilters, EventTracker +from dstack._internal.core.models.events import Event, EventTarget, EventTargetType +from dstack._internal.server.schemas.events import LIST_EVENTS_DEFAULT_LIMIT + + +class TestEventTracker: + def create_test_event( + self, + event_id: Optional[uuid.UUID] = None, + recorded_at: Optional[datetime] = None, + message: str = "Test event", + ) -> Event: + if event_id is None: + event_id = uuid.uuid4() + if recorded_at is None: + recorded_at = datetime.now(timezone.utc) + + return Event( + id=event_id, + message=message, + recorded_at=recorded_at, + actor_user_id=uuid.uuid4(), + actor_user="test_user", + targets=[ + EventTarget( + type=EventTargetType.RUN, + project_id=uuid.uuid4(), + project_name="test_project", + id=uuid.uuid4(), + name="test_run", + ) + ], + ) + + def test_poll_no_since(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=None, + event_delay_tolerance=timedelta(seconds=20), + ) + + # First poll - requests latest existing events + + event1 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 9, 0, tzinfo=timezone.utc) + ) + event2 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 10, 0, tzinfo=timezone.utc) + ) + mock_client.list.return_value = [event2, event1] # reversed due to ascending=False + + events = list(tracker.poll()) + + assert events == [event1, event2] + mock_client.list.assert_called_once_with( + ascending=False, + **asdict(filters), + ) + + # Second poll - requests events after the latest existing event + + mock_client.list.reset_mock() + mock_client.list.return_value = [] + + events = list(tracker.poll()) + + assert events == [] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=event2.recorded_at - timedelta(seconds=20), + prev_id=None, + 
limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + def test_poll_with_since(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + event_delay_tolerance=timedelta(seconds=20), + ) + + # First poll - requests events after `since` + + event1 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 9, 0, tzinfo=timezone.utc) + ) + event2 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 10, 0, tzinfo=timezone.utc) + ) + mock_client.list.return_value = [event1, event2] + + events = list(tracker.poll()) + + assert events == [event1, event2] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + # Second poll - requests events after the latest event + + mock_client.list.reset_mock() + mock_client.list.return_value = [] + + events = list(tracker.poll()) + + assert events == [] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=event2.recorded_at - timedelta(seconds=20), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + def test_poll_with_since_never_uses_prev_recorded_at_earlier_than_since(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + since = datetime(2023, 1, 1, 10, 0, tzinfo=timezone.utc) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=datetime(2023, 1, 1, 10, 0, tzinfo=timezone.utc), + event_delay_tolerance=timedelta(seconds=20), + ) + + # First poll - returns an event that is 5 seconds newer than `since` + + event1 = self.create_test_event(recorded_at=since + timedelta(seconds=5)) + mock_client.list.return_value = [event1] + + events = list(tracker.poll()) + + assert events == [event1] + 
mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=since, + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + # Second poll - prev_recorded_at should still be `since` (not event1 - 20s) + + mock_client.list.reset_mock() + mock_client.list.return_value = [] + + events = list(tracker.poll()) + + assert events == [] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=since, + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + def test_poll_no_since_always_empty_response(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=None, + event_delay_tolerance=timedelta(seconds=20), + ) + + for _ in range(2): + mock_client.list.reset_mock() + mock_client.list.return_value = [] + events = list(tracker.poll()) + assert events == [] + mock_client.list.assert_called_once_with( + ascending=False, + **asdict(filters), + ) + + def test_poll_with_since_always_empty_response(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + event_delay_tolerance=timedelta(seconds=20), + ) + + for _ in range(2): + mock_client.list.reset_mock() + mock_client.list.return_value = [] + events = list(tracker.poll()) + assert events == [] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + def test_poll_event_deduplication(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + 
event_delay_tolerance=timedelta(seconds=20), + ) + + # First poll - returns event1 and event2 + + event1 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 9, 0, tzinfo=timezone.utc) + ) + event2 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 10, 0, tzinfo=timezone.utc) + ) + mock_client.list.return_value = [event1, event2] + + events = list(tracker.poll()) + + assert events == [event1, event2] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + # Second poll - returns event2 (duplicate) and event3 (new) + + mock_client.list.reset_mock() + event3 = self.create_test_event( + recorded_at=datetime(2023, 1, 1, 11, 0, tzinfo=timezone.utc) + ) + mock_client.list.return_value = [event2, event3] + + events = list(tracker.poll()) + + assert events == [event3] # does not return duplicate event2 + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=event2.recorded_at - timedelta(seconds=20), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) + + def test_poll_respects_pagination(self): + mock_client = MagicMock() + filters = EventListFilters(target_runs=[uuid.uuid4()]) + + tracker = EventTracker( + client=mock_client, + filters=filters, + since=datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc), + event_delay_tolerance=timedelta(seconds=20), + ) + + ### + # First poll - create (1.5 * default limit) events + ### + + num_events = int(LIST_EVENTS_DEFAULT_LIMIT * 1.5) + events = [ + self.create_test_event( + recorded_at=datetime(2023, 1, 1, 9, 0, tzinfo=timezone.utc) + timedelta(seconds=i) + ) + for i in range(num_events) + ] + + # Mock pagination: first call returns first batch, second call returns remaining events + call_count = 0 + + def mock_list(**kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return 
events[:LIST_EVENTS_DEFAULT_LIMIT] # First batch + elif call_count == 2: + return events[LIST_EVENTS_DEFAULT_LIMIT:] # Remaining events + else: + return [] + + mock_client.list.side_effect = mock_list + + result_events = list(tracker.poll()) + + assert result_events == events + assert mock_client.list.call_count == 2 + + # Verify first call + first_call = mock_client.list.call_args_list[0] + assert first_call[1]["ascending"] == True + assert first_call[1]["prev_recorded_at"] == datetime(2023, 1, 1, 8, 0, tzinfo=timezone.utc) + assert first_call[1]["prev_id"] is None + assert first_call[1]["limit"] == LIST_EVENTS_DEFAULT_LIMIT + + # Verify second call (pagination) + second_call = mock_client.list.call_args_list[1] + assert second_call[1]["ascending"] == True + assert ( + second_call[1]["prev_recorded_at"] == events[LIST_EVENTS_DEFAULT_LIMIT - 1].recorded_at + ) + assert second_call[1]["prev_id"] == events[LIST_EVENTS_DEFAULT_LIMIT - 1].id + assert second_call[1]["limit"] == LIST_EVENTS_DEFAULT_LIMIT + + ### + # Second poll - should make one call for new events + ### + + mock_client.reset_mock() + mock_client.list.return_value = [] + + result_events = list(tracker.poll()) + + assert result_events == [] + mock_client.list.assert_called_once_with( + ascending=True, + **asdict(filters), + prev_recorded_at=events[-1].recorded_at - timedelta(seconds=20), + prev_id=None, + limit=LIST_EVENTS_DEFAULT_LIMIT, + ) diff --git a/src/tests/_internal/cli/utils/conftest.py b/src/tests/_internal/cli/utils/conftest.py new file mode 100644 index 0000000000..0f87c88bd0 --- /dev/null +++ b/src/tests/_internal/cli/utils/conftest.py @@ -0,0 +1,19 @@ +from unittest.mock import Mock + +import pytest + +from dstack._internal.server.services.docker import ImageConfig, ImageConfigObject + + +@pytest.fixture +def image_config_mock(monkeypatch: pytest.MonkeyPatch) -> ImageConfig: + image_config = ImageConfig.parse_obj({"User": None, "Entrypoint": None, "Cmd": ["/bin/bash"]}) + 
monkeypatch.setattr( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + Mock(return_value=image_config), + ) + monkeypatch.setattr( + "dstack._internal.server.services.docker.get_image_config", + Mock(return_value=ImageConfigObject(config=image_config)), + ) + return image_config diff --git a/src/tests/_internal/cli/utils/test_common.py b/src/tests/_internal/cli/utils/test_common.py new file mode 100644 index 0000000000..d439b9e7fa --- /dev/null +++ b/src/tests/_internal/cli/utils/test_common.py @@ -0,0 +1,103 @@ +import os +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Generator +from unittest.mock import patch + +import pytest + +from dstack._internal.cli.utils.common import _get_cli_log_file + + +@pytest.fixture +def mock_dstack_dir(tmp_path: Path) -> Generator[Path, None, None]: + with patch("dstack._internal.cli.utils.common.get_dstack_dir") as mock: + mock.return_value = tmp_path + yield tmp_path + + +class TestGetCliLogFile: + def test_no_existing_dir(self, mock_dstack_dir: Path): + log_dir = mock_dstack_dir / "logs" / "cli" + expected_log_file = log_dir / "latest.log" + assert not log_dir.exists() + + result = _get_cli_log_file() + + assert log_dir.exists() + assert result == expected_log_file + + def test_no_rotation_needed_for_today_file(self, mock_dstack_dir: Path): + log_dir = mock_dstack_dir / "logs" / "cli" + log_dir.mkdir(parents=True, exist_ok=True) + latest_log = log_dir / "latest.log" + latest_log.touch() + + result = _get_cli_log_file() + + assert result == latest_log + assert latest_log.exists(), "latest.log should not have been renamed" + + @patch("dstack._internal.cli.utils.common.datetime") + def test_simple_rotation(self, mock_datetime, mock_dstack_dir: Path): + # Mock "now" to be a specific date + now = datetime(2023, 10, 27, 10, 0, 0, tzinfo=timezone.utc) + mock_datetime.now.return_value = now + # Ensure fromtimestamp still works correctly for the System 
Under Test + mock_datetime.fromtimestamp.side_effect = lambda ts, tz: datetime.fromtimestamp(ts, tz) + + log_dir = mock_dstack_dir / "logs" / "cli" + log_dir.mkdir(parents=True, exist_ok=True) + latest_log = log_dir / "latest.log" + latest_log.touch() + + # Set the modification time to yesterday + yesterday = now - timedelta(days=1) + mtime = yesterday.timestamp() + os.utime(latest_log, (mtime, mtime)) + + # The expected rotated file name is based on the modification time (yesterday) + date_str = yesterday.strftime("%Y-%m-%d") + expected_rotated_log = log_dir / f"{date_str}.log" + + result = _get_cli_log_file() + + assert result == log_dir / "latest.log" + assert not latest_log.exists(), "The original latest.log should have been renamed" + assert expected_rotated_log.exists(), "The log file should have been rotated" + + @patch("dstack._internal.cli.utils.common.datetime") + def test_rotation_with_conflict(self, mock_datetime, mock_dstack_dir: Path): + now = datetime(2023, 10, 27, 10, 0, 0, tzinfo=timezone.utc) + yesterday = now - timedelta(days=1) + mock_datetime.now.return_value = now + mock_datetime.fromtimestamp.side_effect = lambda ts, tz: datetime.fromtimestamp(ts, tz) + + log_dir = mock_dstack_dir / "logs" / "cli" + log_dir.mkdir(parents=True, exist_ok=True) + + # Create the old 'latest.log' and set its modification time to yesterday + latest_log = log_dir / "latest.log" + latest_log.touch() + mtime = yesterday.timestamp() + os.utime(latest_log, (mtime, mtime)) + + # Create conflicting files that already exist from a previous rotation + date_str = yesterday.strftime("%Y-%m-%d") + conflicting_log_1 = log_dir / f"{date_str}.log" + conflicting_log_1.touch() + conflicting_log_2 = log_dir / f"{date_str}-1.log" + conflicting_log_2.touch() + + # We expect the file to be rotated to the next available counter + expected_rotated_log = log_dir / f"{date_str}-2.log" + + result = _get_cli_log_file() + + assert result == log_dir / "latest.log" + assert not 
latest_log.exists(), "The original latest.log should have been renamed" + assert conflicting_log_1.exists(), "Existing rotated log should be untouched" + assert conflicting_log_2.exists(), "Existing rotated log with counter should be untouched" + assert expected_rotated_log.exists(), ( + "The log should have rotated to the next available counter" + ) diff --git a/src/tests/_internal/cli/utils/test_fleet.py b/src/tests/_internal/cli/utils/test_fleet.py new file mode 100644 index 0000000000..00fedff685 --- /dev/null +++ b/src/tests/_internal/cli/utils/test_fleet.py @@ -0,0 +1,525 @@ +import re +from datetime import datetime, timezone +from typing import List, Optional +from uuid import uuid4 + +from rich.table import Table +from rich.text import Text + +from dstack._internal.cli.utils.fleet import get_fleets_table +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import ( + Fleet, + FleetConfiguration, + FleetNodesSpec, + FleetSpec, + FleetStatus, + InstanceGroupPlacement, + SSHHostParams, + SSHParams, +) +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + Instance, + InstanceStatus, + InstanceType, + Resources, + SSHKey, +) +from dstack._internal.core.models.profiles import Profile, SpotPolicy +from dstack._internal.core.models.resources import GPUSpec, Range, ResourcesSpec + + +def _strip_rich_markup(text: str) -> str: + return re.sub(r"\[[^\]]*\]([^\[]*)\[/[^\]]*\]", r"\1", text) + + +def get_table_cells(table: Table) -> list[dict[str, str]]: + rows = [] + + if not table.columns: + return rows + + num_rows = len(table.columns[0]._cells) + + for row_idx in range(num_rows): + row = {} + for col in table.columns: + col_name = str(col.header) + if row_idx < len(col._cells): + cell_value = col._cells[row_idx] + if isinstance(cell_value, Text): + row[col_name] = cell_value.plain + else: + text = str(cell_value) + row[col_name] = _strip_rich_markup(text) + else: + row[col_name] = "" + 
rows.append(row) + + return rows + + +def get_table_cell_style(table: Table, column_name: str, row_idx: int = 0) -> Optional[str]: + for col in table.columns: + if str(col.header) == column_name: + if row_idx < len(col._cells): + cell_value = col._cells[row_idx] + if isinstance(cell_value, Text): + return str(cell_value.style) if cell_value.style else None + text = str(cell_value) + match = re.search(r"\[([^\]]+)\][^\[]*\[/\]", text) + if match: + return match.group(1) + return None + return None + + +def create_test_instance( + instance_num: int = 0, + backend: BackendType = BackendType.AWS, + region: str = "us-east-1", + status: InstanceStatus = InstanceStatus.IDLE, + price: Optional[float] = 0.50, + spot: bool = False, + gpu_name: Optional[str] = None, + gpu_count: int = 0, + gpu_memory_mib: int = 0, +) -> Instance: + gpus = [] + if gpu_count > 0 and gpu_name: + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib)] * gpu_count + + resources = Resources( + cpus=4, + memory_mib=16384, + gpus=gpus, + spot=spot, + disk=Disk(size_mib=102400), + ) + instance_type = InstanceType(name="test-instance", resources=resources) + + return Instance( + id=uuid4(), + project_name="test-project", + name=f"instance-{instance_num}", + instance_num=instance_num, + backend=backend, + region=region, + status=status, + price=price, + instance_type=instance_type, + created=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc), + ) + + +def create_backend_fleet( + name: str = "test-fleet", + nodes_min: int = 0, + nodes_max: int = 2, + backends: Optional[List[BackendType]] = None, + spot_policy: SpotPolicy = SpotPolicy.AUTO, + max_price: Optional[float] = None, + placement: Optional[InstanceGroupPlacement] = None, + gpu_count_min: int = 0, + gpu_count_max: int = 0, + instances: Optional[List[Instance]] = None, + status: FleetStatus = FleetStatus.ACTIVE, + project_name: str = "test-project", +) -> Fleet: + nodes = FleetNodesSpec(min=nodes_min, target=nodes_min, max=nodes_max) + + gpu_spec = 
None + if gpu_count_max > 0: + gpu_spec = GPUSpec(count=Range[int](min=gpu_count_min, max=gpu_count_max)) + + resources = ResourcesSpec(gpu=gpu_spec) if gpu_spec else ResourcesSpec() + + config = FleetConfiguration( + name=name, + nodes=nodes, + backends=backends, + placement=placement, + resources=resources, + ) + + profile = Profile(name="default", spot_policy=spot_policy, max_price=max_price) + + spec = FleetSpec( + configuration=config, + configuration_path="fleet.dstack.yml", + profile=profile, + ) + + return Fleet( + id=uuid4(), + name=name, + project_name=project_name, + spec=spec, + created_at=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc), + status=status, + instances=instances or [], + ) + + +def create_ssh_fleet( + name: str = "ssh-fleet", + hosts: Optional[List[str]] = None, + placement: Optional[InstanceGroupPlacement] = None, + instances: Optional[List[Instance]] = None, + status: FleetStatus = FleetStatus.ACTIVE, +) -> Fleet: + if hosts is None: + hosts = ["10.0.0.1", "10.0.0.2"] + + ssh_key = SSHKey(public="ssh-rsa AAAA...", private="-----BEGIN PRIVATE KEY-----\n...") + ssh_config = SSHParams( + user="ubuntu", + ssh_key=ssh_key, + hosts=[SSHHostParams(hostname=h) for h in hosts], + network=None, + ) + + config = FleetConfiguration( + name=name, + ssh_config=ssh_config, + placement=placement, + ) + + spec = FleetSpec( + configuration=config, + configuration_path="fleet.dstack.yml", + profile=Profile(name="default"), + ) + + return Fleet( + id=uuid4(), + name=name, + project_name="test-project", + spec=spec, + created_at=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc), + status=status, + instances=instances or [], + ) + + +class TestGetFleetsTable: + def test_backend_fleet_without_verbose(self): + instance = create_test_instance( + instance_num=0, + backend=BackendType.AWS, + region="us-east-1", + status=InstanceStatus.IDLE, + price=0.50, + spot=True, + ) + fleet = create_backend_fleet( + name="my-cloud", + nodes_min=0, + nodes_max=4, + 
backends=[BackendType.AWS], + spot_policy=SpotPolicy.AUTO, + instances=[instance], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 2 # 1 fleet row + 1 instance row + + fleet_row = cells[0] + assert fleet_row["NAME"] == "my-cloud" + assert fleet_row["NODES"] == "0..4" + assert fleet_row["BACKEND"] == "aws" + assert fleet_row["SPOT"] == "auto" + assert fleet_row["PRICE"] == "-" # no max_price set + assert fleet_row["STATUS"] == "active" + + instance_row = cells[1] + assert "instance=0" in instance_row["NAME"] + assert instance_row["BACKEND"] == "aws (us-east-1)" + assert instance_row["SPOT"] == "spot" + assert instance_row["PRICE"] == "$0.5" + assert instance_row["STATUS"] == "idle" + + def test_backend_fleet_with_verbose(self): + instance = create_test_instance( + instance_num=0, + backend=BackendType.GCP, + region="us-west4", + status=InstanceStatus.BUSY, + price=1.25, + spot=False, + ) + fleet = create_backend_fleet( + name="my-cloud", + nodes_min=1, + nodes_max=1, + backends=[BackendType.GCP], + spot_policy=SpotPolicy.ONDEMAND, + max_price=2.0, + placement=InstanceGroupPlacement.CLUSTER, + instances=[instance], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=True) + cells = get_table_cells(table) + + assert len(cells) == 2 + + fleet_row = cells[0] + assert fleet_row["NAME"] == "my-cloud" + assert fleet_row["NODES"] == "1 (cluster)" + assert fleet_row["BACKEND"] == "gcp" + assert fleet_row["SPOT"] == "on-demand" + assert fleet_row["PRICE"] == "$0..$2" + assert fleet_row["STATUS"] == "active" + + instance_row = cells[1] + assert "instance=0" in instance_row["NAME"] + assert instance_row["BACKEND"] == "gcp (us-west4)" + assert instance_row["SPOT"] == "on-demand" + assert instance_row["PRICE"] == "$1.25" + + def test_ssh_fleet_without_verbose(self): + instance1 = create_test_instance( + instance_num=0, + backend=BackendType.REMOTE, + 
region="", + status=InstanceStatus.IDLE, + price=None, + spot=False, + gpu_name="L4", + gpu_count=1, + gpu_memory_mib=24576, + ) + instance2 = create_test_instance( + instance_num=1, + backend=BackendType.REMOTE, + region="", + status=InstanceStatus.BUSY, + price=None, + spot=False, + gpu_name="L4", + gpu_count=1, + gpu_memory_mib=24576, + ) + fleet = create_ssh_fleet( + name="my-ssh", + hosts=["10.0.0.1", "10.0.0.2"], + instances=[instance1, instance2], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 3 # 1 fleet row + 2 instance rows + + fleet_row = cells[0] + assert fleet_row["NAME"] == "my-ssh" + assert fleet_row["NODES"] == "2" + assert fleet_row["BACKEND"] == "ssh" + assert fleet_row["SPOT"] == "-" + assert fleet_row["PRICE"] == "-" + assert fleet_row["STATUS"] == "active" + + for i, instance_row in enumerate(cells[1:], start=0): + assert f"instance={i}" in instance_row["NAME"] + assert instance_row["BACKEND"] == "ssh" + assert instance_row["SPOT"] == "-" + assert instance_row["PRICE"] == "-" + + def test_ssh_fleet_with_verbose(self): + instance = create_test_instance( + instance_num=0, + backend=BackendType.REMOTE, + region="", + status=InstanceStatus.IDLE, + price=None, + spot=False, + ) + fleet = create_ssh_fleet( + name="my-ssh", + hosts=["10.0.0.1"], + placement=InstanceGroupPlacement.CLUSTER, + instances=[instance], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=True) + cells = get_table_cells(table) + + assert len(cells) == 2 + + fleet_row = cells[0] + assert fleet_row["NAME"] == "my-ssh" + assert fleet_row["NODES"] == "1 (cluster)" + assert fleet_row["RESOURCES"] == "-" + assert fleet_row["BACKEND"] == "ssh" + assert fleet_row["SPOT"] == "-" + assert fleet_row["PRICE"] == "-" + + instance_row = cells[1] + assert "instance=0" in instance_row["NAME"] + assert instance_row["BACKEND"] == "ssh" + assert instance_row["SPOT"] 
== "-" + assert instance_row["PRICE"] == "-" + + def test_mixed_fleets(self): + backend_instance = create_test_instance( + instance_num=0, + backend=BackendType.AWS, + region="us-east-1", + status=InstanceStatus.BUSY, + price=0.75, + spot=True, + ) + backend_fleet = create_backend_fleet( + name="cloud-fleet", + nodes_min=0, + nodes_max=2, + backends=[BackendType.AWS], + spot_policy=SpotPolicy.SPOT, + instances=[backend_instance], + ) + + ssh_instance = create_test_instance( + instance_num=0, + backend=BackendType.REMOTE, + region="", + status=InstanceStatus.IDLE, + price=None, + spot=False, + ) + ssh_fleet = create_ssh_fleet( + name="ssh-fleet", + hosts=["10.0.0.1"], + instances=[ssh_instance], + ) + + table = get_fleets_table( + [backend_fleet, ssh_fleet], current_project="test-project", verbose=False + ) + cells = get_table_cells(table) + + assert len(cells) == 4 # 2 fleet rows + 2 instance rows + + assert cells[0]["NAME"] == "cloud-fleet" + assert cells[0]["NODES"] == "0..2" + assert cells[0]["BACKEND"] == "aws" + assert cells[0]["SPOT"] == "spot" + + assert "instance=0" in cells[1]["NAME"] + assert cells[1]["SPOT"] == "spot" + assert cells[1]["PRICE"] == "$0.75" + + assert cells[2]["NAME"] == "ssh-fleet" + assert cells[2]["NODES"] == "1" + assert cells[2]["BACKEND"] == "ssh" + assert cells[2]["SPOT"] == "-" + assert cells[2]["PRICE"] == "-" + + assert "instance=0" in cells[3]["NAME"] + assert cells[3]["SPOT"] == "-" + assert cells[3]["PRICE"] == "-" + + def test_fleet_status_colors(self): + # Add instances to avoid placeholder rows affecting row indices + active_instance = create_test_instance(instance_num=0, status=InstanceStatus.IDLE) + active_fleet = create_backend_fleet( + name="active", status=FleetStatus.ACTIVE, instances=[active_instance] + ) + + terminating_instance = create_test_instance( + instance_num=0, status=InstanceStatus.TERMINATING + ) + terminating_fleet = create_backend_fleet( + name="terminating", status=FleetStatus.TERMINATING, 
instances=[terminating_instance] + ) + + table = get_fleets_table( + [active_fleet, terminating_fleet], current_project="test-project", verbose=False + ) + + active_style = get_table_cell_style(table, "STATUS", 0) + assert active_style == "bold white" + + # Row 2 (after active fleet's instance) + terminating_style = get_table_cell_style(table, "STATUS", 2) + assert terminating_style == "bold deep_sky_blue1" + + def test_instance_status_colors(self): + idle_instance = create_test_instance(instance_num=0, status=InstanceStatus.IDLE) + busy_instance = create_test_instance(instance_num=1, status=InstanceStatus.BUSY) + + fleet = create_backend_fleet( + name="test", + instances=[idle_instance, busy_instance], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + + idle_style = get_table_cell_style(table, "STATUS", 1) + assert idle_style == "bold sea_green3" + + busy_style = get_table_cell_style(table, "STATUS", 2) + assert busy_style == "bold white" + + def test_empty_fleet(self): + fleet = create_backend_fleet(name="empty-fleet", instances=[]) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 1 + assert cells[0]["NAME"] == "empty-fleet" + + def test_fleet_with_max_price(self): + fleet = create_backend_fleet( + name="priced-fleet", + max_price=5.0, + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert cells[0]["PRICE"] == "$0..$5" + + def test_fleet_with_multiple_backends(self): + fleet = create_backend_fleet( + name="multi-backend", + backends=[BackendType.AWS, BackendType.GCP, BackendType.AZURE], + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert cells[0]["BACKEND"] == "aws, gcp, azure" + + def test_fleet_with_any_backend(self): + fleet = create_backend_fleet( + name="any-backend", + 
backends=None, + ) + + table = get_fleets_table([fleet], current_project="test-project", verbose=False) + cells = get_table_cells(table) + + assert cells[0]["BACKEND"] == "*" + + def test_with_imported_fleet(self): + current_project_fleet = create_backend_fleet( + name="current-fleet", project_name="current-project" + ) + other_project_fleet = create_backend_fleet( + name="other-fleet", project_name="other-project" + ) + table = get_fleets_table( + [current_project_fleet, other_project_fleet], + verbose=False, + current_project="current-project", + ) + cells = get_table_cells(table) + assert len(cells) == 2 + assert cells[0]["NAME"] == "current-fleet" + assert cells[1]["NAME"] == "other-project/other-fleet" diff --git a/src/tests/_internal/cli/utils/test_offer.py b/src/tests/_internal/cli/utils/test_offer.py new file mode 100644 index 0000000000..9a9a4dfb16 --- /dev/null +++ b/src/tests/_internal/cli/utils/test_offer.py @@ -0,0 +1,91 @@ +import pytest + +from dstack._internal.cli.utils.common import console +from dstack._internal.cli.utils.run import print_run_plan +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import ApplyAction +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceType, + Resources, +) +from dstack._internal.core.models.runs import JobPlan, RunPlan +from dstack._internal.server.services.jobs import get_jobs_from_run_spec +from dstack._internal.server.testing.common import get_run_spec + +_OFFER_FLEET_HINT = ( + "Hint: Existing fleets are ignored, and all available offers are shown." + " To filter by fleet, pass --fleet NAME." 
+) +_OFFER_FLEET_HINT_START = "Hint: Existing fleets are ignored" + + +def _get_offer(index: int) -> InstanceOfferWithAvailability: + return InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name=f"instance-{index}", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us-east-1", + price=float(index), + availability=InstanceAvailability.AVAILABLE, + ) + + +async def _get_run_plan( + *, offers: list[InstanceOfferWithAvailability], total_offers: int +) -> RunPlan: + run_spec = get_run_spec(repo_id="test-repo") + job = (await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0))[0] + return RunPlan( + project_name="test-project", + user="test-user", + run_spec=run_spec, + effective_run_spec=run_spec, + job_plans=[ + JobPlan( + job_spec=job.job_spec, + offers=offers, + total_offers=total_offers, + max_price=max((offer.price for offer in offers), default=None), + ) + ], + action=ApplyAction.CREATE, + ) + + +class TestPrintRunPlanOfferHint: + @pytest.mark.asyncio + async def test_prints_hint_before_short_offer_table(self): + run_plan = await _get_run_plan(offers=[_get_offer(1), _get_offer(2)], total_offers=2) + + with console.capture() as capture: + print_run_plan( + run_plan, + include_run_properties=False, + show_offer_fleet_hint=True, + ) + + output = capture.get() + assert " ".join(_OFFER_FLEET_HINT.split()) in " ".join(output.split()) + assert output.index(_OFFER_FLEET_HINT_START) < output.index("1 aws (us-east-1)") + + @pytest.mark.asyncio + async def test_prints_hint_after_truncated_offer_table(self): + offers = [_get_offer(index) for index in range(1, 4)] + run_plan = await _get_run_plan(offers=offers, total_offers=10) + + with console.capture() as capture: + print_run_plan( + run_plan, + include_run_properties=False, + show_offer_fleet_hint=True, + ) + + output = capture.get() + shown_footer = "Shown 3 of 10 offers, $3max" + assert shown_footer in output + assert " 
".join(_OFFER_FLEET_HINT.split()) in " ".join(output.split()) + assert output.index(shown_footer) < output.index(_OFFER_FLEET_HINT_START) diff --git a/src/tests/_internal/cli/utils/test_run.py b/src/tests/_internal/cli/utils/test_run.py new file mode 100644 index 0000000000..4ff9114393 --- /dev/null +++ b/src/tests/_internal/cli/utils/test_run.py @@ -0,0 +1,577 @@ +import re +from datetime import datetime, timezone +from typing import Optional +from unittest.mock import Mock + +import pytest +from rich.table import Table +from rich.text import Text +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from dstack._internal.cli.utils.run import _format_pull_progress, get_runs_table +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import ( + AnyRunConfiguration, + ServiceConfiguration, + TaskConfiguration, +) +from dstack._internal.core.models.instances import Disk, InstanceType, Resources +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import ( + ImagePullProgress, + JobProvisioningData, + JobStatus, + JobTerminationReason, + RunStatus, +) +from dstack._internal.server.models import RunModel +from dstack._internal.server.services import encryption # noqa: F401 # import for side-effect +from dstack._internal.server.services.runs import run_model_to_run +from dstack._internal.server.testing.common import ( + create_job, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + get_run_spec, +) +from dstack.api import Run +from dstack.api.server import APIClient + + +def _strip_rich_markup(text: str) -> str: + return re.sub(r"\[[^\]]*\]([^\[]*)\[/[^\]]*\]", r"\1", text) + + +def get_table_cells(table: Table) -> list[dict[str, str]]: + rows = [] + + if not table.columns: + return rows + + 
num_rows = len(table.columns[0]._cells) + + for row_idx in range(num_rows): + row = {} + for col in table.columns: + col_name = str(col.header) + if row_idx < len(col._cells): + cell_value = col._cells[row_idx] + if isinstance(cell_value, Text): + row[col_name] = cell_value.plain + else: + text = str(cell_value) + row[col_name] = _strip_rich_markup(text) + else: + row[col_name] = "" + rows.append(row) + + return rows + + +def get_table_cell_style(table: Table, column_name: str, row_idx: int = 0) -> Optional[str]: + for col in table.columns: + if str(col.header) == column_name: + if row_idx < len(col._cells): + cell_value = col._cells[row_idx] + if isinstance(cell_value, Text): + return str(cell_value.style) if cell_value.style else None + text = str(cell_value) + match = re.search(r"\[([^\]]+)\][^\[]*\[/\]", text) + if match: + return match.group(1) + return None + return None + + +async def create_run_with_job( + session: AsyncSession, + run_name: str = "test-run", + run_status: Optional[RunStatus] = None, + job_status: JobStatus = JobStatus.RUNNING, + configuration: Optional[AnyRunConfiguration] = None, + job_provisioning_data: Optional[JobProvisioningData] = None, + termination_reason: Optional[JobTerminationReason] = None, + exit_status: Optional[int] = None, + termination_reason_message: Optional[str] = None, + submitted_at: Optional[datetime] = None, +) -> Run: + if submitted_at is None: + submitted_at = datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + + if configuration is None: + configuration = TaskConfiguration( + type="task", + image="ubuntu:latest", + commands=["echo hello"], + ) + + if run_status is None: + if job_status == JobStatus.DONE: + run_status = RunStatus.DONE + elif job_status == JobStatus.FAILED: + run_status = RunStatus.FAILED + elif job_status in [JobStatus.TERMINATED, 
JobStatus.ABORTED]: + run_status = RunStatus.TERMINATED + elif job_status == JobStatus.TERMINATING: + run_status = RunStatus.TERMINATING + elif job_status == JobStatus.PROVISIONING: + run_status = RunStatus.PROVISIONING + elif job_status == JobStatus.PULLING: + run_status = RunStatus.PROVISIONING + else: + run_status = RunStatus.RUNNING + + run_spec = get_run_spec( + run_name=run_name, + repo_id=repo.name, + profile=Profile(name="default"), + configuration=configuration, + ) + + run_model_db = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_name, + run_spec=run_spec, + status=run_status, + submitted_at=submitted_at, + ) + + if job_provisioning_data is None: + resources = Resources( + cpus=2, + memory_mib=4096, + gpus=[], + spot=False, + disk=Disk(size_mib=102400), + ) + instance_type = InstanceType(name="t2.medium", resources=resources) + job_provisioning_data = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + cpu_count=2, + memory_gib=4, + spot=False, + hostname="1.2.3.4", + price=0.0464, + instance_type=instance_type, + ) + + job_model = await create_job( + session=session, + run=run_model_db, + status=job_status, + submitted_at=submitted_at, + last_processed_at=submitted_at, + job_provisioning_data=job_provisioning_data, + termination_reason=termination_reason, + ) + + if exit_status is not None: + job_model.exit_status = exit_status + if termination_reason_message is not None: + job_model.termination_reason_message = termination_reason_message + if exit_status is not None or termination_reason_message is not None: + await session.commit() + + await session.refresh(run_model_db) + + res = await session.execute( + select(RunModel).where(RunModel.id == run_model_db.id).options(selectinload(RunModel.jobs)) + ) + run_model_db = res.scalar_one() + + run_model = run_model_to_run(run_model_db) + + return Run( + api_client=Mock(spec=APIClient), + project=project.name, + run=run_model, + ) + + 
+@pytest.mark.asyncio +@pytest.mark.usefixtures("test_db", "image_config_mock") +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGetRunsTable: + async def test_simple_run(self, session: AsyncSession): + api_run = await create_run_with_job(session=session) + table = get_runs_table([api_run], verbose=False) + + cells = get_table_cells(table) + assert len(cells) == 1 + row = cells[0] + + assert row["NAME"] == "test-run" + assert row["BACKEND"] == "aws (us-east-1)" + assert row["GPU"] == "-" + assert row["PRICE"] == "$0.0464" + assert row["STATUS"] == "running" + assert row["SUBMITTED"] == "3 years ago" + + name_column = next(col for col in table.columns if str(col.header) == "NAME") + assert name_column.style == "bold" + + status_style = get_table_cell_style(table, "STATUS", 0) + assert status_style == "bold sea_green3" + + @pytest.mark.parametrize( + "job_status,termination_reason,exit_status,termination_reason_message,expected_status,expected_style", + [ + (JobStatus.DONE, None, None, None, "exited (0)", "grey"), + ( + JobStatus.FAILED, + JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + 1, + None, + "exited (1)", + "indian_red1", + ), + ( + JobStatus.FAILED, + JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + 42, + None, + "exited (42)", + "indian_red1", + ), + ( + JobStatus.FAILED, + JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + None, + None, + "no offers", + "gold1", + ), + ( + JobStatus.FAILED, + JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + None, + "No matching fleet found. 
Possible reasons: https://dstack.ai/docs/guides/troubleshooting/#no-fleets", + "no fleets", + "indian_red1", + ), + ( + JobStatus.FAILED, + JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + None, + None, + "interrupted", + "gold1", + ), + ( + JobStatus.FAILED, + JobTerminationReason.INSTANCE_UNREACHABLE, + None, + None, + "error", + "indian_red1", + ), + ( + JobStatus.TERMINATED, + JobTerminationReason.TERMINATED_BY_USER, + None, + None, + "stopped", + "grey", + ), + ( + JobStatus.TERMINATED, + JobTerminationReason.ABORTED_BY_USER, + None, + None, + "aborted", + "grey", + ), + (JobStatus.RUNNING, None, None, None, "running", "bold sea_green3"), + (JobStatus.PROVISIONING, None, None, None, "provisioning", "bold deep_sky_blue1"), + (JobStatus.PULLING, None, None, None, "pulling", "bold sea_green3"), + (JobStatus.TERMINATING, None, None, None, "terminating", "bold deep_sky_blue1"), + ], + ) + async def test_status_messages( + self, + session: AsyncSession, + job_status: JobStatus, + termination_reason: Optional[JobTerminationReason], + exit_status: Optional[int], + termination_reason_message: Optional[str], + expected_status: str, + expected_style: str, + ): + api_run = await create_run_with_job( + session=session, + job_status=job_status, + termination_reason=termination_reason, + exit_status=exit_status, + termination_reason_message=termination_reason_message, + ) + + table = get_runs_table([api_run], verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 1 + assert cells[0]["STATUS"] == expected_status + + status_style = get_table_cell_style(table, "STATUS", 0) + assert status_style == expected_style + + async def test_multi_node_task_with_multiple_jobs(self, session: AsyncSession): + # Verifies that a multi-node task with 3 jobs (all same replica_num=0, different job_num=0,1,2) + # displays only job= in table rows, not replica=, since all jobs share the same replica. 
+ # Expected: 4 rows total (1 run header + 3 job rows with job=0,1,2). + submitted_at = datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + + configuration = TaskConfiguration( + type="task", + image="ubuntu:latest", + commands=["echo hello"], + nodes=3, + ) + + run_spec = get_run_spec( + run_name="multi-node-run", + repo_id=repo.name, + profile=Profile(name="default"), + configuration=configuration, + ) + + run_model_db = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="multi-node-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + submitted_at=submitted_at, + ) + + resources = Resources( + cpus=2, + memory_mib=4096, + gpus=[], + spot=False, + disk=Disk(size_mib=102400), + ) + instance_type = InstanceType(name="t2.medium", resources=resources) + job_provisioning_data = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + cpu_count=2, + memory_gib=4, + spot=False, + hostname="1.2.3.4", + price=0.0464, + instance_type=instance_type, + ) + + for job_num in range(3): + await create_job( + session=session, + run=run_model_db, + status=JobStatus.RUNNING, + submitted_at=submitted_at, + last_processed_at=submitted_at, + job_provisioning_data=job_provisioning_data, + replica_num=0, + job_num=job_num, + ) + + await session.refresh(run_model_db) + + res = await session.execute( + select(RunModel) + .where(RunModel.id == run_model_db.id) + .options(selectinload(RunModel.jobs)) + ) + run_model_db = res.scalar_one() + + run_model = run_model_to_run(run_model_db) + + api_run = Run( + api_client=Mock(spec=APIClient), + project=project.name, + run=run_model, + ) + + table = get_runs_table([api_run], verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 4 + assert cells[0]["NAME"] == "multi-node-run" + + for i in range(1, 4): + 
job_row = cells[i] + assert "replica=" not in job_row["NAME"] + assert f"job={i - 1}" in job_row["NAME"] + assert job_row["STATUS"] == "running" + + async def test_service_with_multiple_replicas_and_jobs(self, session: AsyncSession): + # Verifies that a service with 3 replicas and 1 job per replica displays replica= but not job= + # in table rows (since there's only one job per replica). Expected: 4 rows total (1 run header + 3 job rows). + submitted_at = datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + + configuration = ServiceConfiguration( + type="service", + image="ubuntu:latest", + commands=["echo hello"], + port=8000, + replicas=Range[int](min=3, max=3), + ) + + run_spec = get_run_spec( + run_name="service-run", + repo_id=repo.name, + profile=Profile(name="default"), + configuration=configuration, + ) + + run_model_db = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + submitted_at=submitted_at, + ) + + resources = Resources( + cpus=2, + memory_mib=4096, + gpus=[], + spot=False, + disk=Disk(size_mib=102400), + ) + instance_type = InstanceType(name="t2.medium", resources=resources) + job_provisioning_data = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + cpu_count=2, + memory_gib=4, + spot=False, + hostname="1.2.3.4", + price=0.0464, + instance_type=instance_type, + ) + + for replica_num in range(3): + await create_job( + session=session, + run=run_model_db, + status=JobStatus.RUNNING, + submitted_at=submitted_at, + last_processed_at=submitted_at, + job_provisioning_data=job_provisioning_data, + replica_num=replica_num, + job_num=0, + ) + + await session.refresh(run_model_db) + + res = await session.execute( + select(RunModel) + .where(RunModel.id == 
run_model_db.id) + .options(selectinload(RunModel.jobs)) + ) + run_model_db = res.scalar_one() + + run_model = run_model_to_run(run_model_db) + + api_run = Run( + api_client=Mock(spec=APIClient), + project=project.name, + run=run_model, + ) + + table = get_runs_table([api_run], verbose=False) + cells = get_table_cells(table) + + assert len(cells) == 4 + assert cells[0]["NAME"] == "service-run" + + for i in range(1, 4): + job_row = cells[i] + assert f"replica={i - 1}" in job_row["NAME"] + assert "job=" not in job_row["NAME"] + assert job_row["STATUS"] == "running" + + +@pytest.mark.parametrize( + "progress,expected", + [ + pytest.param( + ImagePullProgress( + downloaded_bytes=300 * 2**20, + extracted_bytes=200 * 2**20, + total_bytes=500 * 2**20, + is_total_bytes_final=True, + ), + "200/300/500MB", + id="mb_final", + ), + pytest.param( + ImagePullProgress( + downloaded_bytes=300 * 2**20, + extracted_bytes=200 * 2**20, + total_bytes=500 * 2**20, + is_total_bytes_final=False, + ), + "200/300/≥500MB", + id="mb_non_final", + ), + pytest.param( + ImagePullProgress( + downloaded_bytes=int(1.5 * 2**30), + extracted_bytes=1 * 2**30, + total_bytes=2 * 2**30, + is_total_bytes_final=True, + ), + "1.00/1.50/2.00GB", + id="gb_final", + ), + pytest.param( + ImagePullProgress( + downloaded_bytes=int(1.5 * 2**30), + extracted_bytes=1 * 2**30, + total_bytes=2 * 2**30, + is_total_bytes_final=False, + ), + "1.00/1.50/≥2.00GB", + id="gb_non_final", + ), + pytest.param( + ImagePullProgress( + downloaded_bytes=0, + extracted_bytes=0, + total_bytes=2**30, + is_total_bytes_final=True, + ), + "0.00/0.00/1.00GB", + id="gb_boundary", + ), + ], +) +def test_format_pull_progress(progress: ImagePullProgress, expected: str) -> None: + assert _format_pull_progress(progress) == expected diff --git a/src/tests/_internal/cli/utils/test_updates.py b/src/tests/_internal/cli/utils/test_updates.py new file mode 100644 index 0000000000..6e18c9e9b3 --- /dev/null +++ 
b/src/tests/_internal/cli/utils/test_updates.py @@ -0,0 +1,24 @@ +import pytest + +from dstack._internal.cli.utils import updates + + +@pytest.mark.parametrize( + "current_version,latest_version,expected", + [ + ("1.0.0", "1.0.1", True), # patch update, both releases + ("1.0.0", "2.0.0", True), # major update, both releases + ("1.0.0", "1.0.0", False), # same version + ("1.1.0", "1.0.9", False), # downgrade + ("1.0.0a1", "1.0.0", True), # pre-release to release (should show update) + ("1.0.0", "1.0.0a1", False), # release to pre-release (should NOT show update) + ("1.0.0b1", "1.0.0b2", True), # beta to beta (should show update) + ("1.0.0rc1", "1.0.0", True), # rc to release (should show update) + ("1.0.0", "1.0.0rc1", False), # release to rc (should NOT show update) + ("1.0.0a1", "1.0.0b1", True), # alpha to beta (should show update) + ("1.0.0b1", "1.0.0rc1", True), # beta to rc (should show update) + ("1.0.0rc1", "1.0.1a1", True), # rc to next alpha (should show update) + ], +) +def test_is_update_available(current_version, latest_version, expected): + assert updates.is_update_available(current_version, latest_version) == expected diff --git a/src/tests/_internal/core/backends/aws/__init__.py b/src/tests/_internal/core/backends/aws/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/aws/test_configurator.py b/src/tests/_internal/core/backends/aws/test_configurator.py new file mode 100644 index 0000000000..f18582c1fe --- /dev/null +++ b/src/tests/_internal/core/backends/aws/test_configurator.py @@ -0,0 +1,35 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.aws.configurator import AWSConfigurator +from dstack._internal.core.backends.aws.models import AWSAccessKeyCreds, AWSBackendConfigWithCreds +from dstack._internal.core.errors import ( + BackendAuthError, + BackendInvalidCredentialsError, +) + + +class TestAWSConfigurator: + def test_validate_config_valid(self): + 
config = AWSBackendConfigWithCreds( + creds=AWSAccessKeyCreds(access_key="valid", secret_key="valid"), regions=["us-west-1"] + ) + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + AWSConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = AWSBackendConfigWithCreds( + creds=AWSAccessKeyCreds(access_key="invalid", secret_key="invalid"), + regions=["us-west-1"], + ) + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate") as authenticate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + authenticate_mock.side_effect = BackendAuthError() + AWSConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "access_key"], ["creds", "secret_key"]] diff --git a/src/tests/_internal/core/backends/aws/test_resources.py b/src/tests/_internal/core/backends/aws/test_resources.py new file mode 100644 index 0000000000..efe19ad870 --- /dev/null +++ b/src/tests/_internal/core/backends/aws/test_resources.py @@ -0,0 +1,396 @@ +import logging +from unittest.mock import Mock + +import pytest + +from dstack._internal.core.backends.aws.models import AWSOSImage, AWSOSImageConfig +from dstack._internal.core.backends.aws.resources import ( + _create_network_interfaces_struct, + _is_valid_tag_key, + _is_valid_tag_value, + get_image_id_and_username, + validate_tags, +) +from dstack._internal.core.errors import BackendError, ComputeResourceNotFoundError + + +class TestIsValidTagKey: + @pytest.mark.parametrize( + "key", + [ + "Environment", + "Project123", + "special-chars-+/@=:_", + "a" * 128, + ], + ) + def test_valid_tag_key(self, key): + assert _is_valid_tag_key(key) + + @pytest.mark.parametrize( + "key", + [ + "aws:reserved", + "key\twith\nweird\nspaces", + "", + "a" * 129, + "Invalid#Char", + ], + ) + def 
test_invalid_tag_key(self, key): + assert not _is_valid_tag_key(key) + + +class TestIsValidTagValue: + @pytest.mark.parametrize( + "value", + [ + "Production", + "v1.0", + "", + "a" * 256, + ], + ) + def test_valid_tag_value(self, value): + assert _is_valid_tag_value(value) is True + + @pytest.mark.parametrize( + "value", + [ + "a" * 257, + "Invalid#Value", + ], + ) + def test_invalid_tag_value(self, value): + assert _is_valid_tag_value(value) is False + + +class TestValidateTags: + def test_validate_valid_tags(self): + tags = { + "Environment": "Production", + "Project": "AWS_Tag_Validator", + } + assert validate_tags(tags) is None + + def test_validate_invalid_tags(self): + tags = {"aws:ReservedKey": "SomeValue", "ValidKey": "Invalid#Value"} + with pytest.raises(BackendError, match="Invalid resource tags"): + validate_tags(tags) + + +class TestGetImageIdAndUsername: + @pytest.fixture + def ec2_client_mock(self) -> Mock: + mock = Mock(spec_set=["describe_images"]) + mock.describe_images.return_value = { + "Images": [ + { + "ImageId": "ami-00000000000000000", + "State": "available", + "CreationDate": "2000-01-01T00:00:00.000Z", + }, + ], + } + return mock + + def test_returns_the_latest_available(self, ec2_client_mock: Mock): + ec2_client_mock.describe_images.return_value = { + "Images": [ + # the latest, but not available + { + "ImageId": "ami-00000000000000001", + "State": "failed", + "CreationDate": "2024-01-01T00:00:00.000Z", + }, + # available, but not the latest + { + "ImageId": "ami-00000000000000002", + "State": "available", + "CreationDate": "2022-01-01T00:00:00.000Z", + }, + # the latest available + { + "ImageId": "ami-00000000000000003", + "State": "available", + "CreationDate": "2023-01-01T00:00:00.000Z", + }, + ] + } + image_id, username = get_image_id_and_username( + ec2_client_mock, + gpu_name=None, + instance_type="some", + ) + assert image_id == "ami-00000000000000003" + assert username == "ubuntu" + + def 
test_raises_resource_not_found_if_none_available( + self, + monkeypatch: pytest.MonkeyPatch, + caplog: pytest.LogCaptureFixture, + ec2_client_mock: Mock, + ): + monkeypatch.setattr("dstack._internal.settings.DSTACK_VM_BASE_IMAGE_VERSION", "0.0") + caplog.set_level(logging.WARNING) + ec2_client_mock.describe_images.return_value = { + "Images": [ + { + "ImageId": "ami-00000000000000000", + "State": "failed", + "CreationDate": "2000-01-01T00:00:00.000Z", + }, + ] + } + with pytest.raises(ComputeResourceNotFoundError): + get_image_id_and_username( + ec2_client_mock, + gpu_name=None, + instance_type="some", + ) + assert "image 'dstack-0.0' not found" in caplog.text + + @pytest.mark.parametrize( + ["cuda", "expected_name", "expected_owner"], + [ + [False, "dstack-0.0", "142421590066"], + [ + True, + "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) *", + "898082745236", + ], + ], + ) + def test_uses_default_image_name_and_account_id_if_image_config_not_provided( + self, + monkeypatch: pytest.MonkeyPatch, + ec2_client_mock: Mock, + cuda: bool, + expected_name: str, + expected_owner: str, + ): + monkeypatch.setattr("dstack._internal.settings.DSTACK_VM_BASE_IMAGE_VERSION", "0.0") + _, username = get_image_id_and_username( + ec2_client_mock, + gpu_name="A10G" if cuda else None, + instance_type="some", + ) + assert username == "ubuntu" + ec2_client_mock.describe_images.assert_called_once_with( + Filters=[{"Name": "name", "Values": [expected_name]}], Owners=[expected_owner] + ) + + @pytest.mark.parametrize( + ["cuda", "expected_name", "expected_owner", "expected_username"], + [ + [False, "cpu-ami", "123456789012", "debian"], + [True, "nvidia-ami", "self", "dstack"], + ], + ) + def test_uses_image_config_if_provided( + self, + ec2_client_mock: Mock, + cuda: bool, + expected_name: str, + expected_owner: str, + expected_username: str, + ): + image_config = AWSOSImageConfig( + cpu=AWSOSImage( + name="cpu-ami", + owner="123456789012", + user="debian", + ), + 
nvidia=AWSOSImage( + name="nvidia-ami", + user="dstack", + ), + ) + _, username = get_image_id_and_username( + ec2_client_mock, + gpu_name="A10G" if cuda else None, + instance_type="some", + image_config=image_config, + ) + assert username == expected_username + ec2_client_mock.describe_images.assert_called_once_with( + Filters=[{"Name": "name", "Values": [expected_name]}], + Owners=[expected_owner], + ) + + def test_raises_resource_not_found_if_image_config_property_not_set( + self, caplog: pytest.LogCaptureFixture, ec2_client_mock: Mock + ): + caplog.set_level(logging.WARNING) + image_config = AWSOSImageConfig( + nvidia=AWSOSImage( + name="nvidia-ami", + user="dstack", + ), + ) + with pytest.raises(ComputeResourceNotFoundError): + get_image_id_and_username( + ec2_client_mock, + gpu_name=None, + instance_type="some", + image_config=image_config, + ) + assert "cpu image not configured" in caplog.text + + +class TestCreateNetworkInterfacesStruct: + def test_non_efa_instance_single_interface(self): + interfaces = _create_network_interfaces_struct( + instance_type="m5.large", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=0, + ) + assert interfaces == [ + { + "AssociatePublicIpAddress": True, + "DeviceIndex": 0, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "interface", + }, + ] + + def test_non_efa_instance_no_public_ip(self): + interfaces = _create_network_interfaces_struct( + instance_type="m5.large", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=False, + max_efa_interfaces=0, + ) + assert interfaces[0]["AssociatePublicIpAddress"] is False + assert interfaces[0]["InterfaceType"] == "interface" + + def test_single_efa_interface(self): + interfaces = _create_network_interfaces_struct( + instance_type="g5.8xlarge", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=1, + ) + # multi_eni is False, so the single EFA NIC 
keeps the public IP + assert interfaces == [ + { + "AssociatePublicIpAddress": True, + "DeviceIndex": 0, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa", + }, + ] + + def test_multi_efa_instance(self): + interfaces = _create_network_interfaces_struct( + instance_type="p4d.24xlarge", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=4, + ) + # Multiple NICs disable auto-assigned public IP on every interface + assert interfaces[0] == { + "AssociatePublicIpAddress": False, + "DeviceIndex": 0, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa", + } + assert interfaces[1:] == [ + { + "AssociatePublicIpAddress": False, + "NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa-only", + } + for i in range(1, 4) + ] + + def test_p5_uses_efa_every_fourth_interface(self): + interfaces = _create_network_interfaces_struct( + instance_type="p5.48xlarge", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=32, + ) + assert len(interfaces) == 32 + assert all(i["NetworkCardIndex"] == idx for idx, i in enumerate(interfaces) if idx > 0) + # The primary NIC is a combined efa interface + assert interfaces[0]["InterfaceType"] == "efa" + assert "NetworkCardIndex" not in interfaces[0] + # Every 4th secondary NIC is a combined efa interface, the rest are efa-only + for idx, interface in enumerate(interfaces[1:], start=1): + expected = "efa" if idx % 4 == 0 else "efa-only" + assert interface["InterfaceType"] == expected, idx + + def test_p6_b200_efa_on_every_card(self): + # p6-b200 has 8 EFA-capable network cards (indexes 0-7), handled by the generic path + interfaces = _create_network_interfaces_struct( + instance_type="p6-b200.48xlarge", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=8, + ) + assert len(interfaces) == 8 + assert 
interfaces[0] == { + "AssociatePublicIpAddress": False, + "DeviceIndex": 0, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa", + } + assert interfaces[1:] == [ + { + "AssociatePublicIpAddress": False, + "NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa-only", + } + for i in range(1, 8) + ] + + def test_p6_b300_ena_only_primary_nic(self): + # p6-b300 has 17 network cards: the primary (index 0) supports only ENA, the remaining + # 16 cards (indexes 1-16) support EFA. max_efa_interfaces is 16. + interfaces = _create_network_interfaces_struct( + instance_type="p6-b300.48xlarge", + subnet_id="subnet-1", + security_group_id="sg-1", + allocate_public_ip=True, + max_efa_interfaces=16, + ) + # 1 ENA primary + 16 EFA secondary cards + assert len(interfaces) == 17 + # Primary card is a plain ENA interface, not EFA + assert interfaces[0] == { + "AssociatePublicIpAddress": False, + "DeviceIndex": 0, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "interface", + } + # EFA-only interfaces span network card indexes 1-16 + assert interfaces[1:] == [ + { + "AssociatePublicIpAddress": False, + "NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": "subnet-1", + "Groups": ["sg-1"], + "InterfaceType": "efa-only", + } + for i in range(1, 17) + ] diff --git a/src/tests/_internal/core/backends/azure/test_compute.py b/src/tests/_internal/core/backends/azure/test_compute.py index d83b04edae..a0ce0afaef 100644 --- a/src/tests/_internal/core/backends/azure/test_compute.py +++ b/src/tests/_internal/core/backends/azure/test_compute.py @@ -1,6 +1,6 @@ import pytest -from dstack import version +from dstack._internal import settings from dstack._internal.core.backends.azure.compute import VMImageVariant from dstack._internal.core.models.instances import Gpu, InstanceType, Resources @@ -55,9 +55,18 @@ def test_from_instance_type( @pytest.mark.parametrize( ["variant", "expected_name"], [ - 
[VMImageVariant.GRID, f"dstack-grid-{version.base_image}"], - [VMImageVariant.CUDA, f"dstack-cuda-{version.base_image}"], - [VMImageVariant.STANDARD, f"dstack-{version.base_image}"], + [ + VMImageVariant.GRID, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], + [ + VMImageVariant.CUDA, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], + [ + VMImageVariant.STANDARD, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], ], ) def test_get_image_name(self, variant: VMImageVariant, expected_name: str): diff --git a/src/tests/_internal/core/backends/azure/test_configurator.py b/src/tests/_internal/core/backends/azure/test_configurator.py new file mode 100644 index 0000000000..1a329e0cd5 --- /dev/null +++ b/src/tests/_internal/core/backends/azure/test_configurator.py @@ -0,0 +1,131 @@ +from unittest.mock import Mock, patch + +import pytest + +from dstack._internal.core.backends.azure.configurator import AzureConfigurator +from dstack._internal.core.backends.azure.models import ( + AzureBackendConfigWithCreds, + AzureClientCreds, +) +from dstack._internal.core.errors import ( + BackendAuthError, + BackendInvalidCredentialsError, + ServerClientError, +) + + +class TestAzureConfigurator: + def test_validate_config_valid(self): + config = AzureBackendConfigWithCreds( + creds=AzureClientCreds( + tenant_id="valid", + client_id="valid", + client_secret="valid", + ), + tenant_id="ten1", + subscription_id="sub1", + regions=["eastus"], + ) + with ( + patch("dstack._internal.core.backends.azure.auth.authenticate") as authenticate_mock, + patch("azure.mgmt.subscription.SubscriptionClient") as SubscriptionClientMock, + ): + authenticate_mock.return_value = Mock(), Mock() + subcription_client_mock = SubscriptionClientMock.return_value + subcription_client_mock.tenants.list.return_value = [Mock(tenant_id="ten1")] + 
subcription_client_mock.subscriptions.list.return_value = [ + Mock(subscription_id="sub1") + ] + AzureConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = AzureBackendConfigWithCreds( + creds=AzureClientCreds( + tenant_id="invalid", + client_id="invalid", + client_secret="invalid", + ), + tenant_id="invalid", + subscription_id="invalid", + regions=["eastus"], + ) + with ( + patch("dstack._internal.core.backends.azure.auth.authenticate") as mock_authenticate, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + mock_authenticate.side_effect = BackendAuthError() + AzureConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [ + ["creds", "tenant_id"], + ["creds", "client_id"], + ["creds", "client_secret"], + ] + + +class TestCheckConfigVpc: + def _make_config(self, **kwargs): + return AzureBackendConfigWithCreds( + creds=AzureClientCreds(tenant_id="t", client_id="c", client_secret="s"), + tenant_id="ten1", + subscription_id="sub1", + **kwargs, + ) + + def _check(self, config): + with ( + patch("azure.mgmt.network.NetworkManagementClient"), + patch( + "dstack._internal.core.backends.azure.compute.get_resource_group_network_subnet_or_error" + ), + ): + AzureConfigurator()._check_config_vpc(config, Mock()) + + def test_public_ips_false_requires_network_config(self): + config = self._make_config(regions=["westeurope"], public_ips=False) + with pytest.raises(ServerClientError, match="`vpc_ids` or `subnet_ids` must be specified"): + AzureConfigurator()._check_config_vpc(config, Mock()) + + def test_public_ips_false_with_vpc_ids_ok(self): + config = self._make_config( + regions=["westeurope"], public_ips=False, vpc_ids={"westeurope": "rg/net"} + ) + self._check(config) + + def test_public_ips_false_with_subnet_ids_ok(self): + config = self._make_config( + regions=["westeurope"], public_ips=False, subnet_ids={"westeurope": "rg/net/subnet"} + 
) + self._check(config) + + def test_overlap_raises(self): + config = self._make_config( + regions=["westeurope", "eastus"], + vpc_ids={"westeurope": "rg/net", "eastus": "rg/net2"}, + subnet_ids={"westeurope": "rg/net/subnet"}, + ) + with pytest.raises(ServerClientError, match="westeurope"): + AzureConfigurator()._check_config_vpc(config, Mock()) + + def test_uncovered_region_raises_with_vpc_ids(self): + config = self._make_config( + regions=["westeurope", "eastus"], + vpc_ids={"westeurope": "rg/net"}, + ) + with pytest.raises(ServerClientError, match="eastus"): + AzureConfigurator()._check_config_vpc(config, Mock()) + + def test_uncovered_region_raises_with_subnet_ids(self): + config = self._make_config( + regions=["westeurope", "eastus"], + subnet_ids={"westeurope": "rg/net/subnet"}, + ) + with pytest.raises(ServerClientError, match="eastus"): + AzureConfigurator()._check_config_vpc(config, Mock()) + + def test_mixed_vpc_and_subnet_ids_covers_all_regions(self): + config = self._make_config( + regions=["westeurope", "eastus"], + vpc_ids={"westeurope": "rg/net"}, + subnet_ids={"eastus": "rg/net/subnet"}, + ) + self._check(config) diff --git a/src/tests/_internal/core/backends/azure/test_resources.py b/src/tests/_internal/core/backends/azure/test_resources.py new file mode 100644 index 0000000000..12498fa9a0 --- /dev/null +++ b/src/tests/_internal/core/backends/azure/test_resources.py @@ -0,0 +1,74 @@ +import pytest + +from dstack._internal.core.backends.azure.resources import ( + _is_valid_tag_key, + _is_valid_tag_value, + validate_tags, +) +from dstack._internal.core.errors import BackendError + + +class TestValidateTags: + def test_valid_tags(self): + tags = {"ValidTag": "SomeValue"} + assert validate_tags(tags) is None + + def test_invalid_tags(self): + tags = {"Invalid", + "Invalid>key", + "Invalid%key", + "Invalid&key", + "Invalid\\key", + "Invalid?key", + "Invalid/key", + ], + ) + def test_invalid_tag_keys(self, key): + assert not _is_valid_tag_key(key) + + 
+class TestIsValidTagValue: + @pytest.mark.parametrize( + "value", + [ + "ValidValue", + "Value_with_special_chars!@#", + "a" * 256, + "", + ], + ) + def test_valid_tag_values(self, value): + assert _is_valid_tag_value(value) + + @pytest.mark.parametrize( + "value", + [ + "a" * 257, + ], + ) + def test_invalid_tag_values(self, value): + assert not _is_valid_tag_value(value) diff --git a/src/tests/_internal/core/backends/base/__init__.py b/src/tests/_internal/core/backends/base/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/base/test_compute.py b/src/tests/_internal/core/backends/base/test_compute.py new file mode 100644 index 0000000000..7892a3f0f5 --- /dev/null +++ b/src/tests/_internal/core/backends/base/test_compute.py @@ -0,0 +1,83 @@ +import re +from typing import Optional + +import gpuhunt +import pytest + +from dstack._internal.core.backends.base.compute import ( + GoArchType, + generate_unique_backend_name, + generate_unique_gateway_instance_name, + generate_unique_instance_name, + generate_unique_volume_name, + normalize_arch, +) +from dstack._internal.server.testing.common import ( + get_gateway_compute_configuration, + get_instance_configuration, + get_volume, +) + + +class TestGenerateUniqueInstanceName: + def test_generates_name(self): + configuration = get_instance_configuration( + project_name="project1", instance_name="my-instance" + ) + name = generate_unique_instance_name(configuration, 60) + assert re.match(r"^dstack-project1-my-instance-[a-z0-9]{8}$", name) + + +class TestGenerateUniqueGatewayInstanceName: + def test_generates_name(self): + configuration = get_gateway_compute_configuration( + project_name="project1", instance_name="my-gateway" + ) + name = generate_unique_gateway_instance_name(configuration, 60) + assert re.match(r"^dstack-project1-my-gateway-[a-z0-9]{8}$", name) + + +class TestGenerateUniqueVolumeName: + def test_generates_name(self): + volume = 
get_volume(project_name="project1", name="my-volume") + name = generate_unique_volume_name(volume, 60) + assert re.match(r"^dstack-project1-my-volume-[a-z0-9]{8}$", name) + + +class TestGenerateUniqueBackendName: + def test_generates_name_with_project(self): + name = generate_unique_backend_name("instance", "project", 60) + assert re.match(r"^dstack-project-instance-[a-z0-9]{8}$", name) + + def test_generates_name_without_project(self): + name = generate_unique_backend_name("instance", None, 60) + assert re.match(r"^dstack-instance-[a-z0-9]{8}$", name) + + def test_truncates_long_names(self): + name = generate_unique_backend_name("a" * 100, "project", 30) + assert re.match(r"^dstack-project-aaaaaa-[a-z0-9]{8}$", name) + + def test_validates_project_name(self): + name = generate_unique_backend_name("instance", "invalid_project!@", 60) + assert re.match(r"^dstack-instance-[a-z0-9]{8}$", name) + + +class TestNormalizeArch: + @pytest.mark.parametrize( + "arch", [None, "", "X86", "x86_64", "AMD64", gpuhunt.CPUArchitecture.X86] + ) + def test_amd64(self, arch: Optional[str]): + assert normalize_arch(arch) is GoArchType.AMD64 + + @pytest.mark.parametrize("arch", ["arm", "ARM64", "AArch64", gpuhunt.CPUArchitecture.ARM]) + def test_arm64(self, arch: str): + assert normalize_arch(arch) is GoArchType.ARM64 + + @pytest.mark.parametrize("arch", ["IA32", "i686", "ARM32", "aarch32"]) + def test_32bit_not_supported(self, arch: str): + with pytest.raises(ValueError, match="32-bit architectures are not supported"): + normalize_arch(arch) + + def test_unknown_arch(self): + with pytest.raises(ValueError, match="Unsupported architecture: MIPS"): + normalize_arch("MIPS") diff --git a/src/tests/_internal/core/backends/base/test_profile_options.py b/src/tests/_internal/core/backends/base/test_profile_options.py new file mode 100644 index 0000000000..635cae4d55 --- /dev/null +++ b/src/tests/_internal/core/backends/base/test_profile_options.py @@ -0,0 +1,18 @@ +from 
dstack._internal.core.backends.base.profile_options import get_backend_profile_options +from dstack._internal.core.backends.vastai.profile_options import ( + VastAIProfileOptions, +) + + +class TestGetBackendProfileOptions: + def test_returns_none_for_empty_list(self): + assert get_backend_profile_options([], VastAIProfileOptions) is None + + def test_returns_none_for_none(self): + assert get_backend_profile_options(None, VastAIProfileOptions) is None + + def test_returns_matching_option(self): + opts = [VastAIProfileOptions(min_score=500)] + result = get_backend_profile_options(opts, VastAIProfileOptions) + assert isinstance(result, VastAIProfileOptions) + assert result.min_score == 500 diff --git a/src/tests/_internal/core/backends/cloudrift/__init__.py b/src/tests/_internal/core/backends/cloudrift/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/cloudrift/test_configurator.py b/src/tests/_internal/core/backends/cloudrift/test_configurator.py new file mode 100644 index 0000000000..89145279fd --- /dev/null +++ b/src/tests/_internal/core/backends/cloudrift/test_configurator.py @@ -0,0 +1,34 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.cloudrift.configurator import ( + CloudRiftConfigurator, +) +from dstack._internal.core.backends.cloudrift.models import ( + CloudRiftBackendConfigWithCreds, + CloudRiftCreds, +) +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestCloudRiftConfigurator: + def test_validate_config_valid(self): + config = CloudRiftBackendConfigWithCreds(creds=CloudRiftCreds(api_key="valid")) + with patch( + "dstack._internal.core.backends.cloudrift.api_client.RiftClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + CloudRiftConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid(self): + config = 
CloudRiftBackendConfigWithCreds(creds=CloudRiftCreds(api_key="invalid")) + with ( + patch( + "dstack._internal.core.backends.cloudrift.api_client.RiftClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + validate_mock.return_value = False + CloudRiftConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/backends/cudo/__init__.py b/src/tests/_internal/core/backends/cudo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/cudo/test_configurator.py b/src/tests/_internal/core/backends/cudo/test_configurator.py new file mode 100644 index 0000000000..c36b3df0a5 --- /dev/null +++ b/src/tests/_internal/core/backends/cudo/test_configurator.py @@ -0,0 +1,37 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.cudo.configurator import CudoConfigurator +from dstack._internal.core.backends.cudo.models import CudoBackendConfigWithCreds, CudoCreds +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestCudoConfigurator: + def test_validate_config_valid(self): + config = CudoBackendConfigWithCreds( + creds=CudoCreds(api_key="valid"), + project_id="project1", + regions=["no-luster-1"], + ) + with patch( + "dstack._internal.core.backends.cudo.api_client.CudoApiClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + CudoConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = CudoBackendConfigWithCreds( + creds=CudoCreds(api_key="invalid"), + project_id="project1", + regions=["no-luster-1"], + ) + with ( + patch( + "dstack._internal.core.backends.cudo.api_client.CudoApiClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + 
validate_mock.return_value = False + CudoConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/backends/gcp/test_configurator.py b/src/tests/_internal/core/backends/gcp/test_configurator.py new file mode 100644 index 0000000000..635b11f2cf --- /dev/null +++ b/src/tests/_internal/core/backends/gcp/test_configurator.py @@ -0,0 +1,42 @@ +from unittest.mock import Mock, patch + +import pytest + +from dstack._internal.core.backends.gcp.configurator import GCPConfigurator +from dstack._internal.core.backends.gcp.models import ( + GCPBackendConfigWithCreds, + GCPServiceAccountCreds, +) +from dstack._internal.core.errors import ( + BackendAuthError, + BackendInvalidCredentialsError, +) + + +class TestGCPConfigurator: + def test_validate_config_valid(self): + config = GCPBackendConfigWithCreds( + creds=GCPServiceAccountCreds(data="valid", filename="-"), + project_id="valid-project", + regions=["us-west1"], + ) + with ( + patch("dstack._internal.core.backends.gcp.auth.authenticate") as authenticate_mock, + patch("dstack._internal.core.backends.gcp.resources.check_vpc"), + ): + authenticate_mock.return_value = Mock(), Mock() + GCPConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = GCPBackendConfigWithCreds( + creds=GCPServiceAccountCreds(data="invalid", filename="-"), + project_id="invalid-project", + regions=["us-west1"], + ) + with ( + patch("dstack._internal.core.backends.gcp.auth.authenticate") as authenticate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + authenticate_mock.side_effect = BackendAuthError() + GCPConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "data"]] diff --git a/src/tests/_internal/core/backends/gcp/test_resources.py b/src/tests/_internal/core/backends/gcp/test_resources.py index 
2318ea84b1..47fe52438d 100644 --- a/src/tests/_internal/core/backends/gcp/test_resources.py +++ b/src/tests/_internal/core/backends/gcp/test_resources.py @@ -1,6 +1,24 @@ import pytest from dstack._internal.core.backends.gcp import resources as gcp_resources +from dstack._internal.core.errors import BackendError + + +class TestValidateLabels: + def test_validate_valid_labels(self): + labels = { + "env": "production", + "project": "gcp-label-validator", + } + assert gcp_resources.validate_labels(labels) is None + + def test_validate_invalid_labels(self): + labels = { + "InvalidName": "validvalue", + "valid-name": "invalid_value!", + } + with pytest.raises(BackendError, match="Invalid resource label"): + gcp_resources.validate_labels(labels) class TestIsValidResourceName: @@ -12,14 +30,13 @@ class TestIsValidResourceName: "a" * 64, "-startswithdash", "1startswithdigit", - "asd_asd", "Uppercase", ], ) def test_invalid_name(self, name): assert not gcp_resources.is_valid_resource_name(name) - @pytest.mark.parametrize("name", ["a", "some-name-with-dashes-123"]) + @pytest.mark.parametrize("name", ["a", "some-name-with-dashes-123", "asd_asd"]) def test_valid_name(self, name): assert gcp_resources.is_valid_resource_name(name) @@ -29,7 +46,6 @@ class TestIsValidLabelValue: "name", [ "a" * 64, - "asd_asd", "Uppercase", ], ) diff --git a/src/tests/_internal/core/backends/jarvislabs/__init__.py b/src/tests/_internal/core/backends/jarvislabs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/jarvislabs/test_api_client.py b/src/tests/_internal/core/backends/jarvislabs/test_api_client.py new file mode 100644 index 0000000000..9fdf487242 --- /dev/null +++ b/src/tests/_internal/core/backends/jarvislabs/test_api_client.py @@ -0,0 +1,257 @@ +import pytest +import requests +from gpuhunt.providers.jarvislabs import API_URL + +from dstack._internal.core.backends.jarvislabs.api_client import ( + JarvisLabsAPIClient, + is_cpu_vm, +) 
+from dstack._internal.core.errors import BackendError, BackendInvalidCredentialsError + + +def test_validate_api_key_returns_false_on_unauthorized(requests_mock): + requests_mock.get(f"{API_URL}/users/user_info", status_code=401) + + assert JarvisLabsAPIClient("bad").validate_api_key() is False + + +def test_get_user_info_raises_invalid_credentials_on_forbidden(requests_mock): + requests_mock.get(f"{API_URL}/users/user_info", status_code=403) + + with pytest.raises(BackendInvalidCredentialsError): + JarvisLabsAPIClient("bad").get_user_info() + + +def test_make_request_wraps_request_errors(requests_mock): + requests_mock.get( + f"{API_URL}/users/user_info", + exc=requests.ConnectTimeout("timed out"), + ) + + with pytest.raises(BackendError, match="JarvisLabs request failed"): + JarvisLabsAPIClient("token").get_user_info() + + +def test_get_user_info_rejects_non_json_success_response(requests_mock): + requests_mock.get(f"{API_URL}/users/user_info", text="ok") + + with pytest.raises(BackendError, match="Unexpected non-JSON JarvisLabs response"): + JarvisLabsAPIClient("token").get_user_info() + + +def test_add_ssh_key_if_needed_reuses_existing_key(requests_mock): + public_key = "ssh-rsa AAAA test-comment" + requests_mock.get( + f"{API_URL}/ssh/", + json=[{"ssh_key": "ssh-rsa AAAA another-comment", "key_name": "existing"}], + ) + + JarvisLabsAPIClient("token").add_ssh_key_if_needed(public_key) + + assert requests_mock.call_count == 1 + + +def test_add_ssh_key_if_needed_adds_missing_key(requests_mock): + public_key = "ssh-rsa AAAA test-comment" + requests_mock.get(f"{API_URL}/ssh/", json=[]) + requests_mock.post(f"{API_URL}/ssh/", json={"success": True}) + + JarvisLabsAPIClient("token").add_ssh_key_if_needed(public_key) + + assert requests_mock.last_request.json() == { + "ssh_key": public_key, + "key_name": "dstack-36deb09319b2204c", + } + + +def test_create_ssh_key_adds_key_and_returns_created_key_id(requests_mock): + public_key = "ssh-rsa AAAA test-comment" + 
requests_mock.post(f"{API_URL}/ssh/", json={"success": True}) + requests_mock.get( + f"{API_URL}/ssh/", + json=[ + { + "ssh_key": "ssh-rsa AAAA another-comment", + "key_name": "dstack-test-0.key", + "key_id": "key-id", + } + ], + ) + + key_id = JarvisLabsAPIClient("token").create_ssh_key( + public_key=public_key, + key_name="dstack-test-0.key", + ) + + assert key_id == "key-id" + assert requests_mock.request_history[0].json() == { + "ssh_key": public_key, + "key_name": "dstack-test-0.key", + } + + +def test_create_ssh_key_raises_if_created_key_id_is_missing(requests_mock): + requests_mock.post(f"{API_URL}/ssh/", json={"success": True}) + requests_mock.get(f"{API_URL}/ssh/", json=[]) + + with pytest.raises(BackendError, match="Failed to find created JarvisLabs SSH key"): + JarvisLabsAPIClient("token").create_ssh_key( + public_key="ssh-rsa AAAA test-comment", + key_name="dstack-test-0.key", + ) + + +def test_delete_ssh_key_deletes_key(requests_mock): + requests_mock.delete(f"{API_URL}/ssh/key-id", json={"success": True}) + + JarvisLabsAPIClient("token").delete_ssh_key("key-id") + + assert requests_mock.last_request.method == "DELETE" + + +def test_delete_ssh_key_ignores_missing_key(requests_mock): + requests_mock.delete(f"{API_URL}/ssh/key-id", status_code=404, json={"detail": "not found"}) + + JarvisLabsAPIClient("token").delete_ssh_key("key-id") + + +def test_create_gpu_vm_posts_to_regional_vm_endpoint(requests_mock): + requests_mock.post( + "https://backendn.jarvislabs.net/templates/vm/create", + json={"machine_id": 123}, + ) + + machine_id = JarvisLabsAPIClient("token").create_gpu_vm( + gpu_type="A100-80GB", + num_gpus=1, + is_spot=False, + storage=250, + region="india-noida-01", + name="dstack-test", + ) + + assert machine_id == "123" + assert requests_mock.last_request.headers["Authorization"] == "Bearer token" + assert requests_mock.last_request.json() == { + "gpu_type": "A100-80GB", + "num_gpus": 1, + "hdd": 250, + "region": 
"india-noida-01", + "name": "dstack-test", + "is_spot": False, + "duration": "hour", + "disk_type": "ssd", + "http_ports": "", + "script_id": None, + "script_args": "", + "fs_id": None, + "arguments": "", + } + + +def test_create_gpu_vm_posts_chennai_region_to_chennai_endpoint(requests_mock): + requests_mock.post( + "https://backendc.jarvislabs.net/templates/vm/create", + json={"machine_id": 123}, + ) + + JarvisLabsAPIClient("token").create_gpu_vm( + gpu_type="RTX-PRO6000", + num_gpus=1, + is_spot=False, + storage=100, + region="india-chennai-01", + name="dstack-test", + ) + + assert requests_mock.last_request.json()["gpu_type"] == "RTX-PRO6000" + assert requests_mock.last_request.json()["region"] == "india-chennai-01" + + +def test_create_gpu_vm_rejects_unsupported_region(requests_mock): + with pytest.raises(BackendError, match="Unsupported JarvisLabs region"): + JarvisLabsAPIClient("token").create_gpu_vm( + gpu_type="H100", + num_gpus=1, + is_spot=False, + storage=100, + region="unknown-region", + name="dstack-test", + ) + + assert requests_mock.call_count == 0 + + +def test_create_gpu_vm_sets_spot_flag(requests_mock): + requests_mock.post( + "https://backendn.jarvislabs.net/templates/vm/create", + json={"machine_id": 123}, + ) + + JarvisLabsAPIClient("token").create_gpu_vm( + gpu_type="L4", + num_gpus=1, + is_spot=True, + storage=100, + region="india-noida-01", + name="dstack-spot", + ) + + assert requests_mock.last_request.json()["is_spot"] is True + + +def test_create_cpu_vm_posts_to_regional_cpu_vm_endpoint(requests_mock): + requests_mock.post( + "https://backendn.jarvislabs.net/templates/vm/cpu/create", + json={"machine_id": 456}, + ) + + machine_id = JarvisLabsAPIClient("token").create_cpu_vm( + vcpus=4, + ram_gb=16, + storage=100, + region="india-noida-01", + name="dstack-cpu", + ) + + assert machine_id == "456" + assert requests_mock.last_request.json() == { + "num_cpus": 1, 
+ "vcpus": 4, + "ram_gb": 16, + "hdd": 100, + "region": "india-noida-01", + "name": "dstack-cpu", + "duration": "hour", + "disk_type": "ssd", + } + + +def test_destroy_instance_uses_cpu_vm_endpoint_for_cpu_vm(requests_mock): + requests_mock.get( + f"{API_URL}/users/fetch/456", + json={ + "success": True, + "instance": { + "machine_id": 456, + "template": "vm", + "gpu_type": "CPU", + "region": "india-noida-01", + }, + }, + ) + requests_mock.post( + "https://backendn.jarvislabs.net/templates/vm/cpu/destroy", + json={"success": True}, + ) + + JarvisLabsAPIClient("token").destroy_instance(machine_id="456", region="india-noida-01") + + assert requests_mock.last_request.qs == {"machine_id": ["456"]} + + +def test_is_cpu_vm_requires_vm_template_and_cpu_gpu_type(): + assert is_cpu_vm({"template": "vm", "gpu_type": "CPU"}) + assert is_cpu_vm({"framework": "VM", "gpu_type": "CPU"}) + assert not is_cpu_vm({"template": "pytorch", "gpu_type": "CPU"}) + assert not is_cpu_vm({"template": "vm", "gpu_type": "H100"}) diff --git a/src/tests/_internal/core/backends/jarvislabs/test_compute.py b/src/tests/_internal/core/backends/jarvislabs/test_compute.py new file mode 100644 index 0000000000..6ee60dfc25 --- /dev/null +++ b/src/tests/_internal/core/backends/jarvislabs/test_compute.py @@ -0,0 +1,481 @@ +from unittest.mock import MagicMock, call, patch + +import pytest + +from dstack._internal.core.backends.jarvislabs.compute import ( + CONFIGURABLE_DISK_SIZE, + JarvisLabsCompute, + JarvisLabsInstanceBackendData, + _get_disk_size_gb, + _get_jarvislabs_gpu_type, + _get_ssh_username, +) +from dstack._internal.core.backends.jarvislabs.models import JarvisLabsConfig, JarvisLabsCreds +from dstack._internal.core.errors import BackendError, NoCapacityError, ProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + InstanceAvailability, + InstanceConfiguration, + 
InstanceOffer, + InstanceOfferWithAvailability, + InstanceType, + Resources, + SSHKey, +) +from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.runs import JobProvisioningData, Requirements + + +def _compute() -> JarvisLabsCompute: + compute = JarvisLabsCompute( + JarvisLabsConfig(creds=JarvisLabsCreds(api_key="test"), regions=["india-noida-01"]) + ) + compute.api_client = MagicMock() + compute.api_client.create_ssh_key.return_value = "ssh-key-id" + compute.api_client.get_instance_status.return_value = {"status": "Running"} + return compute + + +def _instance_config(ssh_keys: list[SSHKey] | None = None) -> InstanceConfiguration: + return InstanceConfiguration( + project_name="test-project", + instance_name="jarvislabs-test", + user="test-user", + ssh_keys=ssh_keys or [SSHKey(public="ssh-rsa AAAA test")], + ) + + +def _gpu_offer( + *, + gpu_name: str = "A100", + gpu_memory_mib: int = 80 * 1024, + disk_size_mib: int = 250 * 1024, + spot: bool = False, + backend_data: dict | None = None, +) -> InstanceOfferWithAvailability: + return InstanceOfferWithAvailability( + backend=BackendType.JARVISLABS, + instance=InstanceType( + name=f"{gpu_name}-1x", + resources=Resources( + cpus=28, + memory_mib=112 * 1024, + gpus=[Gpu(name=gpu_name, memory_mib=gpu_memory_mib)], + spot=spot, + disk=Disk(size_mib=disk_size_mib), + ), + ), + region="india-noida-01", + price=1.49, + backend_data=backend_data or {}, + availability=InstanceAvailability.AVAILABLE, + ) + + +def _cpu_offer(*, disk_size_mib: int = 10 * 1024) -> InstanceOfferWithAvailability: + return InstanceOfferWithAvailability( + backend=BackendType.JARVISLABS, + instance=InstanceType( + name="cpu-4x16", + resources=Resources( + cpus=4, + memory_mib=16 * 1024, + gpus=[], + spot=False, + disk=Disk(size_mib=disk_size_mib), + ), + ), + region="india-noida-01", + price=0.0992, + availability=InstanceAvailability.AVAILABLE, + ) + + +def _cpu_catalog_offer(*, disk_size_mib: int = 10 * 
1024) -> InstanceOffer: + offer = _cpu_offer(disk_size_mib=disk_size_mib) + return InstanceOffer( + backend=offer.backend, + instance=offer.instance, + region=offer.region, + price=offer.price, + ) + + +def test_get_jarvislabs_gpu_type_uses_backend_data_or_gpu_name(): + assert ( + _get_jarvislabs_gpu_type(_gpu_offer(backend_data={"gpu_type": "A100-80GB"})) == "A100-80GB" + ) + assert _get_jarvislabs_gpu_type(_gpu_offer()) == "A100" + assert _get_jarvislabs_gpu_type(_gpu_offer(gpu_name="H100")) == "H100" + assert ( + _get_jarvislabs_gpu_type( + _gpu_offer( + gpu_name="RTXPRO6000", + gpu_memory_mib=96 * 1024, + backend_data={"gpu_type": "RTX-PRO6000"}, + ) + ) + == "RTX-PRO6000" + ) + + +def test_get_jarvislabs_gpu_type_prefers_backend_data(): + offer = _gpu_offer( + gpu_name="RTXPRO6000", + gpu_memory_mib=96 * 1024, + backend_data={"gpu_type": "RTX PRO 6000"}, + ) + + assert _get_jarvislabs_gpu_type(offer) == "RTX PRO 6000" + + +def test_get_disk_size_gb_clamps_to_jarvislabs_vm_minimum(): + assert _get_disk_size_gb(_cpu_offer(disk_size_mib=10 * 1024)) == 100 + assert _get_disk_size_gb(_gpu_offer(disk_size_mib=250 * 1024)) == 250 + + +def test_get_all_offers_uses_configurable_disk_size(): + compute = _compute() + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.get_catalog_offers", + return_value=[_cpu_catalog_offer()], + ) as m: + offers = compute.get_all_offers_with_availability() + + assert len(offers) == 1 + assert offers[0].availability == InstanceAvailability.AVAILABLE + m.assert_called_once_with( + backend=BackendType.JARVISLABS, + locations=["india-noida-01"], + catalog=compute._catalog, + configurable_disk_size=CONFIGURABLE_DISK_SIZE, + ) + + +def test_get_offers_reuses_all_offers_cache_and_modifies_disk_size(): + compute = _compute() + compute.get_all_offers_with_availability = MagicMock( + return_value=[_cpu_offer(disk_size_mib=100 * 1024)] + ) + + offers_250gb = 
list(compute.get_offers(Requirements(resources=ResourcesSpec(disk="250GB")))) + offers_300gb = list(compute.get_offers(Requirements(resources=ResourcesSpec(disk="300GB")))) + + assert len(offers_250gb) == 1 + assert offers_250gb[0].instance.resources.disk.size_mib == 250 * 1024 + assert len(offers_300gb) == 1 + assert offers_300gb[0].instance.resources.disk.size_mib == 300 * 1024 + compute.get_all_offers_with_availability.assert_called_once() + + +def test_create_gpu_instance_creates_ssh_key_and_gpu_vm(): + compute = _compute() + compute.api_client.create_gpu_vm.return_value = "123" + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-test", + ): + provisioning_data = compute.create_instance( + _gpu_offer(backend_data={"gpu_type": "A100-80GB"}), _instance_config(), None + ) + + compute.api_client.create_ssh_key.assert_called_once_with( + public_key="ssh-rsa AAAA test", + key_name="dstack-test-0.key", + ) + compute.api_client.create_gpu_vm.assert_called_once_with( + gpu_type="A100-80GB", + num_gpus=1, + is_spot=False, + storage=250, + region="india-noida-01", + name="dstack-test", + ) + assert provisioning_data.instance_id == "123" + assert provisioning_data.username == "ubuntu" + assert provisioning_data.dockerized is True + backend_data = JarvisLabsInstanceBackendData.load(provisioning_data.backend_data) + assert backend_data.ssh_key_ids == ["ssh-key-id"] + compute.api_client.get_instance_status.assert_not_called() + + +def test_create_gpu_instance_passes_spot_flag(): + compute = _compute() + compute.api_client.create_gpu_vm.return_value = "123" + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-test", + ): + compute.create_instance( + _gpu_offer(spot=True, backend_data={"gpu_type": "A100-80GB"}), + _instance_config(), + None, + ) + + compute.api_client.create_gpu_vm.assert_called_once_with( + gpu_type="A100-80GB", + 
num_gpus=1, + is_spot=True, + storage=250, + region="india-noida-01", + name="dstack-test", + ) + + +def test_create_rtx_pro_6000_instance_uses_jarvislabs_gpu_type_from_backend_data(): + compute = _compute() + compute.api_client.create_gpu_vm.return_value = "123" + offer = _gpu_offer( + gpu_name="RTXPRO6000", + gpu_memory_mib=96 * 1024, + backend_data={"gpu_type": "RTX-PRO6000"}, + ) + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-test", + ): + compute.create_instance(offer, _instance_config(), None) + + compute.api_client.create_gpu_vm.assert_called_once_with( + gpu_type="RTX-PRO6000", + num_gpus=1, + is_spot=False, + storage=250, + region="india-noida-01", + name="dstack-test", + ) + + +def test_create_cpu_instance_creates_ssh_key_and_cpu_vm(): + compute = _compute() + compute.api_client.create_cpu_vm.return_value = "456" + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-cpu", + ): + provisioning_data = compute.create_instance(_cpu_offer(), _instance_config(), None) + + compute.api_client.create_ssh_key.assert_called_once_with( + public_key="ssh-rsa AAAA test", + key_name="dstack-cpu-0.key", + ) + compute.api_client.create_cpu_vm.assert_called_once_with( + vcpus=4, + ram_gb=16, + storage=100, + region="india-noida-01", + name="dstack-cpu", + ) + assert provisioning_data.instance_id == "456" + backend_data = JarvisLabsInstanceBackendData.load(provisioning_data.backend_data) + assert backend_data.ssh_key_ids == ["ssh-key-id"] + + +def test_update_provisioning_data_sets_hostname_and_starts_runner(): + compute = _compute() + compute.api_client.get_instance.return_value = { + "machine_id": 123, + "status": "Running", + "public_ip": "203.0.113.10", + "ssh_str": "ssh -o StrictHostKeyChecking=no ubuntu@203.0.113.10", + } + provisioning_data = JobProvisioningData( + backend=BackendType.JARVISLABS, + 
instance_type=_gpu_offer().instance, + instance_id="123", + region="india-noida-01", + price=1.49, + username="ubuntu", + ssh_port=22, + dockerized=True, + ) + + with patch( + "dstack._internal.core.backends.jarvislabs.compute._start_runner", return_value=True + ) as m: + compute.update_provisioning_data( + provisioning_data, + project_ssh_public_key="ssh-rsa AAAA test", + project_ssh_private_key="private-key", + ) + + assert provisioning_data.hostname == "203.0.113.10" + assert provisioning_data.username == "ubuntu" + m.assert_called_once_with( + hostname="203.0.113.10", + username="ubuntu", + project_ssh_private_key="private-key", + arch=None, + ) + + +def test_update_provisioning_data_does_not_set_hostname_until_runner_starts(): + compute = _compute() + compute.api_client.get_instance.return_value = { + "machine_id": 123, + "status": "Running", + "public_ip": "203.0.113.10", + "ssh_str": "ssh -o StrictHostKeyChecking=no ubuntu@203.0.113.10", + } + provisioning_data = JobProvisioningData( + backend=BackendType.JARVISLABS, + instance_type=_gpu_offer().instance, + instance_id="123", + region="india-noida-01", + price=1.49, + username="ubuntu", + ssh_port=22, + dockerized=True, + ) + + with patch( + "dstack._internal.core.backends.jarvislabs.compute._start_runner", return_value=False + ): + compute.update_provisioning_data( + provisioning_data, + project_ssh_public_key="ssh-rsa AAAA test", + project_ssh_private_key="private-key", + ) + + assert provisioning_data.hostname is None + + +def test_get_ssh_username_parses_jarvislabs_ssh_command(): + assert ( + _get_ssh_username({"ssh_str": "ssh -o StrictHostKeyChecking=no ubuntu@203.0.113.10"}) + == "ubuntu" + ) + assert _get_ssh_username({"ssh_str": "ssh -p 22 root@203.0.113.10"}) == "root" + assert _get_ssh_username({}) == "ubuntu" + + +def test_terminate_instance_delegates_to_api_client_without_backend_data(): + compute = _compute() + + compute.terminate_instance("123", "india-noida-01") + + 
compute.api_client.destroy_instance.assert_called_once_with( + machine_id="123", + region="india-noida-01", + ) + compute.api_client.delete_ssh_key.assert_not_called() + + +def test_terminate_instance_deletes_created_ssh_keys(): + compute = _compute() + backend_data = JarvisLabsInstanceBackendData( + ssh_key_ids=["ssh-key-id-1", "ssh-key-id-2"] + ).json() + + compute.terminate_instance("123", "india-noida-01", backend_data) + + compute.api_client.destroy_instance.assert_called_once_with( + machine_id="123", + region="india-noida-01", + ) + assert compute.api_client.delete_ssh_key.call_args_list == [ + call("ssh-key-id-1"), + call("ssh-key-id-2"), + ] + + +def test_create_instance_cleans_up_ssh_key_on_create_failure(): + compute = _compute() + compute.api_client.create_gpu_vm.side_effect = NoCapacityError( + "L4 not available at this moment, please try again later" + ) + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-test", + ): + with pytest.raises(NoCapacityError): + compute.create_instance(_gpu_offer(spot=True), _instance_config(), None) + + compute.api_client.destroy_instance.assert_not_called() + compute.api_client.delete_ssh_key.assert_called_once_with("ssh-key-id") + + +def test_create_instance_cleans_up_created_ssh_key_if_later_ssh_key_create_fails(): + compute = _compute() + compute.api_client.create_ssh_key.side_effect = [ + "ssh-key-id-1", + BackendError("ssh create failed"), + ] + instance_config = _instance_config( + ssh_keys=[ + SSHKey(public="ssh-rsa AAAA test-1"), + SSHKey(public="ssh-rsa BBBB test-2"), + ] + ) + + with patch( + "dstack._internal.core.backends.jarvislabs.compute.generate_unique_instance_name", + return_value="dstack-test", + ): + with pytest.raises(BackendError, match="ssh create failed"): + compute.create_instance(_gpu_offer(), instance_config, None) + + compute.api_client.create_gpu_vm.assert_not_called() + 
compute.api_client.delete_ssh_key.assert_called_once_with("ssh-key-id-1") + + +def test_update_provisioning_data_raises_provisioning_error_from_failed_capacity_status(): + compute = _compute() + compute.api_client.get_instance.return_value = None + compute.api_client.get_instance_status.return_value = { + "status": "Failed", + "error": "L4 not available at this moment, please try again later", + "code": 404, + } + provisioning_data = JobProvisioningData( + backend=BackendType.JARVISLABS, + instance_type=_gpu_offer().instance, + instance_id="123", + region="india-noida-01", + price=1.49, + username="ubuntu", + ssh_port=22, + dockerized=True, + ) + + with pytest.raises(ProvisioningError): + compute.update_provisioning_data( + provisioning_data, + project_ssh_public_key="ssh-rsa AAAA test", + project_ssh_private_key="private-key", + ) + + +def test_update_provisioning_data_raises_provisioning_error_from_failed_status(): + compute = _compute() + compute.api_client.get_instance.return_value = None + compute.api_client.get_instance_status.return_value = { + "status": "Failed", + "error": "image setup failed", + "code": 500, + } + provisioning_data = JobProvisioningData( + backend=BackendType.JARVISLABS, + instance_type=_gpu_offer().instance, + instance_id="123", + region="india-noida-01", + price=1.49, + username="ubuntu", + ssh_port=22, + dockerized=True, + ) + + with pytest.raises(ProvisioningError): + compute.update_provisioning_data( + provisioning_data, + project_ssh_public_key="ssh-rsa AAAA test", + project_ssh_private_key="private-key", + ) diff --git a/src/tests/_internal/core/backends/jarvislabs/test_configurator.py b/src/tests/_internal/core/backends/jarvislabs/test_configurator.py new file mode 100644 index 0000000000..1da92e2600 --- /dev/null +++ b/src/tests/_internal/core/backends/jarvislabs/test_configurator.py @@ -0,0 +1,54 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.jarvislabs.configurator import 
JarvisLabsConfigurator +from dstack._internal.core.backends.jarvislabs.models import ( + JarvisLabsBackendConfigWithCreds, + JarvisLabsCreds, +) +from dstack._internal.core.errors import BackendInvalidCredentialsError, ServerClientError + + +class TestJarvisLabsConfigurator: + def test_validate_config_valid(self): + config = JarvisLabsBackendConfigWithCreds( + creds=JarvisLabsCreds(api_key="valid"), + regions=["india-noida-01"], + ) + with patch( + "dstack._internal.core.backends.jarvislabs.api_client.JarvisLabsAPIClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + JarvisLabsConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = JarvisLabsBackendConfigWithCreds( + creds=JarvisLabsCreds(api_key="invalid"), + regions=["india-noida-01"], + ) + with ( + patch( + "dstack._internal.core.backends.jarvislabs.api_client.JarvisLabsAPIClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + validate_mock.return_value = False + JarvisLabsConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] + + def test_validate_config_unsupported_region(self): + config = JarvisLabsBackendConfigWithCreds( + creds=JarvisLabsCreds(api_key="valid"), + regions=["unknown-region"], + ) + with ( + patch( + "dstack._internal.core.backends.jarvislabs.api_client.JarvisLabsAPIClient.validate_api_key" + ) as validate_mock, + pytest.raises(ServerClientError) as exc_info, + ): + validate_mock.return_value = True + JarvisLabsConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["regions"]] + assert "Unsupported JarvisLabs regions" in exc_info.value.msg diff --git a/src/tests/_internal/core/backends/kubernetes/test_compute.py b/src/tests/_internal/core/backends/kubernetes/test_compute.py deleted file mode 100644 index 
3e6d1d8bc8..0000000000 --- a/src/tests/_internal/core/backends/kubernetes/test_compute.py +++ /dev/null @@ -1,24 +0,0 @@ -from dstack._internal.core.backends.kubernetes.compute import _get_gpus_from_node_labels -from dstack._internal.core.models.instances import Gpu - - -class TestGetGPUsFromNodeLabels: - def test_returns_no_gpus_if_no_labels(self): - assert _get_gpus_from_node_labels({}) == [] - - def test_returns_no_gpus_if_missing_labels(self): - assert _get_gpus_from_node_labels({"nvidia.com/gpu.count": 1}) == [] - - def test_returns_correct_memory_for_different_A100(self): - assert _get_gpus_from_node_labels( - { - "nvidia.com/gpu.count": 1, - "nvidia.com/gpu.product": "A100-SXM4-40GB", - } - ) == [Gpu(name="A100", memory_mib=40 * 1024)] - assert _get_gpus_from_node_labels( - { - "nvidia.com/gpu.count": 1, - "nvidia.com/gpu.product": "A100-SXM4-80GB", - } - ) == [Gpu(name="A100", memory_mib=80 * 1024)] diff --git a/src/tests/_internal/core/backends/kubernetes/test_configurator.py b/src/tests/_internal/core/backends/kubernetes/test_configurator.py new file mode 100644 index 0000000000..36f32a08b9 --- /dev/null +++ b/src/tests/_internal/core/backends/kubernetes/test_configurator.py @@ -0,0 +1,95 @@ +from unittest.mock import Mock + +import pytest + +from dstack._internal.core.backends.kubernetes.configurator import KubernetesConfigurator +from dstack._internal.core.backends.kubernetes.models import ( + KubeconfigConfig, + KubernetesBackendConfigWithCreds, + KubernetesContextConfig, + KubernetesProxyJumpConfig, +) +from dstack._internal.core.errors import ServerClientError + + +@pytest.fixture +def get_clusters_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=[]) + monkeypatch.setattr( + "dstack._internal.core.backends.kubernetes.configurator.get_clusters_from_backend_config", + mock, + ) + return mock + + +class TestKubernetesConfigurator: + @pytest.mark.usefixtures("get_clusters_mock") + def 
test_validate_config_valid_current_context(self): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + proxy_jump=KubernetesProxyJumpConfig(hostname=None, port=30022), + namespace="ns", + ) + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + + @pytest.mark.usefixtures("get_clusters_mock") + def test_validate_config_valid_explicit_contexts(self): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + contexts=["ctx"], + ) + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + + @pytest.mark.usefixtures("get_clusters_mock") + def test_validate_config_contexts_proxy_jump_mutually_exclusive(self): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + proxy_jump=KubernetesProxyJumpConfig(hostname=None, port=30022), + contexts=["ctx"], + ) + with pytest.raises(ServerClientError, match="proxy_jump must not be set"): + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + + @pytest.mark.usefixtures("get_clusters_mock") + def test_validate_config_contexts_namespace_mutually_exclusive(self): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + namespace="ns", + contexts=["ctx"], + ) + with pytest.raises(ServerClientError, match="namespace must not be set"): + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + + @pytest.mark.usefixtures("get_clusters_mock") + def test_validate_config_duplicate_contexts(self): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + contexts=[ + "ctx-3", + KubernetesContextConfig(name="ctx-4"), + "ctx-1", + KubernetesContextConfig(name="ctx-1"), + "ctx-2", + KubernetesContextConfig(name="ctx-3"), + ], + ) + with pytest.raises(ServerClientError, match="duplicate contexts: ctx-1, 
ctx-3"): + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_cluster_check_failed( + self, monkeypatch: pytest.MonkeyPatch, get_clusters_mock: Mock + ): + config = KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data="mocked", filename="-"), + contexts=["ctx"], + ) + + monkeypatch.setattr( + "dstack._internal.core.backends.kubernetes.configurator.check_cluster", + Mock(return_value=False), + ) + cluster_mock = Mock() + get_clusters_mock.return_value = [cluster_mock] + with pytest.raises(ServerClientError, match="Failed to validate cluster") as exc_info: + KubernetesConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["kubeconfig"]] diff --git a/src/tests/_internal/core/backends/kubernetes/test_resources.py b/src/tests/_internal/core/backends/kubernetes/test_resources.py new file mode 100644 index 0000000000..1839a41917 --- /dev/null +++ b/src/tests/_internal/core/backends/kubernetes/test_resources.py @@ -0,0 +1,113 @@ +import logging + +import pytest +from gpuhunt import AcceleratorVendor + +from dstack._internal.core.backends.kubernetes.resources import ( + get_amd_gpu_from_node_labels, + get_nvidia_gpu_from_node_labels, + validate_label_key, + validate_label_value, +) +from dstack._internal.core.models.instances import Gpu + + +class TestGetNvidiaGPUFromNodeLabels: + def test_returns_none_if_no_labels(self): + assert get_nvidia_gpu_from_node_labels({}) is None + + def test_returns_correct_memory_for_different_A100(self): + assert get_nvidia_gpu_from_node_labels( + {"nvidia.com/gpu.product": "A100-SXM4-40GB"} + ) == Gpu(vendor=AcceleratorVendor.NVIDIA, name="A100", memory_mib=40 * 1024) + + assert get_nvidia_gpu_from_node_labels( + {"nvidia.com/gpu.product": "A100-SXM4-80GB"} + ) == Gpu(vendor=AcceleratorVendor.NVIDIA, name="A100", memory_mib=80 * 1024) + + +class TestGetAMDGPUFromNodeLabels: + def test_returns_no_gpus_if_no_labels(self): + 
assert get_amd_gpu_from_node_labels({}) is None + + def test_returns_known_gpu(self): + assert get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.74b5": "4"}) == Gpu( + vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024 + ) + + def test_returns_known_gpu_if_multiple_device_ids_match_the_same_gpu(self): + # 4x AMD Instinct MI300X VF + 4x AMD Instinct MI300X + labels = {"beta.amd.com/gpu.device-id.74b5": "4", "beta.amd.com/gpu.device-id.74a1": "4"} + assert get_amd_gpu_from_node_labels(labels) == Gpu( + vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024 + ) + + def test_returns_none_if_device_id_is_unknown(self, caplog: pytest.LogCaptureFixture): + caplog.set_level(logging.WARNING) + assert get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.ffff": "4"}) is None + assert "Unknown AMD GPU device id: FFFF" in caplog.text + + def test_returns_none_if_multiple_gpu_models(self, caplog: pytest.LogCaptureFixture): + caplog.set_level(logging.WARNING) + # 4x AMD Instinct MI300X VF + 4x AMD Instinct MI325X + labels = {"beta.amd.com/gpu.device-id.74b5": "4", "beta.amd.com/gpu.device-id.74a5": "4"} + assert get_amd_gpu_from_node_labels(labels) is None + assert "Multiple AMD GPU models detected" in caplog.text + + +class TestLabelValidation: + @pytest.mark.parametrize( + "key", + [ + pytest.param("env", id="private"), + pytest.param("k8s.example.com/Valid.Label_Name-1", id="prefixed"), + ], + ) + def test_valid_key(self, key: str): + validate_label_key(key) + + @pytest.mark.parametrize( + ["key", "expected_error"], + [ + pytest.param("app.kubernetes.io//name", "Too many segments", id="too-many-segments"), + pytest.param("/name", "Empty prefix", id="empty-prefix"), + pytest.param("a" * 254 + "/name", "Prefix too long", id="too-long-prefix"), + pytest.param("invalid prefix/name", "Invalid prefix", id="space-in-prefix"), + pytest.param("my_app/name", "Invalid prefix", id="underscore-in-prefix"), + pytest.param("-invalid/name", "Invalid 
prefix", id="leading-dash-in-prefix"), + pytest.param("invalid-/name", "Invalid prefix", id="trailing-dash-in-prefix"), + pytest.param("Invalid/name", "Invalid prefix", id="uppercase-in-prefix"), + pytest.param("", "Empty name", id="empty-name-no-prefix"), + pytest.param("prefix/", "Empty name", id="empty-name-with-prefix"), + pytest.param("a" * 64, "Name too long", id="too-long-name-no-prefix"), + pytest.param("prefix/" + "a" * 64, "Name too long", id="too-long-name-with-prefix"), + pytest.param("-name", "Invalid name", id="leading-dash-in-name"), + pytest.param("name-", "Invalid name", id="trailing-dash-in-name"), + ], + ) + def test_invalid_key(self, key: str, expected_error: str): + with pytest.raises(ValueError, match=expected_error): + validate_label_key(key) + + @pytest.mark.parametrize( + "value", + [ + pytest.param("", id="empty"), + pytest.param("Valid.Label_Value-1", id="non-empty"), + ], + ) + def test_valid_value(self, value: str): + validate_label_value(value) + + @pytest.mark.parametrize( + ["value", "expected_error"], + [ + pytest.param("a" * 64, "Value too long", id="too-long"), + pytest.param("invalid value", "Invalid value", id="space"), + pytest.param("-invalid", "Invalid value", id="leading-dash"), + pytest.param("invalid-", "Invalid value", id="trailing-dash"), + ], + ) + def test_invalid_value(self, value: str, expected_error: str): + with pytest.raises(ValueError, match=expected_error): + validate_label_value(value) diff --git a/src/tests/_internal/core/backends/kubernetes/test_utils.py b/src/tests/_internal/core/backends/kubernetes/test_utils.py new file mode 100644 index 0000000000..f58868217f --- /dev/null +++ b/src/tests/_internal/core/backends/kubernetes/test_utils.py @@ -0,0 +1,318 @@ +import logging +from textwrap import dedent +from typing import Optional, Union + +import pytest + +from dstack._internal.core.backends.kubernetes.models import ( + KubeconfigConfig, + KubernetesBackendConfigWithCreds, + KubernetesContextConfig, + 
KubernetesProxyJumpConfig, +) +from dstack._internal.core.backends.kubernetes.utils import ( + Cluster, + get_clusters_from_backend_config, +) + + +class TestGetClustersFromBackendConfig: + def make_config( + self, + kubeconfig_data: str, + *, + contexts: Optional[list[Union[KubernetesContextConfig, str]]] = None, + namespace: Optional[str] = None, + proxy_jump: Optional[KubernetesProxyJumpConfig] = None, + ) -> KubernetesBackendConfigWithCreds: + return KubernetesBackendConfigWithCreds( + kubeconfig=KubeconfigConfig(data=kubeconfig_data, filename="-"), + contexts=contexts, + namespace=namespace, + proxy_jump=proxy_jump, + ) + + def make_kubeconfig( + self, + *, + current_context: str = "ctx-a", + # (context name, namespace) pairs + contexts: tuple[tuple[str, str], ...] = (("ctx-a", "default"),), + ) -> str: + clusters_yaml = "\n".join( + dedent(f""" + - name: cluster-{name} + cluster: + server: https://{name}.example.com:6443 + """) + for name, _ in contexts + ) + users_yaml = "\n".join( + dedent(f""" + - name: user-{name} + user: + token: token-{name} + """) + for name, _ in contexts + ) + contexts_yaml = "\n".join( + dedent(f""" + - name: {name} + context: + cluster: cluster-{name} + user: user-{name} + namespace: {namespace} + """) + for name, namespace in contexts + ) + return dedent(""" + apiVersion: v1 + kind: Config + current-context: {current_context} + clusters: + {clusters} + contexts: + {contexts} + users: + {users} + """).format( + current_context=current_context, + clusters=clusters_yaml, + contexts=contexts_yaml, + users=users_yaml, + ) + + def test_returns_single_cluster_using_current_context(self): + config = self.make_config( + self.make_kubeconfig( + current_context="ctx-a", + contexts=( + ("ctx-b", "team-b"), + ("ctx-a", "default"), + ), + ), + ) + + clusters = get_clusters_from_backend_config(config) + + assert len(clusters) == 1 + cluster = clusters[0] + assert isinstance(cluster, Cluster) + assert cluster.context_name == "ctx-a" + assert 
cluster.region == "" + assert cluster.namespace == "default" + assert cluster.proxy_jump == KubernetesProxyJumpConfig() + assert cluster.api_client.configuration.host == "https://ctx-a.example.com:6443" # pyright: ignore[reportAttributeAccessIssue] + + def test_single_context_uses_namespace_from_backend_config(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "team-a"),)), + namespace="team-a", + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].namespace == "team-a" + + def test_single_context_defaults_namespace_when_not_set(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "team-a"),)), + namespace=None, + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].namespace == "default" + + def test_single_context_uses_proxy_jump_from_backend_config(self): + proxy_jump = KubernetesProxyJumpConfig(hostname="1.2.3.4", port=2222) + config = self.make_config( + self.make_kubeconfig(), + proxy_jump=proxy_jump, + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].proxy_jump == proxy_jump + + def test_single_context_uses_default_proxy_jump_when_unset(self): + config = self.make_config(self.make_kubeconfig(), proxy_jump=None) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].proxy_jump == KubernetesProxyJumpConfig() + + def test_single_context_warns_on_namespace_mismatch(self, caplog: pytest.LogCaptureFixture): + caplog.set_level(logging.WARNING) + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "kube-ns"),)), + namespace="config-ns", + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].namespace == "config-ns" + assert "Namespace mismatch" in caplog.text + assert "kube-ns" in caplog.text + assert "config-ns" in caplog.text + + def test_single_context_does_not_warn_when_namespace_matches( + self, caplog: 
pytest.LogCaptureFixture + ): + caplog.set_level(logging.WARNING) + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "team-a"),)), + namespace="team-a", + ) + + get_clusters_from_backend_config(config) + + assert "Namespace mismatch" not in caplog.text + + def test_single_context_raises_when_current_context_missing(self): + kubeconfig = dedent(""" + apiVersion: v1 + kind: Config + clusters: + - name: cluster-a + cluster: + server: https://a.example.com:6443 + contexts: + - name: ctx-a + context: + cluster: cluster-a + user: user-a + users: + - name: user-a + user: + token: t + """) + config = self.make_config(kubeconfig) + + with pytest.raises(ValueError, match="current-context is not set"): + get_clusters_from_backend_config(config) + + def test_contexts_as_strings(self): + config = self.make_config( + self.make_kubeconfig( + current_context="ctx-a", + contexts=( + ("ctx-a", "ns-a"), + ("ctx-b", "ns-b"), + ), + ), + contexts=["ctx-a", "ctx-b"], + ) + + clusters = get_clusters_from_backend_config(config) + + assert [c.context_name for c in clusters] == ["ctx-a", "ctx-b"] + assert [c.region for c in clusters] == ["ctx-a", "ctx-b"] + assert [c.namespace for c in clusters] == ["ns-a", "ns-b"] + assert all(c.proxy_jump == KubernetesProxyJumpConfig() for c in clusters) + assert clusters[0].api_client.configuration.host == "https://ctx-a.example.com:6443" # pyright: ignore[reportAttributeAccessIssue] + assert clusters[1].api_client.configuration.host == "https://ctx-b.example.com:6443" # pyright: ignore[reportAttributeAccessIssue] + + def test_contexts_with_per_context_proxy_jump(self): + proxy_jump_a = KubernetesProxyJumpConfig(hostname="a.example.com", port=2201) + proxy_jump_b = KubernetesProxyJumpConfig(hostname="b.example.com", port=2202) + config = self.make_config( + self.make_kubeconfig( + contexts=( + ("ctx-a", "ns-a"), + ("ctx-b", "ns-b"), + ), + ), + 
contexts=[ + KubernetesContextConfig(name="ctx-a", proxy_jump=proxy_jump_a), + KubernetesContextConfig(name="ctx-b", proxy_jump=proxy_jump_b), + ], + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].proxy_jump == proxy_jump_a + assert clusters[1].proxy_jump == proxy_jump_b + + def test_contexts_mix_string_and_object(self): + proxy_jump = KubernetesProxyJumpConfig(hostname="b.example.com", port=2222) + config = self.make_config( + self.make_kubeconfig( + contexts=( + ("ctx-a", "ns-a"), + ("ctx-b", "ns-b"), + ), + ), + contexts=[ + "ctx-a", + KubernetesContextConfig(name="ctx-b", proxy_jump=proxy_jump), + ], + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].proxy_jump == KubernetesProxyJumpConfig() + assert clusters[1].proxy_jump == proxy_jump + + def test_contexts_object_without_proxy_jump_uses_default(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "ns-a"),)), + contexts=[KubernetesContextConfig(name="ctx-a", proxy_jump=None)], + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].proxy_jump == KubernetesProxyJumpConfig() + + def test_contexts_ignores_backend_namespace_and_proxy_jump(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "kube-ns"),)), + contexts=["ctx-a"], + namespace="config-ns", + proxy_jump=KubernetesProxyJumpConfig(hostname="ignored", port=1), + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters[0].namespace == "kube-ns" + assert clusters[0].proxy_jump == KubernetesProxyJumpConfig() + + def test_contexts_does_not_warn_on_namespace_mismatch(self, caplog: pytest.LogCaptureFixture): + caplog.set_level(logging.WARNING) + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "kube-ns"),)), + contexts=["ctx-a"], + namespace="config-ns", + ) + + get_clusters_from_backend_config(config) + + assert "Namespace mismatch" not in caplog.text + + def 
test_contexts_raises_for_unknown_context(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "ns-a"),)), + contexts=["ctx-missing"], + ) + + with pytest.raises(ValueError, match="context ctx-missing not found"): + get_clusters_from_backend_config(config) + + def test_empty_contexts_returns_no_clusters(self): + config = self.make_config( + self.make_kubeconfig(contexts=(("ctx-a", "ns-a"),)), + contexts=[], + ) + + clusters = get_clusters_from_backend_config(config) + + assert clusters == [] + + def test_request_timeout_and_retries_propagate_to_client(self): + config = self.make_config(self.make_kubeconfig()) + + clusters = get_clusters_from_backend_config(config, request_timeout=7, retries=5) + + api_client = clusters[0].api_client + assert api_client.configuration.retries == 5 # pyright: ignore[reportAttributeAccessIssue] + assert getattr(api_client, "_ApiClient__request_timeout", None) == 7 diff --git a/src/tests/_internal/core/backends/lambdalabs/__init__.py b/src/tests/_internal/core/backends/lambdalabs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/lambdalabs/test_configurator.py b/src/tests/_internal/core/backends/lambdalabs/test_configurator.py new file mode 100644 index 0000000000..bf5c89b59e --- /dev/null +++ b/src/tests/_internal/core/backends/lambdalabs/test_configurator.py @@ -0,0 +1,38 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.lambdalabs.configurator import LambdaConfigurator +from dstack._internal.core.backends.lambdalabs.models import ( + LambdaBackendConfigWithCreds, + LambdaCreds, +) +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestLambdaConfigurator: + def test_validate_config_valid(self): + config = LambdaBackendConfigWithCreds( + creds=LambdaCreds(api_key="valid"), + regions=["us-east-1"], + ) + with patch( + 
"dstack._internal.core.backends.lambdalabs.api_client.LambdaAPIClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + LambdaConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = LambdaBackendConfigWithCreds( + creds=LambdaCreds(api_key="invalid"), + regions=["us-east-1"], + ) + with ( + patch( + "dstack._internal.core.backends.lambdalabs.api_client.LambdaAPIClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + validate_mock.return_value = False + LambdaConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/backends/oci/test_configurator.py b/src/tests/_internal/core/backends/oci/test_configurator.py new file mode 100644 index 0000000000..5472b3048f --- /dev/null +++ b/src/tests/_internal/core/backends/oci/test_configurator.py @@ -0,0 +1,55 @@ +from unittest.mock import Mock, patch + +import pytest +from oci.exceptions import ClientError + +from dstack._internal.core.backends.oci.configurator import OCIConfigurator +from dstack._internal.core.backends.oci.models import ( + OCIBackendConfigWithCreds, + OCIClientCreds, +) +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestOCIConfigurator: + def test_validate_config_valid(self): + config = OCIBackendConfigWithCreds( + creds=OCIClientCreds( + user="valid_user", + tenancy="valid_tenancy", + key_content="valid_key", + key_file=None, + pass_phrase=None, + fingerprint="valid_fingerprint", + region="us-ashburn-1", + ), + regions=["us-ashburn-1"], + ) + with patch( + "dstack._internal.core.backends.oci.configurator.get_subscribed_regions" + ) as regions_mock: + regions_mock.return_value = Mock(names=["us-ashburn-1"]) + OCIConfigurator().validate_config(config, default_creds_enabled=True) + + def 
test_validate_config_invalid_creds(self): + config = OCIBackendConfigWithCreds( + creds=OCIClientCreds( + user="invalid_user", + tenancy="invalid_tenancy", + key_content="invalid_key", + key_file=None, + pass_phrase=None, + fingerprint="invalid_fingerprint", + region="us-ashburn-1", + ), + regions=["us-ashburn-1"], + ) + with ( + patch( + "dstack._internal.core.backends.oci.configurator.get_subscribed_regions" + ) as regions_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + regions_mock.side_effect = ClientError("Invalid credentials") + OCIConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds"]] diff --git a/src/tests/_internal/core/backends/runpod/__init__.py b/src/tests/_internal/core/backends/runpod/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/runpod/test_api_client.py b/src/tests/_internal/core/backends/runpod/test_api_client.py new file mode 100644 index 0000000000..9de7ebed47 --- /dev/null +++ b/src/tests/_internal/core/backends/runpod/test_api_client.py @@ -0,0 +1,71 @@ +from dstack._internal.core.backends.runpod.api_client import ( + RunpodApiClient, + _generate_cpu_pod_deployment_mutation, +) + + +class _Response: + def __init__(self, payload): + self._payload = payload + + def json(self): + return self._payload + + +def test_generate_cpu_pod_deployment_mutation(): + mutation = _generate_cpu_pod_deployment_mutation( + name="cpu-test", + image_name="python:3.11-slim", + instance_id="cpu3g-2-8", + cloud_type="SECURE", + deploy_cost=0.08, + start_ssh=True, + data_center_id="AP-JP-1", + container_disk_in_gb=5, + docker_args='{"cmd":["echo hi"]}', + ports="22/tcp, 8080/http", + volume_mount_path="/workspace", + env={"HELLO": "WORLD"}, + template_id="runpod-ubuntu", + network_volume_id="vol-1", + container_registry_auth_id="cred-1", + ) + + assert "deployCpuPod" in mutation + assert 'name: "cpu-test"' in mutation + 
assert 'imageName: "python:3.11-slim"' in mutation + assert 'instanceId: "cpu3g-2-8"' in mutation + assert "cloudType: SECURE" in mutation + assert "deployCost: 0.08" in mutation + assert "startSsh: true" in mutation + assert 'dataCenterId: "AP-JP-1"' in mutation + assert "containerDiskInGb: 5" in mutation + assert 'ports: "22/tcp,8080/http"' in mutation + assert 'volumeMountPath: "/workspace"' in mutation + assert 'env: [{ key: "HELLO", value: "WORLD" }]' in mutation + assert 'templateId: "runpod-ubuntu"' in mutation + assert 'networkVolumeId: "vol-1"' in mutation + assert 'containerRegistryAuthId: "cred-1"' in mutation + + +def test_create_cpu_pod_uses_deploy_cpu_pod(monkeypatch): + client = RunpodApiClient(api_key="test") + query = {} + + def fake_make_request(data): + query["value"] = data["query"] + return _Response({"data": {"deployCpuPod": {"id": "cpu-pod-1"}}}) + + monkeypatch.setattr(client, "_make_request", fake_make_request) + + response = client.create_cpu_pod( + name="cpu-test", + image_name="python:3.11-slim", + instance_id="cpu3g-2-8", + cloud_type="SECURE", + deploy_cost=0.08, + ) + + assert response["id"] == "cpu-pod-1" + assert "deployCpuPod" in query["value"] + assert "podFindAndDeployOnDemand" not in query["value"] diff --git a/src/tests/_internal/core/backends/runpod/test_configurator.py b/src/tests/_internal/core/backends/runpod/test_configurator.py new file mode 100644 index 0000000000..2bfac1477c --- /dev/null +++ b/src/tests/_internal/core/backends/runpod/test_configurator.py @@ -0,0 +1,33 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.runpod.configurator import RunpodConfigurator +from dstack._internal.core.backends.runpod.models import RunpodBackendConfigWithCreds, RunpodCreds +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestRunpodConfigurator: + def test_validate_config_valid(self): + config = RunpodBackendConfigWithCreds( + 
creds=RunpodCreds(api_key="valid"), + ) + with patch( + "dstack._internal.core.backends.runpod.api_client.RunpodApiClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + RunpodConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = RunpodBackendConfigWithCreds( + creds=RunpodCreds(api_key="invalid"), + ) + with ( + patch( + "dstack._internal.core.backends.runpod.api_client.RunpodApiClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + validate_mock.return_value = False + RunpodConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/backends/tensordock/__init__.py b/src/tests/_internal/core/backends/tensordock/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/vastai/__init__.py b/src/tests/_internal/core/backends/vastai/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/vastai/test_compute.py b/src/tests/_internal/core/backends/vastai/test_compute.py new file mode 100644 index 0000000000..8925b38b29 --- /dev/null +++ b/src/tests/_internal/core/backends/vastai/test_compute.py @@ -0,0 +1,56 @@ +from unittest.mock import patch + +from dstack._internal.core.backends.vastai.compute import VastAICompute +from dstack._internal.core.backends.vastai.models import VastAIConfig, VastAICreds +from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.runs import Requirements + + +def _config(community_cloud=None) -> VastAIConfig: + return VastAIConfig(creds=VastAICreds(api_key="test"), community_cloud=community_cloud) + + +def _requirements() -> Requirements: + return Requirements(resources=ResourcesSpec()) + + +def 
test_vastai_compute_enables_community_cloud_by_default(): + with ( + patch("dstack._internal.core.backends.vastai.compute.VastAIProvider") as vast_provider_cls, + patch("dstack._internal.core.backends.vastai.compute.gpuhunt.Catalog") as catalog_cls, + patch("dstack._internal.core.backends.vastai.compute.get_catalog_offers", return_value=[]), + ): + catalog_instance = catalog_cls.return_value + compute = VastAICompute(_config()) + list(compute.get_offers(_requirements())) + vast_provider_cls.assert_called_once() + assert vast_provider_cls.call_args.kwargs["community_cloud"] is True + catalog_instance.add_provider.assert_called_once() + + +def test_vastai_compute_can_enable_community_cloud(): + with ( + patch("dstack._internal.core.backends.vastai.compute.VastAIProvider") as vast_provider_cls, + patch("dstack._internal.core.backends.vastai.compute.gpuhunt.Catalog") as catalog_cls, + patch("dstack._internal.core.backends.vastai.compute.get_catalog_offers", return_value=[]), + ): + catalog_instance = catalog_cls.return_value + compute = VastAICompute(_config(community_cloud=True)) + list(compute.get_offers(_requirements())) + vast_provider_cls.assert_called_once() + assert vast_provider_cls.call_args.kwargs["community_cloud"] is True + catalog_instance.add_provider.assert_called_once() + + +def test_vastai_compute_can_disable_community_cloud(): + with ( + patch("dstack._internal.core.backends.vastai.compute.VastAIProvider") as vast_provider_cls, + patch("dstack._internal.core.backends.vastai.compute.gpuhunt.Catalog") as catalog_cls, + patch("dstack._internal.core.backends.vastai.compute.get_catalog_offers", return_value=[]), + ): + catalog_instance = catalog_cls.return_value + compute = VastAICompute(_config(community_cloud=False)) + list(compute.get_offers(_requirements())) + vast_provider_cls.assert_called_once() + assert vast_provider_cls.call_args.kwargs["community_cloud"] is False + catalog_instance.add_provider.assert_called_once() diff --git 
a/src/tests/_internal/core/backends/vastai/test_configurator.py b/src/tests/_internal/core/backends/vastai/test_configurator.py new file mode 100644 index 0000000000..16c3c82edb --- /dev/null +++ b/src/tests/_internal/core/backends/vastai/test_configurator.py @@ -0,0 +1,47 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.vastai.configurator import VastAIConfigurator +from dstack._internal.core.backends.vastai.models import VastAIBackendConfigWithCreds, VastAICreds +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestVastAIConfigurator: + def test_allow_community_cloud_default(self): + config = VastAIBackendConfigWithCreds(creds=VastAICreds(api_key="valid")) + backend = VastAIConfigurator().create_backend(project_name="main", config=config) + loaded_config = VastAIConfigurator()._get_config(backend) + assert loaded_config.allow_community_cloud is True + + def test_allow_community_cloud_enabled(self): + config = VastAIBackendConfigWithCreds( + creds=VastAICreds(api_key="valid"), community_cloud=True + ) + backend = VastAIConfigurator().create_backend(project_name="main", config=config) + loaded_config = VastAIConfigurator()._get_config(backend) + assert loaded_config.allow_community_cloud is True + + def test_validate_config_valid(self): + config = VastAIBackendConfigWithCreds( + creds=VastAICreds(api_key="valid"), + ) + with patch( + "dstack._internal.core.backends.vastai.api_client.VastAIAPIClient.auth_test" + ) as auth_test_mock: + auth_test_mock.return_value = True + VastAIConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = VastAIBackendConfigWithCreds( + creds=VastAICreds(api_key="invalid"), + ) + with ( + patch( + "dstack._internal.core.backends.vastai.api_client.VastAIAPIClient.auth_test" + ) as auth_test_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + auth_test_mock.return_value = 
False + VastAIConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/backends/vastai/test_profile_options.py b/src/tests/_internal/core/backends/vastai/test_profile_options.py new file mode 100644 index 0000000000..42f423946e --- /dev/null +++ b/src/tests/_internal/core/backends/vastai/test_profile_options.py @@ -0,0 +1,51 @@ +import pytest + +from dstack._internal.core.backends.vastai.profile_options import ( + VastAIOfferOrder, + VastAIProfileOptions, +) +from dstack._internal.utils.combine import CombineError + + +class TestVastAIProfileOptionsCombine: + def test_combine_empty_options(self): + a = VastAIProfileOptions() + b = VastAIProfileOptions() + result = a.combine(b) + assert result == VastAIProfileOptions() + + def test_combine_all_fields_set(self): + a = VastAIProfileOptions( + offer_order=VastAIOfferOrder.PRICE, + min_reliability=0.7, + min_score=100, + ) + b = VastAIProfileOptions( + offer_order=VastAIOfferOrder.PRICE, + min_reliability=0.95, + min_score=300, + ) + a_combine_b = a.combine(b) + assert a_combine_b.offer_order == VastAIOfferOrder.PRICE + assert a_combine_b.min_reliability == 0.95 + assert a_combine_b.min_score == 300 + b_combine_a = b.combine(a) + assert b_combine_a.offer_order == VastAIOfferOrder.PRICE + assert b_combine_a.min_reliability == 0.95 + assert b_combine_a.min_score == 300 + + def test_combine_one_has_all_fields_set(self): + a = VastAIProfileOptions( + offer_order=VastAIOfferOrder.PRICE, + min_reliability=0.7, + min_score=100, + ) + b = VastAIProfileOptions() + assert a.combine(b) == a + assert b.combine(a) == a + + def test_combine_conflicting_offer_order_raises(self): + a = VastAIProfileOptions(offer_order=VastAIOfferOrder.PRICE) + b = VastAIProfileOptions(offer_order=VastAIOfferOrder.SCORE) + with pytest.raises(CombineError): + a.combine(b) diff --git a/src/tests/_internal/core/backends/verda/__init__.py 
b/src/tests/_internal/core/backends/verda/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/verda/test_compute.py b/src/tests/_internal/core/backends/verda/test_compute.py new file mode 100644 index 0000000000..b8ae8f494d --- /dev/null +++ b/src/tests/_internal/core/backends/verda/test_compute.py @@ -0,0 +1,405 @@ +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +from verda.exceptions import APIException + +from dstack._internal.core.backends.verda.compute import ( + VerdaCompute, + VerdaInstanceBackendData, + _create_ssh_key, + _create_startup_script, +) +from dstack._internal.core.errors import BackendError, NoCapacityError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOffer, + InstanceType, + Resources, +) + + +def _offer(spot: bool, name: str = "SOME.INSTANCE", region: str = "FIN-01") -> InstanceOffer: + return InstanceOffer( + backend=BackendType.VERDA, + instance=InstanceType( + name=name, + resources=Resources(cpus=8, memory_mib=16384, gpus=[], spot=spot), + ), + region=region, + price=1.0, + ) + + +def _assert_terminate_call(action_mock: MagicMock): + action_mock.assert_called_once() + kwargs = action_mock.call_args.kwargs + assert kwargs["id_list"] == ["instance-id"] + assert kwargs["action"] == "delete" + if "delete_permanently" in kwargs: + assert kwargs["delete_permanently"] is True + + +class TestCreateSSHKey: + def test_creates_ssh_key(self): + client = MagicMock() + client.ssh_keys.create.return_value = SimpleNamespace(id="new-ssh-key-id") + + key_id = _create_ssh_key( + client=client, + name="dstack-test-key", + public_key="ssh-rsa test", + ) + + assert key_id == "new-ssh-key-id" + client.ssh_keys.create.assert_called_once_with("dstack-test-key", "ssh-rsa test") + + def test_raises_backend_error_on_api_exception(self): + client = 
MagicMock() + client.ssh_keys.create.side_effect = APIException("invalid_request", "Boom") + + with pytest.raises(BackendError, match="creating SSH key: Boom"): + _create_ssh_key( + client=client, + name="dstack-test-key", + public_key="ssh-rsa test", + ) + + +class TestCreateStartupScript: + def test_creates_startup_script(self): + client = MagicMock() + client.startup_scripts.create.return_value = SimpleNamespace(id="new-script-id") + + script_id = _create_startup_script( + client=client, + name="dstack-test-script.sh", + script="echo bye", + ) + + assert script_id == "new-script-id" + client.startup_scripts.create.assert_called_once_with( + "dstack-test-script.sh", + "echo bye", + ) + + def test_raises_backend_error_on_api_exception(self): + client = MagicMock() + client.startup_scripts.create.side_effect = APIException("invalid_request", "Boom") + + with pytest.raises(BackendError, match="creating startup script: Boom"): + _create_startup_script( + client=client, + name="dstack-test-script.sh", + script="echo bye", + ) + + +class TestCreateInstance: + def test_cleans_up_created_ssh_keys_if_later_ssh_key_create_fails(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + instance_offer = SimpleNamespace( + backend="verda", + instance=SimpleNamespace( + name="CPU.4V.16G", + resources=SimpleNamespace( + disk=SimpleNamespace(size_mib=102400), + gpus=[], + spot=False, + ), + ), + region="FIN-01", + price=0.0279, + ) + instance_config = SimpleNamespace( + instance_name="verda-one-node-0", + get_public_keys=lambda: ["ssh-rsa test-1", "ssh-rsa test-2"], + ) + + with ( + patch( + "dstack._internal.core.backends.verda.compute.generate_unique_instance_name", + return_value="verda-one-node-0", + ), + patch( + "dstack._internal.core.backends.verda.compute._create_ssh_key", + side_effect=["ssh-key-id-1", BackendError("ssh create failed")], + ), + patch( + "dstack._internal.core.backends.verda.compute._create_startup_script" + ) as 
create_startup_script, + patch( + "dstack._internal.core.backends.verda.compute._delete_startup_script" + ) as delete_startup_script, + patch( + "dstack._internal.core.backends.verda.compute._delete_ssh_keys" + ) as delete_ssh_keys, + ): + with pytest.raises(BackendError, match="ssh create failed"): + compute.create_instance(instance_offer, instance_config, None) + + create_startup_script.assert_not_called() + delete_startup_script.assert_called_once_with(compute.client, None) + delete_ssh_keys.assert_called_once_with(compute.client, ["ssh-key-id-1"]) + + def test_cleans_up_ssh_keys_if_startup_script_create_fails(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + instance_offer = SimpleNamespace( + backend="verda", + instance=SimpleNamespace( + name="CPU.4V.16G", + resources=SimpleNamespace( + disk=SimpleNamespace(size_mib=102400), + gpus=[], + spot=False, + ), + ), + region="FIN-01", + price=0.0279, + ) + instance_config = SimpleNamespace( + instance_name="verda-one-node-0", + get_public_keys=lambda: ["ssh-rsa test-1", "ssh-rsa test-2"], + ) + + with ( + patch( + "dstack._internal.core.backends.verda.compute.generate_unique_instance_name", + return_value="verda-one-node-0", + ), + patch( + "dstack._internal.core.backends.verda.compute._create_ssh_key", + side_effect=["ssh-key-id-1", "ssh-key-id-2"], + ), + patch( + "dstack._internal.core.backends.verda.compute._create_startup_script", + side_effect=BackendError("script create failed"), + ), + patch( + "dstack._internal.core.backends.verda.compute._delete_startup_script" + ) as delete_startup_script, + patch( + "dstack._internal.core.backends.verda.compute._delete_ssh_keys" + ) as delete_ssh_keys, + ): + with pytest.raises(BackendError, match="script create failed"): + compute.create_instance(instance_offer, instance_config, None) + + delete_startup_script.assert_called_once_with(compute.client, None) + delete_ssh_keys.assert_called_once_with(compute.client, ["ssh-key-id-1", 
"ssh-key-id-2"]) + + def test_cleans_up_startup_script_if_deploy_fails(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + instance_offer = SimpleNamespace( + backend="verda", + instance=SimpleNamespace( + name="CPU.4V.16G", + resources=SimpleNamespace( + disk=SimpleNamespace(size_mib=102400), + gpus=[], + spot=False, + ), + ), + region="FIN-01", + price=0.0279, + ) + instance_config = SimpleNamespace( + instance_name="verda-one-node-0", + get_public_keys=lambda: ["ssh-rsa test"], + ) + + with ( + patch( + "dstack._internal.core.backends.verda.compute.generate_unique_instance_name", + return_value="verda-one-node-0", + ), + patch( + "dstack._internal.core.backends.verda.compute.get_shim_commands", + return_value=["echo ready"], + ), + patch( + "dstack._internal.core.backends.verda.compute._create_ssh_key", + return_value="ssh-key-id", + ), + patch( + "dstack._internal.core.backends.verda.compute._create_startup_script", + return_value="startup-script-id", + ), + patch( + "dstack._internal.core.backends.verda.compute._deploy_instance", + side_effect=NoCapacityError("no capacity"), + ), + patch( + "dstack._internal.core.backends.verda.compute._delete_startup_script" + ) as delete_startup_script, + patch( + "dstack._internal.core.backends.verda.compute._delete_ssh_keys" + ) as delete_ssh_keys, + ): + with pytest.raises(NoCapacityError): + compute.create_instance(instance_offer, instance_config, None) + + delete_startup_script.assert_called_once_with(compute.client, "startup-script-id") + delete_ssh_keys.assert_called_once_with(compute.client, ["ssh-key-id"]) + + def test_stores_ssh_key_ids_in_backend_data(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + instance_offer = SimpleNamespace( + backend="verda", + instance=SimpleNamespace( + name="CPU.4V.16G", + resources=SimpleNamespace( + disk=SimpleNamespace(size_mib=102400), + gpus=[], + spot=False, + ), + ), + region="FIN-01", + price=0.0279, + 
) + instance_config = SimpleNamespace( + instance_name="verda-one-node-0", + get_public_keys=lambda: ["ssh-rsa test-1", "ssh-rsa test-2"], + ) + provider_instance = SimpleNamespace(id="provider-instance-id", location="FIN-01") + + with ( + patch( + "dstack._internal.core.backends.verda.compute.generate_unique_instance_name", + return_value="verda-one-node-0", + ), + patch( + "dstack._internal.core.backends.verda.compute.get_shim_commands", + return_value=["echo ready"], + ), + patch( + "dstack._internal.core.backends.verda.compute._create_ssh_key", + side_effect=["ssh-key-id-1", "ssh-key-id-2"], + ), + patch( + "dstack._internal.core.backends.verda.compute._create_startup_script", + return_value="startup-script-id", + ), + patch( + "dstack._internal.core.backends.verda.compute._deploy_instance", + return_value=provider_instance, + ), + patch( + "dstack._internal.core.backends.verda.compute.JobProvisioningData", + side_effect=lambda **kwargs: SimpleNamespace(**kwargs), + ), + ): + jpd = compute.create_instance(instance_offer, instance_config, None) + + backend_data = VerdaInstanceBackendData.load(jpd.backend_data) + assert backend_data.startup_script_id == "startup-script-id" + assert backend_data.ssh_key_ids == ["ssh-key-id-1", "ssh-key-id-2"] + + +class TestGetOffersWithAvailability: + @pytest.mark.parametrize("available_as_spot", [True, False]) + def test_availability_resolved_against_matching_inventory(self, available_as_spot): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + def get_availabilities(is_spot): + names = ["SOME.INSTANCE"] if is_spot == available_as_spot else [] + return [{"location_code": "FIN-01", "availabilities": names}] + + compute.client.instances.get_availabilities.side_effect = get_availabilities + + offers = compute._get_offers_with_availability([_offer(spot=False), _offer(spot=True)]) + availability_by_spot = {o.instance.resources.spot: o.availability for o in offers} + + assert 
availability_by_spot[available_as_spot] == InstanceAvailability.AVAILABLE + assert availability_by_spot[not available_as_spot] == InstanceAvailability.NOT_AVAILABLE + + def test_queries_both_spot_and_on_demand_availability(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + compute.client.instances.get_availabilities.return_value = [] + + compute._get_offers_with_availability([_offer(spot=True)]) + + requested_is_spot = { + call.kwargs.get("is_spot") + for call in compute.client.instances.get_availabilities.call_args_list + } + assert requested_is_spot == {True, False} + + +class TestTerminateInstance: + def test_terminate_instance_without_backend_data(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + + compute.terminate_instance("instance-id", "FIN-01", None) + + _assert_terminate_call(compute.client.instances.action) + compute.client.startup_scripts.delete_by_id.assert_not_called() + compute.client.ssh_keys.delete.assert_not_called() + + def test_terminate_instance_deletes_startup_script(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + backend_data = VerdaInstanceBackendData( + startup_script_id="script-id", + ssh_key_ids=["ssh-key-id-1", "ssh-key-id-2"], + ).json() + + compute.terminate_instance("instance-id", "FIN-01", backend_data) + + _assert_terminate_call(compute.client.instances.action) + compute.client.startup_scripts.delete_by_id.assert_called_once_with("script-id") + compute.client.ssh_keys.delete.assert_called_once_with(["ssh-key-id-1", "ssh-key-id-2"]) + + def test_terminate_instance_still_deletes_script_when_instance_is_missing(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + compute.client.instances.action.side_effect = APIException("", "Invalid instance id") + backend_data = VerdaInstanceBackendData( + startup_script_id="script-id", + ssh_key_ids=["ssh-key-id-1"], + ).json() + + 
compute.terminate_instance("instance-id", "FIN-01", backend_data) + + compute.client.startup_scripts.delete_by_id.assert_called_once_with("script-id") + compute.client.ssh_keys.delete.assert_called_once_with(["ssh-key-id-1"]) + + def test_terminate_instance_retries_on_script_delete_error(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + compute.client.startup_scripts.delete_by_id.side_effect = APIException( + "", "Random API error" + ) + backend_data = VerdaInstanceBackendData( + startup_script_id="script-id", + ssh_key_ids=["ssh-key-id-1"], + ).json() + + with pytest.raises(APIException): + compute.terminate_instance("instance-id", "FIN-01", backend_data) + + compute.client.ssh_keys.delete.assert_not_called() + + def test_terminate_instance_retries_on_ssh_key_delete_error(self): + compute = VerdaCompute.__new__(VerdaCompute) + compute.client = MagicMock() + compute.client.ssh_keys.delete.side_effect = APIException("", "Random API error") + backend_data = VerdaInstanceBackendData( + startup_script_id="script-id", + ssh_key_ids=["ssh-key-id-1"], + ).json() + + with pytest.raises(APIException): + compute.terminate_instance("instance-id", "FIN-01", backend_data) diff --git a/src/tests/_internal/core/backends/verda/test_configurator.py b/src/tests/_internal/core/backends/verda/test_configurator.py new file mode 100644 index 0000000000..a6f1106da8 --- /dev/null +++ b/src/tests/_internal/core/backends/verda/test_configurator.py @@ -0,0 +1,22 @@ +from unittest.mock import patch + +from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, +) +from dstack._internal.core.backends.verda.models import ( + VerdaBackendConfigWithCreds, + VerdaCreds, +) + + +class TestVerdaConfigurator: + def test_validate_config_valid(self): + config = VerdaBackendConfigWithCreds( + type="verda", + creds=VerdaCreds(client_id="valid", client_secret="valid"), + regions=["FIN-01"], + ) + with patch( + 
"dstack._internal.core.backends.verda.configurator.VerdaConfigurator._validate_creds" + ): + VerdaConfigurator().validate_config(config, default_creds_enabled=True) diff --git a/src/tests/_internal/core/backends/vultr/__init__.py b/src/tests/_internal/core/backends/vultr/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/vultr/test_configurator.py b/src/tests/_internal/core/backends/vultr/test_configurator.py new file mode 100644 index 0000000000..ccd6107d94 --- /dev/null +++ b/src/tests/_internal/core/backends/vultr/test_configurator.py @@ -0,0 +1,33 @@ +from unittest.mock import patch + +import pytest + +from dstack._internal.core.backends.vultr.configurator import VultrConfigurator +from dstack._internal.core.backends.vultr.models import VultrBackendConfigWithCreds, VultrCreds +from dstack._internal.core.errors import BackendInvalidCredentialsError + + +class TestVultrConfigurator: + def test_validate_config_valid(self): + config = VultrBackendConfigWithCreds( + creds=VultrCreds(api_key="valid"), + ) + with patch( + "dstack._internal.core.backends.vultr.api_client.VultrApiClient.validate_api_key" + ) as validate_mock: + validate_mock.return_value = True + VultrConfigurator().validate_config(config, default_creds_enabled=True) + + def test_validate_config_invalid_creds(self): + config = VultrBackendConfigWithCreds( + creds=VultrCreds(api_key="invalid"), + ) + with ( + patch( + "dstack._internal.core.backends.vultr.api_client.VultrApiClient.validate_api_key" + ) as validate_mock, + pytest.raises(BackendInvalidCredentialsError) as exc_info, + ): + validate_mock.return_value = False + VultrConfigurator().validate_config(config, default_creds_enabled=True) + assert exc_info.value.fields == [["creds", "api_key"]] diff --git a/src/tests/_internal/core/models/repos/__init__.py b/src/tests/_internal/core/models/repos/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/src/tests/_internal/core/models/repos/test_local.py b/src/tests/_internal/core/models/repos/test_local.py new file mode 100644 index 0000000000..e6a1b8ab26 --- /dev/null +++ b/src/tests/_internal/core/models/repos/test_local.py @@ -0,0 +1,79 @@ +import io +import tarfile +from pathlib import Path + +import pytest + +from dstack._internal.core.models.repos.local import LocalRepo + + +class TestRepoPathType: + REPO_DIR_NAME = "repo" + + @pytest.fixture + def repo_parent_dir(self, tmp_path: Path) -> Path: + repo_dir = tmp_path / self.REPO_DIR_NAME + repo_dir.mkdir() + (repo_dir / "file.txt").touch() + (repo_dir / "inner").mkdir() + (repo_dir / "inner" / "file.txt").mkdir() + return tmp_path + + @staticmethod + def check(repo: LocalRepo) -> None: + fp = io.BytesIO() + repo.write_code_file(fp) + fp.seek(0) + with tarfile.open(fileobj=fp, mode="r") as tar: + names = tar.getnames() + assert "file.txt" in names + assert "inner/file.txt" in names + + def test_absolute(self, repo_parent_dir: Path): + repo = LocalRepo.from_dir(repo_parent_dir.resolve() / self.REPO_DIR_NAME) + self.check(repo) + + def test_relative(self, repo_parent_dir: Path, monkeypatch): + monkeypatch.chdir(repo_parent_dir) + repo = LocalRepo.from_dir(self.REPO_DIR_NAME) + self.check(repo) + + def test_cwd(self, repo_parent_dir: Path, monkeypatch): + monkeypatch.chdir(repo_parent_dir / self.REPO_DIR_NAME) + repo = LocalRepo.from_dir(".") + self.check(repo) + + def test_with_parent_reference(self, repo_parent_dir: Path, monkeypatch): + cwd = repo_parent_dir / "test" + cwd.mkdir() + monkeypatch.chdir(cwd) + repo = LocalRepo.from_dir(Path("..") / self.REPO_DIR_NAME) + self.check(repo) + + +def test_ignore_rules(tmp_path: Path): + (tmp_path / "file1.txt").touch() + (tmp_path / "file2.py").touch() + (tmp_path / ".hidden").touch() + (tmp_path / ".dstackignore").write_text("file2.py\n") + (tmp_path / "inner").mkdir() + (tmp_path / "inner" / "file3.txt").touch() + (tmp_path / "inner" / "file4.py").touch() + 
(tmp_path / "inner" / ".gitignore").write_text("*.txt") + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "config").touch() + + repo = LocalRepo.from_dir(tmp_path) + fp = io.BytesIO() + repo.write_code_file(fp) + fp.seek(0) + + with tarfile.open(fileobj=fp, mode="r") as tar: + names = tar.getnames() + assert "file1.txt" in names + assert "file2.py" not in names # ignored by .dstackignore + assert ".hidden" in names + assert ".dstackignore" in names + assert "inner/file3.txt" not in names # ignored by inner/.gitignore + assert "inner/file4.py" in names + assert ".git/config" not in names # .git always ignored diff --git a/src/tests/_internal/core/models/repos/test_remote.py b/src/tests/_internal/core/models/repos/test_remote.py new file mode 100644 index 0000000000..acf80f2596 --- /dev/null +++ b/src/tests/_internal/core/models/repos/test_remote.py @@ -0,0 +1,80 @@ +import pytest + +from dstack._internal.core.errors import RepoError +from dstack._internal.core.models.repos.remote import GitRepoURL + + +class TestGitRepoURL: + def test_parse_https_url(self): + url = GitRepoURL.parse("https://github.com/dstackai/dstack.git") + assert url.as_https() == "https://github.com/dstackai/dstack.git" + assert url.as_ssh() == "ssh://git@github.com/dstackai/dstack.git" + + def test_parse_https_url_with_port(self): + url = GitRepoURL.parse("https://github.com:8443/dstackai/dstack.git") + assert url.as_https() == "https://github.com:8443/dstackai/dstack.git" + assert url.as_ssh() == "ssh://git@github.com/dstackai/dstack.git" + + def test_parse_https_url_with_ssh_config(self): + ssh_config = { + "github.com": { + "user": "test-user", + "port": "2222", + "hostname": "test.github.com", + } + } + url = GitRepoURL.parse( + "https://github.com:8443/dstackai/dstack.git", + get_ssh_config=lambda host: ssh_config.get(host, {}), + ) + assert 
url.as_https() == "https://github.com:8443/dstackai/dstack.git" + assert url.as_ssh() == "ssh://test-user@github.com:2222/dstackai/dstack.git" + + def test_parse_scp_location(self): + url = GitRepoURL.parse("test-user@test.example:a/b/c.git") + assert url.as_https() == "https://test.example/a/b/c.git" + assert url.as_ssh() == "ssh://test-user@test.example/a/b/c.git" + + def test_parse_scp_location_with_ssh_config(self): + ssh_config = { + "test.example": { + "user": "test-user-2", + "port": "2222", + "hostname": "test2.example", + } + } + url = GitRepoURL.parse( + "test-user@test.example:a/b/c.git", + get_ssh_config=lambda host: ssh_config.get(host, {}), + ) + assert url.as_https() == "https://test2.example/a/b/c.git" + assert url.as_ssh() == "ssh://test-user@test2.example:2222/a/b/c.git" + + def test_parse_ssh_url_with_ssh_config(self): + ssh_config = { + "test": { + "user": "test-user", + "port": "2222", + "hostname": "test.example", + } + } + url = GitRepoURL.parse( + "ssh://test/repo.git", get_ssh_config=lambda host: ssh_config.get(host, {}) + ) + assert url.as_https() == "https://test.example/repo.git" + assert url.as_ssh() == "ssh://test-user@test.example:2222/repo.git" + + def test_parse_unsupported_scheme(self): + with pytest.raises(RepoError): + GitRepoURL.parse("ftp://test.example/group/repo.git") + + def test_parse_garbage(self): + with pytest.raises(RepoError): + GitRepoURL.parse("garbage") + + def test_oauth_token(self): + url = GitRepoURL.parse("https://github.com/dstackai/dstack.git") + assert ( + url.as_https("secret-token") + == "https://anything:secret-token@github.com/dstackai/dstack.git" + ) diff --git a/src/tests/_internal/core/models/test_common.py b/src/tests/_internal/core/models/test_common.py new file mode 100644 index 0000000000..8cc1e50032 --- /dev/null +++ 
b/src/tests/_internal/core/models/test_common.py @@ -0,0 +1,27 @@ +import pytest + +from dstack._internal.core.models.common import EntityReference + + +class TestEntityReferenceParse: + @pytest.mark.parametrize( + "value, expected", + [ + ("fleet", EntityReference(project=None, name="fleet")), + ("project/fleet", EntityReference(project="project", name="fleet")), + ( + EntityReference(project="proj", name="fleet"), + EntityReference(project="proj", name="fleet"), + ), + ], + ) + def test_valid(self, value, expected): + assert EntityReference.parse(value) == expected + + @pytest.mark.parametrize( + "value", + ["", "/name", "name/", "/", "a/b/c"], + ) + def test_invalid(self, value: str): + with pytest.raises(ValueError, match="Invalid entity reference"): + EntityReference.parse(value) diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 90dfdc11e2..364624e597 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -3,11 +3,59 @@ import pytest from dstack._internal.core.errors import ConfigurationError -from dstack._internal.core.models.configurations import RegistryAuth, parse_run_configuration +from dstack._internal.core.models.common import RegistryAuth +from dstack._internal.core.models.configurations import ( + DevEnvironmentConfigurationParams, + PythonVersion, + RepoSpec, + ServiceConfiguration, + parse_run_configuration, +) from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.routers import ReplicaGroupRouterConfig class TestParseConfiguration: + def test_service_model_probes_none_when_omitted(self): + """When model is set but probes omitted, probes should remain None. 
+ The default probe is generated server-side in the job configurator.""" + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is None + + def test_service_model_does_not_override_explicit_probes(self): + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "probes": [{"type": "http", "url": "/https/github.com/health"}], + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is not None + assert len(parsed.probes) == 1 + assert parsed.probes[0].url == "/health" + + def test_service_model_explicit_empty_probes_no_default(self): + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "probes": [], + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is not None + assert len(parsed.probes) == 0 + def test_services_replicas_and_scaling(self): def test_conf(replicas: Any, scaling: Optional[Any] = None): conf = { @@ -51,6 +99,709 @@ def test_conf(replicas: Any, scaling: Optional[Any] = None): ) ) + def test_replica_group_router(self): + conf = { + "type": "service", + "port": 8000, + "replicas": [ + { + "name": "router", + "count": 1, + "commands": ["sglang serve"], + "router": {"type": "sglang"}, + }, + {"name": "worker", "count": 2, "commands": ["worker"]}, + ], + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.replicas is not None + assert isinstance(parsed.replicas, list) + router_g = next(g for g in parsed.replicas if g.name == "router") + assert isinstance(router_g.router, ReplicaGroupRouterConfig) + 
assert router_g.router.type == "sglang" + + def test_replica_group_router_forbids_service_level_router(self): + conf = { + "type": "service", + "port": 8000, + "router": {"type": "sglang"}, + "replicas": [ + { + "name": "router", + "count": 1, + "commands": ["sglang serve"], + "router": {"type": "sglang"}, + }, + {"name": "worker", "count": 2, "commands": ["worker"]}, + ], + } + with pytest.raises( + ConfigurationError, + match="Service-Level router configuration is not allowed together with replica-group", + ): + parse_run_configuration(conf) + + def test_spot_policy_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`spot_policy` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "spot_policy": "spot", + "replicas": [ + { + "count": 1, + "commands": ["x"], + "spot_policy": "on-demand", + }, + ], + } + ) + + def test_reservation_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`reservation` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "x", + "reservation": "svc-res", + "replicas": [ + { + "count": 1, + "reservation": "grp-res", + }, + ], + } + ) + + @pytest.mark.parametrize("shell", [None, "sh", "bash", "/usr/bin/zsh"]) + def test_shell_valid(self, shell: Optional[str]): + conf = { + "type": "task", + "shell": shell, + "commands": ["sleep inf"], + } + assert parse_run_configuration(conf).shell == shell + + def test_shell_invalid(self): + conf = { + "type": "task", + "shell": "zsh", + "commands": ["sleep inf"], + } + with pytest.raises( + ConfigurationError, match="The value must be `sh`, `bash`, or an absolute path" + ): + parse_run_configuration(conf) + + +class TestReplicaGroupContainerFields: + """Per-replica-group image-source fields: `image`, `docker`, `python`, + `nvcc`, `privileged`. 
Covers field-level mutex validators, the + cross-level no-mixing validator, the runnable-check validator, and + YAML coercion for `python`.""" + + def test_replica_group_accepts_image_python_nvcc_docker(self): + conf = { + "type": "service", + "port": 8000, + "replicas": [ + {"name": "a", "count": 1, "image": "nginx:latest", "commands": ["x"]}, + {"name": "b", "count": 1, "python": "3.12", "commands": ["x"]}, + {"name": "c", "count": 1, "nvcc": True, "commands": ["x"]}, + {"name": "d", "count": 1, "docker": True, "commands": ["x"]}, + ], + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + groups = {g.name: g for g in parsed.replicas} + assert groups["a"].image == "nginx:latest" + assert groups["b"].python == PythonVersion.PY312 + assert groups["c"].nvcc is True + assert groups["d"].docker is True + + def test_replica_group_accepts_privileged(self): + conf = { + "type": "service", + "port": 8000, + "replicas": [ + { + "name": "a", + "count": 1, + "image": "x", + "privileged": True, + "commands": ["x"], + }, + ], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].privileged is True + + @pytest.mark.parametrize( + "yaml_value,expected", + [ + (3.10, PythonVersion.PY310), + (3.12, PythonVersion.PY312), + ("3.10", PythonVersion.PY310), + ("3.12", PythonVersion.PY312), + ], + ) + def test_replica_group_python_yaml_coercion(self, yaml_value, expected): + """YAML may parse `3.10` as float 3.1 — must coerce back to '3.10'.""" + conf = { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "python": yaml_value, "commands": ["x"]}], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].python == expected + + def test_replica_group_image_python_mutex(self): + with pytest.raises( + ConfigurationError, + match="`image` and `python` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "image": "x", "python": 
"3.12", "commands": ["x"]}, + ], + } + ) + + def test_replica_group_image_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`image` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "image": "x", "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_python_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`python` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "python": "3.12", "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_nvcc_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`nvcc` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "nvcc": True, "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_python_nvcc_allowed_together(self): + """python + nvcc is the dstackai/base + CUDA combo, must be allowed.""" + conf = { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "python": "3.12", "nvcc": True, "commands": ["x"]}, + ], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].python == PythonVersion.PY312 + assert parsed.replicas[0].nvcc is True + + def test_replica_group_docker_with_privileged_false_rejected(self): + with pytest.raises( + ConfigurationError, + match="`privileged: false` is incompatible with `docker: true`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + { + "count": 1, + "docker": True, + "privileged": False, + "commands": ["x"], + }, + ], + } + ) + + def test_replica_group_docker_with_privileged_unset_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "docker": True, "commands": ["x"]}, + ], 
+ } + ) + + def test_image_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`image` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"count": 1, "image": "grp:1.0", "commands": ["x"]}, + ], + } + ) + + def test_docker_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`docker` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [ + {"count": 1, "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_python_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`python` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [ + {"count": 1, "python": "3.12", "commands": ["x"]}, + ], + } + ) + + def test_nvcc_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`nvcc` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [ + {"count": 1, "nvcc": True, "commands": ["x"]}, + ], + } + ) + + def test_privileged_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`privileged` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "privileged": True, + "replicas": [ + { + "count": 1, + "image": "x", + "privileged": True, + "commands": ["x"], + }, + ], + } + ) + + def test_image_at_service_with_groups_inheriting_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"count": 1, "commands": ["x"]}, + {"count": 1, "commands": ["x"]}, + ], + } + ) + + def test_docker_at_service_with_groups_inheriting_allowed(self): + """Service-level `docker: true` 
combined with groups that don't set + docker should parse cleanly — groups inherit the service-level value. + Guards against the no-mixing validator accidentally rejecting the + inherit case.""" + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [ + {"count": 1, "commands": ["x"]}, + {"count": 1, "commands": ["x"]}, + ], + } + ) + + def test_partial_mix_rejected(self): + """Service sets image; only one group overrides — still a mix.""" + with pytest.raises( + ConfigurationError, + match=r"replica group\(s\) \['b'\]", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"name": "a", "count": 1, "commands": ["x"]}, + {"name": "b", "count": 1, "image": "g:2", "commands": ["x"]}, + ], + } + ) + + # ---- Cross-level conflicting image sources ---- + # Validates `validate_no_conflicting_image_sources_across_levels`. + + def test_service_image_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_image_conflicts_with_group_python_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `python`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], + } + ) + + def test_service_image_conflicts_with_group_nvcc_rejected(self): + """Reviewer's exact example.""" + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `nvcc`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "nvcc": True, "commands": 
["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_python_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `python`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_nvcc_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `nvcc`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "nvcc": True, "commands": ["x"]}], + } + ) + + def test_service_python_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `python` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_python_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `python` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_nvcc_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `nvcc` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": 
"service", + "port": 8000, + "nvcc": True, + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_nvcc_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `nvcc` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_python_with_group_nvcc_allowed(self): + """`python` and `nvcc` are compatible base-image knobs and may + coexist across levels.""" + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [{"count": 1, "nvcc": True, "commands": ["x"]}], + } + ) + + def test_service_nvcc_with_group_python_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], + } + ) + + def test_replica_group_with_only_image_no_commands_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "image": "nginx:latest"}], + } + ) + + def test_replica_group_with_only_python_no_commands_rejected(self): + """`python` configures the base image but doesn't supply a runnable + workload — must be paired with `commands` or `image`. 
Matches + service-level behavior.""" + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "python": "3.12"}], + } + ) + + def test_replica_group_with_only_nvcc_no_commands_rejected(self): + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "nvcc": True}], + } + ) + + def test_replica_group_with_only_docker_no_commands_rejected(self): + """`docker: true` runs DIND but injects only `start-dockerd`; + without user commands the replica has no actual workload.""" + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "docker": True}], + } + ) + + def test_empty_replica_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1}], + } + ) + + def test_service_level_image_satisfies_groups_runnable_check(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [{"count": 1}, {"count": 1}], + } + ) + + +class TestRepoSpec: + @pytest.mark.parametrize("value", [".", "rel/path", "/abs/path/"]) + def test_parse_local_path_no_path(self, value: str): + assert RepoSpec.parse(value) == RepoSpec(local_path=value, path=".") + + @pytest.mark.parametrize( + ["value", "expected_repo_path"], + [[".:/repo", "."], ["rel/path:/repo", "rel/path"], ["/abs/path/:/repo", "/abs/path/"]], + ) + def test_parse_local_path_with_path(self, value: str, expected_repo_path: str): + assert RepoSpec.parse(value) == RepoSpec(local_path=expected_repo_path, path="/repo") + + 
def test_parse_windows_abs_local_path_no_path(self): + assert RepoSpec.parse("C:\\repo") == RepoSpec(local_path="C:\\repo", path=".") + + def test_parse_windows_abs_local_path_with_path(self): + assert RepoSpec.parse("C:\\repo:/repo") == RepoSpec(local_path="C:\\repo", path="/repo") + + def test_parse_url_no_path(self): + assert RepoSpec.parse("https://example.com/repo.git") == RepoSpec( + url="https://example.com/repo.git", path="." + ) + + def test_parse_url_with_path(self): + assert RepoSpec.parse("https://example.com/repo.git:/repo") == RepoSpec( + url="https://example.com/repo.git", path="/repo" + ) + + def test_parse_scp_no_path(self): + assert RepoSpec.parse("git@example.com:repo.git") == RepoSpec( + url="git@example.com:repo.git", path="." + ) + + def test_parse_scp_with_path(self): + assert RepoSpec.parse("git@example.com:repo.git:/repo") == RepoSpec( + url="git@example.com:repo.git", path="/repo" + ) + + @pytest.mark.parametrize("path", ["~", "~/repo"]) + def test_path_tilde(self, path: str): + assert RepoSpec(local_path=".", path=path).path == path + + def test_error_invalid_mapping_if_more_than_two_parts(self): + with pytest.raises(ValueError, match="Invalid repo"): + RepoSpec.parse("./foo:bar:baz") + + def test_error_local_path_url_mutually_exclusive(self): + with pytest.raises(ValueError, match="mutually exclusive"): + RepoSpec(local_path=".", url="https://example.com/repo.git") + + def test_error_local_path_or_url_required(self): + with pytest.raises(ValueError, match="must be specified"): + RepoSpec() + + def test_error_path_tilde_username_not_supported(self): + with pytest.raises(ValueError, match="syntax is not supported"): + RepoSpec(local_path=".", path="~alice/repo") + def test_registry_auth_hashable(): """ @@ -59,3 +810,68 @@ def test_registry_auth_hashable(): """ registry_auth = 
RegistryAuth(username="username", password="password") hash(registry_auth) + + +class TestDevEnvironmentConfigurationParams: + def test_windsurf_version_valid_format(self): + params = DevEnvironmentConfigurationParams( + ide="windsurf", version="1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + ) + assert params.ide == "windsurf" + assert params.version == "1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + + def test_windsurf_version_valid_short_commit(self): + params = DevEnvironmentConfigurationParams(ide="windsurf", version="1.0.0@abc123") + assert params.version == "1.0.0@abc123" + + def test_windsurf_version_empty_allowed(self): + params = DevEnvironmentConfigurationParams(ide="windsurf", version=None) + assert params.ide == "windsurf" + assert params.version is None + + def test_windsurf_version_invalid_missing_at(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0") + + def test_windsurf_version_invalid_missing_commit(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0@") + + def test_windsurf_version_invalid_missing_version(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams( + ide="windsurf", version="@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + ) + + def test_windsurf_version_invalid_non_hex_commit(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0@ghijklmnop") + + def test_vscode_version_not_validated(self): + params = DevEnvironmentConfigurationParams(ide="vscode", version="1.80.0") + assert params.ide == "vscode" + assert params.version == "1.80.0" + + def test_cursor_version_not_validated(self): + params = DevEnvironmentConfigurationParams(ide="cursor", version="0.40.0") + assert params.ide 
== "cursor" + assert params.version == "0.40.0" + + def test_zed_ide_allowed(self): + params = DevEnvironmentConfigurationParams(ide="zed") + assert params.ide == "zed" + assert params.version is None + + def test_zed_version_not_validated(self): + params = DevEnvironmentConfigurationParams(ide="zed", version="0.100.0") + assert params.ide == "zed" + assert params.version == "0.100.0" + + def test_ide_optional(self): + params = DevEnvironmentConfigurationParams() + assert params.ide is None + assert params.version is None + + def test_version_requires_ide(self): + with pytest.raises(ValueError, match="`version` requires `ide` to be set"): + DevEnvironmentConfigurationParams(version="1.80.0") diff --git a/src/tests/_internal/core/models/test_files.py b/src/tests/_internal/core/models/test_files.py new file mode 100644 index 0000000000..d2761d4e92 --- /dev/null +++ b/src/tests/_internal/core/models/test_files.py @@ -0,0 +1,29 @@ +import pytest +from pydantic import ValidationError + +from dstack._internal.core.models.files import FilePathMapping + + +class TestFilePathMapping: + @pytest.mark.parametrize("value", ["./file", "file", "~/file", "/file"]) + def test_parse_only_local_path(self, value: str): + assert FilePathMapping.parse(value) == FilePathMapping(local_path=value, path=value) + + def test_parse_both_paths(self): + assert FilePathMapping.parse("./foo:./bar") == FilePathMapping( + local_path="./foo", path="./bar" + ) + + def test_parse_windows_abs_path(self): + assert FilePathMapping.parse("C:\\dir:dir") == FilePathMapping( + local_path="C:\\dir", path="dir" + ) + + def test_error_invalid_mapping_if_more_than_two_parts(self): + with pytest.raises(ValueError, match="invalid file path mapping"): + FilePathMapping.parse("./foo:bar:baz") + + @pytest.mark.parametrize("value", ["C:\\", "d:/path/to"]) + def test_error_must_be_unix_path(self, value: str): + with pytest.raises(ValidationError, match="path must be a Unix file path"): + FilePathMapping.parse(value) 
diff --git a/src/tests/_internal/core/models/test_fleets.py b/src/tests/_internal/core/models/test_fleets.py new file mode 100644 index 0000000000..a9214f7ece --- /dev/null +++ b/src/tests/_internal/core/models/test_fleets.py @@ -0,0 +1,122 @@ +from typing import Any + +import pytest +from pydantic import ValidationError + +from dstack._internal.core.models.fleets import FleetConfiguration, FleetNodesSpec + + +class TestFleetConfiguration: + @pytest.mark.parametrize( + ["input_nodes", "expected_nodes"], + [ + pytest.param( + 1, + FleetNodesSpec( + min=1, + target=1, + max=1, + ), + id="int", + ), + pytest.param( + "1..2", + FleetNodesSpec( + min=1, + target=1, + max=2, + ), + id="closed-range", + ), + pytest.param( + "..2", + FleetNodesSpec( + min=0, + target=0, + max=2, + ), + id="range-without-min", + ), + pytest.param( + "1..", + FleetNodesSpec( + min=1, + target=1, + max=None, + ), + id="range-without-max", + ), + pytest.param( + { + "min": 1, + "max": 2, + }, + FleetNodesSpec( + min=1, + target=1, + max=2, + ), + id="dict-without-target", + ), + pytest.param( + { + "min": 1, + "target": 2, + "max": 3, + }, + FleetNodesSpec( + min=1, + target=2, + max=3, + ), + id="dict-with-all-attributes", + ), + pytest.param( + { + "target": 2, + "max": 3, + }, + FleetNodesSpec( + min=0, + target=2, + max=3, + ), + id="dict-without-min", + ), + pytest.param( + {}, + FleetNodesSpec( + min=0, + target=0, + max=None, + ), + id="dict-empty", + ), + ], + ) + def test_parses_nodes(self, input_nodes: Any, expected_nodes: FleetNodesSpec): + configuration_input = { + "type": "fleet", + "nodes": input_nodes, + } + configuration = FleetConfiguration.parse_obj(configuration_input) + assert configuration.nodes == expected_nodes + + @pytest.mark.parametrize( + ["input_nodes"], + [ + pytest.param("2..1", id="min-gt-max"), + pytest.param({"min": -1}, id="negative-min"), + pytest.param({"target": -1}, id="negative-target"), + pytest.param({"target": 2, "max": 1}, id="target-gt-max"), + 
pytest.param({"min": 2, "max": 1}, id="min-gt-max"), + pytest.param({"min": 2, "target": 1}, id="min-gt-target"), + ], + ) + def test_rejects_nodes(self, input_nodes: Any): + configuration_input = { + "type": "fleet", + "nodes": input_nodes, + } + with pytest.raises(ValidationError): + FleetConfiguration.parse_obj(configuration_input) diff --git a/src/tests/_internal/core/models/test_instances.py b/src/tests/_internal/core/models/test_instances.py new file mode 100644 index 0000000000..bedc708fa0 --- /dev/null +++ b/src/tests/_internal/core/models/test_instances.py @@ -0,0 +1,36 @@ +from gpuhunt import AcceleratorVendor + +from dstack._internal.core.models.instances import Gpu + + +class TestGpu: + def test_no_vendor_nvidia(self): + gpu = Gpu.parse_obj( + { + "name": "T4", + "memory_mib": 16, + } + ) + assert gpu.vendor == AcceleratorVendor.NVIDIA + assert gpu.name == "T4" + + def test_no_vendor_tpu(self): + gpu = Gpu.parse_obj( + { + "name": "tpu-v3", + "memory_mib": 0, + } + ) + assert gpu.vendor == AcceleratorVendor.GOOGLE + assert gpu.name == "v3" + + def test_vendor_cast_to_enum(self): + gpu = Gpu.parse_obj( + { + "vendor": "AMD", + "name": "MI300X", + "memory_mib": 192, + } + ) + assert gpu.vendor == AcceleratorVendor.AMD + assert gpu.name == "MI300X" diff --git a/src/tests/_internal/core/models/test_profiles.py b/src/tests/_internal/core/models/test_profiles.py new file mode 100644 index 0000000000..246435f6f4 --- /dev/null +++ b/src/tests/_internal/core/models/test_profiles.py @@ -0,0 +1,110 @@ +import pytest +from pydantic import ValidationError + +from dstack._internal.core.backends.vastai.profile_options import VastAIProfileOptions +from dstack._internal.core.compatibility.common import get_profile_excludes +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.profiles import ( + FleetInstanceSelector, + InstanceHostnameSelector, + InstanceNameSelector, + Profile, +) + + +class 
TestValidateProfileBackendOptions: + def test_duplicate_backend_type_raises_validation_error(self): + with pytest.raises(ValidationError, match="duplicate entry for backend 'vastai'"): + Profile( + backend_options=[ + VastAIProfileOptions(min_score=100), + VastAIProfileOptions(min_score=200), + ] + ) + + def test_single_entry_per_backend_is_valid(self): + profile = Profile(backend_options=[VastAIProfileOptions(min_score=100)]) + assert profile.backend_options is not None + assert len(profile.backend_options) == 1 + + def test_none_backend_options_is_valid(self): + profile = Profile(backend_options=None) + assert profile.backend_options is None + + def test_empty_list_backend_options_is_valid(self): + profile = Profile(backend_options=[]) + assert profile.backend_options == [] + + +class TestProfileInstances: + def test_string_is_parsed_as_instance_name_selector(self): + profile = Profile.parse_obj({"instances": ["my-fleet-1"]}) + + assert profile.instances == [InstanceNameSelector(name="my-fleet-1")] + + @pytest.mark.parametrize( + ("value", "expected"), + [ + ({"name": "my-fleet-1"}, InstanceNameSelector(name="my-fleet-1")), + ({"hostname": "worker-1"}, InstanceHostnameSelector(hostname="worker-1")), + ( + {"fleet": "my-fleet", "instance": 3}, + FleetInstanceSelector(fleet="my-fleet", instance=3), + ), + ( + {"fleet": "other-project/my-fleet", "instance": 3}, + FleetInstanceSelector(fleet="other-project/my-fleet", instance=3), + ), + ], + ) + def test_object_selectors_are_parsed(self, value, expected): + profile = Profile.parse_obj({"instances": [value]}) + + assert profile.instances == [expected] + + def test_parses_fleet_selector_object_notation(self): + profile = Profile.parse_obj( + {"instances": [{"fleet": {"project": "main", "name": "my-fleet"}, "instance": 0}]} + ) + + assert profile.instances == [ + FleetInstanceSelector( + fleet=EntityReference(project="main", name="my-fleet"), instance=0 + ) + ] + + @pytest.mark.parametrize( + "value", + [ + "", + 
{"name": "my-fleet-1", "hostname": "worker-1"}, + {"name": ""}, + {"hostname": ""}, + {"fleet": "", "instance": 0}, + {"fleet": "project/name/extra", "instance": 0}, + {"fleet": "my-fleet"}, + {"fleet": "my-fleet", "instance": -1}, + {"hostname": "worker-1", "extra": "value"}, + ], + ) + def test_invalid_selector_is_rejected(self, value): + with pytest.raises(ValidationError): + Profile.parse_obj({"instances": [value]}) + + def test_empty_instances_list_is_rejected(self): + with pytest.raises(ValidationError): + Profile.parse_obj({"instances": []}) + + +class TestProfileInstancesCompatibilityExcludes: + def test_excludes_unset_instances(self): + profile = Profile() + + assert "instances" not in profile.dict(exclude=get_profile_excludes(profile)) + + def test_preserves_configured_instances(self): + profile = Profile(instances=[InstanceNameSelector(name="my-fleet-1")]) + + assert profile.dict(exclude=get_profile_excludes(profile))["instances"] == [ + {"name": "my-fleet-1"} + ] diff --git a/src/tests/_internal/core/models/test_resources.py b/src/tests/_internal/core/models/test_resources.py index 578a772eb7..5da32ec8f2 100644 --- a/src/tests/_internal/core/models/test_resources.py +++ b/src/tests/_internal/core/models/test_resources.py @@ -1,7 +1,17 @@ +from typing import Optional + import pytest +from gpuhunt import AcceleratorVendor, CPUArchitecture from pydantic import ValidationError, parse_obj_as -from dstack._internal.core.models.resources import ComputeCapability, GPUSpec, Memory, Range +from dstack._internal.core.models.resources import ( + DEFAULT_CPU_COUNT, + ComputeCapability, + CPUSpec, + GPUSpec, + Memory, + Range, +) class TestMemory: @@ -99,13 +109,126 @@ def test_dict(self): ) +class TestCPU: + def test_integer(self): + assert parse_obj_as(CPUSpec, 1).dict() == {"arch": None, "count": {"min": 1, "max": 1}} + + @pytest.mark.parametrize( + ["value", "expected_arch", "expected_min", "expected_max"], + [ + ["1..2", None, 1, 2], + ["X86", 
CPUArchitecture.X86, DEFAULT_CPU_COUNT.min, DEFAULT_CPU_COUNT.max], + ["x86:2", CPUArchitecture.X86, 2, 2], + ["2..:ARM", CPUArchitecture.ARM, 2, None], + ], + ) + def test_valid_string( + self, + value: str, + expected_arch: Optional[CPUArchitecture], + expected_min: Optional[int], + expected_max: Optional[int], + ): + assert parse_obj_as(CPUSpec, value).dict() == { + "arch": expected_arch, + "count": {"min": expected_min, "max": expected_max}, + } + + @pytest.mark.parametrize( + ["value", "error"], + [ + ["arm:", "CPU spec contains empty token"], + ["2:foo", "Invalid CPU architecture"], + ["arm:x86", "CPU spec arch conflict"], + ["2:arm:2", "CPU spec count conflict"], + ], + ) + def test_invalid_string(self, value: str, error: str): + with pytest.raises(ValidationError, match=error): + parse_obj_as(CPUSpec, value) + + def test_range_object(self): + assert parse_obj_as(CPUSpec, Range[int](min=1, max=2)).dict() == { + "arch": None, + "count": {"min": 1, "max": 2}, + } + + def test_range_dict(self): + assert parse_obj_as(CPUSpec, {"min": 1, "max": 2}).dict() == { + "arch": None, + "count": {"min": 1, "max": 2}, + } + + def test_valid_dict(self): + assert parse_obj_as(CPUSpec, {"arch": "ARM", "count": {"min": 1, "max": 2}}).dict() == { + "arch": CPUArchitecture.ARM, + "count": {"min": 1, "max": 2}, + } + + def test_invalid_dict(self): + with pytest.raises(ValidationError): + parse_obj_as(CPUSpec, {"arch": "x86", "min": 1, "max": 2}) + + class TestGPU: def test_count(self): assert parse_obj_as(GPUSpec, "1") == parse_obj_as(GPUSpec, {"count": 1}) + @pytest.mark.parametrize( + ["value", "expected"], + [ + pytest.param( + "Nvidia", {"vendor": AcceleratorVendor.NVIDIA}, id="vendor-only-mixedcase" + ), + pytest.param( + "google:v3-64", + {"vendor": AcceleratorVendor.GOOGLE, "name": ["v3-64"]}, + id="vendor-lowercase-and-name", + ), + pytest.param( + "tpu:v5p-1024", + {"vendor": AcceleratorVendor.GOOGLE, "name": ["v5p-1024"]}, + id="tpu-lowercase-and-name", + ), + 
pytest.param( + "v5litepod-64:TPU", + {"vendor": AcceleratorVendor.GOOGLE, "name": ["v5litepod-64"]}, + id="name-and-tpu-uppercase", + ), + pytest.param( + "MI300X:AMD", + {"vendor": AcceleratorVendor.AMD, "name": ["MI300X"]}, + id="name-and-vendor-uppercase", + ), + ], + ) + def test_vendor_in_string_form(self, value, expected): + assert parse_obj_as(GPUSpec, value) == parse_obj_as(GPUSpec, expected) + + @pytest.mark.parametrize( + ["value", "expected"], + [ + pytest.param(None, None, id="null"), + pytest.param("NVIDIA", AcceleratorVendor.NVIDIA, id="uppercase"), + pytest.param("amd", AcceleratorVendor.AMD, id="lowercase"), + pytest.param("Google", AcceleratorVendor.GOOGLE, id="mixedcase"), + pytest.param("tpu", AcceleratorVendor.GOOGLE, id="tpu-lowercase"), + pytest.param("TPU", AcceleratorVendor.GOOGLE, id="tpu-uppercase"), + pytest.param(AcceleratorVendor.GOOGLE, AcceleratorVendor.GOOGLE, id="enum-value"), + ], + ) + def test_vendor_in_object_form(self, value, expected): + assert parse_obj_as(GPUSpec, {"vendor": value}) == parse_obj_as( + GPUSpec, {"vendor": expected} + ) + def test_name(self): assert parse_obj_as(GPUSpec, "A100") == parse_obj_as(GPUSpec, {"name": ["A100"]}) + def test_name_with_tpu_prefix(self): + spec = parse_obj_as(GPUSpec, "tpu-v3-2048") + assert spec.name == ["v3-2048"] + def test_memory(self): assert parse_obj_as(GPUSpec, "16GB") == parse_obj_as(GPUSpec, {"memory": "16GB"}) @@ -122,11 +245,36 @@ def test_empty_token(self): with pytest.raises(ValidationError): parse_obj_as(GPUSpec, "A100:") - def test_conflict(self): - with pytest.raises(ValidationError): + def test_vendor_conflict(self): + with pytest.raises(ValidationError, match=r"vendor conflict"): + parse_obj_as(GPUSpec, "Nvidia:A100:2:AMD") + + def test_count_conflict(self): + with pytest.raises(ValidationError, match=r"count conflict"): parse_obj_as(GPUSpec, "A100:2:3") def test_memory_range(self): assert parse_obj_as(GPUSpec, "16GB..32") == parse_obj_as( GPUSpec, {"memory": {"min": 
16, "max": 32}} ) + + +@pytest.mark.parametrize( + ("r1", "r2", "intersection"), + [ + (Range[int](min=1, max=2), Range[int](min=3, max=4), None), + (Range[int](min=1, max=2), Range[int](min=2, max=3), Range[int](min=2, max=2)), + (Range[int](min=1, max=2), Range[int](min=1, max=2), Range[int](min=1, max=2)), + (Range[int](min=1, max=3), Range[int](min=2, max=4), Range[int](min=2, max=3)), + (Range[int](min=1, max=4), Range[int](min=2, max=3), Range[int](min=2, max=3)), + (Range[int](min=None, max=1), Range[int](min=2, max=None), None), + (Range[int](min=None, max=1), Range[int](min=1, max=None), Range[int](min=1, max=1)), + (Range[int](min=None, max=2), Range[int](min=1, max=None), Range[int](min=1, max=2)), + (Range[int](min=None, max=1), Range[int](min=None, max=2), Range[int](min=None, max=1)), + (Range[int](min=1, max=None), Range[int](min=2, max=None), Range[int](min=2, max=None)), + (Range[int](min=1, max=None), Range[int](min=None, max=2), Range[int](min=1, max=2)), + ], +) +def test_intersect_ranges(r1: Range[int], r2: Range[int], intersection: Range[int]) -> None: + assert r1.intersect(r2) == intersection + assert r2.intersect(r1) == intersection diff --git a/src/tests/_internal/core/models/test_runs.py b/src/tests/_internal/core/models/test_runs.py index 576eba61a1..7968845f4e 100644 --- a/src/tests/_internal/core/models/test_runs.py +++ b/src/tests/_internal/core/models/test_runs.py @@ -1,6 +1,11 @@ +import pytest +from pydantic import ValidationError + +from dstack._internal.core.models.profiles import RetryEvent from dstack._internal.core.models.runs import ( JobStatus, JobTerminationReason, + RunSpec, RunStatus, RunTerminationReason, ) @@ -18,7 +23,142 @@ def test_run_termination_reason_to_status_works_with_all_enum_variants(): assert isinstance(run_status, RunStatus) -def test_job_termination_reason_to_status_works_with_all_enum_varians(): +def test_job_termination_reason_to_status_works_with_all_enum_variants(): for job_termination_reason in 
JobTerminationReason: job_status = job_termination_reason.to_status() assert isinstance(job_status, JobStatus) + + +def test_job_termination_reason_to_retry_event_works_with_all_enum_variants(): + for job_termination_reason in JobTerminationReason: + retry_event = job_termination_reason.to_retry_event() + assert retry_event is None or isinstance(retry_event, RetryEvent) + + +# Will fail if JobTerminationReason value is added without updating JobSubmission._get_error +def test_get_error_returns_expected_messages(): + # already handled and shown in status_message + no_error_reasons = [ + JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED, + JobTerminationReason.TERMINATED_BY_USER, + JobTerminationReason.DONE_BY_RUNNER, + JobTerminationReason.ABORTED_BY_USER, + JobTerminationReason.TERMINATED_BY_SERVER, + JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + ] + + for reason in JobTerminationReason: + if reason.to_error() is None: + # Fail no-error reason is not in the list + assert reason in no_error_reasons + + +# Will fail if RunTerminationReason value is added without updating Run._get_error +def test_run_get_error_returns_none_for_specific_reasons(): + no_error_reasons = [ + RunTerminationReason.ALL_JOBS_DONE, + RunTerminationReason.JOB_FAILED, + RunTerminationReason.STOPPED_BY_USER, + RunTerminationReason.ABORTED_BY_USER, + ] + + for reason in RunTerminationReason: + if reason.to_error() is None: + # Fail no-error reason is not in the list + assert reason in no_error_reasons + + +def _service_run_spec_dict(router_type=None, retry=None, top_level_extras=None): + """Build a minimal RunSpec dict for a service. + + `router_type`: None | "sglang" | "dynamo" — controls whether/how the + second replica group has a router field. + `retry`: optional dict passed as `profile.retry`. + `top_level_extras`: optional dict merged into the service configuration. 
+ """ + replicas = [{"name": "worker", "commands": ["echo hi"], "count": 1}] + if router_type is not None: + replicas.append( + { + "name": "router", + "router": {"type": router_type}, + "commands": ["echo router"], + "count": 1, + } + ) + configuration = { + "type": "service", + "port": 8000, + "replicas": replicas, + } + if top_level_extras: + configuration.update(top_level_extras) + profile = {"name": "default"} + if retry is not None: + profile["retry"] = retry + return { + "run_name": "test-run", + "repo_id": "test-repo", + "configuration_path": "dstack.yaml", + "configuration": configuration, + "profile": profile, + "ssh_key_pub": "ssh-rsa AAAA...", + "repo_data": {"repo_type": "virtual"}, + } + + +class TestDynamoNoRetryValidator: + def test_dynamo_router_with_retry_at_profile_level_is_rejected(self): + spec = _service_run_spec_dict( + router_type="dynamo", + retry={"on_events": ["error"]}, + ) + with pytest.raises(ValidationError, match="Dynamo"): + RunSpec.parse_obj(spec) + + def test_dynamo_router_with_retry_in_configuration_is_rejected(self): + # retry can also be specified at configuration level; _merged_profile + # folds it into merged_profile.retry, so the validator should still + # catch it. + spec = _service_run_spec_dict( + router_type="dynamo", + top_level_extras={"retry": {"on_events": ["error"]}}, + ) + with pytest.raises(ValidationError, match="Dynamo"): + RunSpec.parse_obj(spec) + + def test_dynamo_router_without_retry_is_accepted(self): + spec = _service_run_spec_dict(router_type="dynamo", retry=None) + # Should not raise: + RunSpec.parse_obj(spec) + + def test_sglang_router_with_retry_is_accepted(self): + spec = _service_run_spec_dict( + router_type="sglang", + retry={"on_events": ["error"]}, + ) + # SGLang services are unaffected by the validator. 
+ RunSpec.parse_obj(spec) + + def test_service_without_router_with_retry_is_accepted(self): + spec = _service_run_spec_dict(router_type=None, retry={"on_events": ["error"]}) + RunSpec.parse_obj(spec) + + def test_non_service_run_with_retry_is_accepted(self): + # Validator is service-only. A task or dev-environment with retry + # shouldn't be flagged regardless of the rest of the spec. + spec = { + "run_name": "test-run", + "repo_id": "test-repo", + "configuration_path": "dstack.yaml", + "configuration": { + "type": "task", + "commands": ["echo hi"], + }, + "profile": {"name": "default", "retry": {"on_events": ["error"]}}, + "ssh_key_pub": "ssh-rsa AAAA...", + "repo_data": {"repo_type": "virtual"}, + } + RunSpec.parse_obj(spec) diff --git a/src/tests/_internal/core/models/test_templates.py b/src/tests/_internal/core/models/test_templates.py new file mode 100644 index 0000000000..e73decbb0c --- /dev/null +++ b/src/tests/_internal/core/models/test_templates.py @@ -0,0 +1,264 @@ +import pytest +from pydantic import ValidationError + +from dstack._internal.core.models.templates import ( + EnvUITemplateParameter, + IDEUITemplateParameter, + NameUITemplateParameter, + PythonOrDockerUITemplateParameter, + RepoUITemplateParameter, + ResourcesUITemplateParameter, + UITemplate, + WorkingDirUITemplateParameter, +) + + +class TestUITemplateParameter: + def test_parses_name_parameter(self): + data = {"type": "name"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert len(template.parameters) == 1 + assert isinstance(template.parameters[0], NameUITemplateParameter) + + def test_parses_ide_parameter(self): + data = {"type": "ide"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert isinstance(template.parameters[0], IDEUITemplateParameter) + + def 
test_parses_resources_parameter(self): + data = {"type": "resources"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert isinstance(template.parameters[0], ResourcesUITemplateParameter) + + def test_parses_python_or_docker_parameter(self): + data = {"type": "python_or_docker"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert isinstance(template.parameters[0], PythonOrDockerUITemplateParameter) + + def test_parses_repo_parameter(self): + data = {"type": "repo"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert isinstance(template.parameters[0], RepoUITemplateParameter) + + def test_parses_working_dir_parameter(self): + data = {"type": "working_dir"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + assert isinstance(template.parameters[0], WorkingDirUITemplateParameter) + + def test_parses_env_parameter_with_all_fields(self): + data = { + "type": "env", + "title": "Password", + "name": "PASSWORD", + "value": "$random-password", + } + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + param = template.parameters[0] + assert isinstance(param, EnvUITemplateParameter) + assert param.title == "Password" + assert param.name == "PASSWORD" + assert param.value == "$random-password" + + def test_parses_env_parameter_with_no_optional_fields(self): + data = {"type": "env"} + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + param = template.parameters[0] + assert 
isinstance(param, EnvUITemplateParameter) + assert param.title is None + assert param.name is None + assert param.value is None + + def test_rejects_unknown_parameter_type(self): + data = {"type": "unknown_type"} + with pytest.raises(ValidationError): + UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "parameters": [data], + "configuration": {}, + } + ) + + +class TestUITemplate: + def test_parses_desktop_ide_template(self): + data = { + "type": "template", + "name": "desktop-ide", + "title": "Desktop IDE", + "description": "Access the instance from your desktop VS Code, Cursor, or Windsurf.", + "parameters": [ + {"type": "name"}, + {"type": "ide"}, + {"type": "resources"}, + {"type": "python_or_docker"}, + {"type": "repo"}, + {"type": "working_dir"}, + ], + "configuration": {"type": "dev-environment"}, + } + template = UITemplate.parse_obj(data) + assert template.name == "desktop-ide" + assert template.title == "Desktop IDE" + assert ( + template.description + == "Access the instance from your desktop VS Code, Cursor, or Windsurf." 
+ ) + assert len(template.parameters) == 6 + assert template.configuration == {"type": "dev-environment"} + + def test_parses_web_based_ide_template(self): + data = { + "type": "template", + "name": "in-browser-ide", + "title": "In-browser IDE", + "description": "Access the instance using VS Code in the browser.", + "parameters": [ + {"type": "name"}, + {"type": "resources"}, + {"type": "python_or_docker"}, + {"type": "repo"}, + {"type": "working_dir"}, + { + "type": "env", + "title": "Password", + "name": "PASSWORD", + "value": "$random-password", + }, + ], + "configuration": { + "type": "service", + "auth": False, + "https": "auto", + "env": ["BIND_ADDR=0.0.0.0:8080"], + "commands": ["echo hello"], + "port": 8080, + "probes": [{"type": "http", "url": "/https/github.com/healthz"}], + }, + } + template = UITemplate.parse_obj(data) + assert template.name == "in-browser-ide" + assert template.title == "In-browser IDE" + assert len(template.parameters) == 6 + assert isinstance(template.parameters[5], EnvUITemplateParameter) + assert template.configuration["type"] == "service" + assert template.configuration["port"] == 8080 + + def test_rejects_wrong_type(self): + with pytest.raises(ValidationError): + UITemplate.parse_obj( + { + "type": "not-a-template", + "name": "t", + "title": "T", + "configuration": {}, + } + ) + + def test_rejects_missing_configuration(self): + with pytest.raises(ValidationError): + UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + } + ) + + def test_rejects_missing_name(self): + with pytest.raises(ValidationError): + UITemplate.parse_obj( + { + "type": "template", + "title": "T", + "configuration": {}, + } + ) + + def test_empty_parameters_default(self): + template = UITemplate.parse_obj( + { + "type": "template", + "name": "t", + "title": "T", + "configuration": {"type": "task"}, + } + ) + assert template.parameters == [] + + def test_description_is_optional(self): + template = UITemplate.parse_obj( + { + "type": 
"template", + "name": "t", + "title": "T", + "configuration": {"type": "task"}, + } + ) + assert template.description is None diff --git a/src/tests/_internal/core/models/test_unix.py b/src/tests/_internal/core/models/test_unix.py new file mode 100644 index 0000000000..777987daba --- /dev/null +++ b/src/tests/_internal/core/models/test_unix.py @@ -0,0 +1,43 @@ +import pytest + +from dstack._internal.core.models.unix import UnixUser + + +class TestUnixUser: + @pytest.mark.parametrize( + ["value", "expected"], + [ + ["0", UnixUser(uid=0)], + ["1000", UnixUser(uid=1000)], + ["debian", UnixUser(username="debian")], + ["1000:2000", UnixUser(uid=1000, gid=2000)], + ["1000:wheel", UnixUser(uid=1000, groupname="wheel")], + ["root:0", UnixUser(username="root", gid=0)], + ["admin:wheel", UnixUser(username="admin", groupname="wheel")], + ], + ) + def test_ok(self, value: str, expected: UnixUser): + assert UnixUser.parse(value) == expected + + @pytest.mark.parametrize("value", ["1000:1000:", "user:group:foo:bar"]) + def test_too_many_parts(self, value: str): + with pytest.raises(ValueError, match="too many parts"): + UnixUser.parse(value) + + @pytest.mark.parametrize("value", ["", ":group"]) + def test_empty_user(self, value: str): + with pytest.raises(ValueError, match="empty user name or id"): + UnixUser.parse(value) + + @pytest.mark.parametrize("value", ["-1", "-1:group"]) + def test_negative_uid(self, value: str): + with pytest.raises(ValueError, match="negative uid"): + UnixUser.parse(value) + + def test_empty_group(self): + with pytest.raises(ValueError, match="empty group name or id"): + UnixUser.parse("user:") + + def test_negative_gid(self): + with pytest.raises(ValueError, match="negative gid"): + UnixUser.parse("1000:-1000") diff --git a/src/tests/_internal/core/models/test_volumes.py b/src/tests/_internal/core/models/test_volumes.py new file mode 100644 index 0000000000..3fddfb92fd --- /dev/null +++ b/src/tests/_internal/core/models/test_volumes.py @@ -0,0 +1,99 @@ 
+import pytest +from pydantic import ValidationError, parse_obj_as + +from dstack._internal.core.models.volumes import ( + InstanceMountPoint, + VolumeMountPoint, + parse_mount_point, +) + + +class TestVolumeMountPoint: + def test_parse(self): + assert VolumeMountPoint.parse("my-vol:/path/./to///dir/") == VolumeMountPoint( + name="my-vol", path="/path/to/dir" + ) + + def test_path_normalization(self): + assert parse_obj_as( + VolumeMountPoint, {"name": "my-vol", "path": "/path/./to///dir/"} + ) == VolumeMountPoint(name="my-vol", path="/path/to/dir") + + @pytest.mark.parametrize("value", ["my-vol", "my-vol:/run:ro"]) + def test_parse_error_invalid_format(self, value: str): + with pytest.raises(ValueError, match="invalid mount point format"): + VolumeMountPoint.parse(value) + + def test_validation_error_empty_path(self): + with pytest.raises(ValidationError, match="empty path"): + parse_obj_as(VolumeMountPoint, {"name": "vol", "path": ""}) + + def test_validation_error_rel_path(self): + with pytest.raises(ValidationError, match="path must be absolute"): + parse_obj_as(VolumeMountPoint, {"name": "vol", "path": "rel/path"}) + + def test_validation_error_parent_dir(self): + with pytest.raises(ValidationError, match=r"\.\. 
are not allowed"): + parse_obj_as(VolumeMountPoint, {"name": "vol", "path": "/path/../to"}) + + +class TestInstanceBindMountPoint: + def test_parse(self): + assert InstanceMountPoint.parse("/host/.//path/:/run//./path") == InstanceMountPoint( + instance_path="/host/path", path="/run/path" + ) + + def test_path_normalization(self): + assert parse_obj_as( + InstanceMountPoint, {"instance_path": "/host/.//path/", "path": "/run//./path"} + ) == InstanceMountPoint(instance_path="/host/path", path="/run/path") + + @pytest.mark.parametrize("value", ["/path", "/host/path:/run/path:ro"]) + def test_parse_error_invalid_format(self, value: str): + with pytest.raises(ValueError, match="invalid mount point format"): + InstanceMountPoint.parse(value) + + @pytest.mark.parametrize("field", ["instance_path", "path"]) + def test_validation_error_empty_path(self, field: str): + data = {"instance_path": "/instance_path", "path": "/run_path"} + data[field] = "" + with pytest.raises(ValidationError, match="empty path"): + parse_obj_as(InstanceMountPoint, data) + + @pytest.mark.parametrize("field", ["instance_path", "path"]) + def test_validation_error_rel_path(self, field: str): + data = {"instance_path": "/instance_path", "path": "/run_path"} + data[field] = "./rel/path" + with pytest.raises(ValidationError, match="path must be absolute"): + parse_obj_as(InstanceMountPoint, data) + + @pytest.mark.parametrize("field", ["instance_path", "path"]) + def test_validation_error_parent_dir(self, field: str): + data = {"instance_path": "/instance_path", "path": "/run_path"} + data[field] = "/path/../to" + with pytest.raises(ValidationError, match=r"\.\. 
are not allowed"): + parse_obj_as(InstanceMountPoint, data) + + +class TestParseMountPoint: + def test_parse_volume_mount(self): + assert parse_mount_point("my-vol:/path//to") == VolumeMountPoint( + name="my-vol", path="/path/to" + ) + + def test_parse_instance_mount(self): + assert parse_mount_point("/host:/run/") == InstanceMountPoint( + instance_path="/host", path="/run" + ) + + @pytest.mark.parametrize( + "value", ["my-vol", "my-vol:/run:ro", "/path", "/host/path:/run/path:ro"] + ) + def test_parse_error_invalid_format(self, value: str): + with pytest.raises(ValueError, match="invalid mount point format"): + parse_mount_point(value) + + @pytest.mark.parametrize("value", ["path/to:/run", "./path:/run", "path/:/run"]) + def test_validation_error_rel_local_path(self, value: str): + with pytest.raises(ValidationError, match="path must be absolute"): + parse_mount_point(value) diff --git a/src/tests/_internal/core/services/ssh/__init__.py b/src/tests/_internal/core/services/ssh/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/services/ssh/test_client.py b/src/tests/_internal/core/services/ssh/test_client.py new file mode 100644 index 0000000000..a5ea64d4a4 --- /dev/null +++ b/src/tests/_internal/core/services/ssh/test_client.py @@ -0,0 +1,76 @@ +from pathlib import Path + +import pytest + +from dstack._internal.core.services.ssh.client import SSHClientInfo + + +class TestSSHClientInfo: + def test_openbsd(self): + path = Path("/usr/bin/ssh") + info = SSHClientInfo.from_raw_version("OpenSSH_9.7, LibreSSL 3.9.0", path) + assert info == SSHClientInfo( + path=path, + version="9.7", + version_tuple=(9, 7), + for_windows=False, + supports_control_socket=True, + supports_multiplexing=True, + supports_background_mode=True, + ) + + def test_linux(self): + path = Path("/usr/bin/ssh") + info = SSHClientInfo.from_raw_version( + "OpenSSH_9.2p1 Debian-2+deb12u3, OpenSSL 3.0.13 30 Jan 2024", path + ) + assert info == SSHClientInfo( 
+ path=path, + version="9.2p1", + version_tuple=(9, 2), + for_windows=False, + supports_control_socket=True, + supports_multiplexing=True, + supports_background_mode=True, + ) + + def test_macos(self): + path = Path("/usr/bin/ssh") + info = SSHClientInfo.from_raw_version("OpenSSH_9.7p1, LibreSSL 3.3.6", path) + assert info == SSHClientInfo( + path=path, + version="9.7p1", + version_tuple=(9, 7), + for_windows=False, + supports_control_socket=True, + supports_multiplexing=True, + supports_background_mode=True, + ) + + @pytest.mark.windows_only + def test_windows_msys2(self): + path = Path("C:\\Program Files\\Git\\usr\\bin\\ssh.exe") + info = SSHClientInfo.from_raw_version("OpenSSH_9.8p1, OpenSSL 3.2.2 4 Jun 2024", path) + assert info == SSHClientInfo( + path=path, + version="9.8p1", + version_tuple=(9, 8), + for_windows=False, + supports_control_socket=True, + supports_multiplexing=False, + supports_background_mode=True, + ) + + @pytest.mark.windows_only + def test_windows_for_windows(self): + path = Path("C:\\Windows\\System32\\OpenSSH\\ssh.exe") + info = SSHClientInfo.from_raw_version("OpenSSH_for_Windows_8.6p1, LibreSSL 3.4.3", path) + assert info == SSHClientInfo( + path=path, + version="8.6p1", + version_tuple=(8, 6), + for_windows=True, + supports_control_socket=False, + supports_multiplexing=False, + supports_background_mode=False, + ) diff --git a/src/tests/_internal/core/services/ssh/test_key_manager.py b/src/tests/_internal/core/services/ssh/test_key_manager.py new file mode 100644 index 0000000000..7792c70377 --- /dev/null +++ b/src/tests/_internal/core/services/ssh/test_key_manager.py @@ -0,0 +1,89 @@ +import os +import time +import uuid +from datetime import datetime +from pathlib import Path +from unittest.mock import Mock + +from dstack._internal.core.models.users import ( + GlobalRole, + User, + UserPermissions, + UserTokenCreds, + UserWithCreds, +) +from dstack._internal.core.services.ssh.key_manager import ( + KEY_REFRESH_RATE, + UserSSHKeyManager, 
+) + +SAMPLE_USER = UserWithCreds( + id=uuid.uuid4(), + username="test", + created_at=datetime.now(), + global_role=GlobalRole.USER, + active=True, + email="test@example.com", + permissions=UserPermissions(can_create_projects=False), + creds=UserTokenCreds(token="7f92121b-a1b9-4ff2-8c0e-39070ffcd964"), + ssh_public_key="ssh-rsa AAA.public", + ssh_private_key="-----BEGIN PRIVATE KEY-----\nPRIVATE\n-----END PRIVATE KEY-----", +) +SAMPLE_USER_TOKEN_HASH = "4f010545" # sha1(SAMPLE_USER.creds.token.encode()).hexdigest[:8] + + +def make_api_client(user: User, token_hash: str): + api_client = Mock() + api_client.get_token_hash.return_value = token_hash + api_client.users = Mock() + api_client.users.get_my_user.return_value = user + return api_client + + +def set_mtime(path: Path, ts: float): + os.utime(path, (ts, ts)) + + +def test_get_user_key_downloads_keys(tmp_path: Path): + api_client = make_api_client(user=SAMPLE_USER, token_hash=SAMPLE_USER_TOKEN_HASH) + manager = UserSSHKeyManager(api_client, tmp_path) + + key = manager.get_user_key() + assert key is not None + assert key.public_key == SAMPLE_USER.ssh_public_key + assert key.private_key_path == tmp_path / SAMPLE_USER_TOKEN_HASH + assert (tmp_path / SAMPLE_USER_TOKEN_HASH).read_text() == SAMPLE_USER.ssh_private_key + assert (tmp_path / f"{SAMPLE_USER_TOKEN_HASH}.pub").read_text() == SAMPLE_USER.ssh_public_key + + +def test_get_user_key_uses_existing_key(tmp_path: Path): + api_client = make_api_client(user=SAMPLE_USER, token_hash=SAMPLE_USER_TOKEN_HASH) + (tmp_path / SAMPLE_USER_TOKEN_HASH).write_text("private-contents") + (tmp_path / f"{SAMPLE_USER_TOKEN_HASH}.pub").write_text("public-contents") + + manager = UserSSHKeyManager(api_client, tmp_path) + key = manager.get_user_key() + + assert api_client.users.get_my_user.call_count == 0 + assert key is not None + assert key.public_key == "public-contents" + assert key.private_key_path == (tmp_path / SAMPLE_USER_TOKEN_HASH) + + +def 
test_get_user_key_redownloads_expired_key(tmp_path: Path): + api_client = make_api_client(user=SAMPLE_USER, token_hash=SAMPLE_USER_TOKEN_HASH) + priv = tmp_path / SAMPLE_USER_TOKEN_HASH + pub = tmp_path / f"{SAMPLE_USER_TOKEN_HASH}.pub" + priv.write_text("old-private") + pub.write_text("old-public") + stale_ts = time.time() - KEY_REFRESH_RATE.total_seconds() - 10 + set_mtime(priv, stale_ts) + set_mtime(pub, stale_ts) + + manager = UserSSHKeyManager(api_client, tmp_path) + key = manager.get_user_key() + assert key is not None + assert key.public_key == SAMPLE_USER.ssh_public_key + assert key.private_key_path == priv + assert priv.read_text() == SAMPLE_USER.ssh_private_key + assert pub.read_text() == SAMPLE_USER.ssh_public_key diff --git a/src/tests/_internal/core/services/ssh/test_tunnel.py b/src/tests/_internal/core/services/ssh/test_tunnel.py new file mode 100644 index 0000000000..b97c86a1e6 --- /dev/null +++ b/src/tests/_internal/core/services/ssh/test_tunnel.py @@ -0,0 +1,253 @@ +from pathlib import Path + +import pytest + +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.services.ssh.client import SSHClientInfo +from dstack._internal.core.services.ssh.tunnel import ( + IPSocket, + SocketPair, + SSHTunnel, + UnixSocket, + ports_to_forwarded_sockets, +) +from dstack._internal.utils.path import FileContent, FilePath + + +class TestSSHTunnel: + @pytest.fixture + def ssh_client_info(self, monkeypatch: pytest.MonkeyPatch) -> SSHClientInfo: + ssh_client_info = SSHClientInfo.from_raw_version("OpenSSH_9.7p1", Path("/usr/bin/ssh")) + monkeypatch.setattr( + "dstack._internal.core.services.ssh.client._ssh_client_info", ssh_client_info + ) + return ssh_client_info + + @pytest.fixture + def sample_tunnel_with_all_params(self, ssh_client_info: SSHClientInfo) -> SSHTunnel: + return SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + control_sock_path="/tmp/control.sock", + 
options={"Opt1": "opt1"}, + ssh_config_path="/home/user/.ssh/config", + port=10022, + ssh_proxies=[ + (SSHConnectionParams(hostname="proxy", username="test", port=10022), None) + ], + forwarded_sockets=[SocketPair(UnixSocket("/1"), UnixSocket("/2"))], + reverse_forwarded_sockets=[SocketPair(UnixSocket("/1"), UnixSocket("/2"))], + ) + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_basic(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + control_sock_path="/tmp/control.sock", + options={ + "Opt1": "opt1", + "Opt2": "opt2", + }, + ssh_config_path="/home/user/.ssh/config", + port=10022, + ) + assert " ".join(tunnel.open_command()) == ( + "/usr/bin/ssh" + " -F /home/user/.ssh/config" + " -i /home/user/.ssh/id_rsa" + f" -E {tunnel.temp_dir.name}/tunnel.log" + " -N -f" + " -o ControlMaster=auto" + " -S /tmp/control.sock" + " -p 10022" + " -o Opt1=opt1" + " -o Opt2=opt2" + " ubuntu@my-server" + ) + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_with_temp_identity_file(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FileContent("my private key"), + control_sock_path="/tmp/control.sock", + options={}, + ) + temp_dir = tunnel.temp_dir.name + assert " ".join(tunnel.open_command()) == ( + "/usr/bin/ssh" + " -F none" + f" -i {temp_dir}/identity" + f" -E {temp_dir}/tunnel.log" + " -N -f" + " -o ControlMaster=auto" + " -S /tmp/control.sock" + " ubuntu@my-server" + ) + assert (Path(temp_dir) / "identity").read_text() == "my private key" + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_with_temp_control_socket(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + options={}, + ) + temp_dir = tunnel.temp_dir.name + assert " ".join(tunnel.open_command()) == ( + "/usr/bin/ssh" + " -F none" + " -i /home/user/.ssh/id_rsa" + f" -E {temp_dir}/tunnel.log" + 
" -N -f" + " -o ControlMaster=auto" + f" -S {temp_dir}/control.sock" + " ubuntu@my-server" + ) + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_with_one_proxy(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + control_sock_path="/tmp/control.sock", + options={}, + ssh_proxies=[ + ( + SSHConnectionParams(hostname="proxy", username="test", port=10022), + FilePath("/home/user/.ssh/proxy"), + ) + ], + ) + assert tunnel.open_command() == [ + "/usr/bin/ssh", + "-F", + "none", + "-i", + "/home/user/.ssh/id_rsa", + "-E", + f"{tunnel.temp_dir.name}/tunnel.log", + "-N", + "-f", + "-o", + "ControlMaster=auto", + "-S", + "/tmp/control.sock", + "-o", + ( + "ProxyCommand=" + "/usr/bin/ssh -i /home/user/.ssh/proxy -W %h:%p -o StrictHostKeyChecking=no" + " -o UserKnownHostsFile=/dev/null -p 10022 test@proxy" + ), + "ubuntu@my-server", + ] + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_with_two_proxies(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + control_sock_path="/tmp/control.sock", + options={}, + ssh_proxies=[ + ( + SSHConnectionParams(hostname="proxy1", username="test1", port=10022), + None, + ), + ( + SSHConnectionParams(hostname="proxy2", username="test2", port=20022), + FilePath("/home/user/.ssh/proxy2"), + ), + ], + ) + assert tunnel.open_command() == [ + "/usr/bin/ssh", + "-F", + "none", + "-i", + "/home/user/.ssh/id_rsa", + "-E", + f"{tunnel.temp_dir.name}/tunnel.log", + "-N", + "-f", + "-o", + "ControlMaster=auto", + "-S", + "/tmp/control.sock", + "-o", + ( + "ProxyCommand=" + "/usr/bin/ssh -i /home/user/.ssh/proxy2 -W %h:%p -o StrictHostKeyChecking=no" + " -o UserKnownHostsFile=/dev/null" + " -o 'ProxyCommand=/usr/bin/ssh -i /home/user/.ssh/id_rsa -W %%h:%%p" + " -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + " -p 10022 test1@proxy1'" + " -p 20022 test2@proxy2" + 
), + "ubuntu@my-server", + ] + + @pytest.mark.usefixtures("ssh_client_info") + def test_open_command_with_forwarding(self) -> None: + tunnel = SSHTunnel( + destination="ubuntu@my-server", + identity=FilePath("/home/user/.ssh/id_rsa"), + control_sock_path="/tmp/control.sock", + options={}, + forwarded_sockets=[ + SocketPair(local=UnixSocket("/tmp/80"), remote=IPSocket("localhost", 80)), + SocketPair(local=IPSocket("127.0.0.1", 8000), remote=IPSocket("::1", 80)), + ], + reverse_forwarded_sockets=[ + SocketPair(local=UnixSocket("/tmp/local"), remote=UnixSocket("/tmp/remote")), + SocketPair(local=IPSocket("test.local", 80), remote=IPSocket("localhost", 8000)), + ], + ) + assert " ".join(tunnel.open_command()) == ( + "/usr/bin/ssh" + " -F none" + " -i /home/user/.ssh/id_rsa" + f" -E {tunnel.temp_dir.name}/tunnel.log" + " -N -f" + " -o ControlMaster=auto" + " -S /tmp/control.sock" + " -L /tmp/80:localhost:80" + " -L 127.0.0.1:8000:[::1]:80" + " -R /tmp/remote:/tmp/local" + " -R localhost:8000:test.local:80" + " ubuntu@my-server" + ) + + def test_check_command(self, sample_tunnel_with_all_params: SSHTunnel) -> None: + command = sample_tunnel_with_all_params.check_command() + assert command == [ + "/usr/bin/ssh", + "-S", + "/tmp/control.sock", + "-O", + "check", + "ubuntu@my-server", + ] + + def test_close_command(self, sample_tunnel_with_all_params: SSHTunnel) -> None: + command = sample_tunnel_with_all_params.close_command() + assert command == [ + "/usr/bin/ssh", + "-S", + "/tmp/control.sock", + "-O", + "exit", + "ubuntu@my-server", + ] + + def test_exec_command(self, sample_tunnel_with_all_params: SSHTunnel) -> None: + command = sample_tunnel_with_all_params.exec_command() + assert command == ["/usr/bin/ssh", "-S", "/tmp/control.sock", "ubuntu@my-server"] + + +def test_ports_to_forwarded_sockets() -> None: + assert ports_to_forwarded_sockets({80: 8000, 22: 2200}, bind_local="::1") == [ + SocketPair(local=IPSocket("::1", 8000), remote=IPSocket("localhost", 80)), + 
SocketPair(local=IPSocket("::1", 2200), remote=IPSocket("localhost", 22)), + ] diff --git a/src/tests/_internal/core/services/test_diff.py b/src/tests/_internal/core/services/test_diff.py new file mode 100644 index 0000000000..4e8d355c0d --- /dev/null +++ b/src/tests/_internal/core/services/test_diff.py @@ -0,0 +1,59 @@ +import pytest + +from dstack._internal.core.services.diff import ModelDiff, ModelFieldDiff, flatten_diff_fields + + +@pytest.mark.parametrize( + "diff,expected", + [ + pytest.param({}, [], id="empty_diff"), + pytest.param( + { + "field1": ModelFieldDiff(old="old1", new="new1"), + "field2": ModelFieldDiff(old="old2", new="new2"), + }, + ["field1", "field2"], + id="multiple_fields", + ), + pytest.param( + { + "field1": ModelFieldDiff(old="old1", new="new1"), + "nested": { + "sub1": ModelFieldDiff(old="old_sub1", new="new_sub1"), + }, + }, + ["field1", "nested.sub1"], + id="nested_single_level", + ), + pytest.param( + { + "field1": ModelFieldDiff(old="old1", new="new1"), + "level1": { + "level2": { + "level3": {"deep_field": ModelFieldDiff(old="deep_old", new="deep_new")}, + "field2": ModelFieldDiff(old="old2", new="new2"), + }, + "field3": ModelFieldDiff(old="old3", new="new3"), + }, + }, + ["field1", "level1.level2.level3.deep_field", "level1.level2.field2", "level1.field3"], + id="nested_multiple_levels", + ), + pytest.param( + { + "field1": ModelFieldDiff(old="old1", new="new1"), + "empty_nested": {}, + "nested_with_empty": { + "empty_sub": {}, + "field2": ModelFieldDiff(old="old2", new="new2"), + }, + }, + ["field1", "nested_with_empty.field2"], + id="empty_nested", + ), + ], +) +def test_flatten_diff_fields(diff: ModelDiff, expected: list[str]): + """Test flatten_diff_fields with various diff structures.""" + result = flatten_diff_fields(diff) + assert result == expected diff --git a/src/tests/_internal/core/services/test_logs.py b/src/tests/_internal/core/services/test_logs.py index c1eca72d3a..6c966a3ae6 100644 --- 
a/src/tests/_internal/core/services/test_logs.py +++ b/src/tests/_internal/core/services/test_logs.py @@ -1,3 +1,5 @@ +import pytest + from dstack._internal.core.models.runs import AppSpec from dstack._internal.core.services.logs import URLReplacer @@ -125,3 +127,27 @@ def test_omit_https_default_port(self): ports={8000: 443}, app_specs=[], hostname="secure.host.com", secure=True ) assert replacer(b"http://0.0.0.0:8000/qwerty") == b"https://secure.host.com/qwerty" + + @pytest.mark.parametrize( + ("in_path", "out_path"), + [ + ("", "/proxy/services/main/service/"), + ("/", "/proxy/services/main/service/"), + ("/a/b/c", "/proxy/services/main/service/a/b/c"), + ("/proxy/services/main/service", "/proxy/services/main/service"), + ("/proxy/services/main/service/", "/proxy/services/main/service/"), + ("/proxy/services/main/service/a/b/c", "/proxy/services/main/service/a/b/c"), + ], + ) + def test_adds_prefix_unless_already_present(self, in_path: str, out_path: str) -> None: + replacer = URLReplacer( + ports={8888: 3000}, + app_specs=[], + hostname="0.0.0.0", + secure=False, + path_prefix="/proxy/services/main/service/", + ) + assert ( + replacer(f"http://0.0.0.0:8888{in_path}".encode()) + == f"http://0.0.0.0:3000{out_path}".encode() + ) diff --git a/src/tests/_internal/proxy/__init__.py b/src/tests/_internal/proxy/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/gateway/__init__.py b/src/tests/_internal/proxy/gateway/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/gateway/conftest.py b/src/tests/_internal/proxy/gateway/conftest.py new file mode 100644 index 0000000000..008bf9460d --- /dev/null +++ b/src/tests/_internal/proxy/gateway/conftest.py @@ -0,0 +1,26 @@ +from typing import Generator +from unittest.mock import patch + +import pytest + +from 
dstack._internal.proxy.gateway.testing.common import Mocks + + +@pytest.fixture +def system_mocks() -> Generator[Mocks, None, None]: + nginx = "dstack._internal.proxy.gateway.services.nginx" + connection = "dstack._internal.proxy.lib.services.service_connection" + with ( + patch(f"{nginx}.sudo") as sudo, + patch(f"{nginx}.Nginx.reload") as reload_nginx, + patch(f"{nginx}.Nginx.run_certbot") as run_certbot, + patch(f"{connection}.ServiceConnection.open") as open_conn, + patch(f"{connection}.ServiceConnection.close") as close_conn, + ): + sudo.return_value = [] + yield Mocks( + reload_nginx=reload_nginx, + run_certbot=run_certbot, + open_conn=open_conn, + close_conn=close_conn, + ) diff --git a/src/tests/_internal/proxy/gateway/repo/__init__.py b/src/tests/_internal/proxy/gateway/repo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/gateway/repo/test_repo.py b/src/tests/_internal/proxy/gateway/repo/test_repo.py new file mode 100644 index 0000000000..55cc4dbe94 --- /dev/null +++ b/src/tests/_internal/proxy/gateway/repo/test_repo.py @@ -0,0 +1,37 @@ +from pathlib import Path + +import pytest + +from dstack._internal.proxy.gateway.models import ACMESettings, GlobalProxyConfig, ModelEntrypoint +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.lib.testing.common import make_project, make_service +from tests._internal.proxy.lib.routers.test_model_proxy import make_model + + +@pytest.mark.asyncio +async def test_persist_repo(tmp_path: Path) -> None: + proj_1 = make_project("proj-1") + proj_2 = make_project("proj-2") + srv_1 = make_service("proj-1", "run-1", domain="run-1.gtw.test") + srv_2 = make_service("proj-2", "run-2", domain="run-2.gtw.test") + model_1 = make_model("proj-1", "model-1", run_name="run-1") + entrypoint_1 = ModelEntrypoint(project_name="proj-1", domain="gateway.gtw.test", https=True) + config = 
GlobalProxyConfig(acme_settings=ACMESettings(server="https://acme.test")) + file = tmp_path / "state-v2.json" + + repo = GatewayProxyRepo.load(file) + await repo.set_config(config) + await repo.set_project(proj_1) + await repo.set_project(proj_2) + await repo.set_entrypoint(entrypoint_1) + await repo.set_service(srv_1) + await repo.set_service(srv_2) + await repo.set_model(model_1) + + repo = GatewayProxyRepo.load(file) # reload from file + assert await repo.get_config() == config + assert await repo.get_project("proj-1") == proj_1 + assert await repo.get_project("proj-2") == proj_2 + assert await repo.list_entrypoints() == [entrypoint_1] + assert set(await repo.list_services()) == {srv_1, srv_2} + assert await repo.list_models("proj-1") == [model_1] diff --git a/src/tests/_internal/proxy/gateway/repo/test_state_v1.py b/src/tests/_internal/proxy/gateway/repo/test_state_v1.py new file mode 100644 index 0000000000..7856bea2d5 --- /dev/null +++ b/src/tests/_internal/proxy/gateway/repo/test_state_v1.py @@ -0,0 +1,464 @@ +from datetime import datetime +from pathlib import Path + +import pytest + +from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.proxy.gateway.models import ACMESettings, GlobalProxyConfig, ModelEntrypoint +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.repo.state_v1 import migrate_from_state_v1 +from dstack._internal.proxy.lib.models import ( + ChatModel, + OpenAIChatModelFormat, + Project, + Replica, + Service, + TGIChatModelFormat, +) +from dstack._internal.proxy.lib.testing.common import make_project + +SAMPLE_STATE_V1 = """ +{ + "store": { + "services": { + "40023ad58d624702b58fbda38489239e": { + "id": "40023ad58d624702b58fbda38489239e", + "domain": "run-1.proj-1.gateway.test", + "https": true, + "auth": true, + "client_max_body_size": 67108864, + "options": { + "openai": { + "model": { + "type": "chat", + "name": 
"model/1", + "format": "openai", + "prefix": "/v1" + } + } + }, + "replicas": [ + { + "id": "b894c006be5d4034bd751e57af0b4561", + "app_port": 11434, + "ssh_host": "root@10.0.0.1", + "ssh_port": 11165, + "ssh_jump_host": null, + "ssh_jump_port": null, + "ssh_tunnel": { + "temp_dir": "/tmp/tmp9y9zncvb", + "start_cmd": [], + "exit_cmd": [], + "check_cmd": [] + } + } + ] + }, + "c41cac40726b403692ca63292b4a3af8": { + "id": "c41cac40726b403692ca63292b4a3af8", + "domain": "run-2.proj-2.gateway.test", + "https": true, + "auth": false, + "client_max_body_size": 67108864, + "options": {}, + "replicas": [ + { + "id": "4aa53ef5759d400ea41e0ec6a34354ed", + "app_port": 8000, + "ssh_host": "root@localhost", + "ssh_port": 10022, + "ssh_jump_host": "root@10.0.0.2", + "ssh_jump_port": 22, + "ssh_tunnel": { + "temp_dir": "/tmp/tmprz98c6mi", + "start_cmd": [], + "exit_cmd": [], + "check_cmd": [] + } + }, + { + "id": "f1e60fedfc5a46f791f26a042b57b9f4", + "app_port": 8000, + "ssh_host": "root@localhost", + "ssh_port": 10022, + "ssh_jump_host": "root@10.0.0.3", + "ssh_jump_port": 22, + "ssh_tunnel": { + "temp_dir": "/tmp/tmps9v5tsq_", + "start_cmd": [], + "exit_cmd": [], + "check_cmd": [] + } + } + ] + }, + "bcaa6ce30f2a4ffab279a44ef696ec7c": { + "id": "bcaa6ce30f2a4ffab279a44ef696ec7c", + "domain": "run-3.proj-2.gateway.test", + "https": true, + "auth": true, + "client_max_body_size": 67108864, + "options": { + "openai": { + "model": { + "type": "chat", + "name": "model/2", + "format": "tgi", + "chat_template": "test chat template", + "eos_token": "<|eot_id|>" + } + } + }, + "replicas": [ + { + "id": "16fc0bd438d747bba810cef00bc137de", + "app_port": 80, + "ssh_host": "root@localhost", + "ssh_port": 10022, + "ssh_jump_host": "root@10.0.0.4", + "ssh_jump_port": 22, + "ssh_tunnel": { + "temp_dir": "/tmp/tmpzyx08k1s", + "start_cmd": [], + "exit_cmd": [], + "check_cmd": [] + } + } + ] + } + }, + "projects": { + "proj-2": [ + "bcaa6ce30f2a4ffab279a44ef696ec7c", + 
"c41cac40726b403692ca63292b4a3af8" + ], + "proj-1": [ + "40023ad58d624702b58fbda38489239e" + ] + }, + "entrypoints": { + "gateway.proj-2.gateway.test": [ + "proj-2", + "openai" + ], + "gateway.proj-1.gateway.test": [ + "proj-1", + "openai" + ] + }, + "nginx": { + "configs": { + "443-gateway.proj-2.gateway.test.conf": { + "type": "entrypoint", + "domain": "gateway.proj-2.gateway.test", + "https": true, + "proxy_path": "/api/openai/proj-2" + }, + "443-gateway.proj-1.gateway.test.conf": { + "type": "entrypoint", + "domain": "gateway.proj-1.gateway.test", + "https": true, + "proxy_path": "/api/openai/proj-1" + }, + "443-run-1.proj-1.gateway.test.conf": { + "type": "service", + "domain": "run-1.proj-1.gateway.test", + "https": true, + "project": "proj-1", + "service_id": "40023ad58d624702b58fbda38489239e", + "auth": true, + "client_max_body_size": 67108864, + "servers": { + "b894c006be5d4034bd751e57af0b4561": "unix:/tmp/tmp9y9zncvb/sock" + } + }, + "443-run-2.proj-2.gateway.test.conf": { + "type": "service", + "domain": "run-2.proj-2.gateway.test", + "https": true, + "project": "proj-2", + "service_id": "c41cac40726b403692ca63292b4a3af8", + "auth": false, + "client_max_body_size": 67108864, + "servers": { + "4aa53ef5759d400ea41e0ec6a34354ed": "unix:/tmp/tmprz98c6mi/sock", + "f1e60fedfc5a46f791f26a042b57b9f4": "unix:/tmp/tmps9v5tsq_/sock" + } + }, + "443-run-3.proj-2.gateway.test.conf": { + "type": "service", + "domain": "run-3.proj-2.gateway.test", + "https": true, + "project": "proj-2", + "service_id": "bcaa6ce30f2a4ffab279a44ef696ec7c", + "auth": true, + "client_max_body_size": 67108864, + "servers": { + "16fc0bd438d747bba810cef00bc137de": "unix:/tmp/tmpzyx08k1s/sock" + } + } + }, + "acme_settings": { + "server": "https://acme.test/", + "eab_kid": "test_eab_kid", + "eab_hmac_key": "test_eab_hmac_key" + } + }, + "gateway_https": true + }, + "openai": { + "index": { + "proj-2": { + "chat": { + "model/2": { + "model": { + "type": "chat", + 
"name": "model/2", + "format": "tgi", + "chat_template": "test chat template", + "eos_token": "<|eot_id|>" + }, + "domain": "run-3.proj-2.gateway.test", + "created": 1734905496 + } + } + }, + "proj-1": { + "chat": { + "model/1": { + "model": { + "type": "chat", + "name": "model/1", + "format": "openai", + "prefix": "/v1" + }, + "domain": "run-1.proj-1.gateway.test", + "created": 1734902765 + } + } + } + }, + "services_index": { + "40023ad58d624702b58fbda38489239e": [ + "proj-1", + "chat", + "model/1" + ], + "bcaa6ce30f2a4ffab279a44ef696ec7c": [ + "proj-2", + "chat", + "model/2" + ] + } + }, + "stats_collector": { + "path": "/var/log/nginx/dstack.access.log", + "resolution": 1, + "ttl": 300, + "services": { + "40023ad58d624702b58fbda38489239e": "run-1.proj-1.gateway.test", + "c41cac40726b403692ca63292b4a3af8": "run-2.proj-2.gateway.test", + "bcaa6ce30f2a4ffab279a44ef696ec7c": "run-3.proj-2.gateway.test" + } + } +} +""" + + +@pytest.mark.asyncio +async def test_migrate_from_state_v1(tmp_path: Path) -> None: + keys_dir = tmp_path / "keys" + v1_file = tmp_path / "state.json" + v2_file = tmp_path / "state-v2.json" + v1_file.write_text(SAMPLE_STATE_V1) + keys_dir.mkdir() + (keys_dir / "proj-1").write_text("test key 1") + (keys_dir / "proj-2").write_text("test key 2") + + migrate_from_state_v1(v1_file, v2_file, keys_dir) + + repo = GatewayProxyRepo.load(v2_file) + assert await repo.get_config() == GlobalProxyConfig( + acme_settings=ACMESettings( + server="https://acme.test/", + eab_kid="test_eab_kid", + eab_hmac_key="test_eab_hmac_key", + ) + ) + assert await repo.get_project("proj-1") == Project(name="proj-1", ssh_private_key="test key 1") + assert await repo.get_project("proj-2") == Project(name="proj-2", ssh_private_key="test key 2") + assert set(await repo.list_entrypoints()) == { + ModelEntrypoint(project_name="proj-1", domain="gateway.proj-1.gateway.test", https=True), + ModelEntrypoint(project_name="proj-2", 
domain="gateway.proj-2.gateway.test", https=True), + } + assert set(await repo.list_services()) == { + Service( + project_name="proj-1", + run_name="run-1", + domain="run-1.proj-1.gateway.test", + https=True, + auth=True, + client_max_body_size=67108864, + replicas=( + Replica( + id="b894c006be5d4034bd751e57af0b4561", + app_port=11434, + ssh_destination="root@10.0.0.1", + ssh_port=11165, + ssh_proxy=None, + ), + ), + ), + Service( + project_name="proj-2", + run_name="run-2", + domain="run-2.proj-2.gateway.test", + https=True, + auth=False, + client_max_body_size=67108864, + replicas=( + Replica( + id="4aa53ef5759d400ea41e0ec6a34354ed", + app_port=8000, + ssh_destination="root@localhost", + ssh_port=10022, + ssh_proxy=SSHConnectionParams( + hostname="10.0.0.2", + username="root", + port=22, + ), + ), + Replica( + id="f1e60fedfc5a46f791f26a042b57b9f4", + app_port=8000, + ssh_destination="root@localhost", + ssh_port=10022, + ssh_proxy=SSHConnectionParams( + hostname="10.0.0.3", + username="root", + port=22, + ), + ), + ), + ), + Service( + project_name="proj-2", + run_name="run-3", + domain="run-3.proj-2.gateway.test", + https=True, + auth=True, + client_max_body_size=67108864, + replicas=( + Replica( + id="16fc0bd438d747bba810cef00bc137de", + app_port=80, + ssh_destination="root@localhost", + ssh_port=10022, + ssh_proxy=SSHConnectionParams( + hostname="10.0.0.4", + username="root", + port=22, + ), + ), + ), + ), + } + assert await repo.list_models("proj-1") == [ + ChatModel( + project_name="proj-1", + name="model/1", + created_at=datetime.fromtimestamp(1734902765), + run_name="run-1", + format_spec=OpenAIChatModelFormat(prefix="/v1"), + ) + ] + assert await repo.list_models("proj-2") == [ + ChatModel( + project_name="proj-2", + name="model/2", + created_at=datetime.fromtimestamp(1734905496), + run_name="run-3", + format_spec=TGIChatModelFormat( + chat_template="test chat template", + eos_token="<|eot_id|>", + ), + ) + ] + + +EMPTY_STATE_V1 = """ +{ + "store": { + 
"services": {}, + "projects": {}, + "entrypoints": {}, + "nginx": { + "configs": {}, + "acme_settings": { + "server": null, + "eab_kid": null, + "eab_hmac_key": null + } + }, + "gateway_https": true + }, + "openai": { + "index": {}, + "services_index": {} + }, + "stats_collector": { + "path": "/var/log/nginx/dstack.access.log", + "resolution": 1, + "ttl": 300, + "services": {} + } +} +""" + + +@pytest.mark.asyncio +async def test_migrate_from_empty_state_v1(tmp_path: Path) -> None: + keys_dir = tmp_path / "keys" + v1_file = tmp_path / "state.json" + v2_file = tmp_path / "state-v2.json" + v1_file.write_text(EMPTY_STATE_V1) + + migrate_from_state_v1(v1_file, v2_file, keys_dir) + + repo = GatewayProxyRepo.load(v2_file) + assert await repo.get_config() == GlobalProxyConfig( + acme_settings=ACMESettings( + server=None, + eab_kid=None, + eab_hmac_key=None, + ) + ) + assert await repo.list_entrypoints() == [] + assert await repo.list_services() == [] + + +def test_not_migrates_if_no_state_v1(tmp_path: Path) -> None: + keys_dir = tmp_path / "keys" + v1_file = tmp_path / "state.json" + v2_file = tmp_path / "state-v2.json" + migrate_from_state_v1(v1_file, v2_file, keys_dir) + assert not v2_file.exists() + + +@pytest.mark.asyncio +async def test_not_migrates_if_migrated_before(tmp_path: Path) -> None: + keys_dir = tmp_path / "keys" + v1_file = tmp_path / "state.json" + v2_file = tmp_path / "state-v2.json" + v1_file.write_text(EMPTY_STATE_V1) + + migrate_from_state_v1(v1_file, v2_file, keys_dir) + state_v2_after_initial_migration = v2_file.read_text() + + repo = GatewayProxyRepo.load(v2_file) + await repo.set_project(make_project("test-proj")) + state_v2_after_write_operation = v2_file.read_text() + assert state_v2_after_write_operation != state_v2_after_initial_migration + + migrate_from_state_v1(v1_file, v2_file, keys_dir) + assert v2_file.read_text() == state_v2_after_write_operation diff --git a/src/tests/_internal/proxy/gateway/routers/__init__.py 
b/src/tests/_internal/proxy/gateway/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/gateway/routers/test_registry.py b/src/tests/_internal/proxy/gateway/routers/test_registry.py new file mode 100644 index 0000000000..239413cfda --- /dev/null +++ b/src/tests/_internal/proxy/gateway/routers/test_registry.py @@ -0,0 +1,544 @@ +import re +from datetime import datetime +from pathlib import Path +from typing import Optional + +import httpx +import pytest +from freezegun import freeze_time + +from dstack._internal.core.errors import SSHError +from dstack._internal.proxy.gateway.app import make_app +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.services.nginx import Nginx +from dstack._internal.proxy.gateway.testing.common import Mocks +from dstack._internal.proxy.lib.models import ChatModel, OpenAIChatModelFormat + + +def make_client( + nginx_conf_dir: Path, repo: Optional[GatewayProxyRepo] = None +) -> httpx.AsyncClient: + app = make_app(repo=repo or GatewayProxyRepo(), nginx=Nginx(conf_dir=nginx_conf_dir)) + return httpx.AsyncClient(transport=httpx.ASGITransport(app=app), base_url="http://test/") + + +def register_service_payload( + run_name: str = "test-run", + domain: str = "test-run.gtw.test", + https: bool = False, + auth: bool = False, + client_max_body_size: int = 1024, + options: Optional[dict] = None, + rate_limits: Optional[list[dict]] = None, +) -> dict: + return { + "run_name": run_name, + "domain": domain, + "https": https, + "auth": auth, + "client_max_body_size": client_max_body_size, + "options": options or {}, + "rate_limits": rate_limits or [], + "ssh_private_key": "private-key", + } + + +def register_replica_payload(job_id: str = "xxx-xxx") -> dict: + return { + "job_id": job_id, + "app_port": 8888, + "ssh_host": "host.test", + "ssh_port": 22, + "ssh_proxy": None, + "ssh_head_proxy": None, + 
"ssh_head_proxy_private_key": None, + } + + +def register_replica_payload_with_head_proxy(job_id: str = "xxx-xxx") -> dict: + return { + "job_id": job_id, + "app_port": 8888, + "ssh_host": "host.test", + "ssh_port": 22, + "ssh_proxy": None, + "ssh_head_proxy": { + "hostname": "proxy.test", + "username": "debian", + "port": 222, + }, + "ssh_head_proxy_private_key": "private-key", + } + + +def sample_model_options(name: str = "test-model") -> dict: + return { + "openai": { + "model": { + "type": "chat", + "name": name, + "format": "openai", + "prefix": "/v1", + } + } + } + + +@pytest.mark.asyncio +class TestRegisterService: + async def test_register(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload( + run_name="test-run", + domain="test-run.gtw.test", + https=False, + auth=False, + client_max_body_size=1024, + ), + ) + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + # general + assert system_mocks.reload_nginx.call_count == 1 + assert "server_name test-run.gtw.test;" in conf + assert "client_max_body_size 1024;" in conf + assert "listen 80;" in conf + # no https + assert system_mocks.run_certbot.call_count == 0 + assert "listen 443" not in conf + # no auth + assert "auth_request /_dstack_auth;" not in conf + # no replicas + assert "upstream" not in conf + assert "return 503;" in conf + + async def test_register_with_https(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(domain="test-run.gtw.test", https=True), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "listen 80;" in conf + assert "listen 443 ssl;" in conf + assert "ssl_certificate 
/etc/letsencrypt/live/test-run.gtw.test/fullchain.pem;" in conf + assert "ssl_certificate_key /etc/letsencrypt/live/test-run.gtw.test/privkey.pem;" in conf + assert system_mocks.run_certbot.call_count == 1 + + async def test_register_with_auth(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(domain="test-run.gtw.test", auth=True), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "auth_request /_dstack_auth;" in conf + assert "proxy_pass http://localhost:8000/api/auth/test-proj;" in conf + + async def test_register_same_name_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run-1.gtw.test"), + ) + assert resp.status_code == 200 + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run-2.gtw.test"), + ) + assert resp.status_code == 400 + assert resp.json() == {"detail": "Service test-proj/test-run is already registered"} + assert (tmp_path / "443-test-run-1.gtw.test.conf").exists() + assert not (tmp_path / "443-test-run-2.gtw.test.conf").exists() + assert system_mocks.reload_nginx.call_count == 1 + + async def test_register_same_name_in_different_projects( + self, tmp_path: Path, system_mocks + ) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/proj-1/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.proj-1.gtw.test"), + ) + assert resp.status_code == 200 + resp = await client.post( + "/api/registry/proj-2/services/register", + json=register_service_payload(run_name="test-run", 
domain="test-run.proj-2.gtw.test"), + ) + assert resp.status_code == 200 + assert (tmp_path / "443-test-run.proj-1.gtw.test.conf").exists() + assert (tmp_path / "443-test-run.proj-2.gtw.test.conf").exists() + + async def test_register_same_domain_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj-1/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Domain name 'test-run.gtw.test' is already taken by another service" + } + assert (tmp_path / "443-test-run.gtw.test.conf").exists() + assert system_mocks.reload_nginx.call_count == 1 + + @freeze_time(datetime(2024, 12, 12, 0, 30)) + async def test_register_with_model(self, tmp_path: Path, system_mocks: Mocks) -> None: + repo = GatewayProxyRepo() + client = make_client(tmp_path, repo=repo) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload( + run_name="test-run", + options=sample_model_options(name="test-model"), + ), + ) + assert resp.status_code == 200 + assert await repo.list_models("test-proj") == [ + ChatModel( + project_name="test-proj", + name="test-model", + created_at=datetime(2024, 12, 12, 0, 30), + run_name="test-run", + format_spec=OpenAIChatModelFormat(prefix="/v1"), + ) + ] + + async def test_register_with_rate_limits(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload( + domain="test-run.gtw.test", + rate_limits=[ + { + "prefix": "/a", + "key": {"type": "ip_address"}, + "rps": 2.5, + "burst": 5, + }, + { + 
"prefix": "/b", + "key": {"type": "header", "header": "X-Api-Key"}, + "rps": 1, + "burst": 0, + }, + ], + ), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert ( + "limit_req_zone $binary_remote_addr zone=0.test-run.gtw.test:10m rate=150r/m;" in conf + ) + assert "limit_req_zone $http_x_api_key zone=1.test-run.gtw.test:10m rate=60r/m;" in conf + assert "location /a {" in conf + assert "location /b {" in conf + assert "location / {" in conf + assert "limit_req zone=0.test-run.gtw.test burst=5 nodelay;" in conf + assert "limit_req zone=1.test-run.gtw.test;" in conf + + async def test_register_with_root_rate_limit( + self, tmp_path: Path, system_mocks: Mocks + ) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload( + domain="test-run.gtw.test", + rate_limits=[ + {"prefix": "/", "key": {"type": "ip_address"}, "rps": 1, "burst": 1}, + ], + ), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert ( + "limit_req_zone $binary_remote_addr zone=0.test-run.gtw.test:10m rate=60r/m;" in conf + ) + assert "location / {" in conf + assert "limit_req zone=0.test-run.gtw.test burst=1 nodelay;" in conf + + async def test_register_without_rate_limits(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(domain="test-run.gtw.test", rate_limits=[]), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "limit_req_zone" not in conf + assert "limit_req zone=" not in conf + assert "location / {" in conf + + +@pytest.mark.asyncio +class TestRegisterReplica: + async def test_register(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = 
await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "upstream" not in conf + # register 2 replicas + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id="xxx-xxx"), + ) + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload_with_head_proxy(job_id="yyy-yyy"), + ) + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "upstream test-run.gtw.test.upstream" in conf + assert (m1 := re.search(r"server unix:/(.+)/replica.sock; # replica xxx-xxx", conf)) + assert (m2 := re.search(r"server unix:/(.+)/replica.sock; # replica yyy-yyy", conf)) + assert m1.group(1) != m2.group(1) + assert system_mocks.reload_nginx.call_count == 3 + assert system_mocks.open_conn.call_count == 2 + + async def test_register_no_service_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(), + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Service test-proj/test-run does not exist, cannot register replica" + } + assert system_mocks.reload_nginx.call_count == 0 + assert system_mocks.open_conn.call_count == 0 + + async def test_register_twice_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), 
+ ) + assert resp.status_code == 200 + # register replica + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id="aaa-aaa"), + ) + assert resp.status_code == 200 + # register the same replica + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id="aaa-aaa"), + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Replica aaa-aaa already exists in service test-proj/test-run" + } + assert system_mocks.reload_nginx.call_count == 2 + assert system_mocks.open_conn.call_count == 1 + + async def test_register_connection_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + conf_before = (tmp_path / "443-test-run.gtw.test.conf").read_text() + # register invalid replica + system_mocks.open_conn.side_effect = SSHError("test error") + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id="abc-def"), + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Cannot register replica abc-def in service test-proj/test-run: test error" + } + conf_after = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert conf_after == conf_before + + +@pytest.mark.asyncio +class TestUnregisterService: + async def test_unregister(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + assert (tmp_path / 
"443-test-run.gtw.test.conf").exists() + # unregister service + resp = await client.post("/api/registry/test-proj/services/test-run/unregister") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + assert not (tmp_path / "443-test-run.gtw.test.conf").exists() + assert system_mocks.reload_nginx.call_count == 2 + + async def test_unregister_not_registered_error( + self, tmp_path: Path, system_mocks: Mocks + ) -> None: + client = make_client(tmp_path) + resp = await client.post("/api/registry/test-proj/services/test-run/unregister") + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Service test-proj/test-run is not registered, cannot unregister" + } + assert system_mocks.reload_nginx.call_count == 0 + + async def test_unregister_with_replicas(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + # register 2 replicas + for job_id in ("xxx-xxx", "yyy-yyy"): + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id=job_id), + ) + assert resp.status_code == 200 + assert (tmp_path / "443-test-run.gtw.test.conf").exists() + # unregister service + resp = await client.post("/api/registry/test-proj/services/test-run/unregister") + assert resp.status_code == 200 + assert not (tmp_path / "443-test-run.gtw.test.conf").exists() + assert system_mocks.reload_nginx.call_count == 4 + assert system_mocks.close_conn.call_count == 2 + + async def test_unregister_with_model(self, tmp_path: Path, system_mocks: Mocks) -> None: + repo = GatewayProxyRepo() + client = make_client(tmp_path, repo=repo) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + 
json=register_service_payload(run_name="test-run", options=sample_model_options()), + ) + assert resp.status_code == 200 + assert len(await repo.list_models("test-proj")) == 1 + # unregister service + resp = await client.post("/api/registry/test-proj/services/test-run/unregister") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + assert len(await repo.list_models("test-proj")) == 0 + + +@pytest.mark.asyncio +class TestUnregisterReplica: + async def test_unregister(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run", domain="test-run.gtw.test"), + ) + assert resp.status_code == 200 + # register 2 replicas + for job_id in ("xxx-xxx", "yyy-yyy"): + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/register", + json=register_replica_payload(job_id=job_id), + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "replica xxx-xxx" in conf + assert "replica yyy-yyy" in conf + # unregister 1 replica + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/yyy-yyy/unregister" + ) + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + conf = (tmp_path / "443-test-run.gtw.test.conf").read_text() + assert "replica xxx-xxx" in conf + assert "replica yyy-yyy" not in conf + assert system_mocks.reload_nginx.call_count == 4 + assert system_mocks.close_conn.call_count == 1 + + async def test_unregister_no_replica_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + # register service + resp = await client.post( + "/api/registry/test-proj/services/register", + json=register_service_payload(run_name="test-run"), + ) + assert resp.status_code == 200 + # unregister nonexistent replica + resp = await client.post( + 
"/api/registry/test-proj/services/test-run/replicas/xxx-yyy/unregister" + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": ( + "Replica xxx-yyy does not exist in service test-proj/test-run, cannot unregister" + ) + } + assert system_mocks.reload_nginx.call_count == 1 + assert system_mocks.close_conn.call_count == 0 + + async def test_unregister_no_service_error(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/services/test-run/replicas/xxx-yyy/unregister" + ) + assert resp.status_code == 400 + assert resp.json() == { + "detail": "Service test-proj/test-run does not exist, cannot unregister replica" + } + assert system_mocks.reload_nginx.call_count == 0 + assert system_mocks.close_conn.call_count == 0 + + +@pytest.mark.asyncio +class TestRegisterEntrypoint: + async def test_register(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/entrypoints/register", + json={"domain": "gateway.gtw.test", "https": False}, + ) + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + conf = (tmp_path / "443-gateway.gtw.test.conf").read_text() + assert "proxy_pass http://localhost:8000/api/models/test-proj/;" in conf + assert "listen 80;" in conf + assert "listen 443" not in conf + assert system_mocks.reload_nginx.call_count == 1 + assert system_mocks.run_certbot.call_count == 0 + + async def test_register_with_https(self, tmp_path: Path, system_mocks: Mocks) -> None: + client = make_client(tmp_path) + resp = await client.post( + "/api/registry/test-proj/entrypoints/register", + json={"domain": "gateway.gtw.test", "https": True}, + ) + assert resp.status_code == 200 + conf = (tmp_path / "443-gateway.gtw.test.conf").read_text() + assert "proxy_pass http://localhost:8000/api/models/test-proj/;" in conf + assert 
"listen 80;" in conf + assert "listen 443 ssl;" in conf + assert system_mocks.reload_nginx.call_count == 1 + assert system_mocks.run_certbot.call_count == 1 diff --git a/src/tests/_internal/proxy/gateway/routers/test_stats.py b/src/tests/_internal/proxy/gateway/routers/test_stats.py new file mode 100644 index 0000000000..669b9f8394 --- /dev/null +++ b/src/tests/_internal/proxy/gateway/routers/test_stats.py @@ -0,0 +1,144 @@ +from collections.abc import Iterable +from unittest.mock import patch + +import httpx +import pytest + +from dstack._internal.proxy.gateway.app import make_app +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.lib.models import Service +from dstack._internal.proxy.lib.testing.common import make_project, make_service + + +def make_client(repo: GatewayProxyRepo) -> httpx.AsyncClient: + app = make_app(repo) + return httpx.AsyncClient(transport=httpx.ASGITransport(app=app), base_url="http://test/") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("services", "collector_stats", "expected_response"), + [ + pytest.param( + [ + make_service("test-proj", "srv-1", domain="srv-1.gtw.test"), + make_service("test-proj", "srv-2", domain="srv-2.gtw.test"), + ], + { + "srv-1.gtw.test": { + 30: {"requests": 1, "request_time": 0.1}, + 60: {"requests": 2, "request_time": 0.2}, + 300: {"requests": 3, "request_time": 0.3}, + }, + "srv-2.gtw.test": { + 30: {"requests": 4, "request_time": 0.4}, + 60: {"requests": 5, "request_time": 0.5}, + 300: {"requests": 6, "request_time": 0.6}, + }, + }, + [ + { + "project_name": "test-proj", + "run_name": "srv-1", + "stats": { + "30": {"requests": 1, "request_time": 0.1}, + "60": {"requests": 2, "request_time": 0.2}, + "300": {"requests": 3, "request_time": 0.3}, + }, + }, + { + "project_name": "test-proj", + "run_name": "srv-2", + "stats": { + "30": {"requests": 4, "request_time": 0.4}, + "60": {"requests": 5, "request_time": 0.5}, + "300": 
{"requests": 6, "request_time": 0.6}, + }, + }, + ], + id="collects-two-services", + ), + pytest.param( + [ + make_service("test-proj", "has-stats", domain="has-stats.gtw.test"), + make_service("test-proj", "no-stats", domain="no-stats.gtw.test"), + ], + { + "has-stats.gtw.test": { + 30: {"requests": 1, "request_time": 0.1}, + 60: {"requests": 2, "request_time": 0.2}, + 300: {"requests": 3, "request_time": 0.3}, + }, + }, + [ + { + "project_name": "test-proj", + "run_name": "has-stats", + "stats": { + "30": {"requests": 1, "request_time": 0.1}, + "60": {"requests": 2, "request_time": 0.2}, + "300": {"requests": 3, "request_time": 0.3}, + }, + }, + { + "project_name": "test-proj", + "run_name": "no-stats", + "stats": { + "30": {"requests": 0, "request_time": 0.0}, + "60": {"requests": 0, "request_time": 0.0}, + "300": {"requests": 0, "request_time": 0.0}, + }, + }, + ], + id="adds-empty-stats-if-no-stats", + ), + pytest.param( + [ + make_service("test-proj", "relevant", domain="relevant.gtw.test"), + ], + { + "relevant.gtw.test": { + 30: {"requests": 1, "request_time": 0.1}, + 60: {"requests": 2, "request_time": 0.2}, + 300: {"requests": 3, "request_time": 0.3}, + }, + "irrelevant.gtw.test": { + 30: {"requests": 4, "request_time": 0.4}, + 60: {"requests": 5, "request_time": 0.5}, + 300: {"requests": 6, "request_time": 0.6}, + }, + }, + [ + { + "project_name": "test-proj", + "run_name": "relevant", + "stats": { + "30": {"requests": 1, "request_time": 0.1}, + "60": {"requests": 2, "request_time": 0.2}, + "300": {"requests": 3, "request_time": 0.3}, + }, + }, + ], + id="ignores-irrelevant-hosts", + ), + pytest.param( + [], + {}, + [], + id="no-services", + ), + ], +) +async def test_collect_stats(services: Iterable[Service], collector_stats, expected_response): + repo = GatewayProxyRepo() + for service in services: + await repo.set_project(make_project(service.project_name)) + await repo.set_service(service) + client = make_client(repo) + with patch( + 
"dstack._internal.proxy.gateway.services.stats.StatsCollector.collect" + ) as collect_mock: + collect_mock.return_value = collector_stats + resp = await client.get("/api/stats/collect") + assert resp.status_code == 200 + assert resp.json() == expected_response diff --git a/src/tests/_internal/proxy/gateway/services/__init__.py b/src/tests/_internal/proxy/gateway/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/gateway/services/test_stats.py b/src/tests/_internal/proxy/gateway/services/test_stats.py new file mode 100644 index 0000000000..e019aed7f2 --- /dev/null +++ b/src/tests/_internal/proxy/gateway/services/test_stats.py @@ -0,0 +1,170 @@ +from datetime import datetime, timezone +from pathlib import Path +from textwrap import dedent + +import pytest +from freezegun import freeze_time + +from dstack._internal.proxy.gateway.schemas.stats import Stat +from dstack._internal.proxy.gateway.services.stats import StatsCollector + + +@pytest.mark.asyncio +@freeze_time(datetime(2024, 12, 6, 12, 10, tzinfo=timezone.utc)) +@pytest.mark.parametrize( + ("access_log", "expected_result"), + [ + pytest.param( + dedent( + """ + 2024-12-06T12:08:00+00:00 srv-0.gtw.test 200 0.100 1 + 2024-12-06T12:08:00+00:00 srv-1.gtw.test 200 1.100 1 + 2024-12-06T12:09:15+00:00 srv-0.gtw.test 200 0.200 1 + 2024-12-06T12:09:15+00:00 srv-1.gtw.test 200 1.200 1 + 2024-12-06T12:09:45+00:00 srv-0.gtw.test 200 0.300 1 + """ + ), + { + "srv-0.gtw.test": { + 30: Stat(requests=1, request_time=0.3), + 60: Stat(requests=2, request_time=0.25), + 300: Stat(requests=3, request_time=0.2), + }, + "srv-1.gtw.test": { + 30: Stat(requests=0, request_time=0.0), + 60: Stat(requests=1, request_time=1.2), + 300: Stat(requests=2, request_time=1.15), + }, + }, + id="multiple-services", + ), + pytest.param( + dedent( + """ + 2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.100 1 + 2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.200 1 + 2024-12-06T12:08:00+00:00 
srv.gtw.test 200 0.300 1 + 2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.400 1 + 2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.500 1 + """ + ), + { + "srv.gtw.test": { + 30: Stat(requests=0, request_time=0.0), + 60: Stat(requests=0, request_time=0.0), + 300: Stat(requests=5, request_time=0.3), + }, + }, + id="multiple-entries-per-second", + ), + pytest.param( + dedent( + """ + 2024-12-06T12:04:50+00:00 srv.gtw.test 200 0.400 1 + 2024-12-06T12:08:00+00:00 srv.gtw.test 200 0.300 1 + 2024-12-06T12:09:15+00:00 srv.gtw.test 200 0.200 1 + 2024-12-06T12:09:45+00:00 srv.gtw.test 200 0.100 1 + """ + ), + { + "srv.gtw.test": { + 30: Stat(requests=1, request_time=0.1), + 60: Stat(requests=2, request_time=0.15), + 300: Stat(requests=3, request_time=0.2), + }, + }, + id="ignores-out-of-window", + ), + pytest.param( + dedent( + """ + 2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.100 1 + 2024-12-06T12:08:02+00:00 srv.gtw.test 200 0.200 0 + 2024-12-06T12:08:03+00:00 srv.gtw.test 200 0.300 1 + """ + ), + { + "srv.gtw.test": { + 30: Stat(requests=0, request_time=0.0), + 60: Stat(requests=0, request_time=0.0), + 300: Stat(requests=2, request_time=0.2), + }, + }, + id="ignores-replica-not-hit", + ), + pytest.param( + dedent( + """ + 2024-12-06T12:08:01+00:00 srv.gtw.test 200 0.100 + 2024-12-06T12:08:02+00:00 srv.gtw.test 303 0.200 + 2024-12-06T12:08:03+00:00 srv.gtw.test 401 0.300 + 2024-12-06T12:08:04+00:00 srv.gtw.test 502 0.400 + 2024-12-06T12:08:05+00:00 srv.gtw.test 403 0.500 + 2024-12-06T12:08:06+00:00 srv.gtw.test 404 0.600 + """ + ), + { + "srv.gtw.test": { + 30: Stat(requests=0, request_time=0.0), + 60: Stat(requests=0, request_time=0.0), + 300: Stat(requests=4, request_time=0.25), + }, + }, + id="ignores-irrelevant-statuses-in-legacy-pre-0.19.11-log", + ), + pytest.param( + "", + {}, + id="empty-log", + ), + ], +) +async def test_collect_stats(access_log: str, expected_result: dict, tmp_path: Path) -> None: + access_log_path = tmp_path / "dstack.access.log" + 
access_log_path.write_text(access_log.lstrip()) + collector = StatsCollector(access_log_path) + result = await collector.collect() + assert result == expected_result + + +@pytest.mark.asyncio +@freeze_time(datetime(2024, 12, 6, 12, 10, tzinfo=timezone.utc)) +async def test_collect_stats_after_log_update(tmp_path: Path) -> None: + access_log_path = tmp_path / "dstack.access.log" + collector = StatsCollector(access_log_path) + first_chunk = dedent( + """ + 2024-12-06T12:09:15+00:00 srv.gtw.test 200 0.100 + 2024-12-06T12:09:20+00:00 srv.gtw.test 200 0.200 + """ + ).lstrip() + second_chunk = dedent( + """ + 2024-12-06T12:09:40+00:00 srv.gtw.test 200 0.300 + 2024-12-06T12:09:45+00:00 srv.gtw.test 200 0.400 + 2024-12-06T12:09:50+00:00 srv.gtw.test 200 0.500 + """ + ).lstrip() + first_chunk_stats = { + "srv.gtw.test": { + 30: Stat(requests=0, request_time=0.0), + 60: Stat(requests=2, request_time=0.15), + 300: Stat(requests=2, request_time=0.15), + }, + } + both_chunks_stats = { + "srv.gtw.test": { + 30: Stat(requests=3, request_time=0.4), + 60: Stat(requests=5, request_time=0.3), + 300: Stat(requests=5, request_time=0.3), + }, + } + with open(access_log_path, "w") as f: + f.write(first_chunk) + f.flush() + result = await collector.collect() + assert result == first_chunk_stats + f.write(second_chunk) + f.flush() + result = await collector.collect() + assert result == both_chunks_stats diff --git a/src/tests/_internal/proxy/gateway/test_app.py b/src/tests/_internal/proxy/gateway/test_app.py new file mode 100644 index 0000000000..d1bff6d2ad --- /dev/null +++ b/src/tests/_internal/proxy/gateway/test_app.py @@ -0,0 +1,32 @@ +from pathlib import Path + +import pytest + +from dstack._internal.proxy.gateway.app import lifespan, make_app +from dstack._internal.proxy.gateway.models import ModelEntrypoint +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.gateway.services.nginx import Nginx +from 
dstack._internal.proxy.gateway.testing.common import Mocks +from dstack._internal.proxy.lib.testing.common import make_project, make_service + + +@pytest.mark.asyncio +async def test_lifespan(tmp_path: Path, system_mocks: Mocks) -> None: + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_entrypoint( + ModelEntrypoint(project_name="proj-1", domain="gateway.gtw.test", https=True) + ) + await repo.set_service( + make_service("test-proj", "test-run", domain="test-run.gtw.test", https=True) + ) + nginx_dir = tmp_path / "nginx" + nginx_dir.mkdir() + app = make_app(repo=repo, nginx=Nginx(conf_dir=nginx_dir)) + async with lifespan(app): + assert (nginx_dir / "00-log-format.conf").exists() + assert (nginx_dir / "443-gateway.gtw.test.conf").exists() + assert (nginx_dir / "443-test-run.gtw.test.conf").exists() + assert system_mocks.open_conn.call_count == 1 + assert system_mocks.close_conn.call_count == 0 + assert system_mocks.close_conn.call_count == 1 diff --git a/src/tests/_internal/proxy/lib/__init__.py b/src/tests/_internal/proxy/lib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/lib/routers/__init__.py b/src/tests/_internal/proxy/lib/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/proxy/lib/routers/test_model_proxy.py b/src/tests/_internal/proxy/lib/routers/test_model_proxy.py new file mode 100644 index 0000000000..d044f87e02 --- /dev/null +++ b/src/tests/_internal/proxy/lib/routers/test_model_proxy.py @@ -0,0 +1,255 @@ +from datetime import datetime +from typing import AsyncIterator, Generator +from unittest.mock import patch + +import httpx +import openai +import pytest +from fastapi import FastAPI + +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.models import ChatModel, OpenAIChatModelFormat 
+from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.routers.model_proxy import router +from dstack._internal.proxy.lib.schemas.model_proxy import ( + ChatCompletionsChoice, + ChatCompletionsChunk, + ChatCompletionsChunkChoice, + ChatCompletionsRequest, + ChatCompletionsResponse, + ChatCompletionsUsage, + ChatMessage, +) +from dstack._internal.proxy.lib.services.model_proxy.clients.base import ChatCompletionsClient +from dstack._internal.proxy.lib.testing.auth import ProxyTestAuthProvider +from dstack._internal.proxy.lib.testing.common import ( + ProxyTestDependencyInjector, + make_project, + make_service, +) + +SAMPLE_RESPONSE = "Hello there, how may I assist you today?" + + +class ChatClientStub(ChatCompletionsClient): + async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse: + return ChatCompletionsResponse( + id="chatcmpl-123", + choices=[ + ChatCompletionsChoice( + finish_reason="stop", + index=0, + message=ChatMessage( + role="assistant", + content=SAMPLE_RESPONSE, + ), + ) + ], + created=int(datetime.now().timestamp()), + model=request.model, + usage=ChatCompletionsUsage( + completion_tokens=12, + prompt_tokens=9, + total_tokens=21, + ), + ) + + async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]: + for i, word in enumerate(SAMPLE_RESPONSE.split(" ")): + if i > 0: + word = " " + word + yield ChatCompletionsChunk( + id="chatcmpl-123", + choices=[ + ChatCompletionsChunkChoice( + finish_reason=None, + index=0, + delta=dict( + role="assistant", + content=word, + ), + ) + ], + created=int(datetime.now().timestamp()), + model=request.model, + ) + + +def make_model( + project_name: str, name: str, run_name: str, created_at: datetime = datetime.fromtimestamp(0) +) -> ChatModel: + return ChatModel( + project_name=project_name, + name=name, + created_at=created_at, + run_name=run_name, + format_spec=OpenAIChatModelFormat(format="openai", prefix="/v1"), + ) + 
+ +def make_http_client(repo: BaseProxyRepo, auth: BaseProxyAuthProvider) -> httpx.AsyncClient: + app = FastAPI() + app.state.proxy_dependency_injector = ProxyTestDependencyInjector(repo=repo, auth=auth) + app.include_router(router, prefix="/proxy/models") + return httpx.AsyncClient(transport=httpx.ASGITransport(app=app)) + + +def make_openai_client( + repo: BaseProxyRepo, + auth: BaseProxyAuthProvider, + project_name: str, + auth_token: str = "token", +) -> openai.AsyncOpenAI: + http_client = make_http_client(repo, auth) + return openai.AsyncOpenAI( + api_key=auth_token, + base_url=f"http://test-host/proxy/models/{project_name}", + http_client=http_client, + ) + + +@pytest.fixture +def mock_chat_client() -> Generator[None, None, None]: + with ( + patch( + "dstack._internal.proxy.lib.services.service_connection.ServiceConnectionPool.get_or_add" + ), + patch("dstack._internal.proxy.lib.routers.model_proxy.get_chat_client") as get_client_mock, + ): + get_client_mock.return_value = ChatClientStub() + yield + + +@pytest.mark.asyncio +async def test_list_models() -> None: + auth = ProxyTestAuthProvider({"test-proj": {"token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "test-service-1")) + await repo.set_service(make_service("test-proj", "test-service-2")) + await repo.set_model( + make_model( + "test-proj", "test-model-1", "test-service-1", created_at=datetime.fromtimestamp(123) + ), + ) + await repo.set_model( + make_model( + "test-proj", "test-model-2", "test-service-2", created_at=datetime.fromtimestamp(321) + ), + ) + + client = make_openai_client(repo, auth, "test-proj", auth_token="token") + models = [model async for model in client.models.list()] + + assert models[0].id == "test-model-1" + assert models[0].created == 123 + assert models[0].owned_by == "test-proj" + assert models[1].id == "test-model-2" + assert models[1].created == 321 + assert 
models[1].owned_by == "test-proj" + + +@pytest.mark.asyncio +async def test_list_models_empty() -> None: + auth = ProxyTestAuthProvider({"test-proj": {"token"}, "test-proj-empty": {"token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_project(make_project("test-proj-empty")) + await repo.set_service(make_service("test-proj", "test-service")) + await repo.set_model(make_model("test-proj", "test-model", "test-service")) + + client = make_openai_client(repo, auth, "test-proj-empty", auth_token="token") + models = [model async for model in client.models.list()] + assert not models + + +@pytest.mark.asyncio +async def test_chat_completions(mock_chat_client) -> None: + auth = ProxyTestAuthProvider({"test-proj": {"token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "test-service")) + await repo.set_model(make_model("test-proj", "test-model", "test-service")) + client = make_openai_client(repo, auth, "test-proj", auth_token="token") + completion = await client.chat.completions.create( + model="test-model", + messages=[{"role": "user", "content": "Hi"}], + ) + assert completion.choices[0].message.content == SAMPLE_RESPONSE + + +@pytest.mark.asyncio +async def test_chat_completions_stream(mock_chat_client) -> None: + auth = ProxyTestAuthProvider({"test-proj": {"token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "test-service")) + await repo.set_model(make_model("test-proj", "test-model", "test-service")) + client = make_openai_client(repo, auth, "test-proj", auth_token="token") + response = await client.chat.completions.create( + model="test-model", + messages=[{"role": "user", "content": "Hi"}], + stream=True, + ) + completion = "" + async for chunk in response: + completion += chunk.choices[0].delta.content + assert completion == 
SAMPLE_RESPONSE + + +@pytest.mark.asyncio +async def test_chat_completions_model_not_found() -> None: + auth = ProxyTestAuthProvider({"test-proj": {"token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + client = make_openai_client(repo, auth, "test-proj", auth_token="token") + with pytest.raises(openai.NotFoundError): + await client.chat.completions.create( + model="unknown-model", + messages=[{"role": "user", "content": "Hi"}], + ) + + +@pytest.mark.asyncio +async def test_unauthorized_openai_sdk() -> None: + auth = ProxyTestAuthProvider({"test-proj": {"correct-token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + client = make_openai_client(repo, auth, "test-proj", auth_token="invalid-token") + + with pytest.raises(openai.PermissionDeniedError): + await client.models.list() + with pytest.raises(openai.PermissionDeniedError): + await client.chat.completions.create( + model="test-model", + messages=[{"role": "user", "content": "Hi"}], + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "headers", + [ + {"Authorization": "Bearer invalid-token"}, + {"Authorization": "Bearer "}, + {"Authorization": "Bearer"}, + {"Authorization": ""}, + None, + ], +) +async def test_unauthorized_http(headers) -> None: + auth = ProxyTestAuthProvider({"test-proj": {"correct-token"}}) + repo = GatewayProxyRepo() + await repo.set_project(make_project("test-proj")) + client = make_http_client(repo, auth) + + resp = await client.get("http://test-host/proxy/models/test-proj/models", headers=headers) + assert resp.status_code == 403 + + resp = await client.post( + "http://test-host/proxy/models/test-proj/chat/completions", + json={"model": "test-model", "messages": [{"role": "user", "content": "Hi"}]}, + headers=headers, + ) + assert resp.status_code == 403 diff --git a/src/tests/_internal/server/background/pipeline_tasks/__init__.py 
b/src/tests/_internal/server/background/pipeline_tasks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_base.py b/src/tests/_internal/server/background/pipeline_tasks/test_base.py new file mode 100644 index 0000000000..303fb0854e --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_base.py @@ -0,0 +1,183 @@ +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import patch + +import pytest +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.pipeline_tasks.base import Heartbeater, PipelineItem +from dstack._internal.server.models import PlacementGroupModel +from dstack._internal.server.testing.common import ( + create_fleet, + create_placement_group, + create_project, +) + + +@pytest.fixture +def now() -> datetime: + return datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + + +@pytest.fixture +def heartbeater() -> Heartbeater[PipelineItem]: + return Heartbeater( + model_type=PlacementGroupModel, + lock_timeout=timedelta(seconds=30), + heartbeat_trigger=timedelta(seconds=5), + ) + + +async def _create_locked_placement_group( + session: AsyncSession, + now: datetime, + lock_expires_in: timedelta, +) -> PlacementGroupModel: + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg", + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = now + lock_expires_in + await session.commit() + return placement_group + + +def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> PipelineItem: + assert placement_group.lock_token is not None + assert placement_group.lock_expires_at is not None + return PipelineItem( + 
__tablename__=PlacementGroupModel.__tablename__, + id=placement_group.id, + lock_token=placement_group.lock_token, + lock_expires_at=placement_group.lock_expires_at, + prev_lock_expired=False, + ) + + +class TestHeartbeater: + @pytest.mark.asyncio + async def test_untrack_preserves_item_when_lock_token_mismatches( + self, heartbeater: Heartbeater[PipelineItem], now: datetime + ): + item = PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=uuid.uuid4(), + lock_token=uuid.uuid4(), + lock_expires_at=now + timedelta(seconds=10), + prev_lock_expired=True, + ) + await heartbeater.track(item) + + stale_item = PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=item.id, + lock_token=uuid.uuid4(), + lock_expires_at=item.lock_expires_at, + prev_lock_expired=False, + ) + await heartbeater.untrack(stale_item) + + assert item.id in heartbeater._items + await heartbeater.untrack(item) + assert item.id not in heartbeater._items + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_heartbeat_extends_locks_close_to_expiration( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PipelineItem], + now: datetime, + ): + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=2), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await heartbeater.heartbeat() + + expected_lock_expires_at = now + timedelta(seconds=30) + tracked_item = heartbeater._items[placement_group.id] + assert tracked_item.lock_expires_at == expected_lock_expires_at + + await session.refresh(placement_group) + assert placement_group.lock_expires_at == expected_lock_expires_at + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + 
async def test_heartbeat_untracks_expired_items_without_db_update( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PipelineItem], + now: datetime, + ): + original_lock_expires_at = now - timedelta(seconds=1) + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=-1), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await heartbeater.heartbeat() + + assert placement_group.id not in heartbeater._items + + await session.refresh(placement_group) + assert placement_group.lock_expires_at == original_lock_expires_at + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_heartbeat_untracks_item_when_lock_token_changed_in_db( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PipelineItem], + now: datetime, + ): + original_lock_expires_at = now + timedelta(seconds=2) + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=2), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + new_lock_token = uuid.uuid4() + await session.execute( + update(PlacementGroupModel) + .where(PlacementGroupModel.id == placement_group.id) + .values(lock_token=new_lock_token) + .execution_options(synchronize_session=False) + ) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await heartbeater.heartbeat() + + assert placement_group.id not in heartbeater._items + + await session.refresh(placement_group) + assert placement_group.lock_token == new_lock_token + assert placement_group.lock_expires_at == original_lock_expires_at diff --git 
a/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py new file mode 100644 index 0000000000..66296ba844 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py @@ -0,0 +1,223 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.compute_groups import ComputeGroupStatus +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem +from dstack._internal.server.background.pipeline_tasks.compute_groups import ( + ComputeGroupFetcher, + ComputeGroupPipeline, + ComputeGroupWorker, +) +from dstack._internal.server.models import ComputeGroupModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_compute_group, + create_fleet, + create_project, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> ComputeGroupWorker: + return ComputeGroupWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> ComputeGroupFetcher: + return ComputeGroupFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +def _compute_group_to_pipeline_item(compute_group: ComputeGroupModel) -> PipelineItem: + assert compute_group.lock_token is not None + assert compute_group.lock_expires_at is not None + return PipelineItem( + __tablename__=compute_group.__tablename__, + id=compute_group.id, + lock_token=compute_group.lock_token, + lock_expires_at=compute_group.lock_expires_at, + prev_lock_expired=False, + ) + 
+ +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestComputeGroupFetcher: + async def test_fetch_selects_eligible_compute_groups_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: ComputeGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + eligible = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=stale - timedelta(seconds=2), + ) + finished = await create_compute_group( + session=session, + project=project, + fleet=fleet, + status=ComputeGroupStatus.TERMINATED, + last_processed_at=stale - timedelta(seconds=1), + ) + recent = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now, + ) + locked = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=stale, + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [eligible.id] + + for compute_group in [eligible, finished, recent, locked]: + await session.refresh(compute_group) + + assert eligible.lock_owner == ComputeGroupPipeline.__name__ + assert eligible.lock_expires_at is not None + assert eligible.lock_token is not None + + assert finished.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_compute_groups_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: ComputeGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + + oldest = await create_compute_group( + 
session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == ComputeGroupPipeline.__name__ + assert middle.lock_owner == ComputeGroupPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestComputeGroupWorker: + async def test_terminates_compute_group( + self, test_db, session: AsyncSession, worker: ComputeGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group( + session=session, + project=project, + fleet=fleet, + ) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + await worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status == ComputeGroupStatus.TERMINATED + assert compute_group.deleted + + async def test_retries_compute_group_termination( + self, test_db, session: AsyncSession, 
worker: ComputeGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + ) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.terminate_compute_group.side_effect = BackendError() + await worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status != ComputeGroupStatus.TERMINATED + assert compute_group.first_termination_retry_at is not None + assert compute_group.last_termination_retry_at is not None + # Simulate termination deadline exceeded + compute_group.first_termination_retry_at = datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc) + compute_group.last_termination_retry_at = datetime(2023, 1, 2, 4, 0, tzinfo=timezone.utc) + compute_group.last_processed_at = datetime(2023, 1, 2, 4, 0, tzinfo=timezone.utc) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.terminate_compute_group.side_effect = BackendError() + await 
worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status == ComputeGroupStatus.TERMINATED diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py new file mode 100644 index 0000000000..2268ad9400 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py @@ -0,0 +1,1587 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import ( + FleetNodesSpec, + FleetStatus, + InstanceGroupPlacement, +) +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.background.pipeline_tasks import fleets as fleets_pipeline +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem +from dstack._internal.server.background.pipeline_tasks.fleets import ( + FleetFetcher, + FleetPipeline, + FleetWorker, +) +from dstack._internal.server.models import ( + EventModel, + EventTargetModel, + ExportedFleetModel, + FleetModel, + InstanceModel, +) +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_export, + create_fleet, + create_instance, + create_placement_group, + create_project, + create_repo, + create_run, + create_user, + get_fleet_configuration, + get_fleet_spec, + get_job_provisioning_data, + get_ssh_fleet_configuration, +) +from 
dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> FleetWorker: + return FleetWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> FleetFetcher: + return FleetFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=60), + lock_timeout=timedelta(seconds=20), + heartbeater=Mock(), + ) + + +def _fleet_to_pipeline_item(fleet: FleetModel) -> PipelineItem: + assert fleet.lock_token is not None + assert fleet.lock_expires_at is not None + return PipelineItem( + __tablename__=fleet.__tablename__, + id=fleet.id, + lock_token=fleet.lock_token, + lock_expires_at=fleet.lock_expires_at, + prev_lock_expired=False, + ) + + +async def _lock_fleet_for_processing(session: AsyncSession, fleet: FleetModel) -> None: + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestFleetFetcher: + async def test_fetch_selects_eligible_fleets_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: FleetFetcher + ): + project = await create_project(session) + now = get_current_datetime() + + stale = await create_fleet( + session=session, + project=project, + last_processed_at=now - timedelta(minutes=3), + ) + just_created = await create_fleet( + session=session, + project=project, + created_at=now, + last_processed_at=now, + name="just-created", + ) + deleted = await create_fleet( + session=session, + project=project, + deleted=True, + name="deleted", + last_processed_at=now - timedelta(minutes=2), + ) + recent = await create_fleet( + session=session, + project=project, + created_at=now - timedelta(minutes=2), + last_processed_at=now, + name="recent", + ) + locked = await create_fleet( + session=session, + project=project, + name="locked", + 
last_processed_at=now - timedelta(minutes=1, seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == {stale.id, just_created.id} + + for fleet in [stale, just_created, deleted, recent, locked]: + await session.refresh(fleet) + + assert stale.lock_owner == FleetPipeline.__name__ + assert just_created.lock_owner == FleetPipeline.__name__ + assert stale.lock_expires_at is not None + assert just_created.lock_expires_at is not None + assert stale.lock_token is not None + assert just_created.lock_token is not None + assert len({stale.lock_token, just_created.lock_token}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_fleets_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: FleetFetcher + ): + project = await create_project(session) + now = get_current_datetime() + + oldest = await create_fleet( + session=session, + project=project, + name="oldest", + last_processed_at=now - timedelta(minutes=4), + ) + middle = await create_fleet( + session=session, + project=project, + name="middle", + last_processed_at=now - timedelta(minutes=3), + ) + newest = await create_fleet( + session=session, + project=project, + name="newest", + last_processed_at=now - timedelta(minutes=2), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == FleetPipeline.__name__ + assert middle.lock_owner == FleetPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestFleetWorker: + async def 
test_skips_instance_locking_for_ssh_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(conf=get_ssh_fleet_configuration()), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + original_last_processed_at = fleet.last_processed_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + instance.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance) + assert not fleet.deleted + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert instance.lock_owner == "OtherPipeline" + + async def test_skips_instance_locking_when_fleet_is_not_ready_for_consolidation( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + original_last_processed_at = fleet.last_processed_at + original_last_consolidated_at = datetime.now(timezone.utc) + fleet.consolidation_attempt = 1 + fleet.last_consolidated_at = original_last_consolidated_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + instance.lock_owner = 
"OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance) + assert not fleet.deleted + assert fleet.consolidation_attempt == 1 + assert fleet.last_consolidated_at == original_last_consolidated_at + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert instance.lock_owner == "OtherPipeline" + + async def test_resets_fleet_lock_when_not_all_instances_can_be_locked( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + locked_elsewhere = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + original_last_processed_at = fleet.last_processed_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.lock_owner = FleetPipeline.__name__ + locked_elsewhere.lock_token = uuid.uuid4() + locked_elsewhere.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + locked_elsewhere.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(locked_elsewhere) + assert fleet.lock_owner == FleetPipeline.__name__ + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert locked_elsewhere.lock_owner == "OtherPipeline" + + async def test_unlocks_instances_after_consolidation( + self, test_db, session: AsyncSession, worker: FleetWorker + ): 
+ project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(instance) + assert instance.lock_owner is None + assert instance.lock_token is None + assert instance.lock_expires_at is None + + async def test_unlocks_instances_when_fleet_lock_token_changes_after_processing( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + await _lock_fleet_for_processing(session, fleet) + + async def mock_process_fleet(*args, **kwargs): + fleet_model = args[0] + fleet_model.lock_token = uuid.uuid4() + return fleets_pipeline._ProcessResult() + + with patch.object( + fleets_pipeline, + "_process_fleet", + AsyncMock(side_effect=mock_process_fleet), + ): + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(instance) + assert instance.lock_owner is None + assert instance.lock_token is None + assert instance.lock_expires_at is None + + async def test_syncs_initial_current_master_for_cluster_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + 
placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + first_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == first_instance.id + + async def test_keeps_current_master_when_it_is_still_active( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + current_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PROVISIONING, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + fleet.current_master_instance_id = current_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == current_master.id + + async def test_promotes_provisioned_survivor_when_current_master_terminated( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + 
spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + provisioned_survivor = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(terminated_master) + assert terminated_master.deleted + assert fleet.current_master_instance_id == provisioned_survivor.id + + async def test_promotes_next_bootstrap_candidate_when_current_master_terminated( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + next_candidate = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == next_candidate.id + + async def 
test_does_not_elect_terminating_bootstrap_candidate_as_master( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=3), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + pending_candidate = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == pending_candidate.id + + async def test_clears_current_master_for_non_cluster_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + fleet.current_master_instance_id = instance.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id is None + + async def test_syncs_current_master_after_creating_missing_instances( + self, test_db, session: AsyncSession, worker: 
FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel) + .where(InstanceModel.fleet_id == fleet.id, InstanceModel.deleted == False) + .order_by(InstanceModel.instance_num, InstanceModel.created_at) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 2 + assert fleet.current_master_instance_id == instances[0].id + + async def test_prefers_surviving_instance_over_new_replacement_for_master_election( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + surviving_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(terminated_master) + await session.refresh(surviving_instance) + non_deleted_instances = ( + ( + await session.execute( + select(InstanceModel) + .where(InstanceModel.fleet_id == fleet.id, 
InstanceModel.deleted == False) + .order_by(InstanceModel.instance_num, InstanceModel.created_at) + ) + ) + .scalars() + .all() + ) + + assert terminated_master.deleted + assert fleet.current_master_instance_id == surviving_instance.id + assert len(non_deleted_instances) == 2 + assert any( + instance.id != surviving_instance.id and instance.instance_num == 0 + for instance in non_deleted_instances + ) + + async def test_min_zero_failed_master_terminates_unprovisioned_siblings( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=0, target=3, max=3), + ) + ), + ) + failed_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + failed_master.termination_reason = InstanceTerminationReason.NO_OFFERS + sibling1 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + sibling2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = failed_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(failed_master) + await session.refresh(sibling1) + await session.refresh(sibling2) + assert failed_master.deleted + assert sibling1.status == InstanceStatus.TERMINATED + assert sibling2.status == InstanceStatus.TERMINATED + assert sibling1.termination_reason == InstanceTerminationReason.MASTER_FAILED + assert 
sibling2.termination_reason == InstanceTerminationReason.MASTER_FAILED + assert fleet.current_master_instance_id is None + + async def test_master_failure_path_resets_when_sibling_instance_is_locked( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=0, target=3, max=3), + ) + ), + ) + failed_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + failed_master.termination_reason = InstanceTerminationReason.NO_OFFERS + locked_sibling = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + free_sibling = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + original_last_processed_at = fleet.last_processed_at + fleet.current_master_instance_id = failed_master.id + fleet.consolidation_attempt = 1 + fleet.last_consolidated_at = datetime.now(timezone.utc) + await _lock_fleet_for_processing(session, fleet) + fleet.lock_owner = FleetPipeline.__name__ + locked_sibling.lock_token = uuid.uuid4() + locked_sibling.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + locked_sibling.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(failed_master) + await session.refresh(locked_sibling) + await session.refresh(free_sibling) + assert fleet.current_master_instance_id == failed_master.id + assert fleet.lock_owner == 
FleetPipeline.__name__ + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert not failed_master.deleted + assert locked_sibling.status == InstanceStatus.PENDING + assert locked_sibling.termination_reason is None + assert locked_sibling.lock_owner == "OtherPipeline" + assert free_sibling.status == InstanceStatus.PENDING + assert free_sibling.termination_reason is None + + async def test_min_zero_failed_master_preserves_provisioned_survivor( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=0, target=2, max=2), + ) + ), + ) + failed_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + failed_master.termination_reason = InstanceTerminationReason.NO_OFFERS + provisioned_survivor = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + pending_sibling = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = failed_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(provisioned_survivor) + await session.refresh(pending_sibling) + assert provisioned_survivor.status == InstanceStatus.IDLE + assert pending_sibling.status == InstanceStatus.PENDING + assert pending_sibling.termination_reason is None + assert 
fleet.current_master_instance_id == provisioned_survivor.id + + async def test_deletes_empty_autocreated_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.autocreated = True + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.deleted + + async def test_deletes_terminating_user_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + status=FleetStatus.TERMINATING, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.deleted + + async def test_does_not_delete_empty_active_user_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert not fleet.deleted + + async def test_does_not_delete_fleet_with_active_run( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=user, 
project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + fleet.runs.append(run) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert not fleet.deleted + + async def test_does_not_delete_fleet_with_instance( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + fleet.instances.append(instance) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert not fleet.deleted + + async def test_consolidation_creates_missing_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await 
worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = (await session.execute(select(InstanceModel))).scalars().all() + assert len(instances) == 2 + assert {i.instance_num for i in instances} == {0, 1} + assert fleet.consolidation_attempt == 1 + + async def test_consolidation_terminates_redundant_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance1 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=0, + ) + instance2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + instance3 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + instance_num=2, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + await session.refresh(instance3) + assert instance1.status == InstanceStatus.BUSY + assert instance2.status == InstanceStatus.TERMINATING + assert instance3.deleted + assert fleet.consolidation_attempt == 1 + + async def test_consolidation_attempt_increments_when_over_max_and_no_idle_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance1 = await create_instance( + 
session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=0, + ) + instance2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=1, + ) + + fleet.consolidation_attempt = 2 + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status == InstanceStatus.BUSY + assert instance2.status == InstanceStatus.BUSY + assert fleet.consolidation_attempt == 3 + + async def test_deletes_related_resources_on_fleet_delete( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + status=FleetStatus.TERMINATING, + ) + placement_group1 = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg-1", + ) + placement_group2 = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg-2", + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[fleet], + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(placement_group1) + await session.refresh(placement_group2) + assert fleet.deleted + assert placement_group1.fleet_deleted + assert placement_group2.fleet_deleted + res = await session.execute(select(ExportedFleetModel)) + assert len(res.scalars().all()) == 0 + + async def test_consolidation_respects_retry_delay( + self, test_db, session: AsyncSession, worker: FleetWorker + ): 
+ project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = 1 + fleet.last_consolidated_at = datetime.now(timezone.utc) + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 1 + assert fleet.consolidation_attempt == 1 + assert not fleet.deleted + + async def test_consolidation_attempt_resets_when_no_changes( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = 3 + previous_last_consolidated_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.last_consolidated_at = previous_last_consolidated_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + 
.scalars() + .all() + ) + assert len(instances) == 1 + assert fleet.consolidation_attempt == 0 + last_consolidated_at = fleet.last_consolidated_at + assert last_consolidated_at + assert last_consolidated_at > previous_last_consolidated_at + + async def test_consolidation_terminates_idle_instances_not_matching_fleet_spec( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=2) + spec.configuration.backends = [BackendType.AWS] + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + matching_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + backend=BackendType.AWS, + instance_num=0, + ) + mismatched_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + backend=BackendType.GCP, + instance_num=1, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(matching_instance) + await session.refresh(mismatched_instance) + assert matching_instance.status == InstanceStatus.IDLE + assert mismatched_instance.status == InstanceStatus.TERMINATING + assert ( + mismatched_instance.termination_reason == InstanceTerminationReason.FLEET_SPEC_MISMATCH + ) + + async def test_consolidation_preserves_pending_instances_without_offer( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + spec.configuration.backends = [BackendType.AWS] + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + pending_instance = await create_instance( + session=session, 
+ project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + instance_num=0, + offer=None, + job_provisioning_data=None, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(pending_instance) + assert pending_instance.status == InstanceStatus.PENDING + + async def test_consolidation_preserves_busy_instances_not_matching_fleet_spec( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + spec.configuration.backends = [BackendType.AWS] + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + busy_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + backend=BackendType.GCP, + instance_num=0, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(busy_instance) + assert busy_instance.status == InstanceStatus.BUSY + + async def test_consolidation_creates_replacements_after_spec_mismatch_termination( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + spec.configuration.backends = [BackendType.AWS] + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance1 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + backend=BackendType.GCP, + instance_num=0, + ) + instance2 = await create_instance( + session=session, + project=project, + fleet=fleet, + 
status=InstanceStatus.IDLE, + backend=BackendType.GCP, + instance_num=1, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status == InstanceStatus.TERMINATING + assert instance2.status == InstanceStatus.TERMINATING + # New replacement instances should be created to satisfy nodes.min=2 + all_instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + new_instances = [i for i in all_instances if i.status == InstanceStatus.PENDING] + assert len(new_instances) == 2 + assert fleet.consolidation_attempt == 1 + + async def test_consolidation_preserves_instances_matching_fleet_spec( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + spec.configuration.backends = [BackendType.AWS] + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + backend=BackendType.AWS, + instance_num=0, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance) + assert instance.status == InstanceStatus.IDLE + assert fleet.consolidation_attempt == 0 + + async def test_consolidation_stops_at_max_attempts( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec 
= get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = fleets_pipeline._MAX_CONSOLIDATION_ATTEMPTS + fleet.last_consolidated_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 1 + assert fleet.consolidation_attempt == fleets_pipeline._MAX_CONSOLIDATION_ATTEMPTS + assert not fleet.deleted + + async def test_consolidation_emits_event_on_reaching_limit( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = fleets_pipeline._MAX_CONSOLIDATION_ATTEMPTS - 1 + fleet.last_consolidated_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + 
InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + # Last allowed consolidation still creates the missing instance + assert len(instances) == 2 + assert fleet.consolidation_attempt == fleets_pipeline._MAX_CONSOLIDATION_ATTEMPTS + # Verify the consolidation-stopped event was emitted + event_models = ( + ( + await session.execute( + select(EventModel) + .join(EventTargetModel) + .where(EventTargetModel.entity_id == fleet.id) + ) + ) + .scalars() + .all() + ) + consolidation_stopped_events = [ + e for e in event_models if "consolidation stopped" in e.message + ] + assert len(consolidation_stopped_events) == 1 + + async def test_consolidation_resumes_after_attempt_reset( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + # Simulate in-place update resetting the attempt counter + fleet.consolidation_attempt = 0 + fleet.last_consolidated_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 2 + assert fleet.consolidation_attempt == 1 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py new file mode 100644 index 0000000000..2759d8a236 --- /dev/null +++ 
b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py @@ -0,0 +1,789 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, Mock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.gateways import ( + GatewayConfiguration, + GatewayProvisioningData, + GatewayStatus, +) +from dstack._internal.server.background.pipeline_tasks.gateways import ( + GatewayFetcher, + GatewayPipeline, + GatewayPipelineItem, + GatewayWorker, +) +from dstack._internal.server.models import GatewayModel +from dstack._internal.server.testing.common import ( + AsyncContextManager, + ComputeMockSpec, + create_backend, + create_gateway, + create_gateway_compute, + create_project, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> GatewayWorker: + return GatewayWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> GatewayFetcher: + return GatewayFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +def _gateway_to_pipeline_item(gateway_model: GatewayModel) -> GatewayPipelineItem: + assert gateway_model.lock_token is not None + assert gateway_model.lock_expires_at is not None + return GatewayPipelineItem( + __tablename__=gateway_model.__tablename__, + id=gateway_model.id, + lock_token=gateway_model.lock_token, + lock_expires_at=gateway_model.lock_expires_at, + prev_lock_expired=False, + status=gateway_model.status, + to_be_deleted=gateway_model.to_be_deleted, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", 
["sqlite", "postgres"], indirect=True) +class TestGatewayFetcher: + async def test_fetch_selects_eligible_gateways_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: GatewayFetcher + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + submitted = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="submitted", + status=GatewayStatus.SUBMITTED, + last_processed_at=stale - timedelta(seconds=3), + ) + provisioning = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="provisioning", + status=GatewayStatus.PROVISIONING, + last_processed_at=stale - timedelta(seconds=2), + ) + to_be_deleted = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="to-be-deleted", + status=GatewayStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + to_be_deleted.to_be_deleted = True + + just_created = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="just-created", + status=GatewayStatus.SUBMITTED, + last_processed_at=now, + ) + just_created.created_at = now + just_created.last_processed_at = now + + ineligible_status = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="ineligible-status", + status=GatewayStatus.RUNNING, + last_processed_at=stale, + ) + recent = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="recent", + status=GatewayStatus.SUBMITTED, + last_processed_at=now, + ) + recent.created_at = now - timedelta(minutes=2) + recent.last_processed_at = now + + locked = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="locked", + status=GatewayStatus.SUBMITTED, + 
last_processed_at=stale + timedelta(seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + submitted.id, + provisioning.id, + to_be_deleted.id, + just_created.id, + } + assert {(item.id, item.status, item.to_be_deleted) for item in items} == { + (submitted.id, GatewayStatus.SUBMITTED, False), + (provisioning.id, GatewayStatus.PROVISIONING, False), + (to_be_deleted.id, GatewayStatus.RUNNING, True), + (just_created.id, GatewayStatus.SUBMITTED, False), + } + + for gateway in [ + submitted, + provisioning, + to_be_deleted, + just_created, + ineligible_status, + recent, + locked, + ]: + await session.refresh(gateway) + + fetched_gateways = [submitted, provisioning, to_be_deleted, just_created] + assert all(gateway.lock_owner == GatewayPipeline.__name__ for gateway in fetched_gateways) + assert all(gateway.lock_expires_at is not None for gateway in fetched_gateways) + assert all(gateway.lock_token is not None for gateway in fetched_gateways) + assert len({gateway.lock_token for gateway in fetched_gateways}) == 1 + + assert ineligible_status.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_gateways_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: GatewayFetcher + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + now = get_current_datetime() + + oldest = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="oldest", + status=GatewayStatus.SUBMITTED, + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="middle", + 
status=GatewayStatus.PROVISIONING, + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="newest", + status=GatewayStatus.SUBMITTED, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == GatewayPipeline.__name__ + assert middle.lock_owner == GatewayPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerSubmitted: + async def test_submitted_to_provisioning( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.return_value = GatewayProvisioningData( + instance_id="i-1234567890", + ip_address="2.2.2.2", + region="us", + ) + await worker.process(_gateway_to_pipeline_item(gateway)) + m.assert_called_once() + aws.compute.return_value.create_gateway.assert_called_once() + + await session.refresh(gateway) + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id == gateway.id) + 
.options(selectinload(GatewayModel.gateway_computes)) + ) + gateway = res.unique().scalar_one() + assert gateway.status == GatewayStatus.PROVISIONING + assert len(gateway.gateway_computes) > 0 + assert gateway.gateway_computes[0].ip_address == "2.2.2.2" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> PROVISIONING" + + async def test_marks_gateway_as_failed_if_gateway_creation_errors( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.side_effect = BackendError("Some error") + await worker.process(_gateway_to_pipeline_item(gateway)) + m.assert_called_once() + aws.compute.return_value.create_gateway.assert_called_once() + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Some error" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> FAILED (Some error)" + + async def test_submitted_creates_multiple_computes_for_multi_replica( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + 
project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + config = GatewayConfiguration( + name=gateway.name, + backend=BackendType.AWS, + region=gateway.region, + replicas=2, + ) + gateway.configuration = config.json() + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.side_effect = [ + GatewayProvisioningData(instance_id="i-aaa", ip_address="2.2.2.2", region="us"), + GatewayProvisioningData(instance_id="i-bbb", ip_address="3.3.3.3", region="us"), + ] + await worker.process(_gateway_to_pipeline_item(gateway)) + assert aws.compute.return_value.create_gateway.call_count == 2 + + await session.refresh(gateway) + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id == gateway.id) + .options(selectinload(GatewayModel.gateway_computes)) + ) + gateway = res.unique().scalar_one() + assert gateway.status == GatewayStatus.PROVISIONING + computes = sorted(gateway.gateway_computes, key=lambda c: c.replica_num) + assert len(computes) == 2 + assert computes[0].ip_address == "2.2.2.2" + assert computes[0].replica_num == 0 + assert computes[1].ip_address == "3.3.3.3" + assert computes[1].replica_num == 1 + + async def test_marks_gateway_as_failed_if_second_replica_creation_errors( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + config = GatewayConfiguration( + name=gateway.name, + 
backend=BackendType.AWS, + region=gateway.region, + replicas=2, + ) + gateway.configuration = config.json() + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.side_effect = [ + GatewayProvisioningData(instance_id="i-aaa", ip_address="2.2.2.2", region="us"), + BackendError("Some error"), + ] + await worker.process(_gateway_to_pipeline_item(gateway)) + assert aws.compute.return_value.create_gateway.call_count == 2 + + await session.refresh(gateway) + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id == gateway.id) + .options(selectinload(GatewayModel.gateway_computes)) + ) + gateway = res.unique().scalar_one() + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Some error" + # The first replica's compute is saved even though the second failed + assert len(gateway.gateway_computes) == 1 + assert gateway.gateway_computes[0].ip_address == "2.2.2.2" + assert gateway.gateway_computes[0].replica_num == 0 + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> FAILED (Some error)" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerProvisioning: + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_provisioning_to_running( + self, test_db, session: AsyncSession, worker: GatewayWorker, legacy_compute: bool + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + 
project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.PROVISIONING, + ) + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + await create_gateway_compute(session, gateway_id=gateway.id) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.gateways.gateway_connections_pool.get_or_add" + ) as pool_add: + pool_add.return_value = MagicMock() + pool_add.return_value.client.return_value = MagicMock(AsyncContextManager()) + await worker.process(_gateway_to_pipeline_item(gateway)) + pool_add.assert_called_once() + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.RUNNING + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed PROVISIONING -> RUNNING" + + async def test_provisioning_to_running_with_multiple_replicas( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.PROVISIONING, + ) + await create_gateway_compute(session, gateway_id=gateway.id, ip_address="1.1.1.1") + compute1 = await create_gateway_compute( + session, gateway_id=gateway.id, ip_address="2.2.2.2" + ) + compute1.replica_num = 1 + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.gateways.gateway_connections_pool.get_or_add" + ) as pool_add: + pool_add.return_value = MagicMock() + pool_add.return_value.client.return_value = 
MagicMock(AsyncContextManager()) + await worker.process(_gateway_to_pipeline_item(gateway)) + assert pool_add.call_count == 2 + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.RUNNING + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed PROVISIONING -> RUNNING" + + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_marks_gateway_as_failed_if_fails_to_connect( + self, test_db, session: AsyncSession, worker: GatewayWorker, legacy_compute: bool + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.PROVISIONING, + ) + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + gateway_compute = await create_gateway_compute(session, gateway_id=gateway.id) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.gateways.connect_to_gateway_with_retry" + ) as connect_to_gateway_with_retry_mock: + connect_to_gateway_with_retry_mock.return_value = None + await worker.process(_gateway_to_pipeline_item(gateway)) + connect_to_gateway_with_retry_mock.assert_called_once() + + await session.refresh(gateway) + await session.refresh(gateway_compute) + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Failed to connect to gateway" + assert gateway_compute.active is False + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Gateway status changed PROVISIONING -> FAILED (Failed to connect to gateway)" + ) + + async def 
test_marks_gateway_as_failed_if_any_replica_fails_to_connect( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.PROVISIONING, + ) + compute0 = await create_gateway_compute( + session, gateway_id=gateway.id, ip_address="1.1.1.1" + ) + compute1 = await create_gateway_compute( + session, gateway_id=gateway.id, ip_address="2.2.2.2" + ) + compute1.replica_num = 1 + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.gateways.connect_to_gateway_with_retry" + ) as connect_mock: + connect_mock.return_value = None + await worker.process(_gateway_to_pipeline_item(gateway)) + assert connect_mock.call_count == 2 + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Failed to connect to gateway" + + await session.refresh(compute0) + await session.refresh(compute1) + assert compute0.active is False + assert compute1.active is False + + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Gateway status changed PROVISIONING -> FAILED (Failed to connect to gateway)" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerDeleted: + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_deletes_gateway_and_marks_compute_deleted( + self, test_db, session: AsyncSession, worker: GatewayWorker, legacy_compute: bool + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + 
session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + ) + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + gateway_compute = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.to_be_deleted = True + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.terminate_gateway.assert_called_once() + remove_connection_mock.assert_called_once_with(gateway_compute.ip_address) + + await session.refresh(gateway_compute) + res = await session.execute(select(GatewayModel.id).where(GatewayModel.id == gateway.id)) + assert res.scalar_one_or_none() is None + assert gateway_compute.active is False + assert gateway_compute.deleted is True + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway deleted" + + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_keeps_gateway_if_terminate_fails( + self, test_db, session: AsyncSession, worker: GatewayWorker, legacy_compute: bool + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + 
project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + ) + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + gateway_compute = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.lock_owner = "GatewayPipeline" + gateway.to_be_deleted = True + original_last_processed_at = gateway.last_processed_at + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.terminate_gateway.side_effect = BackendError( + "Terminate failed" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.terminate_gateway.assert_called_once() + remove_connection_mock.assert_not_called() + + await session.refresh(gateway) + await session.refresh(gateway_compute) + assert gateway.to_be_deleted is True + assert gateway.last_processed_at > original_last_processed_at + assert gateway.lock_token is None + assert gateway.lock_expires_at is None + assert gateway.lock_owner is None + assert gateway_compute.active is True + assert gateway_compute.deleted is False + events = await list_events(session) + assert len(events) == 0 + + async def test_deletes_gateway_with_multiple_replicas( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await 
create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + ) + compute0 = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id, ip_address="1.1.1.1" + ) + compute1 = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id, ip_address="2.2.2.2" + ) + compute1.replica_num = 1 + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.to_be_deleted = True + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + assert backend_mock.compute.return_value.terminate_gateway.call_count == 2 + assert remove_connection_mock.call_count == 2 + + await session.refresh(compute0) + await session.refresh(compute1) + res = await session.execute(select(GatewayModel.id).where(GatewayModel.id == gateway.id)) + assert res.scalar_one_or_none() is None + assert compute0.active is False + assert compute0.deleted is True + assert compute1.active is False + assert compute1.deleted is True + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway deleted" + + async def test_keeps_gateway_if_second_replica_terminate_fails( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, 
project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + ) + compute0 = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id, ip_address="1.1.1.1" + ) + compute1 = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id, ip_address="2.2.2.2" + ) + compute1.replica_num = 1 + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.lock_owner = "GatewayPipeline" + gateway.to_be_deleted = True + original_last_processed_at = gateway.last_processed_at + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.terminate_gateway.side_effect = [ + None, + BackendError("Terminate failed"), + ] + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + assert backend_mock.compute.return_value.terminate_gateway.call_count == 2 + remove_connection_mock.assert_called_once_with(compute0.ip_address) + + await session.refresh(gateway) + await session.refresh(compute0) + await session.refresh(compute1) + assert gateway.to_be_deleted is True + assert gateway.last_processed_at > original_last_processed_at + assert gateway.lock_token is None + assert gateway.lock_expires_at is None + assert gateway.lock_owner is None + assert compute0.deleted is False + assert compute1.deleted is False diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/__init__.py 
b/src/tests/_internal/server/background/pipeline_tasks/test_instances/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py new file mode 100644 index 0000000000..bbb48134b2 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py @@ -0,0 +1,58 @@ +import asyncio +import datetime as dt +from unittest.mock import Mock + +import pytest + +from dstack._internal.core.backends.base.compute import GoArchType +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstanceFetcher, + InstanceWorker, +) +from dstack._internal.server.background.pipeline_tasks.instances import ( + ssh_deploy as instances_ssh_deploy, +) +from dstack._internal.server.schemas.instances import InstanceCheck + + +@pytest.fixture +def fetcher() -> InstanceFetcher: + return InstanceFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=dt.timedelta(seconds=10), + lock_timeout=dt.timedelta(seconds=30), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> InstanceWorker: + return InstanceWorker(queue=asyncio.Queue(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def host_info() -> dict: + return { + "gpu_vendor": "nvidia", + "gpu_name": "T4", + "gpu_memory": 16384, + "gpu_count": 1, + "addresses": ["192.168.100.100/24"], + "disk_size": 260976517120, + "cpus": 32, + "memory": 33544130560, + } + + +@pytest.fixture +def deploy_instance_mock(monkeypatch: pytest.MonkeyPatch, host_info: dict) -> Mock: + mock = Mock( + return_value=( + InstanceCheck(reachable=True), + host_info, + GoArchType.AMD64, + ) + ) + monkeypatch.setattr(instances_ssh_deploy, "_deploy_instance", mock) + return mock diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py 
b/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py new file mode 100644 index 0000000000..81eb0fde5c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py @@ -0,0 +1,40 @@ +import datetime as dt +import uuid + +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstancePipeline, + InstancePipelineItem, + InstanceWorker, +) +from dstack._internal.server.models import InstanceModel + +LOCK_EXPIRES_AT = dt.datetime(2025, 1, 2, 3, 4, tzinfo=dt.timezone.utc) + + +def instance_to_pipeline_item(instance_model: InstanceModel) -> InstancePipelineItem: + assert instance_model.lock_token is not None + assert instance_model.lock_expires_at is not None + return InstancePipelineItem( + __tablename__=instance_model.__tablename__, + id=instance_model.id, + lock_token=instance_model.lock_token, + lock_expires_at=instance_model.lock_expires_at, + prev_lock_expired=False, + status=instance_model.status, + ) + + +def lock_instance(instance_model: InstanceModel) -> None: + instance_model.lock_token = uuid.uuid4() + instance_model.lock_expires_at = LOCK_EXPIRES_AT + instance_model.lock_owner = InstancePipeline.__name__ + + +async def process_instance( + session: AsyncSession, worker: InstanceWorker, instance_model: InstanceModel +) -> None: + lock_instance(instance_model) + await session.commit() + await worker.process(instance_to_pipeline_item(instance_model)) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py new file mode 100644 index 0000000000..33e57df016 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py @@ -0,0 +1,944 @@ +import datetime as dt +import logging +from unittest.mock import Mock + +import pytest +import pytest_asyncio +from sqlalchemy import 
select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.fleets import FleetNodesSpec +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.profiles import TerminationPolicy +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import check as instances_check +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceModel +from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse, DCGMHealthResult +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import ( + ComponentInfo, + ComponentName, + ComponentStatus, + HealthcheckResponse, + InstanceHealthResponse, + TaskListResponse, +) +from dstack._internal.server.services.runner.client import ComponentList, ShimClient +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_fleet_configuration, + get_fleet_spec, + get_remote_connection_info, + list_events, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestCheckInstance: + async def test_check_shim_transitions_provisioning_on_ready( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + 
project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.termination_deadline = get_current_datetime() + dt.timedelta(days=1) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_deadline is None + + async def test_check_shim_transitions_provisioning_on_terminating( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.started_at = get_current_datetime() + dt.timedelta(minutes=-20) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="Shim problem")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_deadline is not None + + async def test_check_shim_transitions_provisioning_on_busy( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.termination_deadline = get_current_datetime().replace( + tzinfo=dt.timezone.utc + ) + dt.timedelta(days=1) + job = await create_job( + session=session, + run=run, + 
status=JobStatus.SUBMITTED, + instance=instance, + ) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + await session.refresh(job) + + assert instance.status == InstanceStatus.BUSY + assert instance.termination_deadline is None + assert job.instance == instance + + async def test_check_shim_start_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="SSH connection fail")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.termination_deadline is not None + assert instance.termination_deadline.replace( + tzinfo=dt.timezone.utc + ) > get_current_datetime() + dt.timedelta(minutes=19) + + async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + remote_connection_info=get_remote_connection_info(), + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="SSH connection fail")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + 
assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.termination_deadline is None + + async def test_check_shim_stop_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + instance.termination_deadline = get_current_datetime() + dt.timedelta(minutes=19) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_deadline is None + + async def test_check_shim_terminate_instance_by_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + termination_deadline_time = get_current_datetime() + dt.timedelta(minutes=-19) + instance.termination_deadline = termination_deadline_time + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="Not ok")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_deadline == termination_deadline_time + assert instance.termination_reason == InstanceTerminationReason.UNREACHABLE + + @pytest.mark.parametrize( + ["termination_policy", "has_job"], + [ + pytest.param(TerminationPolicy.DESTROY_AFTER_IDLE, False, id="destroy-no-job"), + 
pytest.param(TerminationPolicy.DESTROY_AFTER_IDLE, True, id="destroy-with-job"), + pytest.param(TerminationPolicy.DONT_DESTROY, False, id="dont-destroy-no-job"), + pytest.param(TerminationPolicy.DONT_DESTROY, True, id="dont-destroy-with-job"), + ], + ) + async def test_check_shim_process_unreachable_state( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + termination_policy: TerminationPolicy, + has_job: bool, + ): + project = await create_project(session=session) + if has_job: + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + ) + else: + job = None + instance = await create_instance( + session=session, + project=project, + created_at=get_current_datetime(), + termination_policy=termination_policy, + status=InstanceStatus.IDLE, + unreachable=True, + job=job, + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is False + assert len(events) == 1 + assert events[0].message == "Instance became reachable" + + @pytest.mark.parametrize("health_status", [HealthStatus.HEALTHY, HealthStatus.FAILURE]) + async def test_check_shim_switch_to_unreachable_state( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + health_status: HealthStatus, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + health_status=health_status, + ) + + 
monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.health == health_status + assert len(events) == 1 + assert events[0].message == "Instance became unreachable" + + async def test_check_shim_check_instance_health( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + health_status=HealthStatus.HEALTHY, + ) + health_response = InstanceHealthResponse( + dcgm=DCGMHealthResponse( + overall_health=DCGMHealthResult.DCGM_HEALTH_RESULT_WARN, + incidents=[], + ) + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock( + return_value=InstanceCheck( + reachable=True, + health_response=health_response, + ) + ), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is False + assert instance.health == HealthStatus.WARNING + assert len(events) == 1 + assert events[0].message == "Instance health changed HEALTHY -> WARNING" + + res = await session.execute(select(InstanceHealthCheckModel)) + health_check = res.scalars().one() + assert health_check.status == HealthStatus.WARNING + assert health_check.response == health_response.json() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestProcessIdleTimeout: + async def test_does_not_terminate_by_idle_timeout_when_fleet_at_min_nodes( + self, + test_db, + session: 
AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + get_fleet_configuration(nodes=FleetNodesSpec(min=1, target=1, max=1)) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_reason is None + + async def test_terminates_by_idle_timeout_when_fleet_above_min_nodes( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + get_fleet_configuration(nodes=FleetNodesSpec(min=1, target=2, max=2)) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_reason == 
InstanceTerminationReason.IDLE_TIMEOUT + + async def test_terminate_by_idle_timeout( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class BaseTestMaybeInstallComponents: + EXPECTED_VERSION = "0.20.1" + + @pytest_asyncio.fixture + async def instance(self, session: AsyncSession) -> InstanceModel: + project = await create_project(session=session) + return await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + + @pytest.fixture + def component_list(self) -> ComponentList: + return ComponentList() + + @pytest.fixture + def debug_task_log(self, caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: + caplog.set_level(level=logging.DEBUG, logger=instances_check.__name__) + return caplog + + @pytest.fixture + def shim_client_mock( + self, + monkeypatch: pytest.MonkeyPatch, + component_list: ComponentList, + ) -> Mock: + mock = Mock(spec_set=ShimClient) + mock.healthcheck.return_value = HealthcheckResponse( + service="dstack-shim", + version=self.EXPECTED_VERSION, + ) + mock.get_instance_health.return_value = InstanceHealthResponse() + mock.get_components.return_value = component_list + mock.list_tasks.return_value = TaskListResponse(tasks=[]) + mock.is_safe_to_restart.return_value = False + 
monkeypatch.setattr( + "dstack._internal.server.services.runner.client.ShimClient.from_address", + Mock(return_value=mock), + ) + return mock + + +@pytest.mark.usefixtures("get_dstack_runner_version_mock") +class TestMaybeInstallRunner(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def get_dstack_runner_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=self.EXPECTED_VERSION) + monkeypatch.setattr(instances_check, "get_dstack_runner_version", mock) + return mock + + @pytest.fixture + def get_dstack_runner_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value="https://example.com/runner") + monkeypatch.setattr(instances_check, "get_dstack_runner_download_url", mock) + return mock + + async def test_cannot_determine_expected_version( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_runner_version_mock: Mock, + ): + get_dstack_runner_version_mock.return_value = None + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + async def test_expected_version_already_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.runner.version = self.EXPECTED_VERSION + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "expected runner version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + 
shim_client_mock.install_runner.assert_not_called() + + @pytest.mark.parametrize("status", [ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_runner_download_url_mock: Mock, + status: ComponentStatus, + ): + shim_client_mock.get_components.return_value.runner.version = "" + shim_client_mock.get_components.return_value.runner.status = status + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert f"installing runner (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_called_once_with( + get_dstack_runner_download_url_mock.return_value + ) + + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_runner_download_url_mock: Mock, + installed_version: str, + ): + shim_client_mock.get_components.return_value.runner.version = installed_version + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert ( + f"installing runner {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_called_once_with( + get_dstack_runner_download_url_mock.return_value + ) + + async def test_already_installing( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + 
shim_client_mock.get_components.return_value.runner.version = "dev" + shim_client_mock.get_components.return_value.runner.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "runner is already being installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + +@pytest.mark.usefixtures("get_dstack_shim_version_mock") +class TestMaybeInstallShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def get_dstack_shim_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=self.EXPECTED_VERSION) + monkeypatch.setattr(instances_check, "get_dstack_shim_version", mock) + return mock + + @pytest.fixture + def get_dstack_shim_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value="https://example.com/shim") + monkeypatch.setattr(instances_check, "get_dstack_shim_download_url", mock) + return mock + + async def test_cannot_determine_expected_version( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_version_mock: Mock, + ): + get_dstack_shim_version_mock.return_value = None + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + async def test_expected_version_already_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.shim.version = 
self.EXPECTED_VERSION + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "expected shim version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + @pytest.mark.parametrize("status", [ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + status: ComponentStatus, + ): + shim_client_mock.get_components.return_value.shim.version = "" + shim_client_mock.get_components.return_value.shim.status = status + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert f"installing shim (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + get_dstack_shim_download_url_mock.return_value + ) + + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + installed_version: str, + ): + shim_client_mock.get_components.return_value.shim.version = installed_version + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert ( + f"installing shim {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + 
get_dstack_shim_download_url_mock.return_value + ) + + async def test_already_installing( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.shim.version = "dev" + shim_client_mock.get_components.return_value.shim.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "shim is already being installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + +@pytest.mark.usefixtures("maybe_install_runner_mock", "maybe_install_shim_mock") +class TestMaybeRestartShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def maybe_install_runner_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr(instances_check, "_maybe_install_runner", mock) + return mock + + @pytest.fixture + def maybe_install_shim_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr(instances_check, "_maybe_install_shim", mock) + return mock + + async def test_up_to_date(self, test_db, instance: InstanceModel, shim_client_mock: Mock): + shim_client_mock.get_version_string.return_value = self.EXPECTED_VERSION + shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + 
+ async def test_no_shim_component_info( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_components.return_value = ComponentList() + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_shutdown_requested( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_called_once_with(force=False) + + async def test_outdated_but_task_wont_survive_restart( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = False + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_in_progress( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + component_list: ComponentList, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + runner_info = component_list.runner + assert runner_info is not None + runner_info.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def 
test_outdated_but_shim_installation_in_progress( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + component_list: ComponentList, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + shim_info = component_list.shim + assert shim_info is not None + shim_info.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_requested( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + maybe_install_runner_mock: Mock, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_runner_mock.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_shim_installation_requested( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + maybe_install_shim_mock: Mock, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_shim_mock.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py new file mode 100644 index 0000000000..c559931ac8 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py @@ -0,0 +1,931 
@@ +from typing import Optional +from unittest.mock import Mock, patch + +import gpuhunt +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import NoCapacityError, ProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import FleetNodesSpec, InstanceGroupPlacement +from dstack._internal.core.models.instances import ( + Gpu, + InstanceAvailability, + InstanceOffer, + InstanceOfferWithAvailability, + InstanceStatus, + InstanceTerminationReason, + InstanceType, + Resources, +) +from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.models import FleetModel, InstanceModel, PlacementGroupModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_fleet, + create_instance, + create_placement_group, + create_project, + get_fleet_configuration, + get_fleet_spec, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_placement_group_provisioning_data, +) +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +async def _set_current_master_instance( + session: AsyncSession, fleet: FleetModel, instance: InstanceModel +) -> None: + fleet.current_master_instance_id = None if instance is None else instance.id + await session.commit() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestCloudProvisioning: + @pytest.mark.parametrize( + ["cpus", "gpus", "requested_blocks", "expected_blocks"], + [ + pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"), + pytest.param(32, 8, 2, 2, 
id="gpu-instance-four-gpu-per-block"), + pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"), + pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"), + pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"), + pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"), + pytest.param(32, 0, 1, 1, id="cpu-instance-no-blocks"), + pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"), + pytest.param(32, 0, 4, 4, id="cpu-instance-two-cpus-per-block"), + pytest.param(32, 0, None, 32, id="cpu-instance-auto-max-cpu"), + ], + ) + async def test_creates_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + cpus: int, + gpus: int, + requested_blocks: Optional[int], + expected_blocks: int, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + total_blocks=requested_blocks, + busy_blocks=0, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + gpu = Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + offer = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources( + cpus=cpus, + memory_mib=131072, + spot=False, + gpus=[gpu] * gpus, + ), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + total_blocks=expected_blocks, + ) + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [offer] + backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData( + backend=offer.backend, + instance_type=offer.instance, + instance_id="instance_id", + hostname="1.1.1.1", + internal_ip=None, + region=offer.region, + price=offer.price, + username="ubuntu", + ssh_port=22, + 
ssh_proxy=None, + dockerized=True, + backend_data=None, + ) + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + assert instance.total_blocks == expected_blocks + assert instance.busy_blocks == 0 + + @pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")]) + async def test_tries_second_offer_if_first_fails( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0) + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [offer] + aws_mock.compute.return_value.create_instance.side_effect = err + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0) + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [offer] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=offer.backend, + region=offer.region, + price=offer.price, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [aws_mock, gcp_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + aws_mock.compute.return_value.create_instance.assert_called_once() + assert instance.backend == BackendType.GCP + + @pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")]) + async def test_fails_if_all_offers_fail( + self, + test_db, + 
session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0) + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [offer] + aws_mock.compute.return_value.create_instance.side_effect = err + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [aws_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS + + async def test_fails_if_no_offers( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS + + async def test_waits_when_fleet_has_no_current_master( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + 
status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PENDING + assert backend_mock.compute.return_value.create_instance.call_count == 0 + + async def test_waits_for_current_master_to_determine_cluster_placement( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(master_instance) + await session.refresh(sibling_instance) + assert master_instance.status == InstanceStatus.PENDING + assert sibling_instance.status == InstanceStatus.PENDING + assert 
backend_mock.compute.return_value.create_instance.call_count == 0 + + async def test_failed_master_does_not_provision_stale_sibling_until_fleet_reassigns_it( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + lock_instance(master_instance) + lock_instance(sibling_instance) + await session.commit() + master_item = instance_to_pipeline_item(master_instance) + sibling_item = instance_to_pipeline_item(sibling_instance) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [] + await worker.process(master_item) + + await session.refresh(master_instance) + await session.refresh(sibling_instance) + assert master_instance.status == InstanceStatus.TERMINATED + assert master_instance.termination_reason == InstanceTerminationReason.NO_OFFERS + assert sibling_instance.status == InstanceStatus.PENDING + + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.GCP, region="us-central1") + ] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.GCP, + 
region="us-central1", + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + aws_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + aws_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [gcp_mock, aws_mock] + await worker.process(sibling_item) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PENDING + assert gcp_mock.compute.return_value.get_offers.call_count == 0 + assert gcp_mock.compute.return_value.create_instance.call_count == 0 + assert aws_mock.compute.return_value.create_instance.call_count == 0 + + await _set_current_master_instance(session, fleet, sibling_instance) + promoted_backend_mock = Mock() + promoted_backend_mock.TYPE = BackendType.AWS + promoted_backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + promoted_backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + promoted_backend_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + promoted_backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [promoted_backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == 
InstanceStatus.PROVISIONING + assert sibling_instance.backend == BackendType.AWS + assert sibling_instance.region == "us-east-1" + assert promoted_backend_mock.compute.return_value.create_instance.call_count == 1 + + async def test_follows_current_master_backend_and_region_constraints( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.GCP, region="us-central1") + ] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.GCP, + region="us-central1", + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + aws_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with 
patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [gcp_mock, aws_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert sibling_instance.backend == BackendType.AWS + assert sibling_instance.region == "us-east-1" + assert gcp_mock.compute.return_value.get_offers.call_count == 0 + assert gcp_mock.compute.return_value.create_instance.call_count == 0 + assert aws_mock.compute.return_value.create_instance.call_count == 1 + + async def test_non_master_does_not_create_new_placement_group_without_master_pg( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.is_suitable_placement_group.return_value = True + backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + 
backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 0 + + async def test_non_master_reuses_existing_current_master_placement_group( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=3, target=3, max=3), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + current_master_pg = await create_placement_group( + session=session, + project=project, + fleet=fleet, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.is_suitable_placement_group.return_value = True + 
backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + create_call = backend_mock.compute.return_value.create_instance.call_args + assert create_call is not None + assert create_call.args[2] is not None + assert create_call.args[2].name == current_master_pg.name + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 1 + + async def test_allows_parallel_processing_after_master_is_provisioned( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=3, target=3, max=3), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + later_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=2, + ) + earlier_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = 
Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, later_instance) + assert backend_mock.compute.return_value.create_instance.call_count == 1 + await process_instance(session, worker, earlier_instance) + + await session.refresh(later_instance) + await session.refresh(earlier_instance) + assert later_instance.status == InstanceStatus.PROVISIONING + assert earlier_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_instance.call_count == 2 + + @pytest.mark.parametrize( + ("placement", "should_create"), + [ + pytest.param(InstanceGroupPlacement.CLUSTER, True, id="placement-cluster"), + pytest.param(None, False, id="no-placement"), + ], + ) + async def test_create_placement_group_if_placement_cluster( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + placement: Optional[InstanceGroupPlacement], + should_create: bool, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=placement, nodes=FleetNodesSpec(min=1, target=1, max=1) + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + if placement == InstanceGroupPlacement.CLUSTER: + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = 
BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability() + ] + backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data() + ) + backend_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + if should_create: + assert backend_mock.compute.return_value.create_placement_group.call_count == 1 + assert len(placement_groups) == 1 + else: + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + assert len(placement_groups) == 0 + + @pytest.mark.parametrize("can_reuse", [True, False]) + async def test_reuses_placement_group_between_offers_if_the_group_is_suitable( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + can_reuse: bool, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=1), + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + 
get_instance_offer_with_availability(instance_type="bad-offer-1"), + get_instance_offer_with_availability(instance_type="bad-offer-2"), + get_instance_offer_with_availability(instance_type="good-offer"), + ] + + def create_instance_method( + instance_offer: InstanceOfferWithAvailability, *args, **kwargs + ) -> JobProvisioningData: + if instance_offer.instance.name == "good-offer": + return get_job_provisioning_data() + raise NoCapacityError() + + backend_mock.compute.return_value.create_instance = create_instance_method + backend_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + backend_mock.compute.return_value.is_suitable_placement_group.return_value = can_reuse + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + if can_reuse: + assert backend_mock.compute.return_value.create_placement_group.call_count == 1 + assert len(placement_groups) == 1 + else: + assert backend_mock.compute.return_value.create_placement_group.call_count == 3 + assert len(placement_groups) == 3 + to_be_deleted_count = sum(pg.fleet_deleted for pg in placement_groups) + assert to_be_deleted_count == 2 + + async def test_master_reuses_existing_placement_group( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ) -> None: + # Regression test for https://github.com/dstackai/dstack/issues/3904 + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=1), + ) + ), + ) + master_instance = await 
create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + await _set_current_master_instance(session, fleet, master_instance) + preexisting_pg = await create_placement_group( + session=session, + project=project, + fleet=fleet, + ) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability() + ] + backend_mock.compute.return_value.is_suitable_placement_group.return_value = True + backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data() + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, master_instance) + + await session.refresh(master_instance) + assert master_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + create_call = backend_mock.compute.return_value.create_instance.call_args + assert create_call is not None + assert create_call.args[2] is not None + assert create_call.args[2].name == preexisting_pg.name + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 1 + + @pytest.mark.parametrize("err", [NoCapacityError(), RuntimeError()]) + async def test_handles_create_placement_group_errors( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=1), + ) + ), + ) + instance = await create_instance( + session=session, + 
project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(instance_type="bad-offer"), + get_instance_offer_with_availability(instance_type="good-offer"), + ] + backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data() + ) + + def create_placement_group_method( + placement_group: PlacementGroup, master_instance_offer: InstanceOffer + ) -> PlacementGroupProvisioningData: + if master_instance_offer.instance.name == "good-offer": + return get_placement_group_provisioning_data() + raise err + + backend_mock.compute.return_value.create_placement_group = create_placement_group_method + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + assert instance.offer + assert "good-offer" in instance.offer + assert "bad-offer" not in instance.offer + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 1 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py new file mode 100644 index 0000000000..1c0ef2bca8 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py @@ -0,0 +1,329 @@ +import datetime as dt +import uuid +from unittest.mock import Mock + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from 
dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.fleets import FleetPipeline +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstanceFetcher, + InstancePipeline, + InstanceWorker, +) +from dstack._internal.server.background.pipeline_tasks.instances import check as instances_check +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.testing.common import ( + create_compute_group, + create_fleet, + create_instance, + create_project, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestInstanceFetcher: + async def test_fetch_selects_eligible_instances_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: InstanceFetcher + ): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group(session=session, project=project, fleet=fleet) + now = get_current_datetime() + stale = now - dt.timedelta(minutes=1) + + pending = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + last_processed_at=stale - dt.timedelta(seconds=5), + ) + provisioning = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + name="provisioning", + last_processed_at=stale - dt.timedelta(seconds=4), + ) + busy = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + name="busy", + last_processed_at=stale - dt.timedelta(seconds=3), + ) + idle = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="idle", + 
last_processed_at=stale - dt.timedelta(seconds=2), + ) + terminating = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + name="terminating", + last_processed_at=stale - dt.timedelta(seconds=1), + ) + + deleted = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="deleted", + last_processed_at=stale, + ) + deleted.deleted = True + + recent = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="recent", + last_processed_at=now, + ) + + terminating_compute_group = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + name="terminating-compute-group", + last_processed_at=stale + dt.timedelta(seconds=1), + ) + terminating_compute_group.compute_group = compute_group + + locked = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="locked", + last_processed_at=stale + dt.timedelta(seconds=2), + ) + locked.lock_expires_at = now + dt.timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + + # Placeholder instance managed by JobSubmittedPipeline — should be skipped + placeholder = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + name="placeholder", + last_processed_at=stale + dt.timedelta(seconds=3), + provisioning_job_id=uuid.uuid4(), + offer=None, + job_provisioning_data=None, + ) + + # Promoted placeholder (PROVISIONING + provisioning_job_id) — should be fetched + promoted = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + name="promoted", + last_processed_at=stale + dt.timedelta(seconds=4), + provisioning_job_id=uuid.uuid4(), + ) + + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + pending.id, + provisioning.id, + busy.id, + idle.id, + 
terminating.id, + promoted.id, + } + assert placeholder.id not in {item.id for item in items} + + for instance in [ + pending, + provisioning, + busy, + idle, + terminating, + deleted, + recent, + terminating_compute_group, + locked, + ]: + await session.refresh(instance) + + expected_lock_owner = InstancePipeline.__name__ + fetched_instances = [pending, provisioning, busy, idle, terminating] + assert all(instance.lock_owner == expected_lock_owner for instance in fetched_instances) + assert all(instance.lock_expires_at is not None for instance in fetched_instances) + assert all(instance.lock_token is not None for instance in fetched_instances) + assert len({instance.lock_token for instance in fetched_instances}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert terminating_compute_group.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_respects_order_and_limit( + self, test_db, session: AsyncSession, fetcher: InstanceFetcher + ): + project = await create_project(session=session) + now = get_current_datetime() + + oldest = await create_instance( + session=session, + project=project, + name="oldest", + last_processed_at=now - dt.timedelta(minutes=3), + ) + middle = await create_instance( + session=session, + project=project, + name="middle", + last_processed_at=now - dt.timedelta(minutes=2), + ) + newest = await create_instance( + session=session, + project=project, + name="newest", + last_processed_at=now - dt.timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == InstancePipeline.__name__ + assert middle.lock_owner == InstancePipeline.__name__ + assert newest.lock_owner is None + + async def test_fetch_allows_stale_instance_locks_if_fleet_is_waiting_for_instance_locks( + self, test_db, 
session: AsyncSession, fetcher: InstanceFetcher + ): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + stale = get_current_datetime() - dt.timedelta(minutes=1) + + fleet.lock_owner = FleetPipeline.__name__ + fleet.lock_token = None + fleet.lock_expires_at = None + + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="stale-locked", + last_processed_at=stale - dt.timedelta(seconds=1), + ) + lock_instance(instance) + instance.lock_expires_at = stale + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [instance.id] + + await session.refresh(instance) + assert instance.lock_owner == InstancePipeline.__name__ + + async def test_fetch_excludes_fresh_instances_when_fleet_is_waiting_for_instance_locks( + self, test_db, session: AsyncSession, fetcher: InstanceFetcher + ): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + stale = get_current_datetime() - dt.timedelta(minutes=1) + + fleet.lock_owner = FleetPipeline.__name__ + fleet.lock_token = None + fleet.lock_expires_at = None + + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="fresh-unlocked", + last_processed_at=stale - dt.timedelta(seconds=1), + ) + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert items == [] + + await session.refresh(instance) + assert instance.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestInstanceWorker: + async def test_process_skips_when_lock_token_changes( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + 
status=InstanceStatus.IDLE, + ) + + lock_instance(instance) + await session.commit() + item = instance_to_pipeline_item(instance) + new_lock_token = uuid.uuid4() + instance.lock_token = new_lock_token + await session.commit() + + await worker.process(item) + await session.refresh(instance) + + assert instance.lock_token == new_lock_token + assert instance.lock_owner == InstancePipeline.__name__ + + async def test_process_unlocks_and_updates_last_processed_at_after_check( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + before_processed_at = instance.last_processed_at + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.lock_expires_at is None + assert instance.lock_token is None + assert instance.lock_owner is None + assert instance.last_processed_at > before_processed_at diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py new file mode 100644 index 0000000000..c103458ed4 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py @@ -0,0 +1,248 @@ +import datetime as dt +from typing import Optional +from unittest.mock import Mock + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import SSHProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from 
dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import ( + ssh_deploy as instances_ssh_deploy, +) +from dstack._internal.server.testing.common import ( + create_instance, + create_project, + get_job_provisioning_data, + get_remote_connection_info, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestSSHDeploy: + async def test_pending_ssh_instance_terminates_on_provision_timeout( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime() - dt.timedelta(days=100), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.PROVISIONING_TIMEOUT + + @pytest.mark.parametrize( + ["cpus", "gpus", "requested_blocks", "expected_blocks"], + [ + pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"), + pytest.param(32, 8, 2, 2, id="gpu-instance-four-gpu-per-block"), + pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"), + pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"), + pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"), + pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"), + pytest.param(32, 0, 1, 1, id="cpu-instance-no-blocks"), + pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"), + pytest.param(32, 0, 4, 4, 
id="cpu-instance-two-cpus-per-block"), + pytest.param(32, 0, None, 32, id="cpu-instance-auto-max-cpu"), + ], + ) + async def test_adds_ssh_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + cpus: int, + gpus: int, + requested_blocks: Optional[int], + expected_blocks: int, + ): + host_info["cpus"] = cpus + host_info["gpu_count"] = gpus + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + total_blocks=requested_blocks, + busy_blocks=0, + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.IDLE + assert instance.total_blocks == expected_blocks + assert instance.busy_blocks == 0 + deploy_instance_mock.assert_called_once() + + async def test_retries_ssh_instance_if_provisioning_fails( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + deploy_instance_mock: Mock, + ): + deploy_instance_mock.side_effect = SSHProvisioningError("Expected") + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PENDING + assert instance.termination_reason is None + + async def test_terminates_ssh_instance_if_deploy_fails_unexpectedly( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + deploy_instance_mock: Mock, + ): + deploy_instance_mock.side_effect = RuntimeError("Unexpected") + project = await 
create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unexpected error when adding SSH instance" + + async def test_terminates_ssh_instance_if_key_is_invalid( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + monkeypatch.setattr( + instances_ssh_deploy, + "ssh_keys_to_pkeys", + Mock(side_effect=ValueError("Bad key")), + ) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unsupported private SSH key type" + + async def test_terminates_ssh_instance_if_internal_ip_cannot_be_resolved_from_network( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + ): + host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip=None, + ) + job_provisioning_data.instance_network = "10.0.0.0/24" + instance = await create_instance( + session=session, + 
project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Failed to locate internal IP address on the given network" + ) + + async def test_terminates_ssh_instance_if_internal_ip_is_not_in_host_interfaces( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + ): + host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip="10.0.0.20", + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Specified internal IP not found among instance interfaces" + ) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py new file mode 100644 index 0000000000..b9da58fc11 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py @@ -0,0 +1,219 @@ +import datetime as dt +from 
contextlib import contextmanager +from typing import Optional +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from freezegun import freeze_time +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import ( + termination as instances_termination, +) +from dstack._internal.server.testing.common import create_instance, create_project +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestTermination: + @staticmethod + @contextmanager + def mock_terminate_in_backend(error: Optional[Exception] = None): + backend = Mock() + backend.TYPE = BackendType.VERDA + terminate_instance = backend.compute.return_value.terminate_instance + if error is not None: + terminate_instance.side_effect = error + with patch.object( + instances_termination.backends_services, + "get_project_backend_by_type", + AsyncMock(return_value=backend), + ): + yield terminate_instance + + async def test_terminate( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + instance.last_job_processed_at = dt.datetime.now(dt.timezone.utc) + dt.timedelta( + minutes=-19 + ) + await session.commit() + + with self.mock_terminate_in_backend() as mock: + 
await process_instance(session, worker, instance) + mock.assert_called_once() + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT + assert instance.deleted is True + assert instance.deleted_at is not None + assert instance.finished_at is not None + + async def test_terminates_terminating_deleted_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + lock_instance(instance) + await session.commit() + item = instance_to_pipeline_item(instance) + instance.deleted = True + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + instance.last_job_processed_at = instance.deleted_at = dt.datetime.now( + dt.timezone.utc + ) + dt.timedelta(minutes=-19) + await session.commit() + + with self.mock_terminate_in_backend() as mock: + await worker.process(item) + mock.assert_called_once() + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATED + assert instance.deleted is True + assert instance.deleted_at is not None + assert instance.finished_at is not None + + @pytest.mark.parametrize( + "error", [BackendError("err"), RuntimeError("err"), NotYetTerminated("")] + ) + async def test_terminate_retry( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + error: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await 
session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=error) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + with ( + freeze_time(initial_time + dt.timedelta(minutes=2)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + + async def test_terminate_not_retries_if_too_early( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=BackendError("err")) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + instance.last_processed_at = initial_time + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1, seconds=11)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_not_called() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + async def test_terminate_on_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + 
project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=BackendError("err")) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + with ( + freeze_time(initial_time + dt.timedelta(minutes=15, seconds=55)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py new file mode 100644 index 0000000000..9b3f7fbef1 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py @@ -0,0 +1,242 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import PlacementGroupInUseError +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem +from dstack._internal.server.background.pipeline_tasks.placement_groups import ( + PlacementGroupFetcher, + PlacementGroupPipeline, + PlacementGroupWorker, +) +from dstack._internal.server.models import PlacementGroupModel +from dstack._internal.server.testing.common 
import ( + ComputeMockSpec, + create_fleet, + create_placement_group, + create_project, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> PlacementGroupWorker: + return PlacementGroupWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> PlacementGroupFetcher: + return PlacementGroupFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> PipelineItem: + assert placement_group.lock_token is not None + assert placement_group.lock_expires_at is not None + return PipelineItem( + __tablename__=placement_group.__tablename__, + id=placement_group.id, + lock_token=placement_group.lock_token, + lock_expires_at=placement_group.lock_expires_at, + prev_lock_expired=False, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestPlacementGroupFetcher: + async def test_fetch_selects_eligible_placement_groups_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: PlacementGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + eligible = await create_placement_group( + session=session, + project=project, + fleet=fleet, + fleet_deleted=True, + ) + eligible.last_processed_at = stale - timedelta(seconds=2) + + fleet_not_deleted = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="fleet-not-deleted", + fleet_deleted=False, + ) + fleet_not_deleted.last_processed_at = stale - timedelta(seconds=1) + + deleted = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="deleted", + fleet_deleted=True, + 
deleted=True, + ) + deleted.last_processed_at = stale + + recent = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="recent", + fleet_deleted=True, + ) + recent.last_processed_at = now + + locked = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="locked", + fleet_deleted=True, + ) + locked.last_processed_at = stale + timedelta(seconds=1) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [eligible.id] + + for placement_group in [eligible, fleet_not_deleted, deleted, recent, locked]: + await session.refresh(placement_group) + + assert eligible.lock_owner == PlacementGroupPipeline.__name__ + assert eligible.lock_expires_at is not None + assert eligible.lock_token is not None + + assert fleet_not_deleted.lock_owner is None + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_placement_groups_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: PlacementGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + + oldest = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="oldest", + fleet_deleted=True, + ) + oldest.last_processed_at = now - timedelta(minutes=3) + + middle = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="middle", + fleet_deleted=True, + ) + middle.last_processed_at = now - timedelta(minutes=2) + + newest = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="newest", + fleet_deleted=True, + ) + newest.last_processed_at = now - timedelta(minutes=1) + await 
session.commit() + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == PlacementGroupPipeline.__name__ + assert middle.lock_owner == PlacementGroupPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestPlacementGroupWorker: + async def test_deletes_placement_group( + self, test_db, session: AsyncSession, worker: PlacementGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test1-pg", + fleet_deleted=True, + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await worker.process(_placement_group_to_pipeline_item(placement_group)) + aws_mock.compute.return_value.delete_placement_group.assert_called_once() + await session.refresh(placement_group) + assert placement_group.deleted + + async def test_retries_placement_group_deletion_if_still_in_use( + self, test_db, session: AsyncSession, worker: PlacementGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test2-pg", + fleet_deleted=True, + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, 
tzinfo=timezone.utc) + placement_group.lock_owner = "PlacementGroupPipeline" + original_last_processed_at = placement_group.last_processed_at + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.delete_placement_group.side_effect = ( + PlacementGroupInUseError() + ) + await worker.process(_placement_group_to_pipeline_item(placement_group)) + aws_mock.compute.return_value.delete_placement_group.assert_called_once() + await session.refresh(placement_group) + assert not placement_group.deleted + assert placement_group.last_processed_at > original_last_processed_at + assert placement_group.lock_token is None + assert placement_group.lock_expires_at is None + assert placement_group.lock_owner is None diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_running_jobs.py b/src/tests/_internal/server/background/pipeline_tasks/test_running_jobs.py new file mode 100644 index 0000000000..c3070ae75a --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_running_jobs.py @@ -0,0 +1,2494 @@ +import asyncio +import uuid +from contextlib import asynccontextmanager +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Optional +from unittest.mock import ANY, AsyncMock, MagicMock, Mock, patch + +import pytest +from freezegun import freeze_time +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from dstack._internal import settings +from dstack._internal.core.errors import SSHError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import NetworkMode +from dstack._internal.core.models.configurations import ( + 
DevEnvironmentConfiguration, + ProbeConfig, + ServiceConfiguration, + TaskConfiguration, +) +from dstack._internal.core.models.gateways import GatewayStatus +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.profiles import StartupOrder, UtilizationPolicy +from dstack._internal.core.models.runs import ( + ClusterInfo, + ImagePullProgress, + JobRuntimeData, + JobStatus, + JobTerminationReason, + RunSpec, + RunStatus, +) +from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint, VolumeStatus +from dstack._internal.core.services.ssh.tunnel import SSHTunnel +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.jobs_running import ( + ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS, + JobRunningFetcher, + JobRunningPipeline, + JobRunningPipelineItem, + JobRunningWorker, + _fetch_run_model, + _prepare_startup_context, + _ProcessContext, + _ProcessResult, + _RunnerAvailability, + _SubmitJobToRunnerResult, +) +from dstack._internal.server.background.pipeline_tasks.runs import RunPipeline +from dstack._internal.server.models import JobModel, ProbeModel +from dstack._internal.server.schemas.runner import ( + HealthcheckResponse, + JobInfoResponse, + JobStateEvent, + PortMapping, + PullResponse, + TaskStatus, +) +from dstack._internal.server.services.runner.client import RunnerClient, ShimClient +from dstack._internal.server.services.runs.replicas import RouterEnvStatus +from dstack._internal.server.services.volumes import volume_model_to_volume +from dstack._internal.server.testing.common import ( + create_backend, + create_code, + create_export, + create_fleet, + create_gateway, + create_gateway_compute, + create_instance, + create_job, + create_job_metrics_point, + create_probe, + create_project, + create_repo, + create_run, + create_user, + create_volume, + get_job_provisioning_data, + get_job_runtime_data, + get_run_spec, + 
get_volume_configuration, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +@dataclass +class _ProbeSetup: + success_streak: int + ready_after: int + + +@pytest.fixture +def fetcher() -> JobRunningFetcher: + return JobRunningFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=10), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> JobRunningWorker: + return JobRunningWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def ssh_tunnel_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = MagicMock(spec_set=SSHTunnel) + monkeypatch.setattr("dstack._internal.server.services.runner.pool.SSHTunnel", mock) + return mock + + +@pytest.fixture +def shim_client_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(spec_set=ShimClient) + mock.healthcheck.return_value = HealthcheckResponse(service="dstack-shim", version="latest") + mock.get_task.return_value.image_pull_progress = None + monkeypatch.setattr( + "dstack._internal.server.services.runner.client.ShimClient.from_address", + Mock(return_value=mock), + ) + return mock + + +@pytest.fixture +def runner_client_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(spec_set=RunnerClient) + mock.healthcheck.return_value = HealthcheckResponse( + service="dstack-runner", version="0.0.1.dev2" + ) + monkeypatch.setattr( + "dstack._internal.server.services.runner.client.RunnerClient.from_address", + Mock(return_value=mock), + ) + return mock + + +def _lock_job_foreign(job_model) -> None: + job_model.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + job_model.lock_token = uuid.uuid4() + job_model.lock_owner = "OtherPipeline" + + +def _lock_job_expired_same_owner(job_model) -> None: + job_model.lock_expires_at = get_current_datetime() - timedelta(minutes=1) + 
job_model.lock_token = uuid.uuid4() + job_model.lock_owner = JobRunningPipeline.__name__ + + +def _lock_job(job_model) -> None: + job_model.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + job_model.lock_token = uuid.uuid4() + job_model.lock_owner = JobRunningPipeline.__name__ + + +def _job_to_pipeline_item(job_model) -> JobRunningPipelineItem: + assert job_model.lock_token is not None + assert job_model.lock_expires_at is not None + return JobRunningPipelineItem( + __tablename__=job_model.__tablename__, + id=job_model.id, + lock_token=job_model.lock_token, + lock_expires_at=job_model.lock_expires_at, + prev_lock_expired=False, + status=job_model.status, + replica_num=job_model.replica_num, + ) + + +async def _process_job( + session: AsyncSession, + worker: JobRunningWorker, + job_model, +) -> None: + _lock_job(job_model) + await session.commit() + await worker.process(_job_to_pipeline_item(job_model)) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestJobRunningFetcher: + async def test_fetch_selects_eligible_jobs_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: JobRunningFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + provisioning = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + last_processed_at=stale - timedelta(seconds=4), + ) + pulling = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + last_processed_at=stale - timedelta(seconds=3), + ) + running = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=2), + ) + expired_same_owner = await create_job( + 
session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + _lock_job_expired_same_owner(expired_same_owner) + + recent = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=now, + ) + foreign_locked = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=stale, + ) + _lock_job_foreign(foreign_locked) + finished = await create_job( + session=session, + run=run, + status=JobStatus.DONE, + last_processed_at=stale - timedelta(seconds=5), + ) + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [ + provisioning.id, + pulling.id, + running.id, + expired_same_owner.id, + ] + assert [item.status for item in items] == [ + JobStatus.PROVISIONING, + JobStatus.PULLING, + JobStatus.RUNNING, + JobStatus.RUNNING, + ] + + for job in [ + provisioning, + pulling, + running, + expired_same_owner, + recent, + foreign_locked, + finished, + ]: + await session.refresh(job) + + fetched_jobs = [provisioning, pulling, running, expired_same_owner] + assert all(job.lock_owner == JobRunningPipeline.__name__ for job in fetched_jobs) + assert all(job.lock_expires_at is not None for job in fetched_jobs) + assert all(job.lock_token is not None for job in fetched_jobs) + assert len({job.lock_token for job in fetched_jobs}) == 1 + + assert recent.lock_owner is None + assert foreign_locked.lock_owner == "OtherPipeline" + assert finished.lock_owner is None + + async def test_fetch_excludes_jobs_from_terminating_runs( + self, test_db, session: AsyncSession, fetcher: JobRunningFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + active_run = await create_run(session=session, project=project, repo=repo, user=user) + terminating_run = await create_run( + session=session, + project=project, + 
repo=repo, + user=user, + run_name="terminating-run", + status=RunStatus.TERMINATING, + ) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + active_job = await create_job( + session=session, + run=active_run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + terminating_run_job = await create_job( + session=session, + run=terminating_run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=2), + ) + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [active_job.id] + + await session.refresh(active_job) + await session.refresh(terminating_run_job) + + assert active_job.lock_owner == JobRunningPipeline.__name__ + assert terminating_run_job.lock_owner is None + + async def test_fetch_allows_stale_job_locks_even_if_run_is_waiting_for_job_locks( + self, test_db, session: AsyncSession, fetcher: JobRunningFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + stale = get_current_datetime() - timedelta(minutes=1) + + run.lock_owner = RunPipeline.__name__ + run.lock_token = None + run.lock_expires_at = None + + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + _lock_job_expired_same_owner(job) + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [job.id] + + await session.refresh(job) + assert job.lock_owner == JobRunningPipeline.__name__ + + async def test_fetch_excludes_jobs_when_run_is_waiting_for_related_job_locks( + self, test_db, session: AsyncSession, fetcher: JobRunningFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, 
project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + stale = get_current_datetime() - timedelta(minutes=1) + + run.lock_owner = RunPipeline.__name__ + run.lock_token = None + run.lock_expires_at = None + + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert items == [] + + await session.refresh(job) + assert job.lock_owner is None + + async def test_fetch_returns_oldest_jobs_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: JobRunningFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + now = get_current_datetime() + + oldest = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == JobRunningPipeline.__name__ + assert middle.lock_owner == JobRunningPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestJobRunningWorker: + async def test_process_skips_when_lock_token_changes( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + 
): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=False), + instance=instance, + instance_assigned=True, + ) + _lock_job(job) + await session.commit() + + item = _job_to_pipeline_item(job) + new_lock_token = uuid.uuid4() + job.lock_token = new_lock_token + await session.commit() + + await worker.process(item) + await session.refresh(job) + + assert job.lock_token == new_lock_token + assert job.status == JobStatus.PROVISIONING + assert job.lock_owner == JobRunningPipeline.__name__ + + async def test_leaves_provisioning_job_unchanged_if_runner_not_alive( + self, test_db, session: AsyncSession, worker: JobRunningWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=False), + instance=instance, + instance_assigned=True, + ) + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + patch( + 
"dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + ) as get_job_file_archives_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + ) as get_job_code_mock, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.healthcheck.return_value = None + await _process_job(session, worker, job) + ssh_tunnel_cls.assert_called_once() + runner_client_mock.healthcheck.assert_called_once() + get_job_file_archives_mock.assert_not_awaited() + get_job_code_mock.assert_not_awaited() + + await session.refresh(job) + assert job.status == JobStatus.PROVISIONING + assert job.lock_token is None + assert job.lock_owner is None + + @pytest.mark.parametrize( + ["has_repo_code", "runner_version", "upload_code_call_expected"], + [ + pytest.param(False, "0.20.17", False, id="without-repo-code-new-runner"), + pytest.param(True, "0.20.17", True, id="with-repo-code-new-runner"), + pytest.param(False, "0.20.16", True, id="without-repo-code-old-runner"), + pytest.param(True, "0.20.16", True, id="with-repo-code-old-runner"), + ], + ) + async def test_runs_provisioning_job( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + runner_version: str, + has_repo_code: bool, + upload_code_call_expected: bool, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + repo_code_hash: Optional[str] = None + if has_repo_code: + repo_code_hash = "blob_hash" + await create_code(session=session, repo=repo, blob_hash=repo_code_hash, blob=b"blob") + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + repo_code_hash=repo_code_hash, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_spec.run_name, + run_spec=run_spec, + ) + instance = await 
create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=False), + job_runtime_data=get_job_runtime_data(), + instance=instance, + instance_assigned=True, + ) + before_processed_at = job.last_processed_at + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch.object(RunnerClient, "_healthcheck") as healthcheck_mock, + patch.object(RunnerClient, "submit_job") as submit_job_mock, + patch.object(RunnerClient, "upload_code") as upload_code_mock, + patch.object(RunnerClient, "run_job") as run_job_mock, + ): + healthcheck_mock.return_value = HealthcheckResponse( + service="dstack-runner", version=runner_version + ) + run_job_mock.return_value = JobInfoResponse( + working_dir="/dstack/run", username="dstack" + ) + await _process_job(session, worker, job) + assert ssh_tunnel_cls.call_count == 2 + assert healthcheck_mock.call_count == 2 + submit_job_mock.assert_called_once() + if upload_code_call_expected: + upload_code_mock.assert_called_once() + else: + upload_code_mock.assert_not_called() + run_job_mock.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.RUNNING + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + assert job.last_processed_at > before_processed_at + job_runtime_data = JobRuntimeData.__response__.parse_raw(job.job_runtime_data) + assert job_runtime_data.working_dir == "/dstack/run" + assert job_runtime_data.username == "dstack" + + @pytest.mark.parametrize("sshproxy_enforced", [False, True]) + async def test_provisioning_shim( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + sshproxy_enforced: bool, + ): 
+ monkeypatch.setattr( + "dstack._internal.server.settings.SSHPROXY_ENFORCED", sshproxy_enforced + ) + project_ssh_pub_key = "__project_ssh_pub_key__" + project = await create_project(session=session, ssh_public_key=project_ssh_pub_key) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + run_spec=run_spec, + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job_provisioning_data = get_job_provisioning_data(dockerized=True) + + with patch( + "dstack._internal.server.services.jobs.configurators.base.get_default_python_verison" + ) as py_version: + py_version.return_value = "3.13" + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=job_provisioning_data, + instance=instance, + instance_assigned=True, + ) + + await _process_job(session, worker, job) + + ssh_tunnel_mock.assert_called_once() + shim_client_mock.healthcheck.assert_called_once() + shim_client_mock.submit_task.assert_called_once_with( + task_id=job.id, + name="test-run-0-0", + registry_username="", + registry_password="", + image_name=( + f"dstackai/base:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-" + f"base-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ), + container_user="root", + privileged=False, + gpu=None, + cpu=None, + memory=None, + shm_size=None, + network_mode=NetworkMode.HOST, + volumes=[], + volume_mounts=[], + instance_mounts=[], + gpu_devices=[], + host_ssh_user="" if sshproxy_enforced else "ubuntu", + host_ssh_keys=[] if sshproxy_enforced else ["user_ssh_key"], + container_ssh_keys=[project_ssh_pub_key] + if sshproxy_enforced + else [project_ssh_pub_key, "user_ssh_key"], + 
instance_id=job_provisioning_data.instance_id, + ) + await session.refresh(job) + assert job.status == JobStatus.PULLING + + @pytest.mark.parametrize("privileged", [False, True]) + async def test_provisioning_shim_with_volumes( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + privileged: bool, + ): + monkeypatch.setattr("dstack._internal.server.settings.SSHPROXY_ENFORCED", False) + project_ssh_pub_key = "__project_ssh_pub_key__" + project = await create_project(session=session, ssh_public_key=project_ssh_pub_key) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + configuration=get_volume_configuration( + name="my-vol", backend=BackendType.AWS, region="us-east-1" + ), + backend=BackendType.AWS, + region="us-east-1", + ) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + run_spec.configuration.privileged = privileged + run_spec.configuration.volumes = [ + VolumeMountPoint(name="my-vol", path="/volume"), + InstanceMountPoint(instance_path="/root/.cache", path="/cache"), + ] + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + run_spec=run_spec, + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job_provisioning_data = get_job_provisioning_data(dockerized=True) + + with patch( + "dstack._internal.server.services.jobs.configurators.base.get_default_python_verison" + ) as py_version: + py_version.return_value = "3.13" + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=job_provisioning_data, + instance=instance, + instance_assigned=True, + ) + + await 
_process_job(session, worker, job) + + ssh_tunnel_mock.assert_called_once() + shim_client_mock.healthcheck.assert_called_once() + shim_client_mock.submit_task.assert_called_once_with( + task_id=job.id, + name="test-run-0-0", + registry_username="", + registry_password="", + image_name=( + f"dstackai/base:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-" + f"base-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ), + container_user="root", + privileged=privileged, + gpu=None, + cpu=None, + memory=None, + shm_size=None, + network_mode=NetworkMode.HOST, + volumes=[volume_model_to_volume(volume)], + volume_mounts=[VolumeMountPoint(name="my-vol", path="/volume")], + instance_mounts=[InstanceMountPoint(instance_path="/root/.cache", path="/cache")], + gpu_devices=[], + host_ssh_user="ubuntu", + host_ssh_keys=["user_ssh_key"], + container_ssh_keys=[project_ssh_pub_key, "user_ssh_key"], + instance_id=job_provisioning_data.instance_id, + ) + await session.refresh(job) + assert job.status == JobStatus.PULLING + + async def test_pulling_shim( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + repo_code_hash = "blob_hash" + await create_code(session=session, repo=repo, blob_hash=repo_code_hash, blob=b"blob") + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + repo_code_hash=repo_code_hash, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_spec.run_name, + run_spec=run_spec, + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + 
job_provisioning_data=get_job_provisioning_data(dockerized=True), + job_runtime_data=get_job_runtime_data(network_mode="bridge", ports=None), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + shim_client_mock.get_task.return_value.ports = [ + PortMapping(container=10022, host=32771), + PortMapping(container=10999, host=32772), + ] + runner_client_mock.run_job.return_value = JobInfoResponse( + working_dir="/dstack/run", username="dstack" + ) + + await _process_job(session, worker, job) + + assert ssh_tunnel_mock.call_count == 3 + shim_client_mock.get_task.assert_called_once() + assert runner_client_mock.healthcheck.call_count == 2 + runner_client_mock.submit_job.assert_called_once() + runner_client_mock.upload_code.assert_called_once() + runner_client_mock.run_job.assert_called_once() + await session.refresh(job) + assert job.status == JobStatus.RUNNING + job_runtime_data = JobRuntimeData.__response__.parse_raw(job.job_runtime_data) + assert job_runtime_data.ports == {10022: 32771, 10999: 32772} + assert job_runtime_data.working_dir == "/dstack/run" + assert job_runtime_data.username == "dstack" + + async def test_pulling_shim_port_mapping_not_ready( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + job_runtime_data=get_job_runtime_data(network_mode="bridge", 
ports=None), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + shim_client_mock.get_task.return_value.ports = None + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + ) as get_job_file_archives_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + ) as get_job_code_mock, + ): + await _process_job(session, worker, job) + ssh_tunnel_mock.assert_called_once() + shim_client_mock.get_task.assert_called_once() + runner_client_mock.healthcheck.assert_not_called() + runner_client_mock.submit_job.assert_not_called() + get_job_file_archives_mock.assert_not_awaited() + get_job_code_mock.assert_not_awaited() + + await session.refresh(job) + assert job.status == JobStatus.PULLING + + async def test_pulling_shim_waiting_resets_disconnect_and_emits_reachable_event( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + disconnected_at=get_current_datetime() - timedelta(minutes=1), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + job_runtime_data=get_job_runtime_data(network_mode="bridge", ports=None), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + shim_client_mock.get_task.return_value.ports = None + 
+ await _process_job(session, worker, job) + + ssh_tunnel_mock.assert_called_once() + shim_client_mock.get_task.assert_called_once() + runner_client_mock.healthcheck.assert_not_called() + await session.refresh(job) + events = await list_events(session) + assert job.status == JobStatus.PULLING + assert job.disconnected_at is None + assert len(events) == 1 + assert events[0].message == "Job became reachable" + + async def test_pulling_shim_runner_not_ready( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + job_runtime_data=get_job_runtime_data(network_mode="bridge", ports=None), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + shim_client_mock.get_task.return_value.ports = [ + PortMapping(container=10022, host=32771), + PortMapping(container=10999, host=32772), + ] + runner_client_mock.healthcheck.return_value = None + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + ) as get_job_file_archives_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + ) as get_job_code_mock, + ): + await _process_job(session, worker, job) + assert ssh_tunnel_mock.call_count == 2 + shim_client_mock.get_task.assert_called_once() + 
runner_client_mock.healthcheck.assert_called_once() + runner_client_mock.submit_job.assert_not_called() + get_job_file_archives_mock.assert_not_awaited() + get_job_code_mock.assert_not_awaited() + + await session.refresh(job) + assert job.status == JobStatus.PULLING + + async def test_pulling_shim_uses_runtime_port_mapping_for_runner_calls( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + job_runtime_data=get_job_runtime_data(network_mode="bridge", ports=None), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + shim_client_mock.get_task.return_value.ports = [ + PortMapping(container=10022, host=32771), + PortMapping(container=10999, host=32772), + ] + expected_ports = {10022: 32771, 10999: 32772} + + def assert_runner_availability(_, __, job_runtime_data): + assert job_runtime_data is not None + assert job_runtime_data.ports == expected_ports + return _RunnerAvailability.AVAILABLE + + def assert_submit_job_to_runner(_, __, job_runtime_data, **kwargs): + assert job_runtime_data is not None + assert job_runtime_data.ports == expected_ports + return _SubmitJobToRunnerResult(success=True) + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_runner_availability", + side_effect=assert_runner_availability, + ) as get_runner_availability_mock, + patch( + 
"dstack._internal.server.background.pipeline_tasks.jobs_running._submit_job_to_runner", + side_effect=assert_submit_job_to_runner, + ) as submit_job_to_runner_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + return_value=[], + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + return_value=b"", + ), + ): + await _process_job(session, worker, job) + ssh_tunnel_mock.assert_called_once() + get_runner_availability_mock.assert_called_once() + submit_job_to_runner_mock.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.PULLING + job_runtime_data = JobRuntimeData.__response__.parse_raw(job.job_runtime_data) + assert job_runtime_data.ports == expected_ports + + async def test_pulling_shim_failed( + self, test_db, session: AsyncSession, worker: JobRunningWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.IDLE + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + ) + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + ): + from dstack._internal.core.errors import SSHError + + ssh_tunnel_cls.side_effect = SSHError + await _process_job(session, worker, job) + assert ssh_tunnel_cls.call_count == 1 + + await session.refresh(job) + events = await list_events(session) + assert job.disconnected_at is not None + assert job.status == JobStatus.PULLING + assert len(events) == 1 + assert 
events[0].message == "Job became unreachable" + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + freeze_time(job.disconnected_at + timedelta(minutes=5)), + ): + from dstack._internal.core.errors import SSHError + + ssh_tunnel_cls.side_effect = SSHError + await _process_job(session, worker, job) + assert ssh_tunnel_cls.call_count == 1 + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.INSTANCE_UNREACHABLE + assert job.remove_at is None + + async def test_pulling_shim_stores_pull_progress( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + progress = ImagePullProgress( + downloaded_bytes=512, extracted_bytes=0, total_bytes=1024, is_total_bytes_final=True + ) + shim_client_mock.get_task.return_value.status = TaskStatus.PULLING + shim_client_mock.get_task.return_value.image_pull_progress = progress + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == JobStatus.PULLING + assert job.image_pull_progress == progress.json() + + async def test_provisioning_shim_force_stop_if_already_running_api_v1( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ): + project = await 
create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + run_spec.configuration.image = "debian" + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + run_spec=run_spec, + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + monkeypatch.setattr( + "dstack._internal.server.services.runner.pool.SSHTunnel", + Mock(return_value=MagicMock()), + ) + shim_client_mock = Mock() + monkeypatch.setattr( + "dstack._internal.server.services.runner.client.ShimClient.from_address", + Mock(return_value=shim_client_mock), + ) + shim_client_mock.healthcheck.return_value = HealthcheckResponse( + service="dstack-shim", version="0.0.1.dev2" + ) + shim_client_mock.is_api_v2_supported.return_value = False + shim_client_mock.submit.return_value = False + + await _process_job(session, worker, job) + + shim_client_mock.healthcheck.assert_called_once() + shim_client_mock.submit.assert_called_once() + shim_client_mock.stop.assert_called_once_with(force=True) + await session.refresh(job) + assert job.status == JobStatus.PROVISIONING + + async def test_master_job_waits_for_workers( + self, test_db, session: AsyncSession, worker: JobRunningWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + run_spec.configuration.startup_order = StartupOrder.WORKERS_FIRST + run = await create_run( + session=session, + 
project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + instance1 = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + instance2 = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job_provisioning_data = get_job_provisioning_data(dockerized=False) + master_job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=job_provisioning_data, + instance_assigned=True, + instance=instance1, + job_num=0, + last_processed_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + worker_job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=job_provisioning_data, + instance_assigned=True, + instance=instance2, + job_num=1, + last_processed_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + + await _process_job(session, worker, master_job) + await session.refresh(master_job) + assert master_job.status == JobStatus.PROVISIONING + + worker_job.status = JobStatus.RUNNING + master_job.last_processed_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel"), + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.healthcheck.return_value = HealthcheckResponse( + service="dstack-runner", version="0.0.1.dev2" + ) + await _process_job(session, worker, master_job) + + await session.refresh(master_job) + assert master_job.status == JobStatus.RUNNING + + async def test_apply_skips_when_lock_token_changes_after_processing( + self, test_db, session: AsyncSession, worker: JobRunningWorker + ): + project = await create_project(session=session) + user = await 
create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + submitted_at=get_current_datetime(), + job_provisioning_data=get_job_provisioning_data(dockerized=False), + job_runtime_data=get_job_runtime_data(), + instance=instance, + instance_assigned=True, + ) + _lock_job(job) + await session.commit() + original_lock_token = job.lock_token + replacement_lock_token = uuid.uuid4() + + async def invalidate_lock(*args, **kwargs): + job.lock_token = replacement_lock_token + await session.commit() + return b"" + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_runner_availability", + return_value=_RunnerAvailability.AVAILABLE, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + return_value=[], + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + side_effect=invalidate_lock, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._submit_job_to_runner", + return_value=_SubmitJobToRunnerResult(success=True), + ), + ): + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.PROVISIONING + assert job.lock_token == replacement_lock_token + assert job.lock_token != original_lock_token + + async def test_updates_running_job( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + tmp_path: Path, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await 
create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(dockerized=False), + instance=instance, + instance_assigned=True, + ) + last_processed_at = job.last_processed_at + + with ( + patch.object(server_settings, "SERVER_DIR_PATH", tmp_path), + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.pull.return_value = PullResponse( + job_states=[JobStateEvent(timestamp=1, state=JobStatus.RUNNING)], + job_logs=[], + runner_logs=[], + last_updated=1, + ) + await _process_job(session, worker, job) + ssh_tunnel_cls.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.RUNNING + assert job.runner_timestamp == 1 + + job.last_processed_at = last_processed_at + await session.commit() + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.pull.return_value = PullResponse( + job_states=[JobStateEvent(timestamp=1, state=JobStatus.DONE, exit_status=0)], + job_logs=[], + runner_logs=[], + last_updated=2, + ) + await _process_job(session, worker, job) + ssh_tunnel_cls.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.DONE_BY_RUNNER + assert job.exit_status == 0 + assert job.runner_timestamp == 2 + + async def 
test_running_job_disconnect_retries_then_terminates( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(dockerized=False), + instance=instance, + instance_assigned=True, + ) + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + ): + ssh_tunnel_cls.side_effect = SSHError + await _process_job(session, worker, job) + assert ssh_tunnel_cls.call_count == 1 + + await session.refresh(job) + events = await list_events(session) + assert job.status == JobStatus.RUNNING + assert job.disconnected_at is not None + assert len(events) == 1 + assert events[0].message == "Job became unreachable" + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + freeze_time(job.disconnected_at + timedelta(minutes=5)), + ): + ssh_tunnel_cls.side_effect = SSHError + await _process_job(session, worker, job) + assert ssh_tunnel_cls.call_count == 1 + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.INSTANCE_UNREACHABLE + + @pytest.mark.parametrize( + ( + "inactivity_duration", + "no_connections_secs", + "expected_status", + "expected_termination_reason", + "expected_inactivity_secs", + ), + [ + pytest.param( + "1h", + 60 * 60 - 1, + JobStatus.RUNNING, + None, + 60 * 60 - 1, + id="duration-not-exceeded", + ), + pytest.param( + "1h", + 60 * 60, + JobStatus.TERMINATING, + JobTerminationReason.INACTIVITY_DURATION_EXCEEDED, + 60 * 60, + 
id="duration-exceeded-exactly", + ), + pytest.param( + "1h", + 60 * 60 + 1, + JobStatus.TERMINATING, + JobTerminationReason.INACTIVITY_DURATION_EXCEEDED, + 60 * 60 + 1, + id="duration-exceeded", + ), + pytest.param("off", 60 * 60, JobStatus.RUNNING, None, None, id="duration-off"), + pytest.param(False, 60 * 60, JobStatus.RUNNING, None, None, id="duration-false"), + pytest.param(None, 60 * 60, JobStatus.RUNNING, None, None, id="duration-none"), + pytest.param( + "1h", + None, + JobStatus.TERMINATING, + JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + None, + id="legacy-runner", + ), + pytest.param( + None, + None, + JobStatus.RUNNING, + None, + None, + id="legacy-runner-without-duration", + ), + ], + ) + async def test_inactivity_duration( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + inactivity_duration, + no_connections_secs: Optional[int], + expected_status: JobStatus, + expected_termination_reason: Optional[JobTerminationReason], + expected_inactivity_secs: Optional[int], + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + run_name="test-run", + run_spec=get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=DevEnvironmentConfiguration( + name="test-run", + ide="vscode", + inactivity_duration=inactivity_duration, + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch( + 
"dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.pull.return_value = PullResponse( + job_states=[], + job_logs=[], + runner_logs=[], + last_updated=0, + no_connections_secs=no_connections_secs, + ) + await _process_job(session, worker, job) + ssh_tunnel_cls.assert_called_once() + runner_client_mock.pull.assert_called_once() + + await session.refresh(job) + assert job.status == expected_status + assert job.termination_reason == expected_termination_reason + assert job.inactivity_secs == expected_inactivity_secs + + @pytest.mark.parametrize( + ["samples", "expected_status"], + [ + pytest.param( + [ + (datetime(2023, 1, 1, 12, 25, 20, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 25, 30, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40), + ], + JobStatus.RUNNING, + id="not-enough-points", + ), + pytest.param( + [ + (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 80), + ], + JobStatus.RUNNING, + id="any-above-min", + ), + pytest.param( + [ + (datetime(2023, 1, 1, 12, 10, 10, tzinfo=timezone.utc), 80), + (datetime(2023, 1, 1, 12, 10, 20, tzinfo=timezone.utc), 80), + (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30), + (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40), + ], + JobStatus.TERMINATING, + id="all-below-min", + ), + ], + ) + @freeze_time(datetime(2023, 1, 1, 12, 30, tzinfo=timezone.utc)) + async def test_gpu_utilization( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + samples: list[tuple[datetime, int]], + expected_status: JobStatus, + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo 
= await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + run_name="test-run", + run_spec=get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=DevEnvironmentConfiguration( + name="test-run", + ide="vscode", + utilization_policy=UtilizationPolicy( + min_gpu_utilization=80, + time_window=600, + ), + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + last_processed_at=datetime(2023, 1, 1, 11, 30, tzinfo=timezone.utc), + ) + for timestamp, gpu_util in samples: + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=timestamp, + gpus_memory_usage_bytes=[1024, 1024], + gpus_util_percent=[gpu_util, 100], + ) + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as ssh_tunnel_cls, + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as runner_client_cls, + ): + runner_client_mock = runner_client_cls.return_value + runner_client_mock.pull.return_value = PullResponse( + job_states=[], + job_logs=[], + runner_logs=[], + last_updated=0, + no_connections_secs=0, + ) + await _process_job(session, worker, job) + ssh_tunnel_cls.assert_called_once() + runner_client_mock.pull.assert_called_once() + + await session.refresh(job) + assert job.status == expected_status + if expected_status == JobStatus.TERMINATING: + assert ( + job.termination_reason == JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY + ) + assert job.termination_reason_message == ( + "The job GPU utilization below 80% for 600 seconds" + ) + else: + assert job.termination_reason is None + assert job.termination_reason_message is None + 
+ @pytest.mark.parametrize("probe_count", [1, 2]) + async def test_creates_probe_models_and_not_registers_service_replica( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + probe_count: int, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="ubuntu", + probes=[ProbeConfig(type="http", url=f"/{i}") for i in range(probe_count)], + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + + assert len(job.probes) == 0 + await _process_job(session, worker, job) + + await session.refresh(job) + job = ( + await session.execute( + select(JobModel) + .where(JobModel.id == job.id) + .options(selectinload(JobModel.probes)) + ) + ).scalar_one() + assert job.status == JobStatus.RUNNING + assert [probe.probe_num for probe in job.probes] == list(range(probe_count)) + assert not job.registered + + async def test_registers_service_replica_immediately_if_no_probes( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + 
project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration(port=80, image="ubuntu"), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == JobStatus.RUNNING + assert job.registered + events = await list_events(session) + assert {event.message for event in events} == { + "Job status changed PULLING -> RUNNING", + "Service replica registered to receive requests", + } + + @pytest.mark.parametrize( + ("probes", "expect_to_register"), + [ + ([_ProbeSetup(success_streak=0, ready_after=1)], False), + ([_ProbeSetup(success_streak=1, ready_after=1)], True), + ( + [ + _ProbeSetup(success_streak=1, ready_after=1), + _ProbeSetup(success_streak=1, ready_after=2), + ], + False, + ), + ( + [ + _ProbeSetup(success_streak=1, ready_after=1), + _ProbeSetup(success_streak=3, ready_after=2), + ], + True, + ), + ], + ) + async def test_registers_service_replica_only_after_probes_pass( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + runner_client_mock: Mock, + probes: list[_ProbeSetup], + expect_to_register: bool, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="ubuntu", + probes=[ + 
ProbeConfig(type="http", url=f"/{i}", ready_after=probe.ready_after) + for i, probe in enumerate(probes) + ], + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + registered=False, + ) + for i, probe in enumerate(probes): + await create_probe( + session=session, + job=job, + probe_num=i, + success_streak=probe.success_streak, + ) + runner_client_mock.pull.return_value = PullResponse( + job_states=[], + job_logs=[], + runner_logs=[], + last_updated=0, + ) + + await _process_job(session, worker, job) + + await session.refresh(job) + events = await list_events(session) + if expect_to_register: + assert job.registered + assert len(events) == 1 + assert events[0].message == "Service replica registered to receive requests" + else: + assert not job.registered + assert not events + + async def test_registers_service_replica_in_gateway( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + mock_gateway_connection: AsyncMock, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="test-gateway", + wildcard_domain="example.com", + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + 
repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, image="ubuntu", gateway="test-gateway" + ), + ), + gateway=gateway, + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + fleet=fleet, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == JobStatus.RUNNING + assert job.registered + events = await list_events(session) + assert {event.message for event in events} == { + "Job status changed PULLING -> RUNNING", + "Service replica registered to receive requests", + } + mock_gateway_connection.return_value.client.return_value.__aenter__.return_value.register_replica.assert_called_once_with( + run=ANY, + job_spec=ANY, + job_submission=ANY, + instance_project_ssh_private_key=None, + ssh_head_proxy=None, + ssh_head_proxy_private_key=None, + ) + + async def test_registers_service_replica_in_gateway_when_running_on_imported_instance( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + mock_gateway_connection: AsyncMock, + ): + user = await create_user(session=session) + exporter_project = await create_project( + session=session, name="exporter", owner=user, ssh_private_key="exporter-private-key" + ) + importer_project = await create_project(session=session, name="importer", owner=user) + fleet = await create_fleet(session=session, project=exporter_project) + instance = await create_instance( + session=session, + project=exporter_project, + status=InstanceStatus.BUSY, + fleet=fleet, + ) + await create_export( + session=session, 
+ exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + repo = await create_repo(session=session, project_id=importer_project.id) + backend = await create_backend(session=session, project_id=importer_project.id) + gateway = await create_gateway( + session=session, + project_id=importer_project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="test-gateway", + wildcard_domain="example.com", + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, image="ubuntu", gateway="test-gateway" + ), + ), + gateway=gateway, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == JobStatus.RUNNING + assert job.registered + events = await list_events(session) + assert {event.message for event in events} == { + "Job status changed PULLING -> RUNNING", + "Service replica registered to receive requests", + } + mock_gateway_connection.return_value.client.return_value.__aenter__.return_value.register_replica.assert_called_once_with( + run=ANY, + job_spec=ANY, + job_submission=ANY, + instance_project_ssh_private_key="exporter-private-key", + ssh_head_proxy=None, + ssh_head_proxy_private_key=None, + ) + + @pytest.mark.parametrize("job_status", [JobStatus.RUNNING, JobStatus.PULLING]) + async def test_terminates_job_when_instance_access_revoked( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + job_status: JobStatus, + 
): + user = await create_user(session=session) + exporter_project = await create_project(session=session, name="exporter", owner=user) + importer_project = await create_project(session=session, name="importer", owner=user) + fleet = await create_fleet(session=session, project=exporter_project) + instance = await create_instance( + session=session, + project=exporter_project, + status=InstanceStatus.BUSY, + fleet=fleet, + ) + repo = await create_repo(session=session, project_id=importer_project.id) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=job_status, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + # No export created -> the import link no longer exists -> access revoked + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.INSTANCE_ACCESS_REVOKED + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == ( + f"Job status changed {job_status.upper()} -> TERMINATING." 
+ " Termination reason: INSTANCE_ACCESS_REVOKED" + " (The instance is no longer imported into the job's project)" + ) + + @pytest.mark.parametrize("job_status", [JobStatus.RUNNING, JobStatus.PULLING]) + async def test_does_not_terminate_job_when_instance_access_is_valid( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + runner_client_mock: Mock, + job_status: JobStatus, + ): + user = await create_user(session=session) + exporter_project = await create_project(session=session, name="exporter", owner=user) + importer_project = await create_project(session=session, name="importer", owner=user) + fleet = await create_fleet(session=session, project=exporter_project) + instance = await create_instance( + session=session, + project=exporter_project, + status=InstanceStatus.BUSY, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + repo = await create_repo(session=session, project_id=importer_project.id) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=job_status, + # dockerized=True so that the shim port is forwarded for the PULLING case + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + runner_client_mock.pull.return_value = PullResponse( + job_states=[], job_logs=[], runner_logs=[], last_updated=0 + ) + + await _process_job(session, worker, job) + + await session.refresh(job) + assert job.status == job_status + assert job.termination_reason is None + + async def test_apply_skips_probe_insert_when_lock_token_changes_after_processing( + self, + test_db, + session: AsyncSession, + worker: JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + runner_client_mock: Mock, + ): + project = await 
create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="ubuntu", + probes=[ProbeConfig(type="http", url="/health")], + ), + ), + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PULLING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + _lock_job(job) + await session.commit() + replacement_lock_token = uuid.uuid4() + shim_client_mock.get_task.return_value.status = TaskStatus.RUNNING + + async def invalidate_lock(*args, **kwargs): + job.lock_token = replacement_lock_token + await session.commit() + return b"" + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_file_archives", + new_callable=AsyncMock, + return_value=[], + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_job_code", + new_callable=AsyncMock, + side_effect=invalidate_lock, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._submit_job_to_runner", + return_value=_SubmitJobToRunnerResult( + success=True, + set_running_status=True, + ), + ), + ): + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.PULLING + assert job.lock_token == replacement_lock_token + probes = ( + (await session.execute(select(ProbeModel).where(ProbeModel.job_id == job.id))) + .scalars() + .all() + ) + assert probes == [] + + async def test_provisioning_shim_uses_server_default_registry( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: 
JobRunningWorker, + ssh_tunnel_mock: Mock, + shim_client_mock: Mock, + ): + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY", "registry.example") + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME", "server-user" + ) + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD", "server-pass" + ) + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="ubuntu"), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + instance_assigned=True, + ) + + await _process_job(session, worker, job) + + shim_client_mock.submit_task.assert_called_once() + call_kwargs = shim_client_mock.submit_task.call_args[1] + assert call_kwargs["image_name"] == "registry.example/ubuntu" + assert call_kwargs["registry_username"] == "server-user" + assert call_kwargs["registry_password"] == "server-pass" + + +def _router_service_configuration(router_type: str) -> ServiceConfiguration: + return ServiceConfiguration.parse_obj( + { + "type": "service", + "port": 8000, + "image": "ubuntu", + "replicas": [ + {"name": "worker", "commands": ["echo worker"], "count": 1}, + { + "name": "router", + "router": {"type": router_type}, + "commands": ["echo router"], + "count": 1, + }, + ], + } + ) + + +@pytest.mark.asyncio +class TestPrepareStartupContextRouterEnv: + def _make_context(self, *, submitted_at: datetime) -> _ProcessContext: + job_model = MagicMock() + job_model.submitted_at = submitted_at + 
job = MagicMock() + job.job_spec.replica_num = 0 + run = MagicMock() + run.jobs = [] + run.run_spec = MagicMock() + return _ProcessContext( + job_model=job_model, + run_model=MagicMock(), + run=run, + job=job, + job_submission=MagicMock(job_runtime_data=None), + job_provisioning_data=MagicMock(), + instance_access_revoked=False, + ) + + async def test_router_failed_terminates_worker(self): + context = self._make_context( + submitted_at=datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc), + ) + result = _ProcessResult() + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_router_env_for_job", + return_value=RouterEnvStatus.FAILED, + ): + out = await _prepare_startup_context(context=context, result=result) + assert out is None + assert ( + result.job_update_map.get("termination_reason") + == JobTerminationReason.TERMINATED_BY_SERVER + ) + assert "Router replica is in a terminal state" in ( + result.job_update_map.get("termination_reason_message") or "" + ) + + @freeze_time("2023-01-01 12:00:00+00:00") + async def test_router_not_provisioned_within_timeout_defers(self): + context = self._make_context( + submitted_at=datetime(2023, 1, 1, 11, 45, 0, tzinfo=timezone.utc), + ) + result = _ProcessResult() + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_router_env_for_job", + return_value=RouterEnvStatus.NOT_PROVISIONED, + ): + out = await _prepare_startup_context(context=context, result=result) + assert out is None + assert result.job_update_map == {} + + @freeze_time("2023-01-01 12:00:00+00:00") + async def test_router_not_provisioned_past_timeout_terminates(self): + context = self._make_context( + submitted_at=datetime(2023, 1, 1, 10, 0, 0, tzinfo=timezone.utc), + ) + result = _ProcessResult() + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_router_env_for_job", + return_value=RouterEnvStatus.NOT_PROVISIONED, + ): + out = await _prepare_startup_context(context=context, 
result=result) + assert out is None + assert ( + result.job_update_map.get("termination_reason") + == JobTerminationReason.TERMINATED_BY_SERVER + ) + msg = result.job_update_map.get("termination_reason_message") or "" + assert str(ROUTER_PROVISIONING_WAIT_TIMEOUT_SECONDS) in msg + assert "internal IP" in msg + + async def test_router_env_dict_populates_startup_context(self): + context = self._make_context( + submitted_at=datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc), + ) + result = _ProcessResult() + router_env = {"DSTACK_ROUTER_INTERNAL_IP": "10.1.2.3"} + + @asynccontextmanager + async def _fake_session_ctx(): + yield MagicMock() + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_router_env_for_job", + return_value=router_env, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_session_ctx", + _fake_session_ctx, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_job_attached_volumes", + new_callable=AsyncMock, + return_value=[], + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_repo_creds", + new_callable=AsyncMock, + return_value=None, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.get_project_secrets_mapping", + new_callable=AsyncMock, + return_value={}, + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.repo_model_to_repo_head_with_creds", + return_value=MagicMock(repo_creds=None), + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running.interpolate_job_spec_secrets", + ), + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_running._get_cluster_info", + return_value=ClusterInfo( + job_ips=["10.0.0.1"], master_job_ip="10.0.0.1", gpus_per_job=0 + ), + ), + ): + out = await _prepare_startup_context(context=context, result=result) + assert out is not None + assert out.router_env == router_env + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestFetchRunModelDynamoBranch: + async def test_dynamo_run_loads_all_non_terminated_replicas( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=_router_service_configuration("dynamo"), + ) + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + await create_job( + session=session, + run=run, + replica_num=0, + status=JobStatus.PROVISIONING, + ) + await create_job( + session=session, + run=run, + replica_num=1, + status=JobStatus.PROVISIONING, + ) + run_id = run.id + parsed = RunSpec.__response__.parse_raw(run.run_spec) + await session.commit() + session.expire_all() + run_model = await _fetch_run_model( + session=session, + run_id=run_id, + replica_num=0, + run_spec=parsed, + ) + assert {j.replica_num for j in run_model.jobs} == {0, 1} + + async def test_non_dynamo_loads_only_own_replica(self, test_db, session: AsyncSession): + """Without a Dynamo router, eager-load only the worker replica's jobs.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=_router_service_configuration("sglang"), + ) + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + await create_job( + session=session, + run=run, + replica_num=0, + status=JobStatus.PROVISIONING, + ) + await create_job( + session=session, + run=run, + replica_num=1, + status=JobStatus.PROVISIONING, + ) + run_id = run.id + parsed = RunSpec.__response__.parse_raw(run.run_spec) + await session.commit() + session.expire_all() + run_model = await 
_fetch_run_model( + session=session, + run_id=run_id, + replica_num=0, + run_spec=parsed, + ) + assert {j.replica_num for j in run_model.jobs} == {0} diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/__init__.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/conftest.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/conftest.py new file mode 100644 index 0000000000..a62770f34b --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/conftest.py @@ -0,0 +1,23 @@ +import asyncio +from datetime import timedelta +from unittest.mock import Mock + +import pytest + +from dstack._internal.server.background.pipeline_tasks.runs import RunFetcher, RunWorker + + +@pytest.fixture +def fetcher() -> RunFetcher: + return RunFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=5), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> RunWorker: + return RunWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/helpers.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/helpers.py new file mode 100644 index 0000000000..1edf90774c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/helpers.py @@ -0,0 +1,34 @@ +import datetime as dt +import uuid + +from dstack._internal.server.background.pipeline_tasks.runs import ( + RunPipeline, + RunPipelineItem, +) +from dstack._internal.server.models import RunModel + +LOCK_EXPIRES_AT = dt.datetime(2025, 1, 2, 3, 4, tzinfo=dt.timezone.utc) + + +def run_to_pipeline_item(run_model: RunModel) -> RunPipelineItem: + assert run_model.lock_token is not None + assert run_model.lock_expires_at is not None 
+ return RunPipelineItem( + __tablename__=run_model.__tablename__, + id=run_model.id, + lock_token=run_model.lock_token, + lock_expires_at=run_model.lock_expires_at, + prev_lock_expired=False, + status=run_model.status, + ) + + +def lock_run( + run_model: RunModel, + *, + lock_owner: str = RunPipeline.__name__, + lock_expires_at: dt.datetime = LOCK_EXPIRES_AT, +) -> None: + run_model.lock_token = uuid.uuid4() + run_model.lock_expires_at = lock_expires_at + run_model.lock_owner = lock_owner diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_active.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_active.py new file mode 100644 index 0000000000..551a957768 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_active.py @@ -0,0 +1,1047 @@ +import json +import uuid +from datetime import timedelta +from unittest.mock import AsyncMock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.configurations import ( + ScalingSpec, + ServiceConfiguration, + TaskConfiguration, +) +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.profiles import ( + Profile, + ProfileRetry, + RetryEvent, + Schedule, + StopCriteria, +) +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import ( + JobStatus, + JobTerminationReason, + RunStatus, + RunTerminationReason, +) +from dstack._internal.server.background.pipeline_tasks.runs import RunWorker +from dstack._internal.server.models import JobModel +from dstack._internal.server.services.jobs import get_job_spec +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + get_run_spec, +) +from dstack._internal.utils.common import 
get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_runs.helpers import ( + lock_run, + run_to_pipeline_item, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock") +class TestRunActiveWorker: + async def test_transitions_submitted_to_provisioning( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.SUBMITTED, + ) + await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + instance=instance, + instance_assigned=True, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.PROVISIONING + assert run.lock_token is None + + async def test_transitions_provisioning_to_running( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.PROVISIONING, + ) + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + assert run.lock_token is None + + async def 
test_terminates_run_when_all_jobs_done( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + await create_job( + session=session, + run=run, + status=JobStatus.DONE, + termination_reason=JobTerminationReason.DONE_BY_RUNNER, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.TERMINATING + assert run.termination_reason == RunTerminationReason.ALL_JOBS_DONE + assert run.lock_token is None + + async def test_terminates_run_on_job_failure( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.TERMINATING + assert run.termination_reason == RunTerminationReason.JOB_FAILED + assert run.lock_token is None + + async def test_retries_failed_replica_within_retry_duration( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """When a replica fails within retry duration, the run goes to PENDING with + resubmission_attempt incremented. 
The pending worker then creates the new submission.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=3600, on_events=[RetryEvent.ERROR]), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.RUNNING, + resubmission_attempt=0, + ) + old_time = get_current_datetime() - timedelta(minutes=5) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + job_provisioning_data=get_job_provisioning_data(), + last_processed_at=old_time, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + # Retryable failure → PENDING with resubmission_attempt incremented + assert run.status == RunStatus.PENDING + assert run.resubmission_attempt == 1 + assert run.lock_token is None + + async def test_retries_no_capacity_replica_and_keeps_service_running( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=3600, on_events=[RetryEvent.INTERRUPTION]), + ), + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=2, max=2), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + interrupted_job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + 
termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, + submitted_at=run.submitted_at, + last_processed_at=run.last_processed_at, + replica_num=0, + job_provisioning_data=get_job_provisioning_data(), + ) + healthy_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + submitted_at=run.submitted_at, + last_processed_at=run.last_processed_at, + replica_num=1, + job_provisioning_data=get_job_provisioning_data(), + ) + lock_run(run) + await session.commit() + + now = run.submitted_at + timedelta(minutes=3) + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime", + return_value=now, + ): + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + await session.refresh(interrupted_job) + await session.refresh(healthy_job) + + jobs = list( + ( + await session.execute( + select(JobModel) + .where(JobModel.run_id == run.id) + .order_by(JobModel.replica_num, JobModel.submission_num) + ) + ).scalars() + ) + retried_job = next(job for job in jobs if job.replica_num == 0 and job.submission_num == 1) + + assert run.status == RunStatus.RUNNING + assert interrupted_job.status == JobStatus.TERMINATING + assert ( + interrupted_job.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY + ) + assert healthy_job.status == JobStatus.RUNNING + assert retried_job.status == JobStatus.SUBMITTED + assert len(jobs) == 3 + + async def test_retries_scheduled_run_no_capacity_from_trigger_time( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=3600, on_events=[RetryEvent.NO_CAPACITY]), + ), + configuration=TaskConfiguration( + commands=["echo hello"], + schedule=Schedule(cron="15 * 
* * *"), + ), + ) + trigger_time = get_current_datetime() - timedelta(minutes=5) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.SUBMITTED, + submitted_at=get_current_datetime() - timedelta(hours=2), + next_triggered_at=trigger_time, + resubmission_attempt=0, + ) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + ) + lock_run(run) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime", + return_value=trigger_time + timedelta(minutes=10), + ): + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.PENDING + assert run.resubmission_attempt == 1 + assert run.lock_token is None + + async def test_terminates_scheduled_run_when_no_capacity_retry_exceeded_from_trigger_time( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=600, on_events=[RetryEvent.NO_CAPACITY]), + ), + configuration=TaskConfiguration( + commands=["echo hello"], + schedule=Schedule(cron="15 * * * *"), + ), + ) + trigger_time = get_current_datetime() - timedelta(minutes=20) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.SUBMITTED, + submitted_at=get_current_datetime() - timedelta(hours=2), + next_triggered_at=trigger_time, + resubmission_attempt=0, + ) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + ) + 
lock_run(run) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime", + return_value=trigger_time + timedelta(minutes=20), + ): + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.TERMINATING + assert run.termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED + assert run.lock_token is None + + async def test_retrying_multinode_replica_terminates_active_sibling_jobs( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=3600, on_events=[RetryEvent.ERROR]), + ), + configuration=TaskConfiguration( + commands=["echo hello"], + nodes=2, + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + failed_job = await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + replica_num=0, + job_num=0, + job_provisioning_data=get_job_provisioning_data(), + last_processed_at=run.submitted_at, + ) + running_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=0, + job_num=1, + job_provisioning_data=get_job_provisioning_data(), + last_processed_at=run.submitted_at, + ) + lock_run(run) + await session.commit() + + now = run.submitted_at + timedelta(minutes=1) + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime", + return_value=now, + ): + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + await session.refresh(failed_job) + await 
session.refresh(running_job) + + assert run.status == RunStatus.PENDING + assert failed_job.status == JobStatus.FAILED + assert running_job.status == JobStatus.TERMINATING + assert running_job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert running_job.termination_reason_message == "Run is to be resubmitted" + + async def test_transitions_to_pending_when_retry_duration_exceeded( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + name="default", + retry=ProfileRetry(duration=60, on_events=[RetryEvent.ERROR]), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.RUNNING, + resubmission_attempt=0, + ) + # Last provisioned long ago so retry duration is exceeded + very_old_time = get_current_datetime() - timedelta(hours=2) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, + job_provisioning_data=get_job_provisioning_data(), + last_processed_at=very_old_time, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.TERMINATING + assert run.termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED + assert run.lock_token is None + + async def test_stops_on_master_done( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(name="default", 
stop_criteria=StopCriteria.MASTER_DONE), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + # Master job (job_num=0) is done + await create_job( + session=session, + run=run, + status=JobStatus.DONE, + termination_reason=JobTerminationReason.DONE_BY_RUNNER, + job_num=0, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.TERMINATING + assert run.termination_reason == RunTerminationReason.ALL_JOBS_DONE + + async def test_sets_fleet_id_from_job_instance( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.SUBMITTED, + ) + assert run.fleet_id is None + await create_job( + session=session, + run=run, + status=JobStatus.PROVISIONING, + instance=instance, + instance_assigned=True, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.fleet_id == fleet.id + + async def test_service_noop_when_at_desired_count( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with 1 RUNNING replica and desired=1 stays RUNNING, no new jobs.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + 
configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + assert run.desired_replica_count == 1 + assert run.desired_replica_counts is not None + counts = json.loads(run.desired_replica_counts) + assert counts == {"0": 1} + assert run.lock_token is None + + async def test_service_scale_up( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with min=2 and 1 RUNNING replica creates 1 new SUBMITTED job.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=2, max=2), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.SUBMITTED, + ) + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=0, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + assert run.desired_replica_count == 2 + + res = await session.execute( + select(JobModel).where(JobModel.run_id == run.id).order_by(JobModel.replica_num) + ) + jobs = list(res.scalars().all()) + assert len(jobs) == 2 + assert jobs[0].status == JobStatus.RUNNING + assert jobs[0].replica_num == 0 + assert jobs[1].status == 
JobStatus.SUBMITTED + assert jobs[1].replica_num == 1 + + async def test_service_scale_down( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with min=1 and 2 RUNNING replicas terminates 1 with SCALED_DOWN.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=1, max=1), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + run.desired_replica_count = 2 + run.desired_replica_counts = json.dumps({"0": 2}) + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=0, + ) + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=1, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + assert run.desired_replica_count == 1 + + res = await session.execute( + select(JobModel).where(JobModel.run_id == run.id).order_by(JobModel.replica_num) + ) + jobs = list(res.scalars().all()) + assert len(jobs) == 2 + # One should remain RUNNING, the other should be TERMINATING with SCALED_DOWN + running = [j for j in jobs if j.status == JobStatus.RUNNING] + terminating = [j for j in jobs if j.status == JobStatus.TERMINATING] + assert len(running) == 1 + assert len(terminating) == 1 + assert terminating[0].termination_reason == JobTerminationReason.SCALED_DOWN + + async def test_service_zero_scale_noop( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Active service with 0 desired and no active replicas stays in current status.""" + project 
= await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=0, max=2), + scaling=ScalingSpec(metric="rps", target=10), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + run.desired_replica_count = 0 + run.desired_replica_counts = json.dumps({"0": 0}) + # Create a terminated/scaled-down job to have some job history + await create_job( + session=session, + run=run, + status=JobStatus.TERMINATED, + termination_reason=JobTerminationReason.SCALED_DOWN, + replica_num=0, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + # All replicas scaled down → transitions to PENDING + assert run.status == RunStatus.PENDING + assert run.lock_token is None + + async def test_noops_when_run_lock_changes_after_processing( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + await create_job( + session=session, + run=run, + status=JobStatus.DONE, + termination_reason=JobTerminationReason.DONE_BY_RUNNER, + ) + lock_run(run) + await session.commit() + item = run_to_pipeline_item(run) + new_lock_token = uuid.uuid4() + + from dstack._internal.server.background.pipeline_tasks.runs.active import ( + ActiveResult, + ActiveRunUpdateMap, + ) + + async def intercept_process(context): + # Change the lock token to simulate concurrent 
modification + run.lock_token = new_lock_token + run.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + await session.commit() + return ActiveResult( + run_update_map=ActiveRunUpdateMap( + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.ALL_JOBS_DONE, + ), + new_job_models=[], + job_id_to_update_map={}, + ) + + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.active.process_active_run", + new=AsyncMock(side_effect=intercept_process), + ): + await worker.process(item) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + assert run.lock_token == new_lock_token + + async def test_service_in_place_deployment_bump( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with 1 RUNNING replica at deployment_num=0, run at deployment_num=1, + same job spec → job gets deployment_num bumped to 1.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + deployment_num=1, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + deployment_num=0, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + + await session.refresh(job) + assert job.deployment_num == 1 + + async def test_service_rolling_deployment_scale_up( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with 1 out-of-date RUNNING replica whose spec differs from the new + deployment, desired=1 → creates 1 
new replica (surge), old registered replica + untouched.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo new!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + deployment_num=1, + ) + old_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + deployment_num=0, + registered=True, + replica_num=0, + ) + # Make the old job's spec differ from the current run_spec so in-place bump + # cannot be applied and rolling deployment is triggered instead. + old_spec = get_job_spec(old_job) + old_spec.commands = ["echo old!"] + old_job.job_spec_data = old_spec.json() + await session.commit() + + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + + res = await session.execute( + select(JobModel).where(JobModel.run_id == run.id).order_by(JobModel.replica_num) + ) + jobs = list(res.scalars().all()) + assert len(jobs) == 2 + # Old replica still RUNNING (registered, not terminated during rolling) + assert jobs[0].status == JobStatus.RUNNING + assert jobs[0].deployment_num == 0 + # New surge replica created + assert jobs[1].status == JobStatus.SUBMITTED + assert jobs[1].deployment_num == 1 + + async def test_service_rolling_deployment_scale_down_old_unregistered( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service with 1 up-to-date RUNNING+registered and 1 out-of-date RUNNING+unregistered + replica (with a different spec) → old unregistered replica terminated.""" + project = await create_project(session=session) + user = await 
create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo new!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + deployment_num=1, + ) + # Up-to-date registered replica + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + deployment_num=1, + registered=True, + replica_num=0, + ) + # Out-of-date unregistered replica with different spec + old_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + deployment_num=0, + registered=False, + replica_num=1, + ) + old_spec = get_job_spec(old_job) + old_spec.commands = ["echo old!"] + old_job.job_spec_data = old_spec.json() + await session.commit() + + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + + await session.refresh(old_job) + assert old_job.status == JobStatus.TERMINATING + assert old_job.termination_reason == JobTerminationReason.SCALED_DOWN + + async def test_service_removed_group_cleanup( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + """Service run with jobs belonging to group "old" not in current config → + those jobs get TERMINATING with SCALED_DOWN.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + # Current config only has group "0" (default) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + 
run_name="service-run", + run_spec=run_spec, + status=RunStatus.RUNNING, + ) + # Active replica in current group "0" + await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=0, + ) + # Replica belonging to a removed group "old" — manually set job_spec_data + old_group_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + replica_num=1, + ) + # Patch the job spec to have replica_group="old" + old_spec = get_job_spec(old_group_job) + old_spec.replica_group = "old" + old_group_job.job_spec_data = old_spec.json() + await session.commit() + + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.RUNNING + + await session.refresh(old_group_job) + assert old_group_job.status == JobStatus.TERMINATING + assert old_group_job.termination_reason == JobTerminationReason.SCALED_DOWN diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pending.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pending.py new file mode 100644 index 0000000000..6cfe5fe00e --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pending.py @@ -0,0 +1,291 @@ +import json +import uuid +from datetime import timedelta +from unittest.mock import AsyncMock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.configurations import ScalingSpec, ServiceConfiguration +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import ( + JobStatus, + RunStatus, +) +from dstack._internal.server.background.pipeline_tasks.runs import RunWorker +from dstack._internal.server.models import JobModel +from dstack._internal.server.testing.common import ( + create_job, + create_project, + create_repo, + create_run, + create_user, + get_run_spec, +) +from 
dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_runs.helpers import ( + lock_run, + run_to_pipeline_item, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock") +class TestRunPendingWorker: + async def test_submits_non_service_run_and_creates_job( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.PENDING, + resubmission_attempt=0, + next_triggered_at=None, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.SUBMITTED + assert run.desired_replica_count == 1 + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + res = await session.execute(select(JobModel).where(JobModel.run_id == run.id)) + jobs = list(res.scalars().all()) + assert len(jobs) == 1 + assert jobs[0].status == JobStatus.SUBMITTED + assert jobs[0].replica_num == 0 + assert jobs[0].submission_num == 0 + + async def test_skips_retrying_run_when_delay_not_met( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.PENDING, + resubmission_attempt=1, + ) + # Create a job with recent last_processed_at so retry delay is not met + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + 
last_processed_at=get_current_datetime(), + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.PENDING + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + async def test_resubmits_retrying_run_after_delay( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.PENDING, + resubmission_attempt=1, + ) + # Create a job with old last_processed_at so retry delay is met (>15s for attempt 1) + old_time = get_current_datetime() - timedelta(minutes=1) + old_job = await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + last_processed_at=old_time, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.SUBMITTED + assert run.desired_replica_count == 1 + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + # Should have created a new job (retry of the failed one) + res = await session.execute( + select(JobModel) + .where(JobModel.run_id == run.id) + .order_by(JobModel.submitted_at.desc()) + ) + jobs = list(res.scalars().all()) + assert len(jobs) == 2 + new_job = next(j for j in jobs if j.id != old_job.id) + assert new_job.status == JobStatus.SUBMITTED + assert new_job.replica_num == old_job.replica_num + assert new_job.submission_num == old_job.submission_num + 1 + + async def test_noops_when_run_lock_changes_after_processing( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await 
create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.PENDING, + resubmission_attempt=0, + next_triggered_at=None, + ) + lock_run(run) + await session.commit() + item = run_to_pipeline_item(run) + new_lock_token = uuid.uuid4() + + from dstack._internal.server.background.pipeline_tasks.runs.pending import ( + PendingResult, + PendingRunUpdateMap, + ) + + async def intercept_process(context): + # Change the lock token to simulate concurrent modification + run.lock_token = new_lock_token + run.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + await session.commit() + # Return a result that would normally cause a state change + return PendingResult( + run_update_map=PendingRunUpdateMap( + status=RunStatus.SUBMITTED, + desired_replica_count=1, + ), + new_job_models=[], + ) + + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.pending.process_pending_run", + new=AsyncMock(side_effect=intercept_process), + ): + await worker.process(item) + + await session.refresh(run) + assert run.status == RunStatus.PENDING + assert run.lock_token == new_lock_token + + async def test_submits_service_run_and_creates_jobs( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=2, max=2), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.PENDING, + resubmission_attempt=0, + next_triggered_at=None, + ) + lock_run(run) + await session.commit() + + 
await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.SUBMITTED + assert run.desired_replica_count == 2 + assert run.desired_replica_counts is not None + counts = json.loads(run.desired_replica_counts) + assert counts == {"0": 2} + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + res = await session.execute(select(JobModel).where(JobModel.run_id == run.id)) + jobs = list(res.scalars().all()) + assert len(jobs) == 2 + replica_nums = sorted(j.replica_num for j in jobs) + assert replica_nums == [0, 1] + assert all(j.status == JobStatus.SUBMITTED for j in jobs) + assert all(j.submission_num == 0 for j in jobs) + + async def test_noops_for_zero_scaled_service( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="service-run", + configuration=ServiceConfiguration( + port=8080, + commands=["echo Hi!"], + replicas=Range[int](min=0, max=2), + scaling=ScalingSpec(metric="rps", target=10), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="service-run", + run_spec=run_spec, + status=RunStatus.PENDING, + resubmission_attempt=0, + next_triggered_at=None, + ) + # Set desired_replica_count=0 and desired_replica_counts to match zero-scaled state. 
+ run.desired_replica_count = 0 + run.desired_replica_counts = json.dumps({"0": 0}) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.PENDING + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + res = await session.execute(select(JobModel).where(JobModel.run_id == run.id)) + jobs = list(res.scalars().all()) + assert len(jobs) == 0 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pipeline.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pipeline.py new file mode 100644 index 0000000000..feda67cf3c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_pipeline.py @@ -0,0 +1,267 @@ +import datetime as dt +import uuid + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.server.background.pipeline_tasks.runs import ( + RunFetcher, + RunPipeline, +) +from dstack._internal.server.testing.common import ( + create_project, + create_repo, + create_run, + create_user, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestRunFetcher: + async def test_fetch_selects_eligible_runs_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: RunFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + stale = now - dt.timedelta(minutes=1) + + submitted = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="submitted", + status=RunStatus.SUBMITTED, + submitted_at=stale - dt.timedelta(seconds=5), + ) + running = await 
create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="running", + status=RunStatus.RUNNING, + submitted_at=stale - dt.timedelta(seconds=4), + ) + pending_retry = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="pending-retry", + status=RunStatus.PENDING, + submitted_at=stale - dt.timedelta(seconds=3), + resubmission_attempt=1, + ) + pending_scheduled_ready = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="pending-scheduled-ready", + status=RunStatus.PENDING, + submitted_at=stale - dt.timedelta(seconds=2), + next_triggered_at=stale, + ) + pending_zero_scaled = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="pending-zero-scaled", + status=RunStatus.PENDING, + submitted_at=stale - dt.timedelta(seconds=1), + ) + future_scheduled = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="future-scheduled", + status=RunStatus.PENDING, + submitted_at=stale, + next_triggered_at=now + dt.timedelta(minutes=1), + ) + finished = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="finished", + status=RunStatus.DONE, + submitted_at=stale + dt.timedelta(seconds=1), + ) + recent = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="recent", + status=RunStatus.RUNNING, + submitted_at=now, + last_processed_at=now + dt.timedelta(seconds=10), + ) + recent_terminating_skip = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="recent-terminating-skip", + status=RunStatus.TERMINATING, + submitted_at=now, + last_processed_at=now + dt.timedelta(seconds=9), + ) + recent_terminating_skip.skip_min_processing_interval = True + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + submitted.id, + running.id, + 
pending_retry.id, + pending_scheduled_ready.id, + pending_zero_scaled.id, + recent_terminating_skip.id, + } + assert {item.id: item.status for item in items} == { + submitted.id: RunStatus.SUBMITTED, + running.id: RunStatus.RUNNING, + pending_retry.id: RunStatus.PENDING, + pending_scheduled_ready.id: RunStatus.PENDING, + pending_zero_scaled.id: RunStatus.PENDING, + recent_terminating_skip.id: RunStatus.TERMINATING, + } + + for run in [ + submitted, + running, + pending_retry, + pending_scheduled_ready, + pending_zero_scaled, + future_scheduled, + finished, + recent, + recent_terminating_skip, + ]: + await session.refresh(run) + + fetched_runs = [ + submitted, + running, + pending_retry, + pending_scheduled_ready, + pending_zero_scaled, + recent_terminating_skip, + ] + assert all(run.lock_owner == RunPipeline.__name__ for run in fetched_runs) + assert all(run.lock_expires_at is not None for run in fetched_runs) + assert all(run.lock_token is not None for run in fetched_runs) + assert all(not run.skip_min_processing_interval for run in fetched_runs) + assert len({run.lock_token for run in fetched_runs}) == 1 + + assert future_scheduled.lock_owner is None + assert finished.lock_owner is None + assert recent.lock_owner is None + + async def test_fetch_respects_order_and_limit( + self, test_db, session: AsyncSession, fetcher: RunFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + + oldest = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="oldest", + status=RunStatus.SUBMITTED, + submitted_at=now - dt.timedelta(minutes=3), + ) + middle = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="middle", + status=RunStatus.RUNNING, + submitted_at=now - dt.timedelta(minutes=2), + ) + newest = await create_run( + session=session, + 
project=project, + repo=repo, + user=user, + run_name="newest", + status=RunStatus.SUBMITTED, + submitted_at=now - dt.timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == RunPipeline.__name__ + assert middle.lock_owner == RunPipeline.__name__ + assert newest.lock_owner is None + + async def test_fetch_retries_expired_same_owner_lock_and_skips_foreign_live_lock( + self, test_db, session: AsyncSession, fetcher: RunFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + stale = now - dt.timedelta(minutes=1) + + expired_same_owner = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="expired-same-owner", + status=RunStatus.RUNNING, + submitted_at=stale, + ) + expired_same_owner.lock_expires_at = stale + expired_same_owner.lock_token = uuid.uuid4() + expired_same_owner.lock_owner = RunPipeline.__name__ + + foreign_locked = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="foreign-locked", + status=RunStatus.SUBMITTED, + submitted_at=stale + dt.timedelta(seconds=1), + ) + foreign_locked.lock_expires_at = now + dt.timedelta(minutes=1) + foreign_locked.lock_token = uuid.uuid4() + foreign_locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [expired_same_owner.id] + assert items[0].prev_lock_expired is True + + await session.refresh(expired_same_owner) + await session.refresh(foreign_locked) + + assert expired_same_owner.lock_owner == RunPipeline.__name__ + assert expired_same_owner.lock_expires_at is not None + assert 
expired_same_owner.lock_token is not None + assert foreign_locked.lock_owner == "OtherPipeline" diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_termination.py b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_termination.py new file mode 100644 index 0000000000..69016a915c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_runs/test_termination.py @@ -0,0 +1,408 @@ +import uuid +from datetime import datetime, timedelta, timezone +from typing import Optional +from unittest.mock import AsyncMock, patch + +import pytest +from freezegun import freeze_time +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.configurations import TaskConfiguration +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.profiles import Schedule +from dstack._internal.core.models.runs import ( + JobStatus, + JobTerminationReason, + RunStatus, + RunTerminationReason, +) +from dstack._internal.server.background.pipeline_tasks.jobs_terminating import ( + JobTerminatingPipeline, +) +from dstack._internal.server.background.pipeline_tasks.runs import RunPipeline, RunWorker +from dstack._internal.server.background.pipeline_tasks.runs.terminating import ( + TerminatingResult, + process_terminating_run, +) +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + get_run_spec, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_runs.helpers import ( + lock_run, + run_to_pipeline_item, +) + + +def _lock_job( + job_model, + *, + lock_owner: str = RunPipeline.__name__, + lock_expires_at: Optional[datetime] = None, +) -> None: + if lock_expires_at is None: + lock_expires_at = get_current_datetime() + timedelta(seconds=30) + 
job_model.lock_token = uuid.uuid4() + job_model.lock_expires_at = lock_expires_at + job_model.lock_owner = lock_owner + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock") +class TestRunTerminatingWorker: + async def test_transitions_running_jobs_to_terminating( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + lock_run(run) + await session.commit() + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(job) + await session.refresh(run) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert job.graceful_termination_attempts == 0 + assert job.skip_min_processing_interval + assert job.remove_at is None + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + assert run.status == RunStatus.TERMINATING + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + async def test_updates_delayed_and_regular_jobs_separately( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, 
project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + delayed_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + regular_job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + job_num=1, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(delayed_job) + await session.refresh(regular_job) + assert delayed_job.status == JobStatus.TERMINATING + assert delayed_job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert delayed_job.graceful_termination_attempts == 0 + assert delayed_job.skip_min_processing_interval + assert delayed_job.remove_at is None + assert regular_job.status == JobStatus.TERMINATING + assert regular_job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert regular_job.graceful_termination_attempts is None + assert regular_job.skip_min_processing_interval + assert regular_job.remove_at is None + + async def test_finishes_non_scheduled_run_when_all_jobs_are_finished( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.EXECUTOR_ERROR, + ) + lock_run(run) + 
await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.FAILED + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + @freeze_time(datetime(2023, 1, 2, 3, 10, tzinfo=timezone.utc)) + async def test_reschedules_scheduled_run_and_clears_fleet( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="scheduled-run", + configuration=TaskConfiguration( + nodes=1, + schedule=Schedule(cron="15 * * * *"), + commands=["echo Hi!"], + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_name="scheduled-run", + run_spec=run_spec, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.ALL_JOBS_DONE, + resubmission_attempt=1, + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + assert run.status == RunStatus.PENDING + assert run.next_triggered_at == datetime(2023, 1, 2, 3, 15, tzinfo=timezone.utc) + assert run.resubmission_attempt == 0 + assert run.fleet_id is None + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + async def test_noops_when_run_lock_changes_after_processing( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + 
termination_reason=RunTerminationReason.JOB_FAILED, + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + lock_run(run) + await session.commit() + item = run_to_pipeline_item(run) + new_lock_token = uuid.uuid4() + original_process_terminating_run = process_terminating_run + + async def change_run_lock(context) -> TerminatingResult: + run.lock_token = new_lock_token + run.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + await session.commit() + return await original_process_terminating_run(context) + + with patch( + "dstack._internal.server.background.pipeline_tasks.runs.terminating.process_terminating_run", + new=AsyncMock(side_effect=change_run_lock), + ): + await worker.process(item) + + await session.refresh(run) + await session.refresh(job) + assert run.status == RunStatus.TERMINATING + assert run.lock_token == new_lock_token + assert run.lock_owner == RunPipeline.__name__ + assert job.status == JobStatus.RUNNING + assert job.graceful_termination_attempts is None + assert job.remove_at is None + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + + async def test_resets_run_lock_when_related_job_is_locked_by_another_pipeline( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + ) + _lock_job(job, 
lock_owner=JobTerminatingPipeline.__name__) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + await session.refresh(job) + assert run.status == RunStatus.TERMINATING + assert run.lock_owner == RunPipeline.__name__ + assert run.lock_token is None + assert run.lock_expires_at is None + assert job.status == JobStatus.SUBMITTED + assert job.lock_owner == JobTerminatingPipeline.__name__ + + async def test_reclaims_expired_same_owner_related_job_lock( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + ) + _lock_job( + job, + lock_owner=RunPipeline.__name__, + lock_expires_at=get_current_datetime() - timedelta(minutes=1), + ) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None + + async def test_ignores_already_terminating_jobs_when_locking_related_jobs( + self, test_db, session: AsyncSession, worker: RunWorker + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + 
project=project, + repo=repo, + user=user, + status=RunStatus.TERMINATING, + termination_reason=RunTerminationReason.JOB_FAILED, + ) + terminating_job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_SERVER, + ) + submitted_job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + job_num=1, + ) + _lock_job(terminating_job, lock_owner=JobTerminatingPipeline.__name__) + lock_run(run) + await session.commit() + + await worker.process(run_to_pipeline_item(run)) + + await session.refresh(run) + await session.refresh(terminating_job) + await session.refresh(submitted_job) + assert terminating_job.status == JobStatus.TERMINATING + assert terminating_job.lock_owner == JobTerminatingPipeline.__name__ + assert submitted_job.status == JobStatus.TERMINATING + assert submitted_job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert submitted_job.lock_token is None + assert submitted_job.lock_expires_at is None + assert submitted_job.lock_owner is None + assert run.lock_token is None + assert run.lock_expires_at is None + assert run.lock_owner is None diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_service_router_worker_sync.py b/src/tests/_internal/server/background/pipeline_tasks/test_service_router_worker_sync.py new file mode 100644 index 0000000000..5827b14966 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_service_router_worker_sync.py @@ -0,0 +1,453 @@ +import asyncio +import uuid +from datetime import timedelta +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.configurations import parse_run_configuration +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.server.background.pipeline_tasks.service_router_worker_sync import ( + 
ServiceRouterWorkerSyncFetcher, + ServiceRouterWorkerSyncPipeline, + ServiceRouterWorkerSyncPipelineItem, + ServiceRouterWorkerSyncWorker, +) +from dstack._internal.server.models import RunModel, ServiceRouterWorkerSyncModel +from dstack._internal.server.testing.common import ( + create_project, + create_repo, + create_run, + create_user, + get_run_spec, +) +from dstack._internal.utils.common import get_current_datetime + + +def _router_service_run_spec(repo_id: str, run_name: str = "test-run"): + conf = parse_run_configuration( + { + "type": "service", + "port": 8000, + "gateway": False, + "replicas": [ + { + "name": "router", + "count": 1, + "commands": ["sglang serve"], + "router": {"type": "sglang"}, + }, + {"name": "worker", "count": 2, "commands": ["worker"]}, + ], + } + ) + return get_run_spec(repo_id=repo_id, run_name=run_name, configuration=conf) + + +async def _add_service_router_worker_sync_row( + session: AsyncSession, + run_id: uuid.UUID, + *, + deleted: bool = False, + created_at=None, + last_processed_at=None, +) -> ServiceRouterWorkerSyncModel: + now = get_current_datetime() + if created_at is None: + created_at = now + if last_processed_at is None: + last_processed_at = now + row = ServiceRouterWorkerSyncModel( + id=uuid.uuid4(), + run_id=run_id, + deleted=deleted, + created_at=created_at, + last_processed_at=last_processed_at, + ) + session.add(row) + await session.commit() + return row + + +def _sync_row_to_pipeline_item( + sync_row: ServiceRouterWorkerSyncModel, +) -> ServiceRouterWorkerSyncPipelineItem: + assert sync_row.lock_token is not None + assert sync_row.lock_expires_at is not None + return ServiceRouterWorkerSyncPipelineItem( + __tablename__=ServiceRouterWorkerSyncModel.__tablename__, + id=sync_row.id, + lock_token=sync_row.lock_token, + lock_expires_at=sync_row.lock_expires_at, + prev_lock_expired=False, + run_id=sync_row.run_id, + ) + + +@pytest.fixture +def fetcher() -> ServiceRouterWorkerSyncFetcher: + return 
ServiceRouterWorkerSyncFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=5), + lock_timeout=timedelta(seconds=25), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> ServiceRouterWorkerSyncWorker: + return ServiceRouterWorkerSyncWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestServiceRouterWorkerSyncFetcher: + async def test_fetch_selects_eligible_sync_rows_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: ServiceRouterWorkerSyncFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + # Case 1: eligible row + # This row should be fetched. + running = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="running", + status=RunStatus.RUNNING, + submitted_at=stale, + last_processed_at=stale, + run_spec=_router_service_run_spec(repo.name, "running"), + ) + eligible = await _add_service_router_worker_sync_row( + session, + running.id, + created_at=stale, + last_processed_at=stale, + ) + # Case 2: run is submitted. + # This row should not be fetched because the fetcher only wants RUNNING runs. + submitted = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="submitted", + status=RunStatus.SUBMITTED, + submitted_at=stale, + last_processed_at=stale, + run_spec=_router_service_run_spec(repo.name, "submitted"), + ) + sync_submitted = await _add_service_router_worker_sync_row( + session, + submitted.id, + created_at=stale, + last_processed_at=stale, + ) + # Case 3: sync row processed too recently. + # This row should not be fetched because it is too recent. 
+ too_recent = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="too-recent", + status=RunStatus.RUNNING, + submitted_at=stale, + last_processed_at=now, + run_spec=_router_service_run_spec(repo.name, "too-recent"), + ) + created_earlier = now - timedelta(days=1) + sync_too_recent = await _add_service_router_worker_sync_row( + session, + too_recent.id, + created_at=created_earlier, + last_processed_at=now, + ) + # Case 4: sync row already marked deleted. + # This row should not be fetched. + deleted_sync_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="deleted-sync", + status=RunStatus.RUNNING, + submitted_at=stale, + last_processed_at=stale, + run_spec=_router_service_run_spec(repo.name, "deleted-sync"), + ) + sync_deleted = await _add_service_router_worker_sync_row( + session, + deleted_sync_run.id, + created_at=stale, + last_processed_at=stale, + deleted=True, + ) + # Case 5: sync row locked by another pipeline. + # This row should not be fetched. + locked = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="locked", + status=RunStatus.RUNNING, + submitted_at=stale, + last_processed_at=stale, + run_spec=_router_service_run_spec(repo.name, "locked"), + ) + sync_locked = await _add_service_router_worker_sync_row( + session, + locked.id, + created_at=stale, + last_processed_at=stale, + ) + sync_locked.lock_expires_at = now + timedelta(minutes=1) + sync_locked.lock_token = uuid.uuid4() + sync_locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + # Only case 1 should be fetched. 
+ assert {item.id for item in items} == {eligible.id} + + for row in [ + eligible, + sync_submitted, + sync_too_recent, + sync_deleted, + sync_locked, + ]: + await session.refresh(row) + + assert eligible.lock_owner == ServiceRouterWorkerSyncPipeline.__name__ + assert eligible.lock_expires_at is not None + assert eligible.lock_token is not None + + assert sync_submitted.lock_owner is None + assert sync_too_recent.lock_owner is None + assert sync_deleted.lock_owner is None + assert sync_locked.lock_owner == "OtherPipeline" + + # test_fetch_returns_oldest_sync_rows_first_up_to_limit answers: "When several rows are all + # eligible, does SQL ORDER BY last_processed_at and LIMIT behave as intended?" That's ordering + # + batch size, not eligibility. + async def test_fetch_returns_oldest_sync_rows_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: ServiceRouterWorkerSyncFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + spec = _router_service_run_spec(repo.name) + + oldest_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="oldest", + status=RunStatus.RUNNING, + run_spec=spec, + last_processed_at=now - timedelta(minutes=3), + ) + oldest = await _add_service_router_worker_sync_row( + session, + oldest_run.id, + last_processed_at=now - timedelta(minutes=3), + ) + middle_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="middle", + status=RunStatus.RUNNING, + run_spec=spec, + last_processed_at=now - timedelta(minutes=2), + ) + middle = await _add_service_router_worker_sync_row( + session, + middle_run.id, + last_processed_at=now - timedelta(minutes=2), + ) + newest_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="newest", + status=RunStatus.RUNNING, + 
run_spec=spec, + last_processed_at=now - timedelta(minutes=1), + ) + newest = await _add_service_router_worker_sync_row( + session, + newest_run.id, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == ServiceRouterWorkerSyncPipeline.__name__ + assert middle.lock_owner == ServiceRouterWorkerSyncPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestServiceRouterWorkerSyncWorker: + async def test_process_skips_when_lock_token_changes( + self, + test_db, + session: AsyncSession, + worker: ServiceRouterWorkerSyncWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + run_spec=_router_service_run_spec(repo.name), + ) + sync_row = await _add_service_router_worker_sync_row(session, run.id) + sync_row.lock_token = uuid.uuid4() + sync_row.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + sync_row.lock_owner = ServiceRouterWorkerSyncPipeline.__name__ + await session.commit() + + item = _sync_row_to_pipeline_item(sync_row) + new_token = uuid.uuid4() + sync_row.lock_token = new_token + await session.commit() + + await worker.process(item) + await session.refresh(sync_row) + + assert sync_row.lock_token == new_token + assert sync_row.lock_owner == ServiceRouterWorkerSyncPipeline.__name__ + + async def test_marks_sync_row_deleted_when_run_not_running( + self, + test_db, + session: AsyncSession, + worker: ServiceRouterWorkerSyncWorker, + ): + project = await create_project(session=session) 
+ user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.DONE, + run_spec=_router_service_run_spec(repo.name), + ) + sync_row = await _add_service_router_worker_sync_row(session, run.id) + sync_row.lock_token = uuid.uuid4() + sync_row.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + sync_row.lock_owner = ServiceRouterWorkerSyncPipeline.__name__ + await session.commit() + + await worker.process(_sync_row_to_pipeline_item(sync_row)) + await session.refresh(sync_row) + + assert sync_row.deleted is True + assert sync_row.lock_token is None + assert sync_row.lock_expires_at is None + assert sync_row.lock_owner is None + + # This can happen when a run previously had a router replica group (so a sync row was created), + # but later its configuration/run_spec is updated (e.g. re-apply) to remove the router group. + async def test_marks_sync_row_deleted_when_no_router_replica_group( + self, + test_db, + session: AsyncSession, + worker: ServiceRouterWorkerSyncWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + run_spec=get_run_spec(repo_id=repo.name, run_name="task-run"), + ) + sync_row = await _add_service_router_worker_sync_row(session, run.id) + sync_row.lock_token = uuid.uuid4() + sync_row.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + sync_row.lock_owner = ServiceRouterWorkerSyncPipeline.__name__ + await session.commit() + + await worker.process(_sync_row_to_pipeline_item(sync_row)) + await session.refresh(sync_row) + + assert sync_row.deleted is True + assert sync_row.lock_token is None + + async def 
test_process_calls_sync_and_unlocks_on_success( + self, + test_db, + session: AsyncSession, + worker: ServiceRouterWorkerSyncWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + run_spec=_router_service_run_spec(repo.name), + ) + sync_row = await _add_service_router_worker_sync_row(session, run.id) + sync_row.lock_token = uuid.uuid4() + sync_row.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + sync_row.lock_owner = ServiceRouterWorkerSyncPipeline.__name__ + await session.commit() + item = _sync_row_to_pipeline_item(sync_row) + + with patch( + "dstack._internal.server.background.pipeline_tasks.service_router_worker_sync" + ".sync_router_workers_for_run_model", + new_callable=AsyncMock, + ) as sync_mock: + await worker.process(item) + + sync_mock.assert_awaited_once() + # `await_args` is Optional in stubs; assert for type-checkers. 
+ assert sync_mock.await_args is not None + called_run = sync_mock.await_args.args[0] + assert isinstance(called_run, RunModel) + assert called_run.id == run.id + + await session.refresh(sync_row) + assert sync_row.deleted is False + assert sync_row.lock_token is None + assert sync_row.lock_expires_at is None + assert sync_row.lock_owner is None + assert sync_row.last_processed_at is not None diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_submitted_jobs.py b/src/tests/_internal/server/background/pipeline_tasks/test_submitted_jobs.py new file mode 100644 index 0000000000..b00ce59029 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_submitted_jobs.py @@ -0,0 +1,2723 @@ +import asyncio +import uuid +from datetime import timedelta +from typing import cast +from unittest.mock import AsyncMock, Mock, call, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import EntityReference, NetworkMode, RegistryAuth +from dstack._internal.core.models.configurations import ServiceConfiguration, TaskConfiguration +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.fleets import FleetNodesSpec, InstanceGroupPlacement +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.profiles import ( + FleetInstanceSelector, + InstanceHostnameSelector, + InstanceNameSelector, + InstanceSelector, + Profile, +) +from dstack._internal.core.models.resources import CPUSpec, Memory, Range, ResourcesSpec +from dstack._internal.core.models.runs import JobRuntimeData, JobStatus, JobTerminationReason +from dstack._internal.core.models.users import 
GlobalRole +from dstack._internal.core.models.volumes import ( + VolumeAttachmentData, + VolumeMountPoint, + VolumeStatus, +) +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.jobs_submitted import ( + JobSubmittedFetcher, + JobSubmittedPipeline, + JobSubmittedPipelineItem, + JobSubmittedWorker, + _load_submitted_job_context, +) +from dstack._internal.server.models import ( + ComputeGroupModel, + FleetModel, + InstanceModel, + JobModel, + PlacementGroupModel, + VolumeAttachmentModel, +) +from dstack._internal.server.services.docker import ImageConfig +from dstack._internal.server.services.jobs.configurators.base import JobConfigurator +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_export, + create_fleet, + create_instance, + create_job, + create_placement_group, + create_project, + create_repo, + create_run, + create_secret, + create_user, + create_volume, + get_compute_group_provisioning_data, + get_fleet_spec, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_placement_group_provisioning_data, + get_remote_connection_info, + get_run_spec, + get_ssh_fleet_configuration, + get_volume_provisioning_data, +) +from dstack._internal.utils.common import get_current_datetime + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +@pytest.fixture +def fetcher() -> JobSubmittedFetcher: + return JobSubmittedFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=4), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> JobSubmittedWorker: + return JobSubmittedWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +def _lock_job_foreign(job_model: JobModel) -> None: + job_model.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + job_model.lock_token = uuid.uuid4() + job_model.lock_owner = "OtherPipeline" + + +def 
_lock_job_expired_same_owner(job_model: JobModel) -> None: + job_model.lock_expires_at = get_current_datetime() - timedelta(minutes=1) + job_model.lock_token = uuid.uuid4() + job_model.lock_owner = JobSubmittedPipeline.__name__ + + +def _lock_job(job_model: JobModel) -> None: + job_model.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + job_model.lock_token = uuid.uuid4() + job_model.lock_owner = JobSubmittedPipeline.__name__ + + +def _job_to_pipeline_item(job_model: JobModel) -> JobSubmittedPipelineItem: + assert job_model.lock_token is not None + assert job_model.lock_expires_at is not None + return JobSubmittedPipelineItem( + __tablename__=job_model.__tablename__, + id=job_model.id, + lock_expires_at=job_model.lock_expires_at, + lock_token=job_model.lock_token, + prev_lock_expired=False, + ) + + +async def _process_job( + session: AsyncSession, + worker: JobSubmittedWorker, + job_model: JobModel, +) -> None: + _lock_job(job_model) + await session.commit() + await worker.process(_job_to_pipeline_item(job_model)) + + +async def _get_job(session: AsyncSession, job_id) -> JobModel: + res = await session.execute( + select(JobModel) + .where(JobModel.id == job_id) + .options(joinedload(JobModel.instance)) + .options(joinedload(JobModel.fleet)) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() + + +async def _get_placement_groups( + session: AsyncSession, + fleet_id: uuid.UUID, +) -> list[PlacementGroupModel]: + res = await session.execute( + select(PlacementGroupModel) + .where(PlacementGroupModel.fleet_id == fleet_id) + .execution_options(populate_existing=True) + ) + return list(res.scalars().all()) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestJobSubmittedFetcher: + async def test_fetch_selects_eligible_jobs_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: JobSubmittedFetcher + ): + project = await 
create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run = await create_run(session=session, project=project, repo=repo, user=user, fleet=fleet) + run_without_fleet = await create_run( + session=session, project=project, repo=repo, user=user + ) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + assignment_job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=1), + last_processed_at=stale - timedelta(seconds=2), + instance_assigned=False, + ) + provisioning_job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale - timedelta(seconds=1), + instance_assigned=True, + job_num=1, + ) + # submitted_at == last_processed_at bypasses the min_processing_interval filter + # so freshly submitted jobs are picked up immediately + fresh_job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + submitted_at=now, + last_processed_at=now, + job_num=2, + ) + waiting_master = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=3), + last_processed_at=stale - timedelta(seconds=3), + waiting_master_job=True, + job_num=3, + ) + waiting_run_fleet = await create_job( + session=session, + run=run_without_fleet, + status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=3), + last_processed_at=stale - timedelta(seconds=3), + waiting_master_job=True, + job_num=3, + ) + recent_retry = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=4), + last_processed_at=now - timedelta(seconds=1), + job_num=4, + ) + foreign_locked = await create_job( + session=session, + run=run, + 
status=JobStatus.SUBMITTED, + submitted_at=stale - timedelta(minutes=5), + last_processed_at=stale - timedelta(seconds=4), + job_num=5, + ) + _lock_job_foreign(foreign_locked) + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [ + assignment_job.id, + provisioning_job.id, + fresh_job.id, + ] + for job in [ + assignment_job, + provisioning_job, + fresh_job, + waiting_master, + waiting_run_fleet, + recent_retry, + foreign_locked, + ]: + await session.refresh(job) + + fetched_jobs = [assignment_job, provisioning_job, fresh_job] + assert all(job.lock_owner == JobSubmittedPipeline.__name__ for job in fetched_jobs) + assert all(job.lock_expires_at is not None for job in fetched_jobs) + assert all(job.lock_token is not None for job in fetched_jobs) + assert len({job.lock_token for job in fetched_jobs}) == 1 + + assert waiting_master.lock_owner is None + assert waiting_run_fleet.lock_owner is None + assert recent_retry.lock_owner is None + assert foreign_locked.lock_owner == "OtherPipeline" + + async def test_fetch_orders_by_priority_then_last_processed_at( + self, test_db, session: AsyncSession, fetcher: JobSubmittedFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + now = get_current_datetime() + + low_priority_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="low-priority-run", + priority=1, + ) + high_priority_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="high-priority-run", + priority=10, + ) + + low_priority_job = await create_job( + session=session, + run=low_priority_run, + submitted_at=now - timedelta(minutes=3), + last_processed_at=now - timedelta(minutes=2), + ) + newer_high_priority_job = await create_job( + session=session, + run=high_priority_run, + submitted_at=now - 
timedelta(minutes=4), + last_processed_at=now - timedelta(minutes=1), + ) + older_high_priority_job = await create_job( + session=session, + run=high_priority_run, + submitted_at=now - timedelta(minutes=5), + last_processed_at=now - timedelta(minutes=2, seconds=30), + ) + + items = await fetcher.fetch(limit=3) + + assert [item.id for item in items] == [ + older_high_priority_job.id, + newer_high_priority_job.id, + low_priority_job.id, + ] + + async def test_fetch_retries_expired_same_owner_lock_and_respects_limit( + self, test_db, session: AsyncSession, fetcher: JobSubmittedFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + oldest = await create_job( + session=session, + run=run, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale - timedelta(seconds=2), + ) + expired_same_owner = await create_job( + session=session, + run=run, + submitted_at=stale - timedelta(minutes=1), + last_processed_at=stale - timedelta(seconds=1), + ) + newest = await create_job( + session=session, + run=run, + submitted_at=stale, + last_processed_at=stale, + ) + _lock_job_expired_same_owner(expired_same_owner) + await session.commit() + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, expired_same_owner.id] + + await session.refresh(expired_same_owner) + assert expired_same_owner.lock_owner == JobSubmittedPipeline.__name__ + assert expired_same_owner.lock_token is not None + assert expired_same_owner.lock_expires_at is not None + + await session.refresh(newest) + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestJobSubmittedWorker: + async def 
test_provisions_assigned_job_on_existing_instance( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + busy_blocks=1, + ) + job = await create_job( + session=session, + run=run, + instance=instance, + instance_assigned=True, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + ) + previous_last_processed_at = job.last_processed_at + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.PROVISIONING + assert job.instance_assigned + assert job.instance is not None and job.instance.id == instance.id + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner is None + assert job.lock_token is None + assert job.lock_expires_at is None + + async def test_provisions_new_capacity_for_assigned_job_with_placeholder( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=1) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + 
backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = [offer] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + dockerized=True, + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None + placeholder_id = job.instance.id + assert job.used_instance_id == placeholder_id + assert job.instance.status == InstanceStatus.PENDING + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.PROVISIONING + assert job.instance is not None + assert job.instance.id == placeholder_id + assert job.used_instance_id == placeholder_id + assert job.instance.status == InstanceStatus.PROVISIONING + assert job.instance.fleet_id == fleet.id + assert job.instance.offer is not None + assert job.instance.provisioning_job_id == job.id # never cleared + res = await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + assert len(res.scalars().all()) == 1 + + async def test_multinode_master_reuses_placeholder_when_provisioning_falls_back_to_run_job( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=1) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + run_spec = get_run_spec( + run_name="run", + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2), + ) + run = await create_run( + session=session, + run_name="run", + 
project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=True, + ) + + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + compute_mock.get_offers.return_value = [offer] + compute_mock.run_job.return_value = get_job_provisioning_data( + dockerized=True, + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=master_job) + + master_job = await _get_job(session, master_job.id) + worker_job = await _get_job(session, worker_job.id) + assert master_job.status == JobStatus.SUBMITTED + assert master_job.instance_assigned + assert master_job.instance is not None + placeholder_id = master_job.instance.id + assert master_job.instance.status == InstanceStatus.PENDING + assert master_job.used_instance_id == placeholder_id + assert master_job.fleet_id == fleet.id + assert worker_job.waiting_master_job + compute_mock.run_job.assert_not_called() + compute_mock.run_jobs.assert_not_called() + + competing_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + backend=BackendType.AWS, + job_provisioning_data=get_job_provisioning_data(backend=BackendType.AWS), + ) + + await _process_job(session=session, worker=worker, job_model=master_job) + + master_job = await _get_job(session, master_job.id) + worker_job = await _get_job(session, worker_job.id) + assert master_job.status == JobStatus.PROVISIONING + assert master_job.instance is not None + assert 
master_job.instance.id == placeholder_id + assert master_job.instance.status == InstanceStatus.PROVISIONING + assert master_job.used_instance_id == placeholder_id + assert worker_job.waiting_master_job is False + compute_mock.run_job.assert_called_once() + compute_mock.run_jobs.assert_not_called() + res = await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + assert {instance.id for instance in res.scalars().all()} == { + placeholder_id, + competing_instance.id, + } + + async def test_provisioning_master_job_respects_cluster_placement_in_non_empty_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + backend=BackendType.AWS, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + fleet.current_master_instance_id = instance.id + configuration = TaskConfiguration(image="debian", nodes=2) + run_spec = get_run_spec(run_name="run", repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, + run_name="run", + project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + job = await create_job(session=session, run=run, instance_assigned=True) + fleet_lock_expires_at = get_current_datetime() + timedelta(minutes=1) + fleet_lock_token = uuid.uuid4() + fleet.lock_expires_at = fleet_lock_expires_at + fleet.lock_token = fleet_lock_token + fleet.lock_owner = 
"OtherPipeline:cluster-master" + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + offer_1 = get_instance_offer_with_availability( + backend=BackendType.AWS, + region="eu-west-2", + ) + offer_2 = get_instance_offer_with_availability( + backend=BackendType.AWS, + region="eu-west-1", + ) + backend_mock.compute.return_value.get_offers.return_value = [offer_1, offer_2] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(fleet) + assert job.status == JobStatus.PROVISIONING + assert fleet.lock_owner == "OtherPipeline:cluster-master" + assert fleet.lock_token == fleet_lock_token + assert fleet.lock_expires_at == fleet_lock_expires_at + backend_mock.compute.return_value.run_job.assert_called_once() + selected_offer = backend_mock.compute.return_value.run_job.call_args[0][2] + assert selected_offer.region == "eu-west-1" + + async def test_defers_new_capacity_provisioning_until_fleet_master_is_elected( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + backend=BackendType.AWS, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + configuration = 
TaskConfiguration(image="debian", nodes=2) + run_spec = get_run_spec(run_name="run", repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, + run_name="run", + project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + job = await create_job( + session=session, + run=run, + instance_assigned=True, + waiting_master_job=False, + ) + previous_last_processed_at = job.last_processed_at + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + await _process_job(session=session, worker=worker, job_model=job) + m.assert_not_called() + + await session.refresh(job) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is None + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner is None + assert job.lock_token is None + assert job.lock_expires_at is None + hint_fetch = cast(Mock, worker._pipeline_hinter.hint_fetch) + hint_fetch.assert_has_calls([call(FleetModel.__name__), call(JobModel.__name__)]) + + async def test_provisioning_non_master_job_ignores_cluster_master_fleet_lock( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + configuration = TaskConfiguration(image="debian", nodes=2) + run_spec = get_run_spec(run_name="run", repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, + run_name="run", + project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + await create_job( + session=session, + run=run, + 
job_num=0, + instance_assigned=True, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + waiting_master_job=False, + ) + job = await create_job( + session=session, + run=run, + job_num=1, + instance_assigned=True, + waiting_master_job=False, + ) + fleet_lock_expires_at = get_current_datetime() + timedelta(minutes=1) + fleet_lock_token = uuid.uuid4() + fleet.lock_expires_at = fleet_lock_expires_at + fleet.lock_token = fleet_lock_token + fleet.lock_owner = "OtherPipeline:cluster-master" + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + offer_1 = get_instance_offer_with_availability( + backend=BackendType.AWS, + region="eu-west-2", + ) + offer_2 = get_instance_offer_with_availability( + backend=BackendType.AWS, + region="eu-west-1", + ) + backend_mock.compute.return_value.get_offers.return_value = [offer_1, offer_2] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(fleet) + assert job.status == JobStatus.PROVISIONING + assert fleet.lock_owner == "OtherPipeline:cluster-master" + assert fleet.lock_token == fleet_lock_token + assert fleet.lock_expires_at == fleet_lock_expires_at + backend_mock.compute.return_value.run_job.assert_called_once() + selected_offer = backend_mock.compute.return_value.run_job.call_args[0][2] + assert selected_offer.region == "eu-west-1" + + async def test_creates_placement_group_for_cluster_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + 
fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_name="test-run", + run_spec=get_run_spec(run_name="test-run", repo_id=repo.name), + ) + job = await create_job(session=session, run=run, instance_assigned=True) + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + compute_mock.get_offers.return_value = [offer] + compute_mock.run_job.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + ) + compute_mock.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(fleet) + assert job.status == JobStatus.PROVISIONING + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + compute_mock.create_placement_group.assert_called_once() + compute_mock.run_job.assert_called_once() + assert isinstance(compute_mock.run_job.call_args[0][6], PlacementGroup) + placement_group = (await session.execute(select(PlacementGroupModel))).scalar() + assert placement_group is not None + + async def test_marks_unused_existing_placement_groups_for_cleanup( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = 
get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + selected_pg = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="selected-pg", + ) + await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="stale-pg", + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_name="test-run", + run_spec=get_run_spec(run_name="test-run", repo_id=repo.name), + ) + job = await create_job(session=session, run=run, instance_assigned=True) + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + compute_mock.get_offers.return_value = [offer] + compute_mock.is_suitable_placement_group.side_effect = ( + lambda placement_group, _: placement_group.name == selected_pg.name + ) + compute_mock.run_job.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.PROVISIONING + placement_groups = await _get_placement_groups(session=session, fleet_id=fleet.id) + assert {placement_group.name for placement_group in placement_groups} == { + "selected-pg", + "stale-pg", + } + placement_groups_by_name = { + placement_group.name: placement_group for placement_group in placement_groups + } + assert not placement_groups_by_name["selected-pg"].fleet_deleted + assert placement_groups_by_name["stale-pg"].fleet_deleted + 
compute_mock.create_placement_group.assert_not_called() + + async def test_marks_new_and_existing_placement_groups_for_cleanup_on_failed_provisioning( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="existing-pg", + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_name="test-run", + run_spec=get_run_spec(run_name="test-run", repo_id=repo.name), + ) + job = await create_job(session=session, run=run, instance_assigned=True) + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + compute_mock.get_offers.return_value = [offer] + compute_mock.is_suitable_placement_group.return_value = False + compute_mock.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + compute_mock.run_job.side_effect = BackendError("boom") + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + placement_groups = await _get_placement_groups(session=session, fleet_id=fleet.id) + assert len(placement_groups) == 2 + placement_groups_by_name = { + placement_group.name: placement_group for 
placement_group in placement_groups + } + assert placement_groups_by_name["existing-pg"].fleet_deleted + new_placement_groups = [ + placement_group + for placement_group in placement_groups + if placement_group.name != "existing-pg" + ] + assert len(new_placement_groups) == 1 + assert new_placement_groups[0].fleet_deleted + compute_mock.create_placement_group.assert_called_once() + + async def test_resets_lock_for_retry_when_cluster_master_fleet_lock_is_unavailable( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + fleet.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + fleet.lock_token = uuid.uuid4() + fleet.lock_owner = "OtherPipeline:cluster-master" + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + ) + job = await create_job(session=session, run=run, instance_assigned=True) + previous_last_processed_at = job.last_processed_at + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + await _process_job(session=session, worker=worker, job_model=job) + m.assert_not_called() + + await session.refresh(job) + await session.refresh(fleet) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner == JobSubmittedPipeline.__name__ + assert job.lock_token is None + assert job.lock_expires_at is None + assert fleet.lock_owner == "OtherPipeline:cluster-master" + assert fleet.lock_token is not None + assert 
fleet.lock_expires_at is not None + + async def test_reclaims_stale_related_cluster_master_fleet_lock( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + ) + job = await create_job(session=session, run=run, instance_assigned=True) + fleet.lock_expires_at = get_current_datetime() - timedelta(minutes=1) + fleet.lock_token = uuid.uuid4() + fleet.lock_owner = f"{JobSubmittedPipeline.__name__}:{job.id}" + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS) + ] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(fleet) + assert job.status == JobStatus.PROVISIONING + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + backend_mock.compute.return_value.run_job.assert_called_once() + + async def test_processes_assignment_and_provisioning_in_separate_passes( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) 
+ repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == instance.id + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.PROVISIONING + assert job.instance_assigned + assert job.instance is not None and job.instance.id == instance.id + + async def test_ignores_lock_token_mismatch( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + _lock_job(job) + await session.commit() + item = _job_to_pipeline_item(job) + + job.lock_token = uuid.uuid4() + await session.commit() + + await worker.process(item) + + await session.refresh(job) + assert job.status == JobStatus.SUBMITTED + assert job.lock_token is not None + + async def test_assigns_job_to_instance( + self, + test_db, + session: AsyncSession, + worker: JobSubmittedWorker, + monkeypatch: pytest.MonkeyPatch, + ): + get_targeted_instance_offers_mock = AsyncMock() + monkeypatch.setattr( + "dstack._internal.server.background.pipeline_tasks.jobs_submitted.get_targeted_instance_offers", + get_targeted_instance_offers_mock, + ) + project = await 
create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + previous_last_processed_at = job.last_processed_at + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + await session.refresh(instance) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == instance.id + assert job.used_instance_id == instance.id + assert job.fleet_id == fleet.id + assert job.job_provisioning_data == instance.job_provisioning_data + assert job.job_runtime_data is not None + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner is None + assert job.lock_token is None + assert job.lock_expires_at is None + assert instance.status == InstanceStatus.BUSY + assert instance.busy_blocks == 1 + get_targeted_instance_offers_mock.assert_not_awaited() + + async def test_assigns_job_to_specific_instance( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project, name="my-fleet") + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="my-fleet-0", + ) + selected = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="my-fleet-1", + ) + run_spec = get_run_spec( + repo_id=repo.name, + 
profile=Profile(instances=[InstanceNameSelector(name="my-fleet-1")]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == selected.id + assert job.fleet_id == fleet.id + + async def test_assigns_job_to_specific_hostname( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project, name="my-fleet") + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + remote_connection_info=get_remote_connection_info(host="192.168.1.10"), + ) + selected = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + remote_connection_info=get_remote_connection_info(host="192.168.1.11"), + ) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(instances=[InstanceHostnameSelector(hostname="192.168.1.11")]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == selected.id + assert job.fleet_id == fleet.id + + async def test_assigns_service_replicas_to_specific_shared_instance_blocks( + self, test_db, 
session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project, name="my-fleet") + selected = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="shared-worker", + total_blocks=2, + busy_blocks=0, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=ServiceConfiguration( + port=8080, + commands=["echo"], + replicas=Range[int](min=2, max=2), + resources=ResourcesSpec( + cpu=CPUSpec.parse("1"), + memory=Range[Memory](min=Memory.parse("1GB"), max=None), + gpu=None, + ), + ), + profile=Profile(instances=[InstanceNameSelector(name="shared-worker")]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + first_job = await create_job(session=session, run=run, replica_num=0) + second_job = await create_job(session=session, run=run, replica_num=1) + + await _process_job(session=session, worker=worker, job_model=first_job) + await session.refresh(selected) + assert selected.busy_blocks == 1 + + await _process_job(session=session, worker=worker, job_model=second_job) + + first_job = await _get_job(session, first_job.id) + second_job = await _get_job(session, second_job.id) + await session.refresh(selected) + assert first_job.instance is not None and first_job.instance.id == selected.id + assert second_job.instance is not None and second_job.instance.id == selected.id + assert selected.status == InstanceStatus.BUSY + assert selected.busy_blocks == 2 + + async def test_specific_instance_assignment_stays_in_run_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await 
create_repo(session=session, project_id=project.id) + run_fleet = await create_fleet(session=session, project=project, name="run-fleet") + other_fleet = await create_fleet(session=session, project=project, name="other-fleet") + selected = await create_instance( + session=session, + project=project, + fleet=run_fleet, + status=InstanceStatus.IDLE, + name="run-fleet-0", + price=10, + ) + await create_instance( + session=session, + project=project, + fleet=other_fleet, + status=InstanceStatus.IDLE, + name="other-fleet-0", + price=1, + ) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile( + instances=[ + InstanceNameSelector(name="run-fleet-0"), + InstanceNameSelector(name="other-fleet-0"), + ] + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=run_fleet, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == selected.id + assert job.fleet_id == run_fleet.id + + async def test_assigns_job_to_specific_instance_in_imported_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + exporter_user = await create_user( + session, name="exporter-user", global_role=GlobalRole.USER + ) + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + repo = await create_repo(session=session, project_id=importer_project.id) + local_fleet = await create_fleet( + session=session, + project=importer_project, + name="same-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + 
exported_fleet = await create_fleet( + session=session, + project=exporter_project, + name="same-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=importer_project, + fleet=local_fleet, + status=InstanceStatus.IDLE, + instance_num=1, + name="local-worker", + ) + selected = await create_instance( + session=session, + project=exporter_project, + fleet=exported_fleet, + status=InstanceStatus.IDLE, + instance_num=1, + name="exported-worker", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[exported_fleet], + ) + selectors: list[InstanceSelector] = [ + FleetInstanceSelector( + fleet=EntityReference.parse("exporter-project/same-fleet"), + instance=1, + ) + ] + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(instances=selectors), + ) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=importer_user, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and job.instance.id == selected.id + assert job.fleet_id == exported_fleet.id + + async def test_does_not_assign_multinode_job_without_enough_specific_instances( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + 
status=InstanceStatus.IDLE, + name="shared-worker", + backend=BackendType.AWS, + total_blocks=2, + busy_blocks=0, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2, commands=["echo"]), + profile=Profile(instances=[InstanceNameSelector(name="shared-worker")]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=True, + ) + + await _process_job(session=session, worker=worker, job_model=master_job) + + master_job = await _get_job(session, master_job.id) + await session.refresh(worker_job) + await session.refresh(instance) + assert master_job.status == JobStatus.TERMINATING + assert ( + master_job.termination_reason + == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + ) + assert not master_job.instance_assigned + assert worker_job.waiting_master_job + assert instance.status == InstanceStatus.IDLE + assert instance.busy_blocks == 0 + + async def test_assigns_multinode_jobs_to_specific_shared_ssh_instances( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + get_ssh_fleet_configuration( + hosts=["10.0.0.1", "10.0.0.2"], + placement=InstanceGroupPlacement.CLUSTER, + blocks=2, + ) + ), + ) + selected_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="worker-0", + backend=BackendType.REMOTE, + region="remote", + price=1, + total_blocks=2, + busy_blocks=0, + 
offer=get_instance_offer_with_availability( + backend=BackendType.REMOTE, + region="remote", + cpu_count=2, + memory_gib=4, + total_blocks=2, + ), + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.REMOTE, + region="remote", + cpu_count=2, + memory_gib=4, + ), + ) + selected_worker = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="worker-1", + backend=BackendType.REMOTE, + region="remote", + price=2, + instance_num=1, + total_blocks=2, + busy_blocks=0, + offer=get_instance_offer_with_availability( + backend=BackendType.REMOTE, + region="remote", + cpu_count=2, + memory_gib=4, + total_blocks=2, + ), + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.REMOTE, + region="remote", + cpu_count=2, + memory_gib=4, + ), + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + image="debian", + nodes=2, + commands=["echo"], + resources=ResourcesSpec( + cpu=CPUSpec.parse("1.."), + memory=Range[Memory](min=Memory.parse("1GB"), max=None), + gpu=None, + ), + ), + profile=Profile( + instances=[ + InstanceNameSelector(name="worker-0"), + InstanceNameSelector(name="worker-1"), + ] + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=True, + ) + + await _process_job(session=session, worker=worker, job_model=master_job) + master_job = await _get_job(session, master_job.id) + assert master_job.instance is not None and master_job.instance.id == selected_master.id + + await _process_job(session=session, worker=worker, job_model=master_job) + master_job = await _get_job(session, master_job.id) + await session.refresh(worker_job) + assert master_job.status == 
JobStatus.PROVISIONING + assert worker_job.waiting_master_job is False + + await _process_job(session=session, worker=worker, job_model=worker_job) + + worker_job = await _get_job(session, worker_job.id) + await session.refresh(selected_master) + await session.refresh(selected_worker) + assert worker_job.instance is not None and worker_job.instance.id == selected_worker.id + assert selected_master.busy_blocks == 2 + assert selected_worker.busy_blocks == 2 + master_runtime = JobRuntimeData.__response__.parse_raw(master_job.job_runtime_data) + worker_runtime = JobRuntimeData.__response__.parse_raw(worker_job.job_runtime_data) + assert master_runtime.network_mode == NetworkMode.HOST + assert worker_runtime.network_mode == NetworkMode.HOST + assert master_runtime.offer is not None and master_runtime.offer.blocks == 2 + assert worker_runtime.offer is not None and worker_runtime.offer.blocks == 2 + + async def test_assigns_multinode_jobs_to_specific_instances_in_same_cluster_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + selected_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="worker-0", + backend=BackendType.AWS, + region="eu-west-1", + price=1, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + selected_worker = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + name="worker-1", + backend=BackendType.AWS, + region="eu-west-1", + price=2, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + run_spec = get_run_spec( + 
repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2, commands=["echo"]), + profile=Profile( + instances=[ + InstanceNameSelector(name="worker-0"), + InstanceNameSelector(name="worker-1"), + ] + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=True, + ) + + await _process_job(session=session, worker=worker, job_model=master_job) + master_job = await _get_job(session, master_job.id) + assert master_job.instance is not None and master_job.instance.id == selected_master.id + assert master_job.status == JobStatus.SUBMITTED + + await _process_job(session=session, worker=worker, job_model=master_job) + master_job = await _get_job(session, master_job.id) + await session.refresh(worker_job) + assert master_job.status == JobStatus.PROVISIONING + assert worker_job.waiting_master_job is False + + await _process_job(session=session, worker=worker, job_model=worker_job) + + worker_job = await _get_job(session, worker_job.id) + await session.refresh(selected_master) + await session.refresh(selected_worker) + assert worker_job.status == JobStatus.SUBMITTED + assert worker_job.instance is not None and worker_job.instance.id == selected_worker.id + assert selected_master.busy_blocks == 1 + assert selected_worker.busy_blocks == 1 + + async def test_does_not_create_capacity_when_specific_instance_is_missing( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + await create_fleet(session=session, project=project) + run_spec = get_run_spec( + repo_id=repo.name, + 
profile=Profile(instances=[InstanceNameSelector(name="missing-instance")]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + res = await session.execute(select(InstanceModel)) + assert res.scalars().all() == [] + + async def test_assigns_job_to_imported_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + exporter_user = await create_user( + session, name="exporter-user", global_role=GlobalRole.USER + ) + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + repo = await create_repo(session=session, project_id=importer_project.id) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=importer_user, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.instance is not None and 
job.instance.id == instance.id + assert job.fleet_id == fleet.id + + async def test_assigns_job_to_specific_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_1 = await create_fleet(session=session, project=project, name="fleet-1") + fleet_2 = await create_fleet(session=session, project=project, name="fleet-2") + await create_instance( + session=session, + project=project, + fleet=fleet_1, + status=InstanceStatus.IDLE, + name="fleet-1-instance", + ) + instance_2 = await create_instance( + session=session, + project=project, + fleet=fleet_2, + status=InstanceStatus.IDLE, + name="fleet-2-instance", + ) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(name="default", fleets=[fleet_2.name]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.instance_assigned + assert job.instance is not None and job.instance.id == instance_2.id + assert job.fleet_id == fleet_2.id + + async def test_assignment_creates_placeholder_instance_for_new_capacity( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + 
backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = [offer] + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.SUBMITTED + assert job.instance_assigned + assert job.fleet_id == fleet.id + assert job.used_instance_id is not None + # Query the placeholder instance directly to avoid stale session cache + res = await session.execute( + select(InstanceModel) + .where(InstanceModel.id == job.used_instance_id) + .execution_options(populate_existing=True) + ) + placeholder = res.scalar_one() + assert placeholder.status == InstanceStatus.PENDING + assert placeholder.provisioning_job_id == job.id + assert placeholder.fleet_id == fleet.id + assert placeholder.offer is None + assert placeholder.instance_num == 0 + + @pytest.mark.parametrize("fleet_type", ["cloud", "ssh"]) + async def test_job_fails_when_fleet_is_full( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker, fleet_type: str + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + if fleet_type == "cloud": + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=1) + else: + fleet_spec = get_fleet_spec(get_ssh_fleet_configuration(hosts=["10.0.0.1"])) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + ) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + 
m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = [offer] + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + assert job.termination_reason_message == "Fleet is at capacity" + # No placeholder must be committed when the fleet is full. + res = await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + assert len(res.scalars().all()) == 1 + + async def test_leaves_placeholder_for_terminating_pipeline_on_failed_new_capacity_provisioning( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + ) + placeholder = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + backend=BackendType.AWS, + ) + job = await create_job( + session=session, run=run, instance=placeholder, instance_assigned=True + ) + placeholder.provisioning_job_id = job.id + await session.commit() + + offer = get_instance_offer_with_availability(backend=BackendType.AWS) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value 
= compute_mock + m.return_value = [backend_mock] + compute_mock.get_offers.return_value = [offer] + compute_mock.run_job.side_effect = BackendError("boom") + + await _process_job(session=session, worker=worker, job_model=job) + + job = await _get_job(session, job.id) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + await session.refresh(placeholder) + assert not placeholder.deleted + assert placeholder.status == InstanceStatus.PENDING + + async def test_provisions_compute_group( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration = TaskConfiguration(nodes=2, commands=["echo"]) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + job1 = await create_job( + session=session, + run=run, + instance_assigned=True, + job_num=0, + waiting_master_job=False, + ) + job2 = await create_job( + session=session, + run=run, + instance_assigned=False, + job_num=1, + waiting_master_job=True, + ) + + offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.get_offers.return_value = [offer] + compute_mock.run_jobs.return_value = get_compute_group_provisioning_data( + job_provisioning_datas=[ + get_job_provisioning_data(dockerized=True, backend=BackendType.RUNPOD), + get_job_provisioning_data(dockerized=True, 
backend=BackendType.RUNPOD), + ] + ) + + await _process_job(session=session, worker=worker, job_model=job1) + + await session.refresh(job1) + await session.refresh(job2) + assert job1.status == JobStatus.PROVISIONING + assert job2.status == JobStatus.PROVISIONING + res = await session.execute(select(ComputeGroupModel)) + assert res.scalar_one_or_none() is not None + + async def test_defers_job_while_waiting_for_master_provisioning( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration = TaskConfiguration(nodes=2, commands=["echo"]) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + await create_job( + session=session, + run=run, + job_num=0, + waiting_master_job=False, + ) + job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=False, + ) + previous_last_processed_at = job.last_processed_at + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.SUBMITTED + assert not job.instance_assigned + assert job.instance_id is None + assert job.fleet_id is None + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner is None + assert job.lock_token is None + assert job.lock_expires_at is None + + async def test_defers_job_while_waiting_for_run_fleet_assignment( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration = TaskConfiguration(nodes=2, commands=["echo"]) + run = await 
create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + await create_job( + session=session, + run=run, + job_num=0, + instance_assigned=True, + job_provisioning_data=get_job_provisioning_data(), + waiting_master_job=False, + ) + job = await create_job( + session=session, + run=run, + job_num=1, + waiting_master_job=False, + ) + previous_last_processed_at = job.last_processed_at + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.SUBMITTED + assert not job.instance_assigned + assert job.fleet_id is None + assert job.last_processed_at > previous_last_processed_at + assert job.lock_owner is None + assert job.lock_token is None + assert job.lock_expires_at is None + + async def test_terminates_job_when_volume_preparation_fails( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(), + ) + volume.to_be_deleted = True + await session.commit() + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration.volumes = [VolumeMountPoint(name=volume.name, path="/volume")] + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.VOLUME_ERROR + assert job.termination_reason_message is not None + assert "marked for deletion" in job.termination_reason_message + assert job.lock_owner is None + assert 
job.lock_token is None + assert job.lock_expires_at is None + + async def test_terminates_job_when_specified_fleets_cannot_be_used( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(name="default", fleets=["missing-fleet"]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + assert job.termination_reason_message == "Failed to use specified fleets" + + async def test_terminates_job_when_no_matching_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + assert job.termination_reason_message is not None + assert "No matching fleet found" in job.termination_reason_message + + async def test_terminates_legacy_autocreated_job_with_no_fleet( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await 
create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + # Simulate legacy in-flight state: instance_assigned=True but no fleet + job = await create_job(session=session, run=run, instance_assigned=True) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + assert job.termination_reason_message is not None + assert "No matching fleet found" in job.termination_reason_message + + async def test_resets_lock_for_retry_when_existing_instance_offer_cannot_be_locked( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + instance.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + instance.lock_token = uuid.uuid4() + instance.lock_owner = "OtherPipeline" + await session.commit() + + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job(session=session, run=run) + previous_last_processed_at = job.last_processed_at + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(instance) + assert job.status == JobStatus.SUBMITTED + assert not job.instance_assigned + assert job.instance_id is None + assert job.used_instance_id is None + assert job.last_processed_at > previous_last_processed_at + # lock_owner is intentionally preserved so the fetcher can distinguish + # an in-progress lock from a reset that came from this pipeline + 
assert job.lock_owner == JobSubmittedPipeline.__name__ + assert job.lock_token is None + assert job.lock_expires_at is None + assert instance.status == InstanceStatus.IDLE + assert instance.busy_blocks == 0 + + async def test_attaches_volume_on_existing_instance( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(), + backend=BackendType.AWS, + region="us-east-1", + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + busy_blocks=1, + backend=BackendType.AWS, + region="us-east-1", + ) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration.volumes = [VolumeMountPoint(name=volume.name, path="/volume")] + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + job = await create_job( + session=session, + run=run, + instance=instance, + instance_assigned=True, + ) + + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.attach_volume.return_value = VolumeAttachmentData() + + await _process_job(session=session, worker=worker, job_model=job) + + res = await session.execute( + select(JobModel) + .where(JobModel.id == job.id) + .options( + joinedload(JobModel.instance) + .joinedload(InstanceModel.volume_attachments) + .joinedload(VolumeAttachmentModel.volume) + ) + .execution_options(populate_existing=True) 
+ ) + job = res.unique().scalar_one() + await session.refresh(volume) + assert job.status == JobStatus.PROVISIONING + assert job.instance is not None + assert len(job.instance.volume_attachments) == 1 + assert job.instance.volume_attachments[0].volume_id == volume.id + assert volume.lock_owner is None + assert volume.lock_token is None + assert volume.lock_expires_at is None + backend_mock.compute.return_value.attach_volume.assert_called_once() + + async def test_terminates_job_when_volume_is_locked_for_processing( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(), + backend=BackendType.AWS, + region="us-east-1", + ) + volume.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + volume.lock_token = uuid.uuid4() + volume.lock_owner = "OtherPipeline" + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + busy_blocks=1, + backend=BackendType.AWS, + region="us-east-1", + ) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration.volumes = [VolumeMountPoint(name=volume.name, path="/volume")] + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + job = await create_job( + session=session, + run=run, + instance=instance, + instance_assigned=True, + ) + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = 
Mock(spec=ComputeMockSpec) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(volume) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.VOLUME_ERROR + assert job.termination_reason_message is not None + assert "locked for processing" in job.termination_reason_message + assert volume.lock_owner == "OtherPipeline" + assert volume.lock_token is not None + assert volume.lock_expires_at is not None + backend_mock.compute.return_value.attach_volume.assert_not_called() + + async def test_reclaims_stale_related_volume_lock( + self, test_db, session: AsyncSession, worker: JobSubmittedWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(), + backend=BackendType.AWS, + region="us-east-1", + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + busy_blocks=1, + backend=BackendType.AWS, + region="us-east-1", + ) + run_spec = get_run_spec(repo_id=repo.name) + run_spec.configuration.volumes = [VolumeMountPoint(name=volume.name, path="/volume")] + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + job = await create_job( + session=session, + run=run, + instance=instance, + instance_assigned=True, + ) + volume.lock_expires_at = get_current_datetime() - timedelta(minutes=1) + volume.lock_token = uuid.uuid4() + volume.lock_owner = f"{JobSubmittedPipeline.__name__}:{job.id}" + await session.commit() + + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + 
backend_mock = Mock() + m.return_value = backend_mock + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.attach_volume.return_value = VolumeAttachmentData() + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + await session.refresh(volume) + assert job.status == JobStatus.PROVISIONING + assert volume.lock_owner is None + assert volume.lock_token is None + assert volume.lock_expires_at is None + backend_mock.compute.return_value.attach_volume.assert_called_once() + + async def test_run_job_uses_server_default_registry( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: JobSubmittedWorker, + ): + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY", "registry.example") + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME", "server-user" + ) + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD", "server-pass" + ) + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=TaskConfiguration(image="ubuntu"), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + job = await create_job(session=session, run=run, instance_assigned=True) + + offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + backend_mock.compute.return_value.get_offers.return_value = [offer] + 
backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + dockerized=False, backend=BackendType.RUNPOD + ) + + await _process_job(session=session, worker=worker, job_model=job) + + backend_mock.compute.return_value.run_job.assert_called_once() + submitted_job = backend_mock.compute.return_value.run_job.call_args[0][1] + assert submitted_job.job_spec.image_name == "registry.example/ubuntu" + assert submitted_job.job_spec.registry_auth == RegistryAuth( + username="server-user", password="server-pass" + ) + + async def test_run_jobs_uses_server_default_registry( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + worker: JobSubmittedWorker, + ): + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY", "registry.example") + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME", "server-user" + ) + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD", "server-pass" + ) + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=TaskConfiguration(image="ubuntu", nodes=2, commands=["echo"]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + job1 = await create_job( + session=session, + run=run, + instance_assigned=True, + job_num=0, + waiting_master_job=False, + ) + await create_job( + session=session, + run=run, + instance_assigned=False, + job_num=1, + waiting_master_job=True, + ) + + offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + 
backend_mock.compute.return_value = compute_mock + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.get_offers.return_value = [offer] + compute_mock.run_jobs.return_value = get_compute_group_provisioning_data( + job_provisioning_datas=[ + get_job_provisioning_data(dockerized=False, backend=BackendType.RUNPOD), + get_job_provisioning_data(dockerized=False, backend=BackendType.RUNPOD), + ] + ) + + await _process_job(session=session, worker=worker, job_model=job1) + + compute_mock.run_jobs.assert_called_once() + job_configurations = compute_mock.run_jobs.call_args[0][1] + for job_configuration in job_configurations: + assert job_configuration.job.job_spec.image_name == "registry.example/ubuntu" + assert job_configuration.job.job_spec.registry_auth == RegistryAuth( + username="server-user", password="server-pass" + ) + + async def test_interpolates_secrets_when_provisioning_new_capacity( + self, + test_db, + session: AsyncSession, + image_config_mock: ImageConfig, + worker: JobSubmittedWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + await create_secret(session=session, project=project, name="token", value="s3cret") + await create_secret( + session=session, project=project, name="registry_user", value="docker-user" + ) + await create_secret( + session=session, project=project, name="registry_pass", value="docker-pass" + ) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=TaskConfiguration( + image="ubuntu", + env=Env.parse_obj({"TOKEN": "${{ secrets.token }}"}), + registry_auth=RegistryAuth( + username="${{ secrets.registry_user }}", + password="${{ secrets.registry_pass }}", + ), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + 
run_spec=run_spec, + ) + with patch.object(JobConfigurator, "_get_image_config") as m: + m.return_value = image_config_mock + job = await create_job(session=session, run=run, instance_assigned=True) + + offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + backend_mock.compute.return_value.get_offers.return_value = [offer] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + dockerized=False, backend=BackendType.RUNPOD + ) + + await _process_job(session=session, worker=worker, job_model=job) + + backend_mock.compute.return_value.run_job.assert_called_once() + submitted_job = backend_mock.compute.return_value.run_job.call_args[0][1] + assert submitted_job.job_spec.env == {"TOKEN": "s3cret"} + assert submitted_job.job_spec.registry_auth == RegistryAuth( + username="docker-user", password="docker-pass" + ) + # The persisted JobModel keeps the unresolved literals so secrets aren't leaked. 
+ await session.refresh(job) + assert "${{ secrets.token }}" in job.job_spec_data + assert "${{ secrets.registry_user }}" in job.job_spec_data + assert "${{ secrets.registry_pass }}" in job.job_spec_data + + async def test_terminates_job_when_secret_is_missing( + self, + test_db, + session: AsyncSession, + image_config_mock: ImageConfig, + worker: JobSubmittedWorker, + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=TaskConfiguration( + image="ubuntu", + registry_auth=RegistryAuth( + username="registry_user", + password="${{ secrets.registry_pass }}", + ), + ), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + run_spec=run_spec, + ) + with patch.object(JobConfigurator, "_get_image_config") as m: + m.return_value = image_config_mock + job = await create_job(session=session, run=run, instance_assigned=True) + + offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + backend_mock.compute.return_value.get_offers.return_value = [offer] + backend_mock.compute.return_value.run_job.return_value = get_job_provisioning_data( + dockerized=False, backend=BackendType.RUNPOD + ) + + await _process_job(session=session, worker=worker, job_model=job) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER + assert job.termination_reason_message is not None + assert "Secrets interpolation error" in job.termination_reason_message + 
backend_mock.compute.return_value.run_job.assert_not_called() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestLoadSubmittedJobContext: + async def test_single_node_master_loads_only_current_job(self, test_db, session: AsyncSession): + """Master single-node: run_model.jobs should contain only the current job (latest submission).""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run = await create_run(session=session, project=project, repo=repo, user=user, fleet=fleet) + job = await create_job(session=session, run=run, status=JobStatus.SUBMITTED) + await session.commit() + + context = await _load_submitted_job_context(session=session, job_model=job) + # Only the current job's latest submission should be loaded. + assert len(context.run_model.jobs) == 1 + assert context.run_model.jobs[0].id == job.id + assert not context.multinode + assert context.jobs_to_provision == [context.job] + + async def test_non_master_loads_master_and_current_job(self, test_db, session: AsyncSession): + """Non-master: run_model.jobs should contain master job + current job (latest submissions).""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + configuration = TaskConfiguration(image="debian", nodes=2) + run_spec = get_run_spec(run_name="run", repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, + run_name="run", + project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + status=JobStatus.SUBMITTED, + instance_assigned=True, + 
job_provisioning_data=get_job_provisioning_data(), + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + status=JobStatus.SUBMITTED, + waiting_master_job=False, + ) + await session.commit() + + context = await _load_submitted_job_context(session=session, job_model=worker_job) + # Only master (job_num=0) and current job (job_num=1) should be loaded. + loaded_job_ids = {jm.id for jm in context.run_model.jobs} + assert loaded_job_ids == {master_job.id, worker_job.id} + assert context.jobs_to_provision == [context.job] + + async def test_multinode_master_loads_all_replica_jobs(self, test_db, session: AsyncSession): + """Master multinode: run_model.jobs should contain all same-replica jobs (latest submissions).""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + configuration = TaskConfiguration(image="debian", nodes=2) + run_spec = get_run_spec(run_name="run", repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, + run_name="run", + project=project, + repo=repo, + user=user, + run_spec=run_spec, + fleet=fleet, + ) + master_job = await create_job( + session=session, + run=run, + job_num=0, + status=JobStatus.SUBMITTED, + waiting_master_job=False, + ) + worker_job = await create_job( + session=session, + run=run, + job_num=1, + status=JobStatus.SUBMITTED, + waiting_master_job=True, + ) + await session.commit() + + context = await _load_submitted_job_context(session=session, job_model=master_job) + # All jobs in same replica should be loaded. 
+ loaded_job_ids = {jm.id for jm in context.run_model.jobs} + assert loaded_job_ids == {master_job.id, worker_job.id} + assert context.multinode + assert len(context.jobs_to_provision) == 2 + assert len(context.replica_job_model_ids) == 2 + + async def test_loads_only_latest_submission(self, test_db, session: AsyncSession): + """Only the latest submission per (replica_num, job_num) should be loaded, not historical ones.""" + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + run = await create_run(session=session, project=project, repo=repo, user=user, fleet=fleet) + # Create two submissions for the same job (simulating resubmission). + await create_job( + session=session, + run=run, + job_num=0, + submission_num=0, + status=JobStatus.SUBMITTED, + ) + latest_job = await create_job( + session=session, + run=run, + job_num=0, + submission_num=1, + status=JobStatus.SUBMITTED, + ) + await session.commit() + + context = await _load_submitted_job_context(session=session, job_model=latest_job) + # Only the latest submission should be loaded. 
+ assert len(context.run_model.jobs) == 1 + assert context.run_model.jobs[0].id == latest_job.id diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_terminating_jobs.py b/src/tests/_internal/server/background/pipeline_tasks/test_terminating_jobs.py new file mode 100644 index 0000000000..af34b1913e --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_terminating_jobs.py @@ -0,0 +1,987 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import JobStatus, JobTerminationReason +from dstack._internal.core.models.volumes import VolumeStatus +from dstack._internal.server.background.pipeline_tasks.jobs_terminating import ( + JobTerminatingFetcher, + JobTerminatingPipeline, + JobTerminatingPipelineItem, + JobTerminatingWorker, + _get_related_instance_lock_owner, +) +from dstack._internal.server.models import InstanceModel, JobModel, VolumeAttachmentModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + create_volume, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_job_runtime_data, + get_volume_configuration, + get_volume_provisioning_data, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> JobTerminatingWorker: + return JobTerminatingWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> JobTerminatingFetcher: + return JobTerminatingFetcher( + queue=asyncio.Queue(), + 
queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +def _job_to_pipeline_item(job_model: JobModel) -> JobTerminatingPipelineItem: + assert job_model.lock_token is not None + assert job_model.lock_expires_at is not None + return JobTerminatingPipelineItem( + __tablename__=job_model.__tablename__, + id=job_model.id, + lock_token=job_model.lock_token, + lock_expires_at=job_model.lock_expires_at, + prev_lock_expired=False, + volumes_detached_at=job_model.volumes_detached_at, + ) + + +def _lock_job(job_model: JobModel): + job_model.lock_token = uuid.uuid4() + job_model.lock_expires_at = get_current_datetime() + timedelta(seconds=30) + job_model.lock_owner = JobTerminatingPipeline.__name__ + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock") +class TestJobTerminatingFetcher: + async def test_fetch_selects_eligible_jobs_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: JobTerminatingFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + terminating = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale - timedelta(seconds=2), + ) + past_remove_at = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale - timedelta(seconds=1), + ) + past_remove_at.remove_at = stale + past_remove_at.volumes_detached_at = stale - timedelta(seconds=30) + + future_remove_at = await create_job( + session=session, + run=run, + 
status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale, + ) + future_remove_at.remove_at = now + timedelta(minutes=1) + + non_terminating = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale, + ) + + recent = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=now, + ) + recent_skip = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=now, + ) + recent_skip.skip_min_processing_interval = True + + locked = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale + timedelta(seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + + expired_same_owner = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=stale - timedelta(minutes=2), + last_processed_at=stale + timedelta(seconds=2), + ) + expired_same_owner.lock_expires_at = stale + expired_same_owner.lock_token = uuid.uuid4() + expired_same_owner.lock_owner = JobTerminatingPipeline.__name__ + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [ + terminating.id, + past_remove_at.id, + expired_same_owner.id, + recent_skip.id, + ] + assert {(item.id, item.volumes_detached_at) for item in items} == { + (terminating.id, None), + (past_remove_at.id, past_remove_at.volumes_detached_at), + (expired_same_owner.id, None), + (recent_skip.id, None), + } + + for job in [ + terminating, + past_remove_at, + future_remove_at, + non_terminating, + recent, + recent_skip, + locked, + expired_same_owner, + ]: + await 
session.refresh(job) + + fetched_jobs = [terminating, past_remove_at, expired_same_owner, recent_skip] + assert all(job.lock_owner == JobTerminatingPipeline.__name__ for job in fetched_jobs) + assert all(job.lock_expires_at is not None for job in fetched_jobs) + assert all(job.lock_token is not None for job in fetched_jobs) + assert all(not job.skip_min_processing_interval for job in fetched_jobs) + assert len({job.lock_token for job in fetched_jobs}) == 1 + + assert future_remove_at.lock_owner is None + assert non_terminating.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_jobs_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: JobTerminatingFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + now = get_current_datetime() + + oldest = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=now - timedelta(minutes=5), + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=now - timedelta(minutes=4), + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + submitted_at=now - timedelta(minutes=3), + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == JobTerminatingPipeline.__name__ + assert middle.lock_owner == JobTerminatingPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock") +class TestJobTerminatingWorker: + async def test_stops_job_gracefully_before_terminating_container( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + ) + job.graceful_termination_attempts = 0 + _lock_job(job) + await session.commit() + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.stop_runner", + new=AsyncMock(), + ) as stop_runner, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating._stop_container", + new=AsyncMock(return_value=True), + ) as stop_container, + ): + await worker.process(_job_to_pipeline_item(job)) + + stop_runner.assert_awaited_once() + stop_container.assert_not_awaited() + + await session.refresh(job) + await session.refresh(instance) + assert job.status == JobStatus.TERMINATING + assert job.graceful_termination_attempts == 1 + assert job.remove_at is not None + assert job.instance_id == instance.id + assert job.volumes_detached_at is None + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + assert instance.lock_token is None + assert instance.lock_expires_at is None + assert instance.lock_owner is None + + events = await list_events(session) + assert any(event.message == "Graceful job stop requested" for 
event in events) + + async def test_terminates_gracefully_stopped_job_after_remove_at( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + ) + job.graceful_termination_attempts = 1 + job.remove_at = get_current_datetime() - timedelta(minutes=1) + _lock_job(job) + await session.commit() + + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.stop_runner", + new=AsyncMock(), + ) as stop_runner, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating._stop_container", + new=AsyncMock(return_value=True), + ) as stop_container, + ): + await worker.process(_job_to_pipeline_item(job)) + + stop_runner.assert_not_awaited() + stop_container.assert_awaited_once() + + await session.refresh(job) + await session.refresh(instance) + assert job.status == JobStatus.TERMINATED + assert job.graceful_termination_attempts == 1 + assert job.remove_at is not None + assert job.instance_id is None + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner is None + assert instance.status == InstanceStatus.IDLE + assert instance.lock_token is None + assert instance.lock_expires_at is None + assert instance.lock_owner is None + + async def test_terminates_job( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await 
create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job_provisioning_data = get_job_provisioning_data(dockerized=True) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + submitted_at=datetime(2023, 1, 2, 5, 12, 30, 5, tzinfo=timezone.utc), + job_provisioning_data=job_provisioning_data, + instance=instance, + ) + _lock_job(job) + await session.commit() + + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as SSHTunnelMock, + patch( + "dstack._internal.server.services.runner.client.ShimClient.from_address" + ) as ShimClientMock, + ): + shim_client_mock = ShimClientMock.return_value + await worker.process(_job_to_pipeline_item(job)) + SSHTunnelMock.assert_called_once() + shim_client_mock.healthcheck.assert_called_once() + + await session.refresh(job) + await session.refresh(instance) + assert job.status == JobStatus.TERMINATED + assert job.lock_token is None + assert job.lock_expires_at is None + assert instance.lock_token is None + assert instance.lock_owner is None + + events = await list_events(session) + assert any( + event.message == "Job status changed TERMINATING -> TERMINATED" for event in events + ) + + async def test_detaches_job_volumes( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + volume_provisioning_data=get_volume_provisioning_data(), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + 
volumes=[volume], + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job_provisioning_data = get_job_provisioning_data(dockerized=False) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + submitted_at=datetime(2023, 1, 2, 5, 12, 30, 5, tzinfo=timezone.utc), + job_provisioning_data=job_provisioning_data, + instance=instance, + ) + _lock_job(job) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.backends_services.get_project_backend_by_type" + ) as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.is_volume_detached.return_value = True + await worker.process(_job_to_pipeline_item(job)) + m.assert_awaited_once() + backend_mock.compute.return_value.detach_volume.assert_called_once() + backend_mock.compute.return_value.is_volume_detached.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.TERMINATED + await session.refresh(volume) + assert volume.last_job_processed_at is not None + + async def test_force_detaches_job_volumes( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + volume_provisioning_data=get_volume_provisioning_data(), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + volumes=[volume], + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + 
job_provisioning_data = get_job_provisioning_data(dockerized=False) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + submitted_at=datetime(2023, 1, 2, 5, 12, 30, 5, tzinfo=timezone.utc), + job_provisioning_data=job_provisioning_data, + instance=instance, + ) + _lock_job(job) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.backends_services.get_project_backend_by_type" + ) as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.is_volume_detached.return_value = False + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + res = await session.execute(select(JobModel).options(joinedload(JobModel.instance))) + job = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATING + assert job.instance is None + assert job.volumes_detached_at is not None + + _lock_job(job) + await session.commit() + with ( + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.backends_services.get_project_backend_by_type" + ) as m, + patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.get_current_datetime" + ) as datetime_mock, + ): + datetime_mock.return_value = job.volumes_detached_at.replace( + tzinfo=timezone.utc + ) + timedelta(minutes=30) + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.is_volume_detached.return_value = False + await worker.process(_job_to_pipeline_item(job)) + backend_mock.compute.return_value.detach_volume.assert_called_once() + detach_kwargs = backend_mock.compute.return_value.detach_volume.call_args.kwargs + assert detach_kwargs["force"] is True + assert detach_kwargs["volume"].id == volume.id + assert ( 
+ detach_kwargs["provisioning_data"].instance_id == job_provisioning_data.instance_id + ) + backend_mock.compute.return_value.is_volume_detached.assert_called_once() + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + + _lock_job(job) + await session.commit() + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.backends_services.get_project_backend_by_type" + ) as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.is_volume_detached.return_value = True + await worker.process(_job_to_pipeline_item(job)) + backend_mock.compute.return_value.is_volume_detached.assert_called_once() + + await session.refresh(job) + await session.refresh(instance, ["volume_attachments"]) + res = await session.execute( + select(InstanceModel) + .where(InstanceModel.id == instance.id) + .options(joinedload(InstanceModel.volume_attachments)) + .execution_options(populate_existing=True) + ) + instance = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATED + assert len(instance.volume_attachments) == 0 + + async def test_terminates_job_on_shared_instance( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session) + user = await create_user(session) + repo = await create_repo(session=session, project_id=project.id) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + total_blocks=4, + busy_blocks=3, + ) + run = await create_run(session=session, project=project, repo=repo, user=user) + shared_offer = get_instance_offer_with_availability(blocks=2, total_blocks=4) + jrd = get_job_runtime_data(offer=shared_offer) + job = await create_job( + session=session, + run=run, + instance_assigned=True, + instance=instance, + job_runtime_data=jrd, + status=JobStatus.TERMINATING, + 
termination_reason=JobTerminationReason.TERMINATED_BY_USER, + ) + _lock_job(job) + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + await session.refresh(instance) + res = await session.execute(select(JobModel).options(joinedload(JobModel.instance))) + job = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATED + assert job.instance_assigned + assert job.instance is None + assert instance.busy_blocks == 1 + + async def test_detaches_job_volumes_on_shared_instance( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume_conf_1 = get_volume_configuration(name="vol-1") + volume_1 = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=volume_conf_1, + volume_provisioning_data=get_volume_provisioning_data(), + ) + volume_conf_2 = get_volume_configuration(name="vol-2") + volume_2 = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=volume_conf_2, + volume_provisioning_data=get_volume_provisioning_data(), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + volumes=[volume_1, volume_2], + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job_provisioning_data = get_job_provisioning_data(dockerized=False) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + submitted_at=datetime(2023, 1, 2, 5, 12, 30, 5, tzinfo=timezone.utc), + job_provisioning_data=job_provisioning_data, + job_runtime_data=get_job_runtime_data(volume_names=["vol-1"]), + 
instance=instance, + ) + _lock_job(job) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating.backends_services.get_project_backend_by_type" + ) as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.is_volume_detached.return_value = True + + await worker.process(_job_to_pipeline_item(job)) + + backend_mock.compute.return_value.detach_volume.assert_called_once() + backend_mock.compute.return_value.is_volume_detached.assert_called_once() + + await session.refresh(job) + await session.refresh(instance) + res = await session.execute( + select(InstanceModel).options( + joinedload(InstanceModel.volume_attachments).joinedload( + VolumeAttachmentModel.volume + ) + ) + ) + instance = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATED + assert len(instance.volume_attachments) == 1 + assert instance.volume_attachments[0].volume == volume_2 + + async def test_resets_job_for_retry_if_related_instance_is_locked( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + instance.lock_owner = "OtherPipeline" + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = get_current_datetime() + timedelta(minutes=1) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + instance=instance, + ) + _lock_job(job) + last_processed_before = job.last_processed_at + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + 
await session.refresh(job) + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner == JobTerminatingPipeline.__name__ + assert job.last_processed_at > last_processed_before + + async def test_resets_job_for_retry_if_related_instance_is_locked_by_another_job( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + other_job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + instance=instance, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + instance=instance, + ) + instance.lock_owner = _get_related_instance_lock_owner(other_job.id) + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = get_current_datetime() - timedelta(minutes=1) + _lock_job(job) + last_processed_before = job.last_processed_at + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + await session.refresh(instance) + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner == JobTerminatingPipeline.__name__ + assert job.last_processed_at > last_processed_before + assert instance.lock_owner == _get_related_instance_lock_owner(other_job.id) + + async def test_finishes_job_when_used_instance_is_not_set( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await 
create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + ) + _lock_job(job) + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATED + assert job.lock_token is None + assert job.lock_expires_at is None + + async def test_terminates_job_with_placeholder_instance( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + placeholder = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + provisioning_job_id=uuid.uuid4(), + offer=None, + job_provisioning_data=None, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + instance=placeholder, + ) + _lock_job(job) + await session.commit() + + # No mocks needed — placeholder has no VM, no SSH, no container + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + await session.refresh(placeholder) + assert job.status == JobStatus.TERMINATED + assert job.instance_id is None + assert placeholder.status == InstanceStatus.TERMINATING + + async def test_retries_detaching_when_used_instance_is_missing( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, 
project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + ) + job.instance_id = None + job.used_instance_id = uuid.uuid4() + job.volumes_detached_at = get_current_datetime() + _lock_job(job) + last_processed_before = job.last_processed_at + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner == JobTerminatingPipeline.__name__ + assert job.last_processed_at > last_processed_before + + async def test_retries_terminating_when_used_instance_is_missing( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + ) + job.used_instance_id = uuid.uuid4() + _lock_job(job) + last_processed_before = job.last_processed_at + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.lock_token is None + assert job.lock_expires_at is None + assert job.lock_owner == JobTerminatingPipeline.__name__ + assert job.last_processed_at > last_processed_before + + async def test_keeps_related_instance_locked_on_processing_exception( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + 
project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + job_provisioning_data=get_job_provisioning_data(dockerized=True), + instance=instance, + ) + _lock_job(job) + job_lock_token = job.lock_token + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating._process_terminating_job", + side_effect=RuntimeError("boom"), + ): + with pytest.raises(RuntimeError, match="boom"): + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + await session.refresh(instance) + assert job.lock_token == job_lock_token + assert job.lock_owner == JobTerminatingPipeline.__name__ + assert instance.lock_token == job_lock_token + assert instance.lock_owner == _get_related_instance_lock_owner(job.id) + + async def test_stops_job_gracefully_without_provisioning_data_hostname( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + # Regression test for https://github.com/dstackai/dstack/issues/3950. + # Stopping a job that is still provisioning (no hostname/ssh_port yet) must not raise + # when the graceful stop tries to open an SSH tunnel to the runner. 
+ project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + jpd = get_job_provisioning_data(dockerized=True) + jpd.hostname = None + jpd.ssh_port = None + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + job_provisioning_data=jpd, + instance=instance, + ) + job.graceful_termination_attempts = 0 + _lock_job(job) + await session.commit() + + await worker.process(_job_to_pipeline_item(job)) + + await session.refresh(job) + assert job.status == JobStatus.TERMINATING + assert job.graceful_termination_attempts == 1 + assert job.remove_at is not None + assert job.instance_id == instance.id + + async def test_terminates_job_without_provisioning_data_hostname( + self, test_db, session: AsyncSession, worker: JobTerminatingWorker + ): + # Regression test for https://github.com/dstackai/dstack/issues/3950. + # The container stop is skipped (and must not raise) when the job has no hostname/ssh_port. + # Dangling containers are cleared later on instance checks by `remove_dangling_tasks_from_instance()`. 
+ project = await create_project(session=session) + user = await create_user(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + jpd = get_job_provisioning_data(dockerized=True) + jpd.hostname = None + jpd.ssh_port = None + job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + termination_reason=JobTerminationReason.TERMINATED_BY_USER, + job_provisioning_data=jpd, + instance=instance, + ) + job.graceful_termination_attempts = 1 + job.remove_at = get_current_datetime() - timedelta(minutes=1) + _lock_job(job) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.jobs_terminating._stop_container", + new=AsyncMock(return_value=True), + ) as stop_container: + await worker.process(_job_to_pipeline_item(job)) + + stop_container.assert_not_awaited() + + await session.refresh(job) + await session.refresh(instance) + assert job.status == JobStatus.TERMINATED + assert job.instance_id is None + assert instance.status == InstanceStatus.IDLE diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py new file mode 100644 index 0000000000..2487dc4cfe --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py @@ -0,0 +1,557 @@ +import asyncio +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.volumes import VolumeProvisioningData, VolumeStatus +from 
dstack._internal.server.background.pipeline_tasks import volumes as volumes_pipeline +from dstack._internal.server.background.pipeline_tasks.volumes import ( + VolumeFetcher, + VolumePipeline, + VolumePipelineItem, + VolumeWorker, +) +from dstack._internal.server.models import VolumeModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_project, + create_user, + create_volume, + get_volume_configuration, + get_volume_provisioning_data, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.fixture +def worker() -> VolumeWorker: + return VolumeWorker(queue=Mock(), heartbeater=Mock(), pipeline_hinter=Mock()) + + +@pytest.fixture +def fetcher() -> VolumeFetcher: + return VolumeFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + +def _volume_to_pipeline_item(volume_model: VolumeModel) -> VolumePipelineItem: + assert volume_model.lock_token is not None + assert volume_model.lock_expires_at is not None + return VolumePipelineItem( + __tablename__=volume_model.__tablename__, + id=volume_model.id, + lock_token=volume_model.lock_token, + lock_expires_at=volume_model.lock_expires_at, + prev_lock_expired=False, + status=volume_model.status, + to_be_deleted=volume_model.to_be_deleted, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeFetcher: + async def test_fetch_selects_eligible_volumes_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: VolumeFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + submitted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + 
last_processed_at=stale - timedelta(seconds=2), + ) + to_be_deleted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale - timedelta(seconds=1), + ) + to_be_deleted.to_be_deleted = True + + just_created = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now, + last_processed_at=now, + ) + + deleted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale, + deleted_at=stale, + ) + recent = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=2), + last_processed_at=now, + ) + locked = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale + timedelta(seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + submitted.id, + to_be_deleted.id, + just_created.id, + } + assert {(item.id, item.status, item.to_be_deleted) for item in items} == { + (submitted.id, VolumeStatus.SUBMITTED, False), + (to_be_deleted.id, VolumeStatus.ACTIVE, True), + (just_created.id, VolumeStatus.SUBMITTED, False), + } + + for volume in [submitted, to_be_deleted, just_created, deleted, recent, locked]: + await session.refresh(volume) + + fetched_volumes = [submitted, to_be_deleted, just_created] + assert all(volume.lock_owner == VolumePipeline.__name__ for volume in fetched_volumes) + assert all(volume.lock_expires_at is not None for volume in fetched_volumes) + assert all(volume.lock_token is 
not None for volume in fetched_volumes) + assert len({volume.lock_token for volume in fetched_volumes}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_volumes_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: VolumeFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + now = get_current_datetime() + + oldest = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=4), + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=3), + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=2), + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == VolumePipeline.__name__ + assert middle.lock_owner == VolumePipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeWorkerSubmitted: + async def test_submitted_to_active(self, test_db, session: AsyncSession, worker: VolumeWorker): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 
4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.create_volume.return_value = VolumeProvisioningData( + backend=BackendType.AWS, + volume_id="vol-1234", + size_gb=100, + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_called_once() + backend_mock.compute.return_value.register_volume.assert_not_called() + + await session.refresh(volume) + assert volume.status == VolumeStatus.ACTIVE + assert volume.volume_provisioning_data is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> ACTIVE" + + async def test_registers_external_volume( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + configuration=get_volume_configuration(volume_id="vol-external-123"), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.register_volume.return_value = ( + VolumeProvisioningData( + backend=BackendType.AWS, + volume_id="vol-external-123", + size_gb=100, + ) + ) + 
get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.register_volume.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_not_called() + + await session.refresh(volume) + assert volume.status == VolumeStatus.ACTIVE + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> ACTIVE" + + async def test_marks_volume_failed_if_backend_not_available( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + get_backend_mock.side_effect = BackendNotAvailable() + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_called_once() + + await session.refresh(volume) + assert volume.status == VolumeStatus.FAILED + assert volume.status_message == "Backend not available" + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Volume status changed SUBMITTED -> FAILED (Backend not available)" + ) + + async def test_marks_volume_failed_if_backend_returns_error( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + 
volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.create_volume.side_effect = BackendError( + "Some error" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_called_once() + + await session.refresh(volume) + assert volume.status == VolumeStatus.FAILED + assert volume.status_message == "Some error" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> FAILED (Some error)" + + async def test_skips_processing_if_lock_token_changed_before_refetch( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + item = _volume_to_pipeline_item(volume) + + volume.lock_token = uuid.uuid4() + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes._process_submitted_volume" + ) as process_volume_mock: + await worker.process(item) + process_volume_mock.assert_not_awaited() + + await session.refresh(volume) + assert volume.status == VolumeStatus.SUBMITTED + events = await list_events(session) + assert len(events) == 0 + + async def test_skips_apply_if_lock_token_changed_after_processing( + self, test_db, session: AsyncSession, 
worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + async def _change_lock_token_and_return_result(_volume_model: VolumeModel): + volume.lock_token = uuid.uuid4() + await session.commit() + return volumes_pipeline._ProcessResult( + update_map={ + "status": VolumeStatus.ACTIVE, + } + ) + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes._process_submitted_volume", + side_effect=_change_lock_token_and_return_result, + ) as process_volume_mock: + await worker.process(_volume_to_pipeline_item(volume)) + process_volume_mock.assert_awaited_once() + + await session.refresh(volume) + assert volume.status == VolumeStatus.SUBMITTED + events = await list_events(session) + assert len(events) == 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeWorkerDeleted: + async def test_marks_volume_deleted( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + 
get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.delete_volume.assert_called_once() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_external_volume_deleted_without_backend_call( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + configuration=get_volume_configuration(volume_id="vol-external-123"), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_not_called() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_volume_deleted_if_backend_not_available( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 
4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + get_backend_mock.side_effect = BackendNotAvailable() + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_called_once() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_volume_deleted_if_backend_delete_errors( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.delete_volume.side_effect = BackendError( + "Delete failed" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.delete_volume.assert_called_once() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" diff --git 
a/src/tests/_internal/server/background/scheduled_tasks/__init__.py b/src/tests/_internal/server/background/scheduled_tasks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_events.py b/src/tests/_internal/server/background/scheduled_tasks/test_events.py new file mode 100644 index 0000000000..91eb066f58 --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_events.py @@ -0,0 +1,45 @@ +from datetime import datetime +from unittest.mock import patch + +import pytest +from freezegun import freeze_time +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server import settings +from dstack._internal.server.background.scheduled_tasks.events import delete_events +from dstack._internal.server.services import events +from dstack._internal.server.testing.common import create_user, list_events + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +async def test_deletes_old_events(test_db, session: AsyncSession) -> None: + user = await create_user(session=session) + for i in range(10): + with freeze_time(datetime(2026, 1, 1, i)): + events.emit( + session, + message=f"Event {i}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + await session.commit() + + all_events = await list_events(session) + assert len(all_events) == 10 + + with ( + patch.multiple(settings, SERVER_EVENTS_TTL_SECONDS=5 * 3600), + freeze_time(datetime(2026, 1, 1, 10)), + ): + await delete_events() + + remaining_events = await list_events(session) + assert len(remaining_events) == 5 + assert [e.message for e in remaining_events] == [ + "Event 5", + "Event 6", + "Event 7", + "Event 8", + "Event 9", + ] diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py new file mode 100644 index 
0000000000..86aeea0f4c --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py @@ -0,0 +1,239 @@ +import datetime +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.volumes import VolumeStatus +from dstack._internal.server.background.scheduled_tasks.idle_volumes import ( + _get_idle_time, + _should_delete_volume, + process_idle_volumes, +) +from dstack._internal.server.models import VolumeAttachmentModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_instance, + create_project, + create_user, + create_volume, + get_volume_configuration, + get_volume_provisioning_data, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestProcessIdleVolumes: + async def test_deletes_idle_volumes(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + config1 = get_volume_configuration( + name="test-volume", + auto_cleanup_duration="1h", + ) + config2 = get_volume_configuration( + name="test-volume", + auto_cleanup_duration="3h", + ) + volume1 = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config1, + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + volume2 = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config2, + volume_provisioning_data=get_volume_provisioning_data(), + 
last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await process_idle_volumes() + m.assert_not_called() + + await session.refresh(volume1) + await session.refresh(volume2) + events = await list_events(session) + assert volume1.to_be_deleted + assert not volume1.deleted + assert volume1.deleted_at is None + assert not volume2.to_be_deleted + assert not volume2.deleted + assert volume2.deleted_at is None + assert len(events) == 1 + assert ( + events[0].message + == "Volume marked for deletion due to exceeding auto_cleanup_duration" + ) + + async def test_deletes_idle_volume_with_null_auto_cleanup_enabled( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=get_volume_configuration( + name="test-volume", + auto_cleanup_duration="1h", + ), + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + volume.auto_cleanup_enabled = None + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await process_idle_volumes() + m.assert_not_called() + + await session.refresh(volume) + events = await list_events(session) + assert volume.to_be_deleted + assert not volume.deleted + assert volume.deleted_at is None + assert len(events) == 1 + assert ( + 
events[0].message + == "Volume marked for deletion due to exceeding auto_cleanup_duration" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestShouldDeleteVolume: + async def test_no_idle_duration(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=get_volume_configuration(name="test-volume"), + volume_provisioning_data=get_volume_provisioning_data(), + ) + + assert not _should_delete_volume(volume) + + async def test_idle_duration_disabled(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + config = get_volume_configuration(name="test-volume") + config.auto_cleanup_duration = -1 + + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config, + volume_provisioning_data=get_volume_provisioning_data(), + ) + + assert not _should_delete_volume(volume) + + async def test_volume_attached(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + config = get_volume_configuration(name="test-volume") + config.auto_cleanup_duration = "1h" + + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config, + volume_provisioning_data=get_volume_provisioning_data(), + ) + + instance = await create_instance(session=session, project=project) + volume.attachments.append( + VolumeAttachmentModel(volume_id=volume.id, instance_id=instance.id) + ) + await session.commit() + + assert not _should_delete_volume(volume) + + async 
def test_idle_duration_threshold(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + config = get_volume_configuration(name="test-volume") + config.auto_cleanup_duration = "1h" + + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config, + volume_provisioning_data=get_volume_provisioning_data(), + ) + + # Not exceeded - 30 minutes ago + volume.last_job_processed_at = get_current_datetime() - datetime.timedelta(minutes=30) + assert not _should_delete_volume(volume) + + # Exceeded - 2 hours ago + volume.last_job_processed_at = get_current_datetime() - datetime.timedelta(hours=2) + assert _should_delete_volume(volume) + + async def test_never_used_volume(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=get_volume_configuration(name="test-volume"), + volume_provisioning_data=get_volume_provisioning_data(), + created_at=datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=2), + ) + + volume.last_job_processed_at = None + idle_time = _get_idle_time(volume) + assert idle_time.total_seconds() >= 7000 diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py b/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py new file mode 100644 index 0000000000..06ea5ab5ac --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py @@ -0,0 +1,49 @@ +from datetime import timedelta + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from 
dstack._internal.server.background.scheduled_tasks.instance_healthchecks import ( + delete_instance_healthchecks, +) +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceStatus +from dstack._internal.server.testing.common import ( + create_instance, + create_instance_health_check, + create_project, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestDeleteInstanceHealthChecks: + async def test_deletes_instance_health_checks( + self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.IDLE + ) + # 30 minutes + monkeypatch.setattr( + "dstack._internal.server.settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS", 1800 + ) + now = get_current_datetime() + # old check + await create_instance_health_check( + session=session, instance=instance, collected_at=now - timedelta(minutes=40) + ) + # recent check + check = await create_instance_health_check( + session=session, instance=instance, collected_at=now - timedelta(minutes=20) + ) + + await delete_instance_healthchecks() + + res = await session.execute(select(InstanceHealthCheckModel)) + all_checks = res.scalars().all() + assert len(all_checks) == 1 + assert all_checks[0] == check diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_metrics.py b/src/tests/_internal/server/background/scheduled_tasks/test_metrics.py new file mode 100644 index 0000000000..1e3900a449 --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_metrics.py @@ -0,0 +1,188 @@ +from datetime import datetime, timezone +from unittest.mock import patch + +import pytest +from freezegun import freeze_time +from sqlalchemy import select +from sqlalchemy.ext.asyncio 
import AsyncSession + +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server import settings +from dstack._internal.server.background.scheduled_tasks.metrics import ( + collect_metrics, + delete_metrics, +) +from dstack._internal.server.models import JobMetricsPoint +from dstack._internal.server.schemas.runner import GPUMetrics, MetricsResponse +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_instance, + create_job, + create_job_metrics_point, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, +) + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +class TestCollectMetrics: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_collects_metrics(self, test_db, session: AsyncSession): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance_assigned=True, + instance=instance, + ) + with ( + patch("dstack._internal.server.services.runner.pool.SSHTunnel") as SSHTunnelMock, + patch( + "dstack._internal.server.services.runner.client.RunnerClient.from_address" + ) as RunnerClientMock, + ): + 
runner_client_mock = RunnerClientMock.return_value + runner_client_mock.get_metrics.return_value = MetricsResponse( + timestamp_micro=1, + cpu_usage_micro=2, + memory_usage_bytes=3, + memory_working_set_bytes=4, + gpus=[ + GPUMetrics( + gpu_memory_usage_bytes=0, + gpu_util_percent=0, + ) + ], + ) + await collect_metrics() + SSHTunnelMock.assert_called_once() + runner_client_mock.get_metrics.assert_called_once() + res = await session.execute(select(JobMetricsPoint)) + metrics_point = res.scalar_one() + assert metrics_point.job_id == job.id + + +class TestDeleteMetrics: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_deletes_old_metrics_running_job(self, test_db, session: AsyncSession): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 10, tzinfo=timezone.utc), + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 20, tzinfo=timezone.utc), + ) + last_metric = await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 5, 10, tzinfo=timezone.utc), + ) + with patch.multiple( + settings, SERVER_METRICS_RUNNING_TTL_SECONDS=15, SERVER_METRICS_FINISHED_TTL_SECONDS=0 + ): + await delete_metrics() + res = await session.execute(select(JobMetricsPoint)) + points = res.scalars().all() + 
assert len(points) == 1 + assert points[0].id == last_metric.id + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_deletes_old_metrics_finished_job(self, test_db, session: AsyncSession): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 10, tzinfo=timezone.utc), + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 20, tzinfo=timezone.utc), + ) + last_metric = await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 5, 10, tzinfo=timezone.utc), + ) + with patch.multiple( + settings, SERVER_METRICS_RUNNING_TTL_SECONDS=0, SERVER_METRICS_FINISHED_TTL_SECONDS=15 + ): + await delete_metrics() + res = await session.execute(select(JobMetricsPoint)) + points = res.scalars().all() + assert len(points) == 1 + assert points[0].id == last_metric.id diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_probes.py b/src/tests/_internal/server/background/scheduled_tasks/test_probes.py new file mode 100644 index 0000000000..bfd569ab1b --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_probes.py @@ -0,0 +1,229 @@ +from datetime import datetime, timedelta, timezone +from unittest.mock import patch + +import pytest +from freezegun import freeze_time 
+from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.configurations import ProbeConfig, ServiceConfiguration +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.server.background.scheduled_tasks.probes import ( + PROCESSING_OVERHEAD_TIMEOUT, + SSH_CONNECT_TIMEOUT, + process_probes, +) +from dstack._internal.server.testing.common import ( + create_instance, + create_job, + create_probe, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + get_run_spec, +) + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestProcessProbes: + async def test_deactivates_probes_for_stopped_job( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="nginx", + probes=[ + ProbeConfig(type="http", url="/1"), + ProbeConfig(type="http", url="/2"), + ], + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + running_job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + terminating_job = await create_job( + session=session, + run=run, + status=JobStatus.TERMINATING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + running_job_probes = [ + await create_probe(session, running_job, 
probe_num=i) for i in range(2) + ] + terminating_job_probes = [ + await create_probe(session, terminating_job, probe_num=i) for i in range(2) + ] + await process_probes() + for probe in running_job_probes: + await session.refresh(probe) + assert probe.active + for probe in terminating_job_probes: + await session.refresh(probe) + assert not probe.active + + async def test_schedules_probe_execution(self, test_db, session: AsyncSession) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="nginx", + probes=[ + ProbeConfig(type="http", url="/1", timeout="1m"), + ProbeConfig(type="http", url="/2", timeout="2m"), + ], + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + probe_1 = await create_probe( + session, job, probe_num=0, due=datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + ) + probe_2 = await create_probe( + session, job, probe_num=1, due=datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + ) + processing_time = datetime(2025, 1, 1, 0, 0, 1, tzinfo=timezone.utc) + with freeze_time(processing_time): + with patch( + "dstack._internal.server.background.scheduled_tasks.probes.PROBES_SCHEDULER" + ) as scheduler_mock: + await process_probes() + assert scheduler_mock.add_job.call_count == 2 + await session.refresh(probe_1) + assert probe_1.active + assert ( + probe_1.due + == processing_time + + timedelta(minutes=1) + + SSH_CONNECT_TIMEOUT + + PROCESSING_OVERHEAD_TIMEOUT + ) + 
await session.refresh(probe_2) + assert probe_2.active + assert ( + probe_2.due + == processing_time + + timedelta(minutes=2) + + SSH_CONNECT_TIMEOUT + + PROCESSING_OVERHEAD_TIMEOUT + ) + + async def test_deactivates_probe_when_until_ready_and_ready_after_reached( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + user = await create_user(session=session) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=get_run_spec( + run_name="test", + repo_id=repo.name, + configuration=ServiceConfiguration( + port=80, + image="nginx", + probes=[ + ProbeConfig( + type="http", url="/until_ready", until_ready=True, ready_after=3 + ), + ProbeConfig(type="http", url="/regular", until_ready=False, ready_after=3), + ], + ), + ), + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(), + instance=instance, + instance_assigned=True, + ) + + probe_until_ready = await create_probe(session, job, probe_num=0, success_streak=3) + probe_regular = await create_probe(session, job, probe_num=1, success_streak=3) + + with patch( + "dstack._internal.server.background.scheduled_tasks.probes.PROBES_SCHEDULER" + ) as scheduler_mock: + await process_probes() + + await session.refresh(probe_until_ready) + await session.refresh(probe_regular) + + assert not probe_until_ready.active + assert probe_until_ready.success_streak == 3 + + assert probe_regular.active + assert probe_regular.success_streak == 3 + assert scheduler_mock.add_job.call_count == 1 # only the regular probe was scheduled + + +# TODO: test probe success and failure +# (skipping for now - a bit difficult to test and most of the logic will be mocked) diff --git 
a/src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py b/src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py new file mode 100644 index 0000000000..0775723b4d --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py @@ -0,0 +1,209 @@ +from collections.abc import Generator +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest +import pytest_asyncio +from freezegun import freeze_time +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.background.scheduled_tasks.prometheus_metrics import ( + collect_prometheus_metrics, + delete_prometheus_metrics, +) +from dstack._internal.server.models import JobModel, JobPrometheusMetrics +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_instance, + create_job, + create_job_prometheus_metrics, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestCollectPrometheusMetrics: + @pytest_asyncio.fixture + async def job(self, request: pytest.FixtureRequest, session: AsyncSession) -> JobModel: + dockerized: bool + marker = request.node.get_closest_marker("dockerized") + if marker is None: + dockerized = True + else: + dockerized = marker.args[0] + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, 
project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + job_provisioning_data=get_job_provisioning_data(dockerized=dockerized), + instance_assigned=True, + instance=instance, + ) + return job + + @pytest.fixture + def ssh_tunnel_mock(self) -> Generator[Mock, None, None]: + with patch("dstack._internal.server.services.runner.pool.SSHTunnel") as SSHTunnelMock: + yield SSHTunnelMock + + @pytest.fixture + def shim_client_mock(self) -> Generator[Mock, None, None]: + with patch( + "dstack._internal.server.services.runner.client.ShimClient.from_address" + ) as ShimClientMock: + yield ShimClientMock.return_value + + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_inserts_new_record( + self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock + ): + shim_client_mock.get_task_metrics.return_value = "# prom response" + + await collect_prometheus_metrics() + + ssh_tunnel_mock.assert_called_once() + shim_client_mock.get_task_metrics.assert_called_once() + res = await session.execute( + select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id) + ) + metrics = res.scalar_one() + assert metrics.text == "# prom response" + assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc) + + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_updates_record( + self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock + ): + metrics = await create_job_prometheus_metrics( + session=session, + job=job, + collected_at=datetime(2023, 1, 2, 3, 5, 0), + text="# prom old response", + ) + 
shim_client_mock.get_task_metrics.return_value = "# prom new response" + + await collect_prometheus_metrics() + + ssh_tunnel_mock.assert_called_once() + shim_client_mock.get_task_metrics.assert_called_once() + res = await session.execute( + select(JobPrometheusMetrics) + .where(JobPrometheusMetrics.job_id == job.id) + .execution_options(populate_existing=True) + ) + metrics = res.scalar_one() + assert metrics.text == "# prom new response" + assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc) + + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_skips_recently_updated( + self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock + ): + metrics = await create_job_prometheus_metrics( + session=session, + job=job, + collected_at=datetime(2023, 1, 2, 3, 5, 15), + text="# prom old response", + ) + shim_client_mock.get_task_metrics.return_value = "# prom new response" + + await collect_prometheus_metrics() + + ssh_tunnel_mock.assert_not_called() + shim_client_mock.get_task_metrics.assert_not_called() + res = await session.execute( + select(JobPrometheusMetrics) + .where(JobPrometheusMetrics.job_id == job.id) + .execution_options(populate_existing=True) + ) + metrics = res.scalar_one() + assert metrics.text == "# prom old response" + assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 15, tzinfo=timezone.utc) + + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + @pytest.mark.dockerized(False) + async def test_skips_non_dockerized_jobs( + self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock + ): + await collect_prometheus_metrics() + + ssh_tunnel_mock.assert_not_called() + shim_client_mock.get_task_metrics.assert_not_called() + res = await session.execute( + select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id) + ) + metrics = res.scalar_one_or_none() + assert metrics is None + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestDeletePrometheusMetrics: + @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)) + async def test_deletes_old_metrics(self, session: AsyncSession): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_1 = await create_run( + session=session, project=project, repo=repo, user=user, run_name="run-1" + ) + job_1 = await create_job(session=session, run=run_1) + # old metrics + await create_job_prometheus_metrics( + session=session, + job=job_1, + collected_at=datetime(2023, 1, 2, 2, 3, 30), + ) + run_2 = await create_run( + session=session, project=project, repo=repo, user=user, run_name="run-2" + ) + job_2 = await create_job(session=session, run=run_2) + # recent metrics + metrics_2 = await create_job_prometheus_metrics( + session=session, + job=job_2, + collected_at=datetime(2023, 1, 2, 3, 5, 0), + ) + + await delete_prometheus_metrics() + + res = await session.execute( + select(JobPrometheusMetrics).join(JobModel).where(JobModel.project_id == project.id) + ) + all_metrics = res.scalars().all() + assert len(all_metrics) == 1 + assert all_metrics[0] == metrics_2 diff --git a/src/tests/_internal/server/background/tasks/test_process_gateways.py b/src/tests/_internal/server/background/tasks/test_process_gateways.py deleted file mode 100644 index 4a1e659ebc..0000000000 --- a/src/tests/_internal/server/background/tasks/test_process_gateways.py +++ /dev/null @@ -1,105 +0,0 @@ -from unittest.mock import MagicMock, Mock, patch - -import pytest -from sqlalchemy.ext.asyncio import AsyncSession - -from dstack._internal.core.errors import BackendError -from 
dstack._internal.core.models.gateways import GatewayProvisioningData, GatewayStatus -from dstack._internal.server.background.tasks.process_gateways import process_submitted_gateways -from dstack._internal.server.testing.common import ( - AsyncContextManager, - create_backend, - create_gateway, - create_project, -) - - -class TestProcessSubmittedGateways: - @pytest.mark.asyncio - async def test_provisions_gateway(self, test_db, session: AsyncSession): - project = await create_project(session=session) - backend = await create_backend(session=session, project_id=project.id) - gateway = await create_gateway( - session=session, - project_id=project.id, - backend_id=backend.id, - ) - with patch( - "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" - ) as m, patch( - "dstack._internal.server.services.gateways.gateway_connections_pool.add" - ) as pool_add: - aws = Mock() - m.return_value = (backend, aws) - pool_add.return_value = MagicMock() - pool_add.return_value.client.return_value = MagicMock(AsyncContextManager()) - aws.compute.return_value.create_gateway.return_value = GatewayProvisioningData( - instance_id="i-1234567890", - ip_address="2.2.2.2", - region="us", - ) - await process_submitted_gateways() - m.assert_called_once() - aws.compute.return_value.create_gateway.assert_called_once() - pool_add.assert_called_once() - await session.refresh(gateway) - assert gateway.status == GatewayStatus.RUNNING - assert gateway.gateway_compute is not None - assert gateway.gateway_compute.ip_address == "2.2.2.2" - - @pytest.mark.asyncio - async def test_marks_gateway_as_failed_if_gateway_creation_errors( - self, test_db, session: AsyncSession - ): - project = await create_project(session=session) - backend = await create_backend(session=session, project_id=project.id) - gateway = await create_gateway( - session=session, - project_id=project.id, - backend_id=backend.id, - ) - with patch( - 
"dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" - ) as m: - aws = Mock() - m.return_value = (backend, aws) - aws.compute.return_value.create_gateway.side_effect = BackendError("Some error") - await process_submitted_gateways() - m.assert_called_once() - aws.compute.return_value.create_gateway.assert_called_once() - await session.refresh(gateway) - assert gateway.status == GatewayStatus.FAILED - assert gateway.status_message == "Some error" - - @pytest.mark.asyncio - async def test_marks_gateway_as_failed_if_fails_to_connect( - self, test_db, session: AsyncSession - ): - project = await create_project(session=session) - backend = await create_backend(session=session, project_id=project.id) - gateway = await create_gateway( - session=session, - project_id=project.id, - backend_id=backend.id, - ) - with patch( - "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" - ) as m, patch( - "dstack._internal.server.services.gateways.connect_to_gateway_with_retry" - ) as connect_to_gateway_with_retry_mock: - aws = Mock() - m.return_value = (backend, aws) - connect_to_gateway_with_retry_mock.return_value = None - aws.compute.return_value.create_gateway.return_value = GatewayProvisioningData( - instance_id="i-1234567890", - ip_address="2.2.2.2", - region="us", - ) - await process_submitted_gateways() - m.assert_called_once() - aws.compute.return_value.create_gateway.assert_called_once() - connect_to_gateway_with_retry_mock.assert_called_once() - await session.refresh(gateway) - assert gateway.status == GatewayStatus.FAILED - assert gateway.gateway_compute is not None - assert gateway.gateway_compute is not None diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py deleted file mode 100644 index 64b64e7016..0000000000 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ 
/dev/null @@ -1,354 +0,0 @@ -import datetime as dt -from unittest.mock import Mock, patch - -import pytest -from sqlalchemy.ext.asyncio import AsyncSession - -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceOfferWithAvailability, - InstanceType, - Resources, -) -from dstack._internal.core.models.profiles import Profile, ProfileRetryPolicy, TerminationPolicy -from dstack._internal.core.models.runs import ( - InstanceStatus, - JobProvisioningData, - JobStatus, -) -from dstack._internal.server.background.tasks.process_instances import ( - HealthStatus, - process_instances, -) -from dstack._internal.server.background.tasks.process_instances import ( - create_instance as task_create_instance, -) -from dstack._internal.server.testing.common import ( - create_instance, - create_job, - create_pool, - create_project, - create_repo, - create_run, - create_user, -) -from dstack._internal.utils.common import get_current_datetime - - -class TestCheckShim: - @pytest.mark.asyncio - async def test_check_shim_transitions_provisioning_on_ready( - self, test_db, session: AsyncSession - ): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance( - session, project, pool, status=InstanceStatus.PROVISIONING - ) - instance.termination_deadline = get_current_datetime() + dt.timedelta(days=1) - instance.health_status = "ssh connect problem" - - await session.commit() - - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=True, reason="OK") - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.IDLE - assert instance.termination_deadline is None - assert instance.health_status is None - - @pytest.mark.asyncio - 
async def test_check_shim_transitions_provisioning_on_terminating( - self, test_db, session: AsyncSession - ): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance( - session, project, pool, status=InstanceStatus.PROVISIONING - ) - instance.started_at = get_current_datetime() + dt.timedelta(minutes=-20) - instance.health_status = "ssh connect problem" - - await session.commit() - - health_reason = "Shim problem" - - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=False, reason=health_reason) - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.TERMINATING - assert instance.termination_deadline is not None - assert instance.health_status == health_reason - - @pytest.mark.asyncio - async def test_check_shim_transitions_provisioning_on_busy( - self, test_db, session: AsyncSession - ): - user = await create_user(session=session) - project = await create_project(session=session, owner=user) - pool = await create_pool(session, project) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job = await create_job( - session=session, - run=run, - status=JobStatus.SUBMITTED, - ) - - instance = await create_instance( - session, project, pool, status=InstanceStatus.PROVISIONING - ) - instance.termination_deadline = get_current_datetime().replace( - tzinfo=dt.timezone.utc - ) + dt.timedelta(days=1) - instance.health_status = "ssh connect problem" - instance.job = job - - await session.commit() - - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=True, reason="OK") - 
await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.BUSY - assert instance.termination_deadline is None - assert instance.health_status is None - assert instance.job == job - - @pytest.mark.asyncio - async def test_check_shim_start_termination_deadline(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE) - - health_status = "SSH connection fail" - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=False, reason=health_status) - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.IDLE - assert instance.termination_deadline is not None - assert instance.termination_deadline.replace( - tzinfo=dt.timezone.utc - ) > get_current_datetime() + dt.timedelta(minutes=19) - assert instance.health_status == health_status - - @pytest.mark.asyncio - async def test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE) - instance.termination_deadline = get_current_datetime() + dt.timedelta(minutes=19) - await session.commit() - - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=True, reason="OK") - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.IDLE - assert instance.termination_deadline is None - assert instance.health_status is 
None - - @pytest.mark.asyncio - async def test_check_shim_terminate_instance_by_dedaline(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE) - termination_deadline_time = get_current_datetime() + dt.timedelta(minutes=-19) - instance.termination_deadline = termination_deadline_time - await session.commit() - - health_status = "Not ok" - with patch( - "dstack._internal.server.background.tasks.process_instances.instance_healthcheck" - ) as healthcheck: - healthcheck.return_value = HealthStatus(healthy=False, reason=health_status) - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.TERMINATING - assert ( - instance.termination_deadline.replace(tzinfo=dt.timezone.utc) - == termination_deadline_time - ) - assert instance.termination_reason == "Termination deadline" - assert instance.health_status == health_status - - -class TestTerminateIdleTime: - @pytest.mark.asyncio - async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE) - instance.termination_idle_time = 300 - instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE - instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) - await session.commit() - with patch( - "dstack._internal.server.background.tasks.process_instances.terminate_job_provisioning_data_instance" - ): - await process_instances() - await session.refresh(instance) - assert instance is not None - assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "Idle timeout" - - -class TestTerminate: - @pytest.mark.asyncio - async 
def test_terminate(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - - instance = await create_instance(session, project, pool, status=InstanceStatus.TERMINATING) - - reason = "some reason" - instance.termination_reason = reason - instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) - await session.commit() - - with patch( - "dstack._internal.server.background.tasks.process_instances.backends_services.get_project_backends" - ) as get_backends: - backend = Mock() - backend.TYPE = BackendType.DATACRUNCH - backend.compute.return_value.terminate_instance.return_value = Mock() - - get_backends.return_value = [backend] - - await process_instances() - - await session.refresh(instance) - - assert instance is not None - assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "some reason" - assert instance.deleted == True - assert instance.deleted_at is not None - assert instance.finished_at is not None - - -class TestCreateInstance: - @pytest.mark.asyncio - async def test_create_instance(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - instance = await create_instance(session, project, pool) - with patch( - "dstack._internal.server.background.tasks.process_instances.get_create_instance_offers" - ) as get_create_instance_offers: - offer = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="us", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) - - backend_mock = Mock() - backend_mock.TYPE = BackendType.AWS - backend_mock.compute.return_value.get_offers.return_value = [offer] - backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData( - backend=offer.backend, - 
instance_type=offer.instance, - instance_id="instance_id", - hostname="1.1.1.1", - internal_ip=None, - region=offer.region, - price=offer.price, - username="ubuntu", - ssh_port=22, - ssh_proxy=None, - dockerized=True, - backend_data=None, - ) - get_create_instance_offers.return_value = [(backend_mock, offer)] - await task_create_instance(instance_id=instance.id) - - await session.refresh(instance) - assert instance.status == InstanceStatus.PROVISIONING - - @pytest.mark.asyncio - async def test_expire_retry_duration(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - profile = Profile( - name="test_profile", retry_policy=ProfileRetryPolicy(retry=True, duration=123) - ) - instance = await create_instance( - session, project, pool, profile=profile, status=InstanceStatus.TERMINATING - ) - await task_create_instance(instance_id=instance.id) - await session.refresh(instance) - assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "Retry duration expired" - - @pytest.mark.asyncio - async def test_retry_delay(self, test_db, session: AsyncSession): - project = await create_project(session=session) - pool = await create_pool(session, project) - profile = Profile( - name="test_profile", retry_policy=ProfileRetryPolicy(retry=True, duration=123) - ) - instance = await create_instance( - session, - project, - pool, - created_at=get_current_datetime(), - profile=profile, - status=InstanceStatus.TERMINATING, - ) - last_retry = get_current_datetime() - dt.timedelta(seconds=10) - instance.last_retry_at = last_retry - session.add(instance) - await session.commit() - await task_create_instance(instance_id=instance.id) - await session.refresh(instance) - assert instance.last_retry_at.replace(tzinfo=dt.timezone.utc) == last_retry diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py 
b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py deleted file mode 100644 index dd0b5715c9..0000000000 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ /dev/null @@ -1,328 +0,0 @@ -from datetime import datetime, timezone -from pathlib import Path -from unittest.mock import Mock, patch - -import pytest -from sqlalchemy.ext.asyncio import AsyncSession - -from dstack._internal.core.errors import SSHError -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import InstanceType, Resources -from dstack._internal.core.models.runs import ( - InstanceStatus, - JobProvisioningData, - JobStatus, - JobTerminationReason, -) -from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs -from dstack._internal.server.schemas.runner import HealthcheckResponse, JobStateEvent, PullResponse -from dstack._internal.server.testing.common import ( - create_instance, - create_job, - create_pool, - create_project, - create_repo, - create_run, - create_user, -) - - -def get_job_provisioning_data(dockerized: bool) -> JobProvisioningData: - return JobProvisioningData( - backend=BackendType.AWS, - instance_type=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - instance_id="instance_id", - hostname="127.0.0.4", - region="us-east-1", - price=10.5, - username="ubuntu", - ssh_port=22, - dockerized=dockerized, - backend_data=None, - ssh_proxy=None, - ) - - -class TestProcessRunningJobs: - @pytest.mark.asyncio - async def test_leaves_provisioning_job_unchanged_if_runner_not_alive( - self, test_db, session: AsyncSession - ): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - 
project=project, - repo=repo, - user=user, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=False) - job = await create_job( - session=session, - run=run, - status=JobStatus.PROVISIONING, - submitted_at=datetime(2023, 1, 2, 5, 12, 30, 5, tzinfo=timezone.utc), - job_provisioning_data=job_provisioning_data, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.RunnerClient" - ) as RunnerClientMock, patch( - "dstack._internal.utils.common.get_current_datetime" - ) as datetime_mock: - datetime_mock.return_value = datetime(2023, 1, 2, 5, 12, 30, 10, tzinfo=timezone.utc) - runner_client_mock = RunnerClientMock.return_value - runner_client_mock.healthcheck = Mock() - runner_client_mock.healthcheck.return_value = None - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - runner_client_mock.healthcheck.assert_called_once() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.PROVISIONING - - @pytest.mark.asyncio - async def test_runs_provisioning_job(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=False) - job = await create_job( - session=session, - run=run, - status=JobStatus.PROVISIONING, - job_provisioning_data=job_provisioning_data, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.RunnerClient" - ) as RunnerClientMock: - runner_client_mock = RunnerClientMock.return_value - runner_client_mock.healthcheck.return_value = HealthcheckResponse( - service="dstack-runner", 
version="0.0.1.dev2" - ) - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - runner_client_mock.healthcheck.assert_called_once() - runner_client_mock.submit_job.assert_called_once() - runner_client_mock.upload_code.assert_called_once() - runner_client_mock.run_job.assert_called_once() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.RUNNING - - @pytest.mark.asyncio - async def test_updates_running_job(self, test_db, session: AsyncSession, tmp_path: Path): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=False) - job = await create_job( - session=session, - run=run, - status=JobStatus.RUNNING, - job_provisioning_data=job_provisioning_data, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.RunnerClient" - ) as RunnerClientMock, patch.object(settings, "SERVER_DIR_PATH", tmp_path): - runner_client_mock = RunnerClientMock.return_value - runner_client_mock.pull.return_value = PullResponse( - job_states=[JobStateEvent(timestamp=1, state=JobStatus.RUNNING)], - job_logs=[], - runner_logs=[], - last_updated=1, - ) - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.RUNNING - assert job.runner_timestamp == 1 - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.RunnerClient" - ) as RunnerClientMock: - runner_client_mock = RunnerClientMock.return_value - runner_client_mock.pull.return_value = PullResponse( - 
job_states=[JobStateEvent(timestamp=1, state=JobStatus.DONE)], - job_logs=[], - runner_logs=[], - last_updated=2, - ) - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.DONE_BY_RUNNER - assert job.runner_timestamp == 2 - - @pytest.mark.asyncio - async def test_provisioning_shim(self, test_db, session: AsyncSession): - project_ssh_pub_key = "__project_ssh_pub_key__" - project = await create_project(session=session, ssh_public_key=project_ssh_pub_key) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=True) - - with patch( - "dstack._internal.server.services.jobs.configurators.base.get_default_python_verison" - ) as PyVersion: - PyVersion.return_value = "3.11" - job = await create_job( - session=session, - run=run, - status=JobStatus.PROVISIONING, - job_provisioning_data=job_provisioning_data, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.ShimClient" - ) as ShimClientMock: - ShimClientMock.return_value.healthcheck.return_value = HealthcheckResponse( - service="dstack-shim", version="0.0.1.dev2" - ) - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - ShimClientMock.return_value.healthcheck.assert_called_once() - ShimClientMock.return_value.submit.assert_called_once_with( - username="", - password="", - image_name="dstackai/base:py3.11-0.4-cuda-12.1", - container_name="test-run-0-0", - shm_size=None, - public_keys=[project_ssh_pub_key, "user_ssh_key"], - ssh_user="ubuntu", - ssh_key="user_ssh_key", - mounts=[], - volumes=[], - ) - await 
session.refresh(job) - assert job is not None - assert job.status == JobStatus.PULLING - - @pytest.mark.asyncio - async def test_pulling_shim(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=True) - job = await create_job( - session=session, - run=run, - status=JobStatus.PULLING, - job_provisioning_data=job_provisioning_data, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch( - "dstack._internal.server.services.runner.client.RunnerClient" - ) as RunnerClientMock, patch( - "dstack._internal.server.services.runner.client.ShimClient" - ) as ShimClientMock: - RunnerTunnelMock.return_value.healthcheck.return_value = HealthcheckResponse( - service="dstack-runner", version="0.0.1.dev2" - ) - await process_running_jobs() - RunnerTunnelMock.assert_called_once() - ShimClientMock.return_value.pull.assert_called_once() - RunnerClientMock.return_value.healthcheck.assert_called_once() - - RunnerClientMock.return_value.submit_job.assert_called_once() - RunnerClientMock.return_value.upload_code.assert_called_once() - RunnerClientMock.return_value.run_job.assert_called_once() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.RUNNING - - @pytest.mark.asyncio - async def test_pulling_shim_failed(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - pool = await create_pool(session, project) - instance = await create_instance( - 
session=session, - project=project, - pool=pool, - status=InstanceStatus.IDLE, - ) - job_provisioning_data = get_job_provisioning_data(dockerized=True) - job = await create_job( - session=session, - run=run, - status=JobStatus.PULLING, - job_provisioning_data=job_provisioning_data, - instance=instance, - ) - with patch( - "dstack._internal.server.services.runner.ssh.RunnerTunnel" - ) as RunnerTunnelMock, patch("dstack._internal.server.services.runner.ssh.time.sleep"): - RunnerTunnelMock.side_effect = SSHError - await process_running_jobs() - assert RunnerTunnelMock.call_count == 3 - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY - assert job.remove_at is None diff --git a/src/tests/_internal/server/background/tasks/test_process_runs.py b/src/tests/_internal/server/background/tasks/test_process_runs.py deleted file mode 100644 index eb48b9c4f1..0000000000 --- a/src/tests/_internal/server/background/tasks/test_process_runs.py +++ /dev/null @@ -1,315 +0,0 @@ -import datetime -from typing import Union -from unittest.mock import patch - -import pytest -from pydantic import parse_obj_as -from sqlalchemy.ext.asyncio import AsyncSession - -import dstack._internal.server.background.tasks.process_runs as process_runs -from dstack._internal.core.models.configurations import ServiceConfiguration -from dstack._internal.core.models.profiles import Profile -from dstack._internal.core.models.resources import Range -from dstack._internal.core.models.runs import ( - JobStatus, - JobTerminationReason, - RunStatus, - RunTerminationReason, -) -from dstack._internal.server.models import RunModel -from dstack._internal.server.testing.common import ( - create_instance, - create_job, - create_pool, - create_project, - create_repo, - create_run, - create_user, - get_job_provisioning_data, - get_run_spec, -) - - -async def make_run( - session: AsyncSession, 
status: RunStatus = RunStatus.SUBMITTED, replicas: Union[str, int] = 1 -) -> RunModel: - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - project.default_pool = await create_pool( - session=session, project=project, pool_name="default-pool" - ) - run_name = "test-run" - profile = Profile( - name="test-profile", - retry=True, - ) - run_spec = get_run_spec( - repo_id=repo.name, - run_name=run_name, - profile=profile, - configuration=ServiceConfiguration( - commands=["echo hello"], - port=8000, - replicas=parse_obj_as(Range[int], replicas), - ), - ) - return await create_run( - session=session, - project=project, - repo=repo, - user=user, - run_name=run_name, - run_spec=run_spec, - status=status, - ) - - -class TestProcessRuns: - @pytest.mark.asyncio - async def test_submitted_to_provisioning(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.SUBMITTED) - await create_job(session=session, run=run, status=JobStatus.PROVISIONING) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.PROVISIONING - - @pytest.mark.asyncio - async def test_provisioning_to_running(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.PROVISIONING) - await create_job(session=session, run=run, status=JobStatus.RUNNING) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.RUNNING - - @pytest.mark.asyncio - async def test_keep_provisioning(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.PROVISIONING) - await create_job(session=session, run=run, status=JobStatus.PULLING) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.PROVISIONING - - @pytest.mark.asyncio - async def 
test_running_to_done(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING) - await create_job(session=session, run=run, status=JobStatus.DONE) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.TERMINATING - assert run.termination_reason == RunTerminationReason.ALL_JOBS_DONE - - @pytest.mark.asyncio - async def test_terminate_run_jobs(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.TERMINATING) - run.termination_reason = RunTerminationReason.JOB_FAILED - job = await create_job( - session=session, - run=run, - job_provisioning_data=get_job_provisioning_data(), - status=JobStatus.RUNNING, - ) - - with patch("dstack._internal.server.services.jobs._stop_runner") as stop_runner: - await process_runs.process_single_run(run.id, []) - stop_runner.assert_called_once() - await session.refresh(job) - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER - await session.refresh(run) - assert run.status == RunStatus.TERMINATING - - @pytest.mark.asyncio - async def test_retry_running_to_pending(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING) - instance = await create_instance( - session, project=run.project, pool=run.project.default_pool, spot=True - ) - await create_job( - session=session, - run=run, - status=JobStatus.FAILED, - submitted_at=run.submitted_at, - last_processed_at=run.submitted_at, - termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - instance=instance, - job_provisioning_data=get_job_provisioning_data(), - ) - with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: - datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == 
RunStatus.PENDING - - @pytest.mark.asyncio - async def test_retry_running_to_failed(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING) - instance = await create_instance( - session, project=run.project, pool=run.project.default_pool, spot=True - ) - # job exited with non-zero code - await create_job( - session=session, - run=run, - status=JobStatus.FAILED, - termination_reason=None, - instance=instance, - ) - - with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: - datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.TERMINATING - assert run.termination_reason == RunTerminationReason.JOB_FAILED - - @pytest.mark.asyncio - async def test_pending_to_submitted(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.PENDING) - await create_job(session=session, run=run, status=JobStatus.FAILED) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.SUBMITTED - assert len(run.jobs) == 2 - assert run.jobs[0].status == JobStatus.FAILED - assert run.jobs[1].status == JobStatus.SUBMITTED - - -class TestProcessRunsReplicas: - @pytest.mark.asyncio - async def test_submitted_to_provisioning_if_any(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.SUBMITTED, replicas=2) - await create_job(session=session, run=run, status=JobStatus.SUBMITTED, replica_num=0) - await create_job(session=session, run=run, status=JobStatus.PROVISIONING, replica_num=1) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.PROVISIONING - - @pytest.mark.asyncio - async def test_provisioning_to_running_if_any(self, test_db, session: AsyncSession): - run = await make_run(session, 
status=RunStatus.PROVISIONING, replicas=2) - await create_job(session=session, run=run, status=JobStatus.RUNNING, replica_num=0) - await create_job(session=session, run=run, status=JobStatus.PROVISIONING, replica_num=1) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.RUNNING - - @pytest.mark.asyncio - async def test_all_no_capacity_to_pending(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING, replicas=2) - await create_job( - session=session, - run=run, - status=JobStatus.TERMINATING, - termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - submitted_at=run.submitted_at, - last_processed_at=run.submitted_at, - replica_num=0, - instance=await create_instance( - session, project=run.project, pool=run.project.default_pool, spot=True - ), - job_provisioning_data=get_job_provisioning_data(), - ) - await create_job( - session=session, - run=run, - status=JobStatus.TERMINATING, - termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - submitted_at=run.submitted_at, - last_processed_at=run.submitted_at, - replica_num=1, - instance=await create_instance( - session, project=run.project, pool=run.project.default_pool, spot=True - ), - job_provisioning_data=get_job_provisioning_data(), - ) - with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: - datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.PENDING - - @pytest.mark.asyncio - async def test_some_no_capacity_keep_running(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING, replicas=2) - await create_job( - session=session, - run=run, - status=JobStatus.TERMINATING, - termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - submitted_at=run.submitted_at, - 
last_processed_at=run.last_processed_at, - replica_num=0, - instance=await create_instance( - session, project=run.project, pool=run.project.default_pool, spot=True - ), - job_provisioning_data=get_job_provisioning_data(), - ) - await create_job( - session=session, - run=run, - status=JobStatus.RUNNING, - submitted_at=run.submitted_at, - last_processed_at=run.last_processed_at, - replica_num=1, - job_provisioning_data=get_job_provisioning_data(), - ) - with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: - datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.RUNNING - assert len(run.jobs) == 3 - assert run.jobs[2].status == JobStatus.SUBMITTED - assert run.jobs[2].replica_num == 0 - - @pytest.mark.asyncio - async def test_some_failed_to_terminating(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.RUNNING, replicas=2) - await create_job( - session=session, - run=run, - status=JobStatus.FAILED, - termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, - replica_num=0, - ) - await create_job(session=session, run=run, status=JobStatus.RUNNING, replica_num=1) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.TERMINATING - assert run.termination_reason == RunTerminationReason.JOB_FAILED - - @pytest.mark.asyncio - async def test_pending_to_submitted_adds_replicas(self, test_db, session: AsyncSession): - run = await make_run(session, status=RunStatus.PENDING, replicas=2) - await create_job( - session=session, - run=run, - status=JobStatus.FAILED, - termination_reason=JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, - replica_num=0, - ) - - await process_runs.process_single_run(run.id, []) - await session.refresh(run) - assert run.status == RunStatus.SUBMITTED - assert len(run.jobs) == 3 - 
assert run.jobs[1].status == JobStatus.SUBMITTED - assert run.jobs[1].replica_num == 0 - assert run.jobs[2].status == JobStatus.SUBMITTED - assert run.jobs[2].replica_num == 1 - - -# TODO(egor-s): TestProcessRunsMultiNode -# TODO(egor-s): TestProcessRunsAutoScaling diff --git a/src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py b/src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py deleted file mode 100644 index 33b0c6ef13..0000000000 --- a/src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py +++ /dev/null @@ -1,212 +0,0 @@ -from datetime import datetime, timezone -from unittest.mock import Mock, patch - -import pytest -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload - -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceOfferWithAvailability, - InstanceType, - Resources, -) -from dstack._internal.core.models.profiles import DEFAULT_POOL_NAME, Profile, ProfileRetryPolicy -from dstack._internal.core.models.runs import ( - InstanceStatus, - JobProvisioningData, - JobStatus, - JobTerminationReason, -) -from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs -from dstack._internal.server.models import JobModel, ProjectModel -from dstack._internal.server.services.pools import ( - get_or_create_pool_by_name, -) -from dstack._internal.server.testing.common import ( - create_instance, - create_job, - create_project, - create_repo, - create_run, - create_user, - get_run_spec, -) - - -class TestProcessSubmittedJobs: - @pytest.mark.asyncio - async def test_fails_job_when_no_backends(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = 
await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job = await create_job( - session=session, - run=run, - ) - await process_submitted_jobs() - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - - @pytest.mark.asyncio - async def test_provisiones_job(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job = await create_job( - session=session, - run=run, - ) - offer = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="us", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) - with patch("dstack._internal.server.services.backends.get_project_backends") as m: - backend_mock = Mock() - m.return_value = [backend_mock] - backend_mock.TYPE = BackendType.AWS - backend_mock.compute.return_value.get_offers.return_value = [offer] - backend_mock.compute.return_value.run_job.return_value = JobProvisioningData( - backend=offer.backend, - instance_type=offer.instance, - instance_id="instance_id", - hostname="1.1.1.1", - internal_ip=None, - region=offer.region, - price=offer.price, - username="ubuntu", - ssh_port=22, - ssh_proxy=None, - dockerized=True, - backend_data=None, - ) - await process_submitted_jobs() - m.assert_called_once() - backend_mock.compute.return_value.get_offers.assert_called_once() - backend_mock.compute.return_value.run_job.assert_called_once() - - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.PROVISIONING - - res = await session.execute( - 
select(ProjectModel) - .where(ProjectModel.id == project.id) - .options(joinedload(ProjectModel.default_pool)) - ) - project = res.scalar_one() - assert project.default_pool.name == DEFAULT_POOL_NAME - - instance_offer = InstanceOfferWithAvailability.parse_raw( - project.default_pool.instances[0].offer - ) - assert offer == instance_offer - - pool_job_provisioning_data = project.default_pool.instances[0].job_provisioning_data - assert pool_job_provisioning_data == job.job_provisioning_data - - @pytest.mark.asyncio - async def test_fails_job_when_no_capacity(self, test_db, session: AsyncSession): - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - run_name="test-run", - run_spec=get_run_spec( - run_name="test-run", - repo_id=repo.name, - profile=Profile( - name="default", - retry_policy=ProfileRetryPolicy(retry=True, duration=3600), - ), - ), - ) - job = await create_job( - session=session, - run=run, - submitted_at=datetime(2023, 1, 2, 3, 0, 0, tzinfo=timezone.utc), - ) - with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: - datetime_mock.return_value = datetime(2023, 1, 2, 3, 30, 0, tzinfo=timezone.utc) - await process_submitted_jobs() - - await session.refresh(job) - assert job is not None - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY - - res = await session.execute( - select(ProjectModel) - .where(ProjectModel.id == project.id) - .options(joinedload(ProjectModel.default_pool)) - ) - project = res.scalar_one() - assert not project.default_pool.instances - - @pytest.mark.asyncio - async def test_job_with_instance(self, test_db, session: AsyncSession): - project = await create_project(session) - user = await create_user(session) - repo = 
await create_repo( - session=session, - project_id=project.id, - ) - pool = await get_or_create_pool_by_name(session, project, pool_name=None) - instance = await create_instance( - session=session, - project=project, - pool=pool, - status=InstanceStatus.IDLE, - ) - await session.refresh(pool) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - ) - job = await create_job( - session=session, - run=run, - ) - await process_submitted_jobs() - await session.refresh(job) - res = await session.execute(select(JobModel).options(joinedload(JobModel.instance))) - job = res.scalar_one() - assert job.status == JobStatus.PROVISIONING - assert job.instance is not None and job.instance.id == instance.id diff --git a/src/tests/_internal/server/compatibility/__init__.py b/src/tests/_internal/server/compatibility/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/compatibility/test_gateways.py b/src/tests/_internal/server/compatibility/test_gateways.py new file mode 100644 index 0000000000..4313be0955 --- /dev/null +++ b/src/tests/_internal/server/compatibility/test_gateways.py @@ -0,0 +1,89 @@ +import uuid +from datetime import datetime, timezone + +from packaging.version import Version + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.gateways import ( + Gateway, + GatewayConfiguration, + GatewayReplica, + GatewayStatus, +) +from dstack._internal.server.compatibility.gateways import patch_gateway +from dstack._internal.utils.common import get_current_datetime + +_CREATED_AT = datetime(2025, 1, 1, tzinfo=timezone.utc) +_CONFIG = GatewayConfiguration(name="gw", backend=BackendType.AWS, region="us") + + +def _make_gateway_replica(hostname: str = "1.2.3.4") -> GatewayReplica: + return GatewayReplica( + hostname=hostname, + replica_num=0, + backend=BackendType.AWS, + region="us", + created_at=get_current_datetime(), + ) + + +def 
_make_gateway(replicas=None, hostname=None) -> Gateway: + return Gateway( + id=uuid.uuid4(), + name="test", + project_name="proj", + backend=BackendType.AWS, + region="us", + created_at=_CREATED_AT, + status=GatewayStatus.RUNNING, + status_message=None, + hostname=hostname, + wildcard_domain=None, + default=False, + replicas=replicas or [], + configuration=_CONFIG, + ) + + +class TestPatchGateway: + def test_none_version_is_noop(self): + replica = _make_gateway_replica("1.2.3.4") + gw = _make_gateway(replicas=[replica]) + patch_gateway(gw, None) + assert gw.ip_address is None + assert gw.instance_id is None + assert gw.hostname is None + + def test_new_version_is_noop(self): + replica = _make_gateway_replica("1.2.3.4") + gw = _make_gateway(replicas=[replica]) + patch_gateway(gw, Version("0.20.25")) + assert gw.ip_address is None + assert gw.instance_id is None + + def test_old_version_fills_hostname_from_replica(self): + replica = _make_gateway_replica("1.2.3.4") + gw = _make_gateway(replicas=[replica], hostname=None) + patch_gateway(gw, Version("0.20.24")) + assert gw.hostname == "1.2.3.4" + + def test_old_version_keeps_existing_hostname(self): + replica = _make_gateway_replica("1.2.3.4") + gw = _make_gateway(replicas=[replica], hostname="lb.example.com") + patch_gateway(gw, Version("0.20.24")) + assert gw.hostname == "lb.example.com" + + def test_old_version_no_replicas_sets_empty_strings(self): + gw = _make_gateway(replicas=[]) + patch_gateway(gw, Version("0.20.24")) + assert gw.ip_address == "" + assert gw.instance_id == "" + assert gw.hostname == "" + + def test_old_version_multi_replica_concatenates_hostnames(self): + replicas = [_make_gateway_replica("1.2.3.4"), _make_gateway_replica("5.6.7.8")] + gw = _make_gateway(replicas=replicas) + patch_gateway(gw, Version("0.20.24")) + assert gw.hostname == "1.2.3.4\n5.6.7.8" + assert gw.ip_address == "1.2.3.4\n5.6.7.8" + assert gw.instance_id == "" diff --git a/src/tests/_internal/server/conftest.py 
b/src/tests/_internal/server/conftest.py index 5a46803c5a..3894ac3601 100644 --- a/src/tests/_internal/server/conftest.py +++ b/src/tests/_internal/server/conftest.py @@ -1,22 +1,56 @@ -import pytest_asyncio +from collections.abc import Generator +from pathlib import Path +from unittest.mock import AsyncMock, Mock, patch -from dstack._internal.server.db import Database, override_db -from dstack._internal.server.models import BaseModel +import httpx +import pytest -db = Database("sqlite+aiosqlite://") -override_db(db) +from dstack._internal.server.main import app +from dstack._internal.server.services import encryption as encryption # import for side-effect +from dstack._internal.server.services import logs as logs_services +from dstack._internal.server.services.docker import ImageConfig, ImageConfigObject +from dstack._internal.server.services.logs.filelog import FileLogStorage +from dstack._internal.server.testing.conf import ( # noqa: F401 + postgres_container, + session, + test_db, +) -@pytest_asyncio.fixture -async def test_db(): - async with db.engine.begin() as conn: - await conn.run_sync(BaseModel.metadata.create_all) - yield conn - await conn.run_sync(BaseModel.metadata.drop_all) +@pytest.fixture +def client(): + transport = httpx.ASGITransport(app=app) + return httpx.AsyncClient(transport=transport, base_url="http://test") -@pytest_asyncio.fixture -async def session(): - async with db.get_session() as session: - yield session - await session.commit() +@pytest.fixture +def test_log_storage(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FileLogStorage: + root = tmp_path / "test_logs" + root.mkdir() + storage = FileLogStorage(root) + monkeypatch.setattr(logs_services, "_log_storage", storage) + return storage + + +@pytest.fixture +def image_config_mock(monkeypatch: pytest.MonkeyPatch) -> ImageConfig: + image_config = ImageConfig.parse_obj({"User": None, "Entrypoint": None, "Cmd": ["/bin/bash"]}) + monkeypatch.setattr( + 
"dstack._internal.server.services.jobs.configurators.base._get_image_config", + Mock(return_value=image_config), + ) + monkeypatch.setattr( + "dstack._internal.server.services.docker.get_image_config", + Mock(return_value=ImageConfigObject(config=image_config)), + ) + return image_config + + +@pytest.fixture() +def mock_gateway_connection() -> Generator[AsyncMock, None, None]: + with patch( + "dstack._internal.server.services.gateways.gateway_connections_pool.get_or_add" + ) as get_conn_mock: + get_conn_mock.return_value.client = Mock() + get_conn_mock.return_value.client.return_value = AsyncMock() + yield get_conn_mock diff --git a/src/tests/_internal/server/routers/test_auth.py b/src/tests/_internal/server/routers/test_auth.py new file mode 100644 index 0000000000..f4c8bb0e59 --- /dev/null +++ b/src/tests/_internal/server/routers/test_auth.py @@ -0,0 +1,64 @@ +import json +from base64 import b64encode + +import pytest +from httpx import AsyncClient + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.server.services.auth import register_provider + + +class TestListProviders: + @pytest.mark.asyncio + async def test_returns_no_providers(self, client: AsyncClient): + response = await client.post("/api/auth/list_providers") + assert response.status_code == 200 + assert response.json() == [] + + @pytest.mark.asyncio + async def test_returns_registered_providers(self, client: AsyncClient): + register_provider(OAuthProviderInfo(name="provider1", enabled=True)) + register_provider(OAuthProviderInfo(name="provider2", enabled=False)) + response = await client.post("/api/auth/list_providers") + assert response.status_code == 200 + assert response.json() == [ + { + "name": "provider1", + "enabled": True, + }, + { + "name": "provider2", + "enabled": False, + }, + ] + + +class TestGetNextRedirectURL: + @pytest.mark.asyncio + async def test_returns_no_redirect_url_if_local_port_not_set(self, client: AsyncClient): + state = 
b64encode(json.dumps({"value": "12356", "local_port": None}).encode()).decode() + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 200 + assert response.json() == {"redirect_url": None} + + @pytest.mark.asyncio + async def test_returns_redirect_url_if_local_port_set(self, client: AsyncClient): + state = b64encode(json.dumps({"value": "12356", "local_port": 12345}).encode()).decode() + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 200 + assert response.json() == { + "redirect_url": f"http://localhost:12345/auth/callback?code=1234&state={state}" + } + + @pytest.mark.asyncio + async def test_returns_400_if_state_invalid(self, client: AsyncClient): + state = "some_invalid_state" + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 400 + assert "Invalid state token" in response.json()["detail"][0]["msg"] diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index db5cbf0449..79fb13667e 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -1,30 +1,46 @@ import json -from operator import itemgetter +from collections.abc import Sequence +from datetime import datetime, timezone +from typing import Any, Optional from unittest.mock import Mock, patch import pytest import yaml -from fastapi.testclient import TestClient +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.backends.oci import region as oci_region -from dstack._internal.core.errors import BackendAuthError from dstack._internal.core.models.backends.base import BackendType +from 
dstack._internal.core.models.instances import ( + Gpu, + InstanceOffer, + InstanceStatus, + InstanceType, + Resources, +) from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app +from dstack._internal.core.models.volumes import VolumeStatus from dstack._internal.server.models import BackendModel from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( create_backend, + create_fleet, + create_instance, create_project, create_user, + create_volume, get_auth_headers, + get_volume_provisioning_data, ) +from dstack._internal.utils.crypto import generate_rsa_key_pair_bytes -client = TestClient(app) - - +FAKE_NEBIUS_SERVICE_ACCOUNT_CREDS = { + "type": "service_account", + "service_account_id": "serviceaccount-e00test", + "public_key_id": "publickey-e00test", + "private_key_content": generate_rsa_key_pair_bytes()[0].decode(), +} FAKE_OCI_CLIENT_CREDS = { "type": "client", "user": "ocid1.user.oc1..aaaaaaaa", @@ -48,693 +64,58 @@ } +def _nebius_project( + id: str = "project-e00test", + name: str = "default-project-eu-north1", + region: str = "eu-north1", +): + project = Mock() + project.metadata.id = id + project.metadata.name = name + project.status.region = region + return project + + class TestListBackendTypes: - def test_returns_backend_types(self): - response = client.post("/api/backends/list_types") + @pytest.mark.asyncio + async def test_returns_backend_types(self, client: AsyncClient): + response = await client.post("/api/backends/list_types") assert response.status_code == 200, response.json() assert response.json() == [ + "amddevcloud", "aws", "azure", + "cloudrift", + "crusoe", "cudo", "datacrunch", + "digitalocean", "gcp", + "hotaisle", + "jarvislabs", "kubernetes", "lambda", "nebius", "oci", "runpod", - "tensordock", "vastai", + "verda", + "vultr", ] -class TestGetBackendConfigValuesAWS: - @pytest.mark.asyncio - async def 
test_returns_initial_config(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = {"type": "aws"} - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock: - default_creds_available_mock.return_value = False - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "aws", - "default_creds": False, - "regions": None, - } - - @pytest.mark.asyncio - async def test_returns_invalid_credentials(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "aws", - "creds": { - "type": "access_key", - "access_key": "1234", - "secret_key": "1234", - }, - } - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ) as authenticate_mock: - authenticate_mock.side_effect = BackendAuthError() - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - authenticate_mock.assert_called() - assert response.status_code == 400 - assert response.json() == { - "detail": [ - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "access_key"], - }, - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "secret_key"], - }, - ] - } - - @pytest.mark.asyncio - async def test_returns_config_on_valid_creds(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "aws", - "creds": { - "type": "access_key", - "access_key": "1234", - 
"secret_key": "1234", - }, - } - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ) as authenticate_mock, patch( - "dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error" - ): - default_creds_available_mock.return_value = True - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - authenticate_mock.assert_called() - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "aws", - "default_creds": True, - "regions": { - "selected": [ - "us-east-1", - "us-east-2", - "us-west-1", - "us-west-2", - "ap-southeast-1", - "ca-central-1", - "eu-central-1", - "eu-west-1", - "eu-west-2", - "eu-west-3", - "eu-north-1", - ], - "values": [ - {"label": "us-east-1", "value": "us-east-1"}, - {"label": "us-east-2", "value": "us-east-2"}, - {"label": "us-west-1", "value": "us-west-1"}, - {"label": "us-west-2", "value": "us-west-2"}, - {"label": "ap-southeast-1", "value": "ap-southeast-1"}, - {"label": "ca-central-1", "value": "ca-central-1"}, - {"label": "eu-central-1", "value": "eu-central-1"}, - {"label": "eu-west-1", "value": "eu-west-1"}, - {"label": "eu-west-2", "value": "eu-west-2"}, - {"label": "eu-west-3", "value": "eu-west-3"}, - {"label": "eu-north-1", "value": "eu-north-1"}, - ], - }, - } - - -class TestGetBackendConfigValuesAzure: - @pytest.mark.asyncio - async def test_returns_initial_config(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = {"type": "azure"} - with patch( - "dstack._internal.core.backends.azure.auth.default_creds_available" - ) as default_creds_available_mock: - default_creds_available_mock.return_value = False - response = client.post( - "/api/backends/config_values", - 
headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "azure", - "default_creds": False, - "tenant_id": None, - "subscription_id": None, - "locations": None, - } - - @pytest.mark.asyncio - async def test_returns_invalid_credentials(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "azure", - "creds": { - "type": "client", - "tenant_id": "1234", - "client_id": "1234", - "client_secret": "1234", - }, - } - with patch( - "dstack._internal.core.backends.azure.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.azure.auth.authenticate" - ) as authenticate_mock: - default_creds_available_mock.return_value = False - authenticate_mock.side_effect = BackendAuthError() - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - authenticate_mock.assert_called() - assert response.status_code == 400 - assert response.json() == { - "detail": [ - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "tenant_id"], - }, - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "client_id"], - }, - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "client_secret"], - }, - ] - } - - @pytest.mark.asyncio - @pytest.mark.parametrize( - "body", - [ - { - "type": "azure", - "creds": { - "type": "client", - "client_id": "1234", - "client_secret": "1234", - }, - "tenant_id": "test_tenant", - }, - { - "type": "azure", - "creds": { - "type": "client", - "tenant_id": "test_tenant", - "client_id": "1234", - "client_secret": "1234", - }, - }, - ], - ) - async def test_returns_config_on_valid_creds(self, 
test_db, session: AsyncSession, body): - user = await create_user(session=session, global_role=GlobalRole.USER) - with patch( - "dstack._internal.core.backends.azure.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.azure.auth.authenticate" - ) as authenticate_mock, patch( - "azure.mgmt.subscription.SubscriptionClient" - ) as SubscriptionClientMock: - default_creds_available_mock.return_value = False - authenticate_mock.return_value = None, "test_tenant" - client_mock = SubscriptionClientMock.return_value - tenant_mock = Mock() - tenant_mock.tenant_id = "test_tenant" - client_mock.tenants.list.return_value = [tenant_mock] - subscription_mock = Mock() - subscription_mock.subscription_id = "test_subscription" - subscription_mock.display_name = "Subscription" - client_mock.subscriptions.list.return_value = [subscription_mock] - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200 - assert response.json() == { - "type": "azure", - "default_creds": False, - "tenant_id": { - "selected": "test_tenant", - "values": [ - { - "value": "test_tenant", - "label": "test_tenant", - } - ], - }, - "subscription_id": { - "selected": "test_subscription", - "values": [ - { - "value": "test_subscription", - "label": "Subscription (test_subscription)", - } - ], - }, - "locations": { - "selected": [ - "centralus", - "eastus", - "eastus2", - "southcentralus", - "westus2", - "westus3", - "canadacentral", - "francecentral", - "germanywestcentral", - "northeurope", - "swedencentral", - "uksouth", - "westeurope", - "southeastasia", - "eastasia", - "brazilsouth", - ], - "values": [ - {"value": "centralus", "label": "centralus"}, - {"value": "eastus", "label": "eastus"}, - {"value": "eastus2", "label": "eastus2"}, - {"value": "southcentralus", "label": "southcentralus"}, - {"value": "westus2", "label": "westus2"}, - {"value": "westus3", 
"label": "westus3"}, - {"value": "canadacentral", "label": "canadacentral"}, - {"value": "francecentral", "label": "francecentral"}, - {"value": "germanywestcentral", "label": "germanywestcentral"}, - {"value": "northeurope", "label": "northeurope"}, - {"value": "swedencentral", "label": "swedencentral"}, - {"value": "uksouth", "label": "uksouth"}, - {"value": "westeurope", "label": "westeurope"}, - {"value": "southeastasia", "label": "southeastasia"}, - {"value": "eastasia", "label": "eastasia"}, - {"value": "brazilsouth", "label": "brazilsouth"}, - ], - }, - } - - -class TestGetBackendConfigValuesGCP: - @pytest.mark.asyncio - async def test_returns_initial_config(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = {"type": "gcp"} - with patch("dstack._internal.core.backends.gcp.auth.default_creds_available") as m: - m.return_value = True - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "gcp", - "default_creds": True, - "project_id": None, - "regions": None, - } - - @pytest.mark.asyncio - async def test_returns_invalid_credentials(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "gcp", - "creds": { - "type": "service_account", - "filename": "1234", - "data": "1234", - }, - } - with patch( - "dstack._internal.core.backends.gcp.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.gcp.auth.authenticate" - ) as authenticate_mock: - default_creds_available_mock.return_value = False - authenticate_mock.side_effect = BackendAuthError() - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - authenticate_mock.assert_called() - assert 
response.status_code == 400 - assert response.json() == { - "detail": [ - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "data"], - }, - ] - } - - @pytest.mark.asyncio - async def test_returns_config_on_valid_creds(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "gcp", - "creds": { - "type": "service_account", - "filename": "1234", - "data": "1234", - }, - "project_id": "test_project", - } - with patch( - "dstack._internal.core.backends.gcp.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.gcp.auth.authenticate" - ) as authenticate_mock, patch( - "dstack._internal.core.backends.gcp.resources.check_vpc" - ) as check_vpc_mock: - default_creds_available_mock.return_value = False - authenticate_mock.return_value = {}, "test_project" - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - authenticate_mock.assert_called() - check_vpc_mock.assert_called() - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "gcp", - "default_creds": False, - "project_id": { - "selected": "test_project", - "values": [ - { - "value": "test_project", - "label": "test_project", - } - ], - }, - "regions": { - "selected": [ - "northamerica-northeast1", - "northamerica-northeast2", - "us-central1", - "us-east1", - "us-east4", - "us-east5", - "us-south1", - "us-west1", - "us-west2", - "us-west3", - "us-west4", - "southamerica-east1", - "southamerica-west1", - "europe-central2", - "europe-north1", - "europe-southwest1", - "europe-west1", - "europe-west2", - "europe-west3", - "europe-west4", - "europe-west6", - "europe-west8", - "europe-west9", - "asia-east1", - "asia-east2", - "asia-northeast1", - "asia-northeast2", - "asia-northeast3", - "asia-south1", - "asia-south2", - "asia-southeast1", - "asia-southeast2", - 
"me-west1", - "australia-southeast1", - "australia-southeast2", - ], - "values": [ - { - "value": "northamerica-northeast1", - "label": "northamerica-northeast1", - }, - { - "value": "northamerica-northeast2", - "label": "northamerica-northeast2", - }, - {"value": "us-central1", "label": "us-central1"}, - {"value": "us-east1", "label": "us-east1"}, - {"value": "us-east4", "label": "us-east4"}, - {"value": "us-east5", "label": "us-east5"}, - {"value": "us-south1", "label": "us-south1"}, - {"value": "us-west1", "label": "us-west1"}, - {"value": "us-west2", "label": "us-west2"}, - {"value": "us-west3", "label": "us-west3"}, - {"value": "us-west4", "label": "us-west4"}, - {"value": "southamerica-east1", "label": "southamerica-east1"}, - {"value": "southamerica-west1", "label": "southamerica-west1"}, - {"value": "europe-central2", "label": "europe-central2"}, - {"value": "europe-north1", "label": "europe-north1"}, - {"value": "europe-southwest1", "label": "europe-southwest1"}, - {"value": "europe-west1", "label": "europe-west1"}, - {"value": "europe-west2", "label": "europe-west2"}, - {"value": "europe-west3", "label": "europe-west3"}, - {"value": "europe-west4", "label": "europe-west4"}, - {"value": "europe-west6", "label": "europe-west6"}, - {"value": "europe-west8", "label": "europe-west8"}, - {"value": "europe-west9", "label": "europe-west9"}, - {"value": "asia-east1", "label": "asia-east1"}, - {"value": "asia-east2", "label": "asia-east2"}, - {"value": "asia-northeast1", "label": "asia-northeast1"}, - {"value": "asia-northeast2", "label": "asia-northeast2"}, - {"value": "asia-northeast3", "label": "asia-northeast3"}, - {"value": "asia-south1", "label": "asia-south1"}, - {"value": "asia-south2", "label": "asia-south2"}, - {"value": "asia-southeast1", "label": "asia-southeast1"}, - {"value": "asia-southeast2", "label": "asia-southeast2"}, - {"value": "me-west1", "label": "me-west1"}, - {"value": "australia-southeast1", "label": "australia-southeast1"}, - {"value": 
"australia-southeast2", "label": "australia-southeast2"}, - ], - }, - } - - -class TestGetBackendConfigValuesLambda: - @pytest.mark.asyncio - async def test_returns_initial_config(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = {"type": "lambda"} - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "lambda", - "regions": None, - } - - @pytest.mark.asyncio - async def test_returns_invalid_credentials(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "lambda", - "creds": { - "type": "api_key", - "api_key": "1234", - }, - } - with patch("dstack._internal.core.backends.lambdalabs.api_client.LambdaAPIClient") as m: - m.return_value.validate_api_key.return_value = False - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - m.return_value.validate_api_key.assert_called() - assert response.status_code == 400, response.json() - assert response.json() == { - "detail": [ - { - "code": "invalid_credentials", - "msg": "Invalid credentials", - "fields": ["creds", "api_key"], - }, - ] - } - - @pytest.mark.asyncio - async def test_returns_config_on_valid_creds(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "lambda", - "creds": { - "type": "api_key", - "api_key": "1234", - }, - } - with patch("dstack._internal.core.backends.lambdalabs.api_client.LambdaAPIClient") as m: - m.return_value.validate_api_key.return_value = True - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - m.return_value.validate_api_key.assert_called() - assert response.status_code == 200, 
response.json() - assert response.json() == { - "type": "lambda", - "regions": { - "selected": ["us-east-1"], - "values": [ - {"value": "us-south-1", "label": "us-south-1"}, - {"value": "us-west-2", "label": "us-west-2"}, - {"value": "us-west-1", "label": "us-west-1"}, - {"value": "us-midwest-1", "label": "us-midwest-1"}, - {"value": "us-west-3", "label": "us-west-3"}, - {"value": "us-east-1", "label": "us-east-1"}, - { - "value": "australia-southeast-1", - "label": "australia-southeast-1", - }, - {"value": "europe-central-1", "label": "europe-central-1"}, - {"value": "asia-south-1", "label": "asia-south-1"}, - {"value": "me-west-1", "label": "me-west-1"}, - {"value": "europe-south-1", "label": "europe-south-1"}, - {"value": "asia-northeast-1", "label": "asia-northeast-1"}, - ], - }, - } - - -class TestGetBackendConfigValuesOCI: - @pytest.mark.asyncio - async def test_returns_initial_config(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = {"type": "oci"} - with patch( - "dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock: - default_creds_available_mock.return_value = False - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - assert response.status_code == 200, response.json() - assert response.json() == { - "type": "oci", - "default_creds": False, - "regions": None, - "compartment_id": None, - } - - @pytest.mark.asyncio - async def test_returns_invalid_credentials(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "oci", - "creds": FAKE_OCI_CLIENT_CREDS, - } - with patch( - "dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock: - response = client.post( - "/api/backends/config_values", - 
headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - assert response.status_code == 400 - error = response.json()["detail"][0] - assert error["code"] == "invalid_credentials" - assert error["msg"].startswith("Invalid credentials") - - @pytest.mark.asyncio - async def test_returns_config_on_valid_creds(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - body = { - "type": "oci", - "creds": FAKE_OCI_CLIENT_CREDS, - } - with patch( - "dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.server.services.backends.configurators.oci.get_subscribed_regions" - ) as get_regions_mock: - default_creds_available_mock.return_value = True - get_regions_mock.return_value = SAMPLE_OCI_SUBSCRIBED_REGIONS - response = client.post( - "/api/backends/config_values", - headers=get_auth_headers(user.token), - json=body, - ) - default_creds_available_mock.assert_called() - get_regions_mock.assert_called() - body = response.json() - body["regions"]["selected"].sort() - body["regions"]["values"].sort(key=itemgetter("value")) - assert response.status_code == 200, response.json() - assert body == { - "type": "oci", - "default_creds": True, - "regions": { - "selected": ["eu-frankfurt-1", "me-dubai-1"], - "values": [ - {"label": "eu-frankfurt-1", "value": "eu-frankfurt-1"}, - {"label": "me-dubai-1", "value": "me-dubai-1"}, - ], - }, - "compartment_id": None, - } - - class TestCreateBackend: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await 
add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json={}, @@ -742,7 +123,8 @@ async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): assert response.status_code == 403 @pytest.mark.asyncio - async def test_creates_aws_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_aws_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -757,23 +139,26 @@ async def test_creates_aws_backend(self, test_db, session: AsyncSession): }, "regions": ["us-west-1"], } - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, ) assert response.status_code == 200, response.json() res = await session.execute(select(BackendModel)) - assert len(res.scalars().all()) == 1 + backend = res.scalars().one() + assert backend.source_config is not None + assert backend.source_auth is not None + assert json.loads(backend.source_config)["regions"] == ["us-west-1"] + assert json.loads(backend.source_auth.get_plaintext_or_error()) == body["creds"] 
@pytest.mark.asyncio - async def test_creates_gcp_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_gcp_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -789,17 +174,13 @@ async def test_creates_gcp_backend(self, test_db, session: AsyncSession): "project_id": "test_project", "regions": ["us-east1"], } - with patch( - "dstack._internal.core.backends.gcp.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.gcp.auth.authenticate" - ) as authenticate_mock, patch( - "dstack._internal.core.backends.gcp.resources.check_vpc" - ) as check_vpc_mock: - default_creds_available_mock.return_value = False + with ( + patch("dstack._internal.core.backends.gcp.auth.authenticate") as authenticate_mock, + patch("dstack._internal.core.backends.gcp.resources.check_vpc") as check_vpc_mock, + ): credentials_mock = Mock() authenticate_mock.return_value = credentials_mock, "test_project" - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -810,7 +191,10 @@ async def test_creates_gcp_backend(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 @pytest.mark.asyncio - async def test_creates_lambda_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_lambda_backend( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -826,7 +210,7 @@ async def test_creates_lambda_backend(self, 
test_db, session: AsyncSession): } with patch("dstack._internal.core.backends.lambdalabs.api_client.LambdaAPIClient") as m: m.return_value.validate_api_key.return_value = True - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -837,7 +221,196 @@ async def test_creates_lambda_backend(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 @pytest.mark.asyncio - async def test_creates_oci_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + class TestNebius: + @pytest.fixture(autouse=True) + def patch_catalog(self): + with patch( + "dstack._internal.core.backends.nebius.resources.get_catalog_offers" + ) as get_catalog_offers_mock: + get_catalog_offers_mock.return_value = [ + InstanceOffer( + backend=BackendType.NEBIUS, + instance=InstanceType( + name="gpu-h100-sxm 8gpu-128vcpu-1600gb", + resources=Resources( + cpus=128, + memory_mib=1600 * 1024, + gpus=[Gpu(name="H100", memory_mib=80 * 1024)] * 8, + spot=False, + ), + ), + region="eu-north1", + price=23.6, + backend_data={"fabrics": ["fabric-2", "fabric-3"]}, + ) + ] + yield + + async def test_not_creates_with_invalid_creds( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + body = { + "type": "nebius", + "creds": FAKE_NEBIUS_SERVICE_ACCOUNT_CREDS, + } + with patch( + "dstack._internal.core.backends.nebius.resources.list_tenant_projects" + ) as projects_mock: + projects_mock.side_effect = ValueError() + response = await client.post( + f"/api/project/{project.name}/backends/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert 
response.status_code == 400, response.json() + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 0 + + @pytest.mark.parametrize( + ("config_extra", "mocked_projects", "error"), + [ + pytest.param( + {}, + [_nebius_project()], + None, + id="default", + ), + pytest.param( + {"regions": ["eu-north1"]}, + [ + _nebius_project( + "project-e00test", "default-project-eu-north1", "eu-north1" + ), + _nebius_project("project-e01test", "default-project-eu-west1", "eu-west1"), + ], + None, + id="with-regions", + ), + pytest.param( + {"regions": ["xx-xxxx1"]}, + [_nebius_project()], + "do not exist in this Nebius tenancy", + id="error-invalid-regions", + ), + pytest.param( + {"regions": ["eu-north1"]}, + [ + _nebius_project( + "project-e00test0", "default-project-eu-north1", "eu-north1" + ), + _nebius_project("project-e00test1", "non-default-project", "eu-north1"), + ], + None, + id="finds-default-project-among-many", + ), + pytest.param( + {"regions": ["eu-north1"]}, + [ + _nebius_project("project-e00test0", "non-default-project-0", "eu-north1"), + _nebius_project("project-e00test1", "non-default-project-1", "eu-north1"), + ], + "Could not find the default project in region eu-north1", + id="error-no-default-project", + ), + pytest.param( + {"projects": ["project-e00test0"]}, + [ + _nebius_project("project-e00test0", "non-default-project-0", "eu-north1"), + _nebius_project("project-e00test1", "non-default-project-1", "eu-north1"), + ], + None, + id="with-projects", + ), + pytest.param( + {"projects": ["project-e00xxxx"]}, + [_nebius_project()], + "not found in this Nebius tenancy", + id="error-invalid-projects", + ), + pytest.param( + {"projects": ["project-e00test0", "project-e00test1"]}, + [ + _nebius_project("project-e00test0", "non-default-project-0", "eu-north1"), + _nebius_project("project-e00test1", "non-default-project-1", "eu-north1"), + ], + "both belong to the same region", + id="error-multiple-projects-in-same-region", + ), + 
pytest.param( + { + "regions": ["eu-north1"], + "projects": ["project-e00test"], + }, + [ + _nebius_project( + "project-e00test", "default-project-eu-north1", "eu-north1" + ), + _nebius_project("project-e01test", "default-project-eu-west1", "eu-west1"), + ], + None, + id="with-regions-and-projects", + ), + pytest.param( + {"fabrics": ["fabric-2", "fabric-3"]}, + [_nebius_project()], + None, + id="with-valid-fabrics", + ), + pytest.param( + {"fabrics": ["fabric-2", "fabric-invalid"]}, + [_nebius_project()], + "InfiniBand fabrics do not exist", + id="with-invalid-fabrics", + ), + ], + ) + async def test_create( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + config_extra: dict[str, Any], + mocked_projects: Sequence[Any], + error: Optional[str], + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + body = { + "type": "nebius", + "creds": FAKE_NEBIUS_SERVICE_ACCOUNT_CREDS, + **config_extra, + } + with patch( + "dstack._internal.core.backends.nebius.resources.list_tenant_projects" + ) as projects_mock: + projects_mock.return_value = mocked_projects + response = await client.post( + f"/api/project/{project.name}/backends/create", + headers=get_auth_headers(user.token), + json=body, + ) + if not error: + assert response.status_code == 200, response.json() + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 1 + else: + assert response.status_code == 400, response.json() + assert error in response.json()["detail"][0]["msg"] + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_oci_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await 
create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -847,17 +420,17 @@ async def test_creates_oci_backend(self, test_db, session: AsyncSession): "type": "oci", "creds": FAKE_OCI_CLIENT_CREDS, } - with patch( - "dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.server.services.backends.configurators.oci.get_subscribed_regions" - ) as get_regions_mock, patch( - "dstack._internal.server.services.backends.configurators.oci._create_resources" - ) as create_resources_mock: - default_creds_available_mock.return_value = False + with ( + patch( + "dstack._internal.core.backends.oci.configurator.get_subscribed_regions" + ) as get_regions_mock, + patch( + "dstack._internal.core.backends.oci.configurator._create_resources" + ) as create_resources_mock, + ): get_regions_mock.return_value = SAMPLE_OCI_SUBSCRIBED_REGIONS create_resources_mock.return_value = SAMPLE_OCI_COMPARTMENT_ID, SAMPLE_OCI_SUBNETS - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -867,8 +440,9 @@ async def test_creates_oci_backend(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_not_creates_oci_backend_if_regions_not_subscribed( - self, test_db, session: AsyncSession + self, test_db, session: AsyncSession, client: AsyncClient ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) @@ -880,15 +454,14 @@ async def test_not_creates_oci_backend_if_regions_not_subscribed( "creds": FAKE_OCI_CLIENT_CREDS, "regions": ["me-dubai-1", "eu-frankfurt-1", "us-ashburn-1"], } - with patch( - 
"dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.server.services.backends.configurators.oci.get_subscribed_regions" - ) as get_regions_mock: - default_creds_available_mock.return_value = False + with ( + patch( + "dstack._internal.core.backends.oci.configurator.get_subscribed_regions" + ) as get_regions_mock, + ): # us-ashburn-1 not subscribed get_regions_mock.return_value = SAMPLE_OCI_SUBSCRIBED_REGIONS - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -898,7 +471,8 @@ async def test_not_creates_oci_backend_if_regions_not_subscribed( assert len(res.scalars().all()) == 0 @pytest.mark.asyncio - async def test_create_azure_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_azure_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -914,17 +488,14 @@ async def test_create_azure_backend(self, test_db, session: AsyncSession): }, "tenant_id": "test_tenant", "subscription_id": "test_subscription", - "locations": ["eastus"], + "regions": ["eastus"], } - with patch( - "dstack._internal.core.backends.azure.auth.authenticate" - ) as authenticate_mock, patch( - "azure.mgmt.subscription.SubscriptionClient" - ) as SubscriptionClientMock, patch( - "azure.mgmt.resource.ResourceManagementClient" - ) as ResourceManagementClientMock, patch( - "azure.mgmt.network.NetworkManagementClient" - ) as NetworkManagementClientMock: + with ( + patch("dstack._internal.core.backends.azure.auth.authenticate") as authenticate_mock, + patch("azure.mgmt.subscription.SubscriptionClient") as SubscriptionClientMock, + 
patch("azure.mgmt.resource.ResourceManagementClient") as ResourceManagementClientMock, + patch("azure.mgmt.network.NetworkManagementClient") as NetworkManagementClientMock, + ): authenticate_mock.return_value = None, "test_tenant" subscription_client_mock = SubscriptionClientMock.return_value tenant_mock = Mock() @@ -940,7 +511,7 @@ async def test_create_azure_backend(self, test_db, session: AsyncSession): resource_client_mock.resource_groups.create_or_update.return_value = ( resource_group_mock ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -954,7 +525,10 @@ async def test_create_azure_backend(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 @pytest.mark.asyncio - async def test_returns_400_if_backend_exists(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_backend_exists( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -969,13 +543,11 @@ async def test_returns_400_if_backend_exists(self, test_db, session: AsyncSessio }, "regions": ["us-west-1"], } - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + response = await client.post( f"/api/project/{project.name}/backends/create", 
headers=get_auth_headers(user.token), json=body, @@ -983,13 +555,10 @@ async def test_returns_400_if_backend_exists(self, test_db, session: AsyncSessio assert response.status_code == 200, response.json() res = await session.execute(select(BackendModel)) assert len(res.scalars().all()) == 1 - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ) as authenticate_mock: # noqa: F841 - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate") as authenticate_mock, # noqa: F841 + ): + response = await client.post( f"/api/project/{project.name}/backends/create", headers=get_auth_headers(user.token), json=body, @@ -1001,13 +570,16 @@ async def test_returns_400_if_backend_exists(self, test_db, session: AsyncSessio class TestUpdateBackend: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/update", headers=get_auth_headers(user.token), json={}, @@ -1015,7 +587,8 @@ async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): assert response.status_code == 403 @pytest.mark.asyncio - async def test_updates_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_backend(self, test_db, session: 
AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1033,13 +606,11 @@ async def test_updates_backend(self, test_db, session: AsyncSession): }, "regions": ["us-east-1"], } - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + response = await client.post( f"/api/project/{project.name}/backends/update", headers=get_auth_headers(user.token), json=body, @@ -1047,9 +618,16 @@ async def test_updates_backend(self, test_db, session: AsyncSession): assert response.status_code == 200, response.json() await session.refresh(backend) assert json.loads(backend.config)["regions"] == ["us-east-1"] + assert backend.source_config is not None + assert backend.source_auth is not None + assert json.loads(backend.source_config)["regions"] == ["us-east-1"] + assert json.loads(backend.source_auth.get_plaintext_or_error()) == body["creds"] @pytest.mark.asyncio - async def test_returns_400_if_backend_does_not_exist(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_backend_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1064,7 +642,7 @@ async def test_returns_400_if_backend_does_not_exist(self, test_db, 
session: Asy }, "regions": ["us-east-1"], } - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/update", headers=get_auth_headers(user.token), json=body, @@ -1074,27 +652,31 @@ async def test_returns_400_if_backend_does_not_exist(self, test_db, session: Asy class TestDeleteBackends: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/delete", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_deletes_backends(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_backends(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) backend = await create_backend(session=session, project_id=project.id) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/delete", headers=get_auth_headers(user.token), json={"backends_names": [backend.type.value]}, @@ -1103,31 +685,129 @@ async def test_deletes_backends(self, test_db, session: AsyncSession): res = await session.execute(select(BackendModel)) assert len(res.scalars().all()) == 0 + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_backend_has_active_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + backend = await create_backend(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATED, + backend=backend.type, + ) + instance2 = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + backend=backend.type, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/backends/delete", + headers=get_auth_headers(user.token), + json={"backends_names": [backend.type.value]}, + ) + assert response.status_code == 400 + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 1 + fleet.instances.pop() + await session.commit() + response = await client.post( + f"/api/project/{project.name}/backends/delete", + headers=get_auth_headers(user.token), + json={"backends_names": [backend.type.value]}, + ) + assert response.status_code == 200 + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_backend_has_active_volumes( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + 
session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + backend = await create_backend(session=session, project_id=project.id) + await create_volume( + session=session, + project=project, + user=user, + backend=backend.type, + volume_provisioning_data=get_volume_provisioning_data(backend=backend.type), + status=VolumeStatus.ACTIVE, + deleted_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + volume2 = await create_volume( + session=session, + project=project, + user=user, + backend=backend.type, + volume_provisioning_data=get_volume_provisioning_data(backend=backend.type), + status=VolumeStatus.ACTIVE, + ) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/backends/delete", + headers=get_auth_headers(user.token), + json={"backends_names": [backend.type.value]}, + ) + assert response.status_code == 400 + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 1 + await session.delete(volume2) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/backends/delete", + headers=get_auth_headers(user.token), + json={"backends_names": [backend.type.value]}, + ) + assert response.status_code == 200 + res = await session.execute(select(BackendModel)) + assert len(res.scalars().all()) == 0 + class TestGetConfigInfo: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) backend = await create_backend(session=session, project_id=project.id) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await 
client.post( f"/api/project/{project.name}/backends/{backend.type.value}/config_info", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_returns_config_info(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_config_info(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) backend = await create_backend(session=session, project_id=project.id) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/{backend.type.value}/config_info", headers=get_auth_headers(user.token), ) @@ -1139,19 +819,25 @@ async def test_returns_config_info(self, test_db, session: AsyncSession): "vpc_ids": None, "default_vpcs": None, "public_ips": None, - "creds": json.loads(backend.auth), + "iam_instance_profile": None, + "tags": None, + "os_images": None, + "creds": json.loads(backend.auth.get_plaintext_or_error()), } class TestCreateBackendYAML: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create_yaml", headers=get_auth_headers(user.token), json={}, @@ -1159,7 +845,8 @@ async def 
test_returns_403_if_not_admin(self, test_db, session: AsyncSession): assert response.status_code == 403 @pytest.mark.asyncio - async def test_creates_aws_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_aws_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1175,13 +862,11 @@ async def test_creates_aws_backend(self, test_db, session: AsyncSession): "regions": ["us-west-1"], } body = {"config_yaml": yaml.dump(config_dict)} - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + response = await client.post( f"/api/project/{project.name}/backends/create_yaml", headers=get_auth_headers(user.token), json=body, @@ -1191,7 +876,8 @@ async def test_creates_aws_backend(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 @pytest.mark.asyncio - async def test_creates_oci_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_oci_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1202,17 +888,17 @@ async def test_creates_oci_backend(self, test_db, session: 
AsyncSession): "creds": FAKE_OCI_CLIENT_CREDS, } body = {"config_yaml": yaml.dump(config_dict)} - with patch( - "dstack._internal.core.backends.oci.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.server.services.backends.configurators.oci.get_subscribed_regions" - ) as get_regions_mock, patch( - "dstack._internal.server.services.backends.configurators.oci._create_resources" - ) as create_resources_mock: - default_creds_available_mock.return_value = False + with ( + patch( + "dstack._internal.core.backends.oci.configurator.get_subscribed_regions" + ) as get_regions_mock, + patch( + "dstack._internal.core.backends.oci.configurator._create_resources" + ) as create_resources_mock, + ): get_regions_mock.return_value = SAMPLE_OCI_SUBSCRIBED_REGIONS create_resources_mock.return_value = SAMPLE_OCI_COMPARTMENT_ID, SAMPLE_OCI_SUBNETS - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/create_yaml", headers=get_auth_headers(user.token), json=body, @@ -1224,13 +910,16 @@ async def test_creates_oci_backend(self, test_db, session: AsyncSession): class TestUpdateBackendYAML: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/update_yaml", headers=get_auth_headers(user.token), json={}, @@ -1238,7 +927,8 @@ async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): assert response.status_code == 403 
@pytest.mark.asyncio - async def test_updates_aws_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_aws_backend(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1260,13 +950,11 @@ async def test_updates_aws_backend(self, test_db, session: AsyncSession): "regions": ["us-east-1"], } body = {"config_yaml": yaml.dump(config_dict)} - with patch( - "dstack._internal.core.backends.aws.auth.default_creds_available" - ) as default_creds_available_mock, patch( - "dstack._internal.core.backends.aws.auth.authenticate" - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): - default_creds_available_mock.return_value = False - response = client.post( + with ( + patch("dstack._internal.core.backends.aws.auth.authenticate"), + patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error"), + ): + response = await client.post( f"/api/project/{project.name}/backends/update_yaml", headers=get_auth_headers(user.token), json=body, @@ -1278,7 +966,10 @@ async def test_updates_aws_backend(self, test_db, session: AsyncSession): class TestGetConfigYAML: @pytest.mark.asyncio - async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1290,7 +981,7 @@ async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): backend_type=BackendType.AWS, config={"regions": ["us-west-1"]}, ) - response = 
client.post( + response = await client.post( f"/api/project/{project.name}/backends/aws/get_yaml", headers=get_auth_headers(user.token), json={}, @@ -1298,7 +989,8 @@ async def test_returns_403_if_not_admin(self, test_db, session: AsyncSession): assert response.status_code == 403 @pytest.mark.asyncio - async def test_returns_config_yaml(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_config_yaml(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -1317,7 +1009,7 @@ async def test_returns_config_yaml(self, test_db, session: AsyncSession): config=config, auth=auth, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/backends/aws/get_yaml", headers=get_auth_headers(user.token), json={}, diff --git a/src/tests/_internal/server/routers/test_events.py b/src/tests/_internal/server/routers/test_events.py new file mode 100644 index 0000000000..cb8e44b85a --- /dev/null +++ b/src/tests/_internal/server/routers/test_events.py @@ -0,0 +1,1556 @@ +import uuid +from datetime import datetime +from unittest.mock import patch + +import pytest +import pytest_asyncio +from freezegun import freeze_time +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.services import events +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_export, + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_auth_headers, + get_fleet_spec, + get_ssh_fleet_configuration, +) + +pytestmark = [ + pytest.mark.asyncio, + 
pytest.mark.usefixtures("test_db", "image_config_mock"), + pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True), +] + + +class TestListEventsGeneral: + async def test_response_format(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session, name="test_user") + project = await create_project(session=session, owner=user, name="test_project") + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.ADMIN, + ) + event_ids = [uuid.uuid4() for _ in range(2)] + with patch("uuid.uuid4", side_effect=event_ids): + with freeze_time(datetime(2026, 1, 1, 12, 0, 0)): + events.emit( + session, + "User added to project", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user), events.Target.from_model(project)], + ) + with freeze_time(datetime(2026, 1, 1, 12, 0, 1)): + events.emit( + session, + "Project updated", + actor=events.SystemActor(), + targets=[events.Target.from_model(project)], + ) + await session.commit() + + resp = await client.post("/api/events/list", headers=get_auth_headers(user.token), json={}) + resp.raise_for_status() + resp_data = resp.json() + for event in resp_data: + event["targets"].sort(key=lambda t: t["type"]) # for consistent comparison + assert resp_data == [ + { + "id": str(event_ids[1]), + "message": "Project updated", + "recorded_at": "2026-01-01T12:00:01+00:00", + "actor_user_id": None, + "actor_user": None, + "is_actor_user_deleted": None, + "targets": [ + { + "type": "project", + "project_id": str(project.id), + "project_name": "test_project", + "is_project_deleted": False, + "id": str(project.id), + "name": "test_project", + }, + ], + }, + { + "id": str(event_ids[0]), + "message": "User added to project", + "recorded_at": "2026-01-01T12:00:00+00:00", + "actor_user_id": str(user.id), + "actor_user": "test_user", + "is_actor_user_deleted": False, + "targets": [ + { + "type": "project", + "project_id": 
str(project.id), + "project_name": "test_project", + "is_project_deleted": False, + "id": str(project.id), + "name": "test_project", + }, + { + "type": "user", + "project_id": None, + "project_name": None, + "is_project_deleted": None, + "id": str(user.id), + "name": "test_user", + }, + ], + }, + ] + + async def test_deleted_actor_and_project( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session, name="test_user") + project = await create_project(session=session, owner=user, name="test_project") + events.emit( + session, + "Project deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + user.original_name = user.name + user.name = "_deleted_user_placeholder" + user.deleted = True + project.original_name = project.name + project.name = "_deleted_project_placeholder" + project.deleted = True + await session.commit() + other_user = await create_user(session=session, name="other_user") + + resp = await client.post( + "/api/events/list", headers=get_auth_headers(other_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["actor_user_id"] == str(user.id) + assert resp.json()[0]["actor_user"] == "test_user" + assert resp.json()[0]["is_actor_user_deleted"] == True + assert len(resp.json()[0]["targets"]) == 1 + assert resp.json()[0]["targets"][0]["project_id"] == str(project.id) + assert resp.json()[0]["targets"][0]["project_name"] == "test_project" + assert resp.json()[0]["targets"][0]["is_project_deleted"] == True + + async def test_empty_response_when_no_events( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session) + resp = await client.post("/api/events/list", headers=get_auth_headers(user.token), json={}) + resp.raise_for_status() + assert resp.json() == [] + + +class TestListEventsAccessControl: + async def test_user_sees_events_about_themselves( + self, session: 
AsyncSession, client: AsyncClient + ) -> None: + admin_user = await create_user( + session=session, + name="admin", + global_role=GlobalRole.ADMIN, + ) + regular_user = await create_user( + session=session, + name="regular", + global_role=GlobalRole.USER, + ) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(admin_user)], + ) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(regular_user)], + ) + await session.commit() + + # Regular user only sees the event about themselves + resp = await client.post( + "/api/events/list", headers=get_auth_headers(regular_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(regular_user.id) + + # Admin sees all events + resp = await client.post( + "/api/events/list", headers=get_auth_headers(admin_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_user_sees_events_within_their_project( + self, session: AsyncSession, client: AsyncClient + ) -> None: + admin_user = await create_user( + session=session, + name="admin", + global_role=GlobalRole.ADMIN, + ) + regular_user = await create_user( + session=session, + name="regular", + global_role=GlobalRole.USER, + ) + admin_project = await create_project( + session=session, + name="admin", + owner=admin_user, + ) + regular_project = await create_project( + session=session, + name="regular", + owner=regular_user, + ) + await add_project_member( + session=session, + project=admin_project, + user=admin_user, + project_role=ProjectRole.ADMIN, + ) + await add_project_member( + session=session, + project=regular_project, + user=regular_user, + project_role=ProjectRole.USER, + ) + admin_fleet = await create_fleet( + session=session, + project=admin_project, + name="admin", + ) + regular_fleet = await create_fleet( + 
session=session, + project=regular_project, + name="regular", + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(admin_project)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(regular_project)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(admin_fleet)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(admin_user), + targets=[events.Target.from_model(regular_fleet)], + ) + await session.commit() + + # Regular user only sees the events within their project + resp = await client.post( + "/api/events/list", headers=get_auth_headers(regular_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + assert {resp.json()[0]["targets"][0]["id"], resp.json()[1]["targets"][0]["id"]} == { + str(regular_project.id), + str(regular_fleet.id), + } + + # Admin sees all events + resp = await client.post( + "/api/events/list", headers=get_auth_headers(admin_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 4 + + async def test_filters_do_not_bypass_access_control( + self, session: AsyncSession, client: AsyncClient + ) -> None: + admin = await create_user( + session=session, + name="admin", + global_role=GlobalRole.ADMIN, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(admin), + targets=[events.Target.from_model(project)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(admin), + targets=[events.Target.from_model(fleet)], + ) + await session.commit() + + # Regular user can't see 
events from a project they are not a member of + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_projects": [str(project.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 0 + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_projects": [str(project.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 0 + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_fleets": [str(fleet.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 0 + + # Admin can see the events + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(admin.token), + json={"within_projects": [str(project.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(admin.token), + json={"target_projects": [str(project.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(admin.token), + json={"target_fleets": [str(fleet.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + + +class TestListEventsFilters: + async def test_target_projects(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project_a = await create_project(session=session, name="project_a", owner=user) + project_b = await create_project(session=session, name="project_b", owner=user) + fleet_a = await create_fleet(session=session, project=project_a) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_a)], + ) + events.emit( + session, + "Project created", + 
actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_b)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_projects": [str(project_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(project_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_projects": [str(project_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(project_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_projects": [str(project_a.id), str(project_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_target_users(self, session: AsyncSession, client: AsyncClient) -> None: + user_a = await create_user(session=session, name="user_a") + user_b = await create_user(session=session, name="user_b") + project_a = await create_project(session=session, name="project_a", owner=user_a) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(user_a), + targets=[events.Target.from_model(user_a)], + ) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(user_b), + targets=[events.Target.from_model(user_b)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user_a), + targets=[events.Target.from_model(project_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"target_users": [str(user_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == 
str(user_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_b.token), + json={"target_users": [str(user_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(user_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"target_users": [str(user_a.id), str(user_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_target_fleets(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet_a = await create_fleet( + session=session, + project=project, + name="fleet_a", + ) + fleet_b = await create_fleet( + session=session, + project=project, + name="fleet_b", + ) + instance_a = await create_instance( + session=session, + project=project, + fleet=fleet_a, + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_a)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_b)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_fleets": [str(fleet_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(fleet_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_fleets": [str(fleet_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(fleet_b.id) + + resp = await client.post( + "/api/events/list", + 
headers=get_auth_headers(user.token), + json={"target_fleets": [str(fleet_a.id), str(fleet_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_target_instances(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet = await create_fleet(session=session, project=project) + instance_a = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + instance_b = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_a)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_b)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": [str(instance_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(instance_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": [str(instance_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(instance_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": [str(instance_a.id), str(instance_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_target_runs(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await 
create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run_a = await create_run( + session=session, + project=project, + run_name="run_a", + repo=repo, + user=user, + ) + run_b = await create_run( + session=session, + project=project, + run_name="run_b", + repo=repo, + user=user, + ) + job_a = await create_job( + session=session, + run=run_a, + ) + events.emit( + session, + "Run created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_a)], + ) + events.emit( + session, + "Run created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_b)], + ) + events.emit( + session, + "Job created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(job_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_runs": [str(run_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(run_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_runs": [str(run_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(run_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_runs": [str(run_a.id), str(run_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_target_jobs(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + run_name="run", + repo=repo, + user=user, + ) + job_a = await create_job( + session=session, + run=run, + ) + job_b = 
await create_job( + session=session, + run=run, + ) + events.emit( + session, + "Run created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run)], + ) + events.emit( + session, + "Job created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(job_a)], + ) + events.emit( + session, + "Job created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(job_b)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_jobs": [str(job_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(job_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_jobs": [str(job_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(job_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_jobs": [str(job_a.id), str(job_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + async def test_within_projects(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project_a = await create_project(session=session, name="project_a", owner=user) + project_b = await create_project(session=session, name="project_b", owner=user) + fleet_a = await create_fleet(session=session, project=project_a) + instance_a = await create_instance( + session=session, + project=project_a, + fleet=fleet_a, + ) + events.emit( + session, + "User created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_a)], + ) + events.emit( + session, + "Project created", + 
actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_b)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_a)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_projects": [str(project_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 3 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_projects": [str(project_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_projects": [str(project_a.id), str(project_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 4 + + async def test_within_fleets(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet_a = await create_fleet( + session=session, + project=project, + name="fleet_a", + ) + fleet_b = await create_fleet( + session=session, + project=project, + name="fleet_b", + ) + isinstance_a = await create_instance( + session=session, + project=project, + fleet=fleet_a, + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_a)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_b)], + ) + events.emit( + session, + "Instance created", + 
actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(isinstance_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_fleets": [str(fleet_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_fleets": [str(fleet_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_fleets": [str(fleet_a.id), str(fleet_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 3 + + async def test_within_runs(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run_a = await create_run( + session=session, + project=project, + run_name="run_a", + repo=repo, + user=user, + ) + run_b = await create_run( + session=session, + project=project, + run_name="run_b", + repo=repo, + user=user, + ) + job_a = await create_job( + session=session, + run=run_a, + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + events.emit( + session, + "Run created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_a)], + ) + events.emit( + session, + "Run created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(run_b)], + ) + events.emit( + session, + "Job created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(job_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_runs": [str(run_a.id)]}, + ) + 
resp.raise_for_status() + assert len(resp.json()) == 2 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_runs": [str(run_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"within_runs": [str(run_a.id), str(run_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 3 + + async def test_include_target_types(self, session: AsyncSession, client: AsyncClient) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"include_target_types": ["fleet"]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["type"] == "fleet" + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"include_target_types": ["instance"]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["type"] == "instance" + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"include_target_types": ["project", "fleet"]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + assert 
{resp.json()[0]["targets"][0]["type"], resp.json()[1]["targets"][0]["type"]} == { + "project", + "fleet", + } + + async def test_within_projects_and_include_target_types( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session) + project_a = await create_project(session=session, name="project_a", owner=user) + project_b = await create_project(session=session, name="project_b", owner=user) + fleet_a = await create_fleet(session=session, project=project_a) + instance_a = await create_instance( + session=session, + project=project_a, + fleet=fleet_a, + ) + fleet_b = await create_fleet(session=session, project=project_b) + instance_b = await create_instance( + session=session, + project=project_b, + fleet=fleet_b, + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_a)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_a)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_a)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project_b)], + ) + events.emit( + session, + "Fleet created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(fleet_b)], + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_b)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={ + "within_projects": [str(project_a.id)], + "include_target_types": ["fleet"], + }, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["type"] == "fleet" + assert resp.json()[0]["targets"][0]["id"] == 
str(fleet_a.id) + + async def test_actors(self, session: AsyncSession, client: AsyncClient) -> None: + user_a = await create_user(session=session, name="user_a") + user_b = await create_user(session=session, name="user_b") + project_a = await create_project(session=session, owner=user_a, name="project_a") + project_b = await create_project(session=session, owner=user_b, name="project_b") + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user_a), + targets=[events.Target.from_model(project_a)], + ) + events.emit( + session, + "Project created", + actor=events.UserActor.from_user(user_b), + targets=[events.Target.from_model(project_b)], + ) + events.emit( + session, + "Project updated", + actor=events.SystemActor(), + targets=[events.Target.from_model(project_a)], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"actors": [str(user_a.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Project created" + assert resp.json()[0]["targets"][0]["id"] == str(project_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"actors": [str(user_b.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Project created" + assert resp.json()[0]["targets"][0]["id"] == str(project_b.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"actors": [None]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Project updated" + assert resp.json()[0]["targets"][0]["id"] == str(project_a.id) + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user_a.token), + json={"actors": [str(user_a.id), None]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + assert 
{resp.json()[0]["targets"][0]["id"], resp.json()[1]["targets"][0]["id"]} == { + str(project_a.id) + } + + async def test_event_included_if_at_least_one_target_is_within_filters( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet = await create_fleet(session=session, project=project) + instance_a = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + instance_b = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + events.emit( + session, + "Fleet instances created", + actor=events.UserActor.from_user(user), + targets=[ + events.Target.from_model(instance_a), + events.Target.from_model(instance_b), + ], + ) + instance_c = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + events.emit( + session, + "Instance created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(instance_c)], + ) + await session.commit() + + for target_instances in [[instance_a.id], [instance_b.id], [instance_a.id, instance_b.id]]: + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": list(map(str, target_instances))}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Fleet instances created" + assert len(resp.json()[0]["targets"]) == 2 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": [str(instance_c.id)]}, + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Instance created" + assert len(resp.json()[0]["targets"]) == 1 + + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(user.token), + json={"target_instances": [str(instance_a.id), str(instance_c.id)]}, + ) + resp.raise_for_status() + assert 
len(resp.json()) == 2 + + +class TestListEventsPagination: + @pytest.mark.parametrize("ascending", [True, False]) + async def test_pagination( + self, session: AsyncSession, client: AsyncClient, ascending: bool + ) -> None: + users = [] + for i in range(5): + user = await create_user(session=session, name=f"user_{i}") + users.append(user) + with freeze_time(datetime(2026, 1, 1, 12, 0, 0, i)): + events.emit( + session, + "User created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(user)], + ) + await session.commit() + + if not ascending: + users.reverse() + + resp = await client.post( + "/api/events/list", + json={ + "limit": 2, + "ascending": ascending, + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + assert resp.json()[0]["targets"][0]["name"] == users[0].name + assert resp.json()[1]["targets"][0]["name"] == users[1].name + + resp = await client.post( + "/api/events/list", + json={ + "limit": 2, + "ascending": ascending, + "prev_id": resp.json()[-1]["id"], + "prev_recorded_at": resp.json()[-1]["recorded_at"], + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + assert resp.json()[0]["targets"][0]["name"] == users[2].name + assert resp.json()[1]["targets"][0]["name"] == users[3].name + + resp = await client.post( + "/api/events/list", + json={ + "limit": 2, + "ascending": ascending, + "prev_id": resp.json()[-1]["id"], + "prev_recorded_at": resp.json()[-1]["recorded_at"], + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["targets"][0]["name"] == users[4].name + + resp = await client.post( + "/api/events/list", + json={ + "limit": 2, + "ascending": ascending, + "prev_id": resp.json()[-1]["id"], + "prev_recorded_at": resp.json()[-1]["recorded_at"], + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + 
assert len(resp.json()) == 0 + + async def test_limits_events_regardless_number_of_targets( + self, session: AsyncSession, client: AsyncClient + ) -> None: + users = [await create_user(session=session, name=f"user_{i}") for i in range(3)] + with freeze_time(datetime(2026, 1, 1, 12, 0, 0, 0)): + events.emit( + session, + "Users batch created", + actor=events.SystemActor(), + targets=[events.Target.from_model(users[0]), events.Target.from_model(users[1])], + ) + with freeze_time(datetime(2026, 1, 1, 12, 0, 0, 1)): + events.emit( + session, + "User created", + actor=events.SystemActor(), + targets=[events.Target.from_model(users[2])], + ) + await session.commit() + + resp = await client.post( + "/api/events/list", + json={ + "limit": 1, + "ascending": True, + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "Users batch created" + assert len(resp.json()[0]["targets"]) == 2 + assert {resp.json()[0]["targets"][0]["id"], resp.json()[0]["targets"][1]["id"]} == { + str(users[0].id), + str(users[1].id), + } + + resp = await client.post( + "/api/events/list", + json={ + "limit": 1, + "ascending": True, + "prev_id": resp.json()[-1]["id"], + "prev_recorded_at": resp.json()[-1]["recorded_at"], + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["message"] == "User created" + assert len(resp.json()[0]["targets"]) == 1 + assert resp.json()[0]["targets"][0]["id"] == str(users[2].id) + + resp = await client.post( + "/api/events/list", + json={ + "limit": 2, + "ascending": True, + }, + headers=get_auth_headers(users[0].token), + ) + resp.raise_for_status() + assert len(resp.json()) == 2 + + +class TestListEventsWithExportedFleet: + @pytest_asyncio.fixture + async def exported_fleet_setup(self, session: AsyncSession): + # Create exporter user and project + exporter_user = await create_user( + session, 
name="exporter-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + await add_project_member( + session=session, + project=exporter_project, + user=exporter_user, + project_role=ProjectRole.USER, + ) + + # Create first importer user and project + importer_user_1 = await create_user( + session, name="importer-user-1", global_role=GlobalRole.USER + ) + importer_project_1 = await create_project( + session, name="importer-project-1", owner=importer_user_1 + ) + await add_project_member( + session=session, + project=importer_project_1, + user=importer_user_1, + project_role=ProjectRole.USER, + ) + + # Create second importer user and project + importer_user_2 = await create_user( + session, name="importer-user-2", global_role=GlobalRole.USER + ) + importer_project_2 = await create_project( + session, name="importer-project-2", owner=importer_user_2 + ) + await add_project_member( + session=session, + project=importer_project_2, + user=importer_user_2, + project_role=ProjectRole.USER, + ) + + # Create fleet and instance + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + events.emit( + session=session, + message="Fleet created", + actor=events.UserActor.from_user(exporter_user), + targets=[events.Target.from_model(fleet)], + ) + instance = await create_instance( + session=session, project=exporter_project, fleet=fleet, name="exported-fleet-0" + ) + events.emit( + session=session, + message="Instance created", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance)], + ) + + # Create export + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project_1, importer_project_2], + exported_fleets=[fleet], + ) + + # Create first importer run and job + importer_run_1 = await create_run( + session=session, + 
project=importer_project_1, + user=importer_user_1, + repo=await create_repo(session=session, project_id=importer_project_1.id), + run_name="importer-run-1", + ) + events.emit( + session=session, + message="Run created", + actor=events.UserActor.from_user(importer_user_1), + targets=[events.Target.from_model(importer_run_1)], + ) + importer_job_1 = await create_job( + session=session, + run=importer_run_1, + fleet=fleet, + instance=instance, + ) + events.emit( + session=session, + message="Job assigned to instance", + actor=events.SystemActor(), + targets=[events.Target.from_model(importer_job_1), events.Target.from_model(instance)], + ) + + # Create second importer run and job + importer_run_2 = await create_run( + session=session, + project=importer_project_2, + user=importer_user_2, + repo=await create_repo(session=session, project_id=importer_project_2.id), + run_name="importer-run-2", + ) + events.emit( + session=session, + message="Run created", + actor=events.UserActor.from_user(importer_user_2), + targets=[events.Target.from_model(importer_run_2)], + ) + importer_job_2 = await create_job( + session=session, + run=importer_run_2, + fleet=fleet, + instance=instance, + ) + events.emit( + session=session, + message="Job assigned to instance", + actor=events.SystemActor(), + targets=[events.Target.from_model(importer_job_2), events.Target.from_model(instance)], + ) + + await session.commit() + + return { + "exporter_user": exporter_user, + "importer_user_1": importer_user_1, + "importer_user_2": importer_user_2, + "exported_fleet": fleet, + } + + @pytest.mark.parametrize("with_filter", [True, False]) + async def test_exporter_user_sees_all_events_targeting_exported_fleet( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + with_filter: bool, + ) -> None: + filters = {} + if with_filter: + filters = {"within_fleets": [str(exported_fleet_setup["exported_fleet"].id)]} + resp = await client.post( + "/api/events/list", + 
headers=get_auth_headers(exported_fleet_setup["exporter_user"].token), + json={"ascending": True, **filters}, + ) + resp.raise_for_status() + assert resp.json()[0]["message"] == "Fleet created" + assert resp.json()[1]["message"] == "Instance created" + assert resp.json()[2]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[2]["targets"]} == { + "exported-fleet-0", + "importer-run-1-0-0", + } + assert resp.json()[3]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[3]["targets"]} == { + "exported-fleet-0", + "importer-run-2-0-0", + } + assert len(resp.json()) == 4 + + @pytest.mark.parametrize( + ("user_key", "job_name"), + [ + ("importer_user_1", "importer-run-1-0-0"), + ("importer_user_2", "importer-run-2-0-0"), + ], + ) + async def test_importer_user_sees_only_events_about_their_own_run( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + user_key: str, + job_name: str, + ) -> None: + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(exported_fleet_setup[user_key].token), + json={"ascending": True}, + ) + resp.raise_for_status() + assert resp.json()[0]["message"] == "Run created" + assert resp.json()[1]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[1]["targets"]} == {"exported-fleet-0", job_name} + assert len(resp.json()) == 2 + + @pytest.mark.parametrize( + ("user_key", "job_name"), + [ + ("importer_user_1", "importer-run-1-0-0"), + ("importer_user_2", "importer-run-2-0-0"), + ], + ) + async def test_importer_user_can_filter_by_imported_fleet( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + user_key: str, + job_name: str, + ) -> None: + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(exported_fleet_setup[user_key].token), + json={ + "ascending": True, + "within_fleets": [str(exported_fleet_setup["exported_fleet"].id)], + }, + ) + 
resp.raise_for_status() + assert resp.json()[0]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[0]["targets"]} == {"exported-fleet-0", job_name} + assert len(resp.json()) == 1 diff --git a/src/tests/_internal/server/routers/test_exports.py b/src/tests/_internal/server/routers/test_exports.py new file mode 100644 index 0000000000..907dab95aa --- /dev/null +++ b/src/tests/_internal/server/routers/test_exports.py @@ -0,0 +1,1413 @@ +from typing import Optional + +import pytest +from httpx import AsyncClient +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import ExportModel, ImportModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_backend, + create_export, + create_fleet, + create_gateway, + create_project, + create_user, + get_auth_headers, + get_fleet_spec, + get_ssh_fleet_configuration, +) + +pytestmark = [ + pytest.mark.asyncio, + pytest.mark.usefixtures("test_db"), + pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True), +] + + +class TestCreateExport: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/exports/create", + json={ + "name": "test-export", + "importer_projects": ["OtherProject"], + "exported_fleets": ["fleet1"], + }, + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_admin(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/exports/create", 
+ headers=get_auth_headers(user.token), + json={ + "name": "test-export", + "importer_projects": ["OtherProject"], + "exported_fleets": ["fleet1"], + }, + ) + assert response.status_code == 403 + + async def test_create_global_returns_403_if_not_global_admin( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json={"name": "my-export", "is_global": True}, + ) + assert response.status_code == 403 + + @pytest.mark.parametrize( + ("global_role", "importer_project_role"), + [(GlobalRole.ADMIN, None), (GlobalRole.USER, ProjectRole.ADMIN)], + ) + async def test_creates_export( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + importer_project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + if importer_project_role is not None: + await add_project_member( + session=session, + project=importer_project, + user=user, + project_role=importer_project_role, + ) + await create_fleet( + session=session, + project=project, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + backend = await create_backend(session=session, project_id=project.id) + await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway1" + ) + + response = await client.post( + 
f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json={ + "name": "test-export", + "importer_projects": ["ImporterProject"], + "exported_fleets": ["fleet1"], + "exported_gateways": ["gateway1"], + }, + ) + assert response.status_code == 200 + export_response = response.json() + assert export_response["name"] == "test-export" + assert export_response["is_global"] == False + assert len(export_response["imports"]) == 1 + assert export_response["imports"][0]["project_name"] == "ImporterProject" + assert len(export_response["exported_fleets"]) == 1 + assert export_response["exported_fleets"][0]["name"] == "fleet1" + assert len(export_response["exported_gateways"]) == 1 + assert export_response["exported_gateways"][0]["name"] == "gateway1" + + res = await session.execute(select(ExportModel).where(ExportModel.name == "test-export")) + assert res.scalar() is not None + + async def test_creates_empty_export(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json={ + "name": "empty-export", + }, + ) + assert response.status_code == 200 + export_response = response.json() + assert export_response["name"] == "empty-export" + assert len(export_response["imports"]) == 0 + assert len(export_response["exported_fleets"]) == 0 + + res = await session.execute(select(ExportModel).where(ExportModel.name == "empty-export")) + assert res.scalar() is not None + + async def test_creates_global_export(self, session: AsyncSession, client: AsyncClient): + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project( + session=session, 
name="ExporterProject", owner=admin + ) + await add_project_member( + session=session, project=exporter_project, user=admin, project_role=ProjectRole.ADMIN + ) + project_a = await create_project(session=session, name="ProjectA", owner=admin) + project_b = await create_project(session=session, name="ProjectB", owner=admin) + + response = await client.post( + f"/api/project/{exporter_project.name}/exports/create", + headers=get_auth_headers(admin.token), + json={"name": "my-export", "is_global": True}, + ) + assert response.status_code == 200 + data = response.json() + assert data["is_global"] is True + imported_names = {imp["project_name"] for imp in data["imports"]} + assert imported_names == {project_a.name, project_b.name} + assert exporter_project.name not in imported_names + res = await session.execute(select(func.count()).select_from(ExportModel)) + assert res.scalar_one() == 1 + res = await session.execute(select(func.count()).select_from(ImportModel)) + assert res.scalar_one() == 2 + + @pytest.mark.parametrize( + "body,error", + [ + pytest.param( + { + "name": "test-export", + "importer_projects": ["nonexistent"], + }, + "Projects {'nonexistent'} not found or you are not allowed to add them as importers", + id="nonexistent-project", + ), + pytest.param( + { + "name": "test-export", + "importer_projects": ["NotPermittedProject"], + }, + "Projects {'notpermittedproject'} not found or you are not allowed to add them as importers", + id="not-permitted-project", + ), + pytest.param( + { + "name": "test-export", + "exported_fleets": ["nonexistent-fleet"], + }, + "Fleets {'nonexistent-fleet'} not found in project 'ExporterProject'", + id="nonexistent-fleet", + ), + pytest.param( + { + "name": "test-export", + "exported_gateways": ["nonexistent-gateway"], + }, + "Gateways {'nonexistent-gateway'} not found in project 'ExporterProject'", + id="nonexistent-gateway", + ), + pytest.param( + { + "name": "test-export", + "importer_projects": [ + "ImporterProject", + 
"iMpOrTeRpRoJeCt", + ], # case-insensitive + }, + "Some importer projects are listed for addition more than once", + id="duplicate-project", + ), + pytest.param( + { + "name": "test-export", + "exported_fleets": ["exported-fleet", "exported-fleet"], + }, + "Some fleets are listed for addition more than once", + id="duplicate-fleet", + ), + pytest.param( + { + "name": "test-export", + "exported_gateways": ["exported-gateway", "exported-gateway"], + }, + "Some gateways are listed for addition more than once", + id="duplicate-gateway", + ), + pytest.param( + { + "name": "test-export", + "exported_fleets": ["cloud-fleet"], + }, + "Fleets ['cloud-fleet'] are cloud fleets. Can only export SSH fleets", + id="cloud-fleet", + ), + pytest.param( + { + "name": "test-export", + "importer_projects": ["eXpOrTeRpRoJeCt"], # case-insensitive + }, + "Project 'ExporterProject' cannot import from itself", + id="self-import", + ), + pytest.param( + { + "name": "", + }, + "Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'", + id="empty-name", + ), + pytest.param( + { + "name": "a" * 256, + }, + "Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'", + id="long-name", + ), + pytest.param( + { + "name": "!@#$", + }, + "Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'", + id="invalid-name", + ), + ], + ) + async def test_rejects_invalid_export( + self, session: AsyncSession, client: AsyncClient, body: dict, error: str + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, name="ExporterProject", owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + await create_fleet( + session=session, + project=project, + 
name="exported-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_fleet(session=session, project=project, name="cloud-fleet") + backend = await create_backend(session=session, project_id=project.id) + await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="exported-gateway", + ) + not_permitted_project = await create_project( + session=session, name="NotPermittedProject", owner=user + ) + await add_project_member( + session=session, + project=not_permitted_project, + user=user, + project_role=ProjectRole.USER, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 400 + assert error in response.json()["detail"][0]["msg"] + res = await session.execute(select(func.count()).select_from(ExportModel)) + assert res.scalar_one() == 0 + + async def test_rejects_invalid_global_export_with_importer_projects( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, name="ExporterProject", owner=user) + response = await client.post( + f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json={ + "name": "test-export", + "is_global": True, + "importer_projects": ["ImporterProject"], + }, + ) + assert response.status_code == 400 + assert ( + "Do not specify any importer projects when creating a global export" + in response.json()["detail"][0]["msg"] + ) + res = await session.execute(select(func.count()).select_from(ExportModel)) + assert res.scalar_one() == 0 + + async def test_rejects_export_on_name_conflict( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, name="Project") + await add_project_member( + 
session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[], + name="test-export", + ) + + response = await client.post( + f"/api/project/{project.name}/exports/create", + headers=get_auth_headers(user.token), + json={"name": "test-export"}, + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["code"] == "resource_exists" + assert ( + response.json()["detail"][0]["msg"] + == "Export 'test-export' already exists in project 'Project'" + ) + res = await session.execute(select(func.count()).select_from(ExportModel)) + assert res.scalar_one() == 1 + + +class TestUpdateExport: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/exports/update", + json={ + "name": "test-export", + }, + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_admin(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json={ + "name": "test-export", + }, + ) + assert response.status_code == 403 + + @pytest.mark.parametrize( + ("global_role", "importer_project_role"), + [(GlobalRole.ADMIN, None), (GlobalRole.USER, ProjectRole.ADMIN)], + ) + async def test_updates_export( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + importer_project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + project = await create_project(session=session, owner=user) + await add_project_member( + 
session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + other_project = await create_project(session=session, name="OtherProject", owner=user) + another_project = await create_project(session=session, name="AnotherProject", owner=user) + fleet1 = await create_fleet( + session=session, + project=project, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + fleet2 = await create_fleet( + session=session, + project=project, + name="fleet2", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + backend = await create_backend(session=session, project_id=project.id) + gateway1 = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway1" + ) + gateway2 = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway2" + ) + export = await create_export( + session=session, + exporter_project=project, + importer_projects=[other_project, another_project], + exported_fleets=[fleet1, fleet2], + exported_gateways=[gateway1, gateway2], + name="test-export", + ) + + new_project1 = await create_project(session=session, name="NewProject1", owner=user) + new_project2 = await create_project(session=session, name="NewProject2", owner=user) + await create_fleet( + session=session, + project=project, + name="fleet3", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_fleet( + session=session, + project=project, + name="fleet4", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway3" + ) + if importer_project_role is not None: + await add_project_member( + session=session, project=new_project1, user=user, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session=session, project=new_project2, user=user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + 
f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json={ + "name": "test-export", + "add_importer_projects": ["NewProject1", "NewProject2"], + "remove_importer_projects": ["AnotherProject"], + "add_exported_fleets": ["fleet3", "fleet4"], + "remove_exported_fleets": ["fleet2"], + "add_exported_gateways": ["gateway3"], + "remove_exported_gateways": ["gateway2"], + }, + ) + assert response.status_code == 200 + export_response = response.json() + + assert export_response["name"] == "test-export" + assert len(export_response["imports"]) == 3 + assert {imp["project_name"] for imp in export_response["imports"]} == { + "OtherProject", + "NewProject1", + "NewProject2", + } + assert len(export_response["exported_fleets"]) == 3 + assert {fleet["name"] for fleet in export_response["exported_fleets"]} == { + "fleet1", + "fleet3", + "fleet4", + } + assert len(export_response["exported_gateways"]) == 2 + assert {g["name"] for g in export_response["exported_gateways"]} == { + "gateway1", + "gateway3", + } + + await session.refresh(export, ["imports", "exported_fleets", "exported_gateways"]) + assert len(export.imports) == 3 + assert len(export.exported_fleets) == 3 + assert len(export.exported_gateways) == 2 + + response = await client.post( + f"/api/project/{project.name}/exports/list", headers=get_auth_headers(user.token) + ) + assert response.status_code == 200 + export_list = response.json() + assert len(export_list) == 1 + export_response["imports"].sort(key=lambda i: i["project_name"]) + export_list[0]["imports"].sort(key=lambda i: i["project_name"]) + export_response["exported_fleets"].sort(key=lambda f: f["name"]) + export_list[0]["exported_fleets"].sort(key=lambda f: f["name"]) + export_response["exported_gateways"].sort(key=lambda g: g["name"]) + export_list[0]["exported_gateways"].sort(key=lambda g: g["name"]) + assert export_list[0] == export_response + + async def test_can_add_same_entities_as_existing_deleted_ones( + self, 
session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + deleted_importer_project = await create_project( + session=session, name="_deleted_ImporterProject", owner=user, deleted=True + ) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + deleted_fleet = await create_fleet( + session=session, + project=project, + name="fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + deleted=True, + ) + fleet = await create_fleet( + session=session, + project=project, + name=deleted_fleet.name, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + export = await create_export( + session=session, + exporter_project=project, + importer_projects=[deleted_importer_project], + exported_fleets=[deleted_fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json={ + "name": "test-export", + "add_importer_projects": ["ImporterProject"], + "add_exported_fleets": ["fleet"], + }, + ) + assert response.status_code == 200 + export_response = response.json() + + assert export_response["name"] == "test-export" + assert len(export_response["imports"]) == 1 + assert export_response["imports"][0]["project_name"] == "ImporterProject" + assert len(export_response["exported_fleets"]) == 1 + assert export_response["exported_fleets"][0]["name"] == "fleet" + assert export_response["exported_fleets"][0]["id"] == str(fleet.id) + + await session.refresh(export, ["imports", "exported_fleets"]) + # deleted imports and fleets are still in the database, just not returned in the response 
+ assert len(export.imports) == 2 + assert len(export.exported_fleets) == 2 + + response = await client.post( + f"/api/project/{project.name}/exports/list", headers=get_auth_headers(user.token) + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0] == export_response + + @pytest.mark.parametrize( + "body,error", + [ + pytest.param( + { + "name": "nonexistent-export", + "add_importer_projects": ["NotImporterProject"], + }, + "Export 'nonexistent-export' not found in project 'ExporterProject'", + id="nonexistent-export", + ), + pytest.param( + { + "name": "test-export", + }, + "No changes specified", + id="no-changes", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": ["nonexistent"], + }, + "Projects {'nonexistent'} not found or you are not allowed to add them as importers", + id="add-nonexistent-project", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": ["NotPermittedProject"], + }, + "Projects {'notpermittedproject'} not found or you are not allowed to add them as importers", + id="add-not-permitted-project", + ), + pytest.param( + { + "name": "test-export", + "add_exported_fleets": ["nonexistent-fleet"], + }, + "Fleets {'nonexistent-fleet'} not found in project 'ExporterProject'", + id="add-nonexistent-fleet", + ), + pytest.param( + { + "name": "test-export", + "add_exported_gateways": ["nonexistent-gateway"], + }, + "Gateways {'nonexistent-gateway'} not found in project 'ExporterProject'", + id="add-nonexistent-gateway", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": ["iMpOrTeRpRoJeCt"], # case-insensitive + }, + "Projects {'importerproject'} are already importing export 'test-export'", + id="add-already-added-project", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": [ + "ImporterProject", + "iMpOrTeRpRoJeCt", + ], # case-insensitive + }, + "Some importer projects are listed for addition more than 
once", + id="add-duplicate-project", + ), + pytest.param( + { + "name": "test-export", + "add_exported_fleets": ["exported-fleet"], + }, + "Fleets {'exported-fleet'} are already exported by export 'test-export'", + id="add-already-added-fleet", + ), + pytest.param( + { + "name": "test-export", + "add_exported_fleets": ["exported-fleet", "exported-fleet"], + }, + "Some fleets are listed for addition more than once", + id="add-duplicate-fleet", + ), + pytest.param( + { + "name": "test-export", + "add_exported_gateways": ["exported-gateway"], + }, + "Gateways {'exported-gateway'} are already exported by export 'test-export'", + id="add-already-added-gateway", + ), + pytest.param( + { + "name": "test-export", + "add_exported_gateways": ["not-exported-gateway", "not-exported-gateway"], + }, + "Some gateways are listed for addition more than once", + id="add-duplicate-gateway", + ), + pytest.param( + { + "name": "test-export", + "add_exported_fleets": ["cloud-fleet"], + }, + "Fleets ['cloud-fleet'] are cloud fleets. 
Can only export SSH fleets", + id="add-cloud-fleet", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": ["eXpOrTeRpRoJeCt"], # case-insensitive + }, + "Project 'ExporterProject' cannot import from itself", + id="add-self-import", + ), + pytest.param( + { + "name": "test-export", + "remove_importer_projects": ["NotImporterProject"], + }, + "Projects {'notimporterproject'} are not importing export 'test-export'", + id="remove-not-added-project", + ), + pytest.param( + { + "name": "test-export", + "remove_importer_projects": ["nonexistent"], + }, + "Projects {'nonexistent'} are not importing export 'test-export'", + id="remove-nonexistent-project", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_fleets": ["not-exported-fleet"], + }, + "Fleets {'not-exported-fleet'} are not exported by export 'test-export'", + id="remove-not-exported-fleet", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_fleets": ["nonexistent-fleet"], + }, + "Fleets {'nonexistent-fleet'} are not exported by export 'test-export'", + id="remove-nonexistent-fleet", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_gateways": ["not-exported-gateway"], + }, + "Gateways {'not-exported-gateway'} are not exported by export 'test-export'", + id="remove-not-exported-gateway", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_gateways": ["nonexistent-gateway"], + }, + "Gateways {'nonexistent-gateway'} are not exported by export 'test-export'", + id="remove-nonexistent-gateway", + ), + pytest.param( + { + "name": "test-export", + "remove_importer_projects": [ + "ImporterProject", + "iMpOrTeRpRoJeCt", + ], # case-insensitive + }, + "Some importer projects are listed for removal more than once", + id="remove-duplicate-project", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_fleets": ["exported-fleet", "exported-fleet"], + }, + "Some fleets are listed for removal more than once", + 
id="remove-duplicate-fleet", + ), + pytest.param( + { + "name": "test-export", + "remove_exported_gateways": ["exported-gateway", "exported-gateway"], + }, + "Some gateways are listed for removal more than once", + id="remove-duplicate-gateway", + ), + pytest.param( + { + "name": "test-export", + "add_importer_projects": ["NotImporterProject"], + "remove_importer_projects": ["NoTiMpOrTeRpRoJeCt"], # case-insensitive + }, + "Projects {'notimporterproject'} are listed for both addition and removal. Cannot add and remove at the same time", + id="add-remove-same-project", + ), + pytest.param( + { + "name": "test-export", + "add_exported_fleets": ["not-exported-fleet"], + "remove_exported_fleets": ["not-exported-fleet"], + }, + "Fleets {'not-exported-fleet'} are listed for both addition and removal. Cannot add and remove at the same time", + id="add-remove-same-fleet", + ), + pytest.param( + { + "name": "test-export", + "add_exported_gateways": ["not-exported-gateway"], + "remove_exported_gateways": ["not-exported-gateway"], + }, + "Gateways {'not-exported-gateway'} are listed for both addition and removal. 
Cannot add and remove at the same time", + id="add-remove-same-gateway", + ), + pytest.param( + { + "name": "test-export", + "set_global": True, + "unset_global": True, + }, + "Cannot set and unset global at the same time", + id="set-and-unset-global", + ), + pytest.param( + { + "name": "test-export", + "unset_global": True, + }, + "The export is already not global", + id="unset-non-global", + ), + pytest.param( + { + "name": "test-export", + "set_global": True, + "add_importer_projects": ["NotImporterProject"], + }, + "Cannot change global status and add/remove importers at the same time", + id="set-global-with-importer-changes", + ), + pytest.param( + { + "name": "test-export", + "unset_global": True, + "remove_importer_projects": ["ImporterProject"], + }, + "Cannot change global status and add/remove importers at the same time", + id="unset-global-with-importer-changes", + ), + ], + ) + async def test_rejects_invalid_update( + self, session: AsyncSession, client: AsyncClient, body: dict, error: str + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, name="ExporterProject", owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + exported_fleet = await create_fleet( + session=session, + project=project, + name="exported-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + backend = await create_backend(session=session, project_id=project.id) + exported_gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + 
name="not-exported-gateway", + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[importer_project], + exported_fleets=[exported_fleet], + exported_gateways=[exported_gateway], + name="test-export", + ) + await create_fleet(session=session, project=project, name="cloud-fleet") + await create_fleet( + session=session, + project=project, + name="not-exported-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + not_importer_project = await create_project( + session=session, name="NotImporterProject", owner=user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=user, + project_role=ProjectRole.ADMIN, + ) + not_permitted_project = await create_project( + session=session, name="NotPermittedProject", owner=user + ) + await add_project_member( + session=session, + project=not_permitted_project, + user=user, + project_role=ProjectRole.USER, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/list", headers=get_auth_headers(user.token) + ) + assert response.status_code == 200 + canonical_exports = response.json() + + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 400 + assert error in response.json()["detail"][0]["msg"] + + response = await client.post( + f"/api/project/{project.name}/exports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == canonical_exports + + async def test_set_global_returns_403_if_not_global_admin( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_export( + session=session, + 
exporter_project=project, + importer_projects=[], + exported_fleets=[], + name="my-export", + ) + + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json={"name": "my-export", "set_global": True}, + ) + assert response.status_code == 403 + + async def test_project_admin_can_unset_global( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[], + name="my-export", + is_global=True, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(user.token), + json={"name": "my-export", "unset_global": True}, + ) + assert response.status_code == 200 + assert response.json()["is_global"] is False + + async def test_set_global(self, session: AsyncSession, client: AsyncClient): + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project( + session=session, name="ExporterProject", owner=admin + ) + already_importing = await create_project( + session=session, name="AlreadyImporting", owner=admin + ) + not_yet_importing = await create_project( + session=session, name="NotYetImporting", owner=admin + ) + export = await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[already_importing], + exported_fleets=[], + name="my-export", + ) + + response = await client.post( + f"/api/project/{exporter_project.name}/exports/update", + headers=get_auth_headers(admin.token), + json={"name": "my-export", "set_global": True}, + ) + assert response.status_code == 200 + data = response.json() + assert data["is_global"] 
is True + imported_names = {imp["project_name"] for imp in data["imports"]} + assert imported_names == {already_importing.name, not_yet_importing.name} + assert exporter_project.name not in imported_names + await session.refresh(export, ["imports"]) + assert len(export.imports) == 2 + + async def test_unset_global_keeps_imports(self, session: AsyncSession, client: AsyncClient): + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project( + session=session, name="ExporterProject", owner=admin + ) + importer = await create_project(session=session, name="ImporterProject", owner=admin) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer], + exported_fleets=[], + name="my-export", + is_global=True, + ) + + response = await client.post( + f"/api/project/{exporter_project.name}/exports/update", + headers=get_auth_headers(admin.token), + json={"name": "my-export", "unset_global": True}, + ) + assert response.status_code == 200 + data = response.json() + assert data["is_global"] is False + # imports still present + assert len(data["imports"]) == 1 + assert data["imports"][0]["project_name"] == importer.name + + async def test_cannot_remove_importer_from_global_export( + self, session: AsyncSession, client: AsyncClient + ): + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project(session=session, owner=admin) + importer = await create_project(session=session, name="ImporterProject", owner=admin) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer], + exported_fleets=[], + name="my-export", + is_global=True, + ) + + response = await client.post( + f"/api/project/{exporter_project.name}/exports/update", + headers=get_auth_headers(admin.token), + json={ + "name": "my-export", + "remove_importer_projects": [importer.name], + }, + ) + assert 
response.status_code == 400 + assert ( + "Cannot remove importers from a global export" in response.json()["detail"][0]["msg"] + ) + + async def test_can_add_missing_importer_to_global_export( + self, session: AsyncSession, client: AsyncClient + ): + """ + Global exports should always be imported in all projects, but in case this invariant + is ever violated (e.g., due to bugs or unforeseen race conditions), adding a missing + importer is still allowed. + """ + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project(session=session, owner=admin) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + name="my-export", + is_global=True, + ) + importer = await create_project(session=session, name="ImporterProject", owner=admin) + + response = await client.post( + f"/api/project/{exporter_project.name}/exports/update", + headers=get_auth_headers(admin.token), + json={ + "name": "my-export", + "add_importer_projects": [importer.name], + }, + ) + assert response.status_code == 200 + export_response = response.json() + assert len(export_response["imports"]) == 1 + assert export_response["imports"][0]["project_name"] == importer.name + + async def test_set_global_already_global_returns_400( + self, session: AsyncSession, client: AsyncClient + ): + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, owner=admin) + await add_project_member( + session=session, project=project, user=admin, project_role=ProjectRole.ADMIN + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[], + name="my-export", + is_global=True, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/update", + headers=get_auth_headers(admin.token), + json={"name": "my-export", "set_global": True}, + ) + assert 
response.status_code == 400 + assert "The export is already global" in response.json()["detail"][0]["msg"] + + +class TestDeleteExport: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/exports/delete", + json={"name": "test-export"}, + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_admin(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/exports/delete", + headers=get_auth_headers(user.token), + json={"name": "test-export"}, + ) + assert response.status_code == 403 + + async def test_deletes_export(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + other_project = await create_project(session=session, name="OtherProject", owner=user) + fleet = await create_fleet( + session=session, + project=project, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[other_project], + exported_fleets=[fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{project.name}/exports/delete", + headers=get_auth_headers(user.token), + json={"name": "test-export"}, + ) + assert response.status_code == 200 + + res = await session.execute(select(ExportModel).where(ExportModel.name == "test-export")) + assert res.scalar() is None + + async def 
test_returns_400_for_nonexistent_export( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + f"/api/project/{project.name}/exports/delete", + headers=get_auth_headers(user.token), + json={"name": "nonexistent-export"}, + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["code"] == "resource_not_exists" + + async def test_project_admin_can_delete_global_export( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + export = await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[], + name="my-export", + is_global=True, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/delete", + headers=get_auth_headers(user.token), + json={"name": export.name}, + ) + assert response.status_code == 200 + + res = await session.execute(select(ExportModel)) + assert res.scalar() is None + + +class TestListExports: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/exports/list", + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_member(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.post( + f"/api/project/{project.name}/exports/list", + headers=get_auth_headers(user.token), + 
) + assert response.status_code == 403 + + @pytest.mark.parametrize( + "global_role, project_role", + [ + (GlobalRole.ADMIN, None), + (GlobalRole.USER, ProjectRole.USER), + ], + ) + async def test_lists_exports( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + project = await create_project(session=session, owner=user) + if project_role: + await add_project_member( + session=session, project=project, user=user, project_role=project_role + ) + + other_project = await create_project(session=session, name="OtherProject", owner=user) + fleet1 = await create_fleet( + session=session, + project=project, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + fleet2 = await create_fleet( + session=session, + project=project, + name="fleet2", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + backend = await create_backend(session=session, project_id=project.id) + gateway1 = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway1" + ) + gateway2 = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id, name="gateway2" + ) + for name, fleet, gateway in ( + ("export1", fleet1, gateway1), + ("export2", fleet2, gateway2), + ): + await create_export( + session=session, + exporter_project=project, + importer_projects=[other_project], + exported_fleets=[fleet], + exported_gateways=[gateway], + name=name, + ) + + response = await client.post( + f"/api/project/{project.name}/exports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + exports = response.json() + assert len(exports) == 2 + exports.sort(key=lambda e: e["name"]) + + assert exports[0]["name"] == "export1" + assert len(exports[0]["imports"]) == 1 + assert exports[0]["imports"][0]["project_name"] == "OtherProject" + assert 
len(exports[0]["exported_fleets"]) == 1 + assert exports[0]["exported_fleets"][0]["name"] == "fleet1" + assert len(exports[0]["exported_gateways"]) == 1 + assert exports[0]["exported_gateways"][0]["name"] == "gateway1" + + assert exports[1]["name"] == "export2" + assert len(exports[1]["imports"]) == 1 + assert exports[1]["imports"][0]["project_name"] == "OtherProject" + assert len(exports[1]["exported_fleets"]) == 1 + assert exports[1]["exported_fleets"][0]["name"] == "fleet2" + assert len(exports[1]["exported_gateways"]) == 1 + assert exports[1]["exported_gateways"][0]["name"] == "gateway2" + + @pytest.mark.parametrize( + "global_role, project_role", + [ + (GlobalRole.ADMIN, None), + (GlobalRole.USER, ProjectRole.USER), + ], + ) + async def test_returns_empty_list_when_no_exports( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + project = await create_project(session=session, owner=user) + if project_role: + await add_project_member( + session=session, project=project, user=user, project_role=project_role + ) + + response = await client.post( + f"/api/project/{project.name}/exports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [] + + async def test_not_includes_deleted_entities(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + deleted_importer_project = await create_project( + session=session, name="DeletedImporterProject", owner=user, deleted=True + ) + fleet = await create_fleet( + session=session, + 
project=project, + name="fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + deleted_fleet = await create_fleet( + session=session, + project=project, + name="deleted-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + deleted=True, + ) + await create_export( + session=session, + exporter_project=project, + importer_projects=[importer_project, deleted_importer_project], + exported_fleets=[fleet, deleted_fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{project.name}/exports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + exports = response.json() + assert len(exports) == 1 + assert exports[0]["name"] == "test-export" + assert len(exports[0]["imports"]) == 1 + assert exports[0]["imports"][0]["project_name"] == "ImporterProject" + assert len(exports[0]["exported_fleets"]) == 1 + assert exports[0]["exported_fleets"][0]["name"] == "fleet" diff --git a/src/tests/_internal/server/routers/test_files.py b/src/tests/_internal/server/routers/test_files.py new file mode 100644 index 0000000000..c83938e71b --- /dev/null +++ b/src/tests/_internal/server/routers/test_files.py @@ -0,0 +1,186 @@ +from unittest.mock import AsyncMock, Mock + +import pytest +from httpx import AsyncClient +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole +from dstack._internal.server.models import FileArchiveModel +from dstack._internal.server.services.storage import BaseStorage +from dstack._internal.server.testing.common import ( + create_file_archive, + create_user, + get_auth_headers, +) + +pytestmark = [ + pytest.mark.asyncio, + pytest.mark.usefixtures("test_db"), + pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True), +] + + +class TestGetArchiveByHash: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + 
"/api/files/get_archive_by_hash", + json={"hash": "blob_hash"}, + ) + assert response.status_code in [401, 403] + + async def test_returns_400_if_archive_does_not_exist( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + response = await client.post( + "/api/files/get_archive_by_hash", + headers=get_auth_headers(user.token), + json={"hash": "blob_hash"}, + ) + assert response.status_code == 400, response.json() + + async def test_returns_archive(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + archive = await create_file_archive( + session=session, user_id=user.id, blob_hash="blob_hash", blob=b"blob_content" + ) + response = await client.post( + "/api/files/get_archive_by_hash", + headers=get_auth_headers(user.token), + json={"hash": archive.blob_hash}, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "id": str(archive.id), + "hash": archive.blob_hash, + } + + +class TestUploadArchive: + file_hash = "blob_hash" + file_content = b"blob_content" + file = (file_hash, file_content) + + @pytest.fixture + def default_storage_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + storage_mock = Mock(spec_set=BaseStorage) + monkeypatch.setattr( + "dstack._internal.server.services.files.get_default_storage", lambda: storage_mock + ) + return storage_mock + + @pytest.fixture + def no_default_storage(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr( + "dstack._internal.server.services.files.get_default_storage", lambda: None + ) + + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/files/upload_archive", + files={"file": self.file}, + ) + assert response.status_code in [401, 403] + + async def test_returns_existing_archive( + self, session: AsyncSession, client: AsyncClient, default_storage_mock: Mock + 
): + user = await create_user(session=session, global_role=GlobalRole.USER) + existing_archive = await create_file_archive( + session=session, user_id=user.id, blob_hash=self.file_hash, blob=b"existing_blob" + ) + response = await client.post( + "/api/files/upload_archive", + headers=get_auth_headers(user.token), + files={"file": self.file}, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "id": str(existing_archive.id), + "hash": self.file_hash, + } + res = await session.execute( + select(FileArchiveModel).where(FileArchiveModel.user_id == user.id) + ) + archive = res.scalar_one() + assert archive.id == existing_archive.id + assert archive.blob_hash == self.file_hash + assert archive.blob == existing_archive.blob + default_storage_mock.upload_archive.assert_not_called() + + @pytest.mark.usefixtures("no_default_storage") + async def test_uploads_archive_to_db(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + response = await client.post( + "/api/files/upload_archive", + headers=get_auth_headers(user.token), + files={"file": self.file}, + ) + assert response.status_code == 200, response.json() + assert response.json()["hash"] == self.file_hash + res = await session.execute( + select(FileArchiveModel).where(FileArchiveModel.user_id == user.id) + ) + archive = res.scalar_one() + assert archive.blob_hash == self.file_hash + assert archive.blob == self.file_content + + async def test_uploads_archive_to_storage( + self, session: AsyncSession, client: AsyncClient, default_storage_mock: Mock + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + response = await client.post( + "/api/files/upload_archive", + headers=get_auth_headers(user.token), + files={"file": self.file}, + ) + assert response.status_code == 200, response.json() + assert response.json()["hash"] == self.file_hash + res = await session.execute( + 
select(FileArchiveModel).where(FileArchiveModel.user_id == user.id) + ) + archive = res.scalar_one() + assert archive.blob_hash == self.file_hash + assert archive.blob is None + default_storage_mock.upload_archive.assert_called_once_with( + str(user.id), self.file_hash, self.file_content + ) + + async def test_handles_race_condition( + self, + monkeypatch: pytest.MonkeyPatch, + session: AsyncSession, + client: AsyncClient, + default_storage_mock: Mock, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + existing_archive = await create_file_archive( + session=session, user_id=user.id, blob_hash=self.file_hash, blob=None + ) + monkeypatch.setattr( + "dstack._internal.server.services.files.get_archive_model_by_hash", + # first call checks if already uploaded (not yet) + # second call refetches after unique constraint violation + AsyncMock(side_effect=[None, existing_archive]), + ) + response = await client.post( + "/api/files/upload_archive", + headers=get_auth_headers(user.token), + files={"file": self.file}, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "id": str(existing_archive.id), + "hash": self.file_hash, + } + res = await session.execute( + select(FileArchiveModel).where(FileArchiveModel.user_id == user.id) + ) + archive = res.scalar_one() + assert archive.id == existing_archive.id + assert archive.blob_hash == self.file_hash + assert archive.blob is None + default_storage_mock.upload_archive.assert_called_once_with( + str(user.id), self.file_hash, self.file_content + ) diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py new file mode 100644 index 0000000000..04d0145dfe --- /dev/null +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -0,0 +1,2570 @@ +import json +from datetime import datetime, timezone +from typing import Any, Literal, Optional, Union +from unittest.mock import Mock, patch +from uuid import uuid4 + 
+import pytest +from freezegun import freeze_time +from httpx import AsyncClient +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import EntityReference +from dstack._internal.core.models.envs import Env +from dstack._internal.core.models.fleets import ( + FleetConfiguration, + FleetNodesSpec, + FleetStatus, + InstanceGroupPlacement, + SSHHostParams, + SSHParams, +) +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceStatus, + InstanceType, + Resources, + SSHKey, +) +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import FleetModel, InstanceModel +from dstack._internal.server.services.fleets import fleet_model_to_fleet +from dstack._internal.server.services.permissions import DefaultPermissions +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_export, + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + default_permissions_context, + get_auth_headers, + get_fleet_configuration, + get_fleet_spec, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_private_key_string, + get_remote_connection_info, + get_ssh_fleet_configuration, +) +from dstack._internal.server.testing.matchers import SomeUUID4Str + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +class TestListFleets: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/fleets/list") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) 
+ async def test_lists_fleets_across_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.ADMIN) + project1 = await create_project(session, name="project1", owner=user) + fleet1_spec = get_fleet_spec() + fleet1_spec.configuration.name = "fleet1" + await create_fleet( + session=session, + project=project1, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + spec=fleet1_spec, + ) + project2 = await create_project(session, name="project2", owner=user) + fleet2_spec = get_fleet_spec() + fleet2_spec.configuration.name = "fleet2" + await create_fleet( + session=session, + project=project2, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + spec=fleet2_spec, + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(user.token), + json={}, + ) + response_json = response.json() + assert response.status_code == 200, response_json + assert len(response_json) == 2 + assert response_json[0]["name"] == "fleet2" + assert response_json[1]["name"] == "fleet1" + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(user.token), + json={"prev_created_at": response_json[0]["created_at"]}, + ) + response_json = response.json() + assert response.status_code == 200, response_json + assert len(response_json) == 1 + assert response_json[0]["name"] == "fleet1" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_admin_cannot_see_others_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user1 = await create_user(session, name="user1", global_role=GlobalRole.USER) + user2 = await create_user(session, name="user2", global_role=GlobalRole.USER) + project1 = await create_project(session, name="project1", owner=user1) + project2 = await create_project(session, name="project2", owner=user2) + await add_project_member( + session=session, 
project=project1, user=user1, project_role=ProjectRole.USER + ) + await add_project_member( + session=session, project=project2, user=user2, project_role=ProjectRole.USER + ) + await create_fleet( + session=session, + project=project1, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await create_fleet( + session=session, + project=project2, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(user1.token), + json={}, + ) + response_json = response.json() + assert response.status_code == 200, response_json + assert len(response_json) == 1 + assert response_json[0]["project_name"] == "project1" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("with_project_name_filter", [True, False]) + async def test_returns_imported_fleet_with_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient, with_project_name_filter: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + 
spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(importer_user.token), + json={ + "include_imported": True, + "project_name": "importer-project" if with_project_name_filter else None, + }, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + assert response_json[1]["name"] == "local-fleet" + assert response_json[1]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_imported_fleet_without_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + "/api/fleets/list", + 
headers=get_auth_headers(importer_user.token), + json={}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "local-fleet" + assert response_json[0]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_imported_fleet_once_when_user_member_of_both_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, name="user", global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project", owner=user) + importer_project = await create_project(session, name="importer-project", owner=user) + await add_project_member( + session=session, + project=exporter_project, + user=user, + project_role=ProjectRole.USER, + ) + await add_project_member( + session=session, + project=importer_project, + user=user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="shared-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-exporter-fleet")), + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-importer-fleet")), + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + 
assert len(response_json) == 3 + assert response_json[0]["name"] == "local-exporter-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[1]["name"] == "local-importer-fleet" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[2]["name"] == "shared-fleet" + assert response_json[2]["project_name"] == "exporter-project" + assert len(response_json[2]["instances"]) == 1 + assert response_json[2]["instances"][0]["id"] == str(instance.id) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_fleet_once_if_imported_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert 
len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + + +class TestListProjectFleets: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/fleets/list") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_fleets(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet( + session=session, + project=project, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + response = await client.post( + f"/api/project/{project.name}/fleets/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(fleet.id), + "name": fleet.name, + "project_name": project.name, + "spec": json.loads(fleet.spec), + "created_at": "2023-01-02T03:04:00+00:00", + "status": fleet.status.value, + "status_message": None, + "instances": [], + } + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_imported_fleet_with_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await 
create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + assert response_json[1]["name"] == "local-fleet" + assert response_json[1]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_imported_fleet_without_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + 
spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + headers=get_auth_headers(importer_user.token), + json={}, # No include_imported parameter + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "local-fleet" + assert response_json[0]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_fleet_once_if_imported_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + 
headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + + +class TestGetFleet: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/fleets/get") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_403_on_nonexistent_fleet_in_foreign_project( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + await create_project(session, name="test-project") + user = await create_user(session, global_role=GlobalRole.USER) # not a project member + if by_id: + body = {"id": str(uuid4())} + else: + body = {"name": "nonexistent"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_403_on_deleted_fleet_in_foreign_project( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + project = await create_project(session, name="test-project") + user = await create_user(session, global_role=GlobalRole.USER) # not a project member + fleet = await create_fleet( + session=session, project=project, deleted=True, 
name="deleted-fleet" + ) + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "deleted-fleet"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("deleted", [False, True]) + async def test_returns_fleet_by_id( + self, test_db, session: AsyncSession, client: AsyncClient, deleted: bool + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet( + session=session, + project=project, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + deleted=deleted, + ) + response = await client.post( + f"/api/project/{project.name}/fleets/get", + headers=get_auth_headers(user.token), + json={"id": str(fleet.id)}, + ) + assert response.status_code == 200 + assert response.json() == { + "id": str(fleet.id), + "name": fleet.name, + "project_name": project.name, + "spec": json.loads(fleet.spec), + "created_at": "2023-01-02T03:04:00+00:00", + "status": fleet.status.value, + "status_message": None, + "instances": [], + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_not_deleted_fleet_by_name( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + active_fleet = await create_fleet( + session=session, + project=project, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + fleet_id=uuid4(), + ) + deleted_fleet = await 
create_fleet( + session=session, + project=project, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + fleet_id=uuid4(), + deleted=True, + ) + assert active_fleet.name == deleted_fleet.name + assert active_fleet.id != deleted_fleet.id + response = await client.post( + f"/api/project/{project.name}/fleets/get", + headers=get_auth_headers(user.token), + json={"name": active_fleet.name}, + ) + assert response.status_code == 200 + assert response.json() == { + "id": str(active_fleet.id), + "name": active_fleet.name, + "project_name": project.name, + "spec": json.loads(active_fleet.spec), + "created_at": "2023-01-02T03:04:00+00:00", + "status": active_fleet.status.value, + "status_message": None, + "instances": [], + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_by_name_if_fleet_deleted( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project, deleted=True) + response = await client.post( + f"/api/project/{project.name}/fleets/get", + headers=get_auth_headers(user.token), + json={"name": fleet.name}, + ) + assert response.status_code == 400 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_by_name_if_fleet_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/fleets/get", + 
headers=get_auth_headers(user.token), + json={"name": "some_fleet"}, + ) + assert response.status_code == 400 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_foreign_fleet_to_global_admin( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + admin = await create_user(session, global_role=GlobalRole.ADMIN) + project = await create_project(session, name="test-project") + fleet = await create_fleet(session=session, project=project, name="test-fleet") + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "test-fleet"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(admin.token), + json=body, + ) + assert response.status_code == 200 + assert response.json()["name"] == "test-fleet" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( 
+ session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "exported-fleet"} + response = await client.post( + "/api/project/exporter-project/fleets/get", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200 + assert response.json()["id"] == str(fleet.id) + assert response.json()["name"] == "exported-fleet" + assert response.json()["project_name"] == "exporter-project" + assert len(response.json()["instances"]) == 1 + assert response.json()["instances"][0]["id"] == str(instance.id) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_403_on_foreign_fleet_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=importer_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + 
exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "exported-fleet"} + response = await client.post( + "/api/project/exporter-project/fleets/get", + headers=get_auth_headers(not_importer_user.token), + json=body, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "client_version,expected_fleets", + [ + ( + "0.20.13", + [ + "my-fleet", + "other-project/other-fleet", + ], + ), + ( + "0.20.14", + [ + {"project": None, "name": "my-fleet"}, + {"project": "other-project", "name": "other-fleet"}, + ], + ), + ( + None, + [ + {"project": None, "name": "my-fleet"}, + {"project": "other-project", "name": "other-fleet"}, + ], + ), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_profile_fleets_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_fleets: list, + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + + fleets: list[Union[EntityReference, str]] = [ + EntityReference(project=None, name="my-fleet"), + EntityReference(project="other-project", name="other-fleet"), + ] + spec = get_fleet_spec( + profile=Profile(fleets=fleets), + ) + fleet = await create_fleet(session=session, project=project, spec=spec) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/fleets/get", + headers=headers, + json={"id": str(fleet.id)}, + ) + + assert response.status_code == 200 + assert response.json()["spec"]["profile"]["fleets"] == expected_fleets + + +class TestApplyFleetPlan: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = 
await client.post("/api/project/main/fleets/apply") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_fleet(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + spec = get_fleet_spec(conf=get_fleet_configuration()) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 200 + assert response.json() == { + "id": SomeUUID4Str(), + "name": spec.configuration.name, + "project_name": project.name, + "spec": { + "configuration_path": spec.configuration_path, + "configuration": { + "nodes": {"min": 1, "max": 1}, + "placement": None, + "env": {}, + "ssh_config": None, + "resources": None, + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_price": None, + "idle_duration": None, + "type": "fleet", + "name": "test-fleet", + "reservation": None, + "blocks": 1, + "tags": None, + "backend_options": None, + }, + "profile": { + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_duration": None, + "stop_duration": None, + "max_price": None, + "creation_policy": None, + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "name": "", + "default": False, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + }, + "autocreated": 
False, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "status": "active", + "status_message": None, + "instances": [ + { + "id": SomeUUID4Str(), + "project_name": project.name, + "name": f"{spec.configuration.name}-0", + "fleet_id": SomeUUID4Str(), + "fleet_name": spec.configuration.name, + "instance_num": 0, + "job_name": None, + "hostname": None, + "status": "pending", + "unreachable": False, + "health_status": "healthy", + "termination_reason": None, + "termination_reason_message": None, + "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, + "backend": None, + "region": None, + "availability_zone": None, + "instance_type": None, + "price": None, + "total_blocks": 1, + "busy_blocks": 0, + } + ], + } + for instance in response.json()["instances"]: + assert instance["fleet_id"] == response.json()["id"] + res = await session.execute(select(FleetModel)) + assert res.scalar_one() + res = await session.execute(select(InstanceModel)) + assert res.unique().scalar_one() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + conf = get_ssh_fleet_configuration( + name="test-ssh-fleet", + user="ubuntu", + ssh_key=SSHKey(public="", private=get_private_key_string()), + hosts=["1.1.1.1"], + network=None, + ) + spec = get_fleet_spec(conf=conf) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "id": SomeUUID4Str(), + "name": 
spec.configuration.name, + "project_name": project.name, + "spec": { + "configuration_path": spec.configuration_path, + "configuration": { + "env": {}, + "ssh_config": { + "user": "ubuntu", + "port": None, + "identity_file": None, + "ssh_key": None, # should not return ssh_key + "proxy_jump": None, + "hosts": ["1.1.1.1"], + "network": None, + }, + "nodes": None, + "placement": None, + "resources": None, + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_price": None, + "idle_duration": None, + "type": "fleet", + "name": spec.configuration.name, + "reservation": None, + "blocks": 1, + "tags": None, + "backend_options": None, + }, + "profile": { + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_duration": None, + "stop_duration": None, + "max_price": None, + "creation_policy": None, + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "name": "", + "default": False, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + }, + "autocreated": False, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "status": "active", + "status_message": None, + "instances": [ + { + "id": SomeUUID4Str(), + "project_name": project.name, + "backend": "remote", + "instance_type": { + "name": "ssh", + "resources": { + "cpu_arch": None, + "cpus": 2, + "memory_mib": 8, + "gpus": [], + "spot": False, + "disk": {"size_mib": 102400}, + "description": "", + }, + }, + "name": f"{spec.configuration.name}-0", + "fleet_id": SomeUUID4Str(), + "fleet_name": spec.configuration.name, + "instance_num": 0, + "job_name": None, + "hostname": "1.1.1.1", + "status": "pending", + "unreachable": False, + "health_status": "healthy", + "termination_reason": None, + "termination_reason_message": None, + 
"created": "2023-01-02T03:04:00+00:00", + "finished_at": None, + "region": "remote", + "availability_zone": None, + "price": 0.0, + "total_blocks": 1, + "busy_blocks": 0, + } + ], + } + for instance in response.json()["instances"]: + assert instance["fleet_id"] == response.json()["id"] + res = await session.execute(select(FleetModel)) + assert res.scalar_one() + res = await session.execute(select(InstanceModel)) + instance = res.unique().scalar_one() + assert instance.remote_connection_info is not None + + @pytest.mark.parametrize( + ["top_level_blocks", "host_blocks", "host_type", "expected_blocks"], + [ + pytest.param(None, None, str, 1, id="global-default-string"), + pytest.param(None, None, SSHHostParams, 1, id="global-default-object"), + pytest.param(4, None, str, 4, id="top-level-int-string"), + pytest.param(4, None, SSHHostParams, 4, id="top-level-int-object"), + pytest.param("auto", None, str, None, id="top-level-auto-string"), + pytest.param("auto", None, SSHHostParams, None, id="top-level-auto-object"), + pytest.param("auto", 4, SSHHostParams, 4, id="host-level-int"), + pytest.param(4, "auto", SSHHostParams, None, id="host-level-auto"), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_ssh_fleet_with_blocks( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + top_level_blocks: Optional[Union[int, Literal["auto"]]], + host_blocks: Optional[Union[int, Literal["auto"]]], + host_type: Union[type[str], type[SSHHostParams]], + expected_blocks: Optional[int], + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + if host_type is str: + host = "1.1.1.1" + elif host_blocks is None: + host = SSHHostParams(hostname="1.1.1.1") + else: + host = SSHHostParams(hostname="1.1.1.1", blocks=host_blocks) + conf 
= get_ssh_fleet_configuration(blocks=top_level_blocks, hosts=[host]) + spec = get_fleet_spec(conf=conf) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 200, response.json() + res = await session.execute(select(FleetModel)) + assert len(res.scalars().all()) == 1 + res = await session.execute(select(InstanceModel)) + instance = res.scalar_one() + assert instance.total_blocks == expected_blocks + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), real_asyncio=True) + async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_conf = get_ssh_fleet_configuration( + name="test-ssh-fleet", + user="ubuntu", + ssh_key=SSHKey(public="", private=get_private_key_string()), + hosts=["10.0.0.100"], + network=None, + ) + current_spec = get_fleet_spec(conf=current_conf) + spec = current_spec.copy(deep=True) + # 10.0.0.100 removed, 10.0.0.101 added + spec.configuration.ssh_config.hosts = ["10.0.0.101"] + + fleet = await create_fleet(session=session, project=project, spec=current_spec) + instance_type = InstanceType( + name="ssh", + resources=Resources(cpus=2, memory_mib=8, gpus=[], spot=False), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + backend=BackendType.REMOTE, + name="test-ssh-fleet-0", + region="remote", + price=0.0, + status=InstanceStatus.IDLE, + offer=get_instance_offer_with_availability( + backend=BackendType.REMOTE, + region="remote", + price=0.0, + ), + 
job_provisioning_data=get_job_provisioning_data( + instance_type=instance_type, + hostname="10.0.0.100", + ), + remote_connection_info=get_remote_connection_info(host="10.0.0.100"), + ) + + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "spec": spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), + }, + "force": False, + }, + ) + + assert response.status_code == 200, response.json() + assert response.json() == { + "id": str(fleet.id), + "name": spec.configuration.name, + "project_name": project.name, + "spec": { + "configuration_path": spec.configuration_path, + "configuration": { + "env": {}, + "ssh_config": { + "user": "ubuntu", + "port": None, + "identity_file": None, + "ssh_key": None, # should not return ssh_key + "proxy_jump": None, + "hosts": ["10.0.0.101"], + "network": None, + }, + "nodes": None, + "placement": None, + "resources": None, + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_price": None, + "idle_duration": None, + "type": "fleet", + "name": spec.configuration.name, + "reservation": None, + "blocks": 1, + "tags": None, + "backend_options": None, + }, + "profile": { + "backends": None, + "regions": None, + "availability_zones": None, + "instance_types": None, + "spot_policy": None, + "retry": None, + "max_duration": None, + "stop_duration": None, + "max_price": None, + "creation_policy": None, + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "name": "", + "default": False, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + }, + "autocreated": False, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "status": "active", + "status_message": None, + "instances": [ + { + "id": str(instance.id), + "project_name": 
project.name, + "backend": "remote", + "instance_type": { + "name": "ssh", + "resources": { + "cpu_arch": None, + "cpus": 2, + "memory_mib": 8, + "gpus": [], + "spot": False, + "disk": {"size_mib": 102400}, + "description": "", + }, + }, + "name": "test-ssh-fleet-0", + "fleet_id": str(fleet.id), + "fleet_name": "test-ssh-fleet", + "instance_num": 0, + "job_name": None, + "hostname": "10.0.0.100", + "status": "terminating", + "unreachable": False, + "health_status": "healthy", + "termination_reason": "terminated_by_user", + "termination_reason_message": None, + "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, + "region": "remote", + "availability_zone": None, + "price": 0.0, + "total_blocks": 1, + "busy_blocks": 0, + }, + { + "id": SomeUUID4Str(), + "project_name": project.name, + "backend": "remote", + "instance_type": { + "name": "ssh", + "resources": { + "cpu_arch": None, + "cpus": 2, + "memory_mib": 8, + "gpus": [], + "spot": False, + "disk": {"size_mib": 102400}, + "description": "", + }, + }, + "name": "test-ssh-fleet-1", + "fleet_id": str(fleet.id), + "fleet_name": "test-ssh-fleet", + "instance_num": 1, + "job_name": None, + "hostname": "10.0.0.101", + "status": "pending", + "unreachable": False, + "health_status": "healthy", + "termination_reason": None, + "termination_reason_message": None, + "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, + "region": "remote", + "availability_zone": None, + "price": 0.0, + "total_blocks": 1, + "busy_blocks": 0, + }, + ], + } + res = await session.execute(select(FleetModel)) + assert res.scalar_one() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + res = await session.execute( + select(InstanceModel).where(InstanceModel.id == response.json()["instances"][1]["id"]) + ) + instance = res.unique().scalar_one() + assert instance.status == InstanceStatus.PENDING + assert instance.remote_connection_info is not None + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_cloud_fleet_nodes_in_place_when_fleet_in_use( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, max=2)) + ) + fleet = await create_fleet(session=session, project=project, spec=current_spec) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user, fleet=fleet) + job = await create_job(session=session, run=run, fleet=fleet) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + job=job, + status=InstanceStatus.BUSY, + instance_num=0, + ) + spec = current_spec.copy(deep=True) + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=3) + + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "spec": spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), + }, + "force": False, + }, + ) + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["id"] == str(fleet.id) + assert response_json["spec"]["configuration"]["nodes"] == {"min": 1, "max": 3} + + await session.refresh(fleet) + await session.refresh(instance) + assert json.loads(fleet.spec)["configuration"]["nodes"] == {"min": 1, "max": 3} + assert instance.status == InstanceStatus.BUSY + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_cloud_fleet_nodes_target_without_changing_instance_count( + self, test_db, session: AsyncSession, 
client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, max=1)) + ) + fleet = await create_fleet(session=session, project=project, spec=current_spec) + spec = current_spec.copy(deep=True) + spec.configuration.nodes = FleetNodesSpec(min=0, target=1, max=1) + + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "spec": spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), + }, + "force": False, + }, + ) + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["id"] == str(fleet.id) + assert response_json["spec"]["configuration"]["nodes"] == { + "min": 0, + "target": 1, + "max": 1, + } + + await session.refresh(fleet) + assert json.loads(fleet.spec)["configuration"]["nodes"] == { + "min": 0, + "target": 1, + "max": 1, + } + res = await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + assert list(res.scalars().all()) == [] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_errors_if_ssh_key_is_bad( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + spec = get_fleet_spec( + conf=FleetConfiguration( + name="test-ssh-fleet", + ssh_config=SSHParams( + user="ubuntu", + ssh_key=SSHKey(public="", 
private="123"), + hosts=["1.1.1.1"], + network=None, + ), + ) + ) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 400 + + @pytest.mark.parametrize( + ["field_name", "field_value"], + [ + pytest.param("backends", [BackendType.AWS], id="backends"), + pytest.param("regions", ["eu-west-1"], id="regions"), + pytest.param("instance_types", ["g6e.24xlarge"], id="instance_types"), + pytest.param("idle_duration", 60, id="idle_duration"), + pytest.param("tags", {}, id="tags"), # falsy value + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_errors_if_ssh_fleet_uses_backend_only_field( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + field_name: str, + field_value: Any, + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + conf = get_ssh_fleet_configuration(name="test-ssh-fleet", hosts=["1.1.1.1"]) + setattr(conf, field_name, field_value) + spec = get_fleet_spec(conf=conf) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 400, response.json() + assert response.json()["detail"][0]["msg"] == ( + f"SSH fleet configuration does not support the following fields: ['{field_name}']" + ) + + @pytest.mark.parametrize( + ["field_name", "field_value"], + [ + pytest.param("env", Env.parse_obj({"K": "V"}), id="env"), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_errors_if_backend_fleet_uses_ssh_only_field( + self, + test_db, + 
session: AsyncSession, + client: AsyncClient, + field_name: str, + field_value: Any, + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + conf = get_fleet_configuration() + setattr(conf, field_name, field_value) + spec = get_fleet_spec(conf=conf) + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 400, response.json() + assert response.json()["detail"][0]["msg"] == ( + f"Backend fleet configuration does not support the following fields: ['{field_name}']" + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_forbids_if_no_permission_to_manage_ssh_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + spec = get_fleet_spec( + conf=FleetConfiguration( + name="test-ssh-fleet", + ssh_config=SSHParams( + user="ubuntu", + ssh_key=SSHKey(public="", private=get_private_key_string()), + hosts=["1.1.1.1"], + network=None, + ), + ) + ) + with default_permissions_context( + DefaultPermissions(allow_non_admins_manage_ssh_fleets=False) + ): + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_apply_plan_on_imported_fleet( + self, test_db, 
session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + spec = get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=spec, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/apply", + headers=get_auth_headers(importer_user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 403 + + +class TestDeleteFleets: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/fleets/delete") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_terminates_fleet_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + ) + fleet.instances.append(instance) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + 
headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 200 + await session.refresh(fleet) + await session.refresh(instance) + assert not fleet.deleted # should not be deleted yet + assert instance.status == InstanceStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_in_use( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + repo = await create_repo( + session=session, + project_id=project.id, + ) + await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + ) + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 400 + await session.refresh(fleet) + assert not fleet.deleted + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_instance_in_use( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + ) + instance = await create_instance( + session=session, + 
project=project, + status=InstanceStatus.BUSY, + job=job, + ) + fleet.instances.append(instance) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 400 + await session.refresh(fleet) + assert not fleet.deleted + assert instance.status == InstanceStatus.BUSY + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + ) + fleet.instances.append(instance) + fleet.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 400 + + await session.refresh(fleet) + await session.refresh(instance) + assert fleet.status != FleetStatus.TERMINATING + assert instance.status != InstanceStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_forbids_if_no_permission_to_manage_ssh_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + spec = get_fleet_spec( + conf=FleetConfiguration( 
+ name="test-ssh-fleet", + ssh_config=SSHParams( + user="ubuntu", + ssh_key=SSHKey(public="", private=get_private_key_string()), + hosts=["1.1.1.1"], + network=None, + ), + ) + ) + fleet = await create_fleet(session=session, project=project, spec=spec) + with default_permissions_context( + DefaultPermissions(allow_non_admins_manage_ssh_fleets=False) + ): + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_delete_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/delete", + headers=get_auth_headers(importer_user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 403 + + +class TestDeleteFleetInstances: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/fleets/delete_instances") + assert response.status_code in [401, 403] + + 
@pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_terminates_fleet_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + instance_num=1, + ) + instance2 = await create_instance( + session=session, + project=project, + instance_num=2, + status=InstanceStatus.IDLE, + ) + instance3 = await create_instance( + session=session, + project=project, + instance_num=3, + status=InstanceStatus.BUSY, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + fleet.instances.append(instance3) + repo = await create_repo( + session=session, + project_id=project.id, + ) + # Run assigned to instance 3. Should not interfere with deleting instance 1. 
+ run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + fleet=fleet, + ) + await create_job( + session=session, + run=run, + instance=instance3, + ) + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 200 + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + + assert instance1.status == InstanceStatus.TERMINATING + assert instance2.status != InstanceStatus.TERMINATING + assert fleet.status != FleetStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_ignores_lock_on_non_selected_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + instance_num=1, + ) + instance2 = await create_instance( + session=session, + project=project, + instance_num=2, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + instance2.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 200 + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status == InstanceStatus.TERMINATING + assert instance2.status != InstanceStatus.TERMINATING + assert 
fleet.status != FleetStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_selected_instance_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + instance_num=1, + ) + instance2 = await create_instance( + session=session, + project=project, + instance_num=2, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + instance1.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 400 + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status != InstanceStatus.TERMINATING + assert instance2.status != InstanceStatus.TERMINATING + assert fleet.status != FleetStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_deleting_busy_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await 
create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + ) + instance = await create_instance( + session=session, + project=project, + instance_num=1, + status=InstanceStatus.BUSY, + job=job, + ) + fleet.instances.append(instance) + await session.commit() + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 400 + await session.refresh(fleet) + await session.refresh(instance) + + assert instance.status != InstanceStatus.TERMINATING + assert fleet.status != FleetStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + instance_num=1, + ) + fleet.instances.append(instance) + fleet.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 400 + + await session.refresh(fleet) + await session.refresh(instance) + assert fleet.status != FleetStatus.TERMINATING + assert instance.status != InstanceStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_importer_member_cannot_delete_imported_fleet_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + instance_num=1, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/delete_instances", + headers=get_auth_headers(importer_user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_rejects_deleting_placeholder_instance( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + await create_instance( + session=session, + project=project, + fleet=fleet, + instance_num=0, + status=InstanceStatus.PENDING, + provisioning_job_id=uuid4(), + offer=None, + job_provisioning_data=None, + ) + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + 
headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [0]}, + ) + assert response.status_code == 400 + assert "provisioning" in response.text + + +class TestGetPlan: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/fleets/get_plan") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_create_plan_for_new_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + offers = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + ] + spec = get_fleet_spec() + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = offers + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + backend_mock.compute.return_value.get_offers.assert_called_once() + + assert response.status_code == 200 + assert response.json() == { + "project_name": project.name, + "user": user.name, + "spec": json.loads(spec.json()), + "effective_spec": json.loads(spec.json()), + "current_resource": None, + "offers": [json.loads(o.json()) for o in offers], + "total_offers": len(offers), + "max_offer_price": 1.0, + "action": 
"create", + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_offers_for_elastic_container_backend_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + offer = get_instance_offer_with_availability( + backend=BackendType.RUNPOD, + region="US-OR-1", + price=0.7185, + ) + spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, max=1)) + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + backend_mock.compute.return_value.get_offers.return_value = [offer] + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + backend_mock.compute.return_value.get_offers.assert_called_once() + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["offers"] == [json.loads(offer.json())] + assert response_json["total_offers"] == 1 + assert response_json["max_offer_price"] == offer.price + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_no_offers_for_non_elastic_container_backend_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + offer = get_instance_offer_with_availability( + 
backend=BackendType.RUNPOD, + region="US-OR-1", + price=0.7185, + ) + spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=1, max=1)) + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.RUNPOD + backend_mock.compute.return_value.get_offers.return_value = [offer] + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + backend_mock.compute.return_value.get_offers.assert_called_once() + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["offers"] == [] + assert response_json["total_offers"] == 0 + assert response_json["max_offer_price"] is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_update_plan_for_existing_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + conf = get_ssh_fleet_configuration(hosts=["10.0.0.100"]) + spec = get_fleet_spec(conf=conf) + effective_spec = spec.copy(deep=True) + effective_spec.configuration.ssh_config.ssh_key = None + current_spec = spec.copy(deep=True) + # `hosts` can be updated in-place + current_spec.configuration.ssh_config.hosts = ["10.0.0.100", "10.0.0.101"] + fleet = await create_fleet(session=session, project=project, spec=current_spec) + + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + + assert response.status_code == 200 + assert response.json() == { + 
"project_name": project.name, + "user": user.name, + "spec": spec.dict(), + "effective_spec": effective_spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), + "offers": [], + "total_offers": 0, + "max_offer_price": None, + "action": "update", + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_update_plan_for_existing_cloud_fleet_nodes_update( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, max=1)) + ) + spec = current_spec.copy(deep=True) + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet(session=session, project=project, spec=current_spec) + + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["current_resource"]["id"] == str(fleet.id) + assert response_json["action"] == "update" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_create_plan_for_existing_cloud_fleet_blocks_update( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, 
max=1)) + ) + spec = current_spec.copy(deep=True) + spec.configuration.blocks = 2 + fleet = await create_fleet(session=session, project=project, spec=current_spec) + + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["current_resource"]["id"] == str(fleet.id) + assert response_json["action"] == "create" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_update_plan_for_existing_cloud_fleet_provisioning_fields_update( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + current_spec = get_fleet_spec( + conf=get_fleet_configuration(nodes=FleetNodesSpec(min=0, target=0, max=1)) + ) + spec = current_spec.copy(deep=True) + spec.configuration.backends = [BackendType.AWS] + spec.configuration.regions = ["us-east-1"] + fleet = await create_fleet(session=session, project=project, spec=current_spec) + + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + + response_json = response.json() + assert response.status_code == 200, response_json + assert response_json["current_resource"]["id"] == str(fleet.id) + assert response_json["action"] == "update" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_create_plan_for_existing_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, 
global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + conf = get_ssh_fleet_configuration(placement=InstanceGroupPlacement.ANY) + spec = get_fleet_spec(conf=conf) + effective_spec = spec.copy(deep=True) + effective_spec.configuration.ssh_config.ssh_key = None + current_spec = spec.copy(deep=True) + # `placement` cannot be updated in-place + current_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=current_spec) + + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=get_auth_headers(user.token), + json={"spec": spec.dict()}, + ) + + assert response.status_code == 200 + assert response.json() == { + "project_name": project.name, + "user": user.name, + "spec": spec.dict(), + "effective_spec": effective_spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), + "offers": [], + "total_offers": 0, + "max_offer_price": None, + "action": "create", + } + + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + offers = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-1", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + 
region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ), + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-2", + resources=Resources(cpus=2, memory_mib=1024, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.NO_BALANCE, + ), + ] + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = offers + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=headers, + json={"spec": get_fleet_spec().dict()}, + ) + + assert response.status_code == 200 + offers = response.json()["offers"] + assert len(offers) == 2 + assert offers[0]["availability"] == InstanceAvailability.AVAILABLE.value + assert offers[1]["availability"] == expected_availability.value + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_get_plan_for_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + spec = get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")) + fleet = await create_fleet(session=session, project=exporter_project, spec=spec) + await create_export( + session=session, + exporter_project=exporter_project, + 
importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/get_plan", + headers=get_auth_headers(importer_user.token), + json={"spec": spec.dict()}, + ) + assert response.status_code == 403 + + +def _fleet_model_to_json_dict(fleet: FleetModel) -> dict: + return json.loads(fleet_model_to_fleet(fleet).json()) diff --git a/src/tests/_internal/server/routers/test_gateways.py b/src/tests/_internal/server/routers/test_gateways.py index 3f95ad0121..075d1f6d4a 100644 --- a/src/tests/_internal/server/routers/test_gateways.py +++ b/src/tests/_internal/server/routers/test_gateways.py @@ -1,69 +1,84 @@ -from unittest.mock import Mock, patch +from typing import Any +from unittest.mock import patch import pytest -from fastapi.testclient import TestClient +from httpx import AsyncClient from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.errors import DstackError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app -from dstack._internal.server.services.gateways import ( - gateway_model_to_gateway, - get_project_default_gateway, -) from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( + clear_events, create_backend, + create_export, create_gateway, create_gateway_compute, create_project, create_user, get_auth_headers, + list_events, ) - -client = TestClient(app) +from dstack._internal.server.testing.matchers import SomeUUID4Str class TestListAndGetGateways: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/project/main/gateways/list") - assert response.status_code == 403 + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await 
client.post("/api/project/main/gateways/list") + assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_list(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_list( + self, test_db, session: AsyncSession, client: AsyncClient, legacy_compute: bool + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) backend = await create_backend(session=session, project_id=project.id) - gateway_compute = await create_gateway_compute( - session=session, - backend_id=backend.id, - ) gateway = await create_gateway( session=session, project_id=project.id, backend_id=backend.id, - gateway_compute_id=gateway_compute.id, ) - response = client.post( + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + gateway_compute = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id + ) + await session.commit() + response = await client.post( f"/api/project/{project.name}/gateways/list", headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == [ { + "id": SomeUUID4Str(), + "project_name": project.name, "backend": backend.type.value, "created_at": response.json()[0]["created_at"], "default": False, "status": "submitted", "status_message": None, - "instance_id": gateway_compute.instance_id, - "ip_address": gateway_compute.ip_address, - "hostname": gateway_compute.ip_address, + "replicas": [ + { + "hostname": gateway_compute.ip_address, + "replica_num": 0, + "backend": backend.type.value, + "region": "us", + "created_at": 
response.json()[0]["replicas"][0]["created_at"], + } + ], + "instance_id": None, + "ip_address": None, + "hostname": None, "name": gateway.name, "region": gateway.region, "wildcard_domain": gateway.wildcard_domain, @@ -72,47 +87,69 @@ async def test_list(self, test_db, session: AsyncSession): "name": gateway.name, "backend": backend.type.value, "region": gateway.region, + "instance_type": None, + "router": None, "domain": gateway.wildcard_domain, "default": False, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } ] @pytest.mark.asyncio - async def test_get(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("legacy_compute", [False, True]) + async def test_get( + self, test_db, session: AsyncSession, client: AsyncClient, legacy_compute: bool + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) backend = await create_backend(session, project.id) - gateway_compute = await create_gateway_compute( - session=session, - backend_id=backend.id, - ) gateway = await create_gateway( session=session, project_id=project.id, backend_id=backend.id, - gateway_compute_id=gateway_compute.id, ) - response = client.post( + if legacy_compute: + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway.gateway_compute_id = gateway_compute.id # pre-0.20.25 relationship style + else: + gateway_compute = await create_gateway_compute( + session=session, backend_id=backend.id, gateway_id=gateway.id + ) + await session.commit() + response = await client.post( f"/api/project/{project.name}/gateways/get", json={"name": gateway.name}, headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), + 
"project_name": project.name, "backend": backend.type.value, "created_at": response.json()["created_at"], "default": False, "status": "submitted", "status_message": None, - "instance_id": gateway_compute.instance_id, - "ip_address": gateway_compute.ip_address, - "hostname": gateway_compute.ip_address, + "replicas": [ + { + "hostname": gateway_compute.ip_address, + "replica_num": 0, + "backend": backend.type.value, + "region": "us", + "created_at": response.json()["replicas"][0]["created_at"], + } + ], + "instance_id": None, + "ip_address": None, + "hostname": None, "name": gateway.name, "region": gateway.region, "wildcard_domain": gateway.wildcard_domain, @@ -121,65 +158,340 @@ async def test_get(self, test_db, session: AsyncSession): "name": gateway.name, "backend": backend.type.value, "region": gateway.region, + "instance_type": None, + "router": None, "domain": gateway.wildcard_domain, "default": False, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } @pytest.mark.asyncio - async def test_get_missing(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_list_legacy_client_populates_compat_fields( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Old clients (< 0.20.25) get ip_address/instance_id/hostname back-filled.""" + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + ) + gateway_compute = await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + response = await client.post( + f"/api/project/{project.name}/gateways/list", + 
headers={**get_auth_headers(user.token), "x-api-version": "0.20.24"}, + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + gw = response.json()[0] + assert gw["ip_address"] == gateway_compute.ip_address + assert gw["instance_id"] == "" + assert gw["hostname"] == gateway_compute.ip_address + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_list_non_member_public_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session, is_public=True) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + response = await client.post( + f"/api/project/{project.name}/gateways/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["name"] == gateway.name + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_get_non_member_public_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session, is_public=True) + backend = await create_backend(session, project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + response = await client.post( + f"/api/project/{project.name}/gateways/get", + json={"name": gateway.name}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert 
response.json()["name"] == gateway.name + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_get_missing(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/get", json={"name": "missing"}, headers=get_auth_headers(user.token), ) assert response.status_code == 400 + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_list_returns_imported_gateway_with_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{importer_project.name}/gateways/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 
200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-gateway" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_list_not_returns_imported_gateway_without_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{importer_project.name}/gateways/list", + headers=get_auth_headers(importer_user.token), + json={}, + ) + assert response.status_code == 200 + assert response.json() == [] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_get_returns_imported_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + 
await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/gateways/get", + headers=get_auth_headers(importer_user.token), + json={"name": "exported-gateway"}, + ) + assert response.status_code == 200 + assert response.json()["name"] == "exported-gateway" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_get_returns_403_on_foreign_gateway_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.USER, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + 
name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/gateways/get", + headers=get_auth_headers(not_importer_user.token), + json={"name": "exported-gateway"}, + ) + assert response.status_code == 403 + class TestCreateGateway: @pytest.mark.asyncio - async def test_only_admin_can_create(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_admin_can_create( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/create", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_create_gateway(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_gateway(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) backend = await create_backend(session, project.id, backend_type=BackendType.AWS) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/create", - json={"name": "test", "backend_type": "aws", "region": "us"}, + json={ + "configuration": { + "type": "gateway", + "name": 
"test", + "backend": "aws", + "region": "us", + }, + }, headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), + "project_name": project.name, "name": "test", "backend": "aws", "region": "us", "status": "submitted", "status_message": None, - "instance_id": "", - "ip_address": "", - "hostname": "", + "replicas": [], + "instance_id": None, + "ip_address": None, + "hostname": None, "wildcard_domain": None, "default": True, "created_at": response.json()["created_at"], @@ -188,15 +500,55 @@ async def test_create_gateway(self, test_db, session: AsyncSession): "name": "test", "backend": backend.type.value, "region": "us", + "instance_type": None, + "router": None, "domain": None, "default": True, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } + events = await list_events(session) + assert events[0].message == "Gateway created. Status: SUBMITTED" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_multi_replica_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_backend(session, project.id, backend_type=BackendType.AWS) + response = await client.post( + f"/api/project/{project.name}/gateways/create", + json={ + "configuration": { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "replicas": 2, + "certificate": None, + }, + }, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json()["configuration"]["replicas"] == 2 + assert response.json()["replicas"] == [] # populated later by pipelines + events = await list_events(session) + assert events[0].message 
== "Gateway created. Status: SUBMITTED" @pytest.mark.asyncio - async def test_create_gateway_without_name(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_gateway_without_name( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -205,22 +557,32 @@ async def test_create_gateway_without_name(self, test_db, session: AsyncSession) backend = await create_backend(session, project.id, backend_type=BackendType.AWS) with patch("dstack._internal.server.services.gateways.random_names.generate_name") as g: g.return_value = "random-name" - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/create", - json={"name": None, "backend_type": "aws", "region": "us"}, + json={ + "configuration": { + "type": "gateway", + "name": None, + "backend": "aws", + "region": "us", + }, + }, headers=get_auth_headers(user.token), ) g.assert_called_once() assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), + "project_name": project.name, "name": "random-name", "backend": "aws", "region": "us", "status": "submitted", "status_message": None, - "instance_id": "", - "ip_address": "", - "hostname": "", + "replicas": [], + "instance_id": None, + "ip_address": None, + "hostname": None, "wildcard_domain": None, "default": True, "created_at": response.json()["created_at"], @@ -229,53 +591,196 @@ async def test_create_gateway_without_name(self, test_db, session: AsyncSession) "name": "random-name", "backend": backend.type.value, "region": "us", + "instance_type": None, + "router": None, "domain": None, "default": True, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } + events = await list_events(session) + assert events[0].message == "Gateway 
created. Status: SUBMITTED" @pytest.mark.asyncio - async def test_create_gateway_missing_backend(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_gateway_missing_backend( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/create", - json={"name": "test", "backend_type": "aws", "region": "us"}, + json={ + "configuration": { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + }, + }, headers=get_auth_headers(user.token), ) assert response.status_code == 400 - -class TestDefaultGateway: @pytest.mark.asyncio - async def test_get_default_gateway(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_gateway_with_valid_domain_interpolation( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) - backend = await create_backend(session, project.id) - gateway = await create_gateway(session, project.id, backend.id) - async with session.begin(): - project.default_gateway_id = gateway.id - session.add(project) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_backend(session, project.id, backend_type=BackendType.AWS) + response = await client.post( + f"/api/project/{project.name}/gateways/create", + json={ + "configuration": { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "domain": "${{ run.project_name }}.example.com", + }, + }, + 
headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 - res = await get_project_default_gateway(session, project) - assert res is not None - assert res.dict() == gateway_model_to_gateway(gateway).dict() + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_create_gateway_with_invalid_domain_interpolation( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_backend(session, project.id, backend_type=BackendType.AWS) + response = await client.post( + f"/api/project/{project.name}/gateways/create", + json={ + "configuration": { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "domain": "${{ run.unknown_variable }}.example.com", + }, + }, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 400 @pytest.mark.asyncio - async def test_default_gateway_is_missing(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "configuration, expected_error", + [ + pytest.param( + { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "domain": "${{ run.unknown_variable }}.example.com", + }, + "Cannot interpolate gateway domain name: Failed to interpolate due to missing vars: ['run.unknown_variable']", + id="invalid-domain-interpolation", + ), + pytest.param( + { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "certificate": { + "type": "acm", + "arn": "arn:aws:acm:us-east-1:123456789:certificate/abc", + }, + "replicas": 2, + }, + "Replicated gateways do not support certificates." 
+ " Set either `certificate: null` or `replicas: 1` in the gateway configuration", + id="multi-replica-with-acm-cert", + ), + pytest.param( + { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "certificate": {"type": "lets-encrypt"}, + "replicas": 2, + }, + "Replicated gateways do not support certificates." + " Set either `certificate: null` or `replicas: 1` in the gateway configuration", + id="multi-replica-with-letsencrypt-cert", + ), + pytest.param( + { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "certificate": None, + "router": {"type": "sglang"}, + "replicas": 2, + }, + "The deprecated `router` property is not supported for multi-replica gateways", + id="multi-replica-with-router", + ), + pytest.param( + { + "type": "gateway", + "name": "test", + "backend": "aws", + "region": "us", + "certificate": None, + "replicas": 4, + }, + "Cannot provision 4 gateway replicas. This server allows at most 3", + id="replicas-exceed-max", + ), + ], + ) + async def test_invalid_configuration_rejected( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + configuration: dict[str, Any], + expected_error: str, + ): + user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) - backend = await create_backend(session, project.id) - await create_gateway(session, project.id, backend.id) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_backend(session, project.id, backend_type=BackendType.AWS) + response = await client.post( + f"/api/project/{project.name}/gateways/create", + json={"configuration": configuration}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["msg"] == expected_error - res = await get_project_default_gateway(session, project) - assert res is None +class TestDefaultGateway: @pytest.mark.asyncio - 
async def test_only_admin_can_set_default_gateway(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_admin_can_set_default_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -283,7 +788,7 @@ async def test_only_admin_can_set_default_gateway(self, test_db, session: AsyncS ) backend = await create_backend(session, project.id) gateway = await create_gateway(session, project.id, backend.id) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_default", json={"name": gateway.name}, headers=get_auth_headers(user.token), @@ -291,45 +796,58 @@ async def test_only_admin_can_set_default_gateway(self, test_db, session: AsyncS assert response.status_code == 403 @pytest.mark.asyncio - async def test_set_default_gateway(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_set_default_gateway(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) backend = await create_backend(session, project.id) - gateway_compute = await create_gateway_compute( + gateway = await create_gateway( session=session, + project_id=project.id, backend_id=backend.id, + name="first_gateway", ) - gateway = await create_gateway( + gateway_compute = await create_gateway_compute( session=session, - project_id=project.id, backend_id=backend.id, - gateway_compute_id=gateway_compute.id, + gateway_id=gateway.id, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_default", json={"name": 
gateway.name}, headers=get_auth_headers(user.token), ) assert response.status_code == 200 - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/get", json={"name": gateway.name}, headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), + "project_name": project.name, "backend": backend.type.value, "created_at": response.json()["created_at"], "default": True, "status": "submitted", "status_message": None, - "instance_id": gateway_compute.instance_id, - "ip_address": gateway_compute.ip_address, - "hostname": gateway_compute.ip_address, + "replicas": [ + { + "hostname": gateway_compute.ip_address, + "replica_num": 0, + "backend": backend.type.value, + "region": "us", + "created_at": response.json()["replicas"][0]["created_at"], + } + ], + "instance_id": None, + "ip_address": None, + "hostname": None, "name": gateway.name, "region": gateway.region, "wildcard_domain": gateway.wildcard_domain, @@ -338,44 +856,214 @@ async def test_set_default_gateway(self, test_db, session: AsyncSession): "name": gateway.name, "backend": backend.type.value, "region": gateway.region, + "instance_type": None, + "router": None, "domain": gateway.wildcard_domain, "default": True, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway set as project default" + + second_gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="second_gateway", + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=second_gateway.id, + ) + await clear_events(session) + response = await client.post( + f"/api/project/{project.name}/gateways/set_default", + json={"name": second_gateway.name}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 
200 + events = await list_events(session) + assert len(events) == 2 + actual_events = [({t.entity_name for t in e.targets}, e.message) for e in events] + expected_events = [ + ({"first_gateway", project.name}, "Gateway unset as project default"), + ({"second_gateway", project.name}, "Gateway set as project default"), + ] + assert ( + actual_events == expected_events + # in case events are emitted exactly at the same time + or actual_events == expected_events[::-1] + ) @pytest.mark.asyncio - async def test_set_default_gateway_missing(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_set_default_gateway_missing( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_default", json={"name": "missing"}, headers=get_auth_headers(user.token), ) assert response.status_code == 400 + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_set_default_imported_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + 
project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/gateways/set_default", + headers=get_auth_headers(importer_user.token), + json={"name": gateway.name}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_set_imported_gateway_as_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{importer_project.name}/gateways/set_default", + headers=get_auth_headers(importer_user.token), + json={"name": gateway.name, "gateway_project": exporter_project.name}, + ) + assert response.status_code == 200 + await session.refresh(importer_project) + assert importer_project.default_gateway_id == gateway.id + events = await 
list_events(session) + assert any(e.message == "Gateway set as project default" for e in events) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_cannot_set_non_imported_foreign_gateway_as_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{not_importer_project.name}/gateways/set_default", + headers=get_auth_headers(not_importer_user.token), + json={"name": gateway.name, "gateway_project": exporter_project.name}, + ) + assert response.status_code == 400 + class TestDeleteGateway: @pytest.mark.asyncio - async def test_only_admin_can_delete(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_admin_can_delete( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, 
user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/delete", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_delete_gateway(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_marks_gateways_to_be_deleted( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -383,154 +1071,238 @@ async def test_delete_gateway(self, test_db, session: AsyncSession): ) backend_aws = await create_backend(session, project.id) backend_gcp = await create_backend(session, project.id, backend_type=BackendType.GCP) - gateway_compute_aws = await create_gateway_compute( - session=session, - backend_id=backend_aws.id, - ) gateway_aws = await create_gateway( session=session, project_id=project.id, backend_id=backend_aws.id, name="gateway-aws", - gateway_compute_id=gateway_compute_aws.id, ) - gateway_compute_gcp = await create_gateway_compute( + gateway_compute_aws = await create_gateway_compute( session=session, - backend_id=backend_gcp.id, + backend_id=backend_aws.id, + gateway_id=gateway_aws.id, ) gateway_gcp = await create_gateway( session=session, project_id=project.id, backend_id=backend_gcp.id, name="gateway-gcp", - gateway_compute_id=gateway_compute_gcp.id, ) - with patch( - "dstack._internal.server.services.gateways.get_project_backend_by_type_or_error" - ) as m: - aws = Mock() - aws.compute.return_value.terminate_gateway.return_value = None # success - gcp = Mock() - gcp.compute.return_value.terminate_gateway.side_effect = DstackError() # fail - - def get_backend(_, backend_type): - return {BackendType.AWS: aws, BackendType.GCP: gcp}[backend_type] - - m.side_effect = get_backend + gateway_compute_gcp = await 
create_gateway_compute( + session=session, + backend_id=backend_gcp.id, + gateway_id=gateway_gcp.id, + ) + response = await client.post( + f"/api/project/{project.name}/gateways/delete", + json={"names": [gateway_aws.name, gateway_gcp.name]}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 - response = client.post( - f"/api/project/{project.name}/gateways/delete", - json={"names": [gateway_aws.name, gateway_gcp.name]}, - headers=get_auth_headers(user.token), - ) - aws.compute.return_value.terminate_gateway.assert_called_once() - gcp.compute.return_value.terminate_gateway.assert_called_once() - assert response.status_code == 200 + await session.refresh(gateway_aws) + await session.refresh(gateway_gcp) + await session.refresh(gateway_compute_aws) + await session.refresh(gateway_compute_gcp) + assert gateway_aws.to_be_deleted is True + assert gateway_gcp.to_be_deleted is True + assert gateway_compute_aws.active is True + assert gateway_compute_aws.deleted is False + assert gateway_compute_gcp.active is True + assert gateway_compute_gcp.deleted is False - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/list", headers=get_auth_headers(user.token), ) assert response.status_code == 200 - assert response.json() == [ - { - "backend": backend_gcp.type.value, - "created_at": response.json()[0]["created_at"], - "default": False, - "status": "submitted", - "status_message": None, - "instance_id": gateway_compute_gcp.instance_id, - "ip_address": gateway_compute_gcp.ip_address, - "hostname": gateway_compute_gcp.ip_address, - "name": gateway_gcp.name, - "region": gateway_gcp.region, - "wildcard_domain": gateway_gcp.wildcard_domain, - "configuration": { - "type": "gateway", - "name": gateway_gcp.name, - "backend": backend_gcp.type.value, - "region": gateway_gcp.region, - "domain": gateway_gcp.wildcard_domain, - "default": False, - "public_ip": True, - "certificate": {"type": "lets-encrypt"}, - }, - } - 
] + assert {g["name"] for g in response.json()} == {"gateway-aws", "gateway-gcp"} + + events = await list_events(session) + assert len(events) == 2 + assert all(e.message == "Gateway marked for deletion" for e in events) + assert {e.targets[0].entity_name for e in events} == {"gateway-aws", "gateway-gcp"} + assert all(e.actor_user_id == user.id for e in events) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_delete_imported_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/gateways/delete", + headers=get_auth_headers(importer_user.token), + json={"names": [gateway.name]}, + ) + assert response.status_code == 403 class TestUpdateGateway: @pytest.mark.asyncio - async def test_only_admin_can_set_wildcard_domain(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_admin_can_set_wildcard_domain( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, 
global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_wildcard_domain", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_set_wildcard_domain(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_set_wildcard_domain(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) backend = await create_backend(session, project.id) - gateway_compute = await create_gateway_compute( + gateway = await create_gateway( session=session, + project_id=project.id, backend_id=backend.id, + wildcard_domain="old.example", ) - gateway = await create_gateway( + gateway_compute = await create_gateway_compute( session=session, - project_id=project.id, backend_id=backend.id, - gateway_compute_id=gateway_compute.id, + gateway_id=gateway.id, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_wildcard_domain", - json={"name": gateway.name, "wildcard_domain": "test.com"}, + json={"name": gateway.name, "wildcard_domain": "new.example"}, headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), + "project_name": project.name, "backend": backend.type.value, "created_at": response.json()["created_at"], "status": "submitted", "status_message": None, "default": False, - "instance_id": gateway_compute.instance_id, - "ip_address": gateway_compute.ip_address, - "hostname": gateway_compute.ip_address, + "replicas": [ + { + 
"hostname": gateway_compute.ip_address, + "replica_num": 0, + "backend": backend.type.value, + "region": "us", + "created_at": response.json()["replicas"][0]["created_at"], + } + ], + "instance_id": None, + "ip_address": None, + "hostname": None, "name": gateway.name, "region": gateway.region, - "wildcard_domain": "test.com", + "wildcard_domain": "new.example", "configuration": { "type": "gateway", "name": gateway.name, "backend": backend.type.value, "region": gateway.region, - "domain": "test.com", + "instance_type": None, + "router": None, + "domain": "new.example", "default": False, "public_ip": True, "certificate": {"type": "lets-encrypt"}, + "tags": None, + "replicas": None, }, } + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message == "Gateway wildcard domain changed 'old.example' -> 'new.example'" + ) @pytest.mark.asyncio - async def test_set_wildcard_domain_missing(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_set_wildcard_domain_missing( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/gateways/set_wildcard_domain", json={"name": "missing", "wildcard_domain": "test.com"}, headers=get_auth_headers(user.token), ) assert response.status_code == 400 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_set_wildcard_domain_on_imported_gateway( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await 
create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + name="exported-gateway", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/gateways/set_wildcard_domain", + headers=get_auth_headers(importer_user.token), + json={"name": gateway.name, "wildcard_domain": "new.example"}, + ) + assert response.status_code == 403 diff --git a/src/tests/_internal/server/routers/test_gpus.py b/src/tests/_internal/server/routers/test_gpus.py new file mode 100644 index 0000000000..a09b99e4e1 --- /dev/null +++ b/src/tests/_internal/server/routers/test_gpus.py @@ -0,0 +1,626 @@ +from typing import Dict, List, Optional +from unittest.mock import Mock, patch + +import gpuhunt +import pytest +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import TaskConfiguration +from dstack._internal.core.models.instances import ( + Gpu, + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceType, + Resources, +) +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + 
create_fleet, + create_project, + create_repo, + create_user, + get_auth_headers, + get_fleet_spec, + get_run_spec, +) + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +# GPU Test Fixtures and Helpers + + +async def gpu_test_setup(session: AsyncSession): + """Common setup for GPU tests: user, project, repo, run_spec.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + return user, project, repo, run_spec + + +def create_gpu_offer( + backend: BackendType, + gpu_name: str, + gpu_memory_mib: int, + price: float, + spot: bool = False, + region: str = "us-west-2", + availability: InstanceAvailability = InstanceAvailability.AVAILABLE, + gpu_count: int = 1, + instance_name: Optional[str] = None, + vendor: gpuhunt.AcceleratorVendor = gpuhunt.AcceleratorVendor.NVIDIA, +) -> InstanceOfferWithAvailability: + """Helper to create GPU offers with sensible defaults.""" + if instance_name is None: + instance_name = f"{gpu_name.lower()}-instance" + + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib, vendor=vendor) for _ in range(gpu_count)] + cpus = max(4, gpu_count * 4) + memory_mib = max(16384, gpu_count * 16384) + + return InstanceOfferWithAvailability( + backend=backend, + instance=InstanceType( + name=instance_name, + resources=Resources(cpus=cpus, memory_mib=memory_mib, spot=spot, gpus=gpus), + ), + region=region, + price=price, + availability=availability, + ) + + +def create_mock_backends_with_offers( + offers_by_backend: Dict[BackendType, List[InstanceOfferWithAvailability]], +) -> List[Mock]: + """Helper to create mocked backends with specific offers.""" + mocked_backends = [] + + for backend_type, offers in 
offers_by_backend.items(): + backend_mock = Mock() + backend_mock.TYPE = backend_type + backend_mock.compute.return_value.get_offers.return_value = offers + mocked_backends.append(backend_mock) + + return mocked_backends + + +async def call_gpus_api( + client: AsyncClient, + project_name: str, + user_token: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None, + client_version: Optional[str] = None, +): + """Helper to call the GPUs API with standard parameters.""" + json_data = {"run_spec": run_spec.dict()} + if group_by is not None: + json_data["group_by"] = group_by + headers = get_auth_headers(user_token) + if client_version is not None: + headers["X-API-Version"] = client_version + + return await client.post( + f"/api/project/{project_name}/gpus/list", + headers=headers, + json=json_data, + ) + + +class TestListGpus: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + run_spec = get_run_spec(run_name="test-run", repo_id="test-repo") + response = await call_gpus_api(client, project.name, user.token, run_spec) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_gpus_without_group_by( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + offer_aws = create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50, spot=False) + offer_runpod = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1" + ) + offers_by_backend = {BackendType.AWS: [offer_aws], BackendType.RUNPOD: [offer_runpod]} + mocked_backends = 
create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api(client, project.name, user.token, run_spec) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) >= 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_filters_gpus_by_multiple_specified_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, _ = await gpu_test_setup(session) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(profile=Profile(backends=[BackendType.AWS])), + name="aws-fleet", + ) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(profile=Profile(backends=[BackendType.RUNPOD])), + name="runpod-fleet", + ) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(profile=Profile(backends=[BackendType.VASTAI])), + name="vastai-fleet", + ) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + fleets=["aws-fleet", "runpod-fleet"], + ), + ) + + offers_by_backend = { + BackendType.AWS: [create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50)], + BackendType.RUNPOD: [ + create_gpu_offer( + BackendType.RUNPOD, + "RTX4090", + 24576, + 0.35, + region="us-east-1", + ) + ], + BackendType.VASTAI: [create_gpu_offer(BackendType.VASTAI, "A100", 81920, 1.20)], + } + mocked_backends = create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api( + client, + project.name, + user.token, + 
run_spec, + group_by=["backend"], + ) + + assert response.status_code == 200 + response_data = response.json() + assert {gpu["backend"] for gpu in response_data["gpus"]} == {"aws", "runpod"} + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_empty_gpus_when_no_offers( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [] + m.return_value = [backend_mock_aws] + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_invalid_group_by_rejected( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that invalid group_by values are properly rejected.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + 
run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["invalid_field"]}, + ) + assert response.status_code == 422 + assert "validation error" in response.text.lower() or "invalid" in response.text.lower() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_region_without_backend_rejected( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + response = await call_gpus_api( + client, project.name, user.token, run_spec, group_by=["region"] + ) + + assert response.status_code == 400 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_exact_aggregation_values( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test exact aggregation values with precise validation (no >= or <=).""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + offer_t4_spot = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.xlarge", + resources=Resources( + cpus=4, + memory_mib=16384, + spot=True, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-west-2", + price=0.30, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_ondemand = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.2xlarge", + 
resources=Resources( + cpus=8, + memory_mib=32768, + spot=False, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-west-2", + price=0.60, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_quota = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.4xlarge", + resources=Resources( + cpus=16, + memory_mib=65536, + spot=True, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-east-1", + price=0.45, + availability=InstanceAvailability.NO_QUOTA, + ) + offer_t4_multi = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.12xlarge", + resources=Resources( + cpus=48, + memory_mib=196608, + spot=False, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + ], + ), + ), + region="us-west-2", + price=2.40, + availability=InstanceAvailability.AVAILABLE, + ) + + offer_runpod_rtx_east = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.75, spot=True, region="us-east-1" + ) + offer_runpod_rtx_eu = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.65, spot=False, region="eu-west-1" + ) + offer_runpod_t4_east = create_gpu_offer( + BackendType.RUNPOD, "T4", 16384, 0.25, spot=True, region="us-east-1" + ) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [ + offer_t4_spot, + offer_t4_ondemand, + offer_t4_quota, + offer_t4_multi, + ] + + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + 
backend_mock_runpod.compute.return_value.get_offers.return_value = [ + offer_runpod_rtx_east, + offer_runpod_rtx_eu, + offer_runpod_t4_east, + ] + + m.return_value = [backend_mock_aws, backend_mock_runpod] + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + assert response.status_code == 200 + data = response.json() + + assert len(data["gpus"]) == 2 + + t4_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "T4"), None) + rtx_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "RTX4090"), None) + + assert t4_gpu is not None + assert rtx_gpu is not None + + assert t4_gpu["price"]["min"] == 0.25 + assert t4_gpu["price"]["max"] == 0.60 + assert set(t4_gpu["backends"]) == {"aws", "runpod"} + + assert rtx_gpu["price"]["min"] == 0.65 + assert rtx_gpu["price"]["max"] == 0.75 + assert set(rtx_gpu["backends"]) == {"runpod"} + + response_count_grouped = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["count"]}, + ) + assert response_count_grouped.status_code == 200 + count_grouped_data = response_count_grouped.json() + + assert len(count_grouped_data["gpus"]) == 3 + + t4_single_group = None + t4_multi_group = None + rtx_single_group = None + + for gpu in count_grouped_data["gpus"]: + if gpu["name"] == "T4" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: + t4_single_group = gpu + elif gpu["name"] == "T4" and gpu["count"]["min"] == 4 and gpu["count"]["max"] == 4: + t4_multi_group = gpu + elif ( + gpu["name"] == "RTX4090" + and gpu["count"]["min"] == 1 + and gpu["count"]["max"] == 1 + ): + rtx_single_group = gpu + + assert t4_single_group is not None + assert t4_multi_group is not None + assert rtx_single_group is not None + + assert t4_single_group["price"]["min"] == 0.25 + assert t4_single_group["price"]["max"] == 0.60 + assert 
t4_multi_group["price"]["min"] == 0.60 + assert t4_multi_group["price"]["max"] == 0.60 + assert rtx_single_group["price"]["min"] == 0.65 + assert rtx_single_group["price"]["max"] == 0.75 + + assert set(t4_single_group["backends"]) == {"aws", "runpod"} + assert set(t4_multi_group["backends"]) == {"aws"} + + response_backend = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["backend"]}, + ) + assert response_backend.status_code == 200 + backend_data = response_backend.json() + + assert len(backend_data["gpus"]) == 3 + + t4_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "runpod" + ), + None, + ) + t4_aws = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "aws" + ), + None, + ) + rtx_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod" + ), + None, + ) + + assert t4_runpod is not None + assert t4_aws is not None + assert rtx_runpod is not None + + assert t4_aws["price"] == {"min": 0.30, "max": 0.60} + assert t4_aws["count"] == {"min": 1, "max": 4} + assert t4_runpod["price"] == {"min": 0.25, "max": 0.25} + assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} + + response_region = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["backend", "region"]}, + ) + assert response_region.status_code == 200 + region_data = response_region.json() + + assert len(region_data["gpus"]) == 5 + + t4_aws_uswest2 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "T4" + and gpu.get("backend") == "aws" + and gpu.get("region") == "us-west-2" + ), + None, + ) + t4_runpod_useast1 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "T4" + and gpu.get("backend") 
== "runpod" + and gpu.get("region") == "us-east-1" + ), + None, + ) + + rtx_runpod_useast1 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "RTX4090" + and gpu.get("backend") == "runpod" + and gpu.get("region") == "us-east-1" + ), + None, + ) + rtx_runpod_euwest1 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "RTX4090" + and gpu.get("backend") == "runpod" + and gpu.get("region") == "eu-west-1" + ), + None, + ) + + assert t4_aws_uswest2 is not None + assert t4_runpod_useast1 is not None + assert rtx_runpod_useast1 is not None + assert rtx_runpod_euwest1 is not None + + assert t4_aws_uswest2["backend"] == "aws" + assert t4_aws_uswest2["region"] == "us-west-2" + assert t4_aws_uswest2["price"]["min"] == 0.30 + assert t4_aws_uswest2["price"]["max"] == 0.60 + + assert t4_runpod_useast1["backend"] == "runpod" + assert t4_runpod_useast1["region"] == "us-east-1" + assert t4_runpod_useast1["price"]["min"] == 0.25 + assert t4_runpod_useast1["price"]["max"] == 0.25 + + assert rtx_runpod_useast1["backend"] == "runpod" + assert rtx_runpod_useast1["region"] == "us-east-1" + assert rtx_runpod_useast1["price"]["min"] == 0.75 + assert rtx_runpod_useast1["price"]["max"] == 0.75 + + assert rtx_runpod_euwest1["backend"] == "runpod" + assert rtx_runpod_euwest1["region"] == "eu-west-1" + assert rtx_runpod_euwest1["price"]["min"] == 0.65 + assert rtx_runpod_euwest1["price"]["max"] == 0.65 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ): + user, 
project, repo, run_spec = await gpu_test_setup(session) + + available_offer = create_gpu_offer( + BackendType.AWS, "T4", 16384, 0.50, availability=InstanceAvailability.AVAILABLE + ) + no_balance_offer = create_gpu_offer( + BackendType.AWS, "L4", 24 * 1024, 1.0, availability=InstanceAvailability.NO_BALANCE + ) + offers_by_backend = {BackendType.AWS: [available_offer, no_balance_offer]} + mocked_backends = create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api( + client, project.name, user.token, run_spec, client_version=client_version + ) + + assert response.status_code == 200 + response_data = response.json() + assert len(response_data["gpus"]) == 2 + assert response_data["gpus"][0]["availability"] == [InstanceAvailability.AVAILABLE.value] + assert response_data["gpus"][1]["availability"] == [expected_availability.value] diff --git a/src/tests/_internal/server/routers/test_imports.py b/src/tests/_internal/server/routers/test_imports.py new file mode 100644 index 0000000000..c162d0d8ec --- /dev/null +++ b/src/tests/_internal/server/routers/test_imports.py @@ -0,0 +1,392 @@ +from typing import Optional + +import pytest +from httpx import AsyncClient +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import ExportModel, ImportModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_backend, + create_export, + create_fleet, + create_gateway, + create_project, + create_user, + get_auth_headers, + get_fleet_spec, + get_ssh_fleet_configuration, +) + +pytestmark = [ + pytest.mark.asyncio, + pytest.mark.usefixtures("test_db"), + pytest.mark.parametrize("test_db", ["sqlite", "postgres"], 
indirect=True), +] + + +class TestDeleteImport: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/imports/delete", + json={"export_name": "test-export", "export_project_name": "ExporterProject"}, + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_admin(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + exporter_project = await create_project( + session=session, name="ExporterProject", owner=user + ) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + # The user is admin of the exporter project, but not of the importer + await add_project_member( + session=session, project=exporter_project, user=user, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{importer_project.name}/imports/delete", + headers=get_auth_headers(user.token), + json={"export_name": "test-export", "export_project_name": "ExporterProject"}, + ) + assert response.status_code == 403 + + async def test_deletes_import(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + exporter_project = await create_project(session=session, name="ExporterProject") + fleet = await create_fleet( + session=session, + project=exporter_project, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + 
exported_fleets=[fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{importer_project.name}/imports/delete", + headers=get_auth_headers(user.token), + json={ + "export_name": "test-export", + "export_project_name": "ExPoRtErPrOjEcT", # case-insensitive + }, + ) + assert response.status_code == 200 + + res = await session.execute(select(func.count()).select_from(ImportModel)) + assert res.scalar_one() == 0 + res = await session.execute(select(func.count()).select_from(ExportModel)) + assert res.scalar_one() == 1 + + async def test_returns_400_for_nonexistent_import( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + + exporter_project = await create_project( + session=session, name="ExporterProject", owner=user + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + name="test-export", + ) + + async def assert_not_found(export_project_name, export_name): + response = await client.post( + f"/api/project/{importer_project.name}/imports/delete", + headers=get_auth_headers(user.token), + json={"export_name": export_name, "export_project_name": export_project_name}, + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["code"] == "resource_not_exists" + # The error should be the same regardless of what wasn't found + # (the exporter, the export, or the import), + # so that users cannot infer the existence of exports they are not given access to. 
+ assert response.json()["detail"][0]["msg"] == ( + f"Import '{export_project_name}/{export_name}' not found in project 'ImporterProject'" + ) + + # Exporter not found + await assert_not_found(export_project_name="WrongProject", export_name="test-export") + # Export not found + await assert_not_found(export_project_name="ExporterProject", export_name="wrong-export") + # Import not found + await assert_not_found(export_project_name="ExporterProject", export_name="test-export") + + async def test_cannot_delete_import_of_global_export( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.ADMIN + ) + + exporter_project = await create_project( + session=session, name="ExporterProject", owner=user + ) + export = await create_export( + session=session, + is_global=True, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + name="test-export", + ) + + response = await client.post( + f"/api/project/{importer_project.name}/imports/delete", + headers=get_auth_headers(user.token), + json={"export_name": export.name, "export_project_name": exporter_project.name}, + ) + assert response.status_code == 400 + assert ( + response.json()["detail"][0]["msg"] + == "'ExporterProject/test-export' is a global export, cannot stop importing" + ) + + +class TestListImports: + async def test_returns_403_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/project/TestProject/imports/list", + ) + assert response.status_code in [401, 403] + + async def test_returns_403_if_not_member(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await 
create_project(session=session, owner=user) + response = await client.post( + f"/api/project/{project.name}/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 403 + + @pytest.mark.parametrize( + "global_role, project_role", + [ + (GlobalRole.ADMIN, None), + (GlobalRole.USER, ProjectRole.USER), + ], + ) + async def test_lists_imports( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + if project_role: + await add_project_member( + session=session, project=importer_project, user=user, project_role=project_role + ) + + exporter_project1 = await create_project( + session=session, name="ExporterProject1", owner=user + ) + exporter_project2 = await create_project( + session=session, name="ExporterProject2", owner=user + ) + fleet1 = await create_fleet( + session=session, + project=exporter_project1, + name="fleet1", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + fleet2 = await create_fleet( + session=session, + project=exporter_project2, + name="fleet2", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + backend1 = await create_backend(session=session, project_id=exporter_project1.id) + gateway1 = await create_gateway( + session=session, + project_id=exporter_project1.id, + backend_id=backend1.id, + name="gateway1", + ) + backend2 = await create_backend(session=session, project_id=exporter_project2.id) + gateway2 = await create_gateway( + session=session, + project_id=exporter_project2.id, + backend_id=backend2.id, + name="gateway2", + ) + await create_export( + session=session, + exporter_project=exporter_project1, + importer_projects=[importer_project], + exported_fleets=[fleet1], + exported_gateways=[gateway1], + name="export1", + ) + await create_export( + 
session=session, + exporter_project=exporter_project2, + importer_projects=[importer_project], + exported_fleets=[fleet2], + exported_gateways=[gateway2], + name="export2", + ) + + response = await client.post( + f"/api/project/{importer_project.name}/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + imports = response.json() + assert len(imports) == 2 + imports.sort(key=lambda i: i["export"]["name"]) + + assert imports[0]["export"]["name"] == "export1" + assert imports[0]["export"]["project_name"] == "ExporterProject1" + assert len(imports[0]["export"]["exported_fleets"]) == 1 + assert imports[0]["export"]["exported_fleets"][0]["name"] == "fleet1" + assert len(imports[0]["export"]["exported_gateways"]) == 1 + assert imports[0]["export"]["exported_gateways"][0]["name"] == "gateway1" + + assert imports[1]["export"]["name"] == "export2" + assert imports[1]["export"]["project_name"] == "ExporterProject2" + assert len(imports[1]["export"]["exported_fleets"]) == 1 + assert imports[1]["export"]["exported_fleets"][0]["name"] == "fleet2" + assert len(imports[1]["export"]["exported_gateways"]) == 1 + assert imports[1]["export"]["exported_gateways"][0]["name"] == "gateway2" + + @pytest.mark.parametrize( + "global_role, project_role", + [ + (GlobalRole.ADMIN, None), + (GlobalRole.USER, ProjectRole.USER), + ], + ) + async def test_returns_empty_list_when_no_imports( + self, + session: AsyncSession, + client: AsyncClient, + global_role: GlobalRole, + project_role: Optional[ProjectRole], + ): + user = await create_user(session=session, global_role=global_role) + project = await create_project(session=session, owner=user) + if project_role: + await add_project_member( + session=session, project=project, user=user, project_role=project_role + ) + + response = await client.post( + f"/api/project/{project.name}/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [] 
+ + async def test_not_includes_deleted_fleets(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + importer_project = await create_project( + session=session, name="ImporterProject", owner=user + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.USER + ) + exporter_project = await create_project( + session=session, name="ExporterProject", owner=user + ) + + fleet = await create_fleet( + session=session, + project=exporter_project, + name="fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + deleted_fleet = await create_fleet( + session=session, + project=exporter_project, + name="deleted-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + deleted=True, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet, deleted_fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{importer_project.name}/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + imports = response.json() + assert len(imports) == 1 + assert imports[0]["export"]["name"] == "test-export" + assert len(imports[0]["export"]["exported_fleets"]) == 1 + assert imports[0]["export"]["exported_fleets"][0]["name"] == "fleet" + + async def test_does_not_include_other_projects_imports( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + other_project = await create_project(session=session, name="OtherProject", owner=user) + exporter_project = await create_project( + session=session, name="ExporterProject", owner=user + ) + + fleet = await 
create_fleet( + session=session, + project=exporter_project, + name="fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[other_project], + exported_fleets=[fleet], + name="test-export", + ) + + response = await client.post( + f"/api/project/{project.name}/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [] diff --git a/src/tests/_internal/server/routers/test_instances.py b/src/tests/_internal/server/routers/test_instances.py new file mode 100644 index 0000000000..439538c14c --- /dev/null +++ b/src/tests/_internal/server/routers/test_instances.py @@ -0,0 +1,837 @@ +import datetime as dt +import uuid +from dataclasses import dataclass +from itertools import count + +import pytest +import pytest_asyncio +from httpx import AsyncClient +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import UserModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_export, + create_fleet, + create_instance, + create_instance_health_check, + create_project, + create_user, + get_auth_headers, + get_fleet_configuration, + get_fleet_spec, + get_ssh_fleet_configuration, +) + + +@dataclass +class PreparedData: + users: list[UserModel] + + +SAMPLE_FLEET_IDS = [uuid.uuid4() for _ in range(3)] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestListInstances: + @pytest_asyncio.fixture + async def data(self, session: AsyncSession) -> PreparedData: + users = [ + await create_user(session, name="user0", global_role=GlobalRole.ADMIN), + await create_user(session, 
name="user1", global_role=GlobalRole.USER), + await create_user(session, name="user2", global_role=GlobalRole.USER), + ] + projects = [ + await create_project(session, owner=users[0], name="project0"), + await create_project(session, owner=users[1], name="project1"), + await create_project(session, owner=users[2], name="project2"), + ] + await add_project_member( + session, project=projects[0], user=users[0], project_role=ProjectRole.ADMIN + ) + await add_project_member( + session, project=projects[1], user=users[1], project_role=ProjectRole.ADMIN + ) + await add_project_member( + session, project=projects[2], user=users[2], project_role=ProjectRole.ADMIN + ) + await add_project_member( + session, project=projects[2], user=users[1], project_role=ProjectRole.USER + ) + fleets = [ + await create_fleet( + session, + projects[0], + spec=get_fleet_spec(conf=get_fleet_configuration("fleet0")), + fleet_id=SAMPLE_FLEET_IDS[0], + ), + await create_fleet( + session, + projects[1], + spec=get_fleet_spec(conf=get_fleet_configuration("fleet1")), + fleet_id=SAMPLE_FLEET_IDS[1], + ), + await create_fleet( + session, + projects[2], + spec=get_fleet_spec(conf=get_fleet_configuration("fleet2")), + fleet_id=SAMPLE_FLEET_IDS[2], + ), + ] + _ = [ + await create_instance( + session=session, + project=projects[0], + fleet=fleets[0], + created_at=dt.datetime(2024, 1, 1, tzinfo=dt.timezone.utc), + name="fleet0-0", + ), + await create_instance( + session=session, + project=projects[1], + fleet=fleets[1], + created_at=dt.datetime(2024, 1, 2, tzinfo=dt.timezone.utc), + name="fleet1-0", + ), + await create_instance( + session=session, + project=projects[2], + fleet=fleets[2], + created_at=dt.datetime(2024, 1, 3, tzinfo=dt.timezone.utc), + name="fleet2-0", + ), + await create_instance( + session=session, + project=projects[2], + fleet=fleets[2], + created_at=dt.datetime(2024, 1, 4, tzinfo=dt.timezone.utc), + instance_num=1, + name="fleet2-1", + status=InstanceStatus.TERMINATED, + ), + ] + 
return PreparedData(users=users) + + @pytest.mark.parametrize( + ("user", "expected_instances"), + [ + pytest.param( + 0, + ["fleet0-0", "fleet1-0", "fleet2-0", "fleet2-1"], + id="global-admin", + ), + pytest.param( + 1, + ["fleet1-0", "fleet2-0", "fleet2-1"], + id="admin-in-one-project-user-in-other", + ), + pytest.param( + 2, + ["fleet2-0", "fleet2-1"], + id="project-admin", + ), + ], + ) + async def test_project_access( + self, user: int, expected_instances: list[str], data: PreparedData, client: AsyncClient + ) -> None: + resp = await client.post( + "/api/instances/list", + headers=get_auth_headers(data.users[user].token), + json={"ascending": True}, + ) + assert resp.status_code == 200 + instances = [instance["name"] for instance in resp.json()] + assert instances == expected_instances + + @pytest.mark.parametrize( + ("filters", "expected_instances"), + [ + pytest.param( + {"project_names": ["project1", "project2"]}, + ["fleet1-0", "fleet2-0", "fleet2-1"], + id="two-projects", + ), + pytest.param( + {"project_names": ["project1"]}, + ["fleet1-0"], + id="one-project", + ), + pytest.param( + {"project_names": ["project0"]}, + [], + id="forbidden-project", + ), + pytest.param( + {"project_names": ["nonexistent"]}, + [], + id="nonexistent-project", + ), + pytest.param( + {"fleet_ids": [str(SAMPLE_FLEET_IDS[1]), str(SAMPLE_FLEET_IDS[2])]}, + ["fleet1-0", "fleet2-0", "fleet2-1"], + id="two-fleets", + ), + pytest.param( + {"fleet_ids": [str(SAMPLE_FLEET_IDS[1])]}, + ["fleet1-0"], + id="one-fleet", + ), + pytest.param( + {"fleet_ids": [str(SAMPLE_FLEET_IDS[0])]}, + [], + id="forbidden-fleet", + ), + pytest.param( + {"fleet_ids": [str(uuid.uuid4())]}, + [], + id="nonexistent-fleet", + ), + pytest.param( + {"project_names": ["project1"], "fleet_ids": [str(SAMPLE_FLEET_IDS[1])]}, + ["fleet1-0"], + id="project-and-fleet-match", + ), + pytest.param( + {"project_names": ["project2"], "fleet_ids": [str(SAMPLE_FLEET_IDS[1])]}, + [], + id="project-and-fleet-no-match", + ), + 
pytest.param( + {"only_active": True, "project_names": ["project2"]}, + ["fleet2-0"], + id="only-active", + ), + ], + ) + async def test_filters( + self, + filters: dict, + expected_instances: list[str], + data: PreparedData, + client: AsyncClient, + ) -> None: + resp = await client.post( + "/api/instances/list", + headers=get_auth_headers(data.users[1].token), + json={"ascending": True, **filters}, + ) + assert resp.status_code == 200 + instances = [instance["name"] for instance in resp.json()] + assert instances == expected_instances + + @pytest.mark.parametrize( + ("is_ascending", "expected_pages"), + [ + pytest.param(True, [["fleet1-0", "fleet2-0"], ["fleet2-1"]], id="ascending"), + pytest.param(False, [["fleet2-1", "fleet2-0"], ["fleet1-0"]], id="descending"), + ], + ) + async def test_pagination( + self, + is_ascending: bool, + expected_pages: list[list[str]], + data: PreparedData, + client: AsyncClient, + ) -> None: + pages = [] + prev_id = None + prev_created_at = None + for page_no in count(): + if page_no == 10: + raise RuntimeError("Too many pages") + resp = await client.post( + "/api/instances/list", + headers=get_auth_headers(data.users[1].token), + json={ + "ascending": is_ascending, + "limit": 2, + "project_names": ["project1", "project2"], + "prev_id": prev_id, + "prev_created_at": prev_created_at, + }, + ) + assert resp.status_code == 200 + page = [] + for instance in resp.json(): + page.append(instance["name"]) + prev_id = instance["id"] + prev_created_at = instance["created"] + if not page: + break + pages.append(page) + assert pages == expected_pages + + async def test_not_authenticated(self, client: AsyncClient, data) -> None: + resp = await client.post("/api/instances/list", json={}) + assert resp.status_code in [401, 403] + + @pytest.mark.parametrize("with_project_name_filter", [True, False]) + async def test_returns_imported_instances_with_include_imported( + self, session: AsyncSession, client: AsyncClient, with_project_name_filter: bool + 
): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, project=exporter_project, fleet=fleet, name="exported-fleet-0" + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + await create_instance( + session=session, project=importer_project, fleet=local_fleet, name="local-fleet-0" + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={ + "include_imported": True, + "project_names": ["importer-project"] if with_project_name_filter else None, + }, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda i: i["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "exported-fleet" + assert response_json[1]["name"] == "local-fleet-0" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[1]["fleet_name"] == "local-fleet" + + async def test_not_returns_imported_instances_without_include_imported( + self, session: AsyncSession, client: AsyncClient + ): + importer_user = await 
create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="exported-fleet-0", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + await create_instance( + session=session, project=importer_project, fleet=local_fleet, name="local-fleet-0" + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={}, # No include_imported + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "local-fleet-0" + assert response_json[0]["project_name"] == "importer-project" + assert response_json[0]["fleet_name"] == "local-fleet" + + async def test_returns_imported_instances_once_when_user_member_of_both_projects( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, name="user", global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project", owner=user) + importer_project = await create_project(session, name="importer-project", owner=user) + await add_project_member( + session=session, + project=exporter_project, + user=user, + 
project_role=ProjectRole.USER, + ) + await add_project_member( + session=session, + project=importer_project, + user=user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="shared-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="shared-fleet-0", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_exporter_fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-exporter-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=local_exporter_fleet, + name="local-exported-fleet-0", + ) + local_importer_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-importer-fleet")), + ) + await create_instance( + session=session, + project=importer_project, + fleet=local_importer_fleet, + name="local-importer-fleet-0", + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda i: i["name"]) + assert len(response_json) == 3 + assert response_json[0]["name"] == "local-exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "local-exporter-fleet" + assert response_json[1]["name"] == "local-importer-fleet-0" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[1]["fleet_name"] == "local-importer-fleet" + assert response_json[2]["name"] == "shared-fleet-0" + assert response_json[2]["project_name"] == "exporter-project" 
+ assert response_json[2]["fleet_name"] == "shared-fleet" + + async def test_returns_instance_once_if_imported_twice( + self, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="exported-fleet-0", + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "exported-fleet" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db") +class TestGetInstanceHealthChecks: + async def test_returns_403_if_not_project_member( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.post( + 
f"/api/project/{project.name}/instances/get_instance_health_checks", + headers=get_auth_headers(user.token), + json={ + "fleet_name": "test", + "instance_num": 0, + }, + ) + assert response.status_code == 403 + + async def test_returns_400_if_instance_not_found( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session, project=project, user=user, project_role=ProjectRole.USER + ) + + response = await client.post( + f"/api/project/{project.name}/instances/get_instance_health_checks", + headers=get_auth_headers(user.token), + json={ + "fleet_name": "test", + "instance_num": 0, + }, + ) + assert response.status_code == 400 + + async def test_returns_health_checks(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + ) + await create_instance_health_check( + session=session, + instance=instance, + collected_at=dt.datetime(2025, 1, 1, 12, 0, tzinfo=dt.timezone.utc), + response="{}", + ) + health_response_with_dcgm = """ + { + "dcgm": { + "overall_health": 20, + "incidents": [{ + "system": 16, + "health": 20, + "error_message": "Detected 333 volatile double-bit ECC error(s) in GPU 0.", + "error_code": 4, + "entity_group_id": 1, + "entity_id": 0 + }] + } + } + """ + await create_instance_health_check( + session=session, + instance=instance, + collected_at=dt.datetime(2025, 1, 1, 12, 1, tzinfo=dt.timezone.utc), + response=health_response_with_dcgm, + ) + + response = await client.post( + 
f"/api/project/{project.name}/instances/get_instance_health_checks", + headers=get_auth_headers(user.token), + json={ + "fleet_name": fleet.name, + "instance_num": instance.instance_num, + }, + ) + + assert response.status_code == 200 + assert response.json() == { + "health_checks": [ + { + "collected_at": "2025-01-01T12:01:00+00:00", + "status": "failure", + "events": [ + { + "timestamp": "2025-01-01T12:01:00+00:00", + "status": "failure", + "message": "Detected 333 volatile double-bit ECC error(s) in GPU 0.", + } + ], + }, + {"collected_at": "2025-01-01T12:00:00+00:00", "status": "healthy", "events": []}, + ] + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db") +class TestCompatibility: + async def test_converts_legacy_termination_reason_string( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session) + project = await create_project(session, owner=user) + fleet = await create_fleet(session, project) + await create_instance(session=session, project=project, fleet=fleet) + await session.execute( + text("UPDATE instances SET termination_reason = 'Fleet has too many instances'") + ) + await session.commit() + resp = await client.post( + "/api/instances/list", headers=get_auth_headers(user.token), json={} + ) + # Must convert legacy "Fleet has too many instances" to "max_instances_limit" + assert resp.json()[0]["termination_reason"] == "max_instances_limit" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGetInstance: + async def test_returns_instance_by_id( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session, owner=user) + await add_project_member( + session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + fleet = await 
create_fleet(session, project) + instance = await create_instance(session=session, project=project, fleet=fleet) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(instance.id)}, + ) + assert resp.status_code == 200 + resp_data = resp.json() + assert resp_data["id"] == str(instance.id) + assert resp_data["project_name"] == project.name + assert resp_data["fleet_name"] == fleet.name + + async def test_returns_instance_to_global_admin( + self, session: AsyncSession, client: AsyncClient + ) -> None: + admin = await create_user(session, global_role=GlobalRole.ADMIN, name="global-admin") + project = await create_project(session) + fleet = await create_fleet(session, project) + instance = await create_instance(session=session, project=project, fleet=fleet) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(admin.token), + json={"id": str(instance.id)}, + ) + assert resp.status_code == 200 + resp_data = resp.json() + assert resp_data["id"] == str(instance.id) + + async def test_returns_400_if_instance_not_found( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session, owner=user) + await add_project_member( + session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(uuid.uuid4())}, + ) + assert resp.status_code == 400 + assert resp.json()["detail"][0]["code"] == "resource_not_exists" + + async def test_returns_400_if_instance_exists_in_different_project( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, global_role=GlobalRole.USER) + + project1 = await create_project(session, owner=user, name="p1") + project2 = await 
create_project(session, owner=user, name="p2") + + await add_project_member( + session, project=project1, user=user, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session, project=project2, user=user, project_role=ProjectRole.ADMIN + ) + + fleet = await create_fleet(session, project2) + instance = await create_instance(session=session, project=project2, fleet=fleet) + + resp = await client.post( + f"/api/project/{project1.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(instance.id)}, + ) + assert resp.status_code == 400 + assert resp.json()["detail"][0]["code"] == "resource_not_exists" + + async def test_returns_403_if_not_project_member( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, name="non_member", global_role=GlobalRole.USER) + project = await create_project(session) + fleet = await create_fleet(session, project) + instance = await create_instance(session=session, project=project, fleet=fleet) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(instance.id)}, + ) + assert resp.status_code == 403 + + async def test_returns_403_if_not_project_member_and_instance_not_exists( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, name="non_member", global_role=GlobalRole.USER) + project = await create_project(session) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(uuid.uuid4())}, + ) + assert resp.status_code == 403 + + async def test_returns_imported_instance( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + 
session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + "/api/project/exporter-project/instances/get", + headers=get_auth_headers(importer_user.token), + json={"id": str(instance.id)}, + ) + assert response.status_code == 200 + response_json = response.json() + assert response_json["id"] == str(instance.id) + assert response_json["project_name"] == "exporter-project" + assert response_json["fleet_name"] == "exported-fleet" + + async def test_returns_403_on_foreign_instance_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=importer_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await 
create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + "/api/project/exporter-project/instances/get", + headers=get_auth_headers(not_importer_user.token), + json={"id": str(instance.id)}, + ) + assert response.status_code == 403 diff --git a/src/tests/_internal/server/routers/test_logs.py b/src/tests/_internal/server/routers/test_logs.py index 1804799b5f..33d904d56a 100644 --- a/src/tests/_internal/server/routers/test_logs.py +++ b/src/tests/_internal/server/routers/test_logs.py @@ -1,39 +1,39 @@ -from pathlib import Path -from unittest.mock import patch - import pytest -from fastapi.testclient import TestClient +from httpx import AsyncClient from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server import settings -from dstack._internal.server.main import app +from dstack._internal.server.services.logs.filelog import FileLogStorage from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import create_project, create_user, get_auth_headers -client = TestClient(app) - class TestPollLogs: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/logs/poll", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - 
async def test_returns_logs(self, test_db, session: AsyncSession, tmp_path: Path): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_logs( + self, test_db, test_log_storage: FileLogStorage, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) runner_log_path = ( - tmp_path + test_log_storage.root / "projects" / project.name / "logs" @@ -47,54 +47,56 @@ async def test_returns_logs(self, test_db, session: AsyncSession, tmp_path: Path '{"timestamp": "2023-10-06T10:01:53.234235+00:00", "log_source": "stdout", "message": "World"}\n' '{"timestamp": "2023-10-06T10:01:53.234236+00:00", "log_source": "stdout", "message": "!"}\n' ) - with patch.object(settings, "SERVER_DIR_PATH", tmp_path): - response = client.post( - f"/api/project/{project.name}/logs/poll", - headers=get_auth_headers(user.token), - json={ - "run_name": "test_run", - "job_submission_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", - "diagnose": True, - }, - ) + response = await client.post( + f"/api/project/{project.name}/logs/poll", + headers=get_auth_headers(user.token), + json={ + "run_name": "test_run", + "job_submission_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "diagnose": True, + }, + ) assert response.status_code == 200, response.json() assert response.json() == { "logs": [ { "timestamp": "2023-10-06T10:01:53.234234+00:00", "log_source": "stdout", - "message": "Hello", + "message": "SGVsbG8=", }, { "timestamp": "2023-10-06T10:01:53.234235+00:00", "log_source": "stdout", - "message": "World", + "message": "V29ybGQ=", }, { "timestamp": "2023-10-06T10:01:53.234236+00:00", "log_source": "stdout", - "message": "!", + "message": "IQ==", }, - ] + ], + "external_url": None, + "next_token": None, } - with patch.object(settings, 
"SERVER_DIR_PATH", tmp_path): - response = client.post( - f"/api/project/{project.name}/logs/poll", - headers=get_auth_headers(user.token), - json={ - "run_name": "test_run", - "job_submission_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", - "start_time": "2023-10-06T10:01:53.234235+00:00", - "diagnose": True, - }, - ) + response = await client.post( + f"/api/project/{project.name}/logs/poll", + headers=get_auth_headers(user.token), + json={ + "run_name": "test_run", + "job_submission_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "start_time": "2023-10-06T10:01:53.234235+00:00", + "diagnose": True, + }, + ) assert response.status_code == 200, response.json() assert response.json() == { "logs": [ { "timestamp": "2023-10-06T10:01:53.234236+00:00", "log_source": "stdout", - "message": "!", + "message": "IQ==", }, - ] + ], + "external_url": None, + "next_token": None, } diff --git a/src/tests/_internal/server/routers/test_metrics.py b/src/tests/_internal/server/routers/test_metrics.py new file mode 100644 index 0000000000..19e849f413 --- /dev/null +++ b/src/tests/_internal/server/routers/test_metrics.py @@ -0,0 +1,198 @@ +from datetime import datetime, timezone + +import pytest +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_job, + create_job_metrics_point, + create_project, + create_repo, + create_run, + create_user, + get_auth_headers, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_job_runtime_data, +) + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +class TestGetJobMetrics: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + 
): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.get( + f"/api/project/{project.name}/metrics/job/test", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_metrics(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + jpd = get_job_provisioning_data( + cpu_count=128, memory_gib=256, gpu_count=2, gpu_memory_gib=32 + ) + offer = get_instance_offer_with_availability( + cpu_count=64, memory_gib=128, gpu_count=1, gpu_memory_gib=32 + ) + jrd = get_job_runtime_data(offer=offer) + job = await create_job( + session=session, + run=run, + job_provisioning_data=jpd, + job_runtime_data=jrd, + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc), + cpu_usage_micro=2 * 1_000_000, + memory_usage_bytes=256, + memory_working_set_bytes=128, + gpus_memory_usage_bytes=[256], + gpus_util_percent=[2], + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 15, tzinfo=timezone.utc), + cpu_usage_micro=4 * 1_000_000, + memory_usage_bytes=512, + memory_working_set_bytes=256, + gpus_memory_usage_bytes=[512], + gpus_util_percent=[6], + ) + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=datetime(2023, 1, 2, 3, 4, 25, tzinfo=timezone.utc), + 
cpu_usage_micro=10 * 1_000_000, + memory_usage_bytes=1024, + memory_working_set_bytes=512, + gpus_memory_usage_bytes=[1024], + gpus_util_percent=[10], + ) + response = await client.get( + f"/api/project/{project.name}/metrics/job/{run.run_name}", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + # Returns one last sample by default. Filtering is tested in services/test_metrics.py + assert response.json() == { + "metrics": [ + { + "name": "cpu_usage_percent", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [60], + }, + { + "name": "memory_usage_bytes", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [1024], + }, + { + "name": "memory_working_set_bytes", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [512], + }, + { + "name": "cpus_detected_num", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [64], + }, + { + "name": "memory_total_bytes", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [137438953472], + }, + { + "name": "gpus_detected_num", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [1], + }, + { + "name": "gpu_memory_total_bytes", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [34359738368], + }, + { + "name": "gpu_memory_usage_bytes_gpu0", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [1024], + }, + { + "name": "gpu_util_percent_gpu0", + "timestamps": ["2023-01-02T03:04:25+00:00"], + "values": [10], + }, + ] + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_ignores_deleted_runs(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + deleted_run = await 
create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + deleted=True, + ) + active_run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + ) + await create_job(session=session, run=deleted_run, job_num=0) + await create_job(session=session, run=deleted_run, job_num=1) + await create_job(session=session, run=active_run, job_num=0) + response_job_0 = await client.get( + f"/api/project/{project.name}/metrics/job/test-run", + params={"job_num": 0}, + headers=get_auth_headers(user.token), + ) + response_job_1 = await client.get( + f"/api/project/{project.name}/metrics/job/test-run", + params={"job_num": 1}, + headers=get_auth_headers(user.token), + ) + # Only deleted_run has job_num=1, but it's deleted + assert response_job_1.status_code == 400 + assert response_job_1.json()["detail"][0]["code"] == "resource_not_exists" + # job_num=0 is taken from active_run + assert response_job_0.status_code == 200 + assert response_job_0.json() == {"metrics": []} diff --git a/src/tests/_internal/server/routers/test_pools.py b/src/tests/_internal/server/routers/test_pools.py deleted file mode 100644 index a2a07a377c..0000000000 --- a/src/tests/_internal/server/routers/test_pools.py +++ /dev/null @@ -1,550 +0,0 @@ -import datetime as dt - -import pytest -from fastapi.testclient import TestClient -from freezegun import freeze_time -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from dstack._internal.core.models.instances import SSHKey -from dstack._internal.core.models.profiles import DEFAULT_POOL_NAME -from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app -from dstack._internal.server.models import PoolModel -from dstack._internal.server.schemas.pools import ( - CreatePoolRequest, - DeletePoolRequest, - RemoveInstanceRequest, - SetDefaultPoolRequest, - ShowPoolRequest, -) -from 
dstack._internal.server.schemas.runs import AddRemoteInstanceRequest -from dstack._internal.server.services.projects import add_project_member -from dstack._internal.server.testing.common import ( - create_instance, - create_pool, - create_project, - create_user, - get_auth_headers, -) - -client = TestClient(app) - -TEST_POOL_NAME = "test_router_pool_name" - - -class TestListPools: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/pool/list", - json={}, - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - @freeze_time(dt.datetime(2023, 10, 4, 12, 0, tzinfo=dt.timezone.utc)) - async def test_creates_and_lists_default_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) - response = client.post( - f"/api/project/{project.name}/pool/list", - headers=get_auth_headers(user.token), - json={}, - ) - assert response.status_code == 200 - result = response.json() - expected = [ - { - "name": "default-pool", - "default": True, - "created_at": "2023-10-04T12:00:00+00:00", - "total_instances": 0, - "available_instances": 0, - } - ] - assert result == expected - - -class TestDeletePool: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/pool/delete", - json=DeletePoolRequest(name=TEST_POOL_NAME, force=False).dict(), - ) - assert 
response.status_code == 403 - - @pytest.mark.asyncio - async def test_delete_last_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - response = client.post( - f"/api/project/{project.name}/pool/delete", - headers=get_auth_headers(user.token), - json=DeletePoolRequest(name=TEST_POOL_NAME, force=False).dict(), - ) - assert response.status_code == 200 - assert response.json() is None - - response = client.post( - f"/api/project/{project.name}/pool/list", - headers=get_auth_headers(user.token), - json={}, - ) - assert response.status_code == 200 - - result = response.json() - assert len(result) == 1 - - default_pool = result[0] - assert default_pool["name"] == DEFAULT_POOL_NAME - assert dt.datetime.fromisoformat(default_pool["created_at"]) > pool.created_at - - @pytest.mark.asyncio - async def test_deletes_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool1 = await create_pool(session, project, pool_name=f"{TEST_POOL_NAME}-left") - pool2 = await create_pool(session, project, pool_name=f"{TEST_POOL_NAME}-right") - response = client.post( - f"/api/project/{project.name}/pool/delete", - headers=get_auth_headers(user.token), - json=DeletePoolRequest(name=pool1.name, force=False).dict(), - ) - assert response.status_code == 200 - assert response.json() is None - res = await session.execute(select(PoolModel).where(PoolModel.deleted == False)) - pool = res.scalar_one() - assert pool.name == pool2.name - - @pytest.mark.asyncio - 
async def test_returns_400_if_pool_missing(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - f"/api/project/{project.name}/pool/delete", - headers=get_auth_headers(user.token), - json=DeletePoolRequest(name="missing name", force=False).dict(), - ) - assert response.status_code == 400 - - -class TestSetDefaultPool: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/pool/set_default", - json=SetDefaultPoolRequest(pool_name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_sets_default(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=f"{TEST_POOL_NAME}-right") - response = client.post( - f"/api/project/{project.name}/pool/set_default", - headers=get_auth_headers(user.token), - json=SetDefaultPoolRequest(pool_name=pool.name).dict(), - ) - assert response.status_code == 200 - await session.refresh(project) - assert project.default_pool_id == pool.id - - @pytest.mark.asyncio - async def test_returns_400_if_pool_missing(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - 
session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - f"/api/project/{project.name}/pool/set_default", - headers=get_auth_headers(user.token), - json=SetDefaultPoolRequest(pool_name="missing pool").dict(), - ) - assert response.status_code == 400 - - -class TestCreatePool: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/pool/create", - json=CreatePoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_create_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - f"/api/project/{project.name}/pool/create", - headers=get_auth_headers(user.token), - json=CreatePoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 200 - assert response.json() is None - res = await session.execute(select(PoolModel).where(PoolModel.deleted == False)) - res.scalar_one() - - @pytest.mark.asyncio - async def test_returns_400_on_duplicate_name(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - f"/api/project/{project.name}/pool/create", - headers=get_auth_headers(user.token), - json=CreatePoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 200 - assert 
response.json() is None - response = client.post( - f"/api/project/{project.name}/pool/create", - headers=get_auth_headers(user.token), - json=CreatePoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 400 - - -class TestShowPool: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/pool/show", - json=CreatePoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_show_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - instance = await create_instance( - session=session, - project=project, - pool=pool, - ) - response = client.post( - f"/api/project/{project.name}/pool/show", - headers=get_auth_headers(user.token), - json=ShowPoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 200 - assert response.json() == { - "name": "test_router_pool_name", - "instances": [ - { - "backend": "datacrunch", - "instance_type": { - "name": "instance", - "resources": { - "cpus": 1, - "memory_mib": 512, - "gpus": [], - "spot": False, - "disk": {"size_mib": 102400}, - "description": "", - }, - }, - "id": str(instance.id), - "project_name": project.name, - "name": "test_instance", - "job_name": None, - "job_status": None, - "hostname": "running_instance.ip", - "status": "idle", - "unreachable": False, - "created": "2023-01-02T03:04:00+00:00", - "pool_name": "test_router_pool_name", - "region": "en", - "price": 1, - } - 
], - } - - @pytest.mark.asyncio - async def test_show_missing_pool(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - await create_instance( - session=session, - project=project, - pool=pool, - ) - response = client.post( - f"/api/project/{project.name}/pool/show", - headers=get_auth_headers(user.token), - json=ShowPoolRequest(name="missing_pool").dict(), - ) - assert response.status_code == 400 - assert response.json() == { - "detail": [{"msg": "Pool not found", "code": "resource_not_exists"}] - } - - -class TestAddRemote: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - remote = AddRemoteInstanceRequest( - instance_name="test_instance_name", - instance_network=None, - region="", - host="localhost", - port=22, - pool_name="pool_name", - ssh_user="user", - ssh_keys=[SSHKey(public="abc")], - ) - response = client.post( - f"/api/project/{project.name}/pool/add_remote", - json=remote.dict(), - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_add_remote(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - remote = AddRemoteInstanceRequest( - instance_name="test_instance_name", - instance_network=None, - region="", - host="localhost", - port=22, - pool_name="pool_name", - ssh_user="user", - 
ssh_keys=[SSHKey(public="abc")], - ) - response = client.post( - f"/api/project/{project.name}/pool/add_remote", - headers=get_auth_headers(user.token), - json=remote.dict(), - ) - assert response.status_code == 200 - - data = response.json() - assert data["status"] == "pending" - assert data["name"] == "test_instance_name" - - -class TestRemoveInstance: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - remote = AddRemoteInstanceRequest( - instance_name="test_instance_name", - instance_network=None, - region="", - host="localhost", - port=22, - pool_name="pool_name", - ssh_user="user", - ssh_keys=[SSHKey(public="abc")], - ) - response = client.post( - f"/api/project/{project.name}/pool/add_remote", - json=remote.dict(), - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_remove_instance(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - instance = await create_instance( - session=session, - project=project, - pool=pool, - ) - response = client.post( - f"/api/project/{project.name}/pool/remove", - headers=get_auth_headers(user.token), - json=RemoveInstanceRequest( - pool_name=TEST_POOL_NAME, - instance_name=instance.name, - ).dict(), - ) - assert response.status_code == 200 - assert response.json() is None - - response = client.post( - f"/api/project/{project.name}/pool/show", - headers=get_auth_headers(user.token), - json=ShowPoolRequest(name=TEST_POOL_NAME).dict(), - ) - assert response.status_code == 200 - assert response.json() == { - 
"name": "test_router_pool_name", - "instances": [ - { - "backend": "datacrunch", - "instance_type": { - "name": "instance", - "resources": { - "cpus": 1, - "memory_mib": 512, - "gpus": [], - "spot": False, - "disk": {"size_mib": 102400}, - "description": "", - }, - }, - "id": str(instance.id), - "project_name": project.name, - "name": "test_instance", - "job_name": None, - "job_status": None, - "hostname": "running_instance.ip", - "status": "terminating", - "unreachable": False, - "created": "2023-01-02T03:04:00+00:00", - "pool_name": "test_router_pool_name", - "region": "en", - "price": 1, - } - ], - } - - -class TestListInstances: - @pytest.mark.asyncio - async def test_returns_403_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post( - "/api/pools/list_instances", - json={}, - ) - assert response.status_code == 403 - - @pytest.mark.asyncio - async def test_lists_instances(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - instance1 = await create_instance( - session=session, - project=project, - pool=pool, - created_at=dt.datetime(2023, 10, 4, 12, 0, tzinfo=dt.timezone.utc), - ) - instance2 = await create_instance( - session=session, - project=project, - pool=pool, - created_at=dt.datetime(2023, 10, 5, 12, 0, tzinfo=dt.timezone.utc), - ) - response = client.post( - "/api/pools/list_instances", - headers=get_auth_headers(user.token), - json={}, - ) - assert response.status_code == 200 - response_json = response.json() - assert len(response_json) == 2 - assert response_json[0]["id"] == str(instance2.id) - assert response_json[1]["id"] == str(instance1.id) - - @pytest.mark.asyncio - async def test_lists_paginated_instances(self, 
test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - pool = await create_pool(session, project, pool_name=TEST_POOL_NAME) - instance1 = await create_instance( - session=session, - project=project, - pool=pool, - created_at=dt.datetime(2023, 10, 5, 12, 0, tzinfo=dt.timezone.utc), - ) - instance2 = await create_instance( - session=session, - project=project, - pool=pool, - created_at=dt.datetime(2023, 10, 3, 12, 0, tzinfo=dt.timezone.utc), - ) - instance3 = await create_instance( - session=session, - project=project, - pool=pool, - created_at=dt.datetime(2023, 10, 6, 12, 0, tzinfo=dt.timezone.utc), - ) - response = client.post( - "/api/pools/list_instances", - headers=get_auth_headers(user.token), - json={"limit": 2}, - ) - assert response.status_code == 200 - response_json = response.json() - assert len(response_json) == 2 - assert response_json[0]["id"] == str(instance3.id) - assert response_json[1]["id"] == str(instance1.id) - response = client.post( - "/api/pools/list_instances", - headers=get_auth_headers(user.token), - json={ - "prev_id": response_json[1]["id"], - "prev_created_at": response_json[1]["created"], - }, - ) - assert response.status_code == 200 - response_json = response.json() - assert len(response_json) == 1 - assert response_json[0]["id"] == str(instance2.id) diff --git a/src/tests/_internal/server/routers/test_projects.py b/src/tests/_internal/server/routers/test_projects.py index c00d54bd62..67afa77390 100644 --- a/src/tests/_internal/server/routers/test_projects.py +++ b/src/tests/_internal/server/routers/test_projects.py @@ -1,50 +1,62 @@ -import json +from datetime import datetime, timezone from unittest.mock import patch from uuid import UUID import pytest -from fastapi.testclient import TestClient +from freezegun 
import freeze_time +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal.core.models.fleets import FleetStatus +from dstack._internal.core.models.runs import RunStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app -from dstack._internal.server.models import MemberModel, ProjectModel +from dstack._internal.server.models import ExportModel, ImportModel, MemberModel, ProjectModel +from dstack._internal.server.services.permissions import DefaultPermissions from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( - create_backend, + create_export, + create_fleet, create_project, + create_repo, + create_run, create_user, + create_volume, + default_permissions_context, get_auth_headers, ) -client = TestClient(app) - class TestListProjects: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/projects/list") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/list") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_returns_empty_list(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_empty_list(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session) - response = client.post("/api/projects/list", headers=get_auth_headers(user.token)) + response = await client.post("/api/projects/list", headers=get_auth_headers(user.token)) assert response.status_code in [200] assert response.json() == [] @pytest.mark.asyncio - async def test_returns_projects(self, test_db, session: AsyncSession): - user = await create_user(session=session) - project = await create_project(session=session, 
owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_projects(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), ) - backend = await create_backend( + project = await create_project( session=session, - project_id=project.id, + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - response = client.post("/api/projects/list", headers=get_auth_headers(user.token)) + response = await client.post("/api/projects/list", headers=get_auth_headers(user.token)) assert response.status_code in [200] assert response.json() == [ { @@ -53,333 +65,2291 @@ async def test_returns_projects(self, test_db, session: AsyncSession): "owner": { "id": str(user.id), "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", "global_role": user.global_role, "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, }, - "backends": [ - { - "name": backend.type, - "config": { - "type": backend.type, - "regions": json.loads(backend.config)["regions"], - "vpc_name": None, - "vpc_ids": None, - "default_vpcs": None, - "public_ips": None, - }, - } - ], - "members": [ - { - "user": { - "id": str(user.id), - "username": user.name, - "global_role": user.global_role, - "email": None, - }, - "project_role": ProjectRole.ADMIN, - } - ], + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [], + "is_public": False, + "templates_repo": None, } ] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_public_projects_to_non_members( + self, 
test_db, session: AsyncSession, client: AsyncClient + ): + # Create project owner + owner = await create_user( + session=session, + name="owner", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) -class TestCreateProject: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/projects/create") - assert response.status_code in [401, 403] + # Create a different user who is not a member + non_member = await create_user( + session=session, + name="non_member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) - @pytest.mark.asyncio - async def test_creates_project(self, test_db, session: AsyncSession): - user = await create_user(session=session) - project_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - project_name = "test_project" - body = {"project_name": project_name} - with patch("uuid.uuid4") as m: - m.return_value = project_id - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200, response.json() - assert response.json() == { - "project_id": str(project_id), - "project_name": project_name, - "owner": { - "id": str(user.id), - "username": user.name, - "global_role": user.global_role, - "email": None, - }, - "backends": [], - "members": [ - { - "user": { - "id": str(user.id), - "username": user.name, - "global_role": user.global_role, - "email": None, - }, - "project_role": ProjectRole.ADMIN, - } - ], - } + # Create a public project + public_project = await create_project( + session=session, + owner=owner, + name="public_project", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + is_public=True, + ) - @pytest.mark.asyncio - async def test_return_400_if_project_name_is_taken(self, test_db, session: AsyncSession): - user = await create_user(session=session) - with patch("uuid.uuid4") as m: - m.return_value = 
UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json={"project_name": "TestProject"}, - ) - assert response.status_code == 200 - # Project name uniqueness check should be case insensitive - for project_name in ["testproject", "TestProject", "TESTPROJECT"]: - with patch("uuid.uuid4") as m: - m.return_value = UUID("2b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json={"project_name": project_name}, - ) - assert response.status_code == 400 - res = await session.execute( - select(ProjectModel).where( - ProjectModel.name.in_(["TestProject", "testproject", "TestProject", "TESTPROJECT"]) - ) + # Create a private project + private_project = await create_project( + session=session, + owner=owner, + name="private_project", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + is_public=False, ) - assert len(res.scalars().all()) == 1 + + # Add owner as admin to both projects + await add_project_member( + session=session, project=public_project, user=owner, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session=session, project=private_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # List projects as non-member - should only see public project + response = await client.post( + "/api/projects/list", headers=get_auth_headers(non_member.token) + ) + assert response.status_code == 200 + projects = response.json() + + # Should only see the public project + assert len(projects) == 1 + assert projects[0]["project_name"] == "public_project" + assert projects[0]["is_public"] is True @pytest.mark.asyncio - async def test_returns_400_if_user_project_quota_exceeded( - self, test_db, session: AsyncSession + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_member_sees_both_public_and_private_projects( + self, test_db, session: 
AsyncSession, client: AsyncClient ): - user = await create_user(session=session, name="owner", global_role=GlobalRole.USER) - for i in range(10): - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json={"project_name": f"project{i}"}, - ) - assert response.status_code == 200, response.json() - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json={"project_name": "project11"}, + # Create project owner + owner = await create_user( + session=session, + name="owner", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, ) - assert response.status_code == 400 - assert response.json() == { - "detail": [{"code": "error", "msg": "User project quota exceeded"}] - } - @pytest.mark.asyncio - async def test_no_project_quota_for_global_admins(self, test_db, session: AsyncSession): - user = await create_user(session=session, name="owner", global_role=GlobalRole.ADMIN) - for i in range(12): - response = client.post( - "/api/projects/create", - headers=get_auth_headers(user.token), - json={"project_name": f"project{i}"}, - ) - assert response.status_code == 200, response.json() + # Create a user who will be a member + member = await create_user( + session=session, + name="member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + # Create a public project + public_project = await create_project( + session=session, + owner=owner, + name="public_project", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + is_public=True, + ) -class TestDeleteProject: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/projects/delete") - assert response.status_code in [401, 403] + # Create a private project + private_project = await create_project( + session=session, + owner=owner, + name="private_project", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + 
is_public=False, + ) - @pytest.mark.asyncio - async def test_cannot_delete_the_only_project(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) + # Add member to the private project only await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - "/api/projects/delete", - headers=get_auth_headers(user.token), - json={"projects_names": [project.name]}, + session=session, project=private_project, user=member, project_role=ProjectRole.USER ) - assert response.status_code == 400 - await session.refresh(project) - assert not project.deleted - @pytest.mark.asyncio - async def test_deletes_projects(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project1 = await create_project(session=session, owner=user, name="project1") + # Add owner as admin to both projects await add_project_member( - session=session, project=project1, user=user, project_role=ProjectRole.ADMIN + session=session, project=public_project, user=owner, project_role=ProjectRole.ADMIN ) - project2 = await create_project(session=session, owner=user, name="project2") await add_project_member( - session=session, project=project2, user=user, project_role=ProjectRole.ADMIN - ) - response = client.post( - "/api/projects/delete", - headers=get_auth_headers(user.token), - json={"projects_names": [project1.name]}, + session=session, project=private_project, user=owner, project_role=ProjectRole.ADMIN ) + + # List projects as member - should see both projects + response = await client.post("/api/projects/list", headers=get_auth_headers(member.token)) assert response.status_code == 200 - await session.refresh(project1) - await session.refresh(project2) - assert project1.deleted - assert not project2.deleted + projects = response.json() + + # Should see both 
projects, sorted by created_at + assert len(projects) == 2 + project_names = [p["project_name"] for p in projects] + assert "public_project" in project_names + assert "private_project" in project_names @pytest.mark.asyncio - async def test_returns_403_if_not_project_admin(self, test_db, session: AsyncSession): - owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) - user = await create_user(session=session, global_role=GlobalRole.USER) - project1 = await create_project(session=session, name="project1", owner=owner) - project2 = await create_project(session=session, name="project2", owner=owner) - await add_project_member( - session=session, project=project1, user=user, project_role=ProjectRole.ADMIN + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_paginated_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, ) - await add_project_member( - session=session, project=project2, user=user, project_role=ProjectRole.USER + project1 = await create_project( + session=session, + name="project1", + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), ) - response = client.post( - "/api/projects/delete", + project2 = await create_project( + session=session, + name="project2", + owner=user, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + project3 = await create_project( + session=session, + name="project3", + owner=user, + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/projects/list", headers=get_auth_headers(user.token), - json={"projects_names": [project1.name, project2.name]}, + json={"limit": 1}, ) - assert response.status_code == 403 - res = await session.execute(select(ProjectModel)) - assert len(res.all()) == 2 - - 
@pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, name="project") - response = client.post( - "/api/projects/delete", + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project3.id), + "project_name": project3.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:06:00+00:00", + "backends": [], + "members": [], + "is_public": False, + "templates_repo": None, + } + ] + response = await client.post( + "/api/projects/list", headers=get_auth_headers(user.token), - json={"projects_names": [project.name]}, + json={ + "prev_created_at": "2023-01-02T03:06:00+00:00", + "prev_id": str(project3.id), + "limit": 1, + }, ) - assert response.status_code == 403 - res = await session.execute(select(ProjectModel)) - assert len(res.all()) == 1 - - -class TestGetProject: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/projects/test_project/get") - assert response.status_code in [401, 403] - - @pytest.mark.asyncio - async def test_returns_404_if_project_does_not_exist(self, test_db, session: AsyncSession): - user = await create_user(session=session) - response = client.post( - "/api/projects/test_project/get", + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project2.id), + "project_name": project2.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": 
None, + }, + "created_at": "2023-01-02T03:05:00+00:00", + "backends": [], + "members": [], + "is_public": False, + "templates_repo": None, + } + ] + response = await client.post( + "/api/projects/list", headers=get_auth_headers(user.token), + json={ + "prev_created_at": "2023-01-02T03:05:00+00:00", + "prev_id": str(project2.id), + "limit": 1, + }, ) - assert response.status_code == 404, response.json() + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project1.id), + "project_name": project1.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [], + "is_public": False, + "templates_repo": None, + } + ] @pytest.mark.asyncio - async def test_returns_project(self, test_db, session: AsyncSession): - user = await create_user(session=session) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.ADMIN + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_total_count(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, ) - response = client.post( - "/api/projects/test_project/get", + await create_project( + session=session, + name="project1", + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + project3 = await create_project( + session=session, + name="project3", + owner=user, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/projects/list", 
headers=get_auth_headers(user.token), + json={"limit": 1, "return_total_count": True}, ) - assert response.status_code == 200, response.json() + assert response.status_code == 200 assert response.json() == { - "project_id": str(project.id), - "project_name": project.name, - "owner": { - "id": str(user.id), - "username": user.name, - "global_role": user.global_role, - "email": None, - }, - "backends": [], - "members": [ + "total_count": 2, + "projects": [ { - "user": { + "project_id": str(project3.id), + "project_name": project3.name, + "owner": { "id": str(user.id), "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", "global_role": user.global_role, "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, }, - "project_role": ProjectRole.ADMIN, + "created_at": "2023-01-02T03:05:00+00:00", + "backends": [], + "members": [], + "is_public": False, + "templates_repo": None, } ], } -class TestSetProjectMembers: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/projects/test_project/get") +class TestListOnlyNoFleets: + @pytest.mark.asyncio + async def test_list_only_no_fleets_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/list_only_no_fleets") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_sets_project_members(self, test_db, session: AsyncSession): - project = await create_project(session=session) - admin = await create_user(session=session) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_returns_projects_without_active_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with no fleets + project_no_fleets = await create_project( + session=session, + owner=user, + name="project_no_fleets", + 
created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) await add_project_member( - session=session, project=project, user=admin, project_role=ProjectRole.ADMIN + session=session, project=project_no_fleets, user=user, project_role=ProjectRole.ADMIN ) - user1 = await create_user(session=session, name="user1") - user2 = await create_user(session=session, name="user2") - members = [ + + # Create project with active fleet + project_with_active_fleet = await create_project( + session=session, + owner=user, + name="project_with_active_fleet", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_active_fleet, + user=user, + project_role=ProjectRole.ADMIN, + ) + await create_fleet( + session=session, + project=project_with_active_fleet, + deleted=False, + ) + + # Create project with deleted fleet (should be included) + project_with_deleted_fleet = await create_project( + session=session, + owner=user, + name="project_with_deleted_fleet", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_deleted_fleet, + user=user, + project_role=ProjectRole.ADMIN, + ) + deleted_fleet = await create_fleet( + session=session, + project=project_with_deleted_fleet, + deleted=True, + ) + deleted_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return projects without active fleets + assert len(projects) == 2 + project_names = {p["project_name"] for p in projects} + assert "project_no_fleets" in project_names + assert "project_with_deleted_fleet" in project_names + assert "project_with_active_fleet" not in project_names + + # Test with regular list endpoint (default) + 
response = await client.post( + "/api/projects/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should return all projects + assert len(projects) == 3 + project_names = {p["project_name"] for p in projects} + assert "project_no_fleets" in project_names + assert "project_with_active_fleet" in project_names + assert "project_with_deleted_fleet" in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_with_multiple_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test project with multiple fleets - some active, some deleted""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with both active and deleted fleets + project_mixed = await create_project( + session=session, + owner=user, + name="project_mixed", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_mixed, user=user, project_role=ProjectRole.ADMIN + ) + # Add active fleet - should exclude project + await create_fleet( + session=session, + project=project_mixed, + deleted=False, + ) + # Add deleted fleet - should not affect exclusion + deleted_fleet = await create_fleet( + session=session, + project=project_mixed, + deleted=True, + ) + deleted_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Project should NOT be included because it has an active fleet + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + project_names = {p["project_name"] for p in projects} + assert "project_mixed" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_empty_result( + 
self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test when all projects have active fleets""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create projects, all with active fleets + for i in range(3): + project = await create_project( + session=session, + owner=user, + name=f"project_{i}", + created_at=datetime(2023, 1, 2, 3, 4 + i, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_fleet( + session=session, + project=project, + deleted=False, + ) + + # Should return empty list + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + assert len(projects) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_not_includes_project_with_imported_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + exporter_project = await create_project( + session=session, owner=user, name="exporter_project" + ) + await add_project_member( + session=session, project=exporter_project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=exporter_project) + importer_project = await create_project( + session=session, owner=user, name="importer_project" + ) + await add_project_member( + session=session, project=importer_project, user=user, project_role=ProjectRole.USER + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + project_no_fleets = await create_project( + session=session, owner=user, name="project_no_fleets" + ) + await add_project_member( + session=session, project=project_no_fleets, 
user=user, project_role=ProjectRole.USER + ) + + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + assert len(projects) == 1 + assert projects[0]["project_name"] == "project_no_fleets" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_respects_user_permissions( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Create regular user (not admin) + user = await create_user(session=session, global_role=GlobalRole.USER) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create project where user is a member (no fleets) + project_member = await create_project( + session=session, + owner=owner, + name="project_member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_member, user=user, project_role=ProjectRole.USER + ) + await add_project_member( + session=session, project=project_member, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create public project where user is NOT a member (no fleets) + public_project = await create_project( + session=session, + owner=owner, + name="public_project", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, project=public_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create private project where user is NOT a member (should not see this) + private_project = await create_project( + session=session, + owner=owner, + name="private_project", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + is_public=False, + ) + await add_project_member( + session=session, project=private_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Test with 
list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return member projects without active fleets + # (public projects where user is not a member are no longer included) + assert len(projects) == 1 + project_names = {p["project_name"] for p in projects} + assert "project_member" in project_names + assert "public_project" not in project_names + assert "private_project" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_regular_user_filters_active_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that regular users correctly filter out projects with active fleets""" + # Create regular user (not admin) + user = await create_user(session=session, global_role=GlobalRole.USER) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create member project with no fleets (should be included) + project_member_no_fleet = await create_project( + session=session, + owner=owner, + name="project_member_no_fleet", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_member_no_fleet, + user=user, + project_role=ProjectRole.USER, + ) + + # Create member project with active fleet (should be excluded) + project_member_with_fleet = await create_project( + session=session, + owner=owner, + name="project_member_with_fleet", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_member_with_fleet, + user=user, + project_role=ProjectRole.USER, + ) + await create_fleet( + session=session, + project=project_member_with_fleet, + deleted=False, + ) + + # Create public 
project where user is a member with no fleets (should be included) + public_project_no_fleet = await create_project( + session=session, + owner=owner, + name="public_project_no_fleet", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, + project=public_project_no_fleet, + user=user, + project_role=ProjectRole.USER, + ) + + # Create public project where user is a member with active fleet (should be excluded) + public_project_with_fleet = await create_project( + session=session, + owner=owner, + name="public_project_with_fleet", + created_at=datetime(2023, 1, 2, 3, 7, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, + project=public_project_with_fleet, + user=user, + project_role=ProjectRole.USER, + ) + await create_fleet( + session=session, + project=public_project_with_fleet, + deleted=False, + ) + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return member projects without active fleets + assert len(projects) == 2 + project_names = {p["project_name"] for p in projects} + assert "project_member_no_fleet" in project_names + assert "public_project_no_fleet" in project_names + assert "project_member_with_fleet" not in project_names + assert "public_project_with_fleet" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_filters_active_fleets_correctly( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that projects with active fleets are correctly filtered out""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with active fleet + project_with_active = await create_project( + 
session=session, + owner=user, + name="project_with_active", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_with_active, user=user, project_role=ProjectRole.ADMIN + ) + active_fleet = await create_fleet( + session=session, + project=project_with_active, + deleted=False, + ) + active_fleet.status = FleetStatus.ACTIVE + await session.commit() + + # Create project with terminated but not deleted fleet (still active) + project_with_terminated = await create_project( + session=session, + owner=user, + name="project_with_terminated", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_terminated, + user=user, + project_role=ProjectRole.ADMIN, + ) + terminated_fleet = await create_fleet( + session=session, + project=project_with_terminated, + deleted=False, + ) + terminated_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Both should be excluded + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + project_names = {p["project_name"] for p in projects} + assert "project_with_active" not in project_names + assert "project_with_terminated" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_sorted_by_created_at( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that results are sorted by created_at""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create projects in reverse order + project_3 = await create_project( + session=session, + owner=user, + name="project_3", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_3, user=user, 
project_role=ProjectRole.ADMIN + ) + + project_1 = await create_project( + session=session, + owner=user, + name="project_1", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_1, user=user, project_role=ProjectRole.ADMIN + ) + + project_2 = await create_project( + session=session, + owner=user, + name="project_2", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_2, user=user, project_role=ProjectRole.ADMIN + ) + + # Results should be sorted by created_at ascending + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + assert len(projects) == 3 + assert projects[0]["project_name"] == "project_1" + assert projects[1]["project_name"] == "project_2" + assert projects[2]["project_name"] == "project_3" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_admin_requires_membership( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that admins also require membership (unified behavior)""" + # Create admin user + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create project where admin is a member (no fleets) - should be included + project_with_membership = await create_project( + session=session, + owner=owner, + name="project_with_membership", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_membership, + user=admin, + project_role=ProjectRole.ADMIN, + ) + + # Create project where admin is NOT a member (no fleets) - should NOT be included + 
project_without_membership = await create_project( + session=session, + owner=owner, + name="project_without_membership", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_without_membership, + user=owner, + project_role=ProjectRole.ADMIN, + ) + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(admin.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return project where admin is a member + assert len(projects) == 1 + project_names = {p["project_name"] for p in projects} + assert "project_with_membership" in project_names + assert "project_without_membership" not in project_names + + +class TestCreateProject: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/create") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_project(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session) + project_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + project_name = "test_project" + body = {"project_name": project_name} + with patch("uuid.uuid4") as m: + m.return_value = project_id + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "project_id": str(project_id), + "project_name": project_name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + 
"can_create_projects": True, + }, + "ssh_public_key": user.ssh_public_key, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [ + { + "user": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": user.ssh_public_key, + }, + "project_role": ProjectRole.ADMIN, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": True, + }, + } + ], + "is_public": False, + "templates_repo": None, + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_project_name_is_taken( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + with patch("uuid.uuid4") as m: + m.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": "TestProject"}, + ) + assert response.status_code == 200 + # Project name uniqueness check should be case insensitive + for project_name in ["testproject", "TestProject", "TESTPROJECT"]: + with patch("uuid.uuid4") as m: + m.return_value = UUID("2b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": project_name}, + ) + assert response.status_code == 400 + res = await session.execute( + select(ProjectModel).where( + ProjectModel.name.in_(["TestProject", "testproject", "TestProject", "TESTPROJECT"]) + ) + ) + assert len(res.scalars().all()) == 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_user_project_quota_exceeded( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = 
await create_user(session=session, name="owner", global_role=GlobalRole.USER) + for i in range(10): + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": f"project{i}"}, + ) + assert response.status_code == 200, response.json() + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": "project11"}, + ) + assert response.status_code == 400 + assert response.json() == { + "detail": [{"code": "error", "msg": "User project quota exceeded"}] + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_no_project_quota_for_global_admins( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, name="owner", global_role=GlobalRole.ADMIN) + for i in range(12): + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": f"project{i}"}, + ) + assert response.status_code == 200, response.json() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_forbids_if_no_permission_to_create_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + with default_permissions_context( + DefaultPermissions(allow_non_admins_create_projects=False) + ): + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": "new_project"}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_public_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await 
create_user(session=session) + project_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + project_name = "test_public_project" + body = {"project_name": project_name, "is_public": True} + with patch("uuid.uuid4") as m: + m.return_value = project_id + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + + # Check that the response includes is_public=True + response_data = response.json() + assert "is_public" in response_data + assert response_data["is_public"] is True + + # Verify the project was created as public in the database + res = await session.execute(select(ProjectModel).where(ProjectModel.name == project_name)) + project = res.scalar_one() + assert project.is_public is True + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_private_project_by_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + project_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + project_name = "test_private_project" + body = {"project_name": project_name} + + with patch("uuid.uuid4", return_value=project_id): + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + + # Check that the response includes is_public=False (default) + response_data = response.json() + assert "is_public" in response_data + assert response_data["is_public"] is False + + # Verify the project was created as private in the database + res = await session.execute(select(ProjectModel).where(ProjectModel.name == project_name)) + project = res.scalar_one() + assert project.is_public is False + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], 
indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_private_project_explicitly( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + project_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + project_name = "test_explicit_private_project" + body = {"project_name": project_name, "is_public": False} + + with patch("uuid.uuid4", return_value=project_id): + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + + # Check that the response includes is_public=False (explicit) + response_data = response.json() + assert "is_public" in response_data + assert response_data["is_public"] is False + + # Verify the project was created as private in the database + res = await session.execute(select(ProjectModel).where(ProjectModel.name == project_name)) + project = res.scalar_one() + assert project.is_public is False + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_new_project_imports_global_exports( + self, session: AsyncSession, client: AsyncClient + ): + exporter_project = await create_project(session=session, name="ExporterProject") + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + name="non-global", + is_global=False, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + name="global-export", + is_global=True, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + + response = await client.post( + "/api/projects/create", + headers=get_auth_headers(user.token), + json={"project_name": "new-project"}, + ) + assert response.status_code == 200 + + response = await client.post( + 
"/api/project/new-project/imports/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + imports = response.json() + assert len(imports) == 1 + assert imports[0]["export"]["name"] == "global-export" + + +class TestDeleteProject: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/delete") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_the_only_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 200 + await session.refresh(project) + assert project.deleted + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("project_name", ["project1", "a" * 50]) + async def test_deletes_projects( + self, test_db, session: AsyncSession, client: AsyncClient, project_name: str + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project1 = await create_project(session=session, owner=user, name=project_name) + await add_project_member( + session=session, project=project1, user=user, project_role=ProjectRole.ADMIN + ) + project2 = await create_project(session=session, owner=user, name="project2") + await add_project_member( + session=session, project=project2, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + "/api/projects/delete", + 
headers=get_auth_headers(user.token), + json={"projects_names": [project1.name]}, + ) + assert response.status_code == 200 + await session.refresh(project1) + await session.refresh(project2) + assert project1.deleted + assert not project2.deleted + # Validate an event is emitted + response = await client.post( + "/api/events/list", headers=get_auth_headers(user.token), json={} + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["message"] == "Project deleted" + assert len(response.json()[0]["targets"]) == 1 + assert response.json()[0]["targets"][0]["id"] == str(project1.id) + assert response.json()[0]["targets"][0]["name"] == project_name + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_project_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": ["random_project"]}, + ) + assert response.status_code == 400 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + user = await create_user(session=session, global_role=GlobalRole.USER) + project1 = await create_project(session=session, name="project1", owner=owner) + project2 = await create_project(session=session, name="project2", owner=owner) + await add_project_member( + session=session, project=project1, user=user, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session=session, project=project2, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + 
"/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project1.name, project2.name]}, + ) + assert response.status_code == 403 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, name="project") + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 403 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_errors_if_project_has_active_runs( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, name="project") + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.SUBMITTED, + ) + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 400 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 1 + run.status = RunStatus.TERMINATED + await session.commit() + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + 
) + assert response.status_code == 200 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_errors_if_project_has_active_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, name="project") + fleet = await create_fleet( + session=session, + project=project, + deleted=False, + ) + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 400 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 1 + fleet.status = FleetStatus.TERMINATED + fleet.deleted = True + await session.commit() + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 200 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_errors_if_project_has_active_volumes( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, name="project") + volume = await create_volume( + session=session, + project=project, + user=user, + ) + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 400 + res = await 
session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 1 + volume.deleted = True + await session.commit() + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 200 + res = await session.execute(select(ProjectModel).where(ProjectModel.deleted.is_(False))) + assert len(res.all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_export_models_on_project_delete( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + project = await create_project(session=session, owner=user) + fleet = await create_fleet(session=session, project=project, deleted=True) + await create_export( + session=session, + exporter_project=project, + importer_projects=[], + exported_fleets=[fleet], + ) + + res = await session.execute(select(ExportModel)) + assert len(res.scalars().all()) == 1 + + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [project.name]}, + ) + assert response.status_code == 200 + + res = await session.execute(select(ExportModel)) + assert len(res.scalars().all()) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_import_models_on_project_delete( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + exporter_project = await create_project(session=session, owner=user, name="exporter") + importer_project = await create_project(session=session, owner=user, name="importer") + fleet = await create_fleet(session=session, project=exporter_project, deleted=True) + await create_export( + session=session, + 
exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + + res = await session.execute(select(ImportModel)) + assert len(res.scalars().all()) == 1 + + response = await client.post( + "/api/projects/delete", + headers=get_auth_headers(user.token), + json={"projects_names": [importer_project.name]}, + ) + assert response.status_code == 200 + + res = await session.execute(select(ImportModel)) + assert len(res.scalars().all()) == 0 + + +class TestGetProject: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/test_project/get") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_404_if_project_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + response = await client.post( + "/api/projects/test_project/get", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 404, response.json() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_project(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + project = await create_project( + session=session, + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + "/api/projects/test_project/get", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "project_id": str(project.id), + "project_name": project.name, + "owner": { + "id": str(user.id), 
+ "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [ + { + "user": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "project_role": ProjectRole.ADMIN, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": True, + }, + } + ], + "is_public": False, + "templates_repo": None, + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_member_can_access_public_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Create project owner + owner = await create_user( + session=session, + name="owner", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make owner a regular user + ) + + # Create public project + project = await create_project( + session=session, + owner=owner, + name="public_project", + is_public=True, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create non-member user as regular user (not global admin) + non_member = await create_user( + session=session, + name="non_member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make non_member a regular user + ) + + # Non-member should be able to access public project details + response = await client.post( + f"/api/projects/{project.name}/get", + headers=get_auth_headers(non_member.token), + ) + assert response.status_code == 200, 
response.json() + + # Verify response includes is_public=True + response_data = response.json() + assert response_data["is_public"] is True + assert response_data["project_name"] == "public_project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_member_cannot_access_private_project( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Create project owner + owner = await create_user( + session=session, + name="owner", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make owner a regular user + ) + + # Create private project + project = await create_project( + session=session, + owner=owner, + name="private_project", + is_public=False, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create non-member user as regular user (not global admin) + non_member = await create_user( + session=session, + name="non_member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make non_member a regular user + ) + + # Non-member should NOT be able to access private project details + response = await client.post( + f"/api/projects/{project.name}/get", + headers=get_auth_headers(non_member.token), + ) + assert response.status_code == 403, response.json() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_member_can_access_both_public_and_private_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Create project owner + owner = await create_user( + session=session, + name="owner", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make owner a regular user + ) + + # Create public project + public_project = await create_project( + 
session=session, + owner=owner, + name="public_project", + is_public=True, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=public_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create private project + private_project = await create_project( + session=session, + owner=owner, + name="private_project", + is_public=False, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=private_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create member user as regular user (not global admin) and add to both projects + member = await create_user( + session=session, + name="member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, # Make member a regular user + ) + await add_project_member( + session=session, project=public_project, user=member, project_role=ProjectRole.USER + ) + await add_project_member( + session=session, project=private_project, user=member, project_role=ProjectRole.USER + ) + + # Member should be able to access both public and private projects + response = await client.post( + f"/api/projects/{public_project.name}/get", + headers=get_auth_headers(member.token), + ) + assert response.status_code == 200, response.json() + assert response.json()["is_public"] is True + + response = await client.post( + f"/api/projects/{private_project.name}/get", + headers=get_auth_headers(member.token), + ) + assert response.status_code == 200, response.json() + assert response.json()["is_public"] is False + + +class TestSetProjectMembers: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/test_project/get") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_sets_project_members(self, test_db, session: AsyncSession, client: AsyncClient): + project = await create_project( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + admin = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project, + user=admin, + project_role=ProjectRole.ADMIN, + ) + user1 = await create_user( + session=session, + name="user1", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + user2 = await create_user( + session=session, + name="user2", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + members = [ + { + "username": admin.name, + "project_role": ProjectRole.ADMIN, + }, + { + "username": user1.name, + "project_role": ProjectRole.ADMIN, + }, + { + "username": user2.name, + "project_role": ProjectRole.USER, + }, + ] + body = {"members": members} + response = await client.post( + f"/api/projects/{project.name}/set_members", + headers=get_auth_headers(admin.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json()["members"] == [ + { + "user": { + "id": str(admin.id), + "username": admin.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": admin.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": admin.ssh_public_key, + }, + "project_role": ProjectRole.ADMIN, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": True, + }, + }, + { + "user": { + "id": str(user1.id), + "username": user1.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user1.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": user1.ssh_public_key, + }, + "project_role": ProjectRole.ADMIN, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": 
True, + }, + }, + { + "user": { + "id": str(user2.id), + "username": user2.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user2.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": user2.ssh_public_key, + }, + "project_role": ProjectRole.USER, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": True, + }, + }, + ] + res = await session.execute(select(MemberModel)) + members = res.scalars().all() + assert len(members) == 3 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_sets_project_members_by_email( + self, test_db, session: AsyncSession, client: AsyncClient + ): + project = await create_project( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + admin = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + user1 = await create_user( + session=session, + name="user1", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + email="testemail@example.com", + ) + members = [ + { + "username": user1.email, + "project_role": ProjectRole.ADMIN, + }, + ] + body = {"members": members} + response = await client.post( + f"/api/projects/{project.name}/set_members", + headers=get_auth_headers(admin.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json()["members"] == [ + { + "user": { + "id": str(user1.id), + "username": user1.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": user1.global_role, + "email": user1.email, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": user1.ssh_public_key, + }, + "project_role": ProjectRole.ADMIN, + "permissions": { + "can_manage_ssh_fleets": True, + "can_manage_secrets": True, + }, + }, + ] + res = await 
session.execute(select(MemberModel)) + members = res.scalars().all() + assert len(members) == 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_manager_cannot_set_project_admins( + self, test_db, session: AsyncSession, client: AsyncClient + ): + project = await create_project(session=session) + user = await create_user(session=session, global_role=GlobalRole.USER) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.MANAGER, + ) + user1 = await create_user(session=session, name="user1") + members = [ { - "username": admin.name, + "username": user.name, "project_role": ProjectRole.ADMIN, }, { "username": user1.name, "project_role": ProjectRole.ADMIN, }, + ] + body = {"members": members} + response = await client.post( + f"/api/projects/{project.name}/set_members", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_global_admin_manager_can_set_project_admins( + self, test_db, session: AsyncSession, client: AsyncClient + ): + project = await create_project(session=session) + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.MANAGER, + ) + user1 = await create_user(session=session, name="user1") + members = [ + { + "username": user.name, + "project_role": ProjectRole.ADMIN, + }, { - "username": user2.name, - "project_role": ProjectRole.USER, + "username": user1.name, + "project_role": ProjectRole.ADMIN, }, ] body = {"members": members} - response = client.post( + response = await client.post( f"/api/projects/{project.name}/set_members", - headers=get_auth_headers(admin.token), + headers=get_auth_headers(user.token), json=body, ) - assert response.status_code == 
200, response.json() - assert response.json()["members"] == [ + assert response.status_code == 200 + res = await session.execute(select(MemberModel)) + members = res.scalars().all() + assert len(members) == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_cannot_set_same_user_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + project = await create_project(session=session) + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + user1 = await create_user(session=session, name="user1") + members = [ { - "user": { - "id": str(admin.id), - "username": admin.name, - "global_role": admin.global_role, - "email": None, - }, + "username": user1.name, "project_role": ProjectRole.ADMIN, }, { - "user": { - "id": str(user1.id), - "username": user1.name, - "global_role": user1.global_role, - "email": None, - }, + "username": user1.name, "project_role": ProjectRole.ADMIN, }, + ] + body = {"members": members} + response = await client.post( + f"/api/projects/{project.name}/set_members", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 400 + res = await session.execute(select(MemberModel)) + members = res.scalars().all() + assert len(members) == 0 + + +class TestAddProjectMembers: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_add_member_errors_on_nonexistent_user( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project and admin + project = await create_project( + session=session, created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + ) + admin = await create_user( + session=session, created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + ) + await add_project_member( + session=session, project=project, user=admin, project_role=ProjectRole.ADMIN + ) + + # Try to add non-existent user - should now error instead of silently 
skipping + body = {"members": [{"username": "nonexistent", "project_role": "user"}]} + response = await client.post( + f"/api/projects/{project.name}/add_members", + headers=get_auth_headers(admin.token), + json=body, + ) + + # Operation should fail with 400 error for non-existent user + assert response.status_code == 400 + response_json = response.json() + assert "User not found: nonexistent" in str(response_json) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_add_member_manager_cannot_add_admin_without_global_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project with manager (not global admin) + project = await create_project( + session=session, created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + ) + manager = await create_user( + session=session, + global_role=GlobalRole.USER, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + + # Create user to add + _new_user = await create_user( + session=session, + name="newuser", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + + # Try to add admin + body = {"members": [{"username": "newuser", "project_role": "admin"}]} + response = await client.post( + f"/api/projects/{project.name}/add_members", + headers=get_auth_headers(manager.token), + json=body, + ) + + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_cannot_add_same_user_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + project = await create_project(session=session) + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + user1 = await create_user(session=session, name="user1") + members = [ { - "user": { - "id": str(user2.id), - "username": user2.name, - 
"global_role": user2.global_role, - "email": None, - }, - "project_role": ProjectRole.USER, + "username": user1.name, + "project_role": ProjectRole.ADMIN, + }, + { + "username": user1.name, + "project_role": ProjectRole.ADMIN, }, ] + body = {"members": members} + response = await client.post( + f"/api/projects/{project.name}/add_members", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 400, response.json() res = await session.execute(select(MemberModel)) members = res.scalars().all() - assert len(members) == 3 + assert len(members) == 0 + + +class TestUpdateProjectVisibility: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/projects/test/update") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_404_if_project_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + response = await client.post( + "/api/projects/nonexistent/update", + headers=get_auth_headers(user.token), + json={"is_public": True}, + ) + assert response.status_code == 404 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_project_admin_can_update_visibility( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project with admin + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + # Admin should be able to make project public + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + 
json={"is_public": True}, + ) + assert response.status_code == 200 + assert response.json()["is_public"] == True + + # Admin should be able to make project private again + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"is_public": False}, + ) + assert response.status_code == 200 + assert response.json()["is_public"] == False + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_regular_user_cannot_update_visibility( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project with admin and regular user + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + regular_user = await create_user(session=session, name="user", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + await add_project_member( + session=session, project=project, user=regular_user, project_role=ProjectRole.USER + ) + + # Regular user should not be able to update visibility + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(regular_user.token), + json={"is_public": True}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_member_cannot_update_visibility( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project with admin and separate non-member user + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + non_member_user = await create_user( + session=session, name="nonmember", global_role=GlobalRole.USER + ) + project = await create_project(session=session, owner=admin_user, is_public=False) + 
await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + # Non-member should not be able to update visibility + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(non_member_user.token), + json={"is_public": True}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_global_admin_can_update_any_project_visibility( + self, test_db, session: AsyncSession, client: AsyncClient + ): + # Setup project with regular owner and global admin + project_owner = await create_user( + session=session, name="owner", global_role=GlobalRole.USER + ) + global_admin = await create_user( + session=session, name="admin", global_role=GlobalRole.ADMIN + ) + project = await create_project(session=session, owner=project_owner, is_public=False) + await add_project_member( + session=session, project=project, user=project_owner, project_role=ProjectRole.ADMIN + ) + + # Global admin should be able to update any project's visibility + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(global_admin.token), + json={"is_public": True}, + ) + assert response.status_code == 200 + assert response.json()["is_public"] == True + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_can_update_templates_repo( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + with patch( + "dstack._internal.server.services.projects.templates_service.validate_templates_repo_access" + ): + response 
= await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"templates_repo": "https://github.com/org/templates.git"}, + ) + assert response.status_code == 200 + assert response.json()["templates_repo"] == "https://github.com/org/templates.git" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_omitted_templates_repo_does_not_clear_existing_value( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project( + session=session, + owner=admin_user, + is_public=False, + templates_repo="https://github.com/org/templates.git", + ) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"is_public": True}, + ) + assert response.status_code == 200 + assert response.json()["templates_repo"] == "https://github.com/org/templates.git" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_can_reset_templates_repo_with_explicit_flag( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project( + session=session, + owner=admin_user, + is_public=False, + templates_repo="https://github.com/org/templates.git", + ) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + f"/api/projects/{project.name}/update", + 
headers=get_auth_headers(admin_user.token), + json={"reset_templates_repo": True}, + ) + assert response.status_code == 200 + assert response.json().get("templates_repo") is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_null_templates_repo_without_reset_does_not_clear_existing_value( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project( + session=session, + owner=admin_user, + is_public=False, + templates_repo="https://github.com/org/templates.git", + ) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"templates_repo": None}, + ) + assert response.status_code == 200 + assert response.json()["templates_repo"] == "https://github.com/org/templates.git" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_normalizes_empty_templates_repo_to_null( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"templates_repo": " "}, + ) + assert response.status_code == 200 + assert response.json().get("templates_repo") is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + 
async def test_trims_templates_repo_url( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + with patch( + "dstack._internal.server.services.projects.templates_service.validate_templates_repo_access" + ): + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"templates_repo": " https://github.com/org/templates.git "}, + ) + assert response.status_code == 200 + assert response.json()["templates_repo"] == "https://github.com/org/templates.git" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_rejects_unreachable_templates_repo( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin_user = await create_user(session=session, name="admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=admin_user, is_public=False) + await add_project_member( + session=session, project=project, user=admin_user, project_role=ProjectRole.ADMIN + ) + + with patch( + "dstack._internal.server.services.projects.templates_service.validate_templates_repo_access", + side_effect=ValueError( + "Cannot access templates repo: https://github.com/dstackai/dstack-sky-templates11" + ), + ): + response = await client.post( + f"/api/projects/{project.name}/update", + headers=get_auth_headers(admin_user.token), + json={"templates_repo": "https://github.com/dstackai/dstack-sky-templates11"}, + ) + + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "code": "error", + 
"msg": "Cannot access templates repo: https://github.com/dstackai/dstack-sky-templates11", + } + ] + } diff --git a/src/tests/_internal/server/routers/test_prometheus.py b/src/tests/_internal/server/routers/test_prometheus.py new file mode 100644 index 0000000000..f87f43a80f --- /dev/null +++ b/src/tests/_internal/server/routers/test_prometheus.py @@ -0,0 +1,445 @@ +from datetime import datetime, timedelta, timezone +from textwrap import dedent +from typing import Optional +from unittest.mock import patch + +import pytest +from freezegun import freeze_time +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import DevEnvironmentConfiguration +from dstack._internal.core.models.runs import ( + JobProvisioningData, + JobRuntimeData, + JobStatus, + RunStatus, +) +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_job, + create_job_metrics_point, + create_job_prometheus_metrics, + create_project, + create_repo, + create_run, + create_user, + get_auth_headers, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_job_runtime_data, + get_run_spec, +) + +BASE_HTTP_METRICS = b""" +# HELP python_gc_objects_collected_total Objects collected during gc +# TYPE python_gc_objects_collected_total counter +python_gc_objects_collected_total{generation="0"} 13159.0 +python_gc_objects_collected_total{generation="1"} 1583.0 +python_gc_objects_collected_total{generation="2"} 81.0 +# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC +# TYPE python_gc_objects_uncollectable_total counter 
+python_gc_objects_uncollectable_total{generation="0"} 0.0 +python_gc_objects_uncollectable_total{generation="1"} 0.0 +python_gc_objects_uncollectable_total{generation="2"} 0.0 +# HELP python_gc_collections_total Number of times this generation was collected +# TYPE python_gc_collections_total counter +python_gc_collections_total{generation="0"} 1609.0 +python_gc_collections_total{generation="1"} 146.0 +python_gc_collections_total{generation="2"} 9.0 +# HELP python_info Python platform information +# TYPE python_info gauge +python_info{implementation="CPython",major="3",minor="12",patchlevel="2",version="3.12.2"} 1.0 +# HELP dstack_server_requests_total Total number of HTTP requests +# TYPE dstack_server_requests_total counter +dstack_server_requests_total{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.0 +# HELP dstack_server_requests_created Total number of HTTP requests +# TYPE dstack_server_requests_created gauge +dstack_server_requests_created{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.67262864e+09 +# HELP dstack_server_request_duration_seconds HTTP request duration in seconds +# TYPE dstack_server_request_duration_seconds histogram +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.005",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.01",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.025",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.05",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.075",method="GET",project_name="None"} 1.0 
+dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.1",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.25",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.5",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.75",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="1.0",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="2.5",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="5.0",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="7.5",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="10.0",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="+Inf",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_count{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.0 +dstack_server_request_duration_seconds_sum{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 0.0 +# HELP dstack_server_request_duration_seconds_created HTTP request duration in seconds +# TYPE dstack_server_request_duration_seconds_created gauge +dstack_server_request_duration_seconds_created{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.67262864e+09 +""" + + +@pytest.fixture +def enable_metrics(monkeypatch: pytest.MonkeyPatch): + 
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", True) + monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", None) + + +FAKE_NOW = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + + +@freeze_time(FAKE_NOW) +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics") +class TestGetPrometheusMetrics: + @patch("prometheus_client.generate_latest", lambda: BASE_HTTP_METRICS) + async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER) + offer = get_instance_offer_with_availability( + instance_type="test-type", + cpu_count=32, + memory_gib=128, + gpu_count=2, + gpu_name="V4", + gpu_memory_gib=16, + price=12, + ) + project_2 = await _create_project(session, "project-2", user) + jpd_2_1 = get_job_provisioning_data( + backend=BackendType.AWS, + cpu_count=16, + memory_gib=64, + gpu_name="T4", + gpu_count=2, + price=16, + ) + job_2_1 = await _create_job( + session=session, + run_name="run-1", + project=project_2, + user=user, + status=JobStatus.RUNNING, + job_provisioning_data=jpd_2_1, + submitted_at=FAKE_NOW - timedelta(seconds=100), + ) + await create_job_prometheus_metrics( + session=session, + job=job_2_1, + text=dedent(""" + # HELP FIELD_1 Test field 1 + # TYPE FIELD_1 gauge + FIELD_1{gpu="0"} 100 + FIELD_1{gpu="1"} 200 + """), + ) + project_1 = await _create_project(session, "project-1", user) + # jrd.offer.instance.resources has higher priority than jpd.instance_type.resources, + # should be ignored + jpd_1_1 = get_job_provisioning_data(backend=BackendType.AWS, gpu_count=4, gpu_name="T4") + jrd_1_1 = get_job_runtime_data(offer=offer) + job_1_1 = await _create_job( + session=session, + run_name="run-1", + project=project_1, + user=user, + status=JobStatus.RUNNING, + 
job_provisioning_data=jpd_1_1, + job_runtime_data=jrd_1_1, + submitted_at=FAKE_NOW - timedelta(seconds=120), + ) + await create_job_prometheus_metrics( + session=session, + job=job_1_1, + text=dedent(""" + # Comments should be skipped + + # HELP FIELD_1 Test field 1 + # TYPE FIELD_1 gauge + FIELD_1{gpu="0"} 350 + FIELD_1{gpu="1"} 400 + + # HELP FIELD_2 Test field 2 + # TYPE FIELD_2 counter + FIELD_2{gpu="0"} 337325 1395066363000 + FIELD_2{gpu="1"} 987169 1395066363010 + """), + ) + await create_job_metrics_point( + session=session, + job_model=job_1_1, + timestamp=FAKE_NOW - timedelta(seconds=30), + cpu_usage_micro=3_500_000, + memory_working_set_bytes=3_221_225_472, + memory_usage_bytes=4_294_967_296, + gpus_util_percent=[80, 90], + gpus_memory_usage_bytes=[1_073_741_824, 2_147_483_648], + ) + # Older, ignored + await create_job_metrics_point( + session=session, + job_model=job_1_1, + timestamp=FAKE_NOW - timedelta(seconds=60), + cpu_usage_micro=2_000_000, + memory_working_set_bytes=1_073_741_824, + memory_usage_bytes=2_147_483_648, + ) + jpd_1_2 = get_job_provisioning_data( + backend=BackendType.AWS, + cpu_count=24, + memory_gib=224, + gpu_count=3, + gpu_name="L4", + price=12.5, + ) + job_1_2 = await _create_job( + session=session, + run_name="run-2", + project=project_1, + user=user, + status=JobStatus.RUNNING, + job_provisioning_data=jpd_1_2, + submitted_at=FAKE_NOW - timedelta(seconds=150), + ) + + await create_job_prometheus_metrics( + session=session, + job=job_1_2, + text=dedent(""" + # HELP FIELD_1 Test field 1 + # TYPE FIELD_1 gauge + FIELD_1{gpu="0"} 1200.0 + FIELD_1{gpu="1"} 1600.0 + FIELD_1{gpu="2"} 2400.0 + """), + ) + # Terminated job, should not appear in the response + job_1_3 = await _create_job(session, "run-3", project_1, user, JobStatus.TERMINATED) + await create_job_prometheus_metrics( + session=session, + job=job_1_3, + text=dedent(""" + # HELP FIELD_1 Test field 1 + # TYPE FIELD_1 gauge + FIELD_1{gpu="0"} 10 + FIELD_1{gpu="1"} 20 + """), + ) 
+ await _create_run(session, "done", project_1, user, RunStatus.DONE) + other_user = await create_user( + session=session, name="other-user", global_role=GlobalRole.USER + ) + await add_project_member( + session=session, project=project_2, user=other_user, project_role=ProjectRole.USER + ) + await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED) + await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED) + fleet = await create_fleet(session=session, project=project_1, name="test-fleet") + instance = await create_instance( + session=session, + project=project_1, + fleet=fleet, + backend=BackendType.AWS, + offer=offer, + price=14, + created_at=FAKE_NOW - timedelta(hours=1), + name="test-instance", + ) + + response = await client.get("/metrics") + + assert response.status_code == 200 + expected = ( + dedent(f"""\ + # HELP dstack_instance_duration_seconds_total Total seconds the instance is running + # TYPE dstack_instance_duration_seconds_total counter + dstack_instance_duration_seconds_total{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 3600.0 + # HELP dstack_instance_price_dollars_per_hour Instance price, USD/hour + # TYPE dstack_instance_price_dollars_per_hour gauge + dstack_instance_price_dollars_per_hour{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 14.0 + # HELP dstack_instance_gpu_count Instance GPU count + # TYPE dstack_instance_gpu_count gauge + 
dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0 + # HELP dstack_run_count_total Total runs count + # TYPE dstack_run_count_total counter + dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0 + dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0 + dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0 + # HELP dstack_run_count_terminated_total Terminated runs count + # TYPE dstack_run_count_terminated_total counter + dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0 + dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0 + dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0 + # HELP dstack_run_count_failed_total Failed runs count + # TYPE dstack_run_count_failed_total counter + dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0 + dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0 + dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0 + # HELP dstack_run_count_done_total Done runs count + # TYPE dstack_run_count_done_total counter + dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0 + dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0 + dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0 + # HELP dstack_job_duration_seconds_total Total seconds the job is running + # TYPE dstack_job_duration_seconds_total counter + 
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0 + dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0 + dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0 + # HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour + # TYPE dstack_job_price_dollars_per_hour gauge + dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0 + dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5 + 
dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0 + # HELP dstack_job_gpu_count Job GPU count + # TYPE dstack_job_gpu_count gauge + dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0 + dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0 + dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0 + # HELP dstack_job_cpu_count Job CPU count + # TYPE dstack_job_cpu_count gauge + dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0 + 
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0 + dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0 + # HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds + # TYPE dstack_job_cpu_time_seconds_total counter + dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5 + # HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes + # TYPE dstack_job_memory_total_bytes gauge + dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0 + dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0 + 
dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0 + # HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes + # TYPE dstack_job_memory_usage_bytes gauge + dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0 + # HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes + # TYPE dstack_job_memory_working_set_bytes gauge + dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0 + # HELP dstack_job_gpu_usage_ratio Job GPU usage, percent (as 0.0-1.0) + # TYPE dstack_job_gpu_usage_ratio gauge + dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 0.8 + 
dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 0.9 + # HELP dstack_job_gpu_memory_total_bytes Total GPU memory allocated for the job, bytes + # TYPE dstack_job_gpu_memory_total_bytes gauge + dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 17179869184.0 + dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 17179869184.0 + # HELP dstack_job_gpu_memory_usage_bytes GPU memory used by the job, bytes + # TYPE dstack_job_gpu_memory_usage_bytes gauge + dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 1073741824.0 + 
dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 2147483648.0 + # HELP FIELD_1 Test field 1 + # TYPE FIELD_1 gauge + FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0 + FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0 + FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0 + FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0 + FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0 + 
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0 + FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0 + # HELP FIELD_2 Test field 2 + # TYPE FIELD_2 counter + FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000 + FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010 + """) + + "\n" + + BASE_HTTP_METRICS.decode().strip() + ) + assert response.text.strip() == expected + + @patch("prometheus_client.generate_latest", lambda: BASE_HTTP_METRICS) + async def test_returns_empty_response_if_no_runs(self, client: AsyncClient): + response = await client.get("/metrics") + assert response.status_code == 200 + assert response.text.strip() == BASE_HTTP_METRICS.decode().strip() + + async def test_returns_404_if_not_enabled( + self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient + ): + monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False) + response = await client.get("/metrics") + 
assert response.status_code == 404 + + @pytest.mark.parametrize("token", [None, "foo"]) + async def test_returns_403_if_not_authenticated( + self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient, token: Optional[str] + ): + monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", b"secret") + if token is not None: + headers = get_auth_headers(token) + else: + headers = None + response = await client.get("/metrics", headers=headers) + assert response.status_code in [401, 403] + + async def test_returns_200_if_token_is_valid( + self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient + ): + monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", b"secret") + response = await client.get("/metrics", headers=get_auth_headers("secret")) + assert response.status_code == 200 + + +async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel: + project = await create_project(session=session, owner=user, name=name) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + return project + + +async def _create_run( + session: AsyncSession, + run_name: str, + project: ProjectModel, + user: UserModel, + status: RunStatus, + submitted_at: datetime = FAKE_NOW, +) -> RunModel: + repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo") + configuration = DevEnvironmentConfiguration(ide="vscode") + run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration) + return await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_name, + run_spec=run_spec, + status=status, + submitted_at=submitted_at, + ) + + +async def _create_job( + session: AsyncSession, + run_name: str, + project: ProjectModel, + user: UserModel, + status: JobStatus, + job_provisioning_data: Optional[JobProvisioningData] = None, + job_runtime_data: Optional[JobRuntimeData] = None, 
+ submitted_at: datetime = FAKE_NOW, +) -> JobModel: + run = await _create_run( + session=session, + run_name=run_name, + project=project, + user=user, + status=RunStatus.SUBMITTED, + submitted_at=submitted_at, + ) + job = await create_job( + session=session, + run=run, + status=status, + job_provisioning_data=job_provisioning_data, + job_runtime_data=job_runtime_data, + submitted_at=submitted_at, + ) + return job diff --git a/src/tests/_internal/server/routers/test_public_keys.py b/src/tests/_internal/server/routers/test_public_keys.py new file mode 100644 index 0000000000..80954e0262 --- /dev/null +++ b/src/tests/_internal/server/routers/test_public_keys.py @@ -0,0 +1,288 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import AsyncMock + +import pytest +from freezegun import freeze_time +from httpx import AsyncClient +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.models import UserPublicKeyModel +from dstack._internal.server.testing.common import ( + create_user, + create_user_public_key, + get_auth_headers, +) +from dstack._internal.server.testing.matchers import SomeUUID4Str + + +@pytest.mark.asyncio +class TestListUserPublicKeys: + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/public_keys/list") + assert response.status_code in [401, 403] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_lists_own_public_keys(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session) + key = await create_user_public_key( + session=session, + user=user, + name="my-key", + type="ssh-ed25519", + fingerprint="SHA256:testfingerprint", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/users/public_keys/list", + 
headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(key.id), + "added_at": "2023-01-02T03:04:00+00:00", + "name": "my-key", + "type": "ssh-ed25519", + "fingerprint": "SHA256:testfingerprint", + } + ] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_does_not_list_other_users_keys( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + other_user = await create_user(session=session, name="other_user") + await create_user_public_key(session=session, user=other_user) + response = await client.post( + "/api/users/public_keys/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_returns_keys_in_reverse_chronological_order( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + key1 = await create_user_public_key( + session=session, + user=user, + name="older-key", + fingerprint="SHA256:fingerprint1", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + key2 = await create_user_public_key( + session=session, + user=user, + name="newer-key", + fingerprint="SHA256:fingerprint2", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/users/public_keys/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + data = response.json() + assert len(data) == 2 + assert data[0]["id"] == str(key2.id) + assert data[1]["id"] == str(key1.id) + + +@pytest.mark.asyncio +class TestAddUserPublicKey: + PUBLIC_KEY_NO_COMMENT = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAA" + PUBLIC_KEY = f"{PUBLIC_KEY_NO_COMMENT} test@example.com" + + @pytest.fixture + def 
validate_openssh_public_key_mock(self, monkeypatch: pytest.MonkeyPatch) -> AsyncMock: + mock = AsyncMock() + monkeypatch.setattr( + "dstack._internal.server.services.public_keys.validate_openssh_public_key", mock + ) + return mock + + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/users/public_keys/add", + json={"key": self.PUBLIC_KEY}, + ) + assert response.status_code in [401, 403] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_adds_valid_public_key( + self, + session: AsyncSession, + client: AsyncClient, + validate_openssh_public_key_mock: AsyncMock, + ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": self.PUBLIC_KEY}, + ) + assert response.status_code == 200 + assert response.json() == { + "id": SomeUUID4Str(), + "type": "ssh-ed25519", + "name": "test@example.com", + "fingerprint": "SHA256:uALbfMqe7g4MMaRS5NMJen38dAEHwtxzR0iX0Ymuc80", + "added_at": "2023-01-02T03:04:00+00:00", + } + validate_openssh_public_key_mock.assert_awaited_once_with(self.PUBLIC_KEY) + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + @pytest.mark.usefixtures("validate_openssh_public_key_mock") + async def test_adds_key_with_custom_name(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": self.PUBLIC_KEY, "name": "my-laptop"}, + ) + assert response.status_code == 200 + assert response.json()["name"] == "my-laptop" + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + 
@pytest.mark.usefixtures("validate_openssh_public_key_mock") + async def test_uses_md5_as_name_when_no_comment_and_no_name( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": self.PUBLIC_KEY_NO_COMMENT}, + ) + assert response.status_code == 200 + assert response.json()["name"] == "744e414c6ac55e3f15c1dd48229cbe74" + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + @pytest.mark.parametrize( + "key", + [ + pytest.param("sha-rsa-invalid", id="only-one-field"), + pytest.param("ssh-rsa AAAAB3NzaC1kc3M=", id="dsa-declared-as-rsa"), + ], + ) + async def test_returns_400_for_invalid_key( + self, session: AsyncSession, client: AsyncClient, key: str + ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": key}, + ) + assert response.status_code == 400 + assert "Invalid public key" in response.json()["detail"][0]["msg"] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_returns_400_for_unsupported_key( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": "ssh-dss AAAAB3NzaC1kc3M="}, + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["msg"] == "Unsupported key type: ssh-dss" + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + @pytest.mark.usefixtures("validate_openssh_public_key_mock") + async def test_returns_400_resource_exists_for_duplicate_key( + self, session: AsyncSession, client: AsyncClient 
+ ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + json={"key": self.PUBLIC_KEY}, + ) + assert response.status_code == 200 + response = await client.post( + "/api/users/public_keys/add", + headers=get_auth_headers(user.token), + # The same key, the comment does not matter + json={"key": self.PUBLIC_KEY_NO_COMMENT}, + ) + assert response.status_code == 400 + assert response.json()["detail"][0]["code"] == "resource_exists" + + +@pytest.mark.asyncio +class TestDeleteUserPublicKeys: + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post( + "/api/users/public_keys/delete", + json={"ids": [str(uuid.uuid4())]}, + ) + assert response.status_code in [401, 403] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_deletes_public_key(self, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session) + key = await create_user_public_key(session=session, user=user) + other_key = await create_user_public_key( + session=session, user=user, fingerprint="SHA256:other" + ) + response = await client.post( + "/api/users/public_keys/delete", + headers=get_auth_headers(user.token), + json={"ids": [str(key.id)]}, + ) + assert response.status_code == 200 + res = await session.execute( + select(UserPublicKeyModel).where(UserPublicKeyModel.user_id == user.id) + ) + assert res.scalars().all() == [other_key] + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_silently_ignores_nonexistent_ids( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + response = await client.post( + "/api/users/public_keys/delete", + headers=get_auth_headers(user.token), + json={"ids": [str(uuid.uuid4())]}, + ) + assert 
response.status_code == 200 + + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db") + async def test_does_not_delete_other_users_keys( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session) + other_user = await create_user(session=session, name="other_user") + other_user_key = await create_user_public_key(session=session, user=other_user) + response = await client.post( + "/api/users/public_keys/delete", + headers=get_auth_headers(user.token), + json={"ids": [str(other_user_key.id)]}, + ) + assert response.status_code == 200 + res = await session.execute( + select(UserPublicKeyModel).where(UserPublicKeyModel.user_id == other_user.id) + ) + assert res.scalars().all() == [other_user_key] diff --git a/src/tests/_internal/server/routers/test_repos.py b/src/tests/_internal/server/routers/test_repos.py index cc8fd659fa..d85cd6635b 100644 --- a/src/tests/_internal/server/routers/test_repos.py +++ b/src/tests/_internal/server/routers/test_repos.py @@ -1,43 +1,48 @@ import json +from unittest.mock import AsyncMock, Mock import pytest -from fastapi.testclient import TestClient +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app -from dstack._internal.server.models import CodeModel, RepoModel +from dstack._internal.server.models import CodeModel, RepoCredsModel, RepoModel from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.services.storage import BaseStorage from dstack._internal.server.testing.common import ( + create_code, create_project, create_repo, + create_repo_creds, create_user, get_auth_headers, ) -client = TestClient(app) - class TestListRepos: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: 
AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/list", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_returns_empty_list(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_empty_list(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/list", headers=get_auth_headers(user.token), ) @@ -45,14 +50,15 @@ async def test_returns_empty_list(self, test_db, session: AsyncSession): assert response.json() == [] @pytest.mark.asyncio - async def test_returns_repos(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_repos(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) repo = await create_repo(session=session, project_id=project.id) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/list", headers=get_auth_headers(user.token), ) @@ -67,23 +73,29 
@@ async def test_returns_repos(self, test_db, session: AsyncSession): class TestGetRepo: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/get", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_returns_400_if_repo_does_not_exist(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_repo_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/get", headers=get_auth_headers(user.token), json={"repo_id": "some_repo", "include_creds": False}, @@ -91,14 +103,15 @@ async def test_returns_400_if_repo_does_not_exist(self, test_db, session: AsyncS assert response.status_code == 400 @pytest.mark.asyncio - async def test_returns_repo(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_repo(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) repo = await 
create_repo(session=session, project_id=project.id) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/get", headers=get_auth_headers(user.token), json={"repo_id": repo.name, "include_creds": False}, @@ -111,14 +124,58 @@ async def test_returns_repo(self, test_db, session: AsyncSession): } @pytest.mark.asyncio - async def test_returns_repo_with_creds(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_repo_with_legacy_creds( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - repo = await create_repo(session=session, project_id=project.id) + legacy_creds = { + "clone_url": "https://github.com/dstackai/dstack.git", + "private_key": None, + "oauth_token": "test_token", + } + repo = await create_repo(session=session, project_id=project.id, creds=legacy_creds) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/repos/get", + headers=get_auth_headers(user.token), + json={"repo_id": repo.name, "include_creds": True}, + ) + assert response.status_code == 200, response.json() + assert response.json() == { + "repo_id": repo.name, + "repo_info": json.loads(repo.info), + "repo_creds": legacy_creds, + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_repo_with_user_creds( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + 
legacy_creds = { + "clone_url": "https://github.com/dstackai/dstack.git", + "private_key": None, + "oauth_token": "legacy_creds", + } + repo = await create_repo(session=session, project_id=project.id, creds=legacy_creds) + user_creds = { + "clone_url": "https://github.com/dstackai/dstack.git", + "private_key": None, + "oauth_token": "user_creds", + } + await create_repo_creds( + session=session, repo_id=repo.id, user_id=user.id, creds=user_creds + ) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/get", headers=get_auth_headers(user.token), json={"repo_id": repo.name, "include_creds": True}, @@ -127,23 +184,27 @@ async def test_returns_repo_with_creds(self, test_db, session: AsyncSession): assert response.json() == { "repo_id": repo.name, "repo_info": json.loads(repo.info), - "repo_creds": json.loads(repo.creds), + "repo_creds": user_creds, } class TestInitRepo: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/init", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_creates_remote_repo(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_remote_repo(self, test_db, session: AsyncSession, client: AsyncClient): user = await 
create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -153,31 +214,33 @@ async def test_creates_remote_repo(self, test_db, session: AsyncSession): "repo_id": "test_repo", "repo_info": { "repo_type": "remote", - "repo_host_name": "github.com", - "repo_port": None, - "repo_user_name": "dstackai", "repo_name": "dstack", }, "repo_creds": { - "protocol": "https", + "clone_url": "https://github.com/dstackai/dstack.git", "private_key": None, "oauth_token": "test_token", }, } - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/init", headers=get_auth_headers(user.token), json=body, ) assert response.status_code == 200, response.json() res = await session.execute(select(RepoModel)) - repo = res.scalar() + repo = res.scalar_one() assert repo.name == body["repo_id"] assert json.loads(repo.info) == body["repo_info"] - assert json.loads(repo.creds) == body["repo_creds"] + assert repo.creds is None + res = await session.execute(select(RepoCredsModel)) + repo_creds = res.scalar_one() + assert repo_creds.creds.plaintext is not None + assert json.loads(repo_creds.creds.plaintext) == body["repo_creds"] @pytest.mark.asyncio - async def test_updates_remote_repo(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_remote_repo(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -187,18 +250,15 @@ async def test_updates_remote_repo(self, test_db, session: AsyncSession): "repo_id": "test_repo", "repo_info": { "repo_type": "remote", - "repo_host_name": "github.com", - "repo_port": None, - "repo_user_name": "dstackai", "repo_name": "dstack", }, "repo_creds": { - "protocol": 
"https", + "clone_url": "https://github.com/dstackai/dstack.git", "private_key": None, "oauth_token": "test_token", }, } - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/init", headers=get_auth_headers(user.token), json=body1, @@ -208,47 +268,52 @@ async def test_updates_remote_repo(self, test_db, session: AsyncSession): "repo_id": "test_repo", "repo_info": { "repo_type": "remote", - "repo_host_name": "github.com", - "repo_port": None, - "repo_user_name": "dstackai", "repo_name": "dstack", }, "repo_creds": { - "protocol": "https", + "clone_url": "https://github.com/dstackai/dstack.git", "private_key": None, "oauth_token": "test_token_updated", }, } - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/init", headers=get_auth_headers(user.token), json=body2, ) res = await session.execute(select(RepoModel)) repo = res.scalar_one() - assert json.loads(repo.creds) == body2["repo_creds"] + assert repo.creds is None + res = await session.execute(select(RepoCredsModel)) + repo_creds = res.scalar_one() + assert repo_creds.creds.plaintext is not None + assert json.loads(repo_creds.creds.plaintext) == body2["repo_creds"] class TestDeleteRepos: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/delete", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_deletes_repos(self, test_db, session: AsyncSession): + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_repos(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) repo = await create_repo(session=session, project_id=project.id) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/delete", headers=get_auth_headers(user.token), json={"repos_ids": [repo.name]}, @@ -259,20 +324,38 @@ async def test_deletes_repos(self, test_db, session: AsyncSession): assert repo is None +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db") class TestUploadCode: - @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.fixture + def default_storage_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + storage_mock = Mock(spec_set=BaseStorage) + monkeypatch.setattr( + "dstack._internal.server.services.repos.get_default_storage", lambda: storage_mock + ) + return storage_mock + + @pytest.fixture + def no_default_storage(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr( + "dstack._internal.server.services.repos.get_default_storage", lambda: None + ) + + async def test_returns_403_if_not_project_member( + self, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/upload_code", headers=get_auth_headers(user.token), params={"repo_id": "test_repo"}, ) assert response.status_code == 403 - @pytest.mark.asyncio - async def test_uploads_code(self, 
test_db, session: AsyncSession): + @pytest.mark.usefixtures("no_default_storage") + async def test_uploads_code_to_db(self, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -280,7 +363,7 @@ async def test_uploads_code(self, test_db, session: AsyncSession): ) repo = await create_repo(session=session, project_id=project.id) file = ("blob_hash", b"blob_content") - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/upload_code", headers=get_auth_headers(user.token), params={"repo_id": repo.name}, @@ -288,12 +371,39 @@ async def test_uploads_code(self, test_db, session: AsyncSession): ) assert response.status_code == 200, response.json() res = await session.execute(select(CodeModel)) - code = res.scalar() + code = res.scalar_one() assert code.blob_hash == file[0] assert code.blob == file[1] - @pytest.mark.asyncio - async def test_uploads_same_code_for_different_repos(self, test_db, session: AsyncSession): + async def test_uploads_code_to_storage( + self, session: AsyncSession, client: AsyncClient, default_storage_mock: Mock + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + file = ("blob_hash", b"blob_content") + response = await client.post( + f"/api/project/{project.name}/repos/upload_code", + headers=get_auth_headers(user.token), + params={"repo_id": repo.name}, + files={"file": file}, + ) + assert response.status_code == 200, response.json() + res = await session.execute(select(CodeModel)) + code = res.scalar_one() + assert code.blob_hash == file[0] + assert code.blob is None + 
default_storage_mock.upload_code.assert_called_once_with( + project.name, repo.name, file[0], file[1] + ) + + @pytest.mark.usefixtures("no_default_storage") + async def test_uploads_same_code_for_different_repos( + self, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -302,14 +412,14 @@ async def test_uploads_same_code_for_different_repos(self, test_db, session: Asy repo1 = await create_repo(session=session, repo_name="repo1", project_id=project.id) repo2 = await create_repo(session=session, repo_name="repo2", project_id=project.id) file = ("blob_hash", b"blob_content") - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/upload_code", headers=get_auth_headers(user.token), params={"repo_id": repo1.name}, files={"file": file}, ) assert response.status_code == 200, response.json() - response = client.post( + response = await client.post( f"/api/project/{project.name}/repos/upload_code", headers=get_auth_headers(user.token), params={"repo_id": repo2.name}, @@ -319,3 +429,36 @@ async def test_uploads_same_code_for_different_repos(self, test_db, session: Asy res = await session.execute(select(CodeModel)) codes = res.scalars().all() assert len(codes) == 2 + + async def test_handles_race_condition( + self, + monkeypatch: pytest.MonkeyPatch, + session: AsyncSession, + client: AsyncClient, + default_storage_mock: Mock, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + file = ("blob_hash", b"blob_content") + code = await create_code(session=session, repo=repo, blob_hash=file[0], blob=file[1]) + monkeypatch.setattr( 
+ "dstack._internal.server.services.repos.get_code_model", AsyncMock(return_value=None) + ) + response = await client.post( + f"/api/project/{project.name}/repos/upload_code", + headers=get_auth_headers(user.token), + params={"repo_id": repo.name}, + files={"file": file}, + ) + assert response.status_code == 200, response.json() + res = await session.execute(select(CodeModel)) + code = res.scalar_one() + assert code.blob_hash == file[0] + assert code.blob == file[1] + default_storage_mock.upload_code.assert_called_once_with( + project.name, repo.name, file[0], file[1] + ) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 8d2e353a00..456873f1e0 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -1,50 +1,91 @@ +import copy import json from datetime import datetime, timezone -from typing import Dict, List, Optional -from unittest.mock import Mock, patch +from typing import Dict, List, Optional, Tuple, Union +from unittest.mock import AsyncMock, Mock, patch from uuid import UUID import pytest -from fastapi.testclient import TestClient +from freezegun import freeze_time +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal.core.errors import GatewayError from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import ApplyAction, EntityReference +from dstack._internal.core.models.configurations import ( + AnyRunConfiguration, + DevEnvironmentConfiguration, + ReplicaGroup, + ScalingSpec, + ServiceConfiguration, + TaskConfiguration, + parse_run_configuration, +) +from dstack._internal.core.models.fleets import FleetNodesSpec, InstanceGroupPlacement +from dstack._internal.core.models.gateways import GatewayStatus from dstack._internal.core.models.instances import ( + Gpu, InstanceAvailability, 
InstanceOfferWithAvailability, + InstanceStatus, InstanceType, Resources, ) -from dstack._internal.core.models.profiles import DEFAULT_POOL_NAME, Profile -from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.profiles import Profile, Schedule +from dstack._internal.core.models.resources import GPUSpec, Range, ResourcesSpec from dstack._internal.core.models.runs import ( - JobProvisioningData, + ApplyRunPlanInput, JobSpec, JobStatus, - JobTerminationReason, Requirements, + Run, RunSpec, RunStatus, RunTerminationReason, ) from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.background.tasks.process_instances import process_instances -from dstack._internal.server.main import app +from dstack._internal.core.models.volumes import InstanceMountPoint, MountPoint from dstack._internal.server.models import JobModel, RunModel -from dstack._internal.server.schemas.runs import CreateInstanceRequest +from dstack._internal.server.schemas.runs import ApplyRunPlanRequest from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) +from dstack._internal.server.services.runs import run_model_to_run +from dstack._internal.server.services.runs.spec import validate_run_spec_and_set_defaults from dstack._internal.server.testing.common import ( + create_backend, + create_export, + create_fleet, + create_gateway, + create_gateway_compute, + create_instance, create_job, create_project, create_repo, create_run, create_user, get_auth_headers, + get_fleet_configuration, + get_fleet_spec, + get_instance_offer_with_availability, get_job_provisioning_data, + get_job_runtime_data, + get_run_spec, + get_ssh_fleet_configuration, + list_events, ) +from dstack._internal.server.testing.matchers import SomeUUID4Str + +pytestmark = pytest.mark.usefixtures("image_config_mock", 
"disable_sshproxy") + -client = TestClient(app) +@pytest.fixture +def disable_sshproxy(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr("dstack._internal.server.settings.SSHPROXY_ENABLED", False) def get_dev_env_run_plan_dict( @@ -55,101 +96,196 @@ def get_dev_env_run_plan_dict( offers: List[InstanceOfferWithAvailability] = [], total_offers: int = 0, max_price: Optional[float] = None, + action: ApplyAction = ApplyAction.CREATE, + current_resource: Optional[Run] = None, + privileged: bool = False, + docker: bool = False, + volumes: List[MountPoint] = [], ) -> Dict: + # When docker=True, commands should start with start-dockerd + if docker: + commands = [ + "/bin/bash", + "-i", + "-c", + ( + "start-dockerd" + " && (echo 'uv pip install ipykernel...'" + " && uv pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || (echo 'pip install ipykernel...'" + " && pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || echo 'no uv or pip found, ipykernel was not installed'" + " && echo" + " && echo 'To open in VS Code, use link below:'" + " && echo" + ' && echo " vscode://vscode-remote/ssh-remote+dry-run$DSTACK_WORKING_DIR"' + " && echo" + " && echo 'To connect via SSH, use: `ssh dry-run`'" + " && echo" + " && echo -n 'To exit, press Ctrl+C.'" + " && tail -f /dev/null" + ), + ] + image_name = "dstackai/dind" + else: + commands = [ + "/bin/bash", + "-i", + "-c", + ( + "eval $(echo 'export DSTACK_VENV_DIR=/dstack/venv' | sudo tee -a /dstack/profile)" + " && sudo rm -rf $DSTACK_VENV_DIR" + " && sudo mkdir $DSTACK_VENV_DIR" + " && sudo chown $(id -u):$(id -g) $DSTACK_VENV_DIR" + " && uv venv -q --prompt dstack -p 3.13 --seed $DSTACK_VENV_DIR" + " && eval $(echo '. 
$DSTACK_VENV_DIR/bin/activate' | sudo tee -a /dstack/profile)" + " && (echo 'uv pip install ipykernel...'" + " && uv pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || (echo 'pip install ipykernel...'" + " && pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || echo 'no uv or pip found, ipykernel was not installed'" + " && echo" + " && echo 'To open in VS Code, use link below:'" + " && echo" + ' && echo " vscode://vscode-remote/ssh-remote+dry-run$DSTACK_WORKING_DIR"' + " && echo" + " && echo 'To connect via SSH, use: `ssh dry-run`'" + " && echo" + " && echo -n 'To exit, press Ctrl+C.'" + " && tail -f /dev/null" + ), + ] + image_name = "dstackai/base:0.14-base-ubuntu24.04" + + run_spec = { + "configuration": { + "entrypoint": None, + "env": {}, + "working_dir": None, + "home_dir": "/root", + "ide": "vscode", + "inactivity_duration": None, + "version": None, + "image": None, + "user": None, + "docker": docker, + "shell": None, + "privileged": privileged, + "init": [], + "ports": [], + "python": "3.13" if not docker else None, + "nvcc": None, + "registry_auth": None, + "setup": [], + "type": "dev-environment", + "name": None, + "resources": { + "cpu": {"min": 2, "max": None}, + "memory": {"min": 8.0, "max": None}, + "disk": None, + "gpu": None, + "shm_size": None, + }, + "volumes": [json.loads(v.json()) for v in volumes], + "repos": [ + { + "url": "https://github.com/dstackai/dstack", + "branch": None, + "hash": None, + "local_path": None, + "path": "~/repo", + "if_exists": "error", + }, + ], + "files": [], + "backends": ["aws", "azure", "gcp", "lambda", "runpod"], + "regions": ["us"], + "availability_zones": None, + "instance_types": None, + "creation_policy": None, + "single_branch": None, + "max_duration": "off", + "stop_duration": None, + "max_price": None, + "retry": None, + "spot_policy": "auto", + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": 
None, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + "priority": 0, + }, + "configuration_path": "dstack.yaml", + "file_archives": [], + "profile": { + "backends": ["aws", "azure", "gcp", "lambda", "runpod"], + "regions": ["us"], + "availability_zones": None, + "instance_types": None, + "creation_policy": None, + "default": False, + "max_duration": "off", + "stop_duration": None, + "max_price": None, + "name": "string", + "retry": None, + "spot_policy": "auto", + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + }, + "repo_code_hash": None, + "repo_data": { + "repo_type": "remote", + "repo_name": "dstack", + "repo_branch": None, + "repo_hash": None, + "repo_config_name": None, + "repo_config_email": None, + }, + "repo_id": repo_id, + "repo_dir": "~/repo", + "run_name": run_name, + "ssh_key_pub": "ssh_key", + "working_dir": None, + } return { "project_name": project_name, "user": username, - "run_spec": { - "configuration": { - "entrypoint": None, - "env": {}, - "home_dir": "/root", - "ide": "vscode", - "version": None, - "image": None, - "init": [], - "ports": [], - "python": "3.8", - "registry_auth": None, - "setup": [], - "type": "dev-environment", - "resources": { - "cpu": {"min": 2, "max": None}, - "memory": {"min": 8.0, "max": None}, - "disk": None, - "gpu": None, - "shm_size": None, - }, - "volumes": [], - "backends": ["local", "aws", "azure", "gcp", "lambda"], - "regions": ["us"], - "instance_types": None, - "creation_policy": None, - "instance_name": None, - "max_duration": "off", - "max_price": None, - "pool_name": DEFAULT_POOL_NAME, - "retry": None, - "retry_policy": None, - "spot_policy": "spot", - "termination_idle_time": 300, - "termination_policy": None, - }, - "configuration_path": "dstack.yaml", - "profile": { - 
"backends": ["local", "aws", "azure", "gcp", "lambda"], - "regions": ["us"], - "instance_types": None, - "creation_policy": None, - "default": False, - "instance_name": None, - "max_duration": "off", - "max_price": None, - "name": "string", - "pool_name": DEFAULT_POOL_NAME, - "retry": None, - "retry_policy": None, - "spot_policy": "spot", - "termination_idle_time": 300, - "termination_policy": None, - }, - "repo_code_hash": None, - "repo_data": {"repo_dir": "/repo", "repo_type": "local"}, - "repo_id": repo_id, - "run_name": run_name, - "ssh_key_pub": "ssh_key", - "working_dir": ".", - }, + "run_spec": run_spec, + "effective_run_spec": run_spec, "job_plans": [ { "job_spec": { "app_specs": [], - "commands": [ - "/bin/bash", - "-i", - "-c", - "env >> ~/.ssh/environment && " - "(echo pip install ipykernel... && " - "pip install -q --no-cache-dir " - 'ipykernel 2> /dev/null) || echo "no ' - 'pip, ipykernel was not installed" ' - "&& echo '' && echo To open in VS " - "Code Desktop, use link below: && " - "echo '' && echo ' " - "vscode://vscode-remote/ssh-remote+dry-run/workflow' " - "&& echo '' && echo 'To connect via " - "SSH, use: `ssh dry-run`' && echo '' " - "&& echo -n 'To exit, press Ctrl+C.' 
" - "&& tail -f /dev/null", - ], + "commands": commands, "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.8-0.4-cuda-12.1", + "image_name": image_name, + "user": None, + "privileged": True if docker else privileged, "job_name": f"{run_name}-0-0", "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, + "replica_group": "0", + "single_branch": False, "max_duration": None, + "stop_duration": 300, + "utilization_policy": None, "registry_auth": None, "requirements": { "resources": { @@ -160,51 +296,139 @@ def get_dev_env_run_plan_dict( "shm_size": None, }, "max_price": None, - "spot": True, + "spot": None, + "reservation": None, + "multinode": False, + "backend_options": None, }, "retry": None, - "retry_policy": {"retry": False, "duration": None}, - "working_dir": ".", + "volumes": volumes, + "ssh_key": None, + "working_dir": None, + "repo_code_hash": None, + "repo_data": { + "repo_type": "remote", + "repo_name": "dstack", + "repo_branch": None, + "repo_hash": None, + "repo_config_name": None, + "repo_config_email": None, + }, + "repo_dir": "~/repo", + "repo_exists_action": "error", + "file_archives": [], + "service_port": None, + "probes": [], }, "offers": [json.loads(o.json()) for o in offers], "total_offers": total_offers, "max_price": max_price, } ], + "current_resource": current_resource.dict() if current_resource else None, + "action": action.value, } def get_dev_env_run_dict( - run_id: str = "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", - job_id: str = "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + run_id: Union[str, SomeUUID4Str] = "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + job_id: Union[str, SomeUUID4Str] = "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", project_name: str = "test_project", username: str = "test_user", - run_name: str = "run_name", + run_name: Optional[str] = "run_name", repo_id: str = "test_repo", submitted_at: str = "2023-01-02T03:04:00+00:00", last_processed_at: str = "2023-01-02T03:04:00+00:00", - finished_at: str = 
"2023-01-02T03:04:00+00:00", + finished_at: Optional[str] = "2023-01-02T03:04:00+00:00", + privileged: bool = False, + docker: Optional[bool] = None, + deleted: bool = False, ) -> Dict: + # When docker=True, commands should start with start-dockerd and use dind image + if docker: + commands = [ + "/bin/bash", + "-i", + "-c", + ( + "start-dockerd" + " && (echo 'uv pip install ipykernel...'" + " && uv pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || (echo 'pip install ipykernel...'" + " && pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || echo 'no uv or pip found, ipykernel was not installed'" + " && echo" + " && echo 'To open in VS Code, use link below:'" + " && echo" + ' && echo " vscode://vscode-remote/ssh-remote+test-run$DSTACK_WORKING_DIR"' + " && echo" + " && echo 'To connect via SSH, use: `ssh test-run`'" + " && echo" + " && echo -n 'To exit, press Ctrl+C.'" + " && tail -f /dev/null" + ), + ] + image_name = "dstackai/dind" + else: + commands = [ + "/bin/bash", + "-i", + "-c", + ( + "eval $(echo 'export DSTACK_VENV_DIR=/dstack/venv' | sudo tee -a /dstack/profile)" + " && sudo rm -rf $DSTACK_VENV_DIR" + " && sudo mkdir $DSTACK_VENV_DIR" + " && sudo chown $(id -u):$(id -g) $DSTACK_VENV_DIR" + " && uv venv -q --prompt dstack -p 3.13 --seed $DSTACK_VENV_DIR" + " && eval $(echo '. 
$DSTACK_VENV_DIR/bin/activate' | sudo tee -a /dstack/profile)" + " && (echo 'uv pip install ipykernel...'" + " && uv pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || (echo 'pip install ipykernel...'" + " && pip install -q --no-cache-dir ipykernel 2> /dev/null)" + " || echo 'no uv or pip found, ipykernel was not installed'" + " && echo" + " && echo 'To open in VS Code, use link below:'" + " && echo" + ' && echo " vscode://vscode-remote/ssh-remote+test-run$DSTACK_WORKING_DIR"' + " && echo" + " && echo 'To connect via SSH, use: `ssh test-run`'" + " && echo" + " && echo -n 'To exit, press Ctrl+C.'" + " && tail -f /dev/null" + ), + ] + image_name = "dstackai/base:0.14-base-ubuntu24.04" + return { "id": run_id, "project_name": project_name, "user": username, + "fleet": None, "submitted_at": submitted_at, "last_processed_at": last_processed_at, "status": "submitted", + "status_message": "submitted", "run_spec": { "configuration": { "entrypoint": None, "env": {}, "home_dir": "/root", + "working_dir": None, "ide": "vscode", + "inactivity_duration": None, "version": None, "image": None, + "user": None, + "docker": docker, + "shell": None, + "privileged": privileged, "init": [], "ports": [], - "python": "3.8", + "python": "3.13" if not docker else None, + "nvcc": None, "registry_auth": None, "setup": [], + "name": None, "type": "dev-environment", "resources": { "cpu": {"min": 2, "max": None}, @@ -214,75 +438,100 @@ def get_dev_env_run_dict( "shm_size": None, }, "volumes": [], - "backends": ["local", "aws", "azure", "gcp", "lambda"], + "repos": [ + { + "url": "https://github.com/dstackai/dstack", + "branch": None, + "hash": None, + "local_path": None, + "path": "~/repo", + "if_exists": "error", + }, + ], + "files": [], + "backends": ["aws", "azure", "gcp", "lambda"], "regions": ["us"], + "availability_zones": None, "instance_types": None, "creation_policy": None, - "instance_name": None, + "single_branch": None, "max_duration": "off", + 
"stop_duration": None, "max_price": None, - "pool_name": DEFAULT_POOL_NAME, "retry": None, - "retry_policy": None, - "spot_policy": "spot", - "termination_idle_time": 300, - "termination_policy": None, + "spot_policy": "auto", + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, + "priority": 0, }, "configuration_path": "dstack.yaml", + "file_archives": [], "profile": { - "backends": ["local", "aws", "azure", "gcp", "lambda"], + "backends": ["aws", "azure", "gcp", "lambda"], "regions": ["us"], + "availability_zones": None, "instance_types": None, "creation_policy": None, "default": False, - "instance_name": None, "max_duration": "off", + "stop_duration": None, "max_price": None, "name": "string", - "pool_name": DEFAULT_POOL_NAME, "retry": None, - "retry_policy": None, - "spot_policy": "spot", - "termination_idle_time": 300, - "termination_policy": None, + "spot_policy": "auto", + "idle_duration": None, + "utilization_policy": None, + "startup_order": None, + "stop_criteria": None, + "schedule": None, + "reservation": None, + "fleets": None, + "tags": None, + "backend_options": None, + "instances": None, }, "repo_code_hash": None, - "repo_data": {"repo_dir": "/repo", "repo_type": "local"}, + "repo_data": { + "repo_type": "remote", + "repo_name": "dstack", + "repo_branch": None, + "repo_hash": None, + "repo_config_name": None, + "repo_config_email": None, + }, "repo_id": repo_id, + "repo_dir": "~/repo", "run_name": run_name, "ssh_key_pub": "ssh_key", - "working_dir": ".", + "working_dir": None, }, "jobs": [ { "job_spec": { "app_specs": [], - "commands": [ - "/bin/bash", - "-i", - "-c", - "env >> ~/.ssh/environment && " - "(echo pip install ipykernel... 
&& " - "pip install -q --no-cache-dir " - 'ipykernel 2> /dev/null) || echo "no ' - 'pip, ipykernel was not installed" ' - "&& echo '' && echo To open in VS " - "Code Desktop, use link below: && " - "echo '' && echo ' " - "vscode://vscode-remote/ssh-remote+test-run/workflow' " - "&& echo '' && echo 'To connect via " - "SSH, use: `ssh test-run`' && echo '' " - "&& echo -n 'To exit, press Ctrl+C.' " - "&& tail -f /dev/null", - ], + "commands": commands, "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.8-0.4-cuda-12.1", + "image_name": image_name, + "user": None, + "privileged": True if docker else privileged, "job_name": f"{run_name}-0-0", "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, + "replica_group": "0", + "single_branch": False, "max_duration": None, + "stop_duration": 300, + "utilization_policy": None, "registry_auth": None, "requirements": { "resources": { @@ -293,54 +542,141 @@ def get_dev_env_run_dict( "shm_size": None, }, "max_price": None, - "spot": True, + "spot": None, + "reservation": None, + "multinode": False, + "backend_options": None, }, "retry": None, - "retry_policy": {"retry": False, "duration": None}, - "working_dir": ".", + "volumes": [], + "ssh_key": None, + "working_dir": None, + "repo_code_hash": None, + "repo_data": { + "repo_type": "remote", + "repo_name": "dstack", + "repo_branch": None, + "repo_hash": None, + "repo_config_name": None, + "repo_config_email": None, + }, + "repo_dir": "~/repo", + "repo_exists_action": "error", + "file_archives": [], + "service_port": None, + "probes": [], }, "job_submissions": [ { "id": job_id, "submission_num": 0, + "deployment_num": 0, "submitted_at": submitted_at, "last_processed_at": last_processed_at, "finished_at": finished_at, + "inactivity_secs": None, "status": "submitted", + "status_message": "submitted", "termination_reason": None, "termination_reason_message": None, + "error": None, + "exit_status": None, "job_provisioning_data": None, + "job_runtime_data": None, + 
"probes": [], + "image_pull_progress": None, } ], + "job_connection_info": None, } ], "latest_job_submission": { "id": job_id, "submission_num": 0, + "deployment_num": 0, "submitted_at": submitted_at, "last_processed_at": last_processed_at, + "inactivity_secs": None, "finished_at": finished_at, "status": "submitted", + "status_message": "submitted", "termination_reason": None, "termination_reason_message": None, + "error": None, + "exit_status": None, "job_provisioning_data": None, + "job_runtime_data": None, + "probes": [], + "image_pull_progress": None, }, "cost": 0.0, "service": None, + "deployment_num": 0, "termination_reason": None, + "error": None, + "deleted": deleted, + "next_triggered_at": None, + } + + +def get_service_run_spec( + repo_id: str, + run_name: Optional[str] = None, + gateway: Optional[Union[bool, str]] = None, + model: Union[str, dict] = "test-model", +) -> dict: + return { + "configuration": { + "type": "service", + "commands": ["python -m http.server"], + "port": 8000, + "gateway": gateway, + "model": model, + "repos": [ + { + "url": "https://github.com/dstackai/dstack", + "branch": None, + "hash": None, + "local_path": None, + "path": "~/repo", + "if_exists": "error", + }, + ], + }, + "configuration_path": "dstack.yaml", + "file_archives": [], + "profile": { + "name": "string", + }, + "repo_code_hash": None, + "repo_data": { + "repo_type": "remote", + "repo_name": "dstack", + "repo_branch": None, + "repo_hash": None, + "repo_config_name": None, + "repo_config_email": None, + }, + "repo_id": repo_id, + "repo_dir": "~/repo", + "run_name": run_name, + "ssh_key_pub": "ssh_key", + "working_dir": None, } class TestListRuns: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/runs/list") + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/runs/list") assert 
response.status_code in [401, 403] @pytest.mark.asyncio - async def test_lists_runs(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_runs(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) + fleet = await create_fleet(session=session, project=project) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) @@ -354,6 +690,7 @@ async def test_lists_runs(self, test_db, session: AsyncSession): project=project, repo=repo, user=user, + fleet=fleet, submitted_at=run1_submitted_at, ) run1_spec = RunSpec.parse_raw(run1.run_spec) @@ -370,10 +707,11 @@ async def test_lists_runs(self, test_db, session: AsyncSession): project=project, repo=repo, user=user, + fleet=fleet, submitted_at=run2_submitted_at, ) run2_spec = RunSpec.parse_raw(run2.run_spec) - response = client.post( + response = await client.post( "/api/runs/list", headers=get_auth_headers(user.token), json={}, @@ -384,9 +722,14 @@ async def test_lists_runs(self, test_db, session: AsyncSession): "id": str(run1.id), "project_name": project.name, "user": user.name, + "fleet": { + "id": str(fleet.id), + "name": fleet.name, + }, "submitted_at": run1_submitted_at.isoformat(), "last_processed_at": run1_submitted_at.isoformat(), "status": "submitted", + "status_message": "submitted", "run_spec": run1_spec.dict(), "jobs": [ { @@ -395,50 +738,83 @@ async def test_lists_runs(self, test_db, session: AsyncSession): { "id": str(job.id), "submission_num": 0, + "deployment_num": 0, "submitted_at": run1_submitted_at.isoformat(), "last_processed_at": run1_submitted_at.isoformat(), "finished_at": None, + "inactivity_secs": None, "status": "submitted", + "status_message": "submitted", "termination_reason": None, "termination_reason_message": None, + "error": None, + 
"exit_status": None, "job_provisioning_data": None, + "job_runtime_data": None, + "probes": [], + "image_pull_progress": None, } ], + "job_connection_info": None, } ], "latest_job_submission": { "id": str(job.id), "submission_num": 0, + "deployment_num": 0, "submitted_at": run1_submitted_at.isoformat(), "last_processed_at": run1_submitted_at.isoformat(), "finished_at": None, + "inactivity_secs": None, "status": "submitted", + "status_message": "submitted", "termination_reason_message": None, "termination_reason": None, + "error": None, + "exit_status": None, "job_provisioning_data": None, + "job_runtime_data": None, + "probes": [], + "image_pull_progress": None, }, "cost": 0, "service": None, + "deployment_num": 0, "termination_reason": None, + "error": None, + "deleted": False, + "next_triggered_at": None, }, { "id": str(run2.id), "project_name": project.name, "user": user.name, + "fleet": { + "id": str(fleet.id), + "name": fleet.name, + }, "submitted_at": run2_submitted_at.isoformat(), "last_processed_at": run2_submitted_at.isoformat(), "status": "submitted", + "status_message": "submitted", "run_spec": run2_spec.dict(), "jobs": [], "latest_job_submission": None, "cost": 0, "service": None, + "deployment_num": 0, "termination_reason": None, + "error": None, + "deleted": False, + "next_triggered_at": None, }, ] @pytest.mark.asyncio - async def test_lists_runs_pagination(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_runs_pagination( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -471,7 +847,7 @@ async def test_lists_runs_pagination(self, test_db, session: AsyncSession): user=user, submitted_at=datetime(2023, 1, 2, 5, 15, tzinfo=timezone.utc), ) - response1 = client.post( + response1 = await 
client.post( "/api/runs/list", headers=get_auth_headers(user.token), json={"limit": 2}, @@ -481,7 +857,7 @@ async def test_lists_runs_pagination(self, test_db, session: AsyncSession): assert len(response1_json) == 2 assert response1_json[0]["id"] == str(run3.id) assert response1_json[1]["id"] == str(run1.id) - response2 = client.post( + response2 = await client.post( "/api/runs/list", headers=get_auth_headers(user.token), json={ @@ -495,155 +871,2565 @@ async def test_lists_runs_pagination(self, test_db, session: AsyncSession): assert len(response2_json) == 1 assert response2_json[0]["id"] == str(run2.id) - -class TestGetRunPlan: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_limits_job_submissions( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/runs/get_plan", + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run_submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + submitted_at=run_submitted_at, + ) + run_spec = RunSpec.parse_raw(run.run_spec) + await create_job( + session=session, + run=run, + submitted_at=run_submitted_at, + last_processed_at=run_submitted_at, + ) + job2 = await create_job( + session=session, + run=run, + submitted_at=run_submitted_at, + last_processed_at=run_submitted_at, + ) + job2_spec = JobSpec.parse_raw(job2.job_spec_data) + response = await client.post( + "/api/runs/list", headers=get_auth_headers(user.token), + json={"job_submissions_limit": 1}, 
) - assert response.status_code == 403 + assert response.status_code == 200, response.json() + assert response.json() == [ + { + "id": str(run.id), + "project_name": project.name, + "user": user.name, + "fleet": None, + "submitted_at": run_submitted_at.isoformat(), + "last_processed_at": run_submitted_at.isoformat(), + "status": "submitted", + "status_message": "submitted", + "run_spec": run_spec.dict(), + "jobs": [ + { + "job_spec": job2_spec.dict(), + "job_submissions": [ + { + "id": str(job2.id), + "submission_num": 0, + "deployment_num": 0, + "submitted_at": run_submitted_at.isoformat(), + "last_processed_at": run_submitted_at.isoformat(), + "finished_at": None, + "inactivity_secs": None, + "status": "submitted", + "status_message": "submitted", + "termination_reason": None, + "termination_reason_message": None, + "error": None, + "exit_status": None, + "job_provisioning_data": None, + "job_runtime_data": None, + "probes": [], + "image_pull_progress": None, + } + ], + "job_connection_info": None, + } + ], + "latest_job_submission": { + "id": str(job2.id), + "submission_num": 0, + "deployment_num": 0, + "submitted_at": run_submitted_at.isoformat(), + "last_processed_at": run_submitted_at.isoformat(), + "finished_at": None, + "inactivity_secs": None, + "status": "submitted", + "status_message": "submitted", + "termination_reason_message": None, + "termination_reason": None, + "error": None, + "exit_status": None, + "job_provisioning_data": None, + "job_runtime_data": None, + "probes": [], + "image_pull_progress": None, + }, + "cost": 0, + "service": None, + "deployment_num": 0, + "termination_reason": None, + "error": None, + "deleted": False, + "next_triggered_at": None, + }, + ] @pytest.mark.asyncio - async def test_returns_run_plan(self, test_db, session: AsyncSession): - user = await create_user(session=session, global_role=GlobalRole.USER) + @pytest.mark.parametrize( + "client_version,expected_probes", + [ + ("0.20.7", []), + ("0.20.8", None), + (None, 
None), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_service_configuration_probes_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_probes: Optional[list], + ) -> None: + user = await create_user(session=session) project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) repo = await create_repo(session=session, project_id=project.id) - offers = [ - InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="us", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) - ] - run_plan_dict = get_dev_env_run_plan_dict( - project_name=project.name, - username=user.name, + + service_conf = ServiceConfiguration( + commands=["echo hello"], + port=80, + probes=None, # This should be patched to [] for clients prior to 0.20.8 + ) + run_spec = get_run_spec( + configuration=service_conf, repo_id=repo.name, - offers=offers, - total_offers=1, - max_price=1.0, ) - body = {"run_spec": run_plan_dict["run_spec"]} - with patch("dstack._internal.server.services.backends.get_project_backends") as m: - backend_mock = Mock() - m.return_value = [backend_mock] - backend_mock.TYPE = BackendType.AWS - backend_mock.compute.return_value.get_offers.return_value = offers - response = client.post( - f"/api/project/{project.name}/runs/get_plan", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200, response.json() - assert response.json() == run_plan_dict + await create_run(session=session, project=project, repo=repo, user=user, run_spec=run_spec) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = 
await client.post( + "/api/runs/list", + headers=headers, + json={"project_name": project.name}, + ) + assert response.status_code == 200 + runs_list = response.json() + assert len(runs_list) == 1 + assert runs_list[0]["run_spec"]["configuration"]["probes"] == expected_probes -class TestSubmitRun: + +class TestGetRun: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/runs/submit", + response = await client.post( + f"/api/project/{project.name}/runs/get", headers=get_auth_headers(user.token), + json={"run_name": "myrun"}, ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_submits_run(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_given_name( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - run_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) - submitted_at_formatted = "2023-01-02T03:04:00+00:00" - last_processed_at_formatted = submitted_at_formatted - repo = await create_repo(session=session, project_id=project.id) - run_dict = get_dev_env_run_dict( - run_id=str(run_id), - job_id=str(run_id), - project_name=project.name, - username=user.name, - submitted_at=submitted_at_formatted, - 
last_processed_at=last_processed_at_formatted, - finished_at=None, - run_name="test-run", - repo_id=repo.name, + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=get_auth_headers(user.token), + json={"run_name": "nonexistent_run_name"}, + ) + assert response.status_code == 400 + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=get_auth_headers(user.token), + json={"run_name": run.run_name}, ) - body = {"run_spec": run_dict["run_spec"]} - with patch("uuid.uuid4") as uuid_mock, patch( - "dstack._internal.utils.common.get_current_datetime" - ) as datetime_mock, patch( - "dstack._internal.server.services.backends.get_project_backends" - ) as get_project_backends_mock: - get_project_backends_mock.return_value = [Mock()] - uuid_mock.return_value = run_id - datetime_mock.return_value = submitted_at - response = client.post( - f"/api/project/{project.name}/runs/submit", - headers=get_auth_headers(user.token), - json=body, - ) assert response.status_code == 200, response.json() - assert response.json() == run_dict - res = await session.execute(select(RunModel)) - run = res.scalar() - assert run is not None - res = await session.execute(select(JobModel)) - job = res.scalar() - assert job is not None + assert response.json()["id"] == str(run.id) @pytest.mark.asyncio - async def test_submits_run_without_run_name(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_deleted_run_given_id( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, 
project_role=ProjectRole.USER ) - repo = await create_repo(session=session, project_id=project.id) - run_dict = get_dev_env_run_dict( - project_name=project.name, - username=user.name, - run_name=None, - repo_id=repo.name, + repo = await create_repo( + session=session, + project_id=project.id, ) - body = {"run_spec": run_dict["run_spec"]} - with patch("uuid.uuid4") as uuid_mock, patch( - "dstack._internal.server.services.backends.get_project_backends" - ) as get_project_backends_mock: - get_project_backends_mock.return_value = [Mock()] - uuid_mock.return_value = run_dict["id"] - response = client.post( - f"/api/project/{project.name}/runs/submit", - headers=get_auth_headers(user.token), - json=body, - ) - assert response.status_code == 200 - assert response.json()["run_spec"]["run_name"] is not None - res = await session.execute(select(RunModel)) - run = res.scalar() - assert run is not None - res = await session.execute(select(JobModel)) - job = res.scalar() - assert job is not None + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + deleted=True, + ) + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=get_auth_headers(user.token), + json={"id": str(run.id)}, + ) + assert response.status_code == 200, response.json() + assert response.json()["id"] == str(run.id) @pytest.mark.asyncio @pytest.mark.parametrize( - "run_name", + "client_version,expected_probes", + [ + ("0.20.7", []), + ("0.20.8", None), + (None, None), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_service_configuration_probes_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_probes: Optional[list], + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + + 
service_conf = ServiceConfiguration( + commands=["echo hello"], + port=80, + probes=None, # This should be patched to [] for clients prior to 0.20.8 + ) + run_spec = get_run_spec( + configuration=service_conf, + repo_id=repo.name, + ) + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=headers, + json={"run_name": run.run_name}, + ) + + assert response.status_code == 200 + assert response.json()["run_spec"]["configuration"]["probes"] == expected_probes + + @pytest.mark.asyncio + @pytest.mark.parametrize( + ("ide", "ide_name", "attached_ide_url", "proxied_ide_url_tmpl"), + [ + pytest.param( + "vscode", + "VS Code", + "vscode://vscode-remote/ssh-remote+dev-env/test", + "vscode://vscode-remote/ssh-remote+{auth}/test", + id="vscode", + ), + pytest.param( + "cursor", + "Cursor", + "cursor://vscode-remote/ssh-remote+dev-env/test", + "cursor://vscode-remote/ssh-remote+{auth}/test", + id="cursor", + ), + pytest.param( + "windsurf", + "Windsurf", + "windsurf://vscode-remote/ssh-remote+dev-env/test", + "windsurf://vscode-remote/ssh-remote+{auth}/test", + id="windsurf", + ), + pytest.param( + "zed", + "Zed", + "zed://ssh/dev-env/test", + "zed://ssh/{auth}/test", + id="zed", + ), + ], + ) + @pytest.mark.parametrize( + "sshproxy", + [ + pytest.param(False, id="without-sshproxy"), + pytest.param(True, id="with-sshproxy"), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_with_job_connection_info_dev_environment( + self, + monkeypatch: pytest.MonkeyPatch, + test_db, + session: AsyncSession, + client: AsyncClient, + sshproxy: bool, + ide: str, + ide_name: str, + attached_ide_url: str, + proxied_ide_url_tmpl: str, + ): + 
monkeypatch.setattr("dstack._internal.server.settings.SSHPROXY_ENABLED", sshproxy) + monkeypatch.setattr("dstack._internal.server.settings.SSHPROXY_HOSTNAME", "example.com") + monkeypatch.setattr("dstack._internal.server.settings.SSHPROXY_PORT", 2222) + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="dev-env", + configuration=DevEnvironmentConfiguration(ide=ide), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + run_name=run_spec.run_name, + ) + job_runtime_data = get_job_runtime_data(working_dir="/test") + job = await create_job( + session=session, run=run, status=JobStatus.RUNNING, job_runtime_data=job_runtime_data + ) + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=get_auth_headers(user.token), + json={"run_name": run.run_name}, + ) + assert response.status_code == 200, response.json() + proxied_authority = f"{job.id.hex}@example.com:2222" + assert response.json()["jobs"][0]["job_connection_info"] == { + "ide_name": ide_name, + "attached_ide_url": attached_ide_url, + "proxied_ide_url": proxied_ide_url_tmpl.format(auth=proxied_authority) + if sshproxy + else None, + "attached_ssh_command": ["ssh", "dev-env"], + "proxied_ssh_command": ["ssh", f"{job.id.hex}@example.com", "-p", "2222"] + if sshproxy + else None, + "sshproxy_hostname": "example.com" if sshproxy else None, + "sshproxy_port": 2222 if sshproxy else None, + "sshproxy_upstream_id": job.id.hex if sshproxy else None, + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_returns_run_with_job_connection_info_multi_replica_multi_node_task( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run_spec = get_run_spec( + repo_id=repo.name, + run_name="test-task", + configuration=TaskConfiguration(commands=["sleep inf"]), + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + run_name=run_spec.run_name, + ) + job_runtime_data = get_job_runtime_data(working_dir="/test") + for replica_num in range(2): + for job_num in range(2): + await create_job( + session=session, + run=run, + # test-task-1-1 is still PULLING, other jobs are RUNNING + status=JobStatus.PULLING if replica_num == job_num == 1 else JobStatus.RUNNING, + job_runtime_data=job_runtime_data, + replica_num=replica_num, + job_num=job_num, + ) + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=get_auth_headers(user.token), + json={"run_name": run.run_name}, + ) + assert response.status_code == 200, response.json() + jobs = response.json()["jobs"] + common_fields = { + "ide_name": None, + "attached_ide_url": None, + "proxied_ide_url": None, + "proxied_ssh_command": None, + "sshproxy_hostname": None, + "sshproxy_port": None, + "sshproxy_upstream_id": None, + } + assert jobs[0]["job_connection_info"] == { + "attached_ssh_command": ["ssh", "test-task"], + **common_fields, + } + assert jobs[1]["job_connection_info"] == { + "attached_ssh_command": ["ssh", "test-task-1-0"], + **common_fields, + } + assert jobs[2]["job_connection_info"] == { + "attached_ssh_command": ["ssh", "test-task-0-1"], + **common_fields, + } + assert jobs[3]["job_connection_info"] is None + 
+ @pytest.mark.asyncio + @pytest.mark.parametrize( + "client_version,expected_fleets", + [ + ( + "0.20.13", + [ + "my-fleet", + "other-project/other-fleet", + ], + ), + ( + "0.20.14", + [ + {"project": None, "name": "my-fleet"}, + {"project": "other-project", "name": "other-fleet"}, + ], + ), + ( + None, + [ + {"project": None, "name": "my-fleet"}, + {"project": "other-project", "name": "other-fleet"}, + ], + ), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_fleets_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_fleets: list, + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + + fleets: list[Union[EntityReference, str]] = [ + EntityReference(project=None, name="my-fleet"), + EntityReference(project="other-project", name="other-fleet"), + ] + run_spec = get_run_spec( + configuration=TaskConfiguration( + commands=["echo hello"], + fleets=fleets, + ), + repo_id=repo.name, + profile=Profile( + fleets=fleets, + ), + ) + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=headers, + json={"run_name": run.run_name}, + ) + + assert response.status_code == 200 + assert response.json()["run_spec"]["configuration"]["fleets"] == expected_fleets + assert response.json()["run_spec"]["profile"]["fleets"] == expected_fleets + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "client_version,expected_gateway", + [ + ( + "0.20.19", + "other-project/my-gateway", + ), + ( + "0.20.20", + {"project": "other-project", "name": "my-gateway"}, + ), + ( + None, 
+ {"project": "other-project", "name": "my-gateway"}, + ), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_service_gateway_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_gateway, + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + + run_spec = get_run_spec( + configuration=ServiceConfiguration( + commands=["echo hello"], + port=80, + gateway=EntityReference(project="other-project", name="my-gateway"), + ), + repo_id=repo.name, + ) + run = await create_run( + session=session, project=project, repo=repo, user=user, run_spec=run_spec + ) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/runs/get", + headers=headers, + json={"run_name": run.run_name}, + ) + + assert response.status_code == 200 + assert response.json()["run_spec"]["configuration"]["gateway"] == expected_gateway + + +class TestGetRunPlan: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_privileged_false( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await 
create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offer_aws = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + offer_runpod = InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=[offer_aws, offer_runpod], + total_offers=2, + max_price=2.0, + privileged=False, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [offer_aws] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers.return_value = [offer_runpod] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_plan_dict + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_privileged_true( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offer_aws = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + offer_runpod = InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=[offer_aws], + total_offers=1, + max_price=1.0, + privileged=True, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [offer_aws] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers.return_value = [offer_runpod] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + 
f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_plan_dict + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_task_with_two_nodes_returns_two_job_plans( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offer = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=4, memory_mib=16384, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(commands=["echo hi"], nodes=2), + ) + body = {"run_spec": json.loads(run_spec.json())} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = [offer] + m.return_value = [backend_mock] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + job_plans = response.json()["job_plans"] + assert len(job_plans) == 2 + assert job_plans[0]["job_spec"]["job_num"] == 0 + assert 
job_plans[1]["job_spec"]["job_num"] == 1 + assert len(job_plans[0]["offers"]) == 1 + assert job_plans[0]["offers"] == job_plans[1]["offers"] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_service_with_two_replica_groups_returns_two_job_plans( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + gpu_offer = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="gpu-instance", + resources=Resources( + cpus=8, + memory_mib=32768, + spot=False, + gpus=[Gpu(name="A100", memory_mib=40960)], + ), + ), + region="us", + price=5.0, + availability=InstanceAvailability.AVAILABLE, + ) + cpu_offer = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="cpu-instance", + resources=Resources(cpus=4, memory_mib=16384, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=ServiceConfiguration( + port=8080, + gateway=False, + replicas=[ + ReplicaGroup( + name="gpu-group", + count=Range[int](min=2, max=2), + resources=ResourcesSpec(gpu=GPUSpec()), + commands=["python server.py"], + ), + ReplicaGroup( + name="cpu-group", + count=Range[int](min=1, max=1), + resources=ResourcesSpec(gpu=None), + commands=["python router.py"], + ), + ], + ), + ) + body = {"run_spec": json.loads(run_spec.json())} + + def 
offers_by_requirements(requirements: Requirements): + if ( + requirements.resources.gpu is not None + and requirements.resources.gpu.count.min is not None + and requirements.resources.gpu.count.min > 0 + ): + return [gpu_offer] + return [cpu_offer] + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.side_effect = offers_by_requirements + m.return_value = [backend_mock] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + gpu_job_plan, cpu_job_plan = response.json()["job_plans"] + assert gpu_job_plan["offers"][0]["instance"]["resources"]["gpus"] != [] + assert cpu_job_plan["offers"][0]["instance"]["resources"]["gpus"] == [] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_service_reservation_group_filters_backends_by_reservation_support( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=ServiceConfiguration( + port=8080, + gateway=False, + image="nginx", + replicas=[ + ReplicaGroup( + name="reserved-group", + count=Range[int](min=1, max=1), + reservation="my-reservation-id", + ), + ReplicaGroup( + count=Range[int](min=1, max=1), + name="unreserved-group", + 
), + ], + ), + ) + body = {"run_spec": json.loads(run_spec.json())} + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, price=2) + ] + verda_backend_mock = Mock() + verda_backend_mock.TYPE = BackendType.VERDA + verda_backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.VERDA, price=1) + ] + m.return_value = [aws_backend_mock, verda_backend_mock] + + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + reserved_job_plan, unreserved_job_plan = response.json()["job_plans"] + + # Verda offer not included for `reserved-group`, since Verda does not support reservations + assert reserved_job_plan["offers"][0]["backend"] == "aws" + assert len(reserved_job_plan["offers"]) == 1 + + assert unreserved_job_plan["offers"][0]["backend"] == "verda" + assert unreserved_job_plan["offers"][1]["backend"] == "aws" + assert len(unreserved_job_plan["offers"]) == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_docker_true( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, 
project_id=project.id) + offer_aws = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + offer_runpod = InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=[offer_aws], + total_offers=1, + max_price=1.0, + docker=True, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [offer_aws] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers.return_value = [offer_runpod] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_plan_dict + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_instance_volumes( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet_spec = get_fleet_spec() 
+ fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offer_aws = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + offer_runpod = InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=[offer_aws], + total_offers=1, + max_price=1.0, + volumes=[InstanceMountPoint.parse("/data:/data")], + ) + body = {"run_spec": run_plan_dict["run_spec"]} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [offer_aws] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers.return_value = [offer_runpod] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_plan_dict + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "configuration", + [ + pytest.param({"type": "dev-environment", "ide": "vscode"}, id="regular-configuration"), + pytest.param( + {"type": 
"task", "commands": [":"], "image": "scratch"}, + id="special-configuration-used-by-dstack-offer-cli-command", + ), + ], + ) + async def test_returns_run_plan_with_offer_from_imported_fleet( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + configuration: dict, + ) -> None: + importer_user = await create_user(session, global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + instance_num=1, + backend=BackendType.REMOTE, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + + run_spec = {"configuration": configuration} + body = {"run_spec": run_spec} + response = await client.post( + "/api/project/importer-project/runs/get_plan", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200, response.json() + response_json = response.json() + assert response_json["project_name"] == "importer-project" + assert response_json["job_plans"][0]["offers"][0]["backend"] == "remote" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_no_offers_if_imported_ssh_fleet_is_empty( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + importer_user = await create_user(session, global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + 
session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + + run_spec = {"configuration": {"type": "dev-environment", "ide": "vscode"}} + body = {"run_spec": run_spec} + response = await client.post( + "/api/project/importer-project/runs/get_plan", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200, response.json() + response_json = response.json() + assert response_json["project_name"] == "importer-project" + assert len(response_json["job_plans"][0]["offers"]) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + ("configured_fleet", "expected_price"), + [ + ("exporter-a/test-fleet", 1.0), + ("exporter-b/test-fleet", 2.0), + ("importer/test-fleet", 3.0), + ("test-fleet", 3.0), + ], + ) + async def test_returns_run_plan_offers_from_specified_fleet_across_projects( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + configured_fleet: str, + expected_price: float, + ) -> None: + user = await create_user(session, global_role=GlobalRole.USER) + exporter_a = await create_project(session, name="exporter-a", owner=user) + exporter_b = await create_project(session, name="exporter-b", owner=user) + importer = await create_project(session, name="importer", owner=user) + await add_project_member( + session=session, + project=importer, + user=user, + project_role=ProjectRole.USER, + ) + fleet_a = await create_fleet( + session=session, + project=exporter_a, + name="test-fleet", + 
spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=exporter_a, + fleet=fleet_a, + backend=BackendType.REMOTE, + price=1.0, + ) + fleet_b = await create_fleet( + session=session, + project=exporter_b, + name="test-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=exporter_b, + fleet=fleet_b, + backend=BackendType.REMOTE, + price=2.0, + ) + fleet_importer = await create_fleet( + session=session, + project=importer, + name="test-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=importer, + fleet=fleet_importer, + backend=BackendType.REMOTE, + price=3.0, + ) + await create_export( + session=session, + exporter_project=exporter_a, + importer_projects=[importer], + exported_fleets=[fleet_a], + ) + await create_export( + session=session, + exporter_project=exporter_b, + importer_projects=[importer], + exported_fleets=[fleet_b], + ) + + run_spec = { + "configuration": { + "type": "dev-environment", + "ide": "vscode", + "fleets": [configured_fleet], + } + } + body = {"run_spec": run_spec} + response = await client.post( + "/api/project/importer/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + response_json = response.json() + assert response_json["project_name"] == "importer" + offers = response_json["job_plans"][0]["offers"] + assert offers[0]["price"] == expected_price + assert len(offers) == 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_no_offers_if_imported_fleet_specified_without_project_prefix( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + importer_user = await create_user(session, global_role=GlobalRole.USER) + exporter_a = await create_project(session, name="exporter-a") + 
importer = await create_project(session, name="importer", owner=importer_user) + await add_project_member( + session=session, + project=importer, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet_a = await create_fleet( + session=session, + project=exporter_a, + name="test-fleet", + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=exporter_a, + fleet=fleet_a, + backend=BackendType.REMOTE, + ) + await create_export( + session=session, + exporter_project=exporter_a, + importer_projects=[importer], + exported_fleets=[fleet_a], + ) + + run_spec = { + "configuration": { + "type": "dev-environment", + "ide": "vscode", + "fleets": ["test-fleet"], # won't work, should be exporter-a/test-fleet + } + } + body = {"run_spec": run_spec} + response = await client.post( + "/api/project/importer/runs/get_plan", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200, response.json() + response_json = response.json() + assert response_json["project_name"] == "importer" + assert len(response_json["job_plans"][0]["offers"]) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "configuration", + [ + pytest.param({"type": "dev-environment"}, id="regular-configuration"), + pytest.param( + {"type": "task", "commands": [":"], "image": "scratch"}, + id="special-configuration-used-by-dstack-offer-cli-command", + ), + pytest.param( + {"type": "task", "commands": [":"], "image": "scratch", "fleets": ["test-fleet"]}, + id="special-configuration-used-by-dstack-offer-cli-command-with-fleets", # --fleet + ), + ], + ) + async def test_preserves_backend_specific_offer_order( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + configuration: dict, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await 
create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(conf=get_fleet_configuration(name="test-fleet")), + ) + + run_spec = get_run_spec( + repo_id=repo.name, configuration=parse_run_configuration(configuration) + ) + body = {"run_spec": run_spec.dict()} + + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0), + get_instance_offer_with_availability(backend=BackendType.AWS, price=4.0), + ] + backend_mock_vastai = Mock() + backend_mock_vastai.TYPE = BackendType.VASTAI + backend_mock_vastai.compute.return_value.get_offers.return_value = [ + # not ordered by price - custom order should be preserved + get_instance_offer_with_availability(backend=BackendType.VASTAI, price=3.0), + get_instance_offer_with_availability(backend=BackendType.VASTAI, price=2.0), + ] + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock_aws, backend_mock_vastai] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + + assert response.status_code == 200, response.json() + offers = [(o["backend"], o["price"]) for o in response.json()["job_plans"][0]["offers"]] + expected_offers = [ + (BackendType.AWS.value, 1.0), + (BackendType.VASTAI.value, 3.0), + (BackendType.VASTAI.value, 2.0), + (BackendType.AWS.value, 4.0), + ] + assert offers == expected_offers + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_cli_preserves_backend_specific_offer_order_across_fleets( + self, 
test_db, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration(name="fleet-aws", backends=[BackendType.AWS]) + ), + ) + await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration(name="fleet-vastai", backends=[BackendType.VASTAI]) + ), + ) + + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + fleets=["fleet-aws", "fleet-vastai"], + ), + ) + body = {"run_spec": run_spec.dict()} + + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0), + get_instance_offer_with_availability(backend=BackendType.AWS, price=4.0), + ] + backend_mock_vastai = Mock() + backend_mock_vastai.TYPE = BackendType.VASTAI + backend_mock_vastai.compute.return_value.get_offers.return_value = [ + # not ordered by price - custom order should be preserved + get_instance_offer_with_availability(backend=BackendType.VASTAI, price=3.0), + get_instance_offer_with_availability(backend=BackendType.VASTAI, price=2.0), + ] + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock_aws, backend_mock_vastai] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + + assert response.status_code == 200, response.json() + offers = [(o["backend"], o["price"]) for o in 
response.json()["job_plans"][0]["offers"]] + expected_offers = [ + (BackendType.AWS.value, 1.0), + (BackendType.VASTAI.value, 3.0), + (BackendType.VASTAI.value, 2.0), + (BackendType.AWS.value, 4.0), + ] + assert offers == expected_offers + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_cli_returns_offers_from_all_specified_fleets( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + + fleet_a = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="fleet-a")), + ) + await create_instance( + session=session, + project=project, + fleet=fleet_a, + backend=BackendType.REMOTE, + price=1.0, + ) + fleet_b = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="fleet-b")), + ) + await create_instance( + session=session, + project=project, + fleet=fleet_b, + backend=BackendType.REMOTE, + price=2.0, + ) + fleet_c = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="fleet-c")), + ) + await create_instance( + session=session, + project=project, + fleet=fleet_c, + backend=BackendType.REMOTE, + price=3.0, + ) + + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + fleets=["fleet-a", "fleet-b"], + ), + ) + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200, 
response.json() + offers = response.json()["job_plans"][0]["offers"] + assert len(offers) == 2 + assert [offer["price"] for offer in offers] == [1.0, 2.0] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_cli_deduplicates_identical_backend_offers_across_specified_fleets( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + await create_fleet( + session=session, + project=project, + name="fleet-a", + spec=get_fleet_spec(profile=Profile(backends=[BackendType.AWS])), + ) + await create_fleet( + session=session, + project=project, + name="fleet-b", + spec=get_fleet_spec(profile=Profile(backends=[BackendType.AWS])), + ) + + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + fleets=["fleet-a", "fleet-b"], + ), + ) + body = {"run_spec": run_spec.dict()} + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-aws", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=1.0, + backend_data={"provider_data": {"zone": "us-a"}, "labels": ["gpu"]}, + availability=InstanceAvailability.AVAILABLE, + availability_zones=["us-a"], + ) + ] + m.return_value = [backend_mock_aws] + + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + 
headers=get_auth_headers(user.token), + json=body, + ) + + assert response.status_code == 200, response.json() + job_plan = response.json()["job_plans"][0] + assert job_plan["total_offers"] == 1 + assert len(job_plan["offers"]) == 1 + assert job_plan["offers"][0]["price"] == 1.0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_cli_keeps_identical_existing_instances_from_specified_fleets( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + + fleet_a = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="fleet-a")), + ) + await create_instance( + session=session, + project=project, + fleet=fleet_a, + backend=BackendType.REMOTE, + price=1.0, + ) + fleet_b = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="fleet-b")), + ) + await create_instance( + session=session, + project=project, + fleet=fleet_b, + backend=BackendType.REMOTE, + price=1.0, + ) + + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + fleets=["fleet-a", "fleet-b"], + ), + ) + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200, response.json() + job_plan = response.json()["job_plans"][0] + assert job_plan["total_offers"] == 2 + assert len(job_plan["offers"]) == 2 + assert [offer["price"] for offer in job_plan["offers"]] == [1.0, 1.0] + + 
@pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_cli_without_fleet_keeps_global_offers( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + ), + ) + body = {"run_spec": run_spec.dict()} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers.return_value = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-aws", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + ] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers.return_value = [ + InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance-runpod", + resources=Resources(cpus=2, memory_mib=8192, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + ] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + + assert response.status_code == 200, response.json() + offers = response.json()["job_plans"][0]["offers"] + assert [offer["backend"] for offer in 
offers] == ["aws", "runpod"] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_without_fleets_uses_global_offer_collection( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + ), + ) + global_offer = get_instance_offer_with_availability(price=1.0) + with ( + patch( + "dstack._internal.server.services.runs.plan._get_non_fleet_offers", + new=AsyncMock(return_value=([(Mock(), global_offer)], [])), + ) as get_non_fleet_offers_mock, + patch( + "dstack._internal.server.services.runs.plan._get_offers_in_run_candidate_fleets", + new=AsyncMock( + side_effect=AssertionError( + "_get_offers_in_run_candidate_fleets should not be called" + ) + ), + ) as get_offers_in_run_candidate_fleets_mock, + patch( + "dstack._internal.server.services.runs.plan.find_optimal_fleet_with_offers", + new=AsyncMock( + side_effect=AssertionError( + "find_optimal_fleet_with_offers should not be called" + ) + ), + ) as find_optimal_fleet_with_offers_mock, + ): + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200, response.json() + get_non_fleet_offers_mock.assert_awaited_once() + get_offers_in_run_candidate_fleets_mock.assert_not_called() + find_optimal_fleet_with_offers_mock.assert_not_called() + job_plan = response.json()["job_plans"][0] + assert job_plan["total_offers"] == 1 + assert 
job_plan["offers"][0]["price"] == 1.0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_offer_with_fleets_uses_selected_fleet_offer_collection( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + selected_fleets = ["fleet-a", "fleet-b"] + run_spec = get_run_spec( + repo_id=repo.name, + profile=Profile(name="default", fleets=selected_fleets), + configuration=TaskConfiguration( + commands=[":"], + image="scratch", + user="root", + fleets=selected_fleets, + ), + ) + fleet_offer = get_instance_offer_with_availability(price=2.0) + with ( + patch( + "dstack._internal.server.services.runs.plan._get_non_fleet_offers", + new=AsyncMock( + side_effect=AssertionError("_get_non_fleet_offers should not be called") + ), + ) as get_non_fleet_offers_mock, + patch( + "dstack._internal.server.services.runs.plan._get_offers_in_run_candidate_fleets", + new=AsyncMock(return_value=([(Mock(), fleet_offer)], [])), + ) as get_offers_in_run_candidate_fleets_mock, + patch( + "dstack._internal.server.services.runs.plan.find_optimal_fleet_with_offers", + new=AsyncMock( + side_effect=AssertionError( + "find_optimal_fleet_with_offers should not be called" + ) + ), + ) as find_optimal_fleet_with_offers_mock, + ): + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200, response.json() + get_non_fleet_offers_mock.assert_not_called() + get_offers_in_run_candidate_fleets_mock.assert_awaited_once() + 
find_optimal_fleet_with_offers_mock.assert_not_called() + job_plan = response.json()["job_plans"][0] + assert job_plan["total_offers"] == 1 + assert job_plan["offers"][0]["price"] == 2.0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_regular_run_plan_uses_best_fleet_candidate_selection( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, + project=project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + commands=["echo ok"], + image="scratch", + user="root", + ), + ) + chosen_fleet_offer = get_instance_offer_with_availability(price=3.0) + with ( + patch( + "dstack._internal.server.services.runs.plan._select_candidate_fleet_models", + new=AsyncMock(return_value=[Mock()]), + ) as select_candidate_fleet_models_mock, + patch( + "dstack._internal.server.services.runs.plan.find_optimal_fleet_with_offers", + new=AsyncMock(return_value=(Mock(), [(Mock(), chosen_fleet_offer)], [])), + ) as find_optimal_fleet_with_offers_mock, + patch( + "dstack._internal.server.services.runs.plan._get_non_fleet_offers", + new=AsyncMock( + side_effect=AssertionError("_get_non_fleet_offers should not be called") + ), + ) as get_non_fleet_offers_mock, + patch( + "dstack._internal.server.services.runs.plan._get_offers_in_run_candidate_fleets", + new=AsyncMock( + side_effect=AssertionError( + "_get_offers_in_run_candidate_fleets should not be called" + ) + ), + ) as get_offers_in_run_candidate_fleets_mock, + ): + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + 
) + + assert response.status_code == 200, response.json() + select_candidate_fleet_models_mock.assert_awaited_once() + find_optimal_fleet_with_offers_mock.assert_awaited_once() + get_non_fleet_offers_mock.assert_not_called() + get_offers_in_run_candidate_fleets_mock.assert_not_called() + job_plan = response.json()["job_plans"][0] + assert job_plan["total_offers"] == 1 + assert job_plan["offers"][0]["price"] == 3.0 + + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offers = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-1", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ), + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-2", + resources=Resources(cpus=2, memory_mib=1024, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.NO_BALANCE, + ), + ] + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + 
offers=offers, + total_offers=1, + max_price=1.0, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = offers + m.return_value = [backend_mock] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=headers, + json=body, + ) + offers = response.json()["job_plans"][0]["offers"] + assert len(offers) == 2 + assert offers[0]["availability"] == InstanceAvailability.AVAILABLE.value + assert offers[1]["availability"] == expected_availability.value + + @pytest.mark.asyncio + @pytest.mark.parametrize( + ("old_conf", "new_conf", "action"), + [ + pytest.param( + ServiceConfiguration( + commands=["one", "two"], + port=80, + replicas=Range(min=1, max=1), + scaling=None, + ), + ServiceConfiguration( + commands=["one", "two"], + port=80, + replicas=Range(min=2, max=4), + scaling=ScalingSpec(metric="rps", target=5), + ), + "update", + id="update-service", + ), + pytest.param( + ServiceConfiguration( + commands=["one", "two"], + port=80, + gateway=None, + replicas=Range(min=1, max=1), + scaling=None, + ), + ServiceConfiguration( + commands=["one", "two"], + port=8080, + gateway="test-gateway", # not updatable + replicas=Range(min=2, max=4), + scaling=ScalingSpec(metric="rps", target=5), + ), + "create", + id="no-update-service", + ), + pytest.param( + DevEnvironmentConfiguration(ide="vscode", inactivity_duration=False), + DevEnvironmentConfiguration(ide="vscode", inactivity_duration="30m"), + "update", + id="update-dev-env", + ), + pytest.param( + TaskConfiguration(image="test-image-1"), + TaskConfiguration(image="test-image-2"), + "create", + id="no-update-task", + ), + pytest.param( + 
DevEnvironmentConfiguration(ide="vscode", image="test-image"), + TaskConfiguration(image="test-image"), + "create", + id="no-update-on-type-change", + ), + ], + ) + async def test_returns_update_or_create_action_on_conf_change( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + old_conf: AnyRunConfiguration, + new_conf: AnyRunConfiguration, + action: str, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name, configuration=old_conf) + run_model = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_spec.run_name, + run_spec=run_spec, + ) + run = run_model_to_run(run_model) + # Apply the same defaults the server applies to current_resource + set_resources_defaults(run.run_spec.configuration.resources) + set_gpu_vendor_default( + run.run_spec.configuration.resources, + image=run.run_spec.configuration.image, + docker=getattr(run.run_spec.configuration, "docker", None), + ) + run_spec.configuration = new_conf + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + assert response.status_code == 200 + response_json = response.json() + assert response_json["action"] == action + assert response_json["current_resource"] == json.loads(run.json()) + + @pytest.mark.asyncio + @pytest.mark.usefixtures("test_db") + async def test_generates_user_ssh_key(self, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, global_role=GlobalRole.USER, ssh_public_key=None, ssh_private_key=None + ) + project = await create_project(session=session, owner=user) 
+ await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name, ssh_key_pub=None) + + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200, response.json() + run_spec_ssh_public_key = response.json()["effective_run_spec"]["ssh_key_pub"] + assert run_spec_ssh_public_key is not None + await session.refresh(user) + assert user.ssh_public_key == run_spec_ssh_public_key + assert user.ssh_private_key is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "client_version,expected_probes", + [ + ("0.20.7", []), + ("0.20.8", None), + (None, None), + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_service_configuration_probes_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_probes: Optional[list], + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + + service_conf = ServiceConfiguration( + commands=["echo hello"], + port=80, + probes=None, # This should be patched to [] for clients prior to 0.20.8 + ) + run_spec = get_run_spec( + run_name="test-service", + configuration=service_conf, + repo_id=repo.name, + ) + await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_spec=run_spec, + run_name="test-service", + ) + + body = {"run_spec": run_spec.dict()} + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", 
+ headers=headers, + json=body, + ) + + assert response.status_code == 200 + run_plan = response.json() + assert run_plan["run_spec"]["configuration"]["probes"] == expected_probes + assert run_plan["effective_run_spec"]["configuration"]["probes"] == expected_probes + assert ( + run_plan["current_resource"]["run_spec"]["configuration"]["probes"] == expected_probes + ) + + +class TestApplyPlan: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_submits_new_run_if_no_current_resource( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + submitted_at_formatted = "2023-01-02T03:04:00+00:00" + last_processed_at_formatted = submitted_at_formatted + repo = await create_repo(session=session, project_id=project.id) + run_dict = get_dev_env_run_dict( + run_id=SomeUUID4Str(), + job_id=SomeUUID4Str(), + project_name=project.name, + username=user.name, + submitted_at=submitted_at_formatted, + last_processed_at=last_processed_at_formatted, + finished_at=None, + run_name="test-run", + repo_id=repo.name, + ) + with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: + 
datetime_mock.return_value = submitted_at + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "run_spec": run_dict["run_spec"], + "current_resource": None, + }, + "force": False, + }, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_dict + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + res = await session.execute(select(JobModel)) + job = res.scalar() + assert job is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_run(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + run_name="test-service", + configuration_path="old.dstack.yml", + repo_id=repo.name, + configuration=ServiceConfiguration( + type="service", + commands=["one", "two"], + port=80, + replicas=Range(min=1, max=1), + ), + ) + # set defaults to avoid phantom changes being detected + validate_run_spec_and_set_defaults(user, run_spec) + run_model = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name=run_spec.run_name, + run_spec=run_spec, + ) + run = run_model_to_run(run_model) + run_spec.configuration_path = "new.dstack.yml" + run_spec.configuration.replicas = Range(min=2, max=2) + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=get_auth_headers(user.token), + # Call json.loads to serialize UUID + json=json.loads( + ApplyRunPlanRequest( + plan=ApplyRunPlanInput( + run_spec=run_spec, + current_resource=run, + ), + force=False, + 
).json() + ), + ) + assert response.status_code == 200, response.json() + await session.refresh(run_model) + updated_run = run_model_to_run(run_model) + assert run.deployment_num == 0 + assert updated_run.deployment_num == 1 + assert run.run_spec.configuration_path == "old.dstack.yml" + assert updated_run.run_spec.configuration_path == "new.dstack.yml" + assert run.run_spec.configuration.replicas == Range(min=1, max=1) + assert updated_run.run_spec.configuration.replicas == Range(min=2, max=2) + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == ( + "Run updated. Deployment: 1. Changed fields: configuration_path, configuration.replicas" + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_pending_run_if_run_is_scheduled( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + run_name="test-run", + repo_id=repo.name, + ) + run_spec.configuration.schedule = Schedule(cron=["5 * * * *", "10 * * * *"]) + with freeze_time(datetime(2023, 1, 2, 3, 9, tzinfo=timezone.utc)): + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "run_spec": json.loads(run_spec.json()), + "current_resource": None, + }, + "force": False, + }, + ) + assert response.status_code == 200, response.json() + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + assert run.status == RunStatus.PENDING + assert run.next_triggered_at == datetime(2023, 1, 2, 3, 10, tzinfo=timezone.utc) + + @pytest.mark.asyncio + 
@pytest.mark.usefixtures("test_db") + async def test_generates_user_ssh_key(self, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, global_role=GlobalRole.USER, ssh_public_key=None, ssh_private_key=None + ) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name, ssh_key_pub=None) + + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "run_spec": run_spec.dict(), + "current_resource": None, + }, + "force": False, + }, + ) + + assert response.status_code == 200, response.json() + run_spec_ssh_public_key = response.json()["run_spec"]["ssh_key_pub"] + assert run_spec_ssh_public_key is not None + await session.refresh(user) + assert user.ssh_public_key == run_spec_ssh_public_key + assert user.ssh_private_key is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize( + "client_version,expected_probes", + [ + ("0.20.7", []), # Prior to 0.20.8, probes=None should be patched to [] + ("0.20.8", None), # 0.20.8 and later should keep probes=None + (None, None), # None client version should keep probes=None + ], + ) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_patches_service_configuration_probes_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_probes: Optional[list], + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + + service_conf = ServiceConfiguration( + commands=["echo hello"], + port=80, + probes=None, # This should be patched to [] for 
clients prior to 0.20.8 + ) + run_spec = get_run_spec( + run_name="test-service", + configuration=service_conf, + repo_id=repo.name, + ) + + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + response = await client.post( + f"/api/project/{project.name}/runs/apply", + headers=headers, + json={ + "plan": { + "run_spec": run_spec.dict(), + "current_resource": None, + }, + "force": False, + }, + ) + + assert response.status_code == 200 + assert response.json()["run_spec"]["configuration"]["probes"] == expected_probes + + +class TestSubmitRun: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("privileged", [None, False, True]) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_submits_run( + self, test_db, session: AsyncSession, client: AsyncClient, privileged: Optional[bool] + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + submitted_at_formatted = "2023-01-02T03:04:00+00:00" + last_processed_at_formatted = submitted_at_formatted + repo = await create_repo(session=session, project_id=project.id) + run_dict = get_dev_env_run_dict( + run_id=SomeUUID4Str(), + job_id=SomeUUID4Str(), + 
project_name=project.name, + username=user.name, + submitted_at=submitted_at_formatted, + last_processed_at=last_processed_at_formatted, + finished_at=None, + run_name="test-run", + repo_id=repo.name, + privileged=bool(privileged), + ) + run_spec = copy.deepcopy(run_dict["run_spec"]) + if privileged is None: + del run_spec["configuration"]["privileged"] + body = {"run_spec": run_spec} + with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: + datetime_mock.return_value = submitted_at + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_dict + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + res = await session.execute(select(JobModel)) + job = res.scalar() + assert job is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_submits_run_docker_true( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + submitted_at_formatted = "2023-01-02T03:04:00+00:00" + last_processed_at_formatted = submitted_at_formatted + repo = await create_repo(session=session, project_id=project.id) + run_dict = get_dev_env_run_dict( + run_id=SomeUUID4Str(), + job_id=SomeUUID4Str(), + project_name=project.name, + username=user.name, + submitted_at=submitted_at_formatted, + last_processed_at=last_processed_at_formatted, + finished_at=None, + run_name="test-run", + repo_id=repo.name, + docker=True, + privileged=True, # docker=True automatically enables privileged mode 
+ ) + body = {"run_spec": run_dict["run_spec"]} + with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: + datetime_mock.return_value = submitted_at + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_dict + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + res = await session.execute(select(JobModel)) + job = res.scalar() + assert job is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_submits_run_without_run_name( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_dict = get_dev_env_run_dict( + project_name=project.name, + username=user.name, + run_name=None, + repo_id=repo.name, + ) + body = {"run_spec": run_dict["run_spec"]} + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200 + assert response.json()["run_spec"]["run_name"] is not None + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + res = await session.execute(select(JobModel)) + job = res.scalar() + assert job is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "run_name", [ "run_with_underscores", "RunWithUppercase", @@ -651,7 +3437,7 @@ async def test_submits_run_without_run_name(self, test_db, session: 
AsyncSession ], ) async def test_returns_400_if_bad_run_name( - self, test_db, session: AsyncSession, run_name: str + self, test_db, session: AsyncSession, client: AsyncClient, run_name: str ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) @@ -666,12 +3452,9 @@ async def test_returns_400_if_bad_run_name( repo_id=repo.name, ) body = {"run_spec": run_dict["run_spec"]} - with patch("uuid.uuid4") as uuid_mock, patch( - "dstack._internal.server.services.backends.get_project_backends" - ) as get_project_backends_mock: - get_project_backends_mock.return_value = [Mock()] - uuid_mock.return_value = run_dict["id"] - response = client.post( + with patch("uuid.uuid4") as uuid_mock: + uuid_mock.return_value = UUID(run_dict["id"]) + response = await client.post( f"/api/project/{project.name}/runs/submit", headers=get_auth_headers(user.token), json=body, @@ -679,7 +3462,10 @@ async def test_returns_400_if_bad_run_name( assert response.status_code == 400 @pytest.mark.asyncio - async def test_returns_400_if_repo_does_not_exist(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_repo_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -691,7 +3477,7 @@ async def test_returns_400_if_repo_does_not_exist(self, test_db, session: AsyncS repo_id="repo1234", ) body = {"run_spec": run_dict["run_spec"]} - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/submit", headers=get_auth_headers(user.token), json=body, @@ -701,17 +3487,23 @@ async def test_returns_400_if_repo_does_not_exist(self, test_db, session: AsyncS class TestStopRuns: @pytest.mark.asyncio - async def 
test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/stop", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_terminates_submitted_run(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_marks_submitted_run_as_terminating( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -727,25 +3519,25 @@ async def test_terminates_submitted_run(self, test_db, session: AsyncSession): repo=repo, user=user, ) - job = await create_job( + await create_job( session=session, run=run, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/stop", headers=get_auth_headers(user.token), json={"runs_names": [run.run_name], "abort": False}, ) assert response.status_code == 200 await session.refresh(run) - assert run.status == RunStatus.TERMINATED + assert run.status == RunStatus.TERMINATING assert run.termination_reason == RunTerminationReason.STOPPED_BY_USER - await session.refresh(job) - assert job.status == JobStatus.TERMINATED - assert job.termination_reason == JobTerminationReason.TERMINATED_BY_USER @pytest.mark.asyncio - async def test_terminates_running_run(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_marks_running_run_as_terminating( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session) await add_project_member( @@ -762,29 +3554,34 @@ async def test_terminates_running_run(self, test_db, session: AsyncSession): user=user, status=RunStatus.RUNNING, ) - job = await create_job( + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + await create_job( session=session, run=run, job_provisioning_data=get_job_provisioning_data(), status=JobStatus.RUNNING, + instance=instance, + instance_assigned=True, + ) + response = await client.post( + f"/api/project/{project.name}/runs/stop", + headers=get_auth_headers(user.token), + json={"runs_names": [run.run_name], "abort": False}, ) - with patch("dstack._internal.server.services.jobs._stop_runner") as stop_runner: - response = client.post( - f"/api/project/{project.name}/runs/stop", - headers=get_auth_headers(user.token), - json={"runs_names": [run.run_name], "abort": False}, - ) - stop_runner.assert_called_once() assert response.status_code == 200 await session.refresh(run) assert run.status == RunStatus.TERMINATING assert run.termination_reason == RunTerminationReason.STOPPED_BY_USER - await session.refresh(job) - assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.TERMINATED_BY_USER @pytest.mark.asyncio - async def test_leaves_finished_runs_unchanged(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_leaves_finished_runs_unchanged( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -806,7 +3603,7 @@ async def 
test_leaves_finished_runs_unchanged(self, test_db, session: AsyncSessi run=run, status=JobStatus.FAILED, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/stop", headers=get_auth_headers(user.token), json={"runs_names": [run.run_name], "abort": False}, @@ -818,17 +3615,21 @@ async def test_leaves_finished_runs_unchanged(self, test_db, session: AsyncSessi class TestDeleteRuns: @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/delete", headers=get_auth_headers(user.token), ) assert response.status_code == 403 @pytest.mark.asyncio - async def test_deletes_runs(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_runs(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -852,7 +3653,7 @@ async def test_deletes_runs(self, test_db, session: AsyncSession): ) session.add(run) await session.commit() - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/delete", headers=get_auth_headers(user.token), json={"runs_names": [run.run_name]}, @@ -864,7 +3665,10 @@ async def test_deletes_runs(self, test_db, session: AsyncSession): assert job.status == JobStatus.FAILED @pytest.mark.asyncio - async def test_returns_400_if_runs_active(self, test_db, session: AsyncSession): + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_runs_active( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( @@ -884,7 +3688,7 @@ async def test_returns_400_if_runs_active(self, test_db, session: AsyncSession): session=session, run=run, ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/runs/delete", headers=get_auth_headers(user.token), json={"runs_names": [run.run_name]}, @@ -896,172 +3700,569 @@ async def test_returns_400_if_runs_active(self, test_db, session: AsyncSession): assert len(res.scalars().all()) == 1 -class TestCreateInstance: +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestSubmitService: + pytestmark = pytest.mark.usefixtures("mock_gateway_connection") + @pytest.mark.asyncio - async def test_returns_403_if_not_project_member(self, test_db, session: AsyncSession): + @pytest.mark.parametrize( + ( + "existing_gateways", + "specified_gateway_in_run_conf", + "expected_service_url", + "expected_model_url", + "is_gateway", + "model", + ), + [ + pytest.param( + [("default-gateway", True), ("non-default-gateway", False)], + None, + "https://test-service.default-gateway.example", + "https://test-service.default-gateway.example/v1", + True, + "test-model", + id="submits-to-default-gateway", + ), + pytest.param( + [("default-gateway", True), ("non-default-gateway", False)], + True, + "https://test-service.default-gateway.example", + "https://test-service.default-gateway.example/v1", + True, + "test-model", + id="submits-to-default-gateway-when-gateway-true", + ), + pytest.param( + [("default-gateway", True), ("non-default-gateway", False)], + 
"non-default-gateway", + "https://test-service.non-default-gateway.example", + "https://test-service.non-default-gateway.example/v1", + True, + "test-model", + id="submits-to-specified-gateway", + ), + pytest.param( + [("non-default-gateway", False)], + None, + "/proxy/services/test-project/test-service/", + "/proxy/services/test-project/test-service/v1", + False, + "test-model", + id="submits-in-server-when-no-default-gateway", + ), + pytest.param( + [("default-gateway", True)], + False, + "/proxy/services/test-project/test-service/", + "/proxy/services/test-project/test-service/v1", + False, + "test-model", + id="submits-in-server-when-specified", + ), + pytest.param( + [("default-gateway", True)], + None, + "https://test-service.default-gateway.example", + "https://gateway.default-gateway.example", + True, + { + "type": "chat", + "name": "test-model", + "format": "tgi", + "chat_template": "test", + "eos_token": "", + }, + id="submits-tgi-model-to-gateway", + ), + ], + ) + async def test_submit_to_correct_proxy( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + existing_gateways: List[Tuple[str, bool]], + specified_gateway_in_run_conf: Union[str, bool, None], + expected_service_url: str, + expected_model_url: str, + is_gateway: bool, + model: Union[str, dict], + ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - response = client.post( - f"/api/project/{project.name}/runs/create_instance", + project = await create_project(session=session, owner=user, name="test-project") + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, project_id=project.id) + for gateway_name, is_default 
in existing_gateways: + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name=gateway_name, + wildcard_domain=f"{gateway_name}.example", + ) + await create_gateway_compute( + session=session, + backend_id=backend.id, + gateway_id=gateway.id, + ) + if is_default: + project.default_gateway_id = gateway.id + await session.commit() + run_spec = get_service_run_spec( + repo_id=repo.name, + run_name="test-service", + gateway=specified_gateway_in_run_conf, + model=model, + ) + response = await client.post( + f"/api/project/{project.name}/runs/submit", headers=get_auth_headers(user.token), + json={"run_spec": run_spec}, ) - assert response.status_code == 403 + assert response.status_code == 200 + assert response.json()["service"]["url"] == expected_service_url + assert response.json()["service"]["model"]["base_url"] == expected_model_url + events = await list_events(session) + assert ("Service registered in gateway" in {e.message for e in events}) == is_gateway @pytest.mark.asyncio - async def test_creates_instance(self, test_db, session: AsyncSession): + async def test_return_error_if_specified_gateway_not_exists( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - request = CreateInstanceRequest( - profile=Profile(name="test_profile"), - requirements=Requirements(resources=ResourcesSpec(cpu=1)), + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_service_run_spec(repo_id=repo.name, gateway="nonexistent") + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec}, ) - instance_id = 
UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - with patch( - "dstack._internal.server.services.runs.get_offers_by_requirements" - ) as run_plan_by_req, patch("uuid.uuid4") as uuid_mock: - uuid_mock.return_value = instance_id - offer = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="eu", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) - backend = Mock() - backend.compute.return_value.get_offers.return_value = [offer] - backend.compute.return_value.create_instance.return_value = JobProvisioningData( - backend=offer.backend, - instance_type=offer.instance, - instance_id="test_instance", - hostname="1.1.1.1", - internal_ip=None, - region=offer.region, - price=offer.price, - username="ubuntu", - ssh_port=22, - ssh_proxy=None, - dockerized=True, - backend_data=None, - ) - backend.TYPE = BackendType.AWS - run_plan_by_req.return_value = [(backend, offer)] - response = client.post( - f"/api/project/{project.name}/runs/create_instance", - headers=get_auth_headers(user.token), - json=request.dict(), - ) - assert response.status_code == 200 - result = response.json() - expected = { - "id": str(instance_id), - "project_name": project.name, - "backend": None, - "instance_type": None, - "name": result["name"], - "job_name": None, - "job_status": None, - "hostname": None, - "status": "pending", - "unreachable": False, - "created": result["created"], - "pool_name": "default-pool", - "region": None, - "price": None, - } - assert result == expected + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": f"Gateway nonexistent does not exist in project {project.name}", + "code": "resource_not_exists", + } + ] + } @pytest.mark.asyncio - async def test_error_if_backends_do_not_support_create_instance( - self, test_db, session: AsyncSession - ): + async def 
test_return_error_if_specified_gateway_is_true_and_no_gateway_exists( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - request = CreateInstanceRequest( - profile=Profile(name="test_profile"), - requirements=Requirements(resources=ResourcesSpec(cpu=1)), + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_service_run_spec(repo_id=repo.name, gateway=True) + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec}, + ) + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": "The service requires a gateway, but there is no default gateway in the project", + "code": "resource_not_exists", + } + ] + } + + @pytest.mark.asyncio + async def test_submit_to_foreign_gateway_only_if_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: + exporter_user = await create_user( + session=session, global_role=GlobalRole.USER, name="exporter_user" ) - with patch( - "dstack._internal.server.services.runs.get_offers_by_requirements" - ) as run_plan_by_req: - offer = InstanceOfferWithAvailability( - backend=BackendType.AZURE, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="eu", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) - backend = Mock() - backend.TYPE = BackendType.AZURE - backend.compute.return_value.get_offers.return_value = [offer] - backend.compute.return_value.create_instance.side_effect = NotImplementedError() - run_plan_by_req.return_value = [(backend, offer)] - response = client.post( - f"/api/project/{project.name}/runs/create_instance", - 
headers=get_auth_headers(user.token), - json=request.dict(), - ) - assert response.status_code == 200 - await process_instances() + exporter_project = await create_project( + session=session, owner=exporter_user, name="exporter-project" + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="exported-gateway", + wildcard_domain="exported-gateway.example", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + + importer_user = await create_user( + session=session, global_role=GlobalRole.USER, name="importer_user" + ) + importer_project = await create_project( + session=session, owner=importer_user, name="importer-project" + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + importer_repo = await create_repo(session=session, project_id=importer_project.id) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + + not_importer_user = await create_user( + session=session, global_role=GlobalRole.USER, name="not_importer_user" + ) + not_importer_project = await create_project( + session=session, owner=not_importer_user, name="not-importer-project" + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.USER, + ) + not_importer_repo = await create_repo(session=session, project_id=not_importer_project.id) + + importer_run_spec = get_service_run_spec( + repo_id=importer_repo.name, + run_name="test-service", + gateway="exporter-project/exported-gateway", + ) + response = await client.post( + f"/api/project/{importer_project.name}/runs/submit", + 
headers=get_auth_headers(importer_user.token), + json={"run_spec": importer_run_spec}, + ) + assert response.status_code == 200 + assert response.json()["service"]["url"] == "https://test-service.exported-gateway.example" + + not_importer_run_spec = get_service_run_spec( + repo_id=not_importer_repo.name, + gateway="exporter-project/exported-gateway", + ) + response = await client.post( + f"/api/project/{not_importer_project.name}/runs/submit", + headers=get_auth_headers(not_importer_user.token), + json={"run_spec": not_importer_run_spec}, + ) + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": "Gateway exporter-project/exported-gateway does not exist in project not-importer-project", + "code": "resource_not_exists", + } + ] + } @pytest.mark.asyncio - async def test_backend_does_not_support_create_instance(self, test_db, session: AsyncSession): + async def test_not_submits_to_default_gateway_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) + gateway_project = await create_project(session=session, owner=user, name="gateway-project") + backend = await create_backend(session=session, project_id=gateway_project.id) + gateway = await create_gateway( + session=session, + project_id=gateway_project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + + service_project = await create_project(session=session, owner=user, name="service-project") + # The project's default_gateway_id may point to the gateway (e.g., if the gateway was + # imported previously), but that does not authorize the project to use this gateway if it + # is no longer imported. 
+ service_project.default_gateway_id = gateway.id + await session.commit() + await add_project_member( + session=session, + project=service_project, + user=user, + project_role=ProjectRole.USER, + ) + repo = await create_repo(session=session, project_id=service_project.id) + + run_spec = get_service_run_spec( + repo_id=repo.name, + gateway=True, + ) + response = await client.post( + f"/api/project/{service_project.name}/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec}, + ) + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": "The service requires a gateway, but there is no default gateway in the project", + "code": "resource_not_exists", + } + ] + } + + @pytest.mark.asyncio + async def test_interpolates_project_name_in_imported_gateway_domain( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: + exporter_user = await create_user( + session=session, global_role=GlobalRole.USER, name="exporter_user" + ) + exporter_project = await create_project( + session=session, owner=exporter_user, name="exporter-project" + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="exported-gateway", + wildcard_domain="${{ run.project_name }}.example.com", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + + importer_user = await create_user( + session=session, global_role=GlobalRole.USER, name="importer_user" + ) + importer_project = await create_project( + session=session, owner=importer_user, name="importer-project" + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + importer_repo = await create_repo(session=session, project_id=importer_project.id) + await create_export( + 
session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + + run_spec = get_service_run_spec( + repo_id=importer_repo.name, + run_name="test-service", + gateway="exporter-project/exported-gateway", + ) + response = await client.post( + f"/api/project/{importer_project.name}/runs/submit", + headers=get_auth_headers(importer_user.token), + json={"run_spec": run_spec}, + ) + assert response.status_code == 200 + assert ( + response.json()["service"]["url"] + == "https://test-service.importer-project.example.com" + ) + + @pytest.mark.asyncio + async def test_returns_error_if_imported_gateway_domain_has_unknown_variable( + self, test_db, session: AsyncSession, client: AsyncClient + ) -> None: + exporter_user = await create_user( + session=session, global_role=GlobalRole.USER, name="exporter_user" + ) + exporter_project = await create_project( + session=session, owner=exporter_user, name="exporter-project" + ) + backend = await create_backend(session=session, project_id=exporter_project.id) + gateway = await create_gateway( + session=session, + project_id=exporter_project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="exported-gateway", + wildcard_domain="${{ run.unknown_variable }}.example.com", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + + importer_user = await create_user( + session=session, global_role=GlobalRole.USER, name="importer_user" + ) + importer_project = await create_project( + session=session, owner=importer_user, name="importer-project" + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + importer_repo = await create_repo(session=session, project_id=importer_project.id) + await create_export( + session=session, + exporter_project=exporter_project, + 
importer_projects=[importer_project], + exported_fleets=[], + exported_gateways=[gateway], + ) + + run_spec = get_service_run_spec( + repo_id=importer_repo.name, + run_name="test-service", + gateway="exporter-project/exported-gateway", + ) + response = await client.post( + f"/api/project/{importer_project.name}/runs/submit", + headers=get_auth_headers(importer_user.token), + json={"run_spec": run_spec}, + ) + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": "Cannot interpolate gateway domain name: Failed to interpolate due to missing vars: ['run.unknown_variable']", + "code": "gateway_error", + } + ] + } + + @pytest.mark.asyncio + async def test_unregister_dangling_service( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + mock_gateway_connection: AsyncMock, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user, name="test-project") await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - request = CreateInstanceRequest( - profile=Profile(name="test_profile"), - requirements=Requirements(resources=ResourcesSpec(cpu=1)), + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + wildcard_domain="example.com", + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + project.default_gateway_id = gateway.id + await session.commit() + + client_mock = ( + mock_gateway_connection.return_value.client.return_value.__aenter__.return_value ) + client_mock.register_service.side_effect = [ + GatewayError("Service test-project/test-service is already registered"), + None, # Second call succeeds + ] - with patch( - 
"dstack._internal.server.services.runs.get_offers_by_requirements" - ) as run_plan_by_req: - offers = InstanceOfferWithAvailability( - backend=BackendType.VASTAI, - instance=InstanceType( - name="instance", - resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), - ), - region="eu", - price=1.0, - availability=InstanceAvailability.AVAILABLE, - ) + response = await client.post( + "/api/project/test-project/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": get_service_run_spec(repo_id=repo.name, run_name="test-service")}, + ) - backend = Mock() - backend.TYPE = BackendType.VASTAI - backend.compute.return_value.get_offers.return_value = [offers] - backend.compute.return_value.create_instance.side_effect = NotImplementedError() - run_plan_by_req.return_value = [(backend, offers)] + assert response.status_code == 200 + assert response.json()["service"]["url"] == "https://test-service.example.com" + # Verify that unregister_service was called to clean up the dangling service + client_mock.unregister_service.assert_called_once_with( + project=project.name, + run_name="test-service", + ) + # Verify that register_service was called twice (first failed, then succeeded) + assert client_mock.register_service.call_count == 2 - response = client.post( - f"/api/project/{project.name}/runs/create_instance", - headers=get_auth_headers(user.token), - json=request.dict(), - ) + @pytest.mark.asyncio + async def test_return_error_if_default_gateway_forbids_new_services( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user, name="test-project") + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, 
project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + wildcard_domain="example.com", + forbid_new_services=True, + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + project.default_gateway_id = gateway.id + await session.commit() - assert response.status_code == 400 + response = await client.post( + "/api/project/test-project/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": get_service_run_spec(repo_id=repo.name, run_name="test-service")}, + ) - result = response.json() - expected = { - "detail": [ - { - "msg": "Backends do not support create_instance. Try to select other backends.", - "code": "error", - } - ] - } - assert result == expected + assert response.status_code == 400 + assert response.json() == { + "detail": [{"msg": "Gateway does not accept new services", "code": "error"}] + } + + @pytest.mark.asyncio + async def test_return_error_if_explicitly_specified_gateway_forbids_new_services( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user, name="test-project") + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.RUNNING, + name="restricted-gateway", + wildcard_domain="example.com", + forbid_new_services=True, + ) + await create_gateway_compute(session=session, backend_id=backend.id, gateway_id=gateway.id) + + response = await client.post( + "/api/project/test-project/runs/submit", + 
headers=get_auth_headers(user.token), + json={ + "run_spec": get_service_run_spec( + repo_id=repo.name, + run_name="test-service", + gateway="restricted-gateway", + ) + }, + ) + + assert response.status_code == 400 + assert response.json() == { + "detail": [{"msg": "Gateway does not accept new services", "code": "error"}] + } diff --git a/src/tests/_internal/server/routers/test_secrets.py b/src/tests/_internal/server/routers/test_secrets.py new file mode 100644 index 0000000000..2c617a5103 --- /dev/null +++ b/src/tests/_internal/server/routers/test_secrets.py @@ -0,0 +1,455 @@ +import pytest +from httpx import AsyncClient +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import SecretModel +from dstack._internal.server.services.permissions import DefaultPermissions +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_project, + create_secret, + create_user, + default_permissions_context, + get_auth_headers, + list_events, +) + + +class TestListSecrets: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_authorized( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/list", + headers=get_auth_headers(user.token), + json={}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_for_manager_by_default( + self, test_db, 
session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/list", + headers=get_auth_headers(manager.token), + json={}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_manager_can_list_when_allowed( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + with default_permissions_context(DefaultPermissions(allow_managers_manage_secrets=True)): + response = await client.post( + f"/api/project/{project.name}/secrets/list", + headers=get_auth_headers(manager.token), + json={}, + ) + assert response.status_code == 200 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_secrets(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + secret1 = await create_secret( + session=session, project=project, name="secret1", value="123456" + ) + secret2 = await create_secret( + session=session, project=project, 
name="secret2", value="123456" + ) + response = await client.post( + f"/api/project/{project.name}/secrets/list", + headers=get_auth_headers(user.token), + json={}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(secret2.id), + "name": "secret2", + "value": None, + }, + { + "id": str(secret1.id), + "name": "secret1", + "value": None, + }, + ] + + +class TestGetSecret: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_authorized( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/get", + headers=get_auth_headers(user.token), + json={"name": "my_secret"}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_for_manager_by_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/get", + headers=get_auth_headers(manager.token), + json={"name": "my_secret"}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_manager_can_get_when_allowed( + self, test_db, session: AsyncSession, 
client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + await create_secret(session=session, project=project, name="secret1", value="123456") + with default_permissions_context(DefaultPermissions(allow_managers_manage_secrets=True)): + response = await client.post( + f"/api/project/{project.name}/secrets/get", + headers=get_auth_headers(manager.token), + json={"name": "secret1"}, + ) + assert response.status_code == 200 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_secret_with_value( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + secret = await create_secret( + session=session, project=project, name="secret1", value="123456" + ) + response = await client.post( + f"/api/project/{project.name}/secrets/get", + headers=get_auth_headers(user.token), + json={"name": "secret1"}, + ) + assert response.status_code == 200 + assert response.json() == { + "id": str(secret.id), + "name": "secret1", + "value": "123456", + } + + +class TestCreateOrUpdateSecret: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_authorized( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + 
session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": "my_secret"}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_for_manager_by_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(manager.token), + json={"name": "my_secret", "value": "123456"}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_manager_can_create_when_allowed( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + with default_permissions_context(DefaultPermissions(allow_managers_manage_secrets=True)): + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(manager.token), + json={"name": "secret1", "value": "123456"}, + ) + assert response.status_code == 200 + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_creates_secret(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": "secret1", "value": "123456"}, + ) + assert response.status_code == 200 + res = await session.execute(select(SecretModel)) + secret_model = res.scalar() + assert secret_model is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret created" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_updates_secret(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + secret = await create_secret( + session=session, project=project, name="secret1", value="old_value" + ) + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": "secret1", "value": "new_value"}, + ) + assert response.status_code == 200 + await session.refresh(secret) + assert secret.value.get_plaintext_or_error() == "new_value" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret updated" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_no_event_if_value_unchanged( + self, test_db, 
session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_secret(session=session, project=project, name="secret1", value="value") + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": "secret1", "value": "value"}, + ) + assert response.status_code == 200 + events = await list_events(session) + assert len(events) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "name, value", + [ + ("too_long_secret_value", "a" * 5001), + ("", "empty_name"), + ("@7&.", "wierd_name_chars"), + ], + ) + async def test_rejects_bad_names_values( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + name: str, + value, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": name, "value": value}, + ) + assert response.status_code == 400 + res = await session.execute(select(SecretModel)) + secret_model = res.scalar() + assert secret_model is None + + +class TestDeleteSecrets: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_authorized( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) 
+ await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/delete", + headers=get_auth_headers(user.token), + json={"secrets_names": ["my_secret"]}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_for_manager_by_default( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + response = await client.post( + f"/api/project/{project.name}/secrets/delete", + headers=get_auth_headers(manager.token), + json={"secrets_names": ["my_secret"]}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_manager_can_delete_when_allowed( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + manager = await create_user(session=session, name="manager", global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=manager, project_role=ProjectRole.MANAGER + ) + await create_secret(session=session, project=project, name="secret1", value="123456") + with default_permissions_context(DefaultPermissions(allow_managers_manage_secrets=True)): + response = await client.post( + f"/api/project/{project.name}/secrets/delete", + headers=get_auth_headers(manager.token), + json={"secrets_names": ["secret1"]}, + ) + assert 
response.status_code == 200 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_secrets(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + # Create two secrets + await create_secret(session=session, project=project, name="secret1", value="123456") + await create_secret(session=session, project=project, name="secret2", value="abcdef") + + # Verify both secrets exist + res = await session.execute( + select(SecretModel).where(SecretModel.project_id == project.id) + ) + secrets = res.scalars().all() + assert len(secrets) == 2 + + # Delete one secret + response = await client.post( + f"/api/project/{project.name}/secrets/delete", + headers=get_auth_headers(user.token), + json={"secrets_names": ["secret1"]}, + ) + assert response.status_code == 200 + + # Verify only one secret remains + res = await session.execute( + select(SecretModel).where(SecretModel.project_id == project.id) + ) + secrets = res.scalars().all() + assert len(secrets) == 1 + assert secrets[0].name == "secret2" + + # Verify event was emitted + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret deleted" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_delete_nonexistent_secret_raises_error( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + response = await client.post( + 
f"/api/project/{project.name}/secrets/delete", + headers=get_auth_headers(user.token), + json={"secrets_names": ["nonexistent_secret"]}, + ) + assert response.status_code == 400 # ResourceNotExistsError should return 404 diff --git a/src/tests/_internal/server/routers/test_server.py b/src/tests/_internal/server/routers/test_server.py new file mode 100644 index 0000000000..2178b44b84 --- /dev/null +++ b/src/tests/_internal/server/routers/test_server.py @@ -0,0 +1,15 @@ +from unittest.mock import patch + +import pytest +from httpx import AsyncClient + +from dstack._internal import settings + + +class TestGetInfo: + @pytest.mark.asyncio + async def test_returns_server_info(self, test_db, client: AsyncClient): + with patch.object(settings, "DSTACK_VERSION", "0.18.10"): + response = await client.post("/api/server/get_info") + assert response.status_code == 200 + assert response.json() == {"server_version": "0.18.10"} diff --git a/src/tests/_internal/server/routers/test_sshproxy.py b/src/tests/_internal/server/routers/test_sshproxy.py new file mode 100644 index 0000000000..2b761b43f5 --- /dev/null +++ b/src/tests/_internal/server/routers/test_sshproxy.py @@ -0,0 +1,198 @@ +import os +from typing import Optional + +import pytest +from httpx import AsyncClient +from pytest_unordered import unordered +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ServerClientErrorCode +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import DevEnvironmentConfiguration +from dstack._internal.core.models.runs import ( + JobStatus, +) +from dstack._internal.server.testing.common import ( + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + create_user_public_key, + get_auth_headers, + get_job_provisioning_data, + get_job_runtime_data, + get_run_spec, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], 
indirect=True) +@pytest.mark.usefixtures("image_config_mock", "test_db") +class TestGetUpstream: + @pytest.fixture + def token(self) -> str: + token_var = "DSTACK_SSHPROXY_API_TOKEN" + token = os.getenv(token_var) + assert token is not None, f"{token_var} must be set via pytest-env" + return token + + async def test_returns_40x_if_no_api_token_provided(self, client: AsyncClient): + response = await client.post("/api/sshproxy/get_upstream") + + assert response.status_code in [401, 403] + + async def test_returns_40x_if_api_token_is_not_valid(self, client: AsyncClient): + response = await client.post( + "/api/sshproxy/get_upstream", headers=get_auth_headers("invalid-token") + ) + + assert response.status_code in [401, 403] + + async def test_returns_resource_not_exists_if_upstream_id_is_not_uuid( + self, client: AsyncClient, token: str + ): + response = await client.post( + "/api/sshproxy/get_upstream", + headers=get_auth_headers(token), + json={"id": "some-string"}, + ) + + assert response.json()["detail"][0]["code"] == ServerClientErrorCode.RESOURCE_NOT_EXISTS + + async def test_returns_resource_not_exists_if_job_is_not_running( + self, + session: AsyncSession, + client: AsyncClient, + token: str, + ): + project = await create_project(session=session) + instance = await create_instance(session=session, project=project) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, user=user, repo=repo) + job = await create_job( + session=session, + run=run, + instance=instance, + status=JobStatus.TERMINATING, + ) + + response = await client.post( + "/api/sshproxy/get_upstream", + headers=get_auth_headers(token), + json={"id": str(job.id)}, + ) + + assert response.json()["detail"][0]["code"] == ServerClientErrorCode.RESOURCE_NOT_EXISTS + + async def test_response( + self, + session: AsyncSession, + client: AsyncClient, + token: str, + ): + project = await 
create_project(session=session, ssh_private_key="project-key") + instance = await create_instance( + session=session, project=project, backend=BackendType.RUNPOD + ) + user = await create_user(session=session, ssh_public_key="user-key") + await create_user_public_key( + session=session, user=user, fingerprint="SHA256:fp1", key="user-uploaded-key-1" + ) + await create_user_public_key( + session=session, user=user, fingerprint="SHA256:fp2", key="user-uploaded-key-2" + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(repo_id=repo.name, ssh_key_pub="run-spec-key") + run = await create_run( + session=session, project=project, user=user, repo=repo, run_spec=run_spec + ) + jpd = get_job_provisioning_data( + dockerized=False, + backend=BackendType.RUNPOD, + hostname="100.100.100.100", + username="root", + ssh_port=32768, + ssh_proxy=None, + ) + jrd = get_job_runtime_data(username="test-user") + job = await create_job( + session=session, + run=run, + instance=instance, + job_provisioning_data=jpd, + job_runtime_data=jrd, + status=JobStatus.RUNNING, + ) + + response = await client.post( + "/api/sshproxy/get_upstream", + headers=get_auth_headers(token), + json={"id": str(job.id)}, + ) + + assert response.json() == { + "hosts": [ + { + "host": "100.100.100.100", + "port": 32768, + "private_key": "project-key", + "user": "test-user", + }, + ], + "authorized_keys": unordered( + [ + "user-key", + "user-uploaded-key-1", + "user-uploaded-key-2", + "run-spec-key", + ] + ), + } + + @pytest.mark.parametrize( + ["jrd_user", "conf_user", "expected_user"], + [ + pytest.param("jrd", "conf", "jrd", id="from-runner"), + pytest.param(None, "conf", "conf", id="from-configuration"), + pytest.param(None, None, "root", id="default"), + ], + ) + async def test_username_fallbacks( + self, + session: AsyncSession, + client: AsyncClient, + token: str, + jrd_user: Optional[str], + conf_user: Optional[str], + expected_user: str, + ): + project = await 
create_project(session=session, ssh_private_key="project-key") + instance = await create_instance(session=session, project=project, backend=BackendType.AWS) + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + configuration = DevEnvironmentConfiguration(ide="vscode", user=conf_user) + run_spec = get_run_spec(repo_id=repo.name, configuration=configuration) + run = await create_run( + session=session, project=project, user=user, repo=repo, run_spec=run_spec + ) + jpd = get_job_provisioning_data(dockerized=True, backend=BackendType.AWS, username="root") + jrd = get_job_runtime_data(username=jrd_user) + job = await create_job( + session=session, + run=run, + instance=instance, + job_provisioning_data=jpd, + job_runtime_data=jrd, + status=JobStatus.RUNNING, + ) + + response = await client.post( + "/api/sshproxy/get_upstream", + headers=get_auth_headers(token), + json={"id": str(job.id)}, + ) + + assert response.json()["hosts"][-1]["user"] == expected_user diff --git a/src/tests/_internal/server/routers/test_templates.py b/src/tests/_internal/server/routers/test_templates.py new file mode 100644 index 0000000000..1e0d75f59b --- /dev/null +++ b/src/tests/_internal/server/routers/test_templates.py @@ -0,0 +1,177 @@ +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml +from git import GitCommandError +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.services import templates as templates_service +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_project, + create_user, + get_auth_headers, +) + + +@pytest.fixture(autouse=True) +def _reset_cache(): + """Reset the templates cache before each test.""" + templates_service._templates_cache.clear() + yield + 
templates_service._templates_cache.clear() + + +class TestListTemplates: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/test_project/templates/list") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + async def test_returns_empty_list_when_no_repo( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + with patch.object(templates_service.settings, "SERVER_TEMPLATES_REPO", None): + response = await client.post( + f"/api/project/{project.name}/templates/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert response.json() == [] + + @pytest.mark.asyncio + async def test_uses_project_templates_repo_when_set( + self, test_db, session: AsyncSession, client: AsyncClient, tmp_path: Path + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project( + session=session, owner=user, name="project-with-templates", is_public=False + ) + project.templates_repo = "https://project.example/repo.git" + await session.commit() + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + templates_dir = tmp_path / ".dstack" / "templates" + templates_dir.mkdir(parents=True) + with open(templates_dir / "desktop-ide.yml", "w") as f: + yaml.dump( + { + "type": "template", + "name": "desktop-ide", + "title": "Desktop IDE", + "parameters": [{"type": "name"}], + "configuration": {"type": "dev-environment"}, + }, + f, + ) + + with patch.object(templates_service, "_fetch_templates_repo", return_value=tmp_path): + response = await client.post( + 
f"/api/project/{project.name}/templates/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["name"] == "desktop-ide" + + @pytest.mark.asyncio + async def test_returns_templates( + self, test_db, session: AsyncSession, client: AsyncClient, tmp_path: Path + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + templates_dir = tmp_path / ".dstack" / "templates" + templates_dir.mkdir(parents=True) + for filename, data in [ + ( + "desktop-ide.yml", + { + "type": "template", + "name": "desktop-ide", + "title": "Desktop IDE", + "description": "Access the instance from your desktop IDE.", + "parameters": [{"type": "name"}, {"type": "ide"}], + "configuration": {"type": "dev-environment"}, + }, + ), + ( + "web-based-ide.yml", + { + "type": "template", + "name": "in-browser-ide", + "title": "In-browser IDE", + "parameters": [ + {"type": "name"}, + { + "type": "env", + "title": "Password", + "name": "PASSWORD", + "value": "$random-password", + }, + ], + "configuration": {"type": "service", "port": 8080}, + }, + ), + ]: + with open(templates_dir / filename, "w") as f: + yaml.dump(data, f) + + with ( + patch.object( + templates_service.settings, "SERVER_TEMPLATES_REPO", "https://example.com" + ), + patch.object(templates_service, "_fetch_templates_repo", return_value=tmp_path), + ): + response = await client.post( + f"/api/project/{project.name}/templates/list", + headers=get_auth_headers(user.token), + ) + + assert response.status_code == 200 + data = response.json() + assert len(data) == 2 + assert data[0]["name"] == "desktop-ide" + assert data[0]["description"] == "Access the instance from your desktop IDE." 
+ assert data[0]["configuration"]["type"] == "dev-environment" + assert data[1]["name"] == "in-browser-ide" + assert data[1]["parameters"][1]["type"] == "env" + assert data[1]["parameters"][1]["name"] == "PASSWORD" + assert data[1]["configuration"]["port"] == 8080 + + @pytest.mark.asyncio + async def test_returns_empty_when_repo_fetch_fails( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + project.templates_repo = "https://github.com/dstackai/dstack-sky" + await session.commit() + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + + with patch.object( + templates_service, + "_fetch_templates_repo", + side_effect=GitCommandError(["git", "clone"], 128, stderr="not found"), + ): + response = await client.post( + f"/api/project/{project.name}/templates/list", + headers=get_auth_headers(user.token), + ) + + assert response.status_code == 200 + assert response.json() == [] diff --git a/src/tests/_internal/server/routers/test_users.py b/src/tests/_internal/server/routers/test_users.py index 8245aecf57..24af6af217 100644 --- a/src/tests/_internal/server/routers/test_users.py +++ b/src/tests/_internal/server/routers/test_users.py @@ -1,67 +1,370 @@ +from datetime import datetime, timezone from unittest.mock import patch from uuid import UUID import pytest -from fastapi.testclient import TestClient +from freezegun import freeze_time +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal.core.models.projects import ProjectRole +from dstack._internal.core.models.runs import JobStatus, RunStatus from dstack._internal.core.models.users import GlobalRole -from dstack._internal.server.main import app -from dstack._internal.server.models import UserModel -from 
dstack._internal.server.testing.common import create_user, get_auth_headers - -client = TestClient(app) +from dstack._internal.server.models import MemberModel, UserModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_job, + create_probe, + create_project, + create_repo, + create_run, + create_user, + get_auth_headers, +) class TestListUsers: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/users/list") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/list") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_returns_users(self, test_db, session): - user = await create_user(session=session) - response = client.post("/api/users/list", headers=get_auth_headers(user.token)) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_admins_see_all_non_deleted_users( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + other_user = await create_user( + session=session, + name="other_user", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="deleted_user", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + deleted=True, + ) + response = await client.post("/api/users/list", headers=get_auth_headers(admin.token)) assert response.status_code in [200] assert response.json() == [ { - "id": str(user.id), - "username": user.name, - "global_role": user.global_role, + "id": str(admin.id), + "username": admin.name, + "created_at": "2023-01-02T03:05:00+00:00", + "global_role": admin.global_role, 
"email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + { + "id": str(other_user.id), + "username": other_user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": other_user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_total_count(self, test_db, session: AsyncSession, client: AsyncClient): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + await create_user( + session=session, + name="user_one", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="deleted_user", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + deleted=True, + ) + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"limit": 1, "return_total_count": True}, + ) + assert response.status_code == 200 + assert response.json() == { + "total_count": 2, + "users": [ + { + "id": str(admin.id), + "username": admin.name, + "created_at": "2023-01-02T03:06:00+00:00", + "global_role": admin.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ], + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_paginates_results(self, test_db, session: AsyncSession, client: AsyncClient): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + user_one = await create_user( + 
session=session, + name="user_one", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="user_two", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"limit": 1}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(admin.id), + "username": admin.name, + "created_at": "2023-01-02T03:06:00+00:00", + "global_role": admin.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={ + "prev_created_at": "2023-01-02T03:06:00+00:00", + "prev_id": str(admin.id), + "limit": 1, + }, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(user_one.id), + "username": user_one.name, + "created_at": "2023-01-02T03:05:00+00:00", + "global_role": user_one.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_filters_by_name_pattern( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + matching_user = await create_user( + session=session, + name="alpha_user", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="bravo", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + response = await 
client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"name_pattern": "alpha"}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(matching_user.id), + "username": matching_user.name, + "created_at": "2023-01-02T03:05:00+00:00", + "global_role": matching_user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_admins_see_only_themselves( + self, test_db, session: AsyncSession, client: AsyncClient + ): + await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + other_user = await create_user( + session=session, + name="other_user", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + response = await client.post("/api/users/list", headers=get_auth_headers(other_user.token)) + assert response.status_code in [200] + assert response.json() == [ + { + "id": str(other_user.id), + "username": other_user.name, + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": other_user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, } ] class TestGetMyUser: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/users/get_my_user") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/get_my_user") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_returns_logged_in_user(self, test_db, session): - user = await create_user(session=session) - response = client.post("/api/users/get_my_user", headers=get_auth_headers(user.token)) + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_40x_if_deactivated( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, active=False) + response = await client.post( + "/api/users/get_my_user", headers=get_auth_headers(user.token) + ) + assert response.status_code in [401, 403] + user.active = True + await session.commit() + response = await client.post( + "/api/users/get_my_user", headers=get_auth_headers(user.token) + ) + assert response.status_code == 200 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_logged_in_user( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ssh_public_key="public-key", + ssh_private_key="private-key", + ) + response = await client.post( + "/api/users/get_my_user", headers=get_auth_headers(user.token) + ) assert response.status_code == 200 assert response.json() == { "id": str(user.id), "username": user.name, + "created_at": "2023-01-02T03:04:00+00:00", "global_role": user.global_role, "email": None, + "creds": {"token": user.token.get_plaintext_or_error()}, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_private_key": "private-key", + "ssh_public_key": "public-key", } + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_generates_ssh_key_if_missing( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user( + session=session, + ssh_public_key=None, + ssh_private_key=None, + ) + with patch("dstack._internal.utils.crypto.generate_rsa_key_pair_bytes") as gen_mock: + gen_mock.return_value = (b"private-key", b"ssh-rsa AAA.public-key user\n") + response = await client.post( + "/api/users/get_my_user", 
headers=get_auth_headers(user.token) + ) + assert response.status_code == 200 + data = response.json() + assert data["ssh_private_key"] == "private-key" + assert data["ssh_public_key"] == "ssh-rsa AAA.public-key user\n" + await session.refresh(user) + assert user.ssh_private_key == data["ssh_private_key"] + assert user.ssh_public_key == data["ssh_public_key"] + class TestGetUser: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/users/get_user") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/get_user") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_returns_400_if_not_global_admin(self, test_db, session): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_not_global_admin( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.USER) other_user = await create_user(session=session, name="other_user", token="1234") - response = client.post( + response = await client.post( "/api/users/get_user", headers=get_auth_headers(user.token), json={"username": other_user.name}, @@ -69,10 +372,18 @@ async def test_returns_400_if_not_global_admin(self, test_db, session): assert response.status_code == 400 @pytest.mark.asyncio - async def test_returns_logged_in_user(self, test_db, session): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_logged_in_user( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session=session, global_role=GlobalRole.ADMIN) - other_user = await create_user(session=session, name="other_user", token="1234") - response = client.post( + other_user = await create_user( + session=session, + name="other_user", + token="1234", + created_at=datetime(2023, 1, 2, 3, 4, 
tzinfo=timezone.utc), + ) + response = await client.post( "/api/users/get_user", headers=get_auth_headers(user.token), json={"username": other_user.name}, @@ -81,47 +392,73 @@ async def test_returns_logged_in_user(self, test_db, session): assert response.json() == { "id": str(other_user.id), "username": other_user.name, + "created_at": "2023-01-02T03:04:00+00:00", "global_role": other_user.global_role, "email": None, "creds": {"token": "1234"}, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_private_key": None, + "ssh_public_key": None, } class TestCreateUser: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/users/create") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/create") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_creates_user(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_creates_user(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(name="admin", session=session) with patch("uuid.uuid4") as uuid_mock: uuid_mock.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( + response = await client.post( "/api/users/create", headers=get_auth_headers(user.token), json={ "username": "test", "global_role": GlobalRole.USER, "email": "test@example.com", + "active": True, }, ) assert response.status_code == 200 - assert response.json() == { + user_data = response.json() + ssh_public_key = user_data["ssh_public_key"] + assert user_data == { "id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", "username": "test", + "created_at": "2023-01-02T03:04:00+00:00", "global_role": "user", "email": "test@example.com", + "active": True, + "permissions": { + "can_create_projects": 
True, + }, + "ssh_public_key": ssh_public_key, } res = await session.execute(select(UserModel).where(UserModel.name == "test")) assert len(res.scalars().all()) == 1 @pytest.mark.asyncio - async def test_return_400_if_username_taken(self, test_db, session: AsyncSession): - user = await create_user(name="admin", session=session) + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_return_400_if_username_taken( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user( + name="admin", + session=session, + ) with patch("uuid.uuid4") as uuid_mock: uuid_mock.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( + response = await client.post( "/api/users/create", headers=get_auth_headers(user.token), json={ @@ -130,17 +467,25 @@ async def test_return_400_if_username_taken(self, test_db, session: AsyncSession }, ) assert response.status_code == 200 - assert response.json() == { + user_data = response.json() + ssh_public_key = user_data["ssh_public_key"] + assert user_data == { "id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", "username": "Test", + "created_at": "2023-01-02T03:04:00+00:00", "global_role": "user", "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": ssh_public_key, } # Username uniqueness check should be case insensitive for username in ["test", "Test", "TesT"]: with patch("uuid.uuid4") as uuid_mock: uuid_mock.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( + response = await client.post( "/api/users/create", headers=get_auth_headers(user.token), json={ @@ -154,21 +499,188 @@ async def test_return_400_if_username_taken(self, test_db, session: AsyncSession ) assert len(res.scalars().all()) == 1 + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + 
@freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) + async def test_returns_400_if_username_invalid( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ): + user = await create_user( + name="admin", + session=session, + ) + response = await client.post( + "/api/users/create", + headers=get_auth_headers(user.token), + json={ + "username": "Invalid#$username", + "global_role": GlobalRole.USER, + }, + ) + assert response.status_code == 400 + class TestDeleteUsers: - def test_returns_40x_if_not_authenticated(self): - response = client.post("/api/users/delete") + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/delete") assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_deletes_users(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("username", ["test", "a" * 50]) + async def test_deletes_users( + self, test_db, session: AsyncSession, client: AsyncClient, username: str + ): admin = await create_user(name="admin", session=session) - user = await create_user(name="test", session=session) - response = client.post( + user = await create_user(name=username, session=session) + response = await client.post( "/api/users/delete", headers=get_auth_headers(admin.token), json={"users": [user.name]}, ) assert response.status_code == 200 + + # Validate the user is deleted res = await session.execute(select(UserModel).where(UserModel.name == user.name)) assert len(res.scalars().all()) == 0 + + # Validate an event is emitted + response = await client.post( + "/api/events/list", headers=get_auth_headers(admin.token), json={} + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["message"] == "User deleted" + assert len(response.json()[0]["targets"]) == 1 + assert 
response.json()[0]["targets"][0]["id"] == str(user.id) + assert response.json()[0]["targets"][0]["name"] == user.name + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_users_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user(name="admin", session=session) + user1 = await create_user(name="test1", session=session) + user2 = await create_user(name="test2", session=session) + response = await client.post( + "/api/users/delete", + headers=get_auth_headers(admin.token), + json={"users": [user1.name, "non_existing_user"]}, + ) + assert response.status_code == 400 + response = await client.post( + "/api/users/delete", + headers=get_auth_headers(admin.token), + json={"users": [user1.name, user2.name]}, + ) + assert response.status_code == 200 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("image_config_mock") + async def test_deletes_user_with_resources( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user(name="admin", session=session) + user = await create_user(name="temp", session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + job = await create_job(session=session, run=run, status=JobStatus.RUNNING) + await create_probe(session=session, job=job) + response = await client.post( + "/api/users/delete", + headers=get_auth_headers(admin.token), + json={"users": [user.name]}, + ) + assert response.status_code == 200 + res = await session.execute(select(UserModel).where(UserModel.name == user.name)) + assert res.scalar() is None + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("image_config_mock") + async def test_deleting_users_deletes_members( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user(name="admin", session=session) + user = await create_user(name="temp", session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + response = await client.post( + "/api/users/delete", + headers=get_auth_headers(admin.token), + json={"users": [user.name]}, + ) + assert response.status_code == 200 + res = await session.execute(select(UserModel).where(UserModel.name == user.name)) + assert res.scalar() is None + res = await session.execute(select(MemberModel).where(MemberModel.user_id == user.id)) + assert res.scalar() is None + + +class TestRefreshToken: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/users/refresh_token") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_refreshes_token(self, test_db, session: AsyncSession, client: AsyncClient): + user1 = await create_user(name="user1", session=session) + old_token = user1.token + response = await client.post( + "/api/users/refresh_token", + headers=get_auth_headers(user1.token), + json={"username": user1.name}, + ) + assert response.status_code == 200 + assert response.json()["creds"]["token"] != old_token + await session.refresh(user1) + assert user1.token != old_token + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_non_admin_refreshes_for_other_user( + self, test_db, session: AsyncSession, client: AsyncClient + ): 
+ user1 = await create_user(name="user1", session=session, global_role=GlobalRole.USER) + user2 = await create_user(name="user2", session=session) + response = await client.post( + "/api/users/refresh_token", + headers=get_auth_headers(user1.token), + json={"username": user2.name}, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_global_admin_refreshes_token( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user1 = await create_user(name="user1", session=session, global_role=GlobalRole.ADMIN) + user2 = await create_user(name="user2", session=session) + old_token = user2.token + response = await client.post( + "/api/users/refresh_token", + headers=get_auth_headers(user1.token), + json={"username": user2.name}, + ) + assert response.status_code == 200 + assert response.json()["creds"]["token"] != old_token + await session.refresh(user2) + assert user2.token != old_token diff --git a/src/tests/_internal/server/routers/test_volumes.py b/src/tests/_internal/server/routers/test_volumes.py index 0699b197cc..a2ea344935 100644 --- a/src/tests/_internal/server/routers/test_volumes.py +++ b/src/tests/_internal/server/routers/test_volumes.py @@ -1,41 +1,202 @@ import json from datetime import datetime, timezone -from unittest.mock import Mock, patch +from unittest.mock import patch from uuid import UUID import pytest -from fastapi.testclient import TestClient from freezegun import freeze_time +from httpx import AsyncClient from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.main import app -from dstack._internal.server.models import VolumeModel +from dstack._internal.server.models import VolumeAttachmentModel, VolumeModel from 
dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( create_instance, - create_pool, create_project, create_user, create_volume, get_auth_headers, get_volume_configuration, get_volume_provisioning_data, + list_events, ) -client = TestClient(app) - class TestListVolumes: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/project/main/volumes/list") - assert response.status_code == 403 + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/volumes/list") + assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_lists_volumes(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_volumes_across_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.ADMIN) + project1 = await create_project(session, name="project1", owner=user) + volume1 = await create_volume( + session=session, + project=project1, + user=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + configuration=get_volume_configuration(name="volume1"), + ) + project2 = await create_project(session, name="project2", owner=user) + volume2 = await create_volume( + session=session, + project=project2, + user=user, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + configuration=get_volume_configuration(name="volume2"), + ) + response = await client.post( + "/api/volumes/list", + headers=get_auth_headers(user.token), + json={}, + ) + assert response.status_code == 200, response.json() + assert response.json() == [ + { + "id": str(volume2.id), + "name": volume2.name, + "project_name": project2.name, + "user": user.name, + "configuration": json.loads(volume2.configuration), + "external": 
False, + "created_at": "2023-01-02T03:05:00+00:00", + "last_processed_at": "2023-01-02T03:05:00+00:00", + "status": "submitted", + "status_message": None, + "deleted": False, + "deleted_at": None, + "volume_id": None, + "provisioning_data": None, + "cost": 0.0, + "attachments": [], + "attachment_data": None, + }, + { + "id": str(volume1.id), + "name": volume1.name, + "project_name": project1.name, + "user": user.name, + "configuration": json.loads(volume1.configuration), + "external": False, + "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", + "status": "submitted", + "status_message": None, + "deleted": False, + "deleted_at": None, + "volume_id": None, + "provisioning_data": None, + "cost": 0.0, + "attachments": [], + "attachment_data": None, + }, + ] + response = await client.post( + "/api/volumes/list", + headers=get_auth_headers(user.token), + json={ + "prev_created_at": "2023-01-02T03:05:00+00:00", + "prev_id": str(volume2.id), + }, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(volume1.id), + "name": volume1.name, + "project_name": project1.name, + "user": user.name, + "configuration": json.loads(volume1.configuration), + "external": False, + "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", + "status": "submitted", + "status_message": None, + "deleted": False, + "deleted_at": None, + "volume_id": None, + "provisioning_data": None, + "cost": 0.0, + "attachments": [], + "attachment_data": None, + }, + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_non_admin_cannot_see_others_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user1 = await create_user(session, name="user1", global_role=GlobalRole.USER) + user2 = await create_user(session, name="user2", global_role=GlobalRole.USER) + project1 = await create_project(session, 
name="project1", owner=user1) + project2 = await create_project(session, name="project2", owner=user2) + await add_project_member( + session=session, project=project1, user=user1, project_role=ProjectRole.USER + ) + await add_project_member( + session=session, project=project2, user=user2, project_role=ProjectRole.USER + ) + volume1 = await create_volume( + session=session, + project=project1, + user=user1, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + configuration=get_volume_configuration(name="volume1"), + ) + await create_volume( + session=session, + project=project2, + user=user2, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + configuration=get_volume_configuration(name="volume2"), + ) + response = await client.post( + "/api/volumes/list", + headers=get_auth_headers(user1.token), + json={}, + ) + assert response.status_code == 200, response.json() + assert response.json() == [ + { + "id": str(volume1.id), + "name": volume1.name, + "project_name": project1.name, + "user": user1.name, + "configuration": json.loads(volume1.configuration), + "external": False, + "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", + "status": "submitted", + "status_message": None, + "deleted": False, + "deleted_at": None, + "volume_id": None, + "provisioning_data": None, + "cost": 0.0, + "attachments": [], + "attachment_data": None, + }, + ] + + +class TestListProjectVolumes: + @pytest.mark.asyncio + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/volumes/list") + assert response.status_code in [401, 403] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_lists_volumes(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ 
-44,38 +205,46 @@ async def test_lists_volumes(self, test_db, session: AsyncSession): volume = await create_volume( session=session, project=project, + user=user, created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/volumes/list", headers=get_auth_headers(user.token), ) assert response.status_code == 200 assert response.json() == [ { + "id": str(volume.id), "name": volume.name, "project_name": project.name, + "user": user.name, "configuration": json.loads(volume.configuration), "external": False, "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", "status": "submitted", "status_message": None, + "deleted": False, + "deleted_at": None, "volume_id": None, "provisioning_data": None, + "cost": 0.0, + "attachments": [], "attachment_data": None, - "volume_model_id": str(volume.id), } ] class TestGetVolume: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/project/main/volumes/get") - assert response.status_code == 403 + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/volumes/get") + assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_returns_volume(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_volume(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -84,36 +253,46 @@ async def test_returns_volume(self, test_db, session: AsyncSession): volume = await create_volume( session=session, project=project, + user=user, created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), ) - response = client.post( + response 
= await client.post( f"/api/project/{project.name}/volumes/get", headers=get_auth_headers(user.token), json={"name": volume.name}, ) assert response.status_code == 200 assert response.json() == { + "id": str(volume.id), "name": volume.name, "project_name": project.name, + "user": user.name, "configuration": json.loads(volume.configuration), "external": False, "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", "status": "submitted", "status_message": None, + "deleted": False, + "deleted_at": None, "volume_id": None, "provisioning_data": None, + "cost": 0.0, + "attachments": [], "attachment_data": None, - "volume_model_id": str(volume.id), } @pytest.mark.asyncio - async def test_returns_400_if_volume_does_not_exist(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_if_volume_does_not_exist( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) - response = client.post( + response = await client.post( f"/api/project/{project.name}/volumes/get", headers=get_auth_headers(user.token), json={"name": "some_volume"}, @@ -123,13 +302,14 @@ async def test_returns_400_if_volume_does_not_exist(self, test_db, session: Asyn class TestCreateVolume: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/project/main/volumes/create") - assert response.status_code == 403 + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/volumes/create") + assert response.status_code in [401, 403] @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], 
indirect=True) @freeze_time(datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)) - async def test_creates_volume(self, test_db, session: AsyncSession): + async def test_creates_volume(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -138,37 +318,47 @@ async def test_creates_volume(self, test_db, session: AsyncSession): configuration = get_volume_configuration(backend=BackendType.AWS) with patch("uuid.uuid4") as m: m.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = client.post( + response = await client.post( f"/api/project/{project.name}/volumes/create", headers=get_auth_headers(user.token), json={"configuration": configuration.dict()}, ) assert response.status_code == 200 assert response.json() == { + "id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", "name": configuration.name, "project_name": project.name, "configuration": configuration, + "user": user.name, "external": False, "created_at": "2023-01-02T03:04:00+00:00", + "last_processed_at": "2023-01-02T03:04:00+00:00", "status": "submitted", "status_message": None, + "deleted": False, + "deleted_at": None, "volume_id": None, "provisioning_data": None, + "cost": 0.0, + "attachments": [], "attachment_data": None, - "volume_model_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", } res = await session.execute(select(VolumeModel)) assert res.scalar_one() + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume created. 
Status: SUBMITTED" class TestDeleteVolumes: @pytest.mark.asyncio - async def test_returns_40x_if_not_authenticated(self, test_db, session: AsyncSession): - response = client.post("/api/project/main/volumes/delete") - assert response.status_code == 403 + async def test_returns_40x_if_not_authenticated(self, client: AsyncClient): + response = await client.post("/api/project/main/volumes/delete") + assert response.status_code in [401, 403] @pytest.mark.asyncio - async def test_deletes_volumes(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_volumes(self, test_db, session: AsyncSession, client: AsyncClient): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -177,44 +367,44 @@ async def test_deletes_volumes(self, test_db, session: AsyncSession): volume = await create_volume( session=session, project=project, + user=user, volume_provisioning_data=get_volume_provisioning_data(), ) - with patch( - "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" - ) as m: - aws_mock = Mock() - m.return_value = aws_mock - response = client.post( - f"/api/project/{project.name}/volumes/delete", - headers=get_auth_headers(user.token), - json={"names": [volume.name]}, - ) - aws_mock.compute.return_value.delete_volume.assert_called() + response = await client.post( + f"/api/project/{project.name}/volumes/delete", + headers=get_auth_headers(user.token), + json={"names": [volume.name]}, + ) assert response.status_code == 200 await session.refresh(volume) - assert volume.deleted + assert volume.to_be_deleted + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume marked for deletion" @pytest.mark.asyncio - async def test_returns_400_when_deleting_volumes_in_use(self, test_db, session: AsyncSession): + @pytest.mark.parametrize("test_db", ["sqlite", 
"postgres"], indirect=True) + async def test_returns_400_when_volumes_in_use( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) - pool = await create_pool(session=session, project=project) await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.USER ) volume = await create_volume( session=session, project=project, + user=user, volume_provisioning_data=get_volume_provisioning_data(), ) instance = await create_instance( session=session, project=project, - pool=pool, ) - volume.instances.append(instance) + volume.attachments.append(VolumeAttachmentModel(instance=instance)) await session.commit() - response = client.post( + response = await client.post( f"/api/project/{project.name}/volumes/delete", headers=get_auth_headers(user.token), json={"names": [volume.name]}, @@ -222,3 +412,5 @@ async def test_returns_400_when_deleting_volumes_in_use(self, test_db, session: assert response.status_code == 400 await session.refresh(volume) assert not volume.deleted + events = await list_events(session) + assert len(events) == 0 diff --git a/src/tests/_internal/server/services/backends/__init__.py b/src/tests/_internal/server/services/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/backends/test_provisioning.py b/src/tests/_internal/server/services/backends/test_provisioning.py new file mode 100644 index 0000000000..51637d2c3a --- /dev/null +++ b/src/tests/_internal/server/services/backends/test_provisioning.py @@ -0,0 +1,165 @@ +import pytest + +from dstack._internal import settings +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceType +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.services.backends.provisioning import ( + 
resolve_provisioning_image, +) +from dstack._internal.server.testing.common import get_job_provisioning_data + + +class TestResolveProvisioningImageName: + @staticmethod + def _create_job_provisioning_data_with_instance_type( + backend: BackendType, + instance_type: str, + ) -> JobProvisioningData: + job_provisioning_data = get_job_provisioning_data(backend=backend) + job_provisioning_data.instance_type = InstanceType( + name=instance_type, + resources=job_provisioning_data.instance_type.resources, + ) + return job_provisioning_data + + @staticmethod + def _call_resolve_provisioning_image( + image_name: str, + backend: BackendType, + instance_type: str, + ) -> str: + job_provisioning_data = ( + TestResolveProvisioningImageName._create_job_provisioning_data_with_instance_type( + backend, + instance_type, + ) + ) + image_name, _ = resolve_provisioning_image(image_name, None, job_provisioning_data) + return image_name + + @pytest.mark.parametrize( + ("suffix", "instance_type"), + [ + ("-base", "p6-b200.48xlarge"), + ("-devel", "p5.48xlarge"), + ], + ) + def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: str) -> None: + image_name = ( + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}{suffix}" + f"-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + result = self._call_resolve_provisioning_image( + image_name, + BackendType.AWS, + instance_type, + ) + expected = ( + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}" + f"-devel-efa-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + [ + "p5.48xlarge", + "p5e.48xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "g6.8xlarge", + "g6e.8xlarge", + "g7e.8xlarge", + ], + ) + def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str) -> None: + image_name = ( + 
f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}{suffix}" + f"-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + result = self._call_resolve_provisioning_image( + image_name, + BackendType.AWS, + instance_type, + ) + expected = ( + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}" + f"-devel-efa-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "backend", + [BackendType.GCP, BackendType.AZURE, BackendType.LAMBDA], + ) + @pytest.mark.parametrize( + "instance_type", + ["standard-4", "p5.xlarge", "p6.2xlarge", "g6.xlarge"], + ) + def test_no_patch_non_aws_backends( + self, + backend: BackendType, + suffix: str, + instance_type: str, + ) -> None: + image_name = ( + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}{suffix}" + f"-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + result = self._call_resolve_provisioning_image(image_name, backend, instance_type) + assert result == image_name + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + ["t3.micro", "m5.large", "c5.xlarge", "r5.2xlarge", "m6i.large", "g6.xlarge"], + ) + def test_no_patch_non_efa_aws_instances(self, instance_type: str, suffix: str) -> None: + image_name = f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}{suffix}" + result = self._call_resolve_provisioning_image( + image_name, + BackendType.AWS, + instance_type, + ) + assert result == image_name + + @pytest.mark.parametrize( + "instance_type", + ["p5.xlarge", "p6.2xlarge", "t3.micro", "m5.large"], + ) + @pytest.mark.parametrize( + "image_name", + [ + "ubuntu:20.04", + "nvidia/cuda:11.8-runtime-ubuntu20.04", + "python:3.9-slim", + "custom/image:latest", + 
f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-custom", + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-devel-efa", + f"{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}", + ], + ) + def test_no_patch_other_images(self, instance_type: str, image_name: str) -> None: + result = self._call_resolve_provisioning_image( + image_name, + BackendType.AWS, + instance_type, + ) + assert result == image_name + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + def test_patch_aws_efa_image_with_registry_prefix(self, suffix: str) -> None: + registry = "registry.example" + image_name = ( + f"{registry}/{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}{suffix}" + f"-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + result = self._call_resolve_provisioning_image(image_name, BackendType.AWS, "p5.48xlarge") + expected = ( + f"{registry}/{settings.DSTACK_DOCKER_BASE_IMAGE}:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}" + f"-devel-efa-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + ) + assert result == expected diff --git a/src/tests/_internal/server/services/encryption/__init__.py b/src/tests/_internal/server/services/encryption/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/encryption/keys/__init__.py b/src/tests/_internal/server/services/encryption/keys/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/encryption/keys/test_aes.py b/src/tests/_internal/server/services/encryption/keys/test_aes.py new file mode 100644 index 0000000000..a369ea6486 --- /dev/null +++ b/src/tests/_internal/server/services/encryption/keys/test_aes.py @@ -0,0 +1,18 @@ +from dstack._internal.server.services.encryption.keys.aes import ( + AESEncryptionKey, + AESEncryptionKeyConfig, +) + + +def get_aes_secret() -> str: + return 
"E5yzN6V3XvBq/f085ISWFCdgnOGED0kuFaAkASlmmO4=" + + +class TestAESEncryptionKey: + def test_encrypts_decrypts(self): + key = AESEncryptionKey(AESEncryptionKeyConfig(secret=get_aes_secret(), name="key1")) + plaintext = "This is a test string." + encrypted_text = key.encrypt(plaintext) + assert encrypted_text != plaintext + decrypted_text = key.decrypt(encrypted_text) + assert decrypted_text == plaintext diff --git a/src/tests/_internal/server/services/encryption/test_encryption.py b/src/tests/_internal/server/services/encryption/test_encryption.py new file mode 100644 index 0000000000..7468f20ebe --- /dev/null +++ b/src/tests/_internal/server/services/encryption/test_encryption.py @@ -0,0 +1,75 @@ +import pytest + +from dstack._internal.server.services.encryption import ( + EncryptionError, + decrypt, + encrypt, + encryption_keys_context, + get_identity_encryption_key, +) +from dstack._internal.server.services.encryption.keys.aes import ( + AESEncryptionKey, + AESEncryptionKeyConfig, +) + + +class TestEncrypt: + def test_encrypts_with_identity_when_no_keys_set(self): + text = "some text" + assert encrypt(text) == f"enc:identity:noname:{text}" + + def test_encrypts_with_first_key(self): + text = "some text" + with encryption_keys_context( + [ + AESEncryptionKey( + AESEncryptionKeyConfig( + secret="cR2r1JmkPyL6edBQeHKz6ZBjCfS2oWk87Gc2G3wHVoA=", + name="key1", + ) + ), + get_identity_encryption_key(), + ] + ): + assert encrypt(plaintext=text).startswith("enc:aes:key1") + + +class TestDecrypt: + def test_tries_all_keys(self): + ciphertext = "enc:identity:noname:encrypted text" + with pytest.raises(EncryptionError): + with encryption_keys_context( + [ + AESEncryptionKey( + AESEncryptionKeyConfig( + secret="cR2r1JmkPyL6edBQeHKz6ZBjCfS2oWk87Gc2G3wHVoA=", + name="key1", + ) + ), + AESEncryptionKey( + AESEncryptionKeyConfig( + secret="4nr0Hr4bck/xURGbpdDwnDwBP1iGnTtYZT752h/kWno=", + name="key2", + ) + ), + ] + ): + decrypt(ciphertext) + with encryption_keys_context( + [ + 
AESEncryptionKey( + AESEncryptionKeyConfig( + secret="cR2r1JmkPyL6edBQeHKz6ZBjCfS2oWk87Gc2G3wHVoA=", + name="key1", + ) + ), + AESEncryptionKey( + AESEncryptionKeyConfig( + secret="4nr0Hr4bck/xURGbpdDwnDwBP1iGnTtYZT752h/kWno=", + name="key2", + ) + ), + get_identity_encryption_key(), + ] + ): + assert decrypt(ciphertext) == "encrypted text" diff --git a/src/tests/_internal/server/services/gateways/test_autoscalers.py b/src/tests/_internal/server/services/gateways/test_autoscalers.py deleted file mode 100644 index 2db8dad39f..0000000000 --- a/src/tests/_internal/server/services/gateways/test_autoscalers.py +++ /dev/null @@ -1,137 +0,0 @@ -import datetime -from typing import Dict -from unittest.mock import patch - -import pytest - -from dstack._internal.server.services.gateways.autoscalers import ReplicaInfo, RPSAutoscaler -from dstack._internal.server.services.gateways.client import Stat - - -@pytest.fixture -def rps_scaler(): - return RPSAutoscaler(0, 5, 10, 5 * 60, 10 * 60) - - -@pytest.fixture -def time(): - dt = datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) - with patch("dstack._internal.utils.common.get_current_datetime") as mock: - mock.return_value = dt - yield dt - - -def stats(rps: float) -> Dict[int, Stat]: - return {60: Stat(requests=int(rps * 60), request_time=0.1)} - - -def replica(time: datetime.datetime, active: bool = True, timestamp: int = -3600) -> ReplicaInfo: - return ReplicaInfo( - active=active, - timestamp=time + datetime.timedelta(seconds=timestamp), - ) - - -class TestRPSAutoscaler: - def test_do_not_scale(self, rps_scaler, time): - assert rps_scaler.scale([replica(time, active=True)], stats(rps=10)) == 0 - - def test_scale_up(self, rps_scaler, time): - assert rps_scaler.scale([replica(time, active=True)], stats(rps=20)) == 1 - - def test_scale_up_high_load(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - replica(time, active=True), - replica(time, active=True), - ], - stats(rps=50), - ) - == 3 - ) - - def 
test_scale_up_replicas_limit(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - replica(time, active=True), - replica(time, active=True), - ], - stats(rps=1000), - ) - == 3 - ) - - def test_scale_down(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [replica(time, active=True), replica(time, active=True)], stats(rps=5) - ) - == -1 - ) - - def test_scale_up_delayed_running(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - # submitted 1 minute ago, but the delay is 5 minutes - replica(time, active=True, timestamp=-60), - ], - stats(rps=20), - ) - == 0 - ) - - def test_scale_up_delayed_terminated(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - replica(time, active=True), - # terminated 1 minute ago, but the delay is 5 minutes - replica(time, active=False, timestamp=-60), - ], - stats(rps=20), - ) - == 0 - ) - - def test_scale_down_delayed(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - replica(time, active=True), - # submitted 5 minutes ago, but the delay is 10 minutes - replica(time, active=True, timestamp=-5 * 60), - ], - stats(rps=5), - ) - == 0 - ) - - def test_scale_from_zero_immediately(self, rps_scaler, time): - assert rps_scaler.scale([], stats(rps=5)) == 1 - - def test_scale_from_zero_immediately_terminated(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - # terminated 1 minute ago, but there are requests - replica(time, active=False, timestamp=-60), - ], - stats(rps=5), - ) - == 1 - ) - - def test_scale_to_zero(self, rps_scaler, time): - assert ( - rps_scaler.scale( - [ - replica(time, active=True), - replica(time, active=True), - ], - stats(rps=0), - ) - == -2 - ) diff --git a/src/tests/_internal/server/services/gateways/test_gateways.py b/src/tests/_internal/server/services/gateways/test_gateways.py new file mode 100644 index 0000000000..aaf8fe6d52 --- /dev/null +++ b/src/tests/_internal/server/services/gateways/test_gateways.py @@ -0,0 +1,88 @@ +import pytest +from 
sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.proxy.gateway.const import SERVICE_SCALING_WINDOWS +from dstack._internal.proxy.gateway.schemas.stats import Stat +from dstack._internal.server.services.gateways import ( + _merge_per_window_stats, + get_gateway_compute_models, +) +from dstack._internal.server.testing.common import ( + create_backend, + create_gateway, + create_gateway_compute, + create_project, +) + + +class TestMergePerWindowStats: + def test_empty_returns_zero_stats(self): + result = _merge_per_window_stats([]) + for window in SERVICE_SCALING_WINDOWS: + assert result[window].requests == 0 + assert result[window].request_time == 0.0 + + def test_single_replica_returns_same_values(self): + stats = {w: Stat(requests=10, request_time=0.5) for w in SERVICE_SCALING_WINDOWS} + result = _merge_per_window_stats([stats]) + for window in SERVICE_SCALING_WINDOWS: + assert result[window].requests == 10 + assert result[window].request_time == pytest.approx(0.5) + + def test_multiple_replicas_sums_requests_and_averages_time(self): + stats_a = {w: Stat(requests=10, request_time=1.0) for w in SERVICE_SCALING_WINDOWS} + stats_b = {w: Stat(requests=30, request_time=3.0) for w in SERVICE_SCALING_WINDOWS} + result = _merge_per_window_stats([stats_a, stats_b]) + for window in SERVICE_SCALING_WINDOWS: + assert result[window].requests == 40 + assert result[window].request_time == pytest.approx(2.5) # (10*1 + 30*3) / 40 + + def test_zero_requests_across_all_replicas_returns_zero_time(self): + stats_a = {w: Stat(requests=0, request_time=0.0) for w in SERVICE_SCALING_WINDOWS} + stats_b = {w: Stat(requests=0, request_time=0.0) for w in SERVICE_SCALING_WINDOWS} + result = _merge_per_window_stats([stats_a, stats_b]) + for window in SERVICE_SCALING_WINDOWS: + assert result[window].requests == 0 + assert result[window].request_time == 0.0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class 
TestGetGatewayComputeModels: + async def test_new_style_returns_gateway_computes(self, test_db, session: AsyncSession): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id + ) + compute = await create_gateway_compute( + session=session, gateway_id=gateway.id, backend_id=backend.id + ) + await session.refresh(gateway, ["gateway_computes", "gateway_compute"]) + result = get_gateway_compute_models(gateway) + assert len(result) == 1 + assert result[0].id == compute.id + + async def test_old_style_returns_single_compute(self, test_db, session: AsyncSession): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id + ) + gateway.gateway_compute_id = compute.id + await session.commit() + await session.refresh(gateway, ["gateway_computes", "gateway_compute"]) + result = get_gateway_compute_models(gateway) + assert len(result) == 1 + assert result[0].id == compute.id + + async def test_no_computes_returns_empty(self, test_db, session: AsyncSession): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, project_id=project.id, backend_id=backend.id + ) + await session.refresh(gateway, ["gateway_computes", "gateway_compute"]) + result = get_gateway_compute_models(gateway) + assert result == [] diff --git a/src/tests/_internal/server/services/jobs/__init__.py b/src/tests/_internal/server/services/jobs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/jobs/configurators/__init__.py 
b/src/tests/_internal/server/services/jobs/configurators/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/jobs/configurators/test_base.py b/src/tests/_internal/server/services/jobs/configurators/test_base.py new file mode 100644 index 0000000000..0b4fb2cbec --- /dev/null +++ b/src/tests/_internal/server/services/jobs/configurators/test_base.py @@ -0,0 +1,72 @@ +from typing import Union + +import pytest + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.volumes import InstanceMountPoint, MountPoint, VolumeMountPoint +from dstack._internal.server.services.jobs.configurators.base import interpolate_job_volumes + + +class TestInterpolateJobVolumes: + @pytest.mark.parametrize( + ["run_volumes", "job_num", "job_volumes"], + [ + pytest.param( + [VolumeMountPoint(name="volume", path="/volume")], + 0, + [VolumeMountPoint(name=["volume"], path="/volume")], + id="no_interpolation", + ), + pytest.param( + [InstanceMountPoint(instance_path="/volume", path="/volume")], + 0, + [InstanceMountPoint(instance_path="/volume", path="/volume")], + id="instance_mount", + ), + pytest.param( + [ + VolumeMountPoint( + name="job${{dstack.job_num}}-rank${{dstack.node_rank}}", path="/volume" + ) + ], + 2, + [VolumeMountPoint(name=["job2-rank2"], path="/volume")], + id="job_num_and_node_rank", + ), + ], + ) + def test_interpolates_volumes( + self, + run_volumes: list[Union[MountPoint, str]], + job_num: int, + job_volumes: list[MountPoint], + ): + assert interpolate_job_volumes(run_volumes, job_num) == job_volumes + + @pytest.mark.parametrize( + ["run_volumes", "job_num"], + [ + pytest.param( + [VolumeMountPoint(name="${{}", path="/volume")], + 0, + id="invalid_syntax", + ), + pytest.param( + [VolumeMountPoint(name="${{ unknown.namespace }}", path="/volume")], + 0, + id="unknown_namespace", + ), + pytest.param( + [VolumeMountPoint(name="${{ dstack.var }}", path="/volume")], + 0, + 
id="unknown_var", + ), + ], + ) + def test_raises_server_client_error( + self, + run_volumes: list[Union[MountPoint, str]], + job_num: int, + ): + with pytest.raises(ServerClientError): + assert interpolate_job_volumes(run_volumes, job_num) diff --git a/src/tests/_internal/server/services/jobs/configurators/test_service.py b/src/tests/_internal/server/services/jobs/configurators/test_service.py new file mode 100644 index 0000000000..a8410fcfac --- /dev/null +++ b/src/tests/_internal/server/services/jobs/configurators/test_service.py @@ -0,0 +1,434 @@ +from unittest.mock import Mock + +import pytest + +from dstack._internal import settings +from dstack._internal.core.models.configurations import ( + OPENAI_MODEL_PROBE_TIMEOUT, + ProbeConfig, + PythonVersion, + ReplicaGroup, + ServiceConfiguration, +) +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.server.services.docker import ImageConfig +from dstack._internal.server.services.jobs.configurators.base import get_default_image +from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator +from dstack._internal.server.testing.common import get_run_spec + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestProbes: + async def test_default_probe_when_model_set(self): + """When model is set but probes omitted, a default model probe should be generated.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + probes = job_specs[0].probes + assert len(probes) 
== 1 + probe = probes[0] + assert probe.type == "http" + assert probe.method == "post" + assert probe.url == "/v1/chat/completions" + assert probe.timeout == OPENAI_MODEL_PROBE_TIMEOUT + assert len(probe.headers) == 1 + assert probe.headers[0].name == "Content-Type" + assert probe.headers[0].value == "application/json" + assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in (probe.body or "") + assert "max_tokens" in (probe.body or "") + + async def test_explicit_probes_not_overridden(self): + """When probes are explicitly set, they should be used as-is.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + probes=[ProbeConfig(type="http", url="/health")], + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + probes = job_specs[0].probes + assert len(probes) == 1 + assert probes[0].url == "/health" + + async def test_explicit_empty_probes(self): + """When probes is explicitly set to empty list, no probes should be generated.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + probes=[], + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + assert len(job_specs[0].probes) == 0 + + async def test_no_probe_when_no_model(self): + """When neither model nor probes are set, no probes should be generated.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = 
ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + assert len(job_specs[0].probes) == 0 + + +def _make_run_spec(replicas, **service_kwargs): + configuration = ServiceConfiguration( + port=80, + replicas=replicas, + **service_kwargs, + ) + return get_run_spec(run_name="run", repo_id="id", configuration=configuration) + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestPerGroupOverrides: + """Verifies that ServiceJobConfigurator picks up per-replica-group + image-source fields (image, docker, python, nvcc, privileged, etc).""" + + async def test_image_name_uses_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="custom:1.0", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == "custom:1.0" + + async def test_image_name_uses_dind_when_group_docker_true(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == settings.DSTACK_DIND_IMAGE + + async def test_image_name_uses_nvcc_default_when_group_nvcc_true(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + nvcc=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == get_default_image(nvcc=True) + + async def test_image_name_falls_back_to_service_image(self): + run_spec = _make_run_spec( + image="svc:1.0", + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert 
configurator._image_name() == "svc:1.0" + + async def test_privileged_true_when_group_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._privileged() is True + + async def test_privileged_returns_group_privileged(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="x", + privileged=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._privileged() is True + + async def test_privileged_defers_to_super_when_group_unset(self): + run_spec = _make_run_spec( + image="svc:1.0", + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + # Service-level privileged defaults to False + assert configurator._privileged() is False + + async def test_dstack_image_commands_injects_start_dockerd_for_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._dstack_image_commands() == ["start-dockerd"] + + async def test_dstack_image_commands_empty_for_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="alpine", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._dstack_image_commands() == [] + + async def test_shell_bash_when_group_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], 
+ ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._shell() == "/bin/bash" + + async def test_shell_sh_when_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="alpine", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._shell() == "/bin/sh" + + async def test_python_uses_group_python(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + python=PythonVersion.PY312, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._python() == "3.12" + + async def test_user_looks_up_group_image(self, monkeypatch: pytest.MonkeyPatch): + """When a group sets its own `image`, _user() queries that image's config.""" + image_config = ImageConfig.parse_obj({"User": "nginx", "Entrypoint": None, "Cmd": []}) + monkeypatch.setattr( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + Mock(return_value=image_config), + ) + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="nginxinc/nginx-unprivileged", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + user = await configurator._user() + assert user is not None + + async def test_user_does_not_lookup_for_group_docker(self, monkeypatch: pytest.MonkeyPatch): + """`docker: true` should not trigger an image-config registry call.""" + mock_get_image_config = Mock() + monkeypatch.setattr( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + mock_get_image_config, + ) + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, 
replica_group_name="a") + await configurator._user() + mock_get_image_config.assert_not_called() + + async def test_spot_policy_uses_group_value(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + spot_policy=SpotPolicy.SPOT, + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._spot_policy() == SpotPolicy.SPOT + + async def test_spot_policy_defaults_to_ondemand_when_group_unset(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._spot_policy() == SpotPolicy.ONDEMAND + + async def test_different_groups_different_spot_policies(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="spot", + count=Range(min=1, max=1), + commands=["x"], + spot_policy=SpotPolicy.SPOT, + ), + ReplicaGroup( + name="od", + count=Range(min=1, max=1), + commands=["y"], + spot_policy=SpotPolicy.ONDEMAND, + ), + ], + ) + assert ( + ServiceJobConfigurator(run_spec, replica_group_name="spot")._spot_policy() + == SpotPolicy.SPOT + ) + assert ( + ServiceJobConfigurator(run_spec, replica_group_name="od")._spot_policy() + == SpotPolicy.ONDEMAND + ) + + async def test_reservation_uses_group_value(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + reservation="my-reservation", + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._reservation() == "my-reservation" + + async def test_reservation_defaults_to_none_when_group_unset(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert 
configurator._reservation() is None + + async def test_different_groups_different_reservations(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + reservation="res-a", + ), + ReplicaGroup( + name="b", + count=Range(min=1, max=1), + commands=["y"], + reservation="res-b", + ), + ], + ) + assert ServiceJobConfigurator(run_spec, replica_group_name="a")._reservation() == "res-a" + assert ServiceJobConfigurator(run_spec, replica_group_name="b")._reservation() == "res-b" diff --git a/src/tests/_internal/server/services/jobs/configurators/test_task.py b/src/tests/_internal/server/services/jobs/configurators/test_task.py new file mode 100644 index 0000000000..3c80bf226f --- /dev/null +++ b/src/tests/_internal/server/services/jobs/configurators/test_task.py @@ -0,0 +1,121 @@ +from typing import Optional +from unittest.mock import patch + +import pytest + +from dstack._internal.core.models.configurations import TaskConfiguration +from dstack._internal.core.models.runs import JobSSHKey +from dstack._internal.server.services.docker import ImageConfig +from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator +from dstack._internal.server.testing.common import get_run_spec + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestSSHKey: + async def test_single_node(self): + configuration = TaskConfiguration(nodes=1, image="debian") + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + assert job_specs[0].ssh_key is None + + async def test_multi_node(self): + configuration = TaskConfiguration(nodes=2, image="debian") + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + with 
patch("dstack._internal.utils.crypto.generate_rsa_key_pair_bytes") as gen_mock: + gen_mock.side_effect = [(b"private1", b"public1"), (b"private2", b"public2")] + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 2 + assert job_specs[0].ssh_key == JobSSHKey(private="private1", public="public1") + assert job_specs[1].ssh_key == JobSSHKey(private="private1", public="public1") + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestCommands: + @pytest.mark.parametrize( + ["commands", "expected_commands"], + [ + pytest.param([], ["/entrypoint.sh", "-v"], id="no-commands"), + pytest.param(["-x", "-u"], ["/entrypoint.sh", "-v", "-x", "-u"], id="with-commands"), + ], + ) + async def test_with_entrypoint(self, commands: list[str], expected_commands: list[str]): + configuration = TaskConfiguration( + image="debian", + entrypoint="/entrypoint.sh -v", + commands=commands, + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert job_specs[0].commands == expected_commands + + @pytest.mark.parametrize( + ["shell", "expected_shell"], + [ + pytest.param(None, "/bin/sh", id="default-shell"), + pytest.param("sh", "/bin/sh", id="sh"), + pytest.param("bash", "/bin/bash", id="bash"), + pytest.param("/usr/bin/zsh", "/usr/bin/zsh", id="custom-shell"), + ], + ) + async def test_with_commands_and_image(self, shell: Optional[str], expected_shell: str): + configuration = TaskConfiguration(image="debian", commands=["sleep inf"], shell=shell) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert job_specs[0].commands == [expected_shell, "-i", "-c", "sleep inf"] + + @pytest.mark.parametrize( + ["shell", "expected_shell"], + [ + 
pytest.param(None, "/bin/bash", id="default-shell"), + pytest.param("sh", "/bin/sh", id="sh"), + pytest.param("bash", "/bin/bash", id="bash"), + pytest.param("/usr/bin/zsh", "/usr/bin/zsh", id="custom-shell"), + ], + ) + async def test_with_commands_no_image(self, shell: Optional[str], expected_shell: str): + configuration = TaskConfiguration(python="3.12", commands=["sleep inf"], shell=shell) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert job_specs[0].commands == [ + expected_shell, + "-i", + "-c", + ( + "eval $(echo 'export DSTACK_VENV_DIR=/dstack/venv' | sudo tee -a /dstack/profile)" + " && sudo rm -rf $DSTACK_VENV_DIR" + " && sudo mkdir $DSTACK_VENV_DIR" + " && sudo chown $(id -u):$(id -g) $DSTACK_VENV_DIR" + " && uv venv -q --prompt dstack -p 3.12 --seed $DSTACK_VENV_DIR" + " && eval $(echo '. $DSTACK_VENV_DIR/bin/activate' | sudo tee -a /dstack/profile)" + " && sleep inf" + ), + ] + + async def test_no_commands(self, image_config_mock: ImageConfig): + image_config_mock.entrypoint = ["/entrypoint.sh"] + image_config_mock.cmd = ["-f", "-x"] + configuration = TaskConfiguration(image="debian") + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = TaskJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert job_specs[0].commands == ["/entrypoint.sh", "-f", "-x"] diff --git a/src/tests/_internal/server/services/jobs/test_jobs.py b/src/tests/_internal/server/services/jobs/test_jobs.py new file mode 100644 index 0000000000..21d69062f1 --- /dev/null +++ b/src/tests/_internal/server/services/jobs/test_jobs.py @@ -0,0 +1,101 @@ +from unittest.mock import patch + +import pytest + +import dstack._internal.server.settings as server_settings +from dstack._internal.core.models.common import RegistryAuth +from 
dstack._internal.core.models.configurations import TaskConfiguration +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.repos.local import LocalRunRepoData +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.services.docker import ImageConfig +from dstack._internal.server.services.jobs import get_job_specs_from_run_spec + + +@pytest.mark.parametrize( + "configuration, expected_calls", + [ + pytest.param( + # No need to request the registry if our default image is used. + TaskConfiguration(commands=["sleep infinity"]), + 0, + id="default-dstack-image", + ), + pytest.param( + TaskConfiguration(image="ubuntu"), + 1, + id="custom-image", + ), + pytest.param( + TaskConfiguration(image="ubuntu", commands=["sleep infinity"]), + 1, + id="custom-image-with-commands", + ), + pytest.param( + TaskConfiguration(image="ubuntu", user="root"), + 1, + id="custom-image-with-user", + ), + pytest.param( + # Setting `commands` and `user` is a known hack that we advertised to some customers + # to avoid registry requests. + TaskConfiguration(image="ubuntu", commands=["sleep infinity"], user="root"), + 0, + id="custom-image-with-commands-and-user", + ), + ], +) +@pytest.mark.asyncio +async def test_get_job_specs_from_run_spec_image_config_calls( + configuration: TaskConfiguration, expected_calls: int +) -> None: + """ + Test the number of times we attempt to fetch the image config from the Docker registry. + + Whenever possible, we prefer not to request the registry to avoid hitting rate limits. 
+ """ + + run_spec = RunSpec( + run_name="test-run", + repo_data=LocalRunRepoData(repo_dir="/"), + configuration=configuration, + profile=Profile(name="default"), + ssh_key_pub="user_ssh_key", + ) + fake_image_config = ImageConfig.parse_obj({"Entrypoint": ["/bin/bash"]}) + with patch( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + return_value=fake_image_config, + ) as mock_get_image_config: + await get_job_specs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + assert mock_get_image_config.call_count == expected_calls + + +@pytest.mark.asyncio +async def test_get_image_config_uses_server_default_registry(monkeypatch) -> None: + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY", "registry.example") + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME", "user") + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD", "pass") + run_spec = RunSpec( + run_name="test-run", + repo_data=LocalRunRepoData(repo_dir="/"), + configuration=TaskConfiguration(image="ubuntu"), + profile=Profile(name="default"), + ssh_key_pub="user_ssh_key", + ) + fake_image_config = ImageConfig.parse_obj({"Entrypoint": ["/bin/bash"]}) + with patch( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + return_value=fake_image_config, + ) as mock_get_image_config: + job_specs = await get_job_specs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + mock_get_image_config.assert_called_once_with( + "registry.example/ubuntu", + RegistryAuth(username="user", password="pass"), + ) + + assert len(job_specs) == 1 + # NOTE: server defaults should not be set on the job spec, + # especially the credentials, so as not to leak them in the API. 
+ assert job_specs[0].image_name == "ubuntu" + assert job_specs[0].registry_auth is None diff --git a/src/tests/_internal/server/services/prometheus/__init__.py b/src/tests/_internal/server/services/prometheus/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/prometheus/test_client_metrics.py b/src/tests/_internal/server/services/prometheus/test_client_metrics.py new file mode 100644 index 0000000000..9d21ff5360 --- /dev/null +++ b/src/tests/_internal/server/services/prometheus/test_client_metrics.py @@ -0,0 +1,45 @@ +from unittest.mock import MagicMock + +from dstack._internal.server.services.prometheus.client_metrics import run_metrics + + +class TestRunMetrics: + def test_log_submit_to_provision_duration(self, monkeypatch): + mock_histogram = MagicMock() + mock_labels = MagicMock() + mock_histogram.labels.return_value = mock_labels + monkeypatch.setattr(run_metrics, "_submit_to_provision_duration", mock_histogram) + + duration = 120.5 + project_name = "test-project" + run_type = "dev" + + run_metrics.log_submit_to_provision_duration(duration, project_name, run_type) + + mock_histogram.labels.assert_called_once_with(project_name=project_name, run_type=run_type) + mock_labels.observe.assert_called_once_with(duration) + + def test_increment_pending_runs(self, monkeypatch): + mock_counter = MagicMock() + mock_labels = MagicMock() + mock_counter.labels.return_value = mock_labels + + monkeypatch.setattr(run_metrics, "_pending_runs_total", mock_counter) + + project_name = "test-project" + run_type = "train" + + run_metrics.increment_pending_runs(project_name, run_type) + mock_counter.labels.assert_called_once_with(project_name=project_name, run_type=run_type) + mock_labels.inc.assert_called_once() + + def test_multiple_calls_to_log_submit_to_provision_duration(self): + run_metrics.log_submit_to_provision_duration(60.0, "project1", "dev") + run_metrics.log_submit_to_provision_duration(120.0, "project1", 
"prod") + run_metrics.log_submit_to_provision_duration(30.0, "project2", "dev") + + def test_multiple_calls_to_increment_pending_runs(self): + run_metrics.increment_pending_runs("project1", "dev") + run_metrics.increment_pending_runs("project1", "prod") + run_metrics.increment_pending_runs("project2", "dev") + run_metrics.increment_pending_runs("project1", "dev") diff --git a/src/tests/_internal/server/services/proxy/__init__.py b/src/tests/_internal/server/services/proxy/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/proxy/routers/__init__.py b/src/tests/_internal/server/services/proxy/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/proxy/routers/test_service_proxy.py b/src/tests/_internal/server/services/proxy/routers/test_service_proxy.py new file mode 100644 index 0000000000..cf31e8af0d --- /dev/null +++ b/src/tests/_internal/server/services/proxy/routers/test_service_proxy.py @@ -0,0 +1,280 @@ +from typing import Generator, Optional, Tuple +from unittest.mock import patch + +import httpx +import pytest +from fastapi import FastAPI +from fastapi.responses import PlainTextResponse + +from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo +from dstack._internal.proxy.lib.auth import BaseProxyAuthProvider +from dstack._internal.proxy.lib.repo import BaseProxyRepo +from dstack._internal.proxy.lib.services.service_connection import ServiceClient +from dstack._internal.proxy.lib.testing.auth import ProxyTestAuthProvider +from dstack._internal.proxy.lib.testing.common import ( + ProxyTestDependencyInjector, + make_project, + make_service, +) +from dstack._internal.server.services.proxy.routers.service_proxy import router + +MOCK_REPLICA_CLIENT_TIMEOUT = 8 + +# Using GatewayProxyRepo for tests because it is easier to populate than ServerProxyRepo +ProxyTestRepo = GatewayProxyRepo + + +@pytest.fixture +def 
mock_replica_client_httpbin(httpbin) -> Generator[None, None, None]: + """Mocks deployed services. Replaces them with httpbin""" + + with patch( + "dstack._internal.proxy.lib.services.service_connection.ServiceConnectionPool.get_or_add" + ) as add_connection_mock: + add_connection_mock.return_value.client.return_value = ServiceClient( + base_url=httpbin.url, timeout=MOCK_REPLICA_CLIENT_TIMEOUT + ) + yield + + +@pytest.fixture +def mock_replica_client_path_reporter() -> Generator[None, None, None]: + """Mocks deployed services. Replaces them with an app that returns the requested path""" + + app = FastAPI() + app.get("{path:path}")(lambda path: PlainTextResponse(path)) + client = ServiceClient(base_url="http://test/", transport=httpx.ASGITransport(app)) + with patch( + "dstack._internal.proxy.lib.services.service_connection.ServiceConnectionPool.get_or_add" + ) as add_connection_mock: + add_connection_mock.return_value.client.return_value = client + yield + + +def make_app( + repo: BaseProxyRepo, auth: BaseProxyAuthProvider = ProxyTestAuthProvider() +) -> FastAPI: + app = FastAPI() + app.state.proxy_dependency_injector = ProxyTestDependencyInjector(repo=repo, auth=auth) + app.include_router(router, prefix="/proxy/services") + return app + + +def make_client(app: FastAPI) -> httpx.AsyncClient: + return httpx.AsyncClient(transport=httpx.ASGITransport(app=app)) + + +def make_app_client( + repo: BaseProxyRepo = GatewayProxyRepo(), auth: BaseProxyAuthProvider = ProxyTestAuthProvider() +) -> Tuple[FastAPI, httpx.AsyncClient]: + app = make_app(repo, auth) + client = make_client(app) + return app, client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("method", ["get", "post", "put", "patch", "delete"]) +async def test_proxy(mock_replica_client_httpbin, method: str) -> None: + methods_without_body = "get", "delete" + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", 
"httpbin")) + _, client = make_app_client(repo) + req_body = "." * 20 * 2**20 if method not in methods_without_body else None + resp = await client.request( + method, + f"http://test-host:8888/proxy/services/test-proj/httpbin/{method}?a=b&c=", + headers={"User-Agent": "test-ua", "Connection": "keep-alive"}, + content=req_body, + ) + assert resp.status_code == 200 + resp_body = resp.json() + assert resp_body["url"] == f"http://test-host:8888/{method}?a=b&c=" + assert resp_body["args"] == {"a": "b", "c": ""} + assert resp_body["headers"]["Host"] == "test-host:8888" + assert resp_body["headers"]["User-Agent"] == "test-ua" + assert resp_body["headers"]["Connection"] == "keep-alive" + if method not in methods_without_body: + assert resp_body["data"] == req_body + + +@pytest.mark.asyncio +async def test_proxy_method_head(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + url = "http://test-host/proxy/services/test-proj/httpbin/" + get_resp = await client.get(url) + head_resp = await client.head(url) + assert get_resp.status_code == head_resp.status_code == 200 + assert head_resp.headers["Content-Length"] == get_resp.headers["Content-Length"] + assert int(head_resp.headers["Content-Length"]) > 0 + assert head_resp.content == b"" + + +@pytest.mark.asyncio +async def test_proxy_method_options(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + resp = await client.options("http://test-host/proxy/services/test-proj/httpbin/get") + assert resp.status_code == 200 + assert set(resp.headers["Allow"].split(", ")) == {"HEAD", "GET", "OPTIONS"} + assert 
resp.content == b"" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("code", [204, 304, 418, 503]) +async def test_proxy_status_codes(mock_replica_client_httpbin, code: int) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + resp = await client.get(f"http://test-host/proxy/services/test-proj/httpbin/status/{code}") + assert resp.status_code == code + + +@pytest.mark.asyncio +async def test_proxy_not_leaks_cookies(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + app = make_app(repo) + client1 = make_client(app) + client2 = make_client(app) + cookies_url = "http://test-host/proxy/services/test-proj/httpbin/cookies" + await client1.get(cookies_url + "/set?a=1") + await client1.get(cookies_url + "/set?b=2") + await client2.get(cookies_url + "/set?a=3") + resp1 = await client1.get(cookies_url) + resp2 = await client2.get(cookies_url) + assert resp1.json()["cookies"] == {"a": "1", "b": "2"} + assert resp2.json()["cookies"] == {"a": "3"} + + +@pytest.mark.asyncio +async def test_proxy_gateway_timeout(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + assert MOCK_REPLICA_CLIENT_TIMEOUT < 10 + resp = await client.get("http://test-host/proxy/services/test-proj/httpbin/delay/10") + assert resp.status_code == 504 + assert resp.json()["detail"] == "Timed out requesting upstream" + + +@pytest.mark.asyncio +async def test_proxy_run_not_found(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await 
repo.set_service(make_service("test-proj", "test-run")) + _, client = make_app_client(repo) + resp = await client.get("http://test-host/proxy/services/test-proj/unknown/") + assert resp.status_code == 404 + assert resp.json()["detail"] == "Service test-proj/unknown not found" + + +@pytest.mark.asyncio +async def test_proxy_project_not_found(mock_replica_client_httpbin) -> None: + _, client = make_app_client(ProxyTestRepo()) + resp = await client.get("http://test-host/proxy/services/unknown/test-run/") + assert resp.status_code == 404 + assert resp.json()["detail"] == "Service unknown/test-run not found" + + +@pytest.mark.asyncio +async def test_redirect_to_service_root(mock_replica_client_httpbin) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + url = "http://test-host/proxy/services/test-proj/httpbin" + resp = await client.get(url, follow_redirects=False) + assert resp.status_code == 308 + assert resp.headers["Location"] == url + "/" + resp = await client.get(url, follow_redirects=True) + assert resp.status_code == 200 + assert resp.request.url == url + "/" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "response_headers", + [ + pytest.param( + { + "X-Custom-Header": "1", + "Server": "test", + "Date": "Mon, 11 May 2026 00:00:00 GMT", + }, + id="mixed-case", + ), + pytest.param( + { + "x-custom-header": "1", + "server": "test", + "date": "Mon, 11 May 2026 00:00:00 GMT", + }, + id="lower-case", + ), + ], +) +async def test_drop_uvicorn_headers( + mock_replica_client_httpbin, response_headers: dict[str, str] +) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin")) + _, client = make_app_client(repo) + resp = await client.post( + 
"http://test-host/proxy/services/test-proj/httpbin/response-headers", + params=response_headers, + ) + assert resp.status_code == 200 + assert "X-Custom-Header" in resp.headers + # These should be stripped by the proxy, as they are then set by uvicorn + assert "Server" not in resp.headers + assert "Date" not in resp.headers + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("token", "status"), [("correct-token", 200), ("incorrect-token", 403), ("", 403), (None, 403)] +) +async def test_auth(mock_replica_client_httpbin, token: Optional[str], status: int) -> None: + auth = ProxyTestAuthProvider({"test-proj": {"correct-token"}}) + repo = ProxyTestRepo() + await repo.set_project(make_project("test-proj")) + await repo.set_service(make_service("test-proj", "httpbin", auth=True)) + _, client = make_app_client(repo, auth) + headers = None + if token is not None: + headers = {"Authorization": f"Bearer {token}"} + url = "http://test-host/proxy/services/test-proj/httpbin/" + resp = await client.get(url, headers=headers) + assert resp.status_code == status + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("strip", "downstream_path", "upstream_path"), + [ + (True, "/proxy/services/my-proj/my-run/", "/"), + (True, "/proxy/services/my-proj/my-run/a/b", "/a/b"), + (False, "/proxy/services/my-proj/my-run/", "/proxy/services/my-proj/my-run/"), + (False, "/proxy/services/my-proj/my-run/a/b", "/proxy/services/my-proj/my-run/a/b"), + ], +) +async def test_strip_prefix( + mock_replica_client_path_reporter, strip: bool, downstream_path: str, upstream_path: str +) -> None: + repo = ProxyTestRepo() + await repo.set_project(make_project("my-proj")) + await repo.set_service(make_service("my-proj", "my-run", strip_prefix=strip)) + _, client = make_app_client(repo) + resp = await client.get(f"http://test-host{downstream_path}") + assert resp.status_code == 200 + assert resp.text == upstream_path diff 
--git a/src/tests/_internal/server/services/requirements/__init__.py b/src/tests/_internal/server/services/requirements/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/requirements/test_combine.py b/src/tests/_internal/server/services/requirements/test_combine.py new file mode 100644 index 0000000000..3680161e83 --- /dev/null +++ b/src/tests/_internal/server/services/requirements/test_combine.py @@ -0,0 +1,499 @@ +from typing import Optional + +import gpuhunt +import pytest + +from dstack._internal.core.backends.vastai.profile_options import ( + VastAIOfferOrder, + VastAIProfileOptions, +) +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.resources import ( + ComputeCapability, + CPUSpec, + DiskSpec, + GPUSpec, + Memory, + Range, + ResourcesSpec, +) +from dstack._internal.core.models.runs import Requirements +from dstack._internal.server.services.requirements.combine import ( + CombineError, + Profile, + _combine_backend_options_optional, + _combine_cpu, + _combine_gpu_optional, + _combine_idle_duration_optional, + _combine_resources, + _combine_spot_policy_optional, + _intersect_lists_optional, + combine_fleet_and_run_profiles, + combine_fleet_and_run_requirements, +) + + +class TestCombineFleetAndRunProfiles: + def test_returns_the_same_profile_if_profiles_identical(self): + profile = Profile( + backends=[BackendType.AWS], + regions=["us-west2"], + availability_zones=None, + instance_types=None, + reservation="r-12345", + spot_policy=SpotPolicy.AUTO, + idle_duration=3600, + tags={"tag": "value"}, + ) + assert combine_fleet_and_run_profiles(profile, profile) == profile + + def test_prefers_finite_idle_duration_over_off(self): + combined_profile = combine_fleet_and_run_profiles( + Profile(idle_duration=300), + Profile(idle_duration=-1), + ) + + assert combined_profile is not None + assert 
combined_profile.idle_duration == 300 + + @pytest.mark.parametrize( + argnames=["fleet_profile", "run_profile", "expected_profile"], + argvalues=[ + pytest.param( + Profile(), + Profile(), + Profile(), + id="empty_profile", + ), + pytest.param( + Profile( + backends=[BackendType.AWS, BackendType.GCP], + regions=["eu-west1", "europe-west-4"], + instance_types=["instance1"], + reservation="r-1", + spot_policy=SpotPolicy.AUTO, + idle_duration=3600, + tags={"tag1": "value1"}, + ), + Profile( + backends=[BackendType.GCP, BackendType.RUNPOD], + regions=["eu-west2", "europe-west-4"], + instance_types=["instance2"], + reservation="r-1", + spot_policy=SpotPolicy.SPOT, + idle_duration=7200, + tags={"tag2": "value2"}, + ), + Profile( + backends=[BackendType.GCP], + regions=["europe-west-4"], + instance_types=[], + reservation="r-1", + spot_policy=SpotPolicy.SPOT, + idle_duration=3600, + tags={"tag1": "value1", "tag2": "value2"}, + ), + id="compatible_profiles", + ), + pytest.param( + Profile( + spot_policy=SpotPolicy.SPOT, + ), + Profile( + spot_policy=SpotPolicy.ONDEMAND, + ), + None, + id="incompatible_profiles", + ), + pytest.param( + Profile(backend_options=[VastAIProfileOptions(min_score=100)]), + Profile(backend_options=[VastAIProfileOptions(min_score=400)]), + Profile(backend_options=[VastAIProfileOptions(min_score=400)]), + id="backend_options_compatible", + ), + pytest.param( + Profile( + backend_options=[VastAIProfileOptions(offer_order=VastAIOfferOrder.PRICE)] + ), + Profile( + backend_options=[VastAIProfileOptions(offer_order=VastAIOfferOrder.SCORE)] + ), + None, + id="backend_options_incompatible", + ), + ], + ) + def test_combines_profiles( + self, + fleet_profile: Profile, + run_profile: Profile, + expected_profile: Optional[Profile], + ): + assert combine_fleet_and_run_profiles(fleet_profile, run_profile) == expected_profile + + +class TestCombineFleetAndRunRequirements: + def test_returns_the_same_requirements_if_requirements_identical(self): + requirements = 
Requirements( + resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=2, max=None))), + max_price=100, + spot=False, + reservation="r-1", + ) + assert combine_fleet_and_run_requirements(requirements, requirements) == requirements + + @pytest.mark.parametrize( + argnames=["fleet_requirements", "run_requirements", "expected_requirements"], + argvalues=[ + pytest.param( + Requirements( + resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=1, max=3))), + max_price=100, + spot=False, + ), + Requirements( + resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=3, max=4))), + max_price=50, + spot=None, + ), + Requirements( + resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=3, max=3))), + max_price=50, + spot=False, + ), + id="compatible_requirements", + ), + pytest.param( + Requirements( + resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=1, max=2))), + ), + Requirements(resources=ResourcesSpec(gpu=GPUSpec(count=Range(min=3, max=4)))), + None, + id="incompatible_requirements", + ), + pytest.param( + Requirements( + resources=ResourcesSpec(), + backend_options=[VastAIProfileOptions(min_score=100)], + ), + Requirements( + resources=ResourcesSpec(), + backend_options=[VastAIProfileOptions(min_score=400)], + ), + Requirements( + resources=ResourcesSpec(), + backend_options=[VastAIProfileOptions(min_score=400)], + ), + id="backend_options_compatible", + ), + pytest.param( + Requirements( + resources=ResourcesSpec(), + backend_options=[VastAIProfileOptions(offer_order=VastAIOfferOrder.PRICE)], + ), + Requirements( + resources=ResourcesSpec(), + backend_options=[VastAIProfileOptions(offer_order=VastAIOfferOrder.SCORE)], + ), + None, + id="backend_options_incompatible", + ), + ], + ) + def test_combines_requirements( + self, + fleet_requirements: Requirements, + run_requirements: Requirements, + expected_requirements: Optional[Requirements], + ): + assert ( + combine_fleet_and_run_requirements(fleet_requirements, run_requirements) + == expected_requirements + ) + + def 
test_unconstrained_fleet_resources_pass_through_run_requirements(self): + unconstrained_fleet = Requirements( + resources=ResourcesSpec.unconstrained(), + ) + run = Requirements( + resources=ResourcesSpec( + cpu=CPUSpec(count=Range(min=2, max=None)), + memory=Range(min=Memory.parse("2GB"), max=None), + gpu=GPUSpec(count=Range(min=1, max=None)), + disk=DiskSpec(size=Range(min=Memory.parse("50GB"), max=None)), + ), + ) + result = combine_fleet_and_run_requirements(unconstrained_fleet, run) + assert result is not None + combined_cpu = result.resources.cpu + assert isinstance(combined_cpu, CPUSpec) + assert combined_cpu.count.min == 2 + assert result.resources.memory.min == Memory.parse("2GB") + assert result.resources.gpu is not None + assert result.resources.gpu.count.min == 1 + assert result.resources.disk is not None + assert result.resources.disk.size.min == Memory.parse("50GB") + + +class TestIntersectLists: + def test_both_none_returns_none(self): + assert _intersect_lists_optional(None, None) is None + + def test_first_none_returns_copy_of_second(self): + list2 = ["a", "b", "c"] + result = _intersect_lists_optional(None, list2) + assert result == list2 + assert result is not list2 # Should be a copy + + def test_second_none_returns_copy_of_first(self): + list1 = ["x", "y", "z"] + result = _intersect_lists_optional(list1, None) + assert result == list1 + assert result is not list1 # Should be a copy + + def test_intersection_of_overlapping_lists(self): + list1 = ["a", "b", "c", "d"] + list2 = ["b", "c", "e", "f"] + result = _intersect_lists_optional(list1, list2) + assert result == ["b", "c"] + + def test_intersection_of_non_overlapping_lists(self): + list1 = ["a", "b"] + list2 = ["c", "d"] + result = _intersect_lists_optional(list1, list2) + assert result == [] + + def test_intersection_preserves_order_from_first_list(self): + list1 = ["c", "a", "b"] + list2 = ["a", "b", "c"] + result = _intersect_lists_optional(list1, list2) + assert result == ["c", "a", "b"] 
+ + def test_intersection_with_duplicates(self): + list1 = ["a", "b", "a", "c"] + list2 = ["a", "c", "d"] + result = _intersect_lists_optional(list1, list2) + assert result == ["a", "a", "c"] + + +class TestCombineIdleDuration: + def test_both_none_returns_none(self): + assert _combine_idle_duration_optional(None, None) is None + + def test_first_none_returns_second(self): + assert _combine_idle_duration_optional(None, 3600) == 3600 + + def test_second_none_returns_first(self): + assert _combine_idle_duration_optional(7200, None) == 7200 + + def test_both_positive_returns_minimum(self): + assert _combine_idle_duration_optional(3600, 7200) == 3600 + assert _combine_idle_duration_optional(7200, 3600) == 3600 + + def test_both_negative_returns_minimum(self): + assert _combine_idle_duration_optional(-1, -2) == -2 + assert _combine_idle_duration_optional(-2, -1) == -2 + + def test_both_zero_returns_zero(self): + assert _combine_idle_duration_optional(0, 0) == 0 + + def test_positive_and_negative_returns_positive(self): + assert _combine_idle_duration_optional(3600, -1) == 3600 + + def test_negative_and_positive_returns_positive(self): + assert _combine_idle_duration_optional(-1, 3600) == 3600 + + def test_zero_and_positive_returns_zero(self): + assert _combine_idle_duration_optional(0, 3600) == 0 + assert _combine_idle_duration_optional(3600, 0) == 0 + + def test_zero_and_negative_returns_zero(self): + assert _combine_idle_duration_optional(0, -1) == 0 + assert _combine_idle_duration_optional(-1, 0) == 0 + + +class TestCombineSpotPolicy: + def test_both_none_returns_none(self): + assert _combine_spot_policy_optional(None, None) is None + + def test_first_none_returns_second(self): + assert _combine_spot_policy_optional(None, SpotPolicy.SPOT) == SpotPolicy.SPOT + assert _combine_spot_policy_optional(None, SpotPolicy.ONDEMAND) == SpotPolicy.ONDEMAND + assert _combine_spot_policy_optional(None, SpotPolicy.AUTO) == SpotPolicy.AUTO + + def 
test_second_none_returns_first(self): + assert _combine_spot_policy_optional(SpotPolicy.SPOT, None) == SpotPolicy.SPOT + assert _combine_spot_policy_optional(SpotPolicy.ONDEMAND, None) == SpotPolicy.ONDEMAND + assert _combine_spot_policy_optional(SpotPolicy.AUTO, None) == SpotPolicy.AUTO + + def test_auto_with_other_returns_other(self): + assert _combine_spot_policy_optional(SpotPolicy.AUTO, SpotPolicy.SPOT) == SpotPolicy.SPOT + assert ( + _combine_spot_policy_optional(SpotPolicy.AUTO, SpotPolicy.ONDEMAND) + == SpotPolicy.ONDEMAND + ) + assert _combine_spot_policy_optional(SpotPolicy.SPOT, SpotPolicy.AUTO) == SpotPolicy.SPOT + assert ( + _combine_spot_policy_optional(SpotPolicy.ONDEMAND, SpotPolicy.AUTO) + == SpotPolicy.ONDEMAND + ) + + def test_auto_with_auto_returns_auto(self): + assert _combine_spot_policy_optional(SpotPolicy.AUTO, SpotPolicy.AUTO) == SpotPolicy.AUTO + + def test_same_non_auto_values_return_same(self): + assert _combine_spot_policy_optional(SpotPolicy.SPOT, SpotPolicy.SPOT) == SpotPolicy.SPOT + assert ( + _combine_spot_policy_optional(SpotPolicy.ONDEMAND, SpotPolicy.ONDEMAND) + == SpotPolicy.ONDEMAND + ) + + def test_different_non_auto_values_raise_error(self): + with pytest.raises(CombineError): + _combine_spot_policy_optional(SpotPolicy.SPOT, SpotPolicy.ONDEMAND) + with pytest.raises(CombineError): + _combine_spot_policy_optional(SpotPolicy.ONDEMAND, SpotPolicy.SPOT) + + +class TestCombineResources: + def test_combines_all_resource_specs(self): + resources1 = ResourcesSpec( + cpu=CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=2, max=8)), + memory=Range(min=Memory(4), max=Memory(16)), + shm_size=Memory(2), + gpu=GPUSpec(vendor=gpuhunt.AcceleratorVendor.NVIDIA), + disk=DiskSpec(size=Range(min=Memory(100), max=Memory(500))), + ) + resources2 = ResourcesSpec( + cpu=CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=4, max=6)), + memory=Range(min=Memory(8), max=Memory(12)), + shm_size=Memory(1), + 
gpu=GPUSpec(vendor=gpuhunt.AcceleratorVendor.NVIDIA), + disk=DiskSpec(size=Range(min=Memory(100), max=Memory(400))), + ) + result = _combine_resources(resources1, resources2) + expected = ResourcesSpec( + cpu=CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=4, max=6)), + memory=Range(min=Memory(8), max=Memory(12)), + shm_size=Memory(1), + gpu=GPUSpec(vendor=gpuhunt.AcceleratorVendor.NVIDIA), + disk=DiskSpec(size=Range(min=Memory(100), max=Memory(400))), + ) + assert result == expected + + +class TestCombineCpu: + def test_combines_compatible_cpu_specs(self): + cpu1 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=2, max=8)) + cpu2 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=4, max=6)) + result = _combine_cpu(cpu1, cpu2) + expected = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=4, max=6)) + assert result == expected + + def test_incompatible_architectures_raises_error(self): + cpu1 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=2, max=4)) + cpu2 = CPUSpec(arch=gpuhunt.CPUArchitecture.ARM, count=Range(min=2, max=4)) + with pytest.raises(CombineError): + _combine_cpu(cpu1, cpu2) + + def test_non_overlapping_count_ranges_raises_error(self): + cpu1 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=1, max=2)) + cpu2 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=4, max=6)) + with pytest.raises(CombineError): + _combine_cpu(cpu1, cpu2) + + def test_handles_none_architecture(self): + cpu1 = CPUSpec(arch=None, count=Range(min=2, max=4)) + cpu2 = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=2, max=4)) + result = _combine_cpu(cpu1, cpu2) + expected = CPUSpec(arch=gpuhunt.CPUArchitecture.X86, count=Range(min=2, max=4)) + assert result == expected + + def test_both_none_architecture(self): + cpu1 = CPUSpec(arch=None, count=Range(min=2, max=4)) + cpu2 = CPUSpec(arch=None, count=Range(min=3, max=5)) + result = _combine_cpu(cpu1, cpu2) + expected = CPUSpec(arch=None, 
class TestCombineGpu:
    """Exercises `_combine_gpu_optional` with missing, compatible and conflicting GPU specs."""

    def test_both_none_returns_none(self):
        # With no spec on either side there is nothing to combine.
        combined = _combine_gpu_optional(None, None)
        assert combined is None

    def test_first_none_returns_copy_of_second(self):
        only_spec = GPUSpec(count=Range(min=1, max=2))
        combined = _combine_gpu_optional(None, only_spec)
        assert combined == only_spec
        # Equal by value, yet a distinct object: the input must have been copied.
        assert combined is not only_spec

    def test_second_none_returns_copy_of_first(self):
        only_spec = GPUSpec(count=Range(min=2, max=4))
        combined = _combine_gpu_optional(only_spec, None)
        assert combined == only_spec
        # Equal by value, yet a distinct object: the input must have been copied.
        assert combined is not only_spec

    def test_combines_compatible_gpu_specs(self):
        nvidia = gpuhunt.AcceleratorVendor.NVIDIA
        left = GPUSpec(
            vendor=nvidia,
            name=["A100", "V100"],
            count=Range(min=1, max=4),
            memory=Range(min=Memory(8), max=Memory(32)),
            compute_capability=ComputeCapability((7, 0)),
        )
        right = GPUSpec(
            vendor=nvidia,
            name=["V100", "T4"],
            count=Range(min=2, max=3),
            memory=Range(min=Memory(16), max=Memory(24)),
            compute_capability=ComputeCapability((7, 8)),
        )

        combined = _combine_gpu_optional(left, right)

        # Expected: the single shared name, the overlapping count and memory
        # ranges, and compute capability (7, 0).
        expected = GPUSpec(
            vendor=nvidia,
            name=["V100"],
            count=Range(min=2, max=3),
            memory=Range(min=Memory(16), max=Memory(24)),
            compute_capability=ComputeCapability((7, 0)),
        )
        assert combined == expected

    def test_incompatible_vendors_raises_error(self):
        same_count = dict(min=1, max=2)
        left = GPUSpec(vendor=gpuhunt.AcceleratorVendor.NVIDIA, count=Range(**same_count))
        right = GPUSpec(vendor=gpuhunt.AcceleratorVendor.AMD, count=Range(**same_count))
        with pytest.raises(CombineError):
            _combine_gpu_optional(left, right)

    def test_non_overlapping_count_ranges_raises_error(self):
        # Counts 1..2 and 4..6 share no value.
        left = GPUSpec(count=Range(min=1, max=2))
        right = GPUSpec(count=Range(min=4, max=6))
        with pytest.raises(CombineError):
            _combine_gpu_optional(left, right)

    def test_non_overlapping_memory_ranges_raises_error(self):
        # Same count on both sides; only the memory ranges (8..16 vs 32..64) are disjoint.
        left = GPUSpec(
            count=Range(min=1, max=2),
            memory=Range(min=Memory(8), max=Memory(16)),
        )
        right = GPUSpec(
            count=Range(min=1, max=2),
            memory=Range(min=Memory(32), max=Memory(64)),
        )
        with pytest.raises(CombineError):
            _combine_gpu_optional(left, right)
class BaseShimClientTest:
    """Shared fixtures and helpers for the ShimClient test classes in this module."""

    @pytest.fixture
    def adapter(self) -> Generator[requests_mock.Adapter, None, None]:
        """Yield a `requests_mock.Adapter` while a `Mocker` built on it is active.

        The `Mocker` context is exited once the test using the fixture finishes.
        """
        adapter = requests_mock.Adapter()
        with requests_mock.Mocker(adapter=adapter):
            yield adapter

    @pytest.fixture
    def client(self, request: pytest.FixtureRequest, adapter: requests_mock.Adapter) -> ShimClient:
        """Return a `ShimClient` for localhost on `DSTACK_SHIM_HTTP_PORT`.

        When the requesting test (or an enclosing scope) carries a
        `shim_version` marker, a `GET /api/healthcheck` response reporting the
        marker's first argument as the version is registered on `adapter`
        before the client is created.
        """
        shim_version_marker = request.node.get_closest_marker("shim_version")
        if shim_version_marker is not None:
            healthcheck_resp = {"service": "dstack-shim", "version": shim_version_marker.args[0]}
            adapter.register_uri("GET", "/api/healthcheck", json=healthcheck_resp)
        return ShimClient(port=DSTACK_SHIM_HTTP_PORT, hostname="localhost")

    def assert_request(
        self,
        adapter: requests_mock.Adapter,
        index: int,
        method: str,
        path: str,
        json: Optional[dict] = None,
    ):
        """Assert that the `index`-th recorded request matches the expectation.

        Args:
            adapter: the adapter whose `request_history` is inspected.
            index: zero-based position of the request in the history.
            method: expected HTTP method.
            path: expected request path.
            json: expected JSON body; the body is not checked when `None`.
        """
        history = adapter.request_history
        # Fail with a readable message instead of an IndexError.
        assert index < len(history), "index out of history bounds"
        req = history[index]
        assert req.method == method
        assert req.path == path
        if json is not None:
            assert req.json() == json
class TestShimClientRaiseForStatus(BaseShimClientTest):
    """Checks the error raised by `ShimClient._raise_for_status` for a 502 response."""

    def test(self, client: ShimClient, adapter: requests_mock.Adapter):
        expected_prefix = "502 Server Error: Bad Gateway"
        adapter.register_uri("GET", "/test/path", status_code=502, reason="Bad Gateway")
        bad_gateway = client._request("GET", "/test/path")

        with pytest.raises(ShimHTTPError) as raised:
            client._raise_for_status(bad_gateway)

        error = raised.value
        assert error.status_code == 502
        # Both the `message` attribute and the string form start with the same text.
        assert error.message.startswith(expected_prefix)
        assert str(error).startswith(expected_prefix)
        assert repr(error) == "ShimHTTPError(502)"
def test_submit(self, client: ShimClient, adapter: requests_mock.Adapter):
    """`submit()` must POST the expected payload to /api/submit and return True."""
    adapter.register_uri("POST", "/api/submit", json={"state": "pulling"})
    attachment = VolumeAttachment(
        instance=VolumeInstance(name="instance", instance_num=0, instance_id="i-1"),
        attachment_data=VolumeAttachmentData(device_name="/dev/sdv"),
    )
    gcp_volume = get_volume(
        name="vol",
        volume_id="vol-id",
        configuration=get_volume_configuration(backend=BackendType.GCP),
        external=False,
        attachments=[attachment],
    )

    accepted = client.submit(
        username="",
        password="",
        image_name="debian",
        privileged=False,
        container_name="test-0-0",
        container_user="root",
        shm_size=None,
        public_keys=["project_key", "user_key"],
        ssh_user="dstack",
        ssh_key="host_key",
        mounts=[VolumeMountPoint(name="vol", path="/vol")],
        volumes=[gcp_volume],
        instance_mounts=[InstanceMountPoint(instance_path="/mnt/nfs/home", path="/home")],
        instance_id="i-1",
    )

    assert accepted is True
    assert adapter.call_count == 1
    # Expected body: `shm_size=None` is sent as 0, the volume carries the
    # device name of its attachment, and `instance_id` is not a payload key.
    volume_payload = {
        "backend": "gcp",
        "name": "vol",
        "volume_id": "vol-id",
        "init_fs": True,
        "device_name": "/dev/sdv",
    }
    instance_mount_payload = {
        "instance_path": "/mnt/nfs/home",
        "path": "/home",
        "optional": False,
    }
    expected_payload = {
        "username": "",
        "password": "",
        "image_name": "debian",
        "privileged": False,
        "container_name": "test-0-0",
        "container_user": "root",
        "shm_size": 0,
        "public_keys": ["project_key", "user_key"],
        "ssh_user": "dstack",
        "ssh_key": "host_key",
        "mounts": [{"name": "vol", "path": "/vol"}],
        "volumes": [volume_payload],
        "instance_mounts": [instance_mount_payload],
    }
    self.assert_request(adapter, 0, "POST", "/api/submit", expected_payload)
def test_pull(self, client: ShimClient, adapter: requests_mock.Adapter):
    """`pull()` must return the registered body as a `LegacyPullResponse` after one GET."""
    failure = {"reason": "CONTAINER_EXITED_WITH_ERROR", "reason_message": "killed"}
    adapter.register_uri(
        "GET",
        "/api/pull",
        json={"state": "pending", "result": failure},
    )

    pulled = client.pull()

    expected = LegacyPullResponse(
        state="pending",
        result=JobResult(reason="CONTAINER_EXITED_WITH_ERROR", reason_message="killed"),
    )
    assert pulled == expected
    assert adapter.call_count == 1
    self.assert_request(adapter, 0, "GET", "/api/pull")
"/api/healthcheck") + # healthcheck() method also performs negotiation to save API calls + assert client._shim_version_tuple == (0, 18, 40) + assert client._api_version == 2 + + def test_is_safe_to_restart_false_old_shim( + self, client: ShimClient, adapter: requests_mock.Adapter + ): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + # pre-0.19.26 shim returns ids instead of tasks + "tasks": None, + "ids": [], + }, + ) + + res = client.is_safe_to_restart() + + assert res is False + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + + @pytest.mark.parametrize( + "task_status", + [ + TaskStatus.PENDING, + TaskStatus.PREPARING, + TaskStatus.PULLING, + TaskStatus.CREATING, + TaskStatus.RUNNING, + ], + ) + def test_is_safe_to_restart_false_status_not_safe( + self, client: ShimClient, adapter: requests_mock.Adapter, task_status: TaskStatus + ): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + "tasks": [ + { + "id": str(uuid.uuid4()), + "status": "terminated", + }, + { + "id": str(uuid.uuid4()), + "status": task_status.value, + }, + ], + "ids": None, + }, + ) + + res = client.is_safe_to_restart() + + assert res is False + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + + def test_is_safe_to_restart_true(self, client: ShimClient, adapter: requests_mock.Adapter): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + "tasks": [ + { + "id": str(uuid.uuid4()), + "status": "terminated", + }, + { + "id": str(uuid.uuid4()), + # TODO: replace with "running" once it's safe + "status": "terminated", + }, + ], + "ids": None, + }, + ) + + res = client.is_safe_to_restart() + + assert res is True + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + + def 
def test_submit_task(self, client: ShimClient, adapter: requests_mock.Adapter):
    """`submit_task()` must POST the expected payload to /api/tasks after a healthcheck."""
    task_id_text = "c514f4ee-dfe7-472c-99a3-047178aafb5b"
    adapter.register_uri("POST", "/api/tasks", status_code=200)
    attachment = VolumeAttachment(
        instance=VolumeInstance(name="instance", instance_num=0, instance_id="i-1"),
        attachment_data=VolumeAttachmentData(device_name="/dev/sdv"),
    )
    gcp_volume = get_volume(
        name="vol",
        volume_id="vol-id",
        configuration=get_volume_configuration(backend=BackendType.GCP),
        external=False,
        attachments=[attachment],
    )

    client.submit_task(
        task_id=uuid.UUID(task_id_text),
        name="test-0-0",
        registry_username="user",
        registry_password="pass",
        image_name="debian",
        container_user="root",
        privileged=True,
        gpu=1,
        cpu=4.0,
        memory=Memory.parse("16GB"),
        shm_size=Memory.parse("1GB"),
        network_mode=NetworkMode.BRIDGE,
        volumes=[gcp_volume],
        volume_mounts=[VolumeMountPoint(name="vol", path="/vol")],
        instance_mounts=[InstanceMountPoint(instance_path="/mnt/nfs/home", path="/home")],
        gpu_devices=[],
        host_ssh_user="dstack",
        host_ssh_keys=["host_key"],
        container_ssh_keys=["project_key", "user_key"],
        instance_id="i-1",
    )

    # Two calls are expected: the healthcheck first, then the submission itself.
    assert adapter.call_count == 2
    self.assert_request(adapter, 0, "GET", "/api/healthcheck")
    # Expected body: "16GB" and "1GB" appear as 17179869184 and 1073741824,
    # and `instance_id` is not a payload key.
    volume_payload = {
        "backend": "gcp",
        "name": "vol",
        "volume_id": "vol-id",
        "init_fs": True,
        "device_name": "/dev/sdv",
    }
    instance_mount_payload = {
        "instance_path": "/mnt/nfs/home",
        "path": "/home",
        "optional": False,
    }
    expected_payload = {
        "id": task_id_text,
        "name": "test-0-0",
        "registry_username": "user",
        "registry_password": "pass",
        "image_name": "debian",
        "container_user": "root",
        "privileged": True,
        "gpu": 1,
        "cpu": 4.0,
        "memory": 17179869184,
        "shm_size": 1073741824,
        "network_mode": "bridge",
        "volumes": [volume_payload],
        "volume_mounts": [{"name": "vol", "path": "/vol"}],
        "instance_mounts": [instance_mount_payload],
        "gpu_devices": [],
        "host_ssh_user": "dstack",
        "host_ssh_keys": ["host_key"],
        "container_ssh_keys": ["project_key", "user_key"],
    }
    self.assert_request(adapter, 1, "POST", "/api/tasks", expected_payload)
class TestParseVersion:
    """Checks which version strings `_parse_version` turns into a tuple and which give None."""

    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            ("1.12", (1, 12, 0)),
            ("1.12.3", (1, 12, 3)),
            ("1.12.3.1", (1, 12, 3)),
            # local builds are OK
            ("1.12.3+build.1", (1, 12, 3)),
        ],
    )
    def test_valid_final(self, value: str, expected: tuple[int, int, int]):
        parsed = _parse_version(value)
        assert parsed == expected

    @pytest.mark.parametrize("value", ["1.12alpha1", "1.12.3rc1", "1.12.3.dev0"])
    def test_valid_pre_dev_local(self, value: str):
        parsed = _parse_version(value)
        assert parsed is None

    @pytest.mark.parametrize("value", ["1", "1234"])
    def test_valid_major_only(self, value: str):
        parsed = _parse_version(value)
        assert parsed is None

    @pytest.mark.parametrize("value", ["", "foo", "1.12.3-next.20241231"])
    def test_invalid(self, value: str):
        parsed = _parse_version(value)
        assert parsed is None
class TestFreezeOfferIdentityValue:
    """Checks that offer identity values do not depend on key or set element order."""

    def test_normalizes_nested_mappings_and_sets(self) -> None:
        # Both payloads hold the same data; only the ordering of dict keys and
        # set elements differs between them.
        payload_a = {
            "b": [1, {"y": InstanceAvailability.IDLE, "x": {3, 2}}],
            "a": ("z", None),
        }
        payload_b = {
            "a": ("z", None),
            "b": [1, {"x": {2, 3}, "y": InstanceAvailability.IDLE}],
        }

        frozen_a = _freeze_offer_identity_value(payload_a)
        frozen_b = _freeze_offer_identity_value(payload_b)

        assert frozen_a == frozen_b
        assert hash(frozen_a) == hash(frozen_b)

    def test_get_backend_offer_identity_uses_full_offer_payload(self) -> None:
        zones = ["us-east-1b", "us-east-1a"]
        base = get_instance_offer_with_availability(availability=InstanceAvailability.UNKNOWN)
        base.backend_data = {
            "region_hint": {"b": 2, "a": 1},
            "azs": list(zones),
        }

        # Same backend data as `base`, written with a different key order.
        reordered = copy.deepcopy(base)
        reordered.backend_data = {
            "azs": list(zones),
            "region_hint": {"a": 1, "b": 2},
        }

        # One nested value differs: region_hint["a"] is 3 instead of 1.
        altered = copy.deepcopy(base)
        altered.backend_data = {
            "azs": list(zones),
            "region_hint": {"a": 3, "b": 2},
        }

        assert _get_backend_offer_identity(base) == _get_backend_offer_identity(reordered)
        assert _get_backend_offer_identity(base) != _get_backend_offer_identity(altered)
@pytest.mark.asyncio
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
async def test_returns_only_selected_instance(self, test_db, session: AsyncSession) -> None:
    """Only the instance named by the profile's name selector is returned."""
    owner = await create_user(session=session)
    project = await create_project(session=session, owner=owner)
    repo = await create_repo(session=session, project_id=project.id)
    fleet = await create_fleet(session=session, project=project)
    # A sibling instance in the same fleet that the selector does not name.
    await create_instance(session=session, project=project, fleet=fleet, name="worker-0")
    target = await create_instance(
        session=session,
        project=project,
        fleet=fleet,
        name="worker-1",
    )
    task = TaskConfiguration(image="debian", commands=["echo"])
    targeting_profile = Profile(instances=[InstanceNameSelector(name="worker-1")])
    run_spec = get_run_spec(repo_id=repo.name, configuration=task, profile=targeting_profile)
    jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0)

    offers = await get_targeted_instance_offers(
        session=session,
        project=project,
        run_spec=run_spec,
        job=jobs[0],
        volumes=None,
        exclude_not_available=True,
    )

    matched = [inst for inst, _offer in offers]
    assert matched == [target]
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_selected_instance_by_hostname( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-0", + remote_connection_info=get_remote_connection_info(host="192.168.1.10"), + ) + selected = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-1", + remote_connection_info=get_remote_connection_info(host="192.168.1.11"), + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", commands=["echo"]), + profile=Profile(instances=[InstanceHostnameSelector(hostname="192.168.1.11")]), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert [instance for instance, _ in offers] == [selected] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_selected_instance_from_imported_fleet_reference( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user, name="importer-project") + exporter_project = await create_project( + session=session, owner=user, name="exporter-project" + ) + repo = await create_repo(session=session, project_id=project.id) + local_fleet = await create_fleet(session=session, project=project, name="same-fleet") + exported_fleet = await create_fleet( + session=session, 
project=exporter_project, name="same-fleet" + ) + await create_instance( + session=session, + project=project, + fleet=local_fleet, + instance_num=1, + name="local-worker", + ) + selected = await create_instance( + session=session, + project=exporter_project, + fleet=exported_fleet, + instance_num=1, + name="exported-worker", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[project], + exported_fleets=[exported_fleet], + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", commands=["echo"]), + profile=Profile( + instances=[ + FleetInstanceSelector( + fleet=EntityReference.parse("exporter-project/same-fleet"), + instance=1, + ) + ] + ), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert [instance for instance, _ in offers] == [selected] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_shared_block_offer_for_selected_instance( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + name="shared-worker", + total_blocks=2, + busy_blocks=1, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + image="debian", + commands=["echo"], + resources=ResourcesSpec( + cpu=CPUSpec.parse("1"), + memory=Range[Memory](min=Memory.parse("1GB"), max=None), + gpu=None, + ), + ), + 
profile=Profile(instances=[InstanceNameSelector(name="shared-worker")]), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert [selected for selected, _ in offers] == [instance] + assert offers[0][1].blocks == 1 + assert offers[0][1].total_blocks == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_multinode_does_not_count_blocks_as_nodes( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_instance( + session=session, + project=project, + fleet=fleet, + name="shared-worker", + backend=BackendType.AWS, + total_blocks=2, + busy_blocks=0, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2, commands=["echo"]), + profile=Profile(instances=[InstanceNameSelector(name="shared-worker")]), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert offers == [] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_multinode_returns_full_host_offer_per_selected_shared_instance( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await 
create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + selected_1 = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-0", + backend=BackendType.REMOTE, + total_blocks=2, + busy_blocks=0, + ) + selected_2 = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-1", + backend=BackendType.REMOTE, + total_blocks=2, + busy_blocks=0, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration( + image="debian", + nodes=2, + commands=["echo"], + resources=ResourcesSpec( + cpu=CPUSpec.parse("1.."), + memory=Range[Memory](min=Memory.parse("1GB"), max=None), + gpu=None, + ), + ), + profile=Profile( + instances=[ + InstanceNameSelector(name="worker-0"), + InstanceNameSelector(name="worker-1"), + ] + ), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert [instance for instance, _ in offers] == [selected_1, selected_2] + assert [offer.blocks for _, offer in offers] == [2, 2] + assert [offer.total_blocks for _, offer in offers] == [2, 2] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_multinode_returns_selected_instances_in_same_cluster_fleet( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = 
InstanceGroupPlacement.CLUSTER + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + selected_1 = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-0", + backend=BackendType.AWS, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + selected_2 = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-1", + backend=BackendType.AWS, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2, commands=["echo"]), + profile=Profile( + instances=[ + InstanceNameSelector(name="worker-0"), + InstanceNameSelector(name="worker-1"), + ] + ), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert [instance for instance, _ in offers] == [selected_1, selected_2] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_multinode_requires_selected_instances_in_one_cluster_fleet( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_1 = await create_fleet(session=session, project=project, spec=fleet_spec) + fleet_2 = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_instance( + session=session, + project=project, + fleet=fleet_1, + name="worker-0", + backend=BackendType.AWS, + ) + await create_instance( + session=session, + project=project, + 
fleet=fleet_2, + name="worker-1", + backend=BackendType.AWS, + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2, commands=["echo"]), + profile=Profile( + instances=[ + InstanceNameSelector(name="worker-0"), + InstanceNameSelector(name="worker-1"), + ] + ), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + + offers = await get_targeted_instance_offers( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + exclude_not_available=True, + ) + + assert offers == [] + + +class TestGetBackendOffersInRunCandidateFleets: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_skips_backend_offers_when_instances_specified( + self, test_db, session: AsyncSession, monkeypatch: pytest.MonkeyPatch + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", commands=["echo"]), + profile=Profile(instances=[InstanceNameSelector(name="missing-instance")]), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + select_candidate_fleet_models_mock = AsyncMock() + monkeypatch.setattr( + "dstack._internal.server.services.runs.plan._select_candidate_fleet_models", + select_candidate_fleet_models_mock, + ) + + offers = await get_backend_offers_in_run_candidate_fleets( + session=session, + project=project, + run_spec=run_spec, + job=jobs[0], + volumes=None, + ) + + assert offers == [] + select_candidate_fleet_models_mock.assert_not_awaited() + + +class TestGetBackendOffersInFleet: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_keeps_unconstrained_offers_for_non_empty_cluster_fleet_without_elected_master( + self, test_db, session: AsyncSession, monkeypatch: pytest.MonkeyPatch + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=1, max=2) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await create_instance( + session=session, + project=project, + fleet=fleet, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + run_spec = get_run_spec( + repo_id=repo.name, + configuration=TaskConfiguration(image="debian", nodes=2), + ) + jobs = await get_jobs_from_run_spec(run_spec=run_spec, secrets={}, replica_num=0) + get_offers_by_requirements_mock = AsyncMock() + monkeypatch.setattr( + "dstack._internal.server.services.runs.plan.get_offers_by_requirements", + get_offers_by_requirements_mock, + ) + offer = get_instance_offer_with_availability() + backend = AsyncMock() + get_offers_by_requirements_mock.return_value = [(backend, offer)] + + offers = await _get_backend_offers_in_fleet( + project=project, + fleet_model=fleet, + run_spec=run_spec, + job=jobs[0], + volumes=None, + ) + + assert offers == [(backend, offer)] + get_offers_by_requirements_mock.assert_awaited_once() + assert ( + get_offers_by_requirements_mock.await_args.kwargs["master_job_provisioning_data"] + is None + ) diff --git a/src/tests/_internal/server/services/runs/test_router_worker_sync.py b/src/tests/_internal/server/services/runs/test_router_worker_sync.py new file mode 100644 index 0000000000..2cf0275632 --- /dev/null +++ b/src/tests/_internal/server/services/runs/test_router_worker_sync.py @@ -0,0 +1,231 @@ +from contextlib import asynccontextmanager, contextmanager +from 
unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from dstack._internal.server.services.runs.router_worker_sync import ( + _get_connection_mode_from_workers, + _get_grpc_worker, + _get_runtime_type_from_workers, + _get_worker, + _grpc_server_info_to_worker, +) + + +class TestGetConnectionModeFromWorkers: + def test_grpc(self): + current = [{"connection_mode": "grpc"}] + assert _get_connection_mode_from_workers(current) == "grpc" + + def test_http(self): + current = [{"connection_mode": "http"}] + assert _get_connection_mode_from_workers(current) == "http" + + def test_mixed(self): + current = [{"connection_mode": "grpc"}, {"connection_mode": "http"}] + assert _get_connection_mode_from_workers(current) is None + + +class TestRuntimeTypeFromRouterWorkers: + def test_vllm_grpc_workers(self): + current = [{"connection_mode": "grpc", "runtime_type": "vllm"}] + assert _get_runtime_type_from_workers(current) == "vllm" + + def test_sglang_grpc_workers(self): + current = [{"connection_mode": "grpc", "runtime_type": "sglang"}] + assert _get_runtime_type_from_workers(current) == "sglang" + + def test_ignores_http_workers(self): + current = [{"connection_mode": "http", "runtime_type": "sglang"}] + assert _get_runtime_type_from_workers(current) is None + + def test_mixed_runtimes(self): + current = [ + {"connection_mode": "grpc", "runtime_type": "vllm"}, + {"connection_mode": "grpc", "runtime_type": "sglang"}, + ] + assert _get_runtime_type_from_workers(current) is None + + +class TestGrpcServerInfoToWorker: + def test_vllm_prefill(self): + response = MagicMock(kv_role="kv_producer", kv_connector="NixlConnector") + worker = _grpc_server_info_to_worker("grpc://10.0.0.1:50051", "vllm", response) + assert worker["worker_type"] == "prefill" + assert worker.get("runtime_type") == "vllm" + assert worker.get("kv_role") == "kv_producer" + + def test_sglang_prefill(self): + server_args = MagicMock() + response = MagicMock(server_args=server_args) + with patch( + 
"dstack._internal.server.services.runs.router_worker_sync.MessageToDict", + return_value={ + "disaggregation_mode": "prefill", + "disaggregation_bootstrap_port": 8998, + }, + ): + worker = _grpc_server_info_to_worker("grpc://10.0.0.1:8000", "sglang", response) + assert worker == { + "url": "grpc://10.0.0.1:8000", + "worker_type": "prefill", + "connection_mode": "grpc", + "runtime_type": "sglang", + "bootstrap_port": 8998, + } + + +@contextmanager +def _fake_vllm_grpc_proto(*, server_info: MagicMock): + stub = MagicMock() + stub.GetServerInfo = AsyncMock(return_value=server_info) + pb2 = MagicMock(GetServerInfoRequest=MagicMock(return_value="req")) + pb2_grpc = MagicMock(VllmEngineStub=MagicMock(return_value=stub)) + with ( + patch( + "dstack._internal.server.services.runs.router_worker_sync.vllm_engine_pb2", + pb2, + ), + patch( + "dstack._internal.server.services.runs.router_worker_sync.vllm_engine_pb2_grpc", + pb2_grpc, + ), + ): + yield + + +@contextmanager +def _fake_sglang_grpc_proto(*, server_info: MagicMock): + stub = MagicMock() + stub.GetServerInfo = AsyncMock(return_value=server_info) + pb2 = MagicMock(GetServerInfoRequest=MagicMock(return_value="req")) + pb2_grpc = MagicMock(SglangSchedulerStub=MagicMock(return_value=stub)) + with ( + patch( + "dstack._internal.server.services.runs.router_worker_sync.sglang_scheduler_pb2", + pb2, + ), + patch( + "dstack._internal.server.services.runs.router_worker_sync.sglang_scheduler_pb2_grpc", + pb2_grpc, + ), + ): + yield + + +@pytest.mark.asyncio +async def test_get_grpc_worker_ready(): + job = MagicMock() + channel = MagicMock() + + @asynccontextmanager + async def _fake_grpc_client(_job): + yield channel + + server_info = MagicMock(kv_role="kv_producer", kv_connector="NixlConnector") + + with ( + _fake_vllm_grpc_proto(server_info=server_info), + patch( + "dstack._internal.server.services.runs.router_worker_sync.get_service_replica_grpc_client", + _fake_grpc_client, + ), + ): + result = await _get_grpc_worker( + 
job, + worker_url="grpc://10.0.0.1:50051", + runtime_type="vllm", + ) + + assert result["status"] == "ready" + assert result["worker"] == { + "url": "grpc://10.0.0.1:50051", + "worker_type": "prefill", + "connection_mode": "grpc", + "runtime_type": "vllm", + "kv_connector": "NixlConnector", + "kv_role": "kv_producer", + } + + +@pytest.mark.asyncio +async def test_get_grpc_worker_not_ready_on_error(): + job = MagicMock() + + @asynccontextmanager + async def _failing_client(_job): + raise OSError("ssh failed") + yield # pragma: no cover + + with patch( + "dstack._internal.server.services.runs.router_worker_sync.get_service_replica_grpc_client", + _failing_client, + ): + result = await _get_grpc_worker(job, worker_url="grpc://10.0.0.1:50051") + + assert result == {"status": "not_ready", "worker": None} + + +@pytest.mark.asyncio +async def test_get_grpc_worker_sglang_bootstrap(): + job = MagicMock() + channel = MagicMock() + sglang_server_info = MagicMock(server_args=MagicMock()) + + @asynccontextmanager + async def _fake_grpc_client(_job): + yield channel + + with ( + _fake_sglang_grpc_proto(server_info=sglang_server_info), + patch( + "dstack._internal.server.services.runs.router_worker_sync.MessageToDict", + return_value={ + "disaggregation_mode": "prefill", + "disaggregation_bootstrap_port": 8998, + }, + ), + patch( + "dstack._internal.server.services.runs.router_worker_sync" + ".get_service_replica_grpc_client", + _fake_grpc_client, + ), + ): + result = await _get_grpc_worker(job, worker_url="grpc://10.0.0.1:8000") + + assert result["status"] == "ready" + assert result["worker"] == { + "url": "grpc://10.0.0.1:8000", + "worker_type": "prefill", + "connection_mode": "grpc", + "runtime_type": "sglang", + "bootstrap_port": 8998, + } + + +@pytest.mark.asyncio +async def test_get_worker_grpc_preference_skips_http(): + job = MagicMock() + grpc_not_ready = {"status": "not_ready", "worker": None} + + with ( + patch( + 
"dstack._internal.server.services.runs.router_worker_sync._get_grpc_worker", + new_callable=AsyncMock, + return_value=grpc_not_ready, + ) as grpc_mock, + patch( + "dstack._internal.server.services.runs.router_worker_sync._get_http_worker", + new_callable=AsyncMock, + ) as http_mock, + ): + result = await _get_worker( + job, + http_worker_url="http://10.0.0.1:8000", + grpc_worker_url="grpc://10.0.0.1:8000", + connection_mode="grpc", + ) + + assert result == grpc_not_ready + grpc_mock.assert_awaited_once() + http_mock.assert_not_awaited() diff --git a/src/tests/_internal/server/services/runs/test_runs.py b/src/tests/_internal/server/services/runs/test_runs.py new file mode 100644 index 0000000000..6f71145141 --- /dev/null +++ b/src/tests/_internal/server/services/runs/test_runs.py @@ -0,0 +1,48 @@ +import pytest + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.server.services.jobs import check_can_attach_job_volumes +from dstack._internal.server.testing.common import ( + get_volume, +) + + +class TestCanAttachRunVolumes: + @pytest.mark.asyncio + async def test_can_attach(self): + vol11 = get_volume(name="vol11") + vol11.configuration.backend = BackendType.AWS + vol11.configuration.region = "eu-west-1" + vol12 = get_volume(name="vol12") + vol12.configuration.backend = BackendType.AWS + vol12.configuration.region = "eu-west-2" + vol21 = get_volume(name="vol21") + vol21.configuration.backend = BackendType.AWS + vol21.configuration.region = "eu-west-1" + vol22 = get_volume(name="vol22") + vol22.configuration.backend = BackendType.AWS + vol22.configuration.region = "eu-west-2" + volumes = [[vol11, vol12], [vol21, vol22]] + check_can_attach_job_volumes(volumes) + + @pytest.mark.asyncio + async def test_cannot_attach_different_mount_points_with_different_backends_regions(self): + vol1 = get_volume(name="vol11") + vol1.configuration.backend = 
BackendType.AWS + vol1.configuration.region = "eu-west-1" + vol2 = get_volume(name="vol12") + vol2.configuration.backend = BackendType.AWS + vol2.configuration.region = "eu-west-2" + volumes = [[vol1], [vol2]] + with pytest.raises(ServerClientError): + check_can_attach_job_volumes(volumes) + + @pytest.mark.asyncio + async def test_cannot_attach_same_volume_at_different_mount_points(self): + vol1 = get_volume(name="vol11") + vol1.configuration.backend = BackendType.AWS + vol1.configuration.region = "eu-west-1" + volumes = [[vol1], [vol1]] + with pytest.raises(ServerClientError): + check_can_attach_job_volumes(volumes) diff --git a/src/tests/_internal/server/services/runs/test_spec.py b/src/tests/_internal/server/services/runs/test_spec.py new file mode 100644 index 0000000000..093ca768cf --- /dev/null +++ b/src/tests/_internal/server/services/runs/test_spec.py @@ -0,0 +1,231 @@ +import re +import uuid +from types import SimpleNamespace + +import pytest + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.files import FileArchiveMapping +from dstack._internal.core.models.profiles import Profile, ProfileRetry +from dstack._internal.core.models.repos.local import LocalRunRepoData +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.services.runs.spec import ( + _check_can_update_configuration, + check_can_update_run_spec, + validate_run_spec_and_set_defaults, +) +from dstack._internal.server.testing.common import get_run_spec + + +def _service_configuration( + *, + router_type=None, + image=None, + env=None, + worker_count_min=None, + router_commands="echo router", + worker_commands="echo worker", +): + # Build a ServiceConfiguration instance for the in-place update tests. 
+ worker = { + "name": "worker", + "commands": [worker_commands], + } + if worker_count_min is None: + worker["count"] = 1 + else: + worker["count"] = {"min": worker_count_min, "max": worker_count_min + 1} + worker["scaling"] = {"metric": "rps", "target": 4} + replicas = [worker] + if router_type is not None: + replicas.append( + { + "name": "router", + "router": {"type": router_type}, + "commands": [router_commands], + "count": 1, + } + ) + data = { + "type": "service", + "port": 8000, + "replicas": replicas, + } + if image is not None: + data["image"] = image + if env is not None: + data["env"] = env + return ServiceConfiguration.parse_obj(data) + + +def _run_spec(configuration: ServiceConfiguration, **kwargs): + return get_run_spec( + repo_id="test-repo", run_name="test-run", configuration=configuration, **kwargs + ) + + +def _run_spec_with_overrides(configuration: ServiceConfiguration, **overrides) -> RunSpec: + get_run_spec_keys = {"repo_code_hash", "repo_data"} + get_run_spec_kwargs = {k: v for k, v in overrides.items() if k in get_run_spec_keys} + run_spec_overrides = {k: v for k, v in overrides.items() if k not in get_run_spec_keys} + run_spec = get_run_spec( + repo_id="test-repo", + run_name="test-run", + configuration=configuration, + **get_run_spec_kwargs, + ) + if not run_spec_overrides: + return run_spec + return RunSpec.parse_obj({**run_spec.dict(), **run_spec_overrides}) + + +class TestValidateRunSpecRetryDuration: + def test_model_accepts_negative_retry_duration_for_backward_compatibility(self): + retry = ProfileRetry(duration=-1) + + assert retry.duration == -1 + + def test_rejects_negative_retry_duration_for_new_run_specs(self): + run_spec = get_run_spec( + repo_id="test-repo", + profile=Profile(name="default", retry=ProfileRetry(duration=-1)), + ) + + with pytest.raises(ServerClientError, match="retry.duration cannot be negative"): + validate_run_spec_and_set_defaults( + SimpleNamespace(ssh_public_key="ssh-rsa test"), run_spec + ) + + +class 
TestCheckCanUpdateConfigurationRouterType: + def test_sglang_to_dynamo_router_type_change_is_rejected(self): + current = _run_spec(_service_configuration(router_type="sglang")) + new = _run_spec(_service_configuration(router_type="dynamo")) + with pytest.raises(ServerClientError, match="router.type"): + check_can_update_run_spec(current, new) + + def test_dynamo_to_sglang_router_type_change_is_rejected(self): + current = _run_spec(_service_configuration(router_type="dynamo")) + new = _run_spec(_service_configuration(router_type="sglang")) + with pytest.raises(ServerClientError, match="router.type"): + check_can_update_run_spec(current, new) + + def test_same_router_type_no_other_changes_succeeds(self): + current = _run_spec(_service_configuration(router_type="dynamo")) + new = _run_spec(_service_configuration(router_type="dynamo")) + check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateConfigurationDynamoRouterGroup: + def test_dynamo_router_group_commands_change_is_rejected(self): + current = _run_spec(_service_configuration(router_type="dynamo", router_commands="a")) + new = _run_spec(_service_configuration(router_type="dynamo", router_commands="b")) + with pytest.raises(ServerClientError, match="Dynamo router replica group"): + check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateConfigurationDynamoTopLevel: + def test_dynamo_top_level_image_change_is_rejected(self): + current = _run_spec(_service_configuration(router_type="dynamo", image="img:1")) + new = _run_spec(_service_configuration(router_type="dynamo", image="img:2")) + with pytest.raises(ServerClientError, match="image.*Dynamo"): + check_can_update_run_spec(current, new) + + def test_dynamo_top_level_env_change_is_rejected(self): + current = _run_spec(_service_configuration(router_type="dynamo", env={"FOO": "1"})) + new = _run_spec(_service_configuration(router_type="dynamo", env={"FOO": "2"})) + with pytest.raises(ServerClientError, match="env.*Dynamo"): + 
check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateRunSpecDynamoSpecLevel: + @pytest.mark.parametrize( + ("field", "current_overrides", "new_overrides"), + [ + pytest.param( + "repo_code_hash", + {"repo_code_hash": "hash-a"}, + {"repo_code_hash": "hash-b"}, + id="repo_code_hash", + ), + pytest.param( + "repo_data", + {"repo_data": LocalRunRepoData(repo_dir="/repo/a")}, + {"repo_data": LocalRunRepoData(repo_dir="/repo/b")}, + id="repo_data", + ), + pytest.param( + "file_archives", + { + "file_archives": [ + FileArchiveMapping( + id=uuid.UUID("00000000-0000-0000-0000-000000000001"), + path="/work/a.txt", + ), + ], + }, + { + "file_archives": [ + FileArchiveMapping( + id=uuid.UUID("00000000-0000-0000-0000-000000000002"), + path="/work/b.txt", + ), + ], + }, + id="file_archives", + ), + pytest.param( + "working_dir", + {"working_dir": "/old-top"}, + {"working_dir": "/new-top"}, + id="working_dir", + ), + ], + ) + def test_dynamo_spec_level_field_change_is_rejected( + self, field: str, current_overrides: dict, new_overrides: dict + ) -> None: + cfg = _service_configuration(router_type="dynamo") + current = _run_spec_with_overrides(cfg, **current_overrides) + new = _run_spec_with_overrides(cfg, **new_overrides) + + with pytest.raises(ServerClientError, match=re.escape(field)): + check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateConfigurationWorkerOnlyChangesAllowed: + def test_dynamo_worker_count_min_change_is_allowed(self): + current = _run_spec(_service_configuration(router_type="dynamo", worker_count_min=1)) + new = _run_spec(_service_configuration(router_type="dynamo", worker_count_min=2)) + # Worker group count change is allowed on a Dynamo service. 
+ check_can_update_run_spec(current, new) + + def test_dynamo_worker_commands_change_is_allowed(self): + current = _run_spec(_service_configuration(router_type="dynamo", worker_commands="x")) + new = _run_spec(_service_configuration(router_type="dynamo", worker_commands="y")) + # Non-router replica group's commands change is allowed. + check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateConfigurationNonDynamoUnchanged: + def test_sglang_top_level_image_change_is_allowed(self): + current = _run_spec(_service_configuration(router_type="sglang", image="img:1")) + new = _run_spec(_service_configuration(router_type="sglang", image="img:2")) + # Top-level changes on SGLang services flow through to the existing + # rolling-deployment path; no Dynamo gate fires. + check_can_update_run_spec(current, new) + + def test_no_router_top_level_image_change_is_allowed(self): + current = _run_spec(_service_configuration(router_type=None, image="img:1")) + new = _run_spec(_service_configuration(router_type=None, image="img:2")) + check_can_update_run_spec(current, new) + + +class TestCheckCanUpdateConfigurationFieldAllowlist: + """`_check_can_update_configuration` is also called directly with configs only.""" + + def test_non_dynamo_image_change_passes_configuration_gate(self): + current = _service_configuration(router_type="sglang", image="img:1") + new = _service_configuration(router_type="sglang", image="img:2") + _check_can_update_configuration(current, new, ignore_files=True) diff --git a/src/tests/_internal/server/services/services/__init__.py b/src/tests/_internal/server/services/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/services/services/test_autoscalers.py b/src/tests/_internal/server/services/services/test_autoscalers.py new file mode 100644 index 0000000000..1125ae8ade --- /dev/null +++ b/src/tests/_internal/server/services/services/test_autoscalers.py @@ -0,0 +1,170 @@ +import datetime 
+from unittest.mock import patch + +import pytest + +from dstack._internal.core.models.configurations import DEFAULT_SCALING_WINDOW +from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, Stat +from dstack._internal.server.services.services.autoscalers import BaseServiceScaler, RPSAutoscaler + + +@pytest.fixture +def rps_scaler(): + return RPSAutoscaler( + min_replicas=0, + max_replicas=5, + target=10, + window=DEFAULT_SCALING_WINDOW, + scale_up_delay=5 * 60, + scale_down_delay=10 * 60, + ) + + +@pytest.fixture +def time(): + dt = datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + with patch("dstack._internal.utils.common.get_current_datetime") as mock: + mock.return_value = dt + yield dt + + +def stats(rps: float) -> PerWindowStats: + return { + DEFAULT_SCALING_WINDOW: Stat(requests=int(rps * DEFAULT_SCALING_WINDOW), request_time=0.1) + } + + +class TestRPSAutoscaler: + def test_do_not_scale(self, rps_scaler: BaseServiceScaler, time: datetime.datetime) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=1, + stats=stats(rps=10), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 1 + ) + + def test_scale_up(self, rps_scaler: BaseServiceScaler, time: datetime.datetime) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=1, + stats=stats(rps=20), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 2 + ) + + def test_scale_up_high_load( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=2, + stats=stats(rps=50), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 5 + ) + + def test_scale_up_replicas_limit( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=2, + stats=stats(rps=1000), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 5 + ) + 
+ def test_scale_down(self, rps_scaler: BaseServiceScaler, time: datetime.datetime) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=2, + stats=stats(rps=5), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 1 + ) + + def test_scale_up_delayed( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=1, + stats=stats(rps=20), + # last scaled 1 minute ago, but the delay is 5 minutes + last_scaled_at=time - datetime.timedelta(seconds=60), + ) + == 1 + ) + + def test_scale_down_delayed( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=2, + stats=stats(rps=5), + # last scaled 5 minutes ago, but the delay is 10 minutes + last_scaled_at=time - datetime.timedelta(seconds=5 * 60), + ) + == 2 + ) + + def test_scale_from_zero_first_time( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=0, + stats=stats(rps=5), + last_scaled_at=None, + ) + == 1 + ) + + def test_scale_from_zero_immediately( + self, rps_scaler: BaseServiceScaler, time: datetime.datetime + ) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=0, + stats=stats(rps=5), + # last scaled 1 second ago, but there are requests + last_scaled_at=time - datetime.timedelta(seconds=1), + ) + == 1 + ) + + def test_scale_to_zero(self, rps_scaler: BaseServiceScaler, time: datetime.datetime) -> None: + assert ( + rps_scaler.get_desired_count( + current_desired_count=2, + stats=stats(rps=0), + last_scaled_at=time - datetime.timedelta(seconds=3600), + ) + == 0 + ) + + @pytest.mark.parametrize("window,expected", [(30, 3), (60, 2), (300, 1)]) + def test_window(self, window: int, expected: int, time: datetime.datetime) -> None: + stats: PerWindowStats = { + 30: Stat(requests=900, request_time=0.1), 
# 900 req / 30s = 30 rps → 3 replicas + 60: Stat(requests=1200, request_time=0.1), # 1200 req / 60s = 20 rps → 2 replicas + 300: Stat(requests=1500, request_time=0.1), # 1500 req / 300s = 5 rps → 1 replica + } + scaler = RPSAutoscaler( + min_replicas=0, + max_replicas=5, + target=10, + window=window, + scale_up_delay=5 * 60, + scale_down_delay=10 * 60, + ) + assert ( + scaler.get_desired_count(1, stats, time - datetime.timedelta(seconds=3600)) == expected + ) diff --git a/src/tests/_internal/server/services/services/test_services.py b/src/tests/_internal/server/services/services/test_services.py new file mode 100644 index 0000000000..0be5bed9df --- /dev/null +++ b/src/tests/_internal/server/services/services/test_services.py @@ -0,0 +1,137 @@ +from typing import Literal, Optional, Union +from unittest.mock import MagicMock + +import pytest + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.gateways import ( + ACMGatewayCertificate, + AnyGatewayCertificate, + GatewayConfiguration, + LetsEncryptGatewayCertificate, +) +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.services.services import ( + _register_service_in_server, + _should_configure_service_https_on_gateway, + _should_show_service_https, +) +from dstack._internal.server.testing.common import get_run_spec + + +def _service_run_spec(https: Union[bool, Literal["auto"]] = "auto") -> RunSpec: + return get_run_spec( + repo_id="test-repo", + configuration=ServiceConfiguration(commands=["python serve.py"], port=8000, https=https), + ) + + +def _gateway_config( + certificate: Optional[AnyGatewayCertificate] = LetsEncryptGatewayCertificate(), +) -> GatewayConfiguration: + return GatewayConfiguration( + backend=BackendType.AWS, + region="us-east-1", + certificate=certificate, + ) + + +def 
_mock_run_model() -> MagicMock: + run_model = MagicMock() + run_model.project.name = "test-project" + run_model.run_name = "test-run" + return run_model + + +class TestServiceConfigurationHttps: + def test_accepts_unset(self) -> None: + conf = ServiceConfiguration(commands=["python serve.py"], port=8000) + assert conf.https is None + + def test_accepts_auto(self) -> None: + conf = ServiceConfiguration(commands=["python serve.py"], port=8000, https="auto") + assert conf.https == "auto" + + +class TestShouldConfigureServiceHttpsOnGateway: + def test_auto_resolves_to_true_with_lets_encrypt_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _should_configure_service_https_on_gateway(run_spec, gw) is True + + def test_auto_resolves_to_false_when_gateway_has_no_certificate(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=None) + assert _should_configure_service_https_on_gateway(run_spec, gw) is False + + def test_auto_resolves_to_false_with_acm_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config( + certificate=ACMGatewayCertificate(arn="arn:aws:acm:us-east-1:123:cert/abc") + ) + assert _should_configure_service_https_on_gateway(run_spec, gw) is False + + def test_true_enables_https_when_gateway_has_no_certificate(self) -> None: + run_spec = _service_run_spec(https=True) + gw = _gateway_config(certificate=None) + assert _should_configure_service_https_on_gateway(run_spec, gw) is True + + def test_false_disables_https_regardless_of_gateway_certificate(self) -> None: + run_spec = _service_run_spec(https=False) + gw = _gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _should_configure_service_https_on_gateway(run_spec, gw) is False + + def test_true_does_not_configure_https_on_acm_gateway(self) -> None: + run_spec = _service_run_spec(https=True) + gw = _gateway_config( + 
certificate=ACMGatewayCertificate(arn="arn:aws:acm:us-east-1:123:cert/abc") + ) + assert _should_configure_service_https_on_gateway(run_spec, gw) is False + + +class TestShouldShowServiceHttps: + def test_auto_resolves_to_true_with_lets_encrypt_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _should_show_service_https(run_spec, gw) is True + + def test_auto_resolves_to_false_when_gateway_has_no_certificate(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=None) + assert _should_show_service_https(run_spec, gw) is False + + def test_auto_resolves_to_true_with_acm_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config( + certificate=ACMGatewayCertificate(arn="arn:aws:acm:us-east-1:123:cert/abc") + ) + assert _should_show_service_https(run_spec, gw) is True + + def test_true_enables_https_regardless_of_gateway_certificate(self) -> None: + run_spec = _service_run_spec(https=True) + gw = _gateway_config(certificate=None) + assert _should_show_service_https(run_spec, gw) is True + + def test_false_disables_https_regardless_of_gateway_certificate(self) -> None: + run_spec = _service_run_spec(https=False) + gw = _gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _should_show_service_https(run_spec, gw) is False + + +class TestRegisterServiceInServerHttps: + def test_allows_default_true_without_gateway(self) -> None: + run_spec = _service_run_spec(https=True) + result = _register_service_in_server(_mock_run_model(), run_spec) + assert result is not None + + def test_allows_auto_without_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + result = _register_service_in_server(_mock_run_model(), run_spec) + assert result is not None + + def test_rejects_explicit_false_without_gateway(self) -> None: + run_spec = _service_run_spec(https=False) + with 
pytest.raises(ServerClientError, match="not allowed without a gateway"): + _register_service_in_server(_mock_run_model(), run_spec) diff --git a/src/tests/_internal/server/services/test_backend_configs.py b/src/tests/_internal/server/services/test_backend_configs.py new file mode 100644 index 0000000000..d99bdcb985 --- /dev/null +++ b/src/tests/_internal/server/services/test_backend_configs.py @@ -0,0 +1,231 @@ +import json +from pathlib import Path +from textwrap import dedent +from unittest.mock import patch + +import yaml + +from dstack._internal.core.backends.kubernetes.backend import KubernetesBackend +from dstack._internal.server import settings +from dstack._internal.server.services.config import ( + ServerConfigManager, + config_yaml_to_backend_config, + file_config_to_config, +) + + +class TestCrusoeBackendConfig: + def test_config_parsing(self, tmp_path: Path): + config_yaml_path = tmp_path / "config.yml" + config_dict = { + "projects": [ + { + "name": "main", + "backends": [ + { + "type": "crusoe", + "project_id": "test-project-id", + "regions": ["us-east1-a"], + "creds": { + "type": "access_key", + "access_key": "test-access-key", + "secret_key": "test-secret-key", + }, + } + ], + } + ] + } + config_yaml_path.write_text(yaml.dump(config_dict)) + + with patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_yaml_path): + m = ServerConfigManager() + assert m.load_config() + assert m.config is not None + assert m.config.projects is not None + assert len(m.config.projects) > 0 + assert m.config.projects[0].backends is not None + backend_file_cfg = m.config.projects[0].backends[0] + backend_cfg = file_config_to_config(backend_file_cfg) + + assert backend_cfg.type == "crusoe" + assert backend_cfg.project_id == "test-project-id" + assert backend_cfg.regions == ["us-east1-a"] + assert backend_cfg.creds.access_key == "test-access-key" + assert backend_cfg.creds.secret_key == "test-secret-key" + + +class TestNebiusBackendConfig: + def test_with_filename(self, 
tmp_path: Path): + creds_json = { + "subject-credentials": { + "type": "JWT", + "alg": "RS256", + "private-key": "-----BEGIN PRIVATE KEY-----\nabc\n-----END PRIVATE KEY-----\n", + "kid": "publickey-e00test", + "iss": "serviceaccount-e00test", + "sub": "serviceaccount-e00test", + } + } + creds_file = tmp_path / "nebius_creds.json" + creds_file.write_text(json.dumps(creds_json)) + + config_yaml_path = tmp_path / "config.yml" + config_dict = { + "projects": [ + { + "name": "main", + "backends": [ + { + "type": "nebius", + "creds": {"type": "service_account", "filename": str(creds_file)}, + } + ], + } + ] + } + config_yaml_path.write_text(yaml.dump(config_dict)) + + with patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_yaml_path): + m = ServerConfigManager() + assert m.load_config() + assert m.config is not None + assert m.config.projects is not None + assert len(m.config.projects) > 0 + assert m.config.projects[0].backends is not None + backend_file_cfg = m.config.projects[0].backends[0] + backend_cfg = file_config_to_config(backend_file_cfg) + + assert backend_cfg.type == "nebius" + assert backend_cfg.creds.service_account_id == "serviceaccount-e00test" + assert backend_cfg.creds.public_key_id == "publickey-e00test" + assert ( + backend_cfg.creds.private_key_content + == "-----BEGIN PRIVATE KEY-----\nabc\n-----END PRIVATE KEY-----\n" + ) + + def test_with_private_key_file(self, tmp_path: Path): + pk_file = tmp_path / "private.key" + pk_file.write_text("TEST_PRIVATE_KEY") + + config_yaml_path = tmp_path / "config.yml" + config_dict = { + "projects": [ + { + "name": "main", + "backends": [ + { + "type": "nebius", + "projects": ["project-e00test"], + "creds": { + "type": "service_account", + "service_account_id": "serviceaccount-e00test", + "public_key_id": "publickey-e00test", + "private_key_file": str(pk_file), + }, + } + ], + } + ] + } + config_yaml_path.write_text(yaml.dump(config_dict)) + + with patch.object(settings, "SERVER_CONFIG_FILE_PATH", 
config_yaml_path): + m = ServerConfigManager() + assert m.load_config() + assert m.config is not None + assert m.config.projects is not None + assert len(m.config.projects) > 0 + assert m.config.projects[0].backends is not None + backend_file_cfg = m.config.projects[0].backends[0] + backend_cfg = file_config_to_config(backend_file_cfg) + + assert backend_cfg.type == "nebius" + assert backend_cfg.creds.service_account_id == "serviceaccount-e00test" + assert backend_cfg.creds.public_key_id == "publickey-e00test" + assert backend_cfg.creds.private_key_content == "TEST_PRIVATE_KEY" + + +class TestKubernetesBackendConfig: + def test_ui_config_embedded_kubeconfig_initializes_backend(self): + config_yaml = dedent( + """ + type: kubernetes + kubeconfig: + data: | + apiVersion: v1 + kind: Config + current-context: gpu-training + + clusters: + - name: gpu-training + cluster: + server: https://gpu-cluster.internal.example.com:6443 + insecure-skip-tls-verify: true + + users: + - name: ml-engineer + user: + token: test-token + + contexts: + - name: gpu-training + context: + cluster: gpu-training + user: ml-engineer + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + """ + ) + + backend_config = config_yaml_to_backend_config(config_yaml) + backend = KubernetesBackend(backend_config) + + cluster = backend.compute().region_cluster_map[""] + assert cluster.api_client.configuration.host == ( + "https://gpu-cluster.internal.example.com:6443" + ) + assert cluster.proxy_jump.hostname == "204.12.171.137" + assert cluster.proxy_jump.port == 32000 + + def test_kubeconfig_context_namespace_does_not_set_backend_namespace(self): + config_yaml = dedent( + """ + type: kubernetes + kubeconfig: + data: | + apiVersion: v1 + kind: Config + current-context: gpu-training + + clusters: + - name: gpu-training + cluster: + server: https://gpu-cluster.internal.example.com:6443 + insecure-skip-tls-verify: true + + 
users: + - name: ml-engineer + user: + token: test-token + + contexts: + - name: gpu-training + context: + cluster: gpu-training + user: ml-engineer + namespace: training-jobs + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + """ + ) + + backend_config = config_yaml_to_backend_config(config_yaml) + backend = KubernetesBackend(backend_config) + + cluster = backend.compute().region_cluster_map[""] + assert cluster.namespace == "default" diff --git a/src/tests/_internal/server/services/test_config.py b/src/tests/_internal/server/services/test_config.py index 6122406046..81265445e8 100644 --- a/src/tests/_internal/server/services/test_config.py +++ b/src/tests/_internal/server/services/test_config.py @@ -1,59 +1,28 @@ +import json from pathlib import Path -from unittest.mock import patch +from unittest.mock import AsyncMock, Mock, patch import pytest import yaml from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.backends.azure import AzureConfigInfoWithCreds, AzureDefaultCreds -from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.backends.aws.configurator import DEFAULT_REGIONS from dstack._internal.server import settings -from dstack._internal.server.models import BackendModel, ProjectModel -from dstack._internal.server.services.config import AzureConfig, ServerConfigManager +from dstack._internal.server.models import BackendModel, ImportModel, ProjectModel +from dstack._internal.server.services.config import ServerConfigManager from dstack._internal.server.testing.common import ( + create_backend, + create_export, create_project, create_user, ) class TestServerConfigManager: - class TestInitConfig: - @pytest.mark.asyncio - async def test_inits_backend(self, test_db, session: AsyncSession, tmp_path: Path): - await create_project(session=session, name="main") - config_filepath = tmp_path / "config.yml" - with patch.object(settings, "SERVER_CONFIG_FILE_PATH", 
config_filepath), patch( - "dstack._internal.server.services.backends.list_available_backend_types" - ) as list_available_backend_types_mock, patch( - "dstack._internal.server.services.backends.get_configurator" - ) as get_configurator_mock, patch( - "dstack._internal.server.services.backends.create_backend" - ) as create_backend_mock: - list_available_backend_types_mock.return_value = [BackendType.AZURE] - default_config = AzureConfigInfoWithCreds( - tenant_id="test_tenant", - subscription_id="test_subscription", - locations=["westeurope"], - creds=AzureDefaultCreds(), - ) - get_configurator_mock.return_value.get_default_configs.return_value = [ - default_config - ] - manager = ServerConfigManager() - await manager.init_config(session) - list_available_backend_types_mock.assert_called() - get_configurator_mock.assert_called() - create_backend_mock.assert_called() - assert manager.config.projects[0].backends[0] == AzureConfig( - tenant_id="test_tenant", - subscription_id="test_subscription", - regions=["westeurope"], - creds=AzureDefaultCreds(), - ) - class TestApplyConfig: @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_creates_backend(self, test_db, session: AsyncSession, tmp_path: Path): owner = await create_user(session=session, name="test_owner") await create_project(session=session, owner=owner, name="main") @@ -92,9 +61,13 @@ async def test_creates_backend(self, test_db, session: AsyncSession, tmp_path: P } with open(config_filepath, "w+") as f: yaml.dump(config, f) - with patch("boto3.session.Session"), patch.object( - settings, "SERVER_CONFIG_FILE_PATH", config_filepath - ), patch("dstack._internal.core.backends.aws.compute.get_vpc_id_subnet_id_or_error"): + with ( + patch("boto3.session.Session"), + patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath), + patch( + "dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error" + ), + ): manager = 
ServerConfigManager() manager.load_config() await manager.apply_config(session, owner) @@ -104,3 +77,173 @@ async def test_creates_backend(self, test_db, session: AsyncSession, tmp_path: P b_res = await session.execute(select(BackendModel)) backends = b_res.scalars().all() assert len(backends) == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_skips_update_when_source_config_matches( + self, test_db, session: AsyncSession, tmp_path: Path + ): + owner = await create_user(session=session, name="test_owner") + project = await create_project(session=session, owner=owner, name="main") + creds = { + "type": "access_key", + "access_key": "1234", + "secret_key": "1234", + } + await create_backend( + session=session, + project_id=project.id, + config={"regions": DEFAULT_REGIONS}, + auth=creds, + source_config={"type": "aws", "regions": None}, + source_auth=creds, + ) + config_filepath = tmp_path / "config.yml" + config = { + "projects": [{"name": "main", "backends": [{"type": "aws", "creds": creds}]}] + } + with open(config_filepath, "w+") as f: + yaml.dump(config, f) + with ( + patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath), + patch( + "dstack._internal.server.services.backends.update_backend", + new_callable=AsyncMock, + ) as update_backend, + ): + manager = ServerConfigManager() + manager.load_config() + await manager.apply_config(session, owner) + update_backend.assert_not_called() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_populates_source_config_for_legacy_backend( + self, test_db, session: AsyncSession, tmp_path: Path + ): + owner = await create_user(session=session, name="test_owner") + project = await create_project(session=session, owner=owner, name="main") + creds = { + "type": "access_key", + "access_key": "1234", + "secret_key": "1234", + } + backend = await create_backend( + session=session, + 
project_id=project.id, + config={"regions": DEFAULT_REGIONS}, + auth=creds, + ) + config_filepath = tmp_path / "config.yml" + config = { + "projects": [{"name": "main", "backends": [{"type": "aws", "creds": creds}]}] + } + with open(config_filepath, "w+") as f: + yaml.dump(config, f) + mock_session = Mock() + mock_session.client.return_value = Mock() + with ( + patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath), + patch( + "dstack._internal.core.backends.aws.auth.authenticate", + return_value=mock_session, + ), + patch( + "dstack._internal.core.backends.aws.compute.get_vpc_id_subnets_ids_or_error" + ), + ): + manager = ServerConfigManager() + manager.load_config() + await manager.apply_config(session, owner) + await session.refresh(backend) + assert backend.source_config is not None + assert backend.source_auth is not None + assert json.loads(backend.source_config)["regions"] is None + assert json.loads(backend.source_auth.get_plaintext_or_error()) == creds + with ( + patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath), + patch( + "dstack._internal.server.services.backends.update_backend", + new_callable=AsyncMock, + ) as update_backend, + ): + manager = ServerConfigManager() + manager.load_config() + await manager.apply_config(session, owner) + update_backend.assert_not_called() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_forces_update_when_current_backend_config_is_unavailable( + self, test_db, session: AsyncSession, tmp_path: Path + ): + owner = await create_user(session=session, name="test_owner") + project = await create_project(session=session, owner=owner, name="main") + creds = { + "type": "access_key", + "access_key": "1234", + "secret_key": "1234", + } + await create_backend( + session=session, + project_id=project.id, + config={"regions": DEFAULT_REGIONS}, + auth=creds, + source_config={"type": "aws", "regions": None}, + source_auth=creds, + ) + 
config_filepath = tmp_path / "config.yml" + config = { + "projects": [{"name": "main", "backends": [{"type": "aws", "creds": creds}]}] + } + with open(config_filepath, "w+") as f: + yaml.dump(config, f) + with ( + patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath), + patch( + "dstack._internal.server.services.backends.get_backend_config", + new_callable=AsyncMock, + return_value=None, + ), + patch( + "dstack._internal.server.services.backends.update_backend", + new_callable=AsyncMock, + ) as update_backend, + ): + manager = ServerConfigManager() + manager.load_config() + await manager.apply_config(session, owner) + update_backend.assert_awaited_once() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_new_project_imports_global_exports( + self, test_db, session: AsyncSession, tmp_path: Path + ): + owner = await create_user(session=session, name="test_owner") + exporter_project = await create_project(session=session, owner=owner, name="exporter") + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[], + exported_fleets=[], + name="global-export", + is_global=True, + ) + config_filepath = tmp_path / "config.yml" + config = {"projects": [{"name": "new-project"}]} + with open(config_filepath, "w+") as f: + yaml.dump(config, f) + with patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_filepath): + manager = ServerConfigManager() + manager.load_config() + await manager.apply_config(session, owner) + new_project_res = await session.execute( + select(ProjectModel).where(ProjectModel.name == "new-project") + ) + new_project = new_project_res.scalar_one() + imports_res = await session.execute( + select(ImportModel).where(ImportModel.project_id == new_project.id) + ) + imports = imports_res.scalars().all() + assert len(imports) == 1 diff --git a/src/tests/_internal/server/services/test_docker.py 
b/src/tests/_internal/server/services/test_docker.py index 69a22e7415..45d2078dc8 100644 --- a/src/tests/_internal/server/services/test_docker.py +++ b/src/tests/_internal/server/services/test_docker.py @@ -1,8 +1,11 @@ import pytest +import dstack._internal.server.settings as server_settings +from dstack._internal.core.models.common import RegistryAuth from dstack._internal.server.services.docker import ( ImageConfigObject, ImageManifest, + apply_server_docker_defaults, is_valid_docker_volume_target, ) @@ -108,6 +111,131 @@ def test_parse_image_config_object_with_config_null(sample_image_config_object): assert config_object.config is not None +@pytest.mark.parametrize( + ["value", "expected"], + [ + [None, None], + ["", None], + ["1000:1000", "1000:1000"], + ], +) +def test_parse_image_config_object_user_field(sample_image_config_object, value, expected): + sample_image_config_object["config"]["User"] = value + config_object = ImageConfigObject.__response__.parse_obj(sample_image_config_object) + assert config_object.config.user == expected + + +def test_parse_image_config_object_user_field_missing(sample_image_config_object): + del sample_image_config_object["config"]["User"] + config_object = ImageConfigObject.__response__.parse_obj(sample_image_config_object) + assert config_object.config.user is None + + +@pytest.mark.parametrize( + ( + "default_registry", + "default_username", + "default_password", + "image_name", + "input_auth", + "expected_image", + "expected_auth", + ), + [ + pytest.param( + None, + None, + None, + "python:3.12", + None, + "python:3.12", + None, + id="no-defaults-configured", + ), + pytest.param( + "registry.example", + None, + None, + "python:3.12", + None, + "registry.example/python:3.12", + None, + id="registry-prepended-no-credentials", + ), + pytest.param( + "registry.example", + "user", + "pass", + "python:3.12", + None, + "registry.example/python:3.12", + RegistryAuth(username="user", password="pass"), + 
id="registry-prepended-and-credentials-injected", + ), + pytest.param( + "registry.example", + "user", + "pass", + "python:3.12", + RegistryAuth(username="run-user", password="run-pass"), + "registry.example/python:3.12", + RegistryAuth(username="run-user", password="run-pass"), + id="registry-prepended-run-auth-preserved", + ), + pytest.param( + None, + "user", + "pass", + "python:3.12", + None, + "python:3.12", + RegistryAuth(username="user", password="pass"), + id="credentials-injected-without-default-registry", + ), + pytest.param( + "registry.example", + "user", + "pass", + "ghcr.io/org/image:tag", + None, + "ghcr.io/org/image:tag", + None, + id="image-with-registry-unchanged", + ), + pytest.param( + None, + "user", + "pass", + "ghcr.io/org/image:tag", + None, + "ghcr.io/org/image:tag", + None, + id="credentials-not-injected-when-image-has-registry", + ), + ], +) +def test_apply_server_docker_defaults( + monkeypatch, + default_registry, + default_username, + default_password, + image_name, + input_auth, + expected_image, + expected_auth, +): + monkeypatch.setattr(server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY", default_registry) + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_USERNAME", default_username + ) + monkeypatch.setattr( + server_settings, "SERVER_DEFAULT_DOCKER_REGISTRY_PASSWORD", default_password + ) + result_image, result_auth = apply_server_docker_defaults(image_name, input_auth) + assert result_image == expected_image + assert result_auth == expected_auth + + class TestIsValidDockerVolumeTarget: @pytest.mark.parametrize( "path", diff --git a/src/tests/_internal/server/services/test_fleets.py b/src/tests/_internal/server/services/test_fleets.py new file mode 100644 index 0000000000..19ae011d65 --- /dev/null +++ b/src/tests/_internal/server/services/test_fleets.py @@ -0,0 +1,205 @@ +from typing import Optional, Union +from unittest.mock import Mock + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from 
dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import ( + FleetConfiguration, + FleetNodesSpec, + FleetSpec, + InstanceGroupPlacement, + SSHHostParams, + SSHParams, +) +from dstack._internal.core.models.instances import RemoteConnectionInfo +from dstack._internal.server.models import FleetModel, ProjectModel +from dstack._internal.server.services.backends import get_project_backends +from dstack._internal.server.services.fleets import ( + get_fleet_master_instance_provisioning_data, + get_plan, +) +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_project, + create_user, + get_fleet_spec, + get_job_provisioning_data, + get_ssh_key, +) + + +class TestGetPlanSSHFleetHostsValidation: + @pytest.fixture + def get_project_backends_mock(self, monkeypatch: pytest.MonkeyPatch) -> list[Backend]: + mock = Mock(spec_set=get_project_backends, return_value=[]) + monkeypatch.setattr("dstack._internal.server.services.backends.get_project_backends", mock) + return mock + + def get_ssh_fleet_spec( + self, name: Optional[str], hosts: list[Union[SSHHostParams, str]] + ) -> FleetSpec: + ssh_config = SSHParams( + hosts=hosts, + network=None, + user="ubuntu", + ssh_key=get_ssh_key(), + ) + fleet_conf = FleetConfiguration(name=name, ssh_config=ssh_config) + return get_fleet_spec(conf=fleet_conf) + + async def create_fleet( + self, session: AsyncSession, project: ProjectModel, spec: FleetSpec + ) -> FleetModel: + assert spec.configuration.ssh_config is not None, spec.configuration + fleet = await create_fleet(session=session, project=project, spec=spec) + for host in spec.configuration.ssh_config.hosts: + if isinstance(host, SSHHostParams): + hostname = host.hostname + else: + hostname = host + rci = RemoteConnectionInfo(host=hostname, port=22, ssh_user="admin", 
ssh_keys=[]) + await create_instance( + session=session, + project=project, + fleet=fleet, + backend=BackendType.REMOTE, + remote_connection_info=rci, + ) + return fleet + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def test_ok_same_fleet_update(self, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + old_fleet_spec = self.get_ssh_fleet_spec(name="my-fleet", hosts=["192.168.100.201"]) + await self.create_fleet(session, project, old_fleet_spec) + new_fleet_spec = self.get_ssh_fleet_spec( + name="my-fleet", hosts=["192.168.100.201", "192.168.100.202"] + ) + plan = await get_plan(session=session, project=project, user=user, spec=new_fleet_spec) + assert plan.current_resource is not None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def test_ok_deleted_instances_ignored(self, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + deleted_fleet_spec = self.get_ssh_fleet_spec(name="my-fleet", hosts=["192.168.100.201"]) + deleted_fleet = await self.create_fleet(session, project, deleted_fleet_spec) + for instance in deleted_fleet.instances: + instance.deleted = True + deleted_fleet.deleted = True + await session.commit() + fleet_spec = self.get_ssh_fleet_spec( + name="my-fleet", hosts=["192.168.100.201", "192.168.100.202"] + ) + plan = await get_plan(session=session, project=project, user=user, spec=fleet_spec) + assert plan.current_resource is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def 
test_ok_no_common_hosts_with_another_fleet(self, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + another_fleet_spec = self.get_ssh_fleet_spec( + name="another-fleet", hosts=["192.168.100.201"] + ) + await self.create_fleet(session, project, another_fleet_spec) + fleet_spec = self.get_ssh_fleet_spec(name="new-fleet", hosts=["192.168.100.202"]) + plan = await get_plan(session=session, project=project, user=user, spec=fleet_spec) + assert plan.current_resource is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def test_error_another_fleet_same_project(self, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + another_fleet_spec = self.get_ssh_fleet_spec( + name="another-fleet", hosts=["192.168.100.201"] + ) + await self.create_fleet(session, project, another_fleet_spec) + fleet_spec = self.get_ssh_fleet_spec( + name="new-fleet", hosts=["192.168.100.201", "192.168.100.202"] + ) + with pytest.raises( + ServerClientError, match=r"Instances \[192\.168\.100\.201\] are already assigned" + ): + await get_plan(session=session, project=project, user=user, spec=fleet_spec) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def test_error_another_fleet_another_project(self, session: AsyncSession): + another_user = await create_user(session=session, name="another-user") + another_project = await create_project( + session=session, owner=another_user, name="another-project" + ) + another_fleet_spec = self.get_ssh_fleet_spec( + name="another-fleet", hosts=["192.168.100.201"] + ) + await self.create_fleet(session, another_project, another_fleet_spec) + user = await 
create_user(session=session, name="my-user") + project = await create_project(session=session, owner=user, name="my-project") + fleet_spec = self.get_ssh_fleet_spec( + name="my-fleet", hosts=["192.168.100.201", "192.168.100.202"] + ) + with pytest.raises( + ServerClientError, match=r"Instances \[192\.168\.100\.201\] are already assigned" + ): + await get_plan(session=session, project=project, user=user, spec=fleet_spec) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.usefixtures("test_db", "get_project_backends_mock") + async def test_error_fleet_spec_without_name(self, session: AsyncSession): + # Even if the user apply the same configuration again, we cannot be sure if it is the same + # fleet or a brand new fleet, as we identify fleets by name. + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + existing_fleet_spec = self.get_ssh_fleet_spec( + name="autogenerated-fleet-name", hosts=["192.168.100.201"] + ) + await self.create_fleet(session, project, existing_fleet_spec) + fleet_spec_without_name = self.get_ssh_fleet_spec(name=None, hosts=["192.168.100.201"]) + with pytest.raises( + ServerClientError, match=r"Instances \[192\.168\.100\.201\] are already assigned" + ): + await get_plan( + session=session, project=project, user=user, spec=fleet_spec_without_name + ) + + +class TestGetFleetMasterInstanceProvisioningData: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_none_without_current_master_instance( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.placement = InstanceGroupPlacement.CLUSTER + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=1, max=2) + fleet = await create_fleet(session=session, project=project, spec=fleet_spec) + await 
create_instance( + session=session, + project=project, + fleet=fleet, + job_provisioning_data=get_job_provisioning_data(region="eu-west-1"), + ) + + master_provisioning_data = get_fleet_master_instance_provisioning_data( + fleet_model=fleet, + fleet_spec=fleet_spec, + ) + + assert master_provisioning_data is None diff --git a/src/tests/_internal/server/services/test_fluentbit_logs.py b/src/tests/_internal/server/services/test_fluentbit_logs.py new file mode 100644 index 0000000000..937838e016 --- /dev/null +++ b/src/tests/_internal/server/services/test_fluentbit_logs.py @@ -0,0 +1,659 @@ +from datetime import datetime, timezone +from unittest.mock import Mock, patch +from uuid import UUID + +import pytest +import pytest_asyncio +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import LogStorageError +from dstack._internal.server.services.logs.fluentbit import ( + ELASTICSEARCH_AVAILABLE, + FLUENTBIT_AVAILABLE, +) +from dstack._internal.server.testing.common import create_project + +pytestmark = pytest.mark.skipif(not FLUENTBIT_AVAILABLE, reason="fluent-logger not installed") + +# Conditionally import classes that are only defined when FLUENTBIT_AVAILABLE is True +if FLUENTBIT_AVAILABLE: + from dstack._internal.server.services.logs.fluentbit import ( + FluentBitLogStorage, + ForwardFluentBitWriter, + HTTPFluentBitWriter, + NullLogReader, + ) + + if ELASTICSEARCH_AVAILABLE: + from dstack._internal.server.services.logs.fluentbit import ElasticsearchReader + + +class TestNullLogReader: + """Tests for the NullLogReader (ship-only mode).""" + + def test_read_returns_empty_logs(self): + reader = NullLogReader() + request = PollLogsRequest( + run_name="test-run", + 
job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + ) + result = reader.read("test-stream", request) + + assert result.logs == [] + assert result.next_token is None + + def test_close_does_nothing(self): + reader = NullLogReader() + reader.close() # Should not raise + + +class TestHTTPFluentBitWriter: + """Tests for the HTTPFluentBitWriter.""" + + @pytest.fixture + def mock_httpx_client(self): + with patch("dstack._internal.server.services.logs.fluentbit.httpx.Client") as mock: + yield mock.return_value + + def test_init_creates_client(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + assert writer._endpoint == "http://localhost:8080" + assert writer._tag_prefix == "dstack" + + def test_write_posts_records(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + records = [ + {"message": "Hello", "@timestamp": "2023-10-06T10:00:00+00:00"}, + {"message": "World", "@timestamp": "2023-10-06T10:00:01+00:00"}, + ] + writer.write(tag="test-tag", records=records) + + assert mock_httpx_client.post.call_count == 2 + mock_httpx_client.post.assert_any_call( + "http://localhost:8080/dstack.test-tag", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + mock_httpx_client.post.assert_any_call( + "http://localhost:8080/dstack.test-tag", + json=records[1], + headers={"Content-Type": "application/json"}, + ) + + def test_write_calls_raise_for_status(self, mock_httpx_client): + """Test that response.raise_for_status() is called to detect non-2xx responses.""" + mock_response = Mock() + mock_httpx_client.post.return_value = mock_response + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + writer.write(tag="test-tag", records=[{"message": "test"}]) + + mock_response.raise_for_status.assert_called_once() + + def 
test_write_raises_on_http_status_error(self, mock_httpx_client): + """Test that 4xx/5xx responses are properly detected and raise LogStorageError.""" + import httpx + + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_httpx_client.post.return_value = mock_response + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Server Error", request=Mock(), response=mock_response + ) + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit HTTP error: status 500"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + def test_write_raises_on_transport_error(self, mock_httpx_client): + import httpx + + mock_httpx_client.post.side_effect = httpx.HTTPError("Connection failed") + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit HTTP error"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + def test_close_closes_client(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + writer.close() + mock_httpx_client.close.assert_called_once() + + def test_write_applies_tag_prefix(self, mock_httpx_client): + """Test that tag prefix is applied to tags in HTTP requests.""" + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + records = [{"message": "test"}] + writer.write(tag="project/run/job", records=records) + + mock_httpx_client.post.assert_called_once_with( + "http://localhost:8080/dstack.project/run/job", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + + def test_write_with_empty_tag_prefix(self, mock_httpx_client): + """Test that empty tag prefix doesn't break the tag.""" + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="") + records = [{"message": 
"test"}] + writer.write(tag="test-tag", records=records) + + mock_httpx_client.post.assert_called_once_with( + "http://localhost:8080/test-tag", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + + +class TestForwardFluentBitWriter: + """Tests for the ForwardFluentBitWriter.""" + + @pytest.fixture + def mock_fluent_sender(self): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock_instance = Mock() + mock_instance.emit.return_value = True + mock.return_value = mock_instance + yield mock_instance + + def test_init_creates_sender(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + mock.assert_called_once_with("dstack", host="localhost", port=24224) + + def test_write_emits_records(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + + records = [ + {"message": "Hello"}, + {"message": "World"}, + ] + writer.write(tag="test-tag", records=records) + + assert mock_fluent_sender.emit.call_count == 2 + + def test_write_raises_on_emit_failure(self, mock_fluent_sender): + mock_fluent_sender.emit.return_value = False + mock_fluent_sender.last_error = Exception("Connection refused") + + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit Forward error"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + 
mock_fluent_sender.clear_last_error.assert_called_once() + + def test_close_closes_sender(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + writer.close() + mock_fluent_sender.close.assert_called_once() + + +class TestFluentBitLogStorage: + """Tests for the FluentBitLogStorage.""" + + @pytest_asyncio.fixture + async def project(self, test_db, session: AsyncSession) -> ProjectModel: + project = await create_project(session=session, name="test-proj") + return project + + @pytest.fixture + def mock_forward_writer(self): + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + @pytest.fixture + def mock_http_writer(self): + with patch("dstack._internal.server.services.logs.fluentbit.HTTPFluentBitWriter") as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + @pytest.fixture + def mock_es_reader(self): + with patch("dstack._internal.server.services.logs.fluentbit.ElasticsearchReader") as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + def test_init_with_forward_protocol(self, mock_forward_writer): + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + mock.assert_called_once_with(host="localhost", port=24224, tag_prefix="dstack") + assert isinstance(storage._reader, NullLogReader) + + def test_init_with_http_protocol(self, mock_http_writer): + with patch("dstack._internal.server.services.logs.fluentbit.HTTPFluentBitWriter") as mock: + 
mock.return_value = mock_http_writer + FluentBitLogStorage( + host="localhost", + port=8080, + protocol="http", + tag_prefix="dstack", + ) + mock.assert_called_once_with(host="localhost", port=8080, tag_prefix="dstack") + + def test_init_with_unsupported_protocol_raises(self): + with pytest.raises(LogStorageError, match="Unsupported Fluent-bit protocol"): + FluentBitLogStorage( + host="localhost", + port=24224, + protocol="grpc", + tag_prefix="dstack", + ) + + def test_init_ship_only_mode(self, mock_forward_writer): + """Test initialization without Elasticsearch (ship-only mode).""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + es_host=None, + ) + assert isinstance(storage._reader, NullLogReader) + + @pytest.mark.skipif(not ELASTICSEARCH_AVAILABLE, reason="elasticsearch not installed") + def test_init_with_elasticsearch(self, mock_forward_writer, mock_es_reader): + """Test initialization with Elasticsearch configured.""" + with ( + patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as writer_mock, + patch( + "dstack._internal.server.services.logs.fluentbit.ElasticsearchReader" + ) as reader_mock, + ): + writer_mock.return_value = mock_forward_writer + reader_mock.return_value = mock_es_reader + + FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + es_host="http://elasticsearch:9200", + es_index="dstack-logs", + es_api_key="test-key", + ) + reader_mock.assert_called_once_with( + host="http://elasticsearch:9200", + index="dstack-logs", + api_key="test-key", + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_write_logs(self, test_db, project: 
ProjectModel, mock_forward_writer): + """Test writing logs to Fluent-bit.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Runner log"), + ], + job_logs=[ + RunnerLogEvent(timestamp=1696586513235, message=b"Job log"), + ], + ) + + assert mock_forward_writer.write.call_count == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_write_logs_empty_logs_not_written( + self, test_db, project: ProjectModel, mock_forward_writer + ): + """Test that empty log lists are not written.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[], + job_logs=[], + ) + + mock_forward_writer.write.assert_not_called() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_ship_only_mode(self, test_db, project: ProjectModel): + """Test that ship-only mode returns empty logs.""" + with patch("dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter"): + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + request = PollLogsRequest( + run_name="test-run", + 
job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + ) + result = storage.poll_logs(project, request) + + assert result.logs == [] + assert result.next_token is None + + def test_close_closes_writer_and_reader(self, mock_forward_writer): + """Test that close() closes both writer and reader.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.close() + + mock_forward_writer.close.assert_called_once() + + def test_close_closes_reader_even_if_writer_fails(self, mock_forward_writer): + """Test that reader is closed even if writer.close() raises an exception.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock_forward_writer.close.side_effect = Exception("Writer close failed") + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + mock_reader = Mock() + storage._reader = mock_reader + + with pytest.raises(Exception, match="Writer close failed"): + storage.close() + + mock_reader.close.assert_called_once() + + def test_get_stream_name(self, mock_forward_writer): + """Test stream name generation.""" + from dstack._internal.core.models.logs import LogProducer + + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + stream_name = storage._get_stream_name( + project_name="my-project", + run_name="my-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + producer=LogProducer.JOB, + ) + + assert stream_name == 
"my-project/my-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job" + + +@pytest.mark.skipif( + not FLUENTBIT_AVAILABLE or not ELASTICSEARCH_AVAILABLE, + reason="fluent-logger or elasticsearch not installed", +) +class TestElasticsearchReader: + """Tests for the ElasticsearchReader.""" + + @pytest.fixture + def mock_es_client(self): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock_instance = Mock() + mock_instance.info.return_value = {"version": {"number": "8.0.0"}} + mock_instance.search.return_value = {"hits": {"hits": []}} + mock.return_value = mock_instance + yield mock_instance + + def test_init_verifies_connection(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + mock_es_client.info.assert_called_once() + + def test_init_with_api_key(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + api_key="test-api-key", + ) + mock.assert_called_once_with(hosts=["http://localhost:9200"], api_key="test-api-key") + + def test_init_connection_error_raises(self): + from elasticsearch.exceptions import ConnectionError as ESConnectionError + + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock_instance = Mock() + mock_instance.info.side_effect = ESConnectionError("Connection refused") + mock.return_value = mock_instance + + with pytest.raises(LogStorageError, match="Failed to connect"): + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + def test_read_returns_logs(self, mock_es_client): + 
mock_es_client.search.return_value = { + "hits": { + "hits": [ + { + "_source": { + "@timestamp": "2023-10-06T10:01:53.234000+00:00", + "message": "Hello", + "stream": "test-stream", + }, + "sort": [1696586513234, "doc1"], + }, + { + "_source": { + "@timestamp": "2023-10-06T10:01:53.235000+00:00", + "message": "World", + "stream": "test-stream", + }, + "sort": [1696586513235, "doc2"], + }, + ] + } + } + + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=2, + ) + result = reader.read("test-stream", request) + + assert len(result.logs) == 2 + assert result.logs[0].message == "Hello" + assert result.logs[1].message == "World" + assert result.next_token == "1696586513235:doc2" + + def test_read_with_time_filtering(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=datetime(2023, 10, 6, 10, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2023, 10, 6, 11, 0, 0, tzinfo=timezone.utc), + limit=100, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + query = call_args.kwargs["query"] + assert "filter" in query["bool"] + assert len(query["bool"]["filter"]) == 2 + + def test_read_descending_order(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + 
host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + descending=True, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + assert call_args.kwargs["sort"] == [ + {"@timestamp": {"order": "desc"}}, + {"_id": {"order": "desc"}}, + ] + + def test_read_with_next_token(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="1696586513234:doc1", + limit=100, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + assert call_args.kwargs["search_after"] == ["1696586513234", "doc1"] + + def test_read_with_malformed_next_token_raises_client_error(self, mock_es_client): + """Test that malformed next_token raises ServerClientError (400) instead of IndexError (500).""" + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="invalid_token_no_colon", + limit=100, + ) + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + request.next_token = ":" + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + request.next_token = ":doc1" + with pytest.raises(ServerClientError, match="Invalid 
next_token"): + reader.read("test-stream", request) + + request.next_token = "1696586513234:" + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + mock_es_client.search.assert_not_called() + + def test_close_closes_client(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + reader.close() + mock_es_client.close.assert_called_once() diff --git a/src/tests/_internal/server/services/test_instances.py b/src/tests/_internal/server/services/test_instances.py new file mode 100644 index 0000000000..cba11c67ec --- /dev/null +++ b/src/tests/_internal/server/services/test_instances.py @@ -0,0 +1,521 @@ +import uuid +from unittest.mock import Mock, call + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +import dstack._internal.server.services.instances as instances_services +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import ( + Instance, + InstanceStatus, + InstanceTerminationReason, + InstanceType, + Resources, +) +from dstack._internal.core.models.profiles import ( + FleetInstanceSelector, + InstanceHostnameSelector, + InstanceNameSelector, + Profile, +) +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.server.models import InstanceModel +from dstack._internal.server.schemas.runner import TaskListItem, TaskListResponse, TaskStatus +from dstack._internal.server.services.runner.client import ShimClient +from dstack._internal.server.testing.common import ( + create_export, + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + 
get_kubernetes_volume_configuration, + get_remote_connection_info, + get_volume, + get_volume_configuration, + get_volume_provisioning_data, + list_events, +) +from dstack._internal.utils.common import get_current_datetime + + +class TestSwitchInstanceStatus: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_includes_termination_reason_in_event_messages_only_once( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.PENDING + ) + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Some err" + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATING) + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) + await session.commit() + events = await list_events(session) + assert len(events) == 2 + assert {e.message for e in events} == { + "Instance status changed PENDING -> TERMINATING. 
Termination reason: ERROR (Some err)", + # Do not duplicate the termination reason in the second event + "Instance status changed TERMINATING -> TERMINATED", + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_includes_termination_reason_in_event_message_when_switching_directly_to_terminated( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.PENDING + ) + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Some err" + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) + await session.commit() + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == ( + "Instance status changed PENDING -> TERMINATED. Termination reason: ERROR (Some err)" + ) + + +class TestFilterInstances: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_all_instances(self, test_db, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + aws_instance = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + ) + runpod_instance = await create_instance( + session=session, + project=project, + backend=BackendType.RUNPOD, + ) + instances = [aws_instance, runpod_instance] + res = instances_services.filter_instances( + instances=instances, + profile=Profile(name="test"), + ) + assert res == instances + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_multinode_instances(self, test_db, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) 
+ aws_instance = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + ) + vastai_instance = await create_instance( + session=session, + project=project, + backend=BackendType.VASTAI, + ) + instances = [aws_instance, vastai_instance] + res = instances_services.filter_instances( + instances=instances, + profile=Profile(name="test"), + multinode=True, + ) + assert res == [aws_instance] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_volume_instances(self, test_db, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + aws_instance = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + ) + runpod_instance1 = await create_instance( + session=session, + project=project, + backend=BackendType.RUNPOD, + region="eu", + ) + runpod_instance2 = await create_instance( + session=session, + project=project, + backend=BackendType.RUNPOD, + region="us", + ) + instances = [aws_instance, runpod_instance1, runpod_instance2] + res = instances_services.filter_instances( + instances=instances, + profile=Profile(name="test"), + volumes=[ + [ + get_volume( + configuration=get_volume_configuration( + backend=BackendType.RUNPOD, region="us" + ) + ) + ] + ], + ) + assert res == [runpod_instance2] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_volume_instances_with_az(self, test_db, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + aws_instance_1 = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + region="us-1", + availability_zone="us-1a", + ) + aws_instance_2 = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + region="us-1", + 
availability_zone="us-1b", + ) + gcp_instance = await create_instance( + session=session, + project=project, + backend=BackendType.GCP, + region="us-1", + availability_zone="us-1b", + ) + instances = [aws_instance_1, aws_instance_2, gcp_instance] + volume = get_volume( + configuration=get_volume_configuration(backend=BackendType.AWS, region="us-1"), + provisioning_data=get_volume_provisioning_data( + backend=BackendType.AWS, availability_zone="us-1b" + ), + ) + res = instances_services.filter_instances( + instances=instances, + profile=Profile(name="test"), + volumes=[[volume]], + ) + assert res == [aws_instance_2] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_volume_instances_without_region(self, test_db, session: AsyncSession): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + aws_instance = await create_instance( + session=session, + project=project, + backend=BackendType.AWS, + ) + # Kubernetes does not support "create instance" feature, but for the sake of this test + # it does not matter + kubernetes_instance = await create_instance( + session=session, + project=project, + backend=BackendType.KUBERNETES, + ) + instances = [aws_instance, kubernetes_instance] + volume = get_volume( + configuration=get_kubernetes_volume_configuration(), + provisioning_data=get_volume_provisioning_data( + backend=BackendType.KUBERNETES, availability_zone=None + ), + ) + res = instances_services.filter_instances( + instances=instances, + profile=Profile(name="test"), + volumes=[[volume]], + ) + assert res == [kubernetes_instance] + + +class TestSelectInstancesBySelectors: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_selects_by_instance_name(self, test_db, session: AsyncSession): + project = await create_project(session=session) + fleet = await create_fleet(session=session, 
project=project) + await create_instance(session=session, project=project, fleet=fleet, name="worker-0") + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + name="worker-1", + ) + + res = await instances_services.select_instances_by_selectors( + session=session, + project=project, + selectors=[InstanceNameSelector(name="worker-1")], + ) + + assert res == [instance] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_selects_by_cloud_hostname_and_internal_ip(self, test_db, session: AsyncSession): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + job_provisioning_data=get_job_provisioning_data( + hostname="203.0.113.8", + internal_ip="10.0.0.8", + ), + ) + + res = await instances_services.select_instances_by_selectors( + session=session, + project=project, + selectors=[InstanceHostnameSelector(hostname="10.0.0.8")], + ) + + assert res == [instance] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_selects_by_ssh_host(self, test_db, session: AsyncSession): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + remote_connection_info=get_remote_connection_info(host="192.168.1.11"), + ) + + res = await instances_services.select_instances_by_selectors( + session=session, + project=project, + selectors=[InstanceHostnameSelector(hostname="192.168.1.11")], + ) + + assert res == [instance] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + ("selector", "expected_name"), + [ + ("same-fleet", "local-worker"), + 
("exporter-project/same-fleet", "exported-worker"), + ], + ) + async def test_fleet_instance_selector_respects_project_reference( + self, + test_db, + session: AsyncSession, + selector: str, + expected_name: str, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user, name="importer-project") + exporter_project = await create_project( + session=session, owner=user, name="exporter-project" + ) + local_fleet = await create_fleet(session=session, project=project, name="same-fleet") + exported_fleet = await create_fleet( + session=session, project=exporter_project, name="same-fleet" + ) + await create_instance( + session=session, + project=project, + fleet=local_fleet, + instance_num=1, + name="local-worker", + ) + await create_instance( + session=session, + project=exporter_project, + fleet=exported_fleet, + instance_num=1, + name="exported-worker", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[project], + exported_fleets=[exported_fleet], + ) + + res = await instances_services.select_instances_by_selectors( + session=session, + project=project, + selectors=[FleetInstanceSelector(fleet=selector, instance=1)], + ) + + assert [instance.name for instance in res] == [expected_name] + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +@pytest.mark.usefixtures("turn_off_keep_shim_tasks_setting") +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestRemoveDanglingTasks: + @pytest.fixture + def turn_off_keep_shim_tasks_setting(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("dstack._internal.server.settings.SERVER_KEEP_SHIM_TASKS", False) + + async def test_terminates_and_removes_dangling_tasks( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session) + instance = await create_instance( + 
session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + instance=instance, + ) + dangling_task_id_1 = "fe138b77-d0b1-49d3-8c9f-2dfe78ece727" + dangling_task_id_2 = "8b016a75-41de-44f1-91ff-c9b63d2caa1d" + shim_client_mock = Mock(spec_set=ShimClient) + shim_client_mock.is_api_v2_supported.return_value = True + shim_client_mock.list_tasks.return_value = TaskListResponse( + tasks=[ + TaskListItem(id=str(job.id), status=TaskStatus.RUNNING), + TaskListItem(id=dangling_task_id_1, status=TaskStatus.RUNNING), + TaskListItem(id=dangling_task_id_2, status=TaskStatus.TERMINATED), + ] + ) + await session.refresh(instance, attribute_names=["jobs"]) + + instances_services.remove_dangling_tasks_from_instance(shim_client_mock, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.BUSY + + shim_client_mock.terminate_task.assert_called_once_with( + task_id=dangling_task_id_1, + reason=None, + message=None, + timeout=0, + ) + assert shim_client_mock.remove_task.call_count == 2 + shim_client_mock.remove_task.assert_has_calls( + [call(task_id=dangling_task_id_1), call(task_id=dangling_task_id_2)] + ) + + async def test_terminates_and_removes_dangling_tasks_legacy_shim( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + instance=instance, + ) + 
dangling_task_id_1 = "fe138b77-d0b1-49d3-8c9f-2dfe78ece727" + dangling_task_id_2 = "8b016a75-41de-44f1-91ff-c9b63d2caa1d" + shim_client_mock = Mock(spec_set=ShimClient) + shim_client_mock.is_api_v2_supported.return_value = True + shim_client_mock.list_tasks.return_value = TaskListResponse( + ids=[str(job.id), dangling_task_id_1, dangling_task_id_2] + ) + await session.refresh(instance, attribute_names=["jobs"]) + + instances_services.remove_dangling_tasks_from_instance(shim_client_mock, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.BUSY + + assert shim_client_mock.terminate_task.call_count == 2 + shim_client_mock.terminate_task.assert_has_calls( + [ + call(task_id=dangling_task_id_1, reason=None, message=None, timeout=0), + call(task_id=dangling_task_id_2, reason=None, message=None, timeout=0), + ] + ) + assert shim_client_mock.remove_task.call_count == 2 + shim_client_mock.remove_task.assert_has_calls( + [call(task_id=dangling_task_id_1), call(task_id=dangling_task_id_2)] + ) + + +class TestInstanceModelToInstance: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_converts_instance(self, test_db, session: AsyncSession): + project = await create_project( + session=session, + name="test_project", + ) + instance_id = uuid.uuid4() + created = get_current_datetime() + expected_instance = Instance( + id=instance_id, + project_name=project.name, + backend=BackendType.AWS, + instance_type=InstanceType( + name="instance", resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]) + ), + name="test_instance", + instance_num=0, + hostname="hostname_test", + status=InstanceStatus.PENDING, + unreachable=False, + health_status=HealthStatus.WARNING, + created=created, + region="eu-west-1", + price=1.0, + total_blocks=1, + busy_blocks=0, + ) + im = InstanceModel( + id=instance_id, + created_at=created, + name="test_instance", + instance_num=0, + 
status=InstanceStatus.PENDING, + unreachable=False, + health=HealthStatus.WARNING, + project=project, + job_provisioning_data='{"ssh_proxy":null, "backend":"aws","hostname":"hostname_test","region":"eu-west","price":1.0,"username":"user1","ssh_port":12345,"dockerized":false,"instance_id":"test_instance","instance_type": {"name": "instance", "resources": {"cpus": 1, "memory_mib": 512, "gpus": [], "spot": false, "disk": {"size_mib": 102400}, "description":""}}}', + offer='{"price":1.0, "backend":"aws", "region":"eu-west-1", "availability":"available","instance": {"name": "instance", "resources": {"cpus": 1, "memory_mib": 512, "gpus": [], "spot": false, "disk": {"size_mib": 102400}, "description":""}}}', + total_blocks=1, + busy_blocks=0, + ) + instance = instances_services.instance_model_to_instance(im) + assert instance == expected_instance diff --git a/src/tests/_internal/server/services/test_logs.py b/src/tests/_internal/server/services/test_logs.py index f551e6cb66..06bfca7dea 100644 --- a/src/tests/_internal/server/services/test_logs.py +++ b/src/tests/_internal/server/services/test_logs.py @@ -1,31 +1,45 @@ +import logging +from datetime import datetime, timedelta, timezone from pathlib import Path -from unittest.mock import patch +from typing import List +from unittest.mock import Mock, call from uuid import UUID +import botocore.exceptions import pytest +import pytest_asyncio +from freezegun import freeze_time from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.server import settings -from dstack._internal.server.schemas.runner import LogEvent -from dstack._internal.server.services.logs import write_logs +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.logs import LogEvent, LogEventSource, LogProducer +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as 
RunnerLogEvent +from dstack._internal.server.services.logs.aws import ( + CloudWatchLogStorage, +) +from dstack._internal.server.services.logs.base import LogStorageError +from dstack._internal.server.services.logs.filelog import FileLogStorage from dstack._internal.server.testing.common import create_project -class TestWriteLogs: +class TestFileLogStorage: @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_writes_logs(self, test_db, session: AsyncSession, tmp_path: Path): project = await create_project(session=session) - with patch.object(settings, "SERVER_DIR_PATH", tmp_path): - write_logs( - project=project, - run_name="test_run", - job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), - runner_logs=[ - LogEvent(timestamp=1696586513234234123, message=b"Hello"), - LogEvent(timestamp=1696586513234235123, message=b"World"), - ], - job_logs=[], - ) + log_storage = FileLogStorage(tmp_path) + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Hello"), + RunnerLogEvent(timestamp=1696586513235, message=b"World"), + ], + job_logs=[], + ) runner_log_path = ( tmp_path / "projects" @@ -36,6 +50,2065 @@ async def test_writes_logs(self, test_db, session: AsyncSession, tmp_path: Path) / "runner.log" ) assert runner_log_path.read_text() == ( - '{"timestamp": "2023-10-06T10:01:53.234234+00:00", "log_source": "stdout", "message": "SGVsbG8="}\n' - '{"timestamp": "2023-10-06T10:01:53.234235+00:00", "log_source": "stdout", "message": "V29ybGQ="}\n' + '{"timestamp":"2023-10-06T10:01:53.234000+00:00","log_source":"stdout","message":"Hello"}\n' + '{"timestamp":"2023-10-06T10:01:53.235000+00:00","log_source":"stdout","message":"World"}\n' + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_poll_logs_basic(self, test_db, session: AsyncSession, tmp_path: Path): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + ], + job_logs=[], ) + + # Test basic polling without pagination + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 3 + assert job_submission_logs.next_token is None # No more logs, so no next_token + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_with_next_token_pagination( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + RunnerLogEvent(timestamp=1696586513237, message=b"Log4"), + RunnerLogEvent(timestamp=1696586513238, message=b"Log5"), + ], + job_logs=[], + ) + + # First page: get 2 logs + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=2, + diagnose=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + 
assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log1" + assert job_submission_logs.logs[1].message == "Log2" + assert job_submission_logs.next_token == "2" # Next line to read + + # Second page: use next_token + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log3" + assert job_submission_logs.logs[1].message == "Log4" + assert job_submission_logs.next_token == "4" # Next line to read + + # Third page: get remaining log + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 1 + assert job_submission_logs.logs[0].message == "Log5" + assert job_submission_logs.next_token is None # No more logs + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_with_start_from_specific_line( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + ], + job_logs=[], + ) + + # Start from line 1 (second log) + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="1", + limit=10, + diagnose=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log2" 
+ assert job_submission_logs.logs[1].message == "Log3" + assert job_submission_logs.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_invalid_next_token_raises_error( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Test with non-integer next_token + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="invalid", + limit=10, + diagnose=True, + ) + with pytest.raises(ServerClientError): + log_storage.poll_logs(project, poll_request) + + # Test with negative next_token + poll_request.next_token = "-1" + with pytest.raises(ServerClientError): + log_storage.poll_logs(project, poll_request) + + # Test with float next_token + poll_request.next_token = "1.5" + with pytest.raises(ServerClientError): + log_storage.poll_logs(project, poll_request) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_file_not_found_raises_no_error( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Test with non-existent log file + poll_request = PollLogsRequest( + run_name="nonexistent_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + ) + log_storage.poll_logs(project, poll_request) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_with_time_filtering_and_pagination( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs with different timestamps + log_storage.write_logs( + 
project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent( + timestamp=1696586513234, message=b"Log1" + ), # 2023-10-06T10:01:53.234 + RunnerLogEvent( + timestamp=1696586513235, message=b"Log2" + ), # 2023-10-06T10:01:53.235 + RunnerLogEvent( + timestamp=1696586513236, message=b"Log3" + ), # 2023-10-06T10:01:53.236 + RunnerLogEvent( + timestamp=1696586513237, message=b"Log4" + ), # 2023-10-06T10:01:53.237 + ], + job_logs=[], + ) + + # Filter logs after 2023-10-06T10:01:53.235 with pagination + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=datetime(2023, 10, 6, 10, 1, 53, 235000, timezone.utc), + limit=1, + diagnose=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should get Log3 first (timestamp > 235) + assert len(job_submission_logs.logs) == 1 + assert job_submission_logs.logs[0].message == "Log3" + assert job_submission_logs.next_token == "3" + + # Get next page + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should get Log4 + assert len(job_submission_logs.logs) == 1 + assert job_submission_logs.logs[0].message == "Log4" + # Should not have next_token since we reached end of file + assert job_submission_logs.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_empty_file_returns_empty_list( + self, test_db, session: AsyncSession, tmp_path: Path + ): + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Create empty log file + log_file_path = ( + tmp_path + / "projects" + / project.name + / "logs" + / "test_run" + / "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e" + / "runner.log" + ) + log_file_path.parent.mkdir(parents=True, exist_ok=True) + 
log_file_path.write_text("") + + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 0 + assert job_submission_logs.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_next_token_pagination_complete_workflow( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test complete pagination workflow using next_token""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write 10 logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513000 + i, message=f"Log{i + 1}".encode()) + for i in range(10) + ], + job_logs=[], + ) + + # First page: get 3 logs + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=3, + diagnose=True, + ) + page1 = log_storage.poll_logs(project, poll_request) + + assert len(page1.logs) == 3 + assert page1.logs[0].message == "Log1" + assert page1.logs[1].message == "Log2" + assert page1.logs[2].message == "Log3" + assert page1.next_token == "3" # Next line to read + + # Second page: use next_token + poll_request.next_token = page1.next_token + page2 = log_storage.poll_logs(project, poll_request) + + assert len(page2.logs) == 3 + assert page2.logs[0].message == "Log4" + assert page2.logs[1].message == "Log5" + assert page2.logs[2].message == "Log6" + assert page2.next_token == "6" + + # Third page: get more logs + poll_request.next_token = page2.next_token + page3 = log_storage.poll_logs(project, poll_request) + + assert len(page3.logs) == 3 + assert page3.logs[0].message == "Log7" + assert 
page3.logs[1].message == "Log8" + assert page3.logs[2].message == "Log9" + assert page3.next_token == "9" + + # Fourth page: get last log + poll_request.next_token = page3.next_token + page4 = log_storage.poll_logs(project, poll_request) + + assert len(page4.logs) == 1 + assert page4.logs[0].message == "Log10" + assert page4.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_next_token_with_time_filtering( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test next_token behavior with time filtering""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write logs with different timestamps + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513000, message=b"Log1"), # Before filter + RunnerLogEvent(timestamp=1696586513100, message=b"Log2"), # Before filter + RunnerLogEvent(timestamp=1696586513200, message=b"Log3"), # After filter + RunnerLogEvent(timestamp=1696586513300, message=b"Log4"), # After filter + RunnerLogEvent(timestamp=1696586513400, message=b"Log5"), # After filter + ], + job_logs=[], + ) + + # Filter logs after timestamp 150 with pagination + start_time = datetime.fromtimestamp(1696586513.150, tz=timezone.utc) + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=start_time, + limit=2, + diagnose=True, + ) + + page1 = log_storage.poll_logs(project, poll_request) + assert len(page1.logs) == 2 + assert page1.logs[0].message == "Log3" + assert page1.logs[1].message == "Log4" + assert page1.next_token == "4" + + # Get next page + poll_request.next_token = page1.next_token + page2 = log_storage.poll_logs(project, poll_request) + assert len(page2.logs) == 1 + assert page2.logs[0].message == "Log5" + 
assert page2.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_next_token_edge_cases(self, test_db, session: AsyncSession, tmp_path: Path): + """Test edge cases for next_token behavior""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write exactly one log + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513000, message=b"OnlyLog"), + ], + job_logs=[], + ) + + # Request with limit higher than available logs + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + ) + result = log_storage.poll_logs(project, poll_request) + + assert len(result.logs) == 1 + assert result.logs[0].message == "OnlyLog" + assert result.next_token is None + + # Request with limit equal to available logs + poll_request.limit = 1 + result = log_storage.poll_logs(project, poll_request) + + assert len(result.logs) == 1 + assert result.logs[0].message == "OnlyLog" + assert result.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_next_token_beyond_file_end( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test next_token that points beyond the end of file""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write 3 logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513000, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513100, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513200, message=b"Log3"), + ], + job_logs=[], + ) + + # Use 
next_token that points beyond the file + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="10", # Points beyond the 3 logs in file + limit=5, + diagnose=True, + ) + result = log_storage.poll_logs(project, poll_request) + + assert len(result.logs) == 0 + assert result.next_token is None + + +class TestPollLogsRequestValidation: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_basic( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test basic descending log polling functionality.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + RunnerLogEvent(timestamp=1696586513237, message=b"Log4"), + RunnerLogEvent(timestamp=1696586513238, message=b"Log5"), + ], + job_logs=[], + ) + + # Test descending polling + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return logs in descending order (newest first) + assert len(job_submission_logs.logs) == 5 + assert job_submission_logs.logs[0].message == "Log5" + assert job_submission_logs.logs[1].message == "Log4" + assert job_submission_logs.logs[2].message == "Log3" + assert job_submission_logs.logs[3].message == "Log2" + assert job_submission_logs.logs[4].message == "Log1" + assert job_submission_logs.next_token is None # All logs returned + + 
@pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_with_limit( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test descending log polling with limit smaller than total logs.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + RunnerLogEvent(timestamp=1696586513237, message=b"Log4"), + RunnerLogEvent(timestamp=1696586513238, message=b"Log5"), + ], + job_logs=[], + ) + + # Test with limit smaller than total logs + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=3, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return only the last 3 logs in descending order + assert len(job_submission_logs.logs) == 3 + assert job_submission_logs.logs[0].message == "Log5" + assert job_submission_logs.logs[1].message == "Log4" + assert job_submission_logs.logs[2].message == "Log3" + # Should have next_token for pagination + assert job_submission_logs.next_token is not None + + # Test next page + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return remaining logs in descending order + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log2" + assert job_submission_logs.logs[1].message == "Log1" + assert job_submission_logs.next_token is None # No more logs + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_with_time_filtering( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test descending log polling with time filtering.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs with different timestamps + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent( + timestamp=1696586513234, message=b"Log1" + ), # 2023-10-06T10:01:53.234 + RunnerLogEvent( + timestamp=1696586513235, message=b"Log2" + ), # 2023-10-06T10:01:53.235 + RunnerLogEvent( + timestamp=1696586513236, message=b"Log3" + ), # 2023-10-06T10:01:53.236 + RunnerLogEvent( + timestamp=1696586513237, message=b"Log4" + ), # 2023-10-06T10:01:53.237 + RunnerLogEvent( + timestamp=1696586513238, message=b"Log5" + ), # 2023-10-06T10:01:53.238 + ], + job_logs=[], + ) + + # Filter logs between 2023-10-06T10:01:53.235 and 2023-10-06T10:01:53.237 + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=datetime(2023, 10, 6, 10, 1, 53, 235000, timezone.utc), + end_time=datetime(2023, 10, 6, 10, 1, 53, 237000, timezone.utc), + limit=10, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return logs in descending order within the time range + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log4" # timestamp 237 + assert job_submission_logs.logs[1].message == "Log3" # timestamp 236 + assert job_submission_logs.next_token is None # No more logs in range + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_invalid_next_token( + self, test_db, session: 
AsyncSession, tmp_path: Path + ): + """Test descending log polling with invalid next_token.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Test with non-integer next_token + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="invalid", + limit=10, + diagnose=True, + descending=True, + ) + with pytest.raises(ServerClientError): + log_storage.poll_logs(project, poll_request) + + # Test with negative next_token + poll_request.next_token = "-1" + with pytest.raises(ServerClientError): + log_storage.poll_logs(project, poll_request) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_empty_file( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test descending log polling with empty log file.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Test with non-existent log file + poll_request = PollLogsRequest( + run_name="nonexistent_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return empty logs without error + assert len(job_submission_logs.logs) == 0 + assert job_submission_logs.next_token is None + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_pagination_workflow( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test complete descending pagination workflow.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Write test logs + log_storage.write_logs( + project=project, + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + 
RunnerLogEvent(timestamp=1696586513234, message=b"Log1"), + RunnerLogEvent(timestamp=1696586513235, message=b"Log2"), + RunnerLogEvent(timestamp=1696586513236, message=b"Log3"), + RunnerLogEvent(timestamp=1696586513237, message=b"Log4"), + RunnerLogEvent(timestamp=1696586513238, message=b"Log5"), + ], + job_logs=[], + ) + + # First page: get last 2 logs + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=2, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log5" + assert job_submission_logs.logs[1].message == "Log4" + assert job_submission_logs.next_token is not None + + # Second page: get next 2 logs + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 2 + assert job_submission_logs.logs[0].message == "Log3" + assert job_submission_logs.logs[1].message == "Log2" + assert job_submission_logs.next_token is not None + + # Third page: get remaining log + poll_request.next_token = job_submission_logs.next_token + job_submission_logs = log_storage.poll_logs(project, poll_request) + + assert len(job_submission_logs.logs) == 1 + assert job_submission_logs.logs[0].message == "Log1" + assert job_submission_logs.next_token is None # No more logs + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_descending_malformed_lines( + self, test_db, session: AsyncSession, tmp_path: Path + ): + """Test descending log polling with malformed log lines.""" + project = await create_project(session=session) + log_storage = FileLogStorage(tmp_path) + + # Create log file with malformed lines + log_file_path = log_storage._get_log_file_path( + project_name=project.name, + 
run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + producer=LogProducer.RUNNER, + ) + log_file_path.parent.mkdir(exist_ok=True, parents=True) + + with open(log_file_path, "w") as f: + f.write( + '{"timestamp": "2023-10-06T10:01:53.234Z", "log_source": "stdout", "message": "Log1"}\n' + ) + f.write("invalid json line\n") + f.write( + '{"timestamp": "2023-10-06T10:01:53.235Z", "log_source": "stdout", "message": "Log2"}\n' + ) + f.write("another invalid line\n") + f.write( + '{"timestamp": "2023-10-06T10:01:53.236Z", "log_source": "stdout", "message": "Log3"}\n' + ) + + # Test descending polling + poll_request = PollLogsRequest( + run_name="test_run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=10, + diagnose=True, + descending=True, + ) + job_submission_logs = log_storage.poll_logs(project, poll_request) + + # Should return only valid logs in descending order, skipping malformed lines + assert len(job_submission_logs.logs) == 3 + assert job_submission_logs.logs[0].message == "Log3" + assert job_submission_logs.logs[1].message == "Log2" + assert job_submission_logs.logs[2].message == "Log1" + + +class TestCloudWatchLogStorage: + FAKE_NOW = datetime(2023, 10, 6, 10, 1, 54, tzinfo=timezone.utc) + + @pytest_asyncio.fixture + @freeze_time(FAKE_NOW) + async def project(self, test_db, session: AsyncSession) -> ProjectModel: + project = await create_project(session=session, name="test-proj") + return project + + @pytest.fixture + def mock_client(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock() + monkeypatch.setattr("boto3.Session.client", Mock(return_value=mock)) + mock.get_log_events.return_value = { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd", + } + return mock + + @pytest.fixture + def log_storage(self, mock_client: Mock) -> CloudWatchLogStorage: + return CloudWatchLogStorage(group="test-group") + + @pytest.fixture + def mock_ensure_stream_exists(self, 
monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock() + monkeypatch.setattr(CloudWatchLogStorage, "_ensure_stream_exists", mock) + return mock + + @pytest.fixture + def poll_logs_request(self) -> PollLogsRequest: + return PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=None, + end_time=None, + limit=100, + ) + + def test_init_error_client_instantiation_exception(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr( + "boto3.Session.client", Mock(side_effect=botocore.exceptions.NoRegionError) + ) + with pytest.raises(LogStorageError, match="NoRegionError"): + CloudWatchLogStorage(group="test-group") + + def test_init_error_client_request_error(self, mock_client: Mock): + mock_client.describe_log_streams.side_effect = botocore.exceptions.ClientError({}, "name") + with pytest.raises(LogStorageError, match="ClientError"): + CloudWatchLogStorage(group="test-group") + + def test_init_error_group_not_found(self, mock_client: Mock): + mock_client.describe_log_streams.side_effect = botocore.exceptions.ClientError( + {"Error": {"Code": "ResourceNotFoundException"}}, "op_name" + ) + with pytest.raises(LogStorageError, match=r"'test-group' does not exist"): + CloudWatchLogStorage(group="test-group") + + def test_ensure_stream_exists_new(self, log_storage: CloudWatchLogStorage, mock_client: Mock): + mock_client.describe_log_streams.reset_mock() + mock_client.describe_log_streams.return_value = { + "logStreams": [{"logStreamName": "test-stream-1"}] + } + log_storage._ensure_stream_exists("test-stream") + + assert "test-stream" in log_storage._streams + mock_client.describe_log_streams.assert_called_once_with( + logGroupName="test-group", logStreamNamePrefix="test-stream" + ) + mock_client.create_log_stream.assert_called_once_with( + logGroupName="test-group", logStreamName="test-stream" + ) + + def test_ensure_stream_exists_existing( + self, log_storage: CloudWatchLogStorage, mock_client: Mock + 
): + mock_client.describe_log_streams.reset_mock() + mock_client.describe_log_streams.return_value = { + "logStreams": [{"logStreamName": "test-stream"}] + } + log_storage._ensure_stream_exists("test-stream") + + assert "test-stream" in log_storage._streams + mock_client.describe_log_streams.assert_called_once_with( + logGroupName="test-group", logStreamNamePrefix="test-stream" + ) + mock_client.create_log_stream.assert_not_called() + + def test_ensure_stream_exists_cached( + self, log_storage: CloudWatchLogStorage, mock_client: Mock + ): + mock_client.describe_log_streams.reset_mock() + log_storage._streams.add("test-stream") + log_storage._ensure_stream_exists("test-stream") + + mock_client.describe_log_streams.assert_not_called() + mock_client.create_log_stream.assert_not_called() + + def test_ensure_stream_exists_cached_forced( + self, log_storage: CloudWatchLogStorage, mock_client: Mock + ): + mock_client.describe_log_streams.reset_mock() + mock_client.describe_log_streams.return_value = {"logStreams": []} + log_storage._streams.add("test-stream") + log_storage._ensure_stream_exists("test-stream", force=True) + + assert "test-stream" in log_storage._streams + mock_client.describe_log_streams.assert_called_once_with( + logGroupName="test-group", logStreamNamePrefix="test-stream" + ) + mock_client.create_log_stream.assert_called_once_with( + logGroupName="test-group", logStreamName="test-stream" + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("descending", [False, True]) + async def test_poll_logs_empty_response( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + descending: bool, + ): + # Test with no next token - should not trigger retrying + mock_client.get_log_events.return_value = { + "events": [], + "nextBackwardToken": None, # No next token + "nextForwardToken": None, # No next token + } + poll_logs_request.descending = descending + job_submission_logs = 
log_storage.poll_logs(project, poll_logs_request) + + assert job_submission_logs.logs == [] + # When no next token is provided initially, retrying doesn't trigger + assert mock_client.get_log_events.call_count == 1 + + @pytest.mark.asyncio + async def test_poll_logs_descending_empty_response_max_tries( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Test that we retry up to MAX_RETRIES times when getting empty responses with changing tokens + # Need to provide exactly 10 responses for MAX_RETRIES + mock_client.get_log_events.side_effect = [ + { + "events": [], + "nextBackwardToken": "bwd1", + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd2", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd3", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd4", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd5", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd6", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd7", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd8", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd9", # Different token + "nextForwardToken": "fwd", + }, + { + "events": [], + "nextBackwardToken": "bwd10", # Different token + "nextForwardToken": "fwd", + }, + ] + poll_logs_request.descending = True + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + assert job_submission_logs.logs == [] + # For descending requests, we return the next token even when no logs found + assert job_submission_logs.next_token == "bwd10" + assert mock_client.get_log_events.call_count == 10 # 
MAX_RETRIES + + @pytest.mark.asyncio + async def test_poll_logs_ascending_empty_response_max_tries( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Test that for ascending requests, we return None next_token when no logs found after max retries + # Need to provide exactly 10 responses for MAX_RETRIES + mock_client.get_log_events.side_effect = [ + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd1", + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd2", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd3", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd4", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd5", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd6", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd7", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd8", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd9", # Different token + }, + { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd10", # Different token + }, + ] + poll_logs_request.descending = False + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + assert job_submission_logs.logs == [] + # For ascending requests, we return None when no logs found after max retries + assert job_submission_logs.next_token is None + assert mock_client.get_log_events.call_count == 10 # MAX_RETRIES + + @pytest.mark.asyncio + async def test_poll_logs_request_params_asc_no_diag_no_dates( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + 
poll_logs_request: PollLogsRequest, + ): + # Ensure response has events to avoid retrying + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd", + } + poll_logs_request.descending = False + poll_logs_request.limit = 5 + poll_logs_request.diagnose = False + log_storage.poll_logs(project, poll_logs_request) + assert mock_client.get_log_events.call_count == 1 + mock_client.get_log_events.assert_called_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job", + limit=5, + startFromHead=True, # For ascending requests + endTime=mock_client.get_log_events.call_args.kwargs["endTime"], # endTime is auto-set + ) + + @pytest.mark.asyncio + async def test_poll_logs_request_params_desc_diag_with_dates( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Ensure the response has events to avoid retrying + mock_client.get_log_events.return_value = { + "events": [{"timestamp": 1696586513234, "message": "SGVsbG8="}], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd", + } + poll_logs_request.start_time = datetime( + 2023, 10, 6, 10, 1, 53, 234000, tzinfo=timezone.utc + ) + poll_logs_request.end_time = datetime(2023, 10, 7, 10, 1, 53, 234000, tzinfo=timezone.utc) + poll_logs_request.descending = True + poll_logs_request.limit = 10 + poll_logs_request.diagnose = True + log_storage.poll_logs(project, poll_logs_request) + assert mock_client.get_log_events.call_count == 1 + mock_client.get_log_events.assert_called_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner", + limit=10, + startFromHead=False, # For descending requests + startTime=1696586513234, # start_time (no +1ms increment) + endTime=1696672913234, + ) + + @pytest.mark.asyncio + async def 
test_poll_logs_exception_resource_not_found( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + mock_client.get_log_events.side_effect = botocore.exceptions.ClientError( + {"Error": {"Code": "ResourceNotFoundException"}}, "op_name" + ) + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + assert job_submission_logs.logs == [] + + @pytest.mark.asyncio + async def test_poll_logs_exception_other( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + mock_client.get_log_events.side_effect = botocore.exceptions.ClientError( + {"Error": {"Code": "SomeError"}}, "op_name" + ) + with pytest.raises(LogStorageError, match="ClientError"): + log_storage.poll_logs(project, poll_logs_request) + + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Hello"), + ], + job_logs=[ + RunnerLogEvent(timestamp=1696586513235, message=b"World"), + ], + ) + + expected_runner_stream = "test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner" + expected_job_stream = "test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job" + expected_ensure_stream_exists_calls = [ + call(expected_runner_stream), + call(expected_job_stream), + ] + expected_put_log_events_calls = [ + call( + logGroupName="test-group", + logStreamName=expected_runner_stream, + logEvents=[ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + ), + call( + logGroupName="test-group", + logStreamName=expected_job_stream, + logEvents=[ + {"timestamp": 1696586513235, "message": 
"World"}, + ], + ), + ] + + assert mock_ensure_stream_exists.call_count == 2 + mock_ensure_stream_exists.assert_has_calls( + expected_ensure_stream_exists_calls, any_order=True + ) + + assert mock_client.put_log_events.call_count == 2 + mock_client.put_log_events.assert_has_calls(expected_put_log_events_calls, any_order=True) + + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_resource_not_found( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + mock_client.put_log_events.side_effect = [ + # First call ­-- exception + botocore.exceptions.ClientError( + {"Error": {"Code": "ResourceNotFoundException"}}, "op_name" + ), + # Second call -- OK, stream has been recreated + None, + ] + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Hello"), + ], + job_logs=[], + ) + assert mock_ensure_stream_exists.call_count == 2 + mock_ensure_stream_exists.assert_has_calls( + [ + call("test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner"), + call("test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner", force=True), + ] + ) + assert mock_client.put_log_events.call_count == 2 + + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_other_exception( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + mock_ensure_stream_exists.side_effect = botocore.exceptions.ConnectionError(error="err") + with pytest.raises(LogStorageError, match="ConnectionError"): + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Hello"), + ], + job_logs=[], + ) + + @pytest.mark.asyncio + 
@freeze_time(FAKE_NOW) + async def test_write_logs_not_in_chronological_order( + self, + caplog: pytest.LogCaptureFixture, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + caplog.set_level(logging.ERROR) + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513235, message=b"1"), + RunnerLogEvent(timestamp=1696586513237, message=b"3"), + RunnerLogEvent(timestamp=1696586513237, message=b"4"), + RunnerLogEvent(timestamp=1696586513236, message=b"2"), + RunnerLogEvent(timestamp=1696586513237, message=b"5"), + ], + job_logs=[], + ) + + mock_client.put_log_events.assert_called_once_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner", + logEvents=[ + {"timestamp": 1696586513235, "message": "1"}, + {"timestamp": 1696586513236, "message": "2"}, + {"timestamp": 1696586513237, "message": "3"}, + {"timestamp": 1696586513237, "message": "4"}, + {"timestamp": 1696586513237, "message": "5"}, + ], + ) + assert "events are not in chronological order" in caplog.text + + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_past_and_future_events( + self, + caplog: pytest.LogCaptureFixture, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + def _delta_ms(**kwargs: int) -> int: + return int(timedelta(**kwargs).total_seconds() * 1000) + + timestamp = int(self.FAKE_NOW.timestamp() * 1000) + + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=timestamp - _delta_ms(days=14), message=b"skipped"), + RunnerLogEvent(timestamp=timestamp - _delta_ms(days=13, hours=23), message=b"1"), + 
RunnerLogEvent(timestamp=timestamp, message=b"2"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(minutes=90), message=b"3"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(minutes=115), message=b"skipped"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=2), message=b"skipped"), + ], + job_logs=[], + ) + + assert "skipping 1 past event(s)" in caplog.text + assert "skipping 2 future event(s)" in caplog.text + actual = [ + e["message"] + for c in mock_client.put_log_events.call_args_list + for e in c.kwargs["logEvents"] + ] + assert actual == ["1", "2", "3"] + + @pytest.mark.parametrize( + ["messages", "expected"], + [ + # `messages` is a concatenated list for better readability — each list is a batch + # `expected` is a list of lists, each nested list is a batch. + [ + ["", "toolong"], + [], + ], + [ + ["111", "toolong", "111"] + ["222222"] + ["333"], + [["111", "111"], ["222222"], ["333"]], + ], + [ + ["111", "111"] + ["222", "222"], + [["111", "111"], ["222", "222"]], + ], + [ + ["111", "111"] + ["222"], + [["111", "111"], ["222"]], + ], + [ + ["111"] + ["222222"] + ["333", "333"], + [["111"], ["222222"], ["333", "333"]], + ], + ], + ) + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_batching_by_size( + self, + monkeypatch: pytest.MonkeyPatch, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + messages: List[str], + expected: List[List[str]], + ): + # maximum 6 bytes: 6 (raw bytes) + 26 (overhead) = 32 + monkeypatch.setattr(CloudWatchLogStorage, "MESSAGE_MAX_SIZE", 32) + monkeypatch.setattr(CloudWatchLogStorage, "BATCH_MAX_SIZE", 60) + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=message.encode()) + for message in messages + ], + job_logs=[], + ) + assert mock_client.put_log_events.call_count == 
len(expected) + actual = [ + [e["message"] for e in c.kwargs["logEvents"]] + for c in mock_client.put_log_events.call_args_list + ] + assert actual == expected + + @pytest.mark.parametrize( + ["messages", "expected"], + [ + # `messages` is a concatenated list for better readability — each list is a batch + # `expected` is a list of lists, each nested list is a batch. + [ + ["111", "111", "111"] + ["222"], + [["111", "111", "111"], ["222"]], + ], + [ + ["111", "111", "111"] + ["222", "222", "toolongtoolong", "", "222222"], + [["111", "111", "111"], ["222", "222", "222222"]], + ], + ], + ) + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_batching_by_count( + self, + monkeypatch: pytest.MonkeyPatch, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + messages: List[str], + expected: List[List[str]], + ): + # maximum 6 bytes: 6 (raw bytes) + 26 (overhead) = 32 + monkeypatch.setattr(CloudWatchLogStorage, "MESSAGE_MAX_SIZE", 32) + monkeypatch.setattr(CloudWatchLogStorage, "EVENT_MAX_COUNT_IN_BATCH", 3) + log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=message.encode()) + for message in messages + ], + job_logs=[], + ) + assert mock_client.put_log_events.call_count == len(expected) + actual = [ + [e["message"] for e in c.kwargs["logEvents"]] + for c in mock_client.put_log_events.call_args_list + ] + assert actual == expected + + @pytest.mark.asyncio + @freeze_time(FAKE_NOW) + async def test_write_logs_batching_by_timestamp( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + mock_ensure_stream_exists: Mock, + ): + def _delta_ms(**kwargs: int) -> int: + return int(timedelta(**kwargs).total_seconds() * 1000) + + timestamp = int(self.FAKE_NOW.timestamp() * 1000) - _delta_ms(days=3) + + 
log_storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + # empty message, should be ignored + RunnerLogEvent(timestamp=timestamp - _delta_ms(days=1), message=b""), + # first batch + RunnerLogEvent(timestamp=timestamp, message=b"1"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=23), message=b"2"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=24), message=b"3"), + # second batch + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=24, seconds=1), message=b"4"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=30), message=b"5"), + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=48), message=b"6"), + # third batch + RunnerLogEvent(timestamp=timestamp + _delta_ms(hours=50), message=b"7"), + ], + job_logs=[], + ) + + expected = [["1", "2", "3"], ["4", "5", "6"], ["7"]] + assert mock_client.put_log_events.call_count == len(expected) + actual = [ + [e["message"] for e in c.kwargs["logEvents"]] + for c in mock_client.put_log_events.call_args_list + ] + assert actual == expected + + @pytest.mark.asyncio + async def test_poll_logs_non_empty_response( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + mock_client.get_log_events.return_value["events"] = [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ] + poll_logs_request.limit = 2 + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + assert job_submission_logs.logs == [ + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 234000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="Hello", + ), + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 235000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="World", + ), + ] + + @pytest.mark.asyncio + async def 
test_poll_logs_descending_non_empty_response_on_first_call( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Ensure response has events to avoid retrying + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ], + "nextBackwardToken": "bwd456", + "nextForwardToken": "fwd", + } + poll_logs_request.descending = True + poll_logs_request.limit = 2 + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + # Events should be reversed for descending order + assert job_submission_logs.logs == [ + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 235000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="World", + ), + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 234000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="Hello", + ), + ] + # Should return nextBackwardToken for descending requests + assert job_submission_logs.next_token == "bwd456" + assert mock_client.get_log_events.call_count == 1 + + @pytest.mark.asyncio + async def test_next_token_ascending_pagination( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test next_token behavior for ascending pagination""" + # Setup response with nextForwardToken + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd123", + } + + poll_logs_request.descending = False + poll_logs_request.limit = 2 + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 2 + assert result.next_token == "fwd123" # Should return nextForwardToken + + # Verify API was called with correct 
parameters + mock_client.get_log_events.assert_called_once_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job", + limit=2, + startFromHead=True, # For ascending requests + endTime=mock_client.get_log_events.call_args.kwargs["endTime"], # endTime is auto-set + ) + + @pytest.mark.asyncio + async def test_next_token_descending_pagination( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test next_token behavior for descending pagination""" + # Setup response with nextBackwardToken + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ], + "nextBackwardToken": "bwd456", + "nextForwardToken": "fwd", + } + + poll_logs_request.descending = True + poll_logs_request.limit = 2 + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 2 + # Events should be reversed for descending order + assert result.logs[0].message == "World" + assert result.logs[1].message == "Hello" + assert result.next_token == "bwd456" # Should return nextBackwardToken + + # Verify API was called with correct parameters + mock_client.get_log_events.assert_called_once_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job", + limit=2, + startFromHead=False, # For descending requests + ) + + @pytest.mark.asyncio + async def test_next_token_provided_in_request( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test that provided next_token is passed to CloudWatch API""" + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + "nextBackwardToken": "bwd", + "nextForwardToken": "new_fwd", + } + + 
poll_logs_request.next_token = "existing_token_123" + poll_logs_request.descending = False + poll_logs_request.limit = 1 + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 1 + assert result.next_token == "new_fwd" + + # Verify API was called with the provided next_token + mock_client.get_log_events.assert_called_once_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job", + limit=1, + startFromHead=True, + nextToken="existing_token_123", + endTime=mock_client.get_log_events.call_args.kwargs["endTime"], + ) + + @pytest.mark.asyncio + async def test_next_token_none_when_no_logs( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test that next_token is None when no logs are returned""" + # Test with no next token initially - should not trigger retrying + mock_client.get_log_events.return_value = { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd", + } + + poll_logs_request.limit = 10 + poll_logs_request.descending = False + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 0 + assert result.next_token is None # Should be None when no logs returned + + # Test descending behavior with no next token initially + poll_logs_request.descending = True + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 0 + # For descending requests with no initial next token, we return None + assert result.next_token is None + + @pytest.mark.asyncio + async def test_next_token_with_time_filtering( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test next_token behavior with time filtering""" + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + 
"nextBackwardToken": "bwd_with_time", + "nextForwardToken": "fwd_with_time", + } + + poll_logs_request.start_time = datetime(2023, 10, 6, 10, 1, 53, 234000, timezone.utc) + poll_logs_request.end_time = datetime(2023, 10, 7, 10, 1, 53, 234000, timezone.utc) + poll_logs_request.next_token = "time_token" + poll_logs_request.descending = True + poll_logs_request.diagnose = True + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 1 + assert result.next_token == "bwd_with_time" + + # Verify API was called with time filters and next_token + mock_client.get_log_events.assert_called_once_with( + logGroupName="test-group", + logStreamName="test-proj/test-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/runner", + limit=100, + startFromHead=False, + startTime=1696586513234, # start_time (no +1ms increment) + endTime=1696672913234, + nextToken="time_token", + ) + + @pytest.mark.asyncio + async def test_next_token_missing_in_cloudwatch_response( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test behavior when CloudWatch doesn't return next tokens""" + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + # No nextBackwardToken or nextForwardToken in response + } + + poll_logs_request.descending = False + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 1 + assert result.next_token is None # Should be None when no token in response + + @pytest.mark.asyncio + async def test_next_token_empty_string_in_cloudwatch_response( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test behavior when CloudWatch returns empty string tokens""" + mock_client.get_log_events.return_value = { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + ], + "nextBackwardToken": 
"", + "nextForwardToken": "", + } + + poll_logs_request.descending = False + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 1 + assert result.next_token == "" # Should return empty string if that's what AWS returns + + @pytest.mark.asyncio + async def test_next_token_pagination_workflow( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test complete pagination workflow with next_token""" + # First call - returns some logs with next_token + mock_client.get_log_events.side_effect = [ + { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ], + "nextBackwardToken": "bwd", + "nextForwardToken": "token_page2", + }, + # Second call - returns final logs with next_token + { + "events": [ + {"timestamp": 1696586513236, "message": "!"}, + ], + "nextBackwardToken": "final_bwd", + "nextForwardToken": "final_fwd", + }, + ] + + # First page + poll_logs_request.limit = 2 + poll_logs_request.descending = False + page1 = log_storage.poll_logs(project, poll_logs_request) + + assert len(page1.logs) == 2 + assert page1.logs[0].message == "Hello" + assert page1.logs[1].message == "World" + assert page1.next_token == "token_page2" + + # Second page using next_token + poll_logs_request.next_token = page1.next_token + page2 = log_storage.poll_logs(project, poll_logs_request) + + assert len(page2.logs) == 1 + assert page2.logs[0].message == "!" 
+ assert page2.next_token == "final_fwd" + + # Verify both API calls + assert mock_client.get_log_events.call_count == 2 + + # First call should not have nextToken + first_call = mock_client.get_log_events.call_args_list[0] + assert "nextToken" not in first_call.kwargs + + # Second call should have nextToken + second_call = mock_client.get_log_events.call_args_list[1] + assert second_call.kwargs["nextToken"] == "token_page2" + + @pytest.mark.asyncio + async def test_poll_logs_retrying_multiple_empty_responses( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test retrying behavior when multiple empty responses are returned before finding logs""" + # First 3 calls return empty, 4th call returns events + mock_client.get_log_events.side_effect = [ + { + "events": [], + "nextBackwardToken": "bwd1", + "nextForwardToken": "fwd1", + }, + { + "events": [], + "nextBackwardToken": "bwd2", + "nextForwardToken": "fwd2", + }, + { + "events": [], + "nextBackwardToken": "bwd3", + "nextForwardToken": "fwd3", + }, + { + "events": [ + {"timestamp": 1696586513234, "message": "Hello"}, + {"timestamp": 1696586513235, "message": "World"}, + ], + "nextBackwardToken": "bwd4", + "nextForwardToken": "fwd4", + }, + ] + + poll_logs_request.descending = True + poll_logs_request.limit = 2 + result = log_storage.poll_logs(project, poll_logs_request) + + # Should return events from the 4th call, reversed for descending order + assert len(result.logs) == 2 + assert result.logs[0].message == "World" + assert result.logs[1].message == "Hello" + assert result.next_token == "bwd4" + assert mock_client.get_log_events.call_count == 4 + + @pytest.mark.asyncio + async def test_poll_logs_retrying_with_changing_tokens( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + """Test retrying behavior when tokens change between calls""" + # 
Test that we continue retrying as long as tokens change + mock_client.get_log_events.side_effect = [ + { + "events": [], + "nextBackwardToken": "bwd1", + "nextForwardToken": "fwd1", + }, + { + "events": [], + "nextBackwardToken": "bwd2", # Different token + "nextForwardToken": "fwd2", + }, + { + "events": [ + {"timestamp": 1696586513234, "message": "Found"}, + ], + "nextBackwardToken": "bwd3", + "nextForwardToken": "fwd3", + }, + ] + + poll_logs_request.descending = True + poll_logs_request.limit = 1 + result = log_storage.poll_logs(project, poll_logs_request) + + assert len(result.logs) == 1 + assert result.logs[0].message == "Found" + assert result.next_token == "bwd3" + assert mock_client.get_log_events.call_count == 3 + + @pytest.mark.asyncio + async def test_poll_logs_descending_some_responses_are_empty( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Test retrying logic: first call returns empty, second call returns events + mock_client.get_log_events.side_effect = [ + { + "events": [], + "nextBackwardToken": "bwd1", + "nextForwardToken": "fwd", + }, + { + "events": [ + {"timestamp": 1696586513234, "message": "SGVsbG8="}, + {"timestamp": 1696586513235, "message": "V29ybGQ="}, + ], + "nextBackwardToken": "bwd3", + "nextForwardToken": "fwd", + }, + ] + poll_logs_request.descending = True + poll_logs_request.limit = 3 + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + # Should return events from second call, reversed for descending order + assert job_submission_logs.logs == [ + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 235000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="V29ybGQ=", + ), + LogEvent( + timestamp=datetime(2023, 10, 6, 10, 1, 53, 234000, tzinfo=timezone.utc), + log_source=LogEventSource.STDOUT, + message="SGVsbG8=", + ), + ] + assert job_submission_logs.next_token == "bwd3" + assert 
mock_client.get_log_events.call_count == 2 + + @pytest.mark.asyncio + async def test_poll_logs_descending_empty_response_with_same_token( + self, + project: ProjectModel, + log_storage: CloudWatchLogStorage, + mock_client: Mock, + poll_logs_request: PollLogsRequest, + ): + # Test that when next token doesn't change, we stop retrying + mock_client.get_log_events.return_value = { + "events": [], + "nextBackwardToken": "bwd", + "nextForwardToken": "fwd", + } + poll_logs_request.descending = True + poll_logs_request.next_token = "bwd" # Same as returned token + job_submission_logs = log_storage.poll_logs(project, poll_logs_request) + + assert job_submission_logs.logs == [] + assert job_submission_logs.next_token is None + assert mock_client.get_log_events.call_count == 1 + + +class TestFileLogStorageReadLinesReversed: + # No changes to the first 6 tests, they will now pass. + def test_basic_file(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2\nline3\n" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [ + (b"", 18), + (b"line3", 12), + (b"line2", 6), + (b"line1", 0), + ] + + def test_file_without_trailing_newline(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [ + (b"line2", 6), + (b"line1", 0), + ] + + def test_empty_file(self, tmp_path: Path): + file = tmp_path / "test.txt" + file.touch() + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [] + + def test_single_line_file(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"the only line" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [(b"the only line", 0)] + + def test_file_with_empty_lines(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"lineA\n\nlineC\n" + 
file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [ + (b"", 13), + (b"lineC", 7), + (b"", 6), + (b"lineA", 0), + ] + + def test_file_with_only_newlines(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"\n\n" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file)) + assert lines == [ + (b"", 2), + (b"", 1), + ] + + def test_large_file_spanning_multiple_chunks(self, tmp_path: Path): + file = tmp_path / "large_file.txt" + line_content = b"abcdefghi" # 9 bytes + 1 newline = 10 bytes per line + num_lines = 5 + content = b"\n".join([line_content] * num_lines) + file.write_bytes(content) + # Pass the small chunk_size directly to the method + lines = list(FileLogStorage._read_lines_reversed(file, chunk_size=10)) + assert len(lines) == num_lines + assert lines[0] == (line_content, 40) + assert lines[1] == (line_content, 30) + assert lines[2] == (line_content, 20) + assert lines[3] == (line_content, 10) + assert lines[4] == (line_content, 0) + + # The rest of the tests will now pass without modification + def test_start_offset_in_middle_of_line(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2\nline3\n" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file, start_offset=10)) + assert lines == [ + (b"line2", 6), + (b"line1", 0), + ] + + def test_start_offset_at_line_boundary(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2\nline3\n" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file, start_offset=12)) + assert lines == [ + (b"line2", 6), + (b"line1", 0), + ] + + def test_start_offset_zero(self, tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2\nline3\n" + file.write_bytes(content) + lines = list(FileLogStorage._read_lines_reversed(file, start_offset=0)) + assert lines == [] + + def test_start_offset_larger_than_file(self, 
tmp_path: Path): + file = tmp_path / "test.txt" + content = b"line1\nline2\n" + file.write_bytes(content) + lines_with_offset = list(FileLogStorage._read_lines_reversed(file, start_offset=1000)) + lines_without_offset = list(FileLogStorage._read_lines_reversed(file)) + assert lines_with_offset == lines_without_offset + assert lines_with_offset == [(b"", 12), (b"line2", 6), (b"line1", 0)] + + def test_long_line_larger_than_chunk(self, tmp_path: Path): + file = tmp_path / "long_line.txt" + content = b"a" * 25 + file.write_bytes(content) + # Pass the small chunk_size directly + lines = list(FileLogStorage._read_lines_reversed(file, chunk_size=10)) + assert lines == [(content, 0)] diff --git a/src/tests/_internal/server/services/test_metrics.py b/src/tests/_internal/server/services/test_metrics.py new file mode 100644 index 0000000000..083855652a --- /dev/null +++ b/src/tests/_internal/server/services/test_metrics.py @@ -0,0 +1,167 @@ +from datetime import datetime, timedelta, timezone + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.metrics import Metric +from dstack._internal.server.services.metrics import get_job_metrics +from dstack._internal.server.testing.common import ( + create_job, + create_job_metrics_point, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestGetMetrics: + latest_ts = datetime(2023, 1, 2, 3, 4, 25, tzinfo=timezone.utc) + ts: tuple[datetime, ...] 
= ( + latest_ts, # 0 + latest_ts - timedelta(seconds=10), # 1 + latest_ts - timedelta(seconds=20), # 2 + latest_ts - timedelta(seconds=30), # 3 + latest_ts - timedelta(seconds=40), # 4 + latest_ts - timedelta(seconds=50), # 5 + ) + # dt, cpu_usage_sec, memory_usage_bytes, memory_ws_bytes, gpu0_memory_usage_bytes, gpu0_util, + # gpu1_memory_usage_bytess, gpu1_util + points: tuple[tuple[datetime, int, int, int, int, int, int, int], ...] = ( + (ts[0], 110, 512, 128, 768, 15, 128, 20), + (ts[1], 104, 1024, 512, 1024, 10, 256, 10), + (ts[2], 100, 1024, 512, 1024, 20, 128, 5), + (ts[3], 90, 512, 512, 2048, 40, 512, 20), + (ts[4], 90, 1024, 1024, 1024, 0, 128, 0), + (ts[5], 80, 512, 512, 1024, 10, 256, 0), + ) + + @pytest.mark.parametrize( + ["params", "ts", "cpu", "mem", "mem_ws", "gpu0_mem", "gpu0_util", "gpu1_mem", "gpu1_util"], + [ + pytest.param( + {"limit": 1}, + [ts[0]], + [60], + [512], + [128], + [768], + [15], + [128], + [20], + id="limit-1-latest", + ), + pytest.param( + {"limit": 3}, + [ts[0], ts[1], ts[2]], + [60, 40, 100], + [512, 1024, 1024], + [128, 512, 512], + [768, 1024, 1024], + [15, 10, 20], + [128, 256, 128], + [20, 10, 5], + id="limit-3-latest", + ), + pytest.param( + {}, + [ts[0], ts[1], ts[2], ts[3], ts[4]], + [60, 40, 100, 0, 100], + [512, 1024, 1024, 512, 1024], + [128, 512, 512, 512, 1024], + [768, 1024, 1024, 2048, 1024], + [15, 10, 20, 40, 0], + [128, 256, 128, 512, 128], + [20, 10, 5, 20, 0], + id="all", + ), + pytest.param( + {"after": ts[3]}, + [ts[0], ts[1], ts[2]], + [60, 40, 100], + [512, 1024, 1024], + [128, 512, 512], + [768, 1024, 1024], + [15, 10, 20], + [128, 256, 128], + [20, 10, 5], + id="all-after", + ), + pytest.param( + {"before": ts[2]}, + [ts[3], ts[4]], + [0, 100], + [512, 1024], + [512, 1024], + [2048, 1024], + [40, 0], + [512, 128], + [20, 0], + id="all-before", + ), + ], + ) + async def test_get_metrics( + self, + session: AsyncSession, + params: dict, + ts: list[datetime], + cpu: list[int], + mem: list[int], + mem_ws: 
list[int], + gpu0_mem: list[int], + gpu0_util: list[int], + gpu1_mem: list[int], + gpu1_util: list[int], + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + jpd = get_job_provisioning_data( + cpu_count=64, memory_gib=128, gpu_count=2, gpu_memory_gib=32 + ) + job = await create_job( + session=session, + run=run, + job_provisioning_data=jpd, + ) + for dt, _cpu, _mem, _mem_ws, _gpu0_mem, _gpu0_util, _gpu1_mem, _gpu1_util in self.points: + await create_job_metrics_point( + session=session, + job_model=job, + timestamp=dt, + cpu_usage_micro=_cpu * 1_000_000, + memory_usage_bytes=_mem, + memory_working_set_bytes=_mem_ws, + gpus_memory_usage_bytes=[_gpu0_mem, _gpu1_mem], + gpus_util_percent=[_gpu0_util, _gpu1_util], + ) + + metrics = await get_job_metrics(session, job, **params) + + assert metrics.metrics == [ + Metric(name="cpu_usage_percent", timestamps=ts, values=cpu), + Metric(name="memory_usage_bytes", timestamps=ts, values=mem), + Metric(name="memory_working_set_bytes", timestamps=ts, values=mem_ws), + Metric(name="cpus_detected_num", timestamps=ts, values=[64] * len(ts)), + Metric(name="memory_total_bytes", timestamps=ts, values=[137438953472] * len(ts)), + Metric(name="gpus_detected_num", timestamps=ts, values=[2] * len(ts)), + Metric(name="gpu_memory_total_bytes", timestamps=ts, values=[34359738368] * len(ts)), + Metric(name="gpu_memory_usage_bytes_gpu0", timestamps=ts, values=gpu0_mem), + Metric(name="gpu_memory_usage_bytes_gpu1", timestamps=ts, values=gpu1_mem), + Metric(name="gpu_util_percent_gpu0", timestamps=ts, values=gpu0_util), + Metric(name="gpu_util_percent_gpu1", timestamps=ts, values=gpu1_util), + ] diff --git a/src/tests/_internal/server/services/test_offers.py b/src/tests/_internal/server/services/test_offers.py new file 
mode 100644 index 0000000000..25ce8021ae --- /dev/null +++ b/src/tests/_internal/server/services/test_offers.py @@ -0,0 +1,191 @@ +from unittest.mock import Mock, patch + +import pytest + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.resources import ResourcesSpec +from dstack._internal.core.models.runs import Requirements +from dstack._internal.server.services.offers import get_offers_by_requirements +from dstack._internal.server.testing.common import ( + get_instance_offer_with_availability, + get_kubernetes_volume_configuration, + get_volume, + get_volume_configuration, +) + + +class TestGetOffersByRequirements: + @pytest.mark.asyncio + async def test_returns_all_offers(self): + profile = Profile(name="test") + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer = get_instance_offer_with_availability(backend=BackendType.AWS) + aws_backend_mock.compute.return_value.get_offers.return_value = [aws_offer] + runpod_backend_mock = Mock() + runpod_backend_mock.TYPE = BackendType.RUNPOD + runpod_offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + runpod_backend_mock.compute.return_value.get_offers.return_value = [runpod_offer] + m.return_value = [aws_backend_mock, runpod_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + ) + m.assert_awaited_once() + assert res == [(aws_backend_mock, aws_offer), (runpod_backend_mock, runpod_offer)] + + @pytest.mark.asyncio + async def test_returns_multinode_offers(self): + profile = Profile(name="test") + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock 
= Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer = get_instance_offer_with_availability(backend=BackendType.AWS) + aws_backend_mock.compute.return_value.get_offers.return_value = [aws_offer] + vastai_backend_mock = Mock() + vastai_backend_mock.TYPE = BackendType.VASTAI + vastai_offer = get_instance_offer_with_availability(backend=BackendType.VASTAI) + vastai_backend_mock.compute.return_value.get_offers.return_value = [vastai_offer] + m.return_value = [aws_backend_mock, vastai_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + multinode=True, + ) + m.assert_awaited_once() + assert res == [(aws_backend_mock, aws_offer)] + + @pytest.mark.asyncio + async def test_returns_volume_offers(self): + profile = Profile(name="test") + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer = get_instance_offer_with_availability(backend=BackendType.AWS) + aws_backend_mock.compute.return_value.get_offers.return_value = [aws_offer] + runpod_backend_mock = Mock() + runpod_backend_mock.TYPE = BackendType.RUNPOD + runpod_offer1 = get_instance_offer_with_availability( + backend=BackendType.RUNPOD, region="eu" + ) + runpod_offer2 = get_instance_offer_with_availability( + backend=BackendType.RUNPOD, region="us" + ) + runpod_backend_mock.compute.return_value.get_offers.return_value = [ + runpod_offer1, + runpod_offer2, + ] + m.return_value = [aws_backend_mock, runpod_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + volumes=[ + [ + get_volume( + configuration=get_volume_configuration( + backend=BackendType.RUNPOD, region="us" + ) + ) + ] + ], + ) + m.assert_awaited_once() + assert res == [(runpod_backend_mock, runpod_offer2)] + + @pytest.mark.asyncio + async def 
test_returns_volume_offers_without_region(self): + profile = Profile(name="test") + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer = get_instance_offer_with_availability(backend=BackendType.AWS) + aws_backend_mock.compute.return_value.get_offers.return_value = [aws_offer] + kubernetes_backend_mock = Mock() + kubernetes_backend_mock.TYPE = BackendType.KUBERNETES + kubernetes_offer = get_instance_offer_with_availability( + backend=BackendType.KUBERNETES, + region="", + availability_zones=None, + ) + kubernetes_backend_mock.compute.return_value.get_offers.return_value = [ + kubernetes_offer + ] + m.return_value = [aws_backend_mock, kubernetes_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + volumes=[[get_volume(configuration=get_kubernetes_volume_configuration())]], + ) + m.assert_awaited_once() + assert res == [(kubernetes_backend_mock, kubernetes_offer)] + + @pytest.mark.asyncio + async def test_returns_az_offers(self): + profile = Profile(name="test", availability_zones=["az1", "az3"]) + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer1 = get_instance_offer_with_availability( + backend=BackendType.AWS, availability_zones=["az1"] + ) + aws_offer2 = get_instance_offer_with_availability( + backend=BackendType.AWS, availability_zones=["az2"] + ) + aws_offer3 = get_instance_offer_with_availability( + backend=BackendType.AWS, availability_zones=["az2", "az3"] + ) + expected_aws_offer3 = aws_offer3.copy() + expected_aws_offer3.availability_zones = ["az3"] + aws_offer4 = get_instance_offer_with_availability( + backend=BackendType.AWS, availability_zones=None 
+ ) + aws_backend_mock.compute.return_value.get_offers.return_value = [ + aws_offer1, + aws_offer2, + aws_offer3, + aws_offer4, + ] + m.return_value = [aws_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + ) + m.assert_awaited_once() + assert res == [(aws_backend_mock, aws_offer1), (aws_backend_mock, expected_aws_offer3)] + + @pytest.mark.asyncio + async def test_returns_no_offers_for_multinode_instance_mounts_and_non_multinode_backend(self): + # Regression test for https://github.com/dstackai/dstack/issues/2211 + profile = Profile(name="test", backends=[BackendType.RUNPOD]) + requirements = Requirements(resources=ResourcesSpec()) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + aws_backend_mock = Mock() + aws_backend_mock.TYPE = BackendType.AWS + aws_offer = get_instance_offer_with_availability(backend=BackendType.AWS) + aws_backend_mock.compute.return_value.get_offers.return_value = [aws_offer] + runpod_backend_mock = Mock() + runpod_backend_mock.TYPE = BackendType.RUNPOD + runpod_offer = get_instance_offer_with_availability(backend=BackendType.RUNPOD) + runpod_backend_mock.compute.return_value.get_offers.return_value = [runpod_offer] + m.return_value = [aws_backend_mock, runpod_backend_mock] + res = await get_offers_by_requirements( + project=Mock(), + profile=profile, + requirements=requirements, + multinode=True, + instance_mounts=True, + ) + m.assert_awaited_once() + assert res == [] diff --git a/src/tests/_internal/server/services/test_plugins.py b/src/tests/_internal/server/services/test_plugins.py new file mode 100644 index 0000000000..ca5f0bfac6 --- /dev/null +++ b/src/tests/_internal/server/services/test_plugins.py @@ -0,0 +1,297 @@ +import logging +from importlib import import_module +from importlib.metadata import EntryPoint +from unittest.mock import MagicMock, patch + +import pytest + +from 
dstack._internal.server.services.plugins import _PLUGINS, load_plugins +from dstack.plugins import Plugin +from dstack.plugins.builtin.rest_plugin import RESTPlugin + + +class DummyPlugin1(Plugin): + pass + + +class DummyPlugin2(Plugin): + pass + + +class NotAPlugin: + pass + + +@pytest.fixture(autouse=True) +def clear_plugins(): + _PLUGINS.clear() + yield + _PLUGINS.clear() + + +class TestLoadPlugins: + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + @pytest.mark.parametrize( + ["plugin_name", "plugin_module_path", "plugin_class"], + [ + ("plugin1", "dummy.plugins", DummyPlugin1), + ("rest_plugin", "dstack.plugins.builtin.rest_plugin", RESTPlugin), + ], + ) + def test_load_single_plugin( + self, + mock_import_module, + mock_entry_points, + caplog, + plugin_name, + plugin_module_path, + plugin_class, + ): + mock_entry_points.return_value = [ + EntryPoint( + name=plugin_name, + value=f"{plugin_module_path}:{plugin_class.__name__}", + group="dstack.plugins", + ) + ] + mock_module = MagicMock() + setattr(mock_module, plugin_class.__name__, plugin_class) + # if it's a built-in plugin, do the real import + mock_import_module.side_effect = ( + lambda module_path: import_module(module_path) + if module_path.startswith("dstack.plugins.builtin") + else mock_module + ) + + with caplog.at_level(logging.INFO): + load_plugins([plugin_name]) + + assert len(_PLUGINS) == 1 + assert isinstance(_PLUGINS[0], plugin_class) + mock_entry_points.assert_called_once_with(group="dstack.plugins") + mock_import_module.assert_called_once_with(plugin_module_path) + assert f"Loaded plugin {plugin_name}" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + @pytest.mark.parametrize( + ["plugin_names", "plugin_module_paths", "plugin_classes"], + [ + ( + ["plugin1", "plugin2"], + ["dummy.plugins", "dummy.plugins"], + 
[DummyPlugin1, DummyPlugin2], + ), + ( + ["plugin1", "plugin2", "rest_plugin"], + ["dummy.plugins", "dummy.plugins", "dstack.plugins.builtin.rest_plugin"], + [DummyPlugin1, DummyPlugin2, RESTPlugin], + ), + ], + ids=["multiple_plugins_without_builtin_plugin", "multiple_plugins_with_builtin_plugin"], + ) + def test_load_multiple_plugins( + self, + mock_import_module, + mock_entry_points, + caplog, + plugin_names, + plugin_module_paths, + plugin_classes, + ): + mock_entry_points.return_value = [ + EntryPoint( + name=plugin_name, + value=f"{plugin_module_path}:{plugin_class.__name__}", + group="dstack.plugins", + ) + for plugin_name, plugin_module_path, plugin_class in zip( + plugin_names, plugin_module_paths, plugin_classes + ) + ] + mock_module = MagicMock() + + for plugin_class, plugin_module_path in zip(plugin_classes, plugin_module_paths): + if not plugin_module_path.startswith("dstack.plugins.builtin"): + setattr(mock_module, plugin_class.__name__, plugin_class) + + mock_import_module.side_effect = ( + lambda module_path: import_module(module_path) + if module_path.startswith("dstack.plugins.builtin") + else mock_module + ) + + with caplog.at_level(logging.INFO): + load_plugins(plugin_names) + + assert len(_PLUGINS) == len(plugin_names) + for i, plugin_class in enumerate(plugin_classes): + assert isinstance(_PLUGINS[i], plugin_class) + + for plugin_name in plugin_names: + assert f"Loaded plugin {plugin_name}" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_plugin_not_enabled(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:DummyPlugin1", + group="dstack.plugins", + ) + ] + + with caplog.at_level(logging.INFO): + load_plugins([]) # Enable no plugins + + assert len(_PLUGINS) == 0 + mock_import_module.assert_not_called() + assert "Found not enabled plugin 
plugin1" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_enabled_plugin_not_found(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:DummyPlugin1", + group="dstack.plugins", + ) + ] + + with caplog.at_level(logging.INFO): + load_plugins(["plugin2"]) # Enable a plugin that doesn't have an entry point + + assert len(_PLUGINS) == 0 + mock_import_module.assert_not_called() + assert "Found not enabled plugin plugin1" in caplog.text + assert "Enabled plugins not found: ['plugin2']" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch( + "dstack._internal.server.services.plugins.import_module", + side_effect=ImportError("Module not found"), + ) + def test_import_error(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:DummyPlugin1", + group="dstack.plugins", + ) + ] + + with caplog.at_level(logging.INFO): + load_plugins(["plugin1"]) + + assert len(_PLUGINS) == 0 + assert ( + "Failed to load plugin plugin1 when importing dummy.plugins:DummyPlugin1" + in caplog.text + ) + assert "Enabled plugins not found: ['plugin1']" in caplog.text # Because loading failed + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_class_not_found(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:NonExistentClass", + group="dstack.plugins", + ) + ] + mock_module = MagicMock() + # Simulate the class not being present + del mock_module.NonExistentClass + mock_import_module.return_value = mock_module + + with caplog.at_level(logging.INFO): + load_plugins(["plugin1"]) + + assert 
len(_PLUGINS) == 0 + assert ( + "Failed to load plugin plugin1: plugin class NonExistentClass not found" in caplog.text + ) + assert "Enabled plugins not found: ['plugin1']" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_not_a_plugin_subclass(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:NotAPlugin", + group="dstack.plugins", + ) + ] + mock_module = MagicMock() + mock_module.NotAPlugin = NotAPlugin + mock_import_module.return_value = mock_module + + with caplog.at_level(logging.INFO): + load_plugins(["plugin1"]) + + assert len(_PLUGINS) == 0 + assert ( + "Failed to load plugin plugin1: plugin class NotAPlugin is not a subclass of Plugin" + in caplog.text + ) + assert "Enabled plugins not found: ['plugin1']" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_clears_existing_plugins(self, mock_import_module, mock_entry_points): + # Pre-populate _PLUGINS + _PLUGINS.append(DummyPlugin1()) + + mock_entry_points.return_value = [ + EntryPoint( + name="plugin2", + value="dummy.plugins:DummyPlugin2", + group="dstack.plugins", + ) + ] + mock_module = MagicMock() + mock_module.DummyPlugin2 = DummyPlugin2 + mock_import_module.return_value = mock_module + + load_plugins(["plugin2"]) + + assert len(_PLUGINS) == 1 # Should only contain plugin2 + assert isinstance(_PLUGINS[0], DummyPlugin2) + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_load_no_plugins_found(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [] # No entry points found + + with caplog.at_level(logging.INFO): + load_plugins(["plugin1"]) # Try to enable one + + 
assert len(_PLUGINS) == 0 + mock_import_module.assert_not_called() + assert "Enabled plugins not found: ['plugin1']" in caplog.text + + @patch("dstack._internal.server.services.plugins.entry_points") + @patch("dstack._internal.server.services.plugins.import_module") + def test_load_no_plugins_enabled(self, mock_import_module, mock_entry_points, caplog): + mock_entry_points.return_value = [ + EntryPoint( + name="plugin1", + value="dummy.plugins:DummyPlugin1", + group="dstack.plugins", + ) + ] + + with caplog.at_level(logging.INFO): + load_plugins([]) # Enable none + + assert len(_PLUGINS) == 0 + mock_import_module.assert_not_called() + assert "Found not enabled plugin plugin1" in caplog.text + assert ( + "Enabled plugins not found" not in caplog.text + ) # Should not warn if none were enabled diff --git a/src/tests/_internal/server/services/test_pools.py b/src/tests/_internal/server/services/test_pools.py deleted file mode 100644 index 70e89d8d36..0000000000 --- a/src/tests/_internal/server/services/test_pools.py +++ /dev/null @@ -1,80 +0,0 @@ -import uuid - -import pytest -from sqlalchemy.ext.asyncio import AsyncSession - -import dstack._internal.server.services.pools as services_pools -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import InstanceType, Resources -from dstack._internal.core.models.pools import Instance -from dstack._internal.core.models.runs import InstanceStatus -from dstack._internal.server.models import InstanceModel -from dstack._internal.server.testing.common import create_project, create_user -from dstack._internal.utils.common import get_current_datetime - - -class TestGenerateInstanceName: - @pytest.mark.asyncio - async def test_generates_instance_name(self, test_db, session: AsyncSession): - user = await create_user(session=session) - project = await create_project(session=session, owner=user) - pool = await services_pools.create_pool(session=session, project=project, 
name="test_pool") - im = InstanceModel( - name="test_instnce", - project=project, - pool=pool, - status=InstanceStatus.PENDING, - unreachable=False, - job_provisioning_data="", - offer="", - backend=BackendType.REMOTE, - region="", - price=0, - ) - session.add(im) - await session.commit() - - name = await services_pools.generate_instance_name( - session=session, project=project, pool_name="test_pool" - ) - car, _, cdr = name.partition("-") - assert len(car) > 0 - assert len(cdr) > 0 - - -class TestInstanceModelToInstance: - @pytest.mark.asyncio - async def test_converts_instance(self, test_db, session: AsyncSession): - project = await create_project( - session=session, - name="test_project", - ) - instance_id = uuid.uuid4() - created = get_current_datetime() - expected_instance = Instance( - id=instance_id, - project_name=project.name, - backend=BackendType.LOCAL, - instance_type=InstanceType( - name="instance", resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]) - ), - name="test_instance", - hostname="hostname_test", - status=InstanceStatus.PENDING, - created=created, - region="eu-west-1", - price=1.0, - ) - im = InstanceModel( - id=instance_id, - created_at=created, - name="test_instance", - status=InstanceStatus.PENDING, - unreachable=False, - project=project, - pool=None, - job_provisioning_data='{"ssh_proxy":null, "backend":"local","hostname":"hostname_test","region":"eu-west","price":1.0,"username":"user1","ssh_port":12345,"dockerized":false,"instance_id":"test_instance","instance_type": {"name": "instance", "resources": {"cpus": 1, "memory_mib": 512, "gpus": [], "spot": false, "disk": {"size_mib": 102400}, "description":""}}}', - offer='{"price":"LOCAL", "price":1.0, "backend":"local", "region":"eu-west-1", "availability":"available","instance": {"name": "instance", "resources": {"cpus": 1, "memory_mib": 512, "gpus": [], "spot": false, "disk": {"size_mib": 102400}, "description":""}}}', - ) - instance = 
services_pools.instance_model_to_instance(im) - assert instance == expected_instance diff --git a/src/tests/_internal/server/services/test_repos.py b/src/tests/_internal/server/services/test_repos.py new file mode 100644 index 0000000000..50e64f84ed --- /dev/null +++ b/src/tests/_internal/server/services/test_repos.py @@ -0,0 +1,401 @@ +from typing import Optional +from uuid import UUID + +import pytest +import pytest_asyncio +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.repos import RemoteRepoCreds, RemoteRepoInfo, RepoHeadWithCreds +from dstack._internal.core.models.repos.base import RepoType +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.models import ProjectModel, RepoCredsModel, UserModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.services.repos import get_repo, init_repo +from dstack._internal.server.testing.common import ( + create_project, + create_repo, + create_repo_creds, + create_user, +) + +pytestmark = [ + pytest.mark.asyncio, + pytest.mark.usefixtures("test_db"), + pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True), +] + + +_REPO_ID = "test-36senvbc" + + +async def _create_user(session: AsyncSession, project: ProjectModel, name: str) -> UserModel: + user = await create_user(session=session, name=name, global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + return user + + +async def _get_repo_creds( + session: AsyncSession, repo_id: UUID, user_id: UUID +) -> Optional[RemoteRepoCreds]: + res = await session.execute(select(RepoCredsModel).filter_by(repo_id=repo_id, user_id=user_id)) + repo_creds = res.scalar() + if repo_creds is None: + return None + creds_raw = repo_creds.creds.plaintext + assert creds_raw is not None + return RemoteRepoCreds.parse_raw(creds_raw) 
+ + +@pytest_asyncio.fixture +async def project(session: AsyncSession) -> ProjectModel: + owner = await create_user(session=session, name="project-admin", global_role=GlobalRole.USER) + project = await create_project(session=session, owner=owner, name="our-project") + await add_project_member( + session=session, project=project, user=owner, project_role=ProjectRole.ADMIN + ) + return project + + +@pytest_asyncio.fixture +async def user(session: AsyncSession, project: ProjectModel) -> UserModel: + return await _create_user(session, project, name="default-user") + + +class TestGetRemoteRepo: + async def test_returns_none_if_repo_not_found( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + another_project = await create_project(session=session, owner=user, name="another-project") + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + # a repo with the same project_id in another project, should be ignored + await create_repo( + session=session, + project_id=another_project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + ) + + repo = await get_repo( + session=session, project=project, user=user, repo_id=_REPO_ID, include_creds=False + ) + + assert repo is None + + async def test_returns_repo_with_none_creds_if_include_creds_is_false( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + legacy_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="legacy-oauth-token", + ) + repo_model = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=legacy_repo_creds.dict(), + ) + user_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="user-oauth-token", + 
) + await create_repo_creds( + session=session, + repo_id=repo_model.id, + user_id=user.id, + creds=user_repo_creds.dict(), + ) + + repo = await get_repo( + session=session, project=project, user=user, repo_id=_REPO_ID, include_creds=False + ) + + assert repo == RepoHeadWithCreds( + repo_id=_REPO_ID, + repo_info=repo_info, + # both legacy and user creds are ignored + repo_creds=None, + ) + + async def test_returns_repo_with_none_creds_if_no_user_or_legacy_creds( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + repo_model = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=None, + ) + # another user's creds should be ignored + another_user = await _create_user(session, project, name="another-user") + another_user_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="another-oauth-token", + ) + await create_repo_creds( + session=session, + repo_id=repo_model.id, + user_id=another_user.id, + creds=another_user_repo_creds.dict(), + ) + + repo = await get_repo( + session=session, + project=project, + user=user, + repo_id=_REPO_ID, + include_creds=True, + ) + + assert repo == RepoHeadWithCreds( + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=None, + ) + + @pytest.mark.parametrize( + "with_legacy_creds", + [ + pytest.param(False, id="without-legacy-creds"), + pytest.param(True, id="with-legacy-creds"), + ], + ) + async def test_returns_repo_with_user_creds_if_present( + self, + session: AsyncSession, + project: ProjectModel, + user: UserModel, + with_legacy_creds: bool, + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + if with_legacy_creds: + legacy_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + 
private_key=None, + oauth_token="legacy-oauth-token", + ) + else: + legacy_repo_creds = None + repo_model = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=legacy_repo_creds.dict() if legacy_repo_creds else None, + ) + user_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="user-oauth-token", + ) + await create_repo_creds( + session=session, + repo_id=repo_model.id, + user_id=user.id, + creds=user_repo_creds.dict(), + ) + + repo = await get_repo( + session=session, project=project, user=user, repo_id=_REPO_ID, include_creds=True + ) + + assert repo == RepoHeadWithCreds( + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=user_repo_creds, + ) + + async def test_returns_repo_with_legacy_creds_if_user_creds_not_found( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + legacy_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="legacy-oauth-token", + ) + await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=legacy_repo_creds.dict(), + ) + + repo = await get_repo( + session=session, project=project, user=user, repo_id=_REPO_ID, include_creds=True + ) + + assert repo == RepoHeadWithCreds( + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=legacy_repo_creds, + ) + + +class TestInitRemoteRepo: + async def test_creates_new_repo_with_user_creds( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + 
oauth_token="oauth-token", + ) + + repo = await init_repo( + session=session, + project=project, + user=user, + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=repo_creds, + ) + + assert repo.creds is None + assert await _get_repo_creds(session, repo.id, user.id) == repo_creds + + async def test_updates_repo_adding_user_creds( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + old_repo_info = RemoteRepoInfo(repo_type="remote", repo_name="old-name") + new_repo_info = RemoteRepoInfo(repo_type="remote", repo_name="new-name") + our_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="our-oauth-token", + ) + repo = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=old_repo_info.dict(), + creds=None, + ) + + repo = await init_repo( + session=session, + project=project, + user=user, + repo_id=_REPO_ID, + repo_info=new_repo_info, + repo_creds=our_repo_creds, + ) + + assert repo.creds is None + assert RemoteRepoInfo.parse_raw(repo.info) == new_repo_info + assert await _get_repo_creds(session, repo.id, user.id) == our_repo_creds + + async def test_updates_repo_updating_user_creds( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + repo = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=None, + ) + old_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="oauth-token", + ) + await create_repo_creds( + session=session, + repo_id=repo.id, + user_id=user.id, + creds=old_repo_creds.dict(), + ) + new_repo_creds = RemoteRepoCreds( + clone_url="ssh://git@git.example.com/repo.git", + private_key="private-key", + 
oauth_token=None, + ) + + repo = await init_repo( + session=session, + project=project, + user=user, + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=new_repo_creds, + ) + + assert await _get_repo_creds(session, repo.id, user.id) == new_repo_creds + + async def test_updates_repo_removing_user_creds( + self, session: AsyncSession, project: ProjectModel, user: UserModel + ): + repo_info = RemoteRepoInfo(repo_type="remote", repo_name="test") + legacy_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="legacy-oauth-token", + ) + repo = await create_repo( + session=session, + project_id=project.id, + repo_name=_REPO_ID, + repo_type=RepoType.REMOTE, + info=repo_info.dict(), + creds=legacy_repo_creds.dict(), + ) + our_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="our-oauth-token", + ) + await create_repo_creds( + session=session, + repo_id=repo.id, + user_id=user.id, + creds=our_repo_creds.dict(), + ) + another_user = await _create_user(session, project, name="another-user") + another_user_repo_creds = RemoteRepoCreds( + clone_url="https://git.example.com/repo.git", + private_key=None, + oauth_token="another-oauth-token", + ) + await create_repo_creds( + session=session, + repo_id=repo.id, + user_id=another_user.id, + creds=another_user_repo_creds.dict(), + ) + + repo = await init_repo( + session=session, + project=project, + user=user, + repo_id=_REPO_ID, + repo_info=repo_info, + repo_creds=None, + ) + + # legacy creds stored in the repo are still here + assert repo.creds is not None + assert RemoteRepoCreds.parse_raw(repo.creds) == legacy_repo_creds + # our personal creds are deleted + assert await _get_repo_creds(session, repo.id, user.id) is None + # another user's creds are still here + assert await _get_repo_creds(session, repo.id, another_user.id) == 
another_user_repo_creds diff --git a/src/tests/_internal/server/services/test_runs.py b/src/tests/_internal/server/services/test_runs.py deleted file mode 100644 index 7814a4d684..0000000000 --- a/src/tests/_internal/server/services/test_runs.py +++ /dev/null @@ -1,211 +0,0 @@ -from typing import List - -import pytest -from pydantic import parse_obj_as -from sqlalchemy.ext.asyncio import AsyncSession - -from dstack._internal.core.errors import ServerError -from dstack._internal.core.models.configurations import ScalingSpec, ServiceConfiguration -from dstack._internal.core.models.profiles import Profile -from dstack._internal.core.models.resources import Range -from dstack._internal.core.models.runs import JobStatus, JobTerminationReason, RunStatus -from dstack._internal.server.models import RunModel -from dstack._internal.server.services.runs import scale_run_replicas -from dstack._internal.server.testing.common import ( - create_job, - create_pool, - create_project, - create_repo, - create_run, - create_user, - get_run_spec, -) - - -async def make_run( - session: AsyncSession, - replicas_statuses: List[JobStatus], - status: RunStatus = RunStatus.RUNNING, - replicas: str = 1, -) -> RunModel: - project = await create_project(session=session) - user = await create_user(session=session) - repo = await create_repo( - session=session, - project_id=project.id, - ) - project.default_pool = await create_pool( - session=session, project=project, pool_name="default-pool" - ) - run_name = "test-run" - profile = Profile(name="test-profile") - run_spec = get_run_spec( - repo_id=repo.name, - run_name=run_name, - profile=profile, - configuration=ServiceConfiguration( - commands=["echo hello"], - port=8000, - replicas=parse_obj_as(Range[int], replicas), - scaling=ScalingSpec( - metric="rps", - target=1, - ), - ), - ) - run = await create_run( - session=session, - project=project, - repo=repo, - user=user, - run_name=run_name, - run_spec=run_spec, - status=status, - ) - for 
replica_num, job_status in enumerate(replicas_statuses): - await create_job( - session=session, - run=run, - status=job_status, - replica_num=replica_num, - ) - await session.refresh(run) - return run - - -async def scale_wrapper(session: AsyncSession, run: RunModel, diff: int): - await scale_run_replicas(session, run, diff) - await session.commit() - await session.refresh(run) - - -class TestScaleRunReplicas: - @pytest.mark.asyncio - async def test_no_scale(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - ], - replicas="0..1", - ) - await scale_wrapper(session, run, 0) - assert len(run.jobs) == 1 - - @pytest.mark.asyncio - async def test_downscale_to_zero(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - ], - replicas="0..1", - ) - await scale_wrapper(session, run, -1) - assert len(run.jobs) == 1 - assert run.jobs[0].status == JobStatus.TERMINATING - assert run.jobs[0].termination_reason == JobTerminationReason.SCALED_DOWN - - @pytest.mark.asyncio - async def test_upscale_new(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - ], - replicas="0..2", - ) - await scale_wrapper(session, run, 1) - assert len(run.jobs) == 2 - assert run.jobs[1].status == JobStatus.SUBMITTED - assert run.jobs[1].replica_num == 1 - - @pytest.mark.asyncio - async def test_upscale_terminated(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - JobStatus.TERMINATED, - ], - replicas="0..2", - ) - await scale_wrapper(session, run, 1) - assert len(run.jobs) == 3 - assert run.jobs[0].status == JobStatus.RUNNING - assert run.jobs[1].status == JobStatus.TERMINATED - assert run.jobs[2].status == JobStatus.SUBMITTED - assert run.jobs[2].replica_num == 1 - - @pytest.mark.asyncio - async def test_downscale_less_important(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - 
JobStatus.PROVISIONING, - JobStatus.RUNNING, - ], - replicas="0..2", - ) - await scale_wrapper(session, run, -1) - assert len(run.jobs) == 2 - assert run.jobs[0].status == JobStatus.TERMINATING - assert run.jobs[0].termination_reason == JobTerminationReason.SCALED_DOWN - assert run.jobs[1].status == JobStatus.RUNNING - - @pytest.mark.asyncio - async def test_downscale_greater_replica_num(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - JobStatus.RUNNING, - ], - replicas="0..2", - ) - await scale_wrapper(session, run, -1) - assert len(run.jobs) == 2 - assert run.jobs[0].status == JobStatus.RUNNING - assert run.jobs[1].status == JobStatus.TERMINATING - assert run.jobs[1].termination_reason == JobTerminationReason.SCALED_DOWN - - @pytest.mark.asyncio - async def test_no_downscale_below_limit(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - ], - replicas="1..2", - ) - with pytest.raises(ServerError): - await scale_wrapper(session, run, -1) - - @pytest.mark.asyncio - async def test_no_upscale_above_limit(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.RUNNING, - ], - replicas="0..1", - ) - with pytest.raises(ServerError): - await scale_wrapper(session, run, 1) - - @pytest.mark.asyncio - async def test_upscale_mixed(self, test_db, session: AsyncSession): - run = await make_run( - session, - [ - JobStatus.TERMINATED, - ], - replicas="0..2", - ) - await scale_wrapper(session, run, 2) - assert len(run.jobs) == 3 - assert run.jobs[0].status == JobStatus.TERMINATED - assert run.jobs[1].status == JobStatus.SUBMITTED - assert run.jobs[1].replica_num == 0 - assert run.jobs[2].status == JobStatus.SUBMITTED - assert run.jobs[2].replica_num == 1 diff --git a/src/tests/_internal/server/services/test_ssh.py b/src/tests/_internal/server/services/test_ssh.py new file mode 100644 index 0000000000..ec8fda453d --- /dev/null +++ 
b/src/tests/_internal/server/services/test_ssh.py @@ -0,0 +1,285 @@ +from typing import Optional + +import pytest +import pytest_asyncio +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import NetworkMode +from dstack._internal.core.models.instances import SSHConnectionParams, SSHKey +from dstack._internal.core.models.runs import ( + JobRuntimeData, +) +from dstack._internal.server.models import ProjectModel, RunModel +from dstack._internal.server.services.ssh import get_container_ssh_credentials +from dstack._internal.server.testing.common import ( + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_job_provisioning_data, + get_job_runtime_data, + get_remote_connection_info, +) +from dstack._internal.utils.path import FileContent + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestGetContainerSSHCredentials: + instance_project_key = "instance-project-key" + run_project_key = "run-project-key" + + @pytest_asyncio.fixture + async def instance_project(self, session: AsyncSession) -> ProjectModel: + owner = await create_user(session=session, name="instance-project-owner") + return await create_project( + session=session, + name="instance-project", + owner=owner, + ssh_private_key=self.instance_project_key, + ) + + @pytest_asyncio.fixture + async def run(self, session: AsyncSession) -> RunModel: + run_project_owner = await create_user(session=session, name="run-project-owner") + run_project = await create_project( + session=session, name="run-project", ssh_private_key=self.run_project_key + ) + repo = await create_repo(session=session, project_id=run_project.id) + run = await create_run( + session=session, project=run_project, 
user=run_project_owner, repo=repo + ) + # Triggers session magic, attaches ProjectModel to JobModel somehow + assert run.project is not None + return run + + @pytest.mark.parametrize( + ["jrd", "expected_port"], + [ + pytest.param(None, DSTACK_RUNNER_SSH_PORT, id="no-jrd"), + pytest.param( + get_job_runtime_data(network_mode=NetworkMode.HOST, ports={}), + DSTACK_RUNNER_SSH_PORT, + id="host", + ), + pytest.param( + get_job_runtime_data( + network_mode=NetworkMode.HOST, ports={DSTACK_RUNNER_SSH_PORT: 32772} + ), + 32772, + id="bridge", + ), + ], + ) + async def test_vm_based_backend( + self, + session: AsyncSession, + instance_project: ProjectModel, + run: RunModel, + jrd: Optional[JobRuntimeData], + expected_port: int, + ): + instance = await create_instance( + session=session, project=instance_project, backend=BackendType.AWS + ) + jpd = get_job_provisioning_data( + backend=BackendType.AWS, + dockerized=True, + hostname="80.80.80.80", + username="ubuntu", + ssh_port=22, + ssh_proxy=None, + ) + job = await create_job( + session=session, + run=run, + instance=instance, + job_provisioning_data=jpd, + job_runtime_data=jrd, + ) + + hosts = get_container_ssh_credentials(job) + + assert hosts == [ + ( + SSHConnectionParams( + hostname="80.80.80.80", + username="ubuntu", + port=22, + ), + FileContent(self.instance_project_key), + ), + ( + SSHConnectionParams( + hostname="localhost", + username="root", + port=expected_port, + ), + FileContent(self.run_project_key), + ), + ] + + async def test_container_based_backend( + self, + session: AsyncSession, + instance_project: ProjectModel, + run: RunModel, + ): + instance = await create_instance( + session=session, project=instance_project, backend=BackendType.RUNPOD + ) + jpd = get_job_provisioning_data( + backend=BackendType.RUNPOD, + dockerized=False, + hostname="100.100.100.100", + username="root", + ssh_port=32768, + ssh_proxy=None, + ) + job = await create_job( + session=session, + run=run, + instance=instance, + 
job_provisioning_data=jpd, + ) + + hosts = get_container_ssh_credentials(job) + + assert hosts == [ + ( + SSHConnectionParams( + hostname="100.100.100.100", + username="root", + port=32768, + ), + FileContent(self.run_project_key), + ), + ] + + async def test_container_based_backend_with_proxy( + self, + session: AsyncSession, + instance_project: ProjectModel, + run: RunModel, + ): + instance = await create_instance( + session=session, project=instance_project, backend=BackendType.KUBERNETES + ) + jpd = get_job_provisioning_data( + backend=BackendType.KUBERNETES, + dockerized=False, + hostname="10.105.30.22", + username="root", + ssh_port=DSTACK_RUNNER_SSH_PORT, + ssh_proxy=SSHConnectionParams( + hostname="120.120.120.120", + username="root", + port=30022, + ), + ) + job = await create_job( + session=session, + run=run, + instance=instance, + job_provisioning_data=jpd, + ) + + hosts = get_container_ssh_credentials(job) + + assert hosts == [ + ( + SSHConnectionParams( + hostname="120.120.120.120", + username="root", + port=30022, + ), + FileContent(self.run_project_key), + ), + ( + SSHConnectionParams( + hostname="10.105.30.22", + username="root", + port=DSTACK_RUNNER_SSH_PORT, + ), + FileContent(self.run_project_key), + ), + ] + + async def test_ssh_instance_with_head_proxy( + self, + session: AsyncSession, + instance_project: ProjectModel, + run: RunModel, + ): + rci = get_remote_connection_info( + host="192.168.100.50", + port=22222, + ssh_user="ubuntu", + # User-provided key is only used for instance provisioning, then we always use + # the project key, which is added during provisioning + ssh_keys=[SSHKey(public="public", private="instance-key")], + ssh_proxy=SSHConnectionParams( + hostname="140.140.140.140", + username="bastion", + port=22, + ), + ssh_proxy_keys=[SSHKey(public="public", private="head-key")], + ) + instance = await create_instance( + session=session, + project=instance_project, + backend=BackendType.REMOTE, + remote_connection_info=rci, + ) + 
jpd = get_job_provisioning_data( + backend=BackendType.REMOTE, + dockerized=True, + hostname="192.168.100.50", + username="ubuntu", + ssh_port=22222, + # Actually, JobModel.job_provisioning_data.ssh_proxy is set to + # InstanceModel.remote_connection_info.ssh_proxy but not used in the function we test + ssh_proxy=None, + ) + job = await create_job( + session=session, + run=run, + instance=instance, + job_provisioning_data=jpd, + # jrd is tested in vm-based backend tests + job_runtime_data=None, + ) + + hosts = get_container_ssh_credentials(job) + + assert hosts == [ + ( + SSHConnectionParams( + hostname="140.140.140.140", + username="bastion", + port=22, + ), + FileContent("head-key"), + ), + ( + SSHConnectionParams( + hostname="192.168.100.50", + username="ubuntu", + port=22222, + ), + FileContent(self.instance_project_key), + ), + ( + SSHConnectionParams( + hostname="localhost", + username="root", + port=DSTACK_RUNNER_SSH_PORT, + ), + FileContent(self.run_project_key), + ), + ] diff --git a/src/tests/_internal/server/services/test_templates.py b/src/tests/_internal/server/services/test_templates.py new file mode 100644 index 0000000000..45fcaf5628 --- /dev/null +++ b/src/tests/_internal/server/services/test_templates.py @@ -0,0 +1,331 @@ +import uuid +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml +from git import GitCommandError + +from dstack._internal.core.models.templates import ( + EnvUITemplateParameter, + NameUITemplateParameter, +) +from dstack._internal.server.services import templates as templates_service + + +@pytest.fixture(autouse=True) +def _reset_cache(): + """Reset the templates cache before each test.""" + templates_service._templates_cache.clear() + yield + templates_service._templates_cache.clear() + + +def _create_template_file(templates_dir: Path, filename: str, data: dict) -> Path: + filepath = templates_dir / filename + with open(filepath, "w") as f: + yaml.dump(data, f) + return filepath + + +def 
_create_templates_repo(tmp_path: Path) -> Path: + """Create a fake templates repo directory with .dstack/templates/.""" + templates_dir = tmp_path / ".dstack" / "templates" + templates_dir.mkdir(parents=True) + return templates_dir + + +class TestListTemplates: + @pytest.mark.asyncio + async def test_returns_empty_when_no_repo_configured(self): + with patch.object(templates_service.settings, "SERVER_TEMPLATES_REPO", None): + project = type("Project", (), {"templates_repo": None, "id": "project-id"})() + result = await templates_service.list_templates(project) + assert result == [] + + +class TestParseTemplates: + def test_returns_empty_when_templates_dir_missing(self, tmp_path: Path): + result = templates_service._parse_templates(tmp_path) + assert result == [] + + def test_parses_valid_template(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "test-template", + "title": "Test Template", + "parameters": [{"type": "name"}], + "configuration": {"type": "dev-environment"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + assert result[0].name == "test-template" + assert isinstance(result[0].parameters[0], NameUITemplateParameter) + + def test_parses_template_with_env_parameter(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "test", + "title": "Test", + "parameters": [ + {"type": "env", "title": "Password", "name": "PASSWORD", "value": "secret"} + ], + "configuration": {"type": "service"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + param = result[0].parameters[0] + assert isinstance(param, EnvUITemplateParameter) + assert param.title == "Password" + assert param.name == "PASSWORD" + assert param.value == "secret" + + def 
test_skips_non_yaml_files(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "valid.yml", + { + "type": "template", + "name": "valid", + "title": "Valid", + "configuration": {"type": "task"}, + }, + ) + (templates_dir / "readme.txt").write_text("not a template") + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + assert result[0].name == "valid" + + def test_skips_non_template_type(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "other.yml", + {"type": "something-else", "name": "other", "title": "Other"}, + ) + result = templates_service._parse_templates(tmp_path) + assert result == [] + + def test_skips_invalid_yaml(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + (templates_dir / "bad.yml").write_text(": invalid: yaml: [") + _create_template_file( + templates_dir, + "good.yml", + { + "type": "template", + "name": "good", + "title": "Good", + "configuration": {"type": "task"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + assert result[0].name == "good" + + def test_skips_template_with_unknown_parameter_type(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "bad_param.yml", + { + "type": "template", + "name": "bad-param", + "title": "Bad Param", + "parameters": [{"type": "unknown_type"}], + "configuration": {"type": "task"}, + }, + ) + _create_template_file( + templates_dir, + "good.yml", + { + "type": "template", + "name": "good", + "title": "Good", + "configuration": {"type": "task"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + assert result[0].name == "good" + + def test_parses_yaml_extension(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + 
"test.yaml", + { + "type": "template", + "name": "yaml-ext", + "title": "YAML Extension", + "configuration": {"type": "task"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 1 + assert result[0].name == "yaml-ext" + + def test_returns_templates_sorted_by_filename(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "b.yml", + { + "type": "template", + "name": "b", + "title": "B", + "configuration": {"type": "task"}, + }, + ) + _create_template_file( + templates_dir, + "a.yml", + { + "type": "template", + "name": "a", + "title": "A", + "configuration": {"type": "task"}, + }, + ) + result = templates_service._parse_templates(tmp_path) + assert len(result) == 2 + assert result[0].name == "a" + assert result[1].name == "b" + + +class TestListTemplatesSync: + def test_returns_empty_if_repo_fetch_fails(self): + with patch.object( + templates_service, + "_fetch_templates_repo", + side_effect=GitCommandError(["git", "clone"], 128, stderr="not found"), + ): + result = templates_service._list_templates_sync( + "project-key", "https://github.com/dstackai/dstack-sky" + ) + assert result == [] + + def test_caches_result(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "cached", + "title": "Cached", + "configuration": {"type": "task"}, + }, + ) + + with ( + patch.object(templates_service, "_fetch_templates_repo", return_value=tmp_path), + ): + result1 = templates_service._list_templates_sync("project-key", "https://example.com") + assert len(result1) == 1 + + (templates_dir / "test.yml").unlink() + + result2 = templates_service._list_templates_sync("project-key", "https://example.com") + assert len(result2) == 1 + assert result2[0].name == "cached" + + def 
test_refreshes_after_cache_clear(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "original", + "title": "Original", + "configuration": {"type": "task"}, + }, + ) + + with ( + patch.object(templates_service, "_fetch_templates_repo", return_value=tmp_path), + ): + result1 = templates_service._list_templates_sync("project-key", "https://example.com") + assert result1[0].name == "original" + + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "updated", + "title": "Updated", + "configuration": {"type": "task"}, + }, + ) + templates_service._templates_cache.clear() + + result2 = templates_service._list_templates_sync("project-key", "https://example.com") + assert result2[0].name == "updated" + + def test_refreshes_after_cache_ttl_expiration(self, tmp_path: Path): + templates_dir = _create_templates_repo(tmp_path) + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "original", + "title": "Original", + "configuration": {"type": "task"}, + }, + ) + + with patch.object(templates_service, "_fetch_templates_repo", return_value=tmp_path): + result1 = templates_service._list_templates_sync("project-key", "https://example.com") + assert result1[0].name == "original" + + _create_template_file( + templates_dir, + "test.yml", + { + "type": "template", + "name": "updated-after-expire", + "title": "Updated", + "configuration": {"type": "task"}, + }, + ) + + templates_service._templates_cache.expire( + time=templates_service._templates_cache.timer() + + templates_service.CACHE_TTL_SECONDS + + 1 + ) + + result2 = templates_service._list_templates_sync("project-key", "https://example.com") + assert result2[0].name == "updated-after-expire" + + +class TestInvalidateTemplatesCache: + def 
test_removes_cache_entries_for_project_repo_keys(self): + templates_service._templates_cache.clear() + project_id = uuid.UUID("00000000-0000-0000-0000-000000000001") + repo1 = "https://example.com/templates-1.git" + repo2 = "https://example.com/templates-2.git" + key1 = templates_service._repo_key(project_id=project_id, repo_url=repo1) + key2 = templates_service._repo_key(project_id=project_id, repo_url=repo2) + templates_service._templates_cache[(key1, repo1)] = ["a"] + templates_service._templates_cache[(key2, repo2)] = ["b"] + + templates_service.invalidate_templates_cache(project_id, repo1, repo2) + + assert (key1, repo1) not in templates_service._templates_cache + assert (key2, repo2) not in templates_service._templates_cache diff --git a/src/tests/_internal/server/services/test_users.py b/src/tests/_internal/server/services/test_users.py new file mode 100644 index 0000000000..9fd137eef8 --- /dev/null +++ b/src/tests/_internal/server/services/test_users.py @@ -0,0 +1,29 @@ +import pytest + +from dstack._internal.server.services.users import is_valid_username + + +class TestIsValidUsername: + @pytest.mark.parametrize( + "username", + [ + "special#$symbols", + "A,B", + "", + "a" * 61, + ], + ) + def test_valid(self, username: str): + assert not is_valid_username(username) + + @pytest.mark.parametrize( + "username", + [ + "regularusername", + "CaseUsername", + "username_with_underscores-and-dashes1234", + "a" * 60, + ], + ) + def test_invalid(self, username: str): + assert is_valid_username(username) diff --git a/src/tests/_internal/server/services/test_volumes.py b/src/tests/_internal/server/services/test_volumes.py new file mode 100644 index 0000000000..82477812ab --- /dev/null +++ b/src/tests/_internal/server/services/test_volumes.py @@ -0,0 +1,132 @@ +from datetime import datetime, timezone + +import pytest +from freezegun import freeze_time + +from dstack._internal.core.errors import ServerClientError 
+from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.volumes import AWSVolumeConfiguration, VolumeStatus +from dstack._internal.server.services.volumes import ( + _get_volume_cost, + _validate_volume_configuration, +) +from dstack._internal.server.testing.common import ( + get_volume, + get_volume_provisioning_data, +) + + +class TestValidateVolumeConfiguration: + def test_external_volume_with_auto_cleanup_duration_raises_error(self): + """External volumes (with volume_id) should not allow auto_cleanup_duration""" + config = AWSVolumeConfiguration( + backend=BackendType.AWS, + region="us-east-1", + volume_id="vol-123456", + auto_cleanup_duration="1h", + ) + with pytest.raises( + ServerClientError, match="External volumes.*do not support auto_cleanup_duration" + ): + _validate_volume_configuration(config) + + def test_external_volume_with_auto_cleanup_duration_int_raises_error(self): + """External volumes with integer auto_cleanup_duration should also raise error""" + config = AWSVolumeConfiguration( + backend=BackendType.AWS, + region="us-east-1", + volume_id="vol-123456", + auto_cleanup_duration=3600, + ) + with pytest.raises( + ServerClientError, match="External volumes.*do not support auto_cleanup_duration" + ): + _validate_volume_configuration(config) + + def test_external_volume_with_auto_cleanup_disabled_succeeds(self): + """External volumes with auto_cleanup_duration='off' or -1 should be allowed""" + config1 = AWSVolumeConfiguration( + backend=BackendType.AWS, + region="us-east-1", + volume_id="vol-123456", + auto_cleanup_duration="off", + ) + config2 = AWSVolumeConfiguration( + backend=BackendType.AWS, + region="us-east-1", + volume_id="vol-123456", + auto_cleanup_duration=-1, + ) + # Should not raise any errors + _validate_volume_configuration(config1) + _validate_volume_configuration(config2) + + def test_external_volume_without_auto_cleanup_succeeds(self): + """External volumes without 
auto_cleanup_duration should be allowed""" + config = AWSVolumeConfiguration( + backend=BackendType.AWS, region="us-east-1", volume_id="vol-123456" + ) + # Should not raise any errors + _validate_volume_configuration(config) + + def test_new_volume_with_auto_cleanup_duration_succeeds(self): + """New volumes (without volume_id) with auto_cleanup_duration should be allowed""" + config = AWSVolumeConfiguration( + backend=BackendType.AWS, region="us-east-1", size=100, auto_cleanup_duration="1h" + ) + # Should not raise any errors + _validate_volume_configuration(config) + + +class TestGetVolumeCost: + def test_returns_0_when_no_provisioning_data(self): + volume = get_volume(provisioning_data=None) + assert _get_volume_cost(volume) == 0.0 + + def test_returns_0_when_no_price(self): + volume = get_volume( + provisioning_data=get_volume_provisioning_data(price=None), + ) + assert _get_volume_cost(volume) == 0.0 + + @freeze_time(datetime(2025, 1, 31, 0, 0, tzinfo=timezone.utc)) + def test_calculates_active_volume_cost(self): + volume = get_volume( + status=VolumeStatus.ACTIVE, + deleted=False, + provisioning_data=get_volume_provisioning_data(price=30), + created_at=datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), + ) + assert _get_volume_cost(volume) == pytest.approx(30.0) + + @freeze_time(datetime(2025, 1, 31, 0, 0, tzinfo=timezone.utc)) + def test_calculates_finished_volume_cost(self): + volume = get_volume( + provisioning_data=get_volume_provisioning_data(price=30), + created_at=datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), + deleted=True, + deleted_at=datetime(2025, 1, 16, 0, 0, tzinfo=timezone.utc), # 15 days later + ) + # Cost should be for 15 days out of a 30-day pricing period + assert _get_volume_cost(volume) == pytest.approx(15.0) + + @freeze_time(datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc)) + def test_calculates_zero_cost_for_zero_duration_active(self): + volume = get_volume( + status=VolumeStatus.ACTIVE, + deleted=False, + 
provisioning_data=get_volume_provisioning_data(price=30), + created_at=datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), # Same as frozen time + ) + assert _get_volume_cost(volume) == 0.0 + + def test_calculates_zero_cost_for_zero_duration_finished(self): + finished_time = datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc) + volume = get_volume( + status=VolumeStatus.FAILED, + deleted=False, # Can be failed without being deleted + provisioning_data=get_volume_provisioning_data(price=30), + created_at=finished_time, + last_processed_at=finished_time, + ) + assert _get_volume_cost(volume) == 0.0 diff --git a/src/tests/_internal/server/test_app.py b/src/tests/_internal/server/test_app.py new file mode 100644 index 0000000000..6d6d7db01b --- /dev/null +++ b/src/tests/_internal/server/test_app.py @@ -0,0 +1,97 @@ +from typing import Optional +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal import settings +from dstack._internal.server.main import app +from dstack._internal.server.testing.common import create_user, get_auth_headers + +client = TestClient(app) + + +class TestIndex: + @pytest.mark.ui + @pytest.mark.asyncio + async def test_returns_html(self, client: AsyncClient): + response = await client.get("/") + assert response.status_code == 200 + assert response.content.startswith(b'<') + + +class TestCheckXApiVersion: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + ("client_version", "server_version", "is_compatible"), + [ + ("12.12.12", None, True), + ("0.12.4", "0.12.4", True), + (None, "0.1.12", True), + ("0.13.0", "0.12.4", False), + # For test performance, only a few cases are covered here. + # More cases are covered in `TestCheckClientServerCompatibility`. 
+ ], + ) + @pytest.mark.parametrize("endpoint", ["/api/users/list", "/api/projects/list"]) + async def test_check_client_compatibility( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + endpoint: str, + client_version: Optional[str], + server_version: Optional[str], + is_compatible: bool, + ): + user = await create_user(session=session) + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + + with patch.object(settings, "DSTACK_VERSION", server_version): + response = await client.post(endpoint, headers=headers, json={}) + + if is_compatible: + assert response.status_code == 200, response.text + else: + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "code": "error", + "msg": f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version}).", + } + ] + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("endpoint", ["/api/users/list", "/api/projects/list"]) + @pytest.mark.parametrize("invalid_value", ["", "1..0", "version1"]) + async def test_invalid_x_api_version_header( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + endpoint: str, + invalid_value: str, + ): + user = await create_user(session=session) + headers = get_auth_headers(user.token) + headers["X-API-Version"] = invalid_value + + response = await client.post(endpoint, headers=headers, json={}) + + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "code": None, + "msg": f"Invalid version: {invalid_value}", + } + ] + } diff --git a/src/tests/_internal/server/test_migrations.py b/src/tests/_internal/server/test_migrations.py index c552dea425..49022fda61 100644 --- a/src/tests/_internal/server/test_migrations.py +++ b/src/tests/_internal/server/test_migrations.py @@ -1,21 +1,52 @@ from pathlib import Path import pytest 
-from alembic.command import check, upgrade +from alembic.command import check, downgrade, upgrade from alembic.config import Config from alembic.util.exc import CommandError from sqlalchemy import create_engine +from sqlalchemy.ext.asyncio import create_async_engine +from testcontainers.postgres import PostgresContainer -def test_no_database_migration_needs_to_be_added(monkeypatch: pytest.MonkeyPatch): +def test_sqlite_migrations(monkeypatch: pytest.MonkeyPatch): server_dir = Path(__file__).parent.joinpath("../../../dstack/_internal/server").resolve() monkeypatch.chdir(server_dir) alembic_cfg = Config("alembic.ini") alembic_cfg.attributes["connection"] = create_engine("sqlite://").connect() + # disable fileConfig() call in env.py as it breaks pytest.LogCaptureFixture + alembic_cfg.attributes["configure_logging"] = False try: upgrade(alembic_cfg, "head") check(alembic_cfg) + downgrade(alembic_cfg, "base") except CommandError as e: pytest.fail(str(e)) + + +@pytest.mark.postgres +@pytest.mark.asyncio +async def test_postgres_migrations(monkeypatch: pytest.MonkeyPatch): + def f(connection, alembic_cfg): + alembic_cfg.attributes["connection"] = connection + # disable fileConfig() call in env.py as it breaks pytest.LogCaptureFixture + alembic_cfg.attributes["configure_logging"] = False + try: + upgrade(alembic_cfg, "head") + check(alembic_cfg) + downgrade(alembic_cfg, "base") + except CommandError as e: + pytest.fail(str(e)) + + server_dir = Path(__file__).parent.joinpath("../../../dstack/_internal/server").resolve() + monkeypatch.chdir(server_dir) + alembic_cfg = Config("alembic.ini") + with PostgresContainer("postgres:16-alpine", driver="asyncpg") as postgres: + db_url = postgres.get_connection_url() + # This is needed to run offline(sync) migrations via async driver + # https://alembic.sqlalchemy.org/en/latest/cookbook.html#programmatic-api-use-connection-sharing-with-asyncio + engine = create_async_engine(db_url) + async with 
engine.connect() as conn: + await conn.run_sync(f, alembic_cfg) diff --git a/src/tests/_internal/server/utils/test_routers.py b/src/tests/_internal/server/utils/test_routers.py index 8b3eb22f09..0aeb4be8b8 100644 --- a/src/tests/_internal/server/utils/test_routers.py +++ b/src/tests/_internal/server/utils/test_routers.py @@ -1,85 +1,52 @@ from typing import Optional +import packaging.version import pytest +from fastapi import HTTPException from dstack._internal.server.utils.routers import check_client_server_compatibility class TestCheckClientServerCompatibility: - @pytest.mark.parametrize("client_version", ["12.12.12", None]) - def test_returns_none_if_server_version_is_none(self, client_version: Optional[str]): - assert ( - check_client_server_compatibility( - client_version=client_version, - server_version=None, - ) - is None - ) - @pytest.mark.parametrize( - "client_version,server_version", + ("client_version", "server_version"), [ + ("0.12.5", "0.12.4"), + ("0.12.5rc1", "0.12.4"), + ("0.12.4rc1", "0.12.4"), ("0.12.4", "0.12.4"), ("0.12.4", "0.12.5"), - ("1.0.5", "1.0.6"), + ("0.12.4", "0.13.0"), + ("0.12.4", "1.12.0"), ("0.12.4", "0.12.5rc1"), + ("1.0.5", "1.0.6"), + ("12.12.12", None), + (None, "0.1.12"), + (None, None), ], ) - def test_returns_none_if_compatible( + def test_compatible( self, client_version: Optional[str], server_version: Optional[str] - ): - assert ( - check_client_server_compatibility( - client_version=client_version, - server_version=server_version, - ) - is None - ) + ) -> None: + parsed_client_version = None + if client_version is not None: + parsed_client_version = packaging.version.parse(client_version) - @pytest.mark.parametrize( - "client_version,server_version", - [ - ("0.12.4", "0.13.0"), - ("0.12.0", "1.12.0"), - ], - ) - def test_returns_error_if_client_version_smaller( - self, client_version: Optional[str], server_version: Optional[str] - ): - res = check_client_server_compatibility( - client_version=client_version, + 
check_client_server_compatibility( + client_version=parsed_client_version, server_version=server_version, ) - assert res is not None @pytest.mark.parametrize( - "client_version,server_version", + ("client_version", "server_version"), [ - # no forward-compatibility at all (see https://github.com/dstackai/dstack/issues/1162) - ("0.12.5", "0.12.4"), ("0.13.0", "0.12.4"), ("1.12.0", "0.12.0"), ], ) - def test_returns_error_if_client_version_larger( - self, client_version: Optional[str], server_version: Optional[str] - ): - res = check_client_server_compatibility( - client_version=client_version, - server_version=server_version, - ) - assert res is not None - - @pytest.mark.parametrize( - "server_version", - [ - None, - "0.1.12", - ], - ) - def test_returns_none_if_client_version_is_latest(self, server_version: Optional[str]): - res = check_client_server_compatibility( - client_version="latest", - server_version=server_version, - ) - assert res is None + def test_incompatible(self, client_version: str, server_version: str) -> None: + with pytest.raises(HTTPException): + check_client_server_compatibility( + client_version=packaging.version.parse(client_version), + server_version=server_version, + ) diff --git a/src/tests/_internal/server/utils/test_settings.py b/src/tests/_internal/server/utils/test_settings.py new file mode 100644 index 0000000000..d2d40385db --- /dev/null +++ b/src/tests/_internal/server/utils/test_settings.py @@ -0,0 +1,46 @@ +from typing import Optional + +import pytest + +from dstack._internal.server.utils.settings import parse_hostname_port + + +class TestParseHostnamePort: + @pytest.mark.parametrize( + ["value", "expected_hostname", "expected_port"], + [ + pytest.param("example.com", "example.com", None, id="domain"), + pytest.param("example.com:22", "example.com", 22, id="domain-port"), + pytest.param("10.0.0.1", "10.0.0.1", None, id="ipv4"), + pytest.param( + "[fd69:b03c:7b2:b68a:6eda:b557:9526:757]", + 
"fd69:b03c:7b2:b68a:6eda:b557:9526:757", + None, + id="ipv6", + ), + pytest.param( + "[fd69:b03c:7b2:b68a:6eda:b557:9526:757]:22", + "fd69:b03c:7b2:b68a:6eda:b557:9526:757", + 22, + id="ipv6-port", + ), + ], + ) + def test_valid(self, value: str, expected_hostname: str, expected_port: Optional[int]): + hostname, port = parse_hostname_port(value) + assert hostname == expected_hostname + assert port == expected_port + + @pytest.mark.parametrize( + "value", + [ + pytest.param("", id="empty-string"), + pytest.param(":22", id="no-hostname"), + pytest.param("fd69:b03c:7b2:b68a:6eda:b557:9526:757", id="ipv6-without-brackets"), + pytest.param("example.com:port", id="non-integer-port"), + pytest.param("example.com:1000000", id="port-out-of-range"), + ], + ) + def test_invalid(self, value: str): + with pytest.raises(ValueError, match=r"must be valid HOSTNAME\[:PORT\]"): + parse_hostname_port(value) diff --git a/src/tests/_internal/utils/test_common.py b/src/tests/_internal/utils/test_common.py index 6e77d2f0a5..70d12c8f39 100644 --- a/src/tests/_internal/utils/test_common.py +++ b/src/tests/_internal/utils/test_common.py @@ -1,10 +1,32 @@ from datetime import datetime, timedelta, timezone -from typing import Any, Iterable, List +from typing import Any, Iterable import pytest from freezegun import freeze_time -from dstack._internal.utils.common import parse_memory, pretty_date, split_chunks +from dstack._internal.utils.common import ( + batched, + concat_url_path, + format_duration_multiunit, + has_duplicates, + local_time, + make_proxy_url, + parse_memory, + pretty_date, + pretty_resources, + sizeof_fmt, +) + + +@pytest.mark.parametrize( + ("dt", "result"), + [ + (datetime.fromisoformat("1970-01-01T12:34"), "12:34"), + (datetime.fromisoformat("2024-12-01T01:02:03"), "01:02"), + ], +) +def test_local_time(dt: datetime, result: str) -> None: + assert local_time(dt) == result @freeze_time(datetime(2023, 10, 4, 12, 0, tzinfo=timezone.utc)) @@ -48,29 +70,84 @@ def 
test_days_ago(self): past_time = now - timedelta(days=5) assert pretty_date(past_time) == "5 days ago" + def test_week_ago(self): + now = datetime.now(tz=timezone.utc) + past_time = now - timedelta(days=7) + assert pretty_date(past_time) == "1 week ago" + def test_weeks_ago(self): now = datetime.now(tz=timezone.utc) past_time = now - timedelta(days=21) assert pretty_date(past_time) == "3 weeks ago" + def test_month_ago(self): + now = datetime.now(tz=timezone.utc) + past_time = now - timedelta(days=31) + assert pretty_date(past_time) == "1 month ago" + def test_months_ago(self): now = datetime.now(tz=timezone.utc) past_time = now - timedelta(days=90) assert pretty_date(past_time) == "3 months ago" - def test_years_ago(self): + def test_year_ago(self): now = datetime.now(tz=timezone.utc) past_time = now - timedelta(days=400) assert pretty_date(past_time) == "1 year ago" + def test_years_ago(self): + now = datetime.now(tz=timezone.utc) + past_time = now - timedelta(days=700) + assert pretty_date(past_time) == "2 years ago" + def test_future_time(self): now = datetime.now(tz=timezone.utc) future_time = now + timedelta(hours=1) assert pretty_date(future_time) == "" - def test_epoch_timestamp(self): - epoch_time = 1609459200 # January 1, 2021 - assert pretty_date(epoch_time) == "3 years ago" + +class TestFormatDurationMultiunit: + @pytest.mark.parametrize( + ("input", "output"), + [ + (0, "0s"), + (59, "59s"), + (60, "1m"), + (61, "1m 1s"), + (694861, "1w 1d 1h 1m 1s"), + (86401, "1d 1s"), + ], + ) + def test(self, input: int, output: str) -> None: + assert format_duration_multiunit(input) == output + + def test_forbids_negative(self) -> None: + with pytest.raises(ValueError): + format_duration_multiunit(-1) + + +@pytest.mark.parametrize( + "iterable, expected", + [ + ([1, 2, 3, 4], False), + ([1, 2, 3, 4, 2], True), + (iter([1, 2, 3, 4]), False), + (iter([1, 2, 3, 4, 2]), True), + ("abcde", False), + ("hello", True), + ([1, "a"], False), + ([1, "a", 1], True), + ([[1, 
2], [3, 4]], False), + ([[1, 2], [1, 2]], True), + ([{"a": "b"}, {"a": "c"}], False), + ([{"a": "b"}, {"a": "b"}], True), + ([{}, {}], True), + ([], False), + ([1], False), + ], +) +def test_has_duplicates(iterable, expected): + assert has_duplicates(iterable) == expected class TestParseMemory: @@ -87,9 +164,9 @@ def test_parses_memory(self, memory, as_units, expected): assert parse_memory(memory, as_untis=as_units) == expected -class TestSplitChunks: +class TestBatched: @pytest.mark.parametrize( - ("iterable", "chunk_size", "expected_chunks"), + ("iterable", "n", "expected_batches"), [ ([1, 2, 3, 4], 2, [[1, 2], [3, 4]]), ([1, 2, 3], 2, [[1, 2], [3]]), @@ -100,12 +177,161 @@ class TestSplitChunks: ((x for x in range(5)), 3, [[0, 1, 2], [3, 4]]), ], ) - def test_split_chunks( - self, iterable: Iterable[Any], chunk_size: int, expected_chunks: List[List[Any]] + def test_batched( + self, iterable: Iterable[Any], n: int, expected_batches: list[list[Any]] ) -> None: - assert list(split_chunks(iterable, chunk_size)) == expected_chunks + assert list(batched(iterable, n)) == expected_batches - @pytest.mark.parametrize("chunk_size", [0, -1]) - def test_raises_on_invalid_chunk_size(self, chunk_size: int) -> None: + @pytest.mark.parametrize("n", [0, -1]) + def test_raises_on_invalid_n(self, n: int) -> None: with pytest.raises(ValueError): - list(split_chunks([1, 2, 3], chunk_size)) + list(batched([1, 2, 3], n)) + + +@pytest.mark.parametrize( + ("a", "b", "result"), + [ + ("/a/b", "c/d", "/a/b/c/d"), + ("/a/b/", "/c/d", "/a/b/c/d"), + ("/a/b//", "//c/d", "/a/b///c/d"), + ("/a", "", "/a"), + ("/a", "/", "/a/"), + ("", "a", "/a"), + ("/", "a", "/a"), + ("", "", ""), + ], +) +def test_concat_url_path(a: str, b: str, result: str) -> None: + assert concat_url_path(a, b) == result + assert concat_url_path(a.encode(), b.encode()) == result.encode() + + +@pytest.mark.parametrize( + ("server_url", "proxy_url", "expected_url"), + [ + pytest.param( + 
"http://localhost:3000", + "https://gateway.mycompany.example/", + "https://gateway.mycompany.example/", + ), + ( + "https://dstack.mycompany.example/", + "http://gateway.mycompany.example/some/path", + "http://gateway.mycompany.example/some/path", + ), + ( + "http://localhost:3000", + "/proxy/services/main/service/", + "http://localhost:3000/proxy/services/main/service/", + ), + ( + "http://localhost:3000/", + "/proxy/models/main", + "http://localhost:3000/proxy/models/main", + ), + ( + "https://dstack.mycompany.example/some/prefix", + "/proxy/models/main", + "https://dstack.mycompany.example/some/prefix/proxy/models/main", + ), + ], +) +def test_make_proxy_url(server_url, proxy_url, expected_url): + assert make_proxy_url(server_url, proxy_url) == expected_url + + +class TestPrettyResources: + def test_cpu_and_memory(self): + assert pretty_resources(cpus=4, memory="16GB") == "cpu=4 mem=16GB" + + def test_gpu_count_without_name(self): + assert pretty_resources(cpus=4, memory="16GB", gpu_count=1) == "cpu=4 mem=16GB gpu=1" + + def test_gpu_count_with_vendor(self): + assert ( + pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_vendor="nvidia") + == "cpu=4 mem=16GB gpu=nvidia:1" + ) + + def test_gpu_count_with_name(self): + assert ( + pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name="A100") + == "cpu=4 mem=16GB gpu=A100:1" + ) + + def test_gpu_with_name_and_memory(self): + assert ( + pretty_resources( + cpus=4, memory="16GB", gpu_count=1, gpu_name="A100", gpu_memory="40GB" + ) + == "cpu=4 mem=16GB gpu=A100:40GB:1" + ) + + def test_gpu_with_total_memory_without_name(self): + assert ( + pretty_resources(cpus=4, memory="16GB", 
gpu_count=1, total_gpu_memory="80GB") + == "cpu=4 mem=16GB gpu=1:80GB" + ) + + def test_gpu_with_name_memory_and_total_memory(self): + assert ( + pretty_resources( + cpus=4, + memory="16GB", + gpu_count=2, + gpu_name="A100", + gpu_memory="40GB", + total_gpu_memory="80GB", + ) + == "cpu=4 mem=16GB gpu=A100:40GB:2:80GB" + ) + + def test_gpu_with_compute_capability(self): + assert pretty_resources(gpu_count=1, compute_capability="8.0") == "gpu=1:8.0" + + def test_disk(self): + assert ( + pretty_resources(cpus=2, memory="8GB", disk_size="100GB") == "cpu=2 mem=8GB disk=100GB" + ) + + def test_no_gpu(self): + assert pretty_resources(cpus=2, memory="8GB") == "cpu=2 mem=8GB" + + def test_gpu_zero_count_range(self): + """Default GPU spec (0..) should display gpu=0..""" + assert ( + pretty_resources(cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..") + == "cpu=2 mem=8GB disk=100GB gpu=0.." + ) + + def test_gpu_zero_count_range_with_vendor(self): + """Default GPU spec with nvidia vendor should display gpu=nvidia:0..""" + assert ( + pretty_resources( + cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..", gpu_vendor="nvidia" + ) + == "cpu=2 mem=8GB disk=100GB gpu=nvidia:0.." 
+ ) + + +class TestSizeofFmt: + @pytest.mark.parametrize( + ("num", "suffix", "expected"), + [ + (0, "B", "0.0B"), + (1023, "B", "1023.0B"), + (1024, "B", "1.0KiB"), + (1536, "B", "1.5KiB"), + (1048576, "B", "1.0MiB"), + (1073741824, "B", "1.0GiB"), + (1099511627776, "B", "1.0TiB"), + (1125899906842624, "B", "1.0PiB"), + (1152921504606846976, "B", "1.0EiB"), + (1180591620717411303424, "B", "1.0ZiB"), + (1208925819614629174706176, "B", "1.0YiB"), + (2000, "", "2.0Ki"), + (3000000, "Hz", "2.9MiHz"), + ], + ) + def test_sizeof_fmt(self, num: int, suffix: str, expected: str) -> None: + assert sizeof_fmt(num, suffix) == expected diff --git a/src/tests/_internal/utils/test_docker.py b/src/tests/_internal/utils/test_docker.py new file mode 100644 index 0000000000..b2a16e4dc2 --- /dev/null +++ b/src/tests/_internal/utils/test_docker.py @@ -0,0 +1,60 @@ +import pytest + +from dstack._internal.utils.docker import DockerImage, _is_host, parse_image_name + + +class TestParseImageName: + @pytest.mark.parametrize( + ["image", "expected"], + [ + ( + "ubuntu:22.04", + DockerImage(image="ubuntu", registry=None, repo="library/ubuntu", tag="22.04"), + ), + ( + "dstackai/miniforge:py3.9-0.2", + DockerImage( + image="dstackai/miniforge", + registry=None, + repo="dstackai/miniforge", + tag="py3.9-0.2", + ), + ), + ( + "ghcr.io/dstackai/miniforge", + DockerImage( + image="ghcr.io/dstackai/miniforge", + registry="ghcr.io", + repo="dstackai/miniforge", + tag="latest", + ), + ), + ( + "dstackai/miniforge@sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15", + DockerImage( + image="dstackai/miniforge", + registry=None, + repo="dstackai/miniforge", + tag="latest", + digest="sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15", + ), + ), + ], + ) + def test_parse(self, image: str, expected: DockerImage) -> None: + assert parse_image_name(image) == expected + + +class TestIsHost: + @pytest.mark.parametrize( + ["value", "expected"], + [ + ("localhost", 
True), + ("localhost:5000", True), + ("ghcr.io", True), + ("127.0.0.1", True), + ("dstackai", False), + ], + ) + def test_is_host(self, value: str, expected: bool) -> None: + assert _is_host(value) is expected diff --git a/src/tests/_internal/utils/test_env.py b/src/tests/_internal/utils/test_env.py new file mode 100644 index 0000000000..d9242a13e9 --- /dev/null +++ b/src/tests/_internal/utils/test_env.py @@ -0,0 +1,139 @@ +from enum import Enum +from typing import Union + +import pytest + +from dstack._internal.utils.env import Environ + + +class _TestEnviron: + def get_environ(self, **env: str) -> Environ: + return Environ(env) + + +class TestEnvironGetBool(_TestEnviron): + @pytest.mark.parametrize( + ["value", "expected"], + [ + ["0", False], + ["1", True], + ["true", True], + ["True", True], + ["FALSE", False], + ["off", False], + ["ON", True], + ], + ) + def test_is_set(self, value: str, expected: bool): + environ = self.get_environ(VAR=value) + assert environ.get_bool("VAR") is expected + + def test_not_set_default_not_set(self): + environ = self.get_environ() + assert environ.get_bool("VAR") is None + + @pytest.mark.parametrize("default", [False, True]) + def test_not_set_default_is_set(self, default: bool): + environ = self.get_environ() + assert environ.get_bool("VAR", default=default) is default + + @pytest.mark.parametrize("value", ["", "2", "foo"]) + def test_error_bad_value(self, value: str): + environ = self.get_environ(VAR=value) + with pytest.raises(ValueError, match=f"VAR={value}"): + environ.get_bool("VAR") + + +class TestEnvironGetInt(_TestEnviron): + def test_is_set(self): + environ = self.get_environ(VAR="12") + assert environ.get_int("VAR") == 12 + + def test_not_set_default_not_set(self): + environ = self.get_environ() + assert environ.get_int("VAR") is None + + def test_not_set_default_is_set(self): + environ = self.get_environ() + assert environ.get_int("VAR", default=12) == 12 + + @pytest.mark.parametrize("value", ["", "false", "10a"]) + 
def test_error_bad_value(self, value: str): + environ = self.get_environ(VAR=value) + with pytest.raises(ValueError, match=f"VAR={value}"): + environ.get_int("VAR") + + +class _Enum(Enum): + FOO: Union[str, int] + BAR: Union[str, int] + + +class _StrEnum(_Enum): + FOO = "foo" + BAR = "bar" + + +class _IntEnum(_Enum): + FOO = 100 + BAR = 200 + + +class TestEnvironGetEnum(_TestEnviron): + @pytest.mark.parametrize( + ["enum_cls", "value_type", "value"], + [ + pytest.param(_StrEnum, str, "foo", id="str"), + pytest.param(_IntEnum, int, "100", id="int"), + ], + ) + def test_is_set( + self, enum_cls: type[_Enum], value_type: Union[type[str], type[int]], value: str + ): + environ = self.get_environ(VAR=value) + assert environ.get_enum("VAR", enum_cls, value_type=value_type) is enum_cls.FOO + + def test_not_set_default_not_set(self): + environ = self.get_environ() + assert environ.get_enum("VAR", _StrEnum) is None + + def test_not_set_default_is_set(self): + environ = self.get_environ() + assert environ.get_enum("VAR", _IntEnum, default=_IntEnum.BAR) is _IntEnum.BAR + + @pytest.mark.parametrize( + ["enum_cls", "value_type", "value"], + [ + pytest.param(_StrEnum, str, "baz", id="str"), + pytest.param(_IntEnum, int, "300", id="int"), + pytest.param(_IntEnum, int, "10a", id="invalid-int"), + ], + ) + def test_error_bad_value( + self, enum_cls: type[_Enum], value_type: Union[type[str], type[int]], value: str + ): + environ = self.get_environ(VAR=value) + with pytest.raises(ValueError, match=f"VAR={value}"): + environ.get_enum("VAR", enum_cls, value_type=value_type) + + +class TestEnvironGetCallback(_TestEnviron): + def test_is_set(self): + environ = self.get_environ(VAR="foo bar") + assert environ.get_callback("VAR", str.split) == ["foo", "bar"] + + def test_not_set_default_not_set(self): + environ = self.get_environ() + assert environ.get_callback("VAR", str.split) is None + + def test_not_set_default_is_set(self): + environ = self.get_environ() + assert 
environ.get_callback("VAR", str.split, default=["default"]) == ["default"] + + def test_error_bad_value(self): + def callback(value: str) -> list[str]: + raise ValueError("bad value") + + environ = self.get_environ(VAR="value") + with pytest.raises(ValueError, match="bad value: VAR=value"): + environ.get_callback("VAR", callback=callback) diff --git a/src/tests/_internal/utils/test_event_loop.py b/src/tests/_internal/utils/test_event_loop.py new file mode 100644 index 0000000000..98812b52ac --- /dev/null +++ b/src/tests/_internal/utils/test_event_loop.py @@ -0,0 +1,18 @@ +import asyncio + +from dstack._internal.utils.event_loop import DaemonEventLoop + + +def test_daemon_event_loop(): + q = asyncio.Queue() + + async def worker(i): + await q.put(i) + + async def all_workers(): + await asyncio.gather(*[worker(i) for i in range(3)]) + + loop = DaemonEventLoop() + loop.await_(all_workers()) + assert q.qsize() == 3 + assert {loop.await_(q.get()) for _ in range(3)} == {0, 1, 2} diff --git a/src/tests/_internal/utils/test_gpu.py b/src/tests/_internal/utils/test_gpu.py index fc7bd39a92..b649b8d61a 100644 --- a/src/tests/_internal/utils/test_gpu.py +++ b/src/tests/_internal/utils/test_gpu.py @@ -1,19 +1,61 @@ import pytest -from dstack._internal.utils.gpu import convert_gpu_name - -TESTS = [ - ("NVIDIA GeForce RTX 4060 Ti", "RTX4060Ti"), - ("NVIDIA GeForce RTX 4060", "RTX4060"), - ("NVIDIA L4", "L4"), - ("NVIDIA GH200 120GB", "GH200"), - ("NVIDIA A100-SXM4-80GB", "A100"), - ("NVIDIA A10G", "A10"), - ("Tesla T4", "T4"), -] +from dstack._internal.utils.gpu import ( + convert_amd_gpu_name, + convert_intel_accelerator_name, + convert_nvidia_gpu_name, +) class TestConvertGpuName: - @pytest.mark.parametrize("test_input,expected", TESTS) - def test_convert_gpu_name(self, test_input, expected): - assert convert_gpu_name(test_input) == expected + @pytest.mark.parametrize( + ["test_input", "expected"], + [ + ("NVIDIA GeForce RTX 4060 Ti", "RTX4060Ti"), + ("NVIDIA GeForce RTX 4060", 
"RTX4060"), + ("NVIDIA RTX 4000 Ada Generation", "RTX4000Ada"), + ("NVIDIA L4", "L4"), + ("NVIDIA GH200 120GB", "GH200"), + ("NVIDIA A100-SXM4-80GB", "A100"), + ("NVIDIA A10G", "A10G"), + ("NVIDIA L40S", "L40S"), + ("NVIDIA H100 NVL", "H100NVL"), + ("NVIDIA H100 80GB HBM3", "H100"), + ("Tesla T4", "T4"), + ("NVIDIA GeForce RTX 2070 Super", "RTX2070SUPER"), + ("NVIDIA GeForce RTX 4070 Ti SUPER", "RTX4070TiSUPER"), + ], + ) + def test_convert_nvidia_gpu_name(self, test_input, expected): + assert convert_nvidia_gpu_name(test_input) == expected + + @pytest.mark.parametrize( + ["test_input", "expected"], + [ + # The following are asic.market_name from amd-smi collected in the wild + ("MI300X-O", "MI300X"), + ("Instinct MI210", "MI210"), + ("AMD INSTINCT MI250 (MCM) OAM AC MBA", "MI250"), + # The following are made-up examples + ("MI300A", "MI300A"), + ("Instinct MI325X", "MI325X"), + ("AMD Radeon PRO W7900", "AMD Radeon PRO W7900"), + ], + ) + def test_convert_amd_gpu_name(self, test_input, expected): + assert convert_amd_gpu_name(test_input) == expected + + @pytest.mark.parametrize( + ["test_input", "expected"], + [ + # The following are name from hl-smi collected in the wild + ("HL-225", "Gaudi2"), + # The following are made-up examples + ("HL-225B", "Gaudi2"), + ("HL-325L", "Gaudi3"), + ("HL-338", "Gaudi3"), + ("HL-1000", "HL-1000"), + ], + ) + def test_convert_intel_accelerator_name(self, test_input, expected): + assert convert_intel_accelerator_name(test_input) == expected diff --git a/src/tests/_internal/utils/test_interpolator.py b/src/tests/_internal/utils/test_interpolator.py index 43cd14c4bf..50c3845832 100644 --- a/src/tests/_internal/utils/test_interpolator.py +++ b/src/tests/_internal/utils/test_interpolator.py @@ -1,6 +1,6 @@ import pytest -from dstack._internal.utils.interpolator import VariablesInterpolator +from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator def get_interpolator(): @@ -35,17 +35,17 @@ def 
test_missing(self): assert ["env.name"] == missing def test_unclosed_pattern(self): - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ secrets.password }") def test_illegal_name(self): - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ secrets.pass-word }}") - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ .password }}") - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ password. }}") - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ secrets.password.hash }}") - with pytest.raises(ValueError): + with pytest.raises(InterpolatorError): get_interpolator().interpolate("${{ secrets.007 }}") diff --git a/src/tests/_internal/utils/test_nested_list.py b/src/tests/_internal/utils/test_nested_list.py new file mode 100644 index 0000000000..7a86962592 --- /dev/null +++ b/src/tests/_internal/utils/test_nested_list.py @@ -0,0 +1,56 @@ +from textwrap import dedent + +import pytest + +from dstack._internal.utils.nested_list import NestedList, NestedListItem + + +def test_render_flat_list(): + nested = NestedList( + children=[NestedListItem("Item 1"), NestedListItem("Item 2"), NestedListItem("Item 3")] + ) + expected = "- Item 1\n- Item 2\n- Item 3\n" + assert nested.render() == expected + + +def test_render_nested_list(): + nested = NestedList( + children=[ + NestedListItem("Item 1"), + NestedListItem( + "Item 2", + [ + NestedListItem("Item 2.1"), + NestedListItem("Item 2.2", [NestedListItem("Item 2.2.1")]), + ], + ), + NestedListItem("Item 3"), + ] + ) + expected = dedent( + """ + - Item 1 + - Item 2 + - Item 2.1 + - Item 2.2 + - Item 2.2.1 + - Item 3 + """ + ).lstrip() + assert nested.render() == expected + + +def test_render_empty_list(): + nested = NestedList() + assert 
nested.render() == "" + + +def test_cycle_detection(): + a = NestedListItem("A") + b = NestedListItem("B", [a]) + a.children.append(b) # Introduce a cycle: A → B → A + + nested = NestedList(children=[a]) + + with pytest.raises(ValueError, match="Cycle detected at item: A"): + nested.render() diff --git a/src/tests/_internal/utils/test_path.py b/src/tests/_internal/utils/test_path.py index 04fb5dfd2d..cb707892bc 100644 --- a/src/tests/_internal/utils/test_path.py +++ b/src/tests/_internal/utils/test_path.py @@ -2,7 +2,19 @@ import pytest -from dstack._internal.utils.path import resolve_relative_path +from dstack._internal.utils.path import normalize_path, resolve_relative_path + + +class TestNormalizePath: + def test_escape_top(self): + with pytest.raises(ValueError): + normalize_path("dir/../..") + + def test_normalize_rel(self): + assert normalize_path("dir/.///..///sibling") == PurePath("sibling") + + def test_normalize_abs(self): + assert normalize_path("/dir/.///..///sibling") == PurePath("/sibling") class TestResolveRelativePath: diff --git a/src/tests/_internal/utils/test_tags.py b/src/tests/_internal/utils/test_tags.py new file mode 100644 index 0000000000..d152540d55 --- /dev/null +++ b/src/tests/_internal/utils/test_tags.py @@ -0,0 +1,73 @@ +import pytest + +from dstack._internal.utils.tags import is_valid_tag_key, is_valid_tag_value, validate_tags + + +class TestIsValidTagKey: + @pytest.mark.parametrize( + "key", + [ + "Environment", + "Project123", + "special-chars_", + "a" * 60, + ], + ) + def test_valid_tag_key(self, key): + assert is_valid_tag_key(key) + + @pytest.mark.parametrize( + "key", + [ + "key\twith\nweird\nspaces", + "", + "a" * 61, + "Invalid#Char", + ], + ) + def test_invalid_tag_key(self, key): + assert not is_valid_tag_key(key) + + +class TestIsValidTagValue: + @pytest.mark.parametrize( + "value", + [ + "Production", + "v1.0", + "", + "a" * 256, + ], + ) + def test_valid_tag_value(self, value): + assert is_valid_tag_value(value) is True + 
+ @pytest.mark.parametrize( + "value", + [ + "a" * 257, + "Invalid#Value", + ], + ) + def test_invalid_tag_value(self, value): + assert is_valid_tag_value(value) is False + + +class TestValidateTags: + def test_validate_valid_tags(self): + tags = { + "Environment": "Production", + "project": "Tag_Validator", + } + assert validate_tags(tags) is None + + @pytest.mark.parametrize( + "tags", + [ + {"invalidkey!": "SomeValue"}, + {"ValidKey": "Invalid#Value"}, + ], + ) + def test_validate_invalid_tags(self, tags): + with pytest.raises(ValueError): + validate_tags(tags) diff --git a/src/tests/_internal/utils/test_version.py b/src/tests/_internal/utils/test_version.py new file mode 100644 index 0000000000..d24d0d1632 --- /dev/null +++ b/src/tests/_internal/utils/test_version.py @@ -0,0 +1,17 @@ +import packaging.version +import pytest + +from dstack._internal.utils.version import parse_version + + +class TestParseVersion: + @pytest.mark.parametrize("version", ["0.0.0", "0.0.0.dev0", "0.0.0alpha", "latest"]) + def test_latest(self, version: str): + assert parse_version(version) is None + + def test_release(self): + assert parse_version("0.19.27") == packaging.version.parse("0.19.27") + + def test_error_invalid_version(self): + with pytest.raises(ValueError, match=r"Invalid version: 0\.0invalid"): + parse_version("0.0invalid") diff --git a/src/tests/api/common.py b/src/tests/api/common.py new file mode 100644 index 0000000000..c453b6afee --- /dev/null +++ b/src/tests/api/common.py @@ -0,0 +1,29 @@ +import json +from dataclasses import dataclass, field +from typing import Any, Optional + +import requests + + +@dataclass +class RequestRecorder: + payload: Any + last_path: Optional[str] = None + last_body: Optional[str] = None + last_kwargs: dict[str, Any] = field(default_factory=dict) + + def __call__( + self, + path: str, + body: Optional[str] = None, + raise_for_status: bool = True, + method: str = "POST", + **kwargs, + ) -> requests.Response: + self.last_path = path + 
self.last_body = body + self.last_kwargs = kwargs + resp = requests.Response() + resp.status_code = 200 + resp._content = json.dumps(self.payload).encode("utf-8") + return resp diff --git a/src/tests/api/test_projects.py b/src/tests/api/test_projects.py new file mode 100644 index 0000000000..38b93bae5a --- /dev/null +++ b/src/tests/api/test_projects.py @@ -0,0 +1,62 @@ +import json +import logging +from datetime import datetime, timezone +from uuid import UUID + +from dstack.api.server._projects import ProjectsAPIClient +from tests.api.common import RequestRecorder + +PROJECT_PAYLOAD = { + "project_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "project_name": "p", + "owner": { + "id": "2b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "username": "u", + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": "user", + "email": None, + "active": True, + "permissions": {"can_create_projects": True}, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [], + "is_public": False, +} + + +class TestProjectsAPIClientList: + def test_projects_list_serializes_pagination_and_parses_info_list(self): + request = RequestRecorder(payload={"total_count": 1, "projects": [PROJECT_PAYLOAD]}) + client = ProjectsAPIClient(_request=request, _logger=logging.getLogger("test")) + dt = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + pid = UUID("3b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + + result = client.list( + return_total_count=True, + prev_created_at=dt, + name_pattern="p", + prev_id=pid, + limit=1, + ascending=True, + ) + + payload = json.loads(request.last_body) + assert request.last_path == "/api/projects/list" + assert payload["include_not_joined"] is True + assert payload["return_total_count"] is True + assert payload["name_pattern"] == "p" + assert payload["prev_created_at"] == dt.isoformat() + assert payload["prev_id"] == str(pid) + assert payload["limit"] == 1 + assert payload["ascending"] is True + assert result.total_count == 
1 + assert result.projects[0].project_name == "p" + + def test_projects_list_parses_list_response(self): + request = RequestRecorder(payload=[PROJECT_PAYLOAD]) + client = ProjectsAPIClient(_request=request, _logger=logging.getLogger("test")) + result = client.list() + assert isinstance(result, list) + assert result[0].project_name == PROJECT_PAYLOAD["project_name"] diff --git a/src/tests/api/test_users.py b/src/tests/api/test_users.py new file mode 100644 index 0000000000..c01703b811 --- /dev/null +++ b/src/tests/api/test_users.py @@ -0,0 +1,54 @@ +import json +import logging +from datetime import datetime, timezone +from uuid import UUID + +from dstack.api.server._users import UsersAPIClient +from tests.api.common import RequestRecorder + +USER_PAYLOAD = { + "id": "11111111-1111-4111-8111-111111111111", + "username": "user", + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": "user", + "email": None, + "active": True, + "permissions": {"can_create_projects": True}, + "ssh_public_key": None, +} + + +class TestUsersAPIClientList: + def test_serializes_pagination_and_parses_info_list(self): + recorder = RequestRecorder({"total_count": 1, "users": [USER_PAYLOAD]}) + client = UsersAPIClient(_request=recorder, _logger=logging.getLogger("test")) + dt = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + uid = UUID("22222222-2222-4222-8222-222222222222") + + result = client.list( + return_total_count=True, + name_pattern="user", + prev_created_at=dt, + prev_id=uid, + limit=1, + ascending=True, + ) + + payload = json.loads(recorder.last_body) + assert recorder.last_path == "/api/users/list" + assert payload["return_total_count"] is True + assert payload["name_pattern"] == "user" + assert payload["prev_created_at"] == dt.isoformat() + assert payload["prev_id"] == str(uid) + assert payload["limit"] == 1 + assert payload["ascending"] is True + assert result.total_count == 1 + assert result.users[0].username == "user" + + def test_parses_list_response(self): + recorder 
= RequestRecorder([USER_PAYLOAD]) + client = UsersAPIClient(_request=recorder, _logger=logging.getLogger("test")) + result = client.list() + + assert isinstance(result, list) + assert result[0].username == "user" diff --git a/src/tests/conftest.py b/src/tests/conftest.py new file mode 100644 index 0000000000..8106d67a16 --- /dev/null +++ b/src/tests/conftest.py @@ -0,0 +1,62 @@ +import inspect +import os + +import pytest + +from dstack._internal.server.testing.conf import ( # noqa: F401 + postgres_container, + session, + test_db, +) +from dstack._internal.settings import FeatureFlags + + +def pytest_configure(config): + config.addinivalue_line("markers", "ui: mark test as testing UI to run only with --runui") + config.addinivalue_line( + "markers", "postgres: mark test as testing Postgres to run only with --runpostgres" + ) + config.addinivalue_line( + "markers", "windows: mark test to be run on Windows in addition to POSIX" + ) + config.addinivalue_line("markers", "windows_only: mark test to be run on Windows only") + + +def pytest_addoption(parser): + parser.addoption("--runui", action="store_true", default=False, help="Run UI tests") + parser.addoption( + "--runpostgres", action="store_true", default=False, help="Run tests with PostgreSQL" + ) + + +def pytest_collection_modifyitems(config, items): + skip_ui = pytest.mark.skip(reason="need --runui option to run") + skip_postgres = pytest.mark.skip(reason="need --runpostgres option to run") + is_windows = os.name == "nt" + skip_posix = pytest.mark.skip(reason="requires POSIX") + skip_windows = pytest.mark.skip(reason="requires Windows") + for item in items: + if not config.getoption("--runui") and "ui" in item.keywords: + item.add_marker(skip_ui) + if not config.getoption("--runpostgres") and "postgres" in item.keywords: + item.add_marker(skip_postgres) + for_windows_only = "windows_only" in item.keywords + for_windows = for_windows_only or "windows" in item.keywords + if for_windows_only and not is_windows: + 
item.add_marker(skip_windows) + if not for_windows and is_windows: + item.add_marker(skip_posix) + + +@pytest.fixture(scope="session", autouse=True) +def disable_feature_flags(): + """ + Disables all feature flags once per test session. + + If you need to test a feature flag, monkeypatch `FeatureFlags` class on a per-test basis. + """ + for name, value in inspect.getmembers(FeatureFlags): + if not name.startswith("_") and name.isupper(): + if not isinstance(value, bool): + raise RuntimeError(f"FeatureFlags.{name}: only bool values are supported") + setattr(FeatureFlags, name, False) diff --git a/src/tests/plugins/__init__.py b/src/tests/plugins/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/plugins/test_rest_plugin.py b/src/tests/plugins/test_rest_plugin.py new file mode 100644 index 0000000000..7d9e35a51d --- /dev/null +++ b/src/tests/plugins/test_rest_plugin.py @@ -0,0 +1,206 @@ +import json +import os +from contextlib import nullcontext as does_not_raise +from unittest.mock import Mock + +import pytest +import pytest_asyncio +import requests +from pydantic import parse_obj_as +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ServerClientError, ServerError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.fleets import FleetConfiguration, FleetSpec +from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec +from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.volumes import VolumeSpec +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.services import encryption as encryption +from dstack._internal.server.testing.common import ( + 
create_project, + create_repo, + create_user, + get_fleet_spec, + get_run_spec, + get_volume_configuration, +) +from dstack.plugins.builtin.rest_plugin import PLUGIN_SERVICE_URI_ENV_VAR_NAME, CustomApplyPolicy + + +async def create_run_spec( + session: AsyncSession, + project: ProjectModel, + replicas: str = 1, +) -> RunSpec: + repo = await create_repo(session=session, project_id=project.id) + run_name = "test-run" + profile = Profile(name="test-profile") + spec = get_run_spec( + repo_id=repo.name, + run_name=run_name, + profile=profile, + configuration=ServiceConfiguration( + commands=["echo hello"], port=8000, replicas=parse_obj_as(Range[int], replicas) + ), + ) + return spec + + +async def create_fleet_spec(): + name = "test-fleet-spec" + fleet_conf = FleetConfiguration(name=name) + return get_fleet_spec(conf=fleet_conf) + + +async def create_volume_spec(): + return VolumeSpec(configuration=get_volume_configuration()) + + +async def create_gateway_spec(): + configuration = GatewayConfiguration( + name="test-gateway-config", + backend=BackendType.AWS, + region="us-central", + ) + return GatewaySpec(configuration=configuration) + + +@pytest_asyncio.fixture +async def project(session): + return await create_project(session=session) + + +@pytest_asyncio.fixture +async def user(session): + return await create_user(session=session) + + +@pytest_asyncio.fixture +async def spec(request, session, project): + if request.param == "run_spec": + return await create_run_spec(session, project) + elif request.param == "fleet_spec": + return await create_fleet_spec() + elif request.param == "volume_spec": + return await create_volume_spec() + elif request.param == "gateway_spec": + return await create_gateway_spec() + else: + raise ValueError(f"Unknown spec fixture: {request.param}") + + +class TestRESTPlugin: + @pytest.mark.asyncio + async def test_on_run_apply_plugin_service_uri_not_set(self): + with pytest.raises(ServerError): + CustomApplyPolicy() + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True + ) + async def test_on_apply_plugin_service_returns_mutated_spec( + self, mocker, test_db, user, project, spec + ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) + policy = CustomApplyPolicy() + mock_response = Mock() + response_dict = {"spec": spec.dict(), "error": None} + + if isinstance(spec, (RunSpec, FleetSpec)): + response_dict["spec"]["profile"]["tags"] = {"env": "test", "team": "qa"} + else: + response_dict["spec"]["configuration_path"] = "/path/to/something" + + mock_response.text = json.dumps(response_dict) + mock_response.raise_for_status = Mock() + mocker.patch("requests.post", return_value=mock_response) + result = policy.on_apply(user=user.name, project=project.name, spec=spec) + assert result == type(spec)(**response_dict["spec"]) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True + ) + async def test_on_apply_plugin_service_call_fails(self, mocker, test_db, user, project, spec): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) + policy = CustomApplyPolicy() + mocker.patch("requests.post", side_effect=requests.RequestException("fail")) + with pytest.raises(ServerClientError): + policy.on_apply(user=user.name, project=project.name, spec=spec) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True + ) + async def test_on_apply_plugin_service_connection_fails( + self, mocker, test_db, user, project, spec + ): + 
mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) + policy = CustomApplyPolicy() + mocker.patch("requests.post", side_effect=requests.ConnectionError("Failed to connect")) + with pytest.raises(ServerClientError): + policy.on_apply(user=user.name, project=project.name, spec=spec) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True + ) + async def test_on_apply_plugin_service_returns_invalid_spec( + self, mocker, test_db, user, project, spec + ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) + policy = CustomApplyPolicy() + mock_response = Mock() + mock_response.text = json.dumps({"invalid-key": "abc"}) + mock_response.raise_for_status = Mock() + mocker.patch("requests.post", return_value=mock_response) + with pytest.raises(ServerClientError): + policy.on_apply(user.name, project=project.name, spec=spec) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True + ) + @pytest.mark.parametrize( + ("error", "expectation"), + [ + pytest.param(None, does_not_raise(), id="error_none"), + pytest.param( + "", + pytest.raises( + ServerClientError, match="Plugin service returned an invalid response" + ), + id="error_empty_str", + ), + pytest.param( + "validation failed", + pytest.raises( + ServerClientError, match="Apply request rejected: validation failed" + ), + id="error_non_empty_str", + ), + ], + ) + async def test_on_apply_plugin_service_error_handling( + self, mocker, test_db, user, project, spec, error, expectation + ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) + policy = 
CustomApplyPolicy() + mock_response = Mock() + response_dict = {"spec": spec.dict(), "error": error} + mock_response.text = json.dumps(response_dict) + mock_response.raise_for_status = Mock() + mocker.patch("requests.post", return_value=mock_response) + with expectation: + result = policy.on_apply(user=user.name, project=project.name, spec=spec) + assert result == type(spec)(**response_dict["spec"])